TF-IDF 酒店评论关键词提取:80家三星级酒店评论分析
下面第一段代码把所有酒店的评论合并在一起进行分析;若要得到每家酒店各自的关键词,需要先按酒店分别提取评论,再逐一处理(见后面的第二段代码)。
import pymongo
import nltk
import functools
import math
from bson.objectid import ObjectId
from bson import json_util as jsonb
from nltk import word_tokenize,pos_tag
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.dump

# NOTE(review): `results` is not defined anywhere in the visible code;
# presumably it is the review text loaded from `db` -- confirm before running.
# Tokenize the raw review text.
cutword1 = word_tokenize(results)

# Tokens to discard: punctuation plus two hand-picked words.  The last quote
# entry was written as ' '' ', which Python parses as two adjacent ' '
# literals (a two-space string); it is intended to be the closing-quote
# token "''" that pairs with the "``" entry.
interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '}', '{', '``', "''", 'time', 'great']
cutwords2 = [word for word in cutword1 if word not in interpunctuations]

# Drop English stop words and every token of four characters or fewer.
stops = set(stopwords.words('english'))
cutword3 = [word for word in cutwords2 if word not in stops and len(word) > 4]

# Keep only tokens tagged as nouns (NN, NNS, NNP, NNPS).
tags = {'NN', 'NNS', 'NNP', 'NNPS'}
pos_tags = nltk.pos_tag(cutword3)
cutword4 = [word for word, pos in pos_tags if pos in tags]

# Stem the remaining nouns; a single stemmer instance is reused for all words
# instead of constructing a new one per word.
stemmer = PorterStemmer()
doc_list = [stemmer.stem(cutword) for cutword in cutword4]
# IDF (inverse document frequency) training.
def train_idf(doc_list):
    """Compute smoothed IDF values for every word in a corpus.

    Args:
        doc_list: Sequence of documents, each an iterable of word tokens.
            Every element is passed through ``set()``, so a bare string is
            treated as a document made of its individual characters.

    Returns:
        A tuple ``(idf_dic, default_idf)``. ``idf_dic`` maps each word to
        ``log(N / (1 + df))``, where ``N`` is the number of documents and
        ``df`` the number of documents containing the word (the ``+ 1``
        smooths the denominator). ``default_idf`` is ``log(N)``, the value
        used for words missing from ``idf_dic``. An empty corpus yields
        ``({}, 0.0)``.
    """
    # Total number of documents.
    tt_count = len(doc_list)
    if tt_count == 0:
        # math.log(0) raises ValueError; an empty corpus has no statistics.
        return {}, 0.0

    # Number of documents each word appears in.
    doc_freq = {}
    for doc in doc_list:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0.0) + 1.0

    idf_dic = {
        word: math.log(tt_count / (1.0 + freq))
        for word, freq in doc_freq.items()
    }
    default_idf = math.log(tt_count / 1.0)
    return idf_dic, default_idf
# Comparator used to rank (word, score) pairs when picking the top-K keywords.
def cmp(e1, e2):
    """Three-way compare two ``(word, score)`` pairs.

    Pairs are ordered by score (index 1). When the scores are equal, the
    words (index 0) break the tie by concatenation order: ``e1`` ranks higher
    when ``e1[0] + e2[0]`` is greater than ``e2[0] + e1[0]``.

    Returns:
        ``1`` if ``e1`` ranks above ``e2``, ``-1`` if below, ``0`` if equal.
        Always a plain ``int``, suitable for ``functools.cmp_to_key``.
    """
    # Compare the scores directly; no numpy import is needed to take a sign.
    if e1[1] != e2[1]:
        return 1 if e1[1] > e2[1] else -1

    a = e1[0] + e2[0]
    b = e2[0] + e1[0]
    return (a > b) - (a < b)
# TF-IDF keyword extractor.
class TfIdf(object):
    """Rank the words of one tokenized text by TF-IDF.

    Term frequencies are computed once at construction time from
    ``doc_list``; IDF values are supplied by the caller.
    """

    def __init__(self, idf_dic, default_idf, doc_list, keyword_num):
        """Store the inputs and precompute term frequencies.

        Args:
            idf_dic: Mapping of word -> trained IDF value.
            default_idf: IDF used for words missing from ``idf_dic``.
            doc_list: Processed tokens of the text to extract keywords from.
            keyword_num: Maximum number of keywords to report.
        """
        self.doc_list = doc_list
        self.idf_dic, self.default_idf = idf_dic, default_idf
        self.tf_dic = self.get_tf_dic()
        self.keyword_num = keyword_num

    def get_tf_dic(self):
        """Return a dict mapping each word to count / total token count."""
        tf_dic = {}
        for word in self.doc_list:
            tf_dic[word] = tf_dic.get(word, 0.0) + 1.0

        tt_count = len(self.doc_list)
        # When doc_list is empty tf_dic is empty too, so nothing is divided.
        return {word: float(count) / tt_count for word, count in tf_dic.items()}

    def get_tfidf(self):
        """Print and return the top ``keyword_num`` words by TF-IDF.

        Each keyword is printed as ``word/ score`` on its own line, followed
        by one blank line.

        Returns:
            The printed ``(word, score)`` pairs, highest score first.
        """
        # tf_dic holds every distinct word exactly once, so each score is
        # computed a single time rather than once per token occurrence.
        tfidf_dic = {
            word: tf * self.idf_dic.get(word, self.default_idf)
            for word, tf in self.tf_dic.items()
        }

        # Sort by TF-IDF (ties broken by `cmp`) and keep the first keyword_num.
        ranked = sorted(tfidf_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)
        top_keywords = ranked[:self.keyword_num]
        for k, v in top_keywords:
            print(k + '/ ', end='')
            print(v)
        print()
        return top_keywords
def tfidf_extract(doc_list, pos=False, keyword_num=25):
    """Train IDF on ``doc_list`` and print its top TF-IDF keywords.

    Args:
        doc_list: Processed tokens. Passed both to ``train_idf`` as the
            training corpus and to ``TfIdf`` as the text to score.
        pos: Not used by this function.
        keyword_num: Number of keywords handed to ``TfIdf``.

    NOTE(review): ``train_idf`` applies ``set()`` to every element of its
    argument, so if ``doc_list`` is a flat list of word strings the document
    frequencies are counted over characters -- confirm the intended corpus.
    """
    idf_stats = train_idf(doc_list)
    extractor = TfIdf(*idf_stats, doc_list, keyword_num)
    extractor.get_tfidf()


if __name__ == '__main__':
    # Script entry point: report keywords for the tokens prepared above.
    print('TF-IDF模型结果:')
    tfidf_extract(doc_list)
要对每个酒店进行分析,可以将代码修改成以下形式:
import pymongo
import nltk
import functools
import math
from bson.objectid import ObjectId
from bson import json_util as jsonb
from nltk import word_tokenize,pos_tag
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Tokens to discard before stop-word filtering: punctuation plus two
# hand-picked words.  The last quote entry was written as ' '' ', which Python
# parses as two adjacent ' ' literals (a two-space string); it is intended to
# be the closing-quote token "''" that pairs with the "``" entry.
_DISCARD_TOKENS = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '}', '{', '``', "''", 'time', 'great']

# Part-of-speech tags that are kept (nouns only).
_NOUN_TAGS = {'NN', 'NNS', 'NNP', 'NNPS'}


# IDF (inverse document frequency) training.
def train_idf(doc_list):
    """Compute smoothed IDF values for every word in a corpus.

    Args:
        doc_list: Sequence of documents, each an iterable of word tokens.
            Every element is passed through ``set()``, so a bare string is
            treated as a document made of its individual characters.

    Returns:
        A tuple ``(idf_dic, default_idf)``. ``idf_dic`` maps each word to
        ``log(N / (1 + df))``, where ``N`` is the number of documents and
        ``df`` the number of documents containing the word (the ``+ 1``
        smooths the denominator). ``default_idf`` is ``log(N)``, the value
        used for words missing from ``idf_dic``. An empty corpus yields
        ``({}, 0.0)``.
    """
    # Total number of documents.
    tt_count = len(doc_list)
    if tt_count == 0:
        # math.log(0) raises ValueError; an empty corpus has no statistics.
        return {}, 0.0

    # Number of documents each word appears in.
    doc_freq = {}
    for doc in doc_list:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0.0) + 1.0

    idf_dic = {
        word: math.log(tt_count / (1.0 + freq))
        for word, freq in doc_freq.items()
    }
    default_idf = math.log(tt_count / 1.0)
    return idf_dic, default_idf


# Comparator used to rank (word, score) pairs when picking the top-K keywords.
def cmp(e1, e2):
    """Three-way compare two ``(word, score)`` pairs.

    Pairs are ordered by score (index 1). When the scores are equal, the
    words (index 0) break the tie by concatenation order: ``e1`` ranks higher
    when ``e1[0] + e2[0]`` is greater than ``e2[0] + e1[0]``.

    Returns:
        ``1`` if ``e1`` ranks above ``e2``, ``-1`` if below, ``0`` if equal.
        Always a plain ``int``, suitable for ``functools.cmp_to_key``.
    """
    # Compare the scores directly; no numpy import is needed to take a sign.
    if e1[1] != e2[1]:
        return 1 if e1[1] > e2[1] else -1

    a = e1[0] + e2[0]
    b = e2[0] + e1[0]
    return (a > b) - (a < b)


# TF-IDF keyword extractor.
class TfIdf(object):
    """Rank the words of one tokenized text by TF-IDF.

    Term frequencies are computed once at construction time from
    ``doc_list``; IDF values are supplied by the caller.
    """

    def __init__(self, idf_dic, default_idf, doc_list, keyword_num):
        """Store the inputs and precompute term frequencies.

        Args:
            idf_dic: Mapping of word -> trained IDF value.
            default_idf: IDF used for words missing from ``idf_dic``.
            doc_list: Processed tokens of the text to extract keywords from.
            keyword_num: Maximum number of keywords to report.
        """
        self.doc_list = doc_list
        self.idf_dic, self.default_idf = idf_dic, default_idf
        self.tf_dic = self.get_tf_dic()
        self.keyword_num = keyword_num

    def get_tf_dic(self):
        """Return a dict mapping each word to count / total token count."""
        tf_dic = {}
        for word in self.doc_list:
            tf_dic[word] = tf_dic.get(word, 0.0) + 1.0

        tt_count = len(self.doc_list)
        # When doc_list is empty tf_dic is empty too, so nothing is divided.
        return {word: float(count) / tt_count for word, count in tf_dic.items()}

    def get_tfidf(self):
        """Print and return the top ``keyword_num`` words by TF-IDF.

        Each keyword is printed as ``word/ score`` on its own line, followed
        by one blank line.

        Returns:
            The printed ``(word, score)`` pairs, highest score first.
        """
        # tf_dic holds every distinct word exactly once, so each score is
        # computed a single time rather than once per token occurrence.
        tfidf_dic = {
            word: tf * self.idf_dic.get(word, self.default_idf)
            for word, tf in self.tf_dic.items()
        }

        # Sort by TF-IDF (ties broken by `cmp`) and keep the first keyword_num.
        ranked = sorted(tfidf_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)
        top_keywords = ranked[:self.keyword_num]
        for k, v in top_keywords:
            print(k + '/ ', end='')
            print(v)
        print()
        return top_keywords


def _review_stems(review, stops, stemmer):
    """Tokenize one review and return the stems of its nouns.

    Tokens listed in ``_DISCARD_TOKENS`` or ``stops``, and tokens of four
    characters or fewer, are dropped before part-of-speech tagging.
    """
    tokens = word_tokenize(review)
    kept = [
        word for word in tokens
        if word not in _DISCARD_TOKENS and word not in stops and len(word) > 4
    ]
    return [stemmer.stem(word) for word, pos in nltk.pos_tag(kept) if pos in _NOUN_TAGS]


def main():
    """Print the top 25 TF-IDF keywords for every hotel in the database."""
    client = pymongo.MongoClient(host='localhost', port=27017)
    db = client.dump

    # Built once and reused for every review of every hotel.
    stops = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Fetch all hotels together with their reviews.
    hotels = db.hotels.find()
    for hotel in hotels:
        # assumes hotel['reviews'] is an iterable of review strings -- TODO confirm
        reviews = hotel['reviews']

        # Flat list of noun stems across all reviews of this hotel.
        doc_list = []
        for review in reviews:
            doc_list.extend(_review_stems(review, stops, stemmer))

        # NOTE(review): the flat token list is used as the IDF corpus, so
        # train_idf counts document frequencies over the characters of each
        # stem -- confirm which corpus the IDF is meant to be trained on.
        idf_dic, default_idf = train_idf(doc_list)
        tfidf_model = TfIdf(idf_dic, default_idf, doc_list, 25)

        # The inner quotes differ from the outer ones so the f-string also
        # parses on Python versions older than 3.12.
        print(f"酒店 {hotel['name']} 的关键词:")
        tfidf_model.get_tfidf()


# The entry point comes after every definition it relies on.
if __name__ == '__main__':
    main()
这段代码会分别对每个酒店的评论进行分析,并输出每个酒店的关键词。
为了方便搜索引擎收录,建议将代码中的中文注释翻译成英文,并添加一些描述性的文字,例如:
- 代码的用途
- 代码的功能
- 如何使用代码
- 代码的局限性
此外,还可以将代码分成不同的模块,并使用一些专业的词汇来描述每个模块的功能,例如:
- Data Preprocessing
- TF-IDF Model Training
- Keyword Extraction
通过以上方法,可以提高代码的可读性和可搜索性,方便搜索引擎收录。
原文地址: http://www.cveoy.top/t/topic/ojGC 著作权归作者所有。请勿转载和采集!