TF-IDF 酒店评论关键词提取：80家三星级酒店评论分析

下面这段代码把所有酒店的评论合并在一起进行分析（代码中的 `results` 在此处未给出定义，运行前需先赋值为待分析的评论文本）。如果需要每家酒店各自的关键词，可以把每家酒店的评论分别提取出来、分别处理。

import pymongo
import nltk
import functools
import math
from bson.objectid import ObjectId
from bson import json_util as jsonb
from nltk import word_tokenize,pos_tag
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Connect to the local MongoDB server and select the ``dump`` database.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.dump

# NOTE(review): ``results`` is not defined anywhere in this listing. It is
# presumably the concatenated review text loaded from ``db``; it must be
# assigned before the tokenization step below can run.

# Split the raw review text into tokens.
cutword1 = word_tokenize(results)

# Drop punctuation tokens plus two hand-picked filler words.  The closing
# quote token is spelled "''" (the original entry ' '' ' was an implicit
# string concatenation that evaluated to two spaces and never matched).
interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '}', '{', '``', "''", 'time', 'great']
cutwords2 = [word for word in cutword1 if word not in interpunctuations]

# Remove English stop words and keep only tokens longer than four characters.
stops = set(stopwords.words('english'))
cutword3 = [word for word in cutwords2 if word not in stops and len(word) > 4]

# Keep nouns only (singular/plural, common/proper).
tags = set(['NN', 'NNS', 'NNP', 'NNPS'])
pos_tags = nltk.pos_tag(cutword3)
cutword4 = [word for word, pos in pos_tags if pos in tags]

# Reduce every remaining noun to its stem; one stemmer instance is reused
# instead of constructing a new one per word.
stemmer = PorterStemmer()
doc_list = [stemmer.stem(cutword) for cutword in cutword4]



#创建TXT文档
#f=open ('C:/users/Dell/Desktop/doc_list.txt', 'a+')


def train_idf(doc_list):
    """Compute smoothed inverse document frequencies for a corpus.

    Args:
        doc_list: Sized collection of documents, each an iterable of hashable
            tokens.  Every element is passed through ``set()``, so a plain
            string element is treated as a document whose tokens are its
            individual characters.

    Returns:
        A ``(idf_dic, default_idf)`` tuple.  ``idf_dic`` maps each token to
        ``log(N / (1 + df))`` where ``N`` is the number of documents and
        ``df`` the number of documents containing the token.  ``default_idf``
        is ``log(N)``, the value to use for tokens missing from ``idf_dic``.
        An empty corpus yields ``({}, 0.0)`` instead of raising.
    """
    # Total number of documents.
    tt_count = len(doc_list)
    if tt_count == 0:
        # log(0) is undefined; with no documents there is nothing to weight.
        return {}, 0.0

    # Number of documents each token occurs in (counted once per document).
    doc_freq = {}
    for doc in doc_list:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0.0) + 1.0

    # Add 1 to the denominator as smoothing.
    idf_dic = {
        word: math.log(tt_count / (1.0 + freq))
        for word, freq in doc_freq.items()
    }

    # Tokens absent from the dictionary are assumed to occur in one document.
    default_idf = math.log(tt_count / 1.0)
    return idf_dic, default_idf


def cmp(e1, e2):
    """Three-way comparison of ``(word, score)`` pairs for top-K sorting.

    Intended for use with ``functools.cmp_to_key``.  Pairs are ordered by
    score first; when the scores are equal the tie is broken by comparing the
    two concatenations ``e1[0] + e2[0]`` and ``e2[0] + e1[0]``.

    Args:
        e1: First ``(word, score)`` pair.
        e2: Second ``(word, score)`` pair.

    Returns:
        ``1`` if ``e1`` sorts after ``e2``, ``-1`` if before, ``0`` if equal.
    """
    # Plain comparisons give the sign directly; no numpy import is needed
    # for a scalar difference.
    diff = e1[1] - e2[1]
    if diff > 0:
        return 1
    if diff < 0:
        return -1

    # Equal scores: fall back to the concatenation-based word ordering.
    a = e1[0] + e2[0]
    b = e2[0] + e1[0]
    if a > b:
        return 1
    if a == b:
        return 0
    return -1

class TfIdf(object):
    """TF-IDF keyword scorer for a single tokenized text.

    Args:
        idf_dic: Mapping from token to its trained IDF value.
        default_idf: IDF value used for tokens missing from ``idf_dic``.
        doc_list: The preprocessed text to extract keywords from, as a
            sequence of tokens.
        keyword_num: Number of top-ranked keywords to report.
    """

    def __init__(self, idf_dic, default_idf, doc_list, keyword_num):
        self.doc_list = doc_list
        self.idf_dic, self.default_idf = idf_dic, default_idf
        # Term frequencies are computed once, at construction time.
        self.tf_dic = self.get_tf_dic()
        self.keyword_num = keyword_num

    def get_tf_dic(self):
        """Return a dict mapping each token to its relative frequency.

        The frequency is the token's occurrence count divided by the total
        number of tokens in ``self.doc_list``.  An empty token list yields an
        empty dict.
        """
        counts = {}
        for word in self.doc_list:
            counts[word] = counts.get(word, 0.0) + 1.0

        tt_count = len(self.doc_list)
        return {word: count / tt_count for word, count in counts.items()}

    def get_tfidf(self):
        """Print and return the ``keyword_num`` highest-scoring keywords.

        Each token's score is ``tf * idf``, with ``self.default_idf`` used for
        tokens that are not in ``self.idf_dic``.  Every keyword is printed as
        ``word/ score`` on its own line, followed by one blank line.

        Returns:
            The printed ``(word, score)`` pairs as a list, best first.
        """
        # Score each distinct token once; ``tf_dic`` holds exactly the
        # distinct tokens of ``doc_list`` in first-occurrence order.
        tfidf_dic = {
            word: tf * self.idf_dic.get(word, self.default_idf)
            for word, tf in self.tf_dic.items()
        }

        # Rank by score (ties resolved by ``cmp``) and keep the top entries.
        ranked = sorted(tfidf_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)
        top_keywords = ranked[:self.keyword_num]
        for k, v in top_keywords:
            print(k + '/ ', end='')
            print(v)
        print()
        return top_keywords



def tfidf_extract(doc_list, pos=False, keyword_num=25):
    """Train IDF weights on ``doc_list`` and print its top keywords.

    Args:
        doc_list: Preprocessed tokens; used both to train the IDF table and
            as the text to extract keywords from.
        pos: Accepted but not used by this function.
        keyword_num: Number of keywords to print.
    """
    idf_table, fallback_idf = train_idf(doc_list)
    TfIdf(idf_table, fallback_idf, doc_list, keyword_num).get_tfidf()


if __name__ == '__main__':
    # Script entry point: print a header, then the TF-IDF keywords of the
    # module-level ``doc_list`` built above.
    print('TF-IDF模型结果：')
    tfidf_extract(doc_list)

要对每个酒店进行分析，可以将代码修改成以下形式：

import pymongo
import nltk
import functools
import math
from bson.objectid import ObjectId
from bson import json_util as jsonb
from nltk import word_tokenize,pos_tag
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Connect to the local MongoDB server and select the ``dump`` database.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.dump

# NOTE(review): train_idf, cmp and TfIdf are defined *below* this loop in the
# listing.  Those definitions must be executed before the loop runs (e.g. move
# them above this point), otherwise the calls below raise NameError.

# Loop-invariant resources, built once instead of once per review.
# The closing-quote token is spelled "''" (the original entry ' '' ' was an
# implicit string concatenation that evaluated to two spaces).
interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '}', '{', '``', "''", 'time', 'great']
stops = set(stopwords.words('english'))
tags = set(['NN', 'NNS', 'NNP', 'NNPS'])
stemmer = PorterStemmer()

# Fetch every hotel document.
hotels = db.hotels.find()

for hotel in hotels:
    # NOTE(review): assumes hotel['reviews'] is an iterable of review strings
    # -- confirm against the collection schema.
    reviews = hotel['reviews']

    review_docs = []  # one stemmed token list per review: the IDF corpus
    doc_list = []     # all stemmed tokens of this hotel: the TF text
    for review in reviews:
        # Tokenize.
        cutword1 = word_tokenize(review)

        # Drop punctuation and the hand-picked filler words.
        cutwords2 = [word for word in cutword1 if word not in interpunctuations]

        # Drop stop words and tokens of four characters or fewer.
        cutword3 = [word for word in cutwords2 if word not in stops and len(word) > 4]

        # Keep nouns only.
        pos_tags = nltk.pos_tag(cutword3)
        cutword4 = [word for word, pos in pos_tags if pos in tags]

        # Stem.
        stems = [stemmer.stem(cutword) for cutword in cutword4]
        review_docs.append(stems)
        doc_list.extend(stems)

    # Double quotes outside so the inner ['name'] subscript parses on Python
    # versions before 3.12 as well.
    print(f"酒店 {hotel['name']} 的关键词：")

    if not doc_list:
        # No usable tokens for this hotel: nothing to rank.
        print()
        continue

    # Train IDF on the per-review token lists; passing the flat token list
    # would make train_idf iterate over the characters of each stem.
    idf_dic, default_idf = train_idf(review_docs)

    # Score the hotel's tokens and print the top 25 keywords.
    tfidf_model = TfIdf(idf_dic, default_idf, doc_list, 25)
    tfidf_model.get_tfidf()

def train_idf(doc_list):
    """Compute smoothed inverse document frequencies for a corpus.

    Args:
        doc_list: Sized collection of documents, each an iterable of hashable
            tokens.  Every element is passed through ``set()``, so a plain
            string element is treated as a document whose tokens are its
            individual characters.

    Returns:
        A ``(idf_dic, default_idf)`` tuple.  ``idf_dic`` maps each token to
        ``log(N / (1 + df))`` where ``N`` is the number of documents and
        ``df`` the number of documents containing the token.  ``default_idf``
        is ``log(N)``, the value to use for tokens missing from ``idf_dic``.
        An empty corpus yields ``({}, 0.0)`` instead of raising.
    """
    # Total number of documents.
    tt_count = len(doc_list)
    if tt_count == 0:
        # log(0) is undefined; with no documents there is nothing to weight.
        return {}, 0.0

    # Number of documents each token occurs in (counted once per document).
    doc_freq = {}
    for doc in doc_list:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0.0) + 1.0

    # Add 1 to the denominator as smoothing.
    idf_dic = {
        word: math.log(tt_count / (1.0 + freq))
        for word, freq in doc_freq.items()
    }

    # Tokens absent from the dictionary are assumed to occur in one document.
    default_idf = math.log(tt_count / 1.0)
    return idf_dic, default_idf


def cmp(e1, e2):
    """Three-way comparison of ``(word, score)`` pairs for top-K sorting.

    Intended for use with ``functools.cmp_to_key``.  Pairs are ordered by
    score first; when the scores are equal the tie is broken by comparing the
    two concatenations ``e1[0] + e2[0]`` and ``e2[0] + e1[0]``.

    Args:
        e1: First ``(word, score)`` pair.
        e2: Second ``(word, score)`` pair.

    Returns:
        ``1`` if ``e1`` sorts after ``e2``, ``-1`` if before, ``0`` if equal.
    """
    # Plain comparisons give the sign directly; no numpy import is needed
    # for a scalar difference.
    diff = e1[1] - e2[1]
    if diff > 0:
        return 1
    if diff < 0:
        return -1

    # Equal scores: fall back to the concatenation-based word ordering.
    a = e1[0] + e2[0]
    b = e2[0] + e1[0]
    if a > b:
        return 1
    if a == b:
        return 0
    return -1

class TfIdf(object):
    """TF-IDF keyword scorer for a single tokenized text.

    Args:
        idf_dic: Mapping from token to its trained IDF value.
        default_idf: IDF value used for tokens missing from ``idf_dic``.
        doc_list: The preprocessed text to extract keywords from, as a
            sequence of tokens.
        keyword_num: Number of top-ranked keywords to report.
    """

    def __init__(self, idf_dic, default_idf, doc_list, keyword_num):
        self.doc_list = doc_list
        self.idf_dic, self.default_idf = idf_dic, default_idf
        # Term frequencies are computed once, at construction time.
        self.tf_dic = self.get_tf_dic()
        self.keyword_num = keyword_num

    def get_tf_dic(self):
        """Return a dict mapping each token to its relative frequency.

        The frequency is the token's occurrence count divided by the total
        number of tokens in ``self.doc_list``.  An empty token list yields an
        empty dict.
        """
        counts = {}
        for word in self.doc_list:
            counts[word] = counts.get(word, 0.0) + 1.0

        tt_count = len(self.doc_list)
        return {word: count / tt_count for word, count in counts.items()}

    def get_tfidf(self):
        """Print and return the ``keyword_num`` highest-scoring keywords.

        Each token's score is ``tf * idf``, with ``self.default_idf`` used for
        tokens that are not in ``self.idf_dic``.  Every keyword is printed as
        ``word/ score`` on its own line, followed by one blank line.

        Returns:
            The printed ``(word, score)`` pairs as a list, best first.
        """
        # Score each distinct token once; ``tf_dic`` holds exactly the
        # distinct tokens of ``doc_list`` in first-occurrence order.
        tfidf_dic = {
            word: tf * self.idf_dic.get(word, self.default_idf)
            for word, tf in self.tf_dic.items()
        }

        # Rank by score (ties resolved by ``cmp``) and keep the top entries.
        ranked = sorted(tfidf_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)
        top_keywords = ranked[:self.keyword_num]
        for k, v in top_keywords:
            print(k + '/ ', end='')
            print(v)
        print()
        return top_keywords

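这段代码会分别对每个酒店的评论进行分析，并输出每个酒店的关键词。注意：上面的清单把 train_idf、cmp 和 TfIdf 的定义放在了循环之后，实际运行时需要先执行这些定义（例如把它们移到循环之前），否则循环中调用 train_idf 时会抛出 NameError。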

为了方便搜索引擎收录，建议将代码中的中文注释翻译成英文，并添加一些描述性的文字，例如：

代码的用途
代码的功能
如何使用代码
代码的局限性

此外，还可以将代码分成不同的模块，并使用一些专业的词汇来描述每个模块的功能，例如：

Data Preprocessing
TF-IDF Model Training
Keyword Extraction

通过以上方法，可以提高代码的可读性和可搜索性，方便搜索引擎收录。