tf-idf步骤代码

TF-IDF是一种常用的文本特征提取方法，其步骤如下：

分词：将文本分词，生成词汇表。
统计词频：统计每个词在文本中出现的频率。
计算TF：计算每个词的TF值，即该词在文本中出现的次数除以文本中总词数。
计算IDF：计算每个词的IDF值，即文本总数 N 除以包含该词的文本数目 df 后取对数：IDF = log(N / df)。
计算TF-IDF：将每个词的TF值乘以其IDF值，得到该词的TF-IDF值。

下面是Python实现TF-IDF的代码：

import math
from collections import Counter

def tokenize(text):
    """Split *text* into tokens on runs of whitespace.

    No case folding or punctuation stripping is applied, so tokens that
    differ only in case (e.g. "Machine" / "machine") remain distinct.
    """
    tokens = text.split()
    return tokens

def count_words(words):
    """Return a ``Counter`` mapping each word in *words* to its occurrence count."""
    frequencies = Counter()
    frequencies.update(words)
    return frequencies

def calculate_tf(word_counts, total_words):
    """Compute the term frequency of every word in one document.

    Args:
        word_counts: Mapping of word -> number of occurrences in the document.
        total_words: Divisor applied to every count (the document's word total).

    Returns:
        A dict mapping each word to ``count / total_words``.
    """
    return {
        term: occurrences / total_words
        for term, occurrences in word_counts.items()
    }

def calculate_idf(documents):
    """Compute the inverse document frequency of every word in a corpus.

    IDF is ``log(N / df)``, where ``N`` is the number of documents and ``df``
    is the number of documents that contain the word at least once. A word
    present in every document therefore gets an IDF of 0.

    Args:
        documents: Sized collection of documents, each an iterable of
            hashable tokens (e.g. a list of words).

    Returns:
        A dict mapping each distinct word to its IDF, keyed in order of the
        word's first appearance in the corpus.
    """
    N = len(documents)

    # Count document frequencies in a single pass over the corpus instead of
    # rescanning every document for each distinct word.
    doc_freq = {}
    for document in documents:
        # dict.fromkeys de-duplicates the tokens while keeping their order,
        # so each document contributes at most 1 to a word's frequency.
        for word in dict.fromkeys(document):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    return {word: math.log(N / count) for word, count in doc_freq.items()}

def calculate_tfidf(tf_values, idf_values):
    """Combine per-document TF values with corpus-wide IDF values.

    Args:
        tf_values: Mapping of word -> term frequency for one document.
        idf_values: Mapping of word -> inverse document frequency.

    Returns:
        A dict mapping each word in *tf_values* to ``tf * idf``.

    Raises:
        KeyError: If a word in *tf_values* has no entry in *idf_values*.
    """
    return {term: weight * idf_values[term] for term, weight in tf_values.items()}

# Sample corpus
documents = [
    "Python is a popular programming language",
    "Java is also a popular programming language",
    "Machine learning is a useful skill to have",
    "Python is a great language for machine learning",
]

# Tokenize every document
words_list = list(map(tokenize, documents))

# Per-document word frequencies
word_counts_list = list(map(count_words, words_list))

# Term frequencies: each count divided by that document's total word count
total_words_list = [sum(counts.values()) for counts in word_counts_list]
tf_values_list = list(map(calculate_tf, word_counts_list, total_words_list))

# Inverse document frequencies over the whole corpus
idf_values = calculate_idf(words_list)

# TF-IDF scores, one dict per document
tfidf_values_list = [calculate_tfidf(tf, idf_values) for tf in tf_values_list]

# Print the scores, with a blank line after each document
for doc_number, tfidf_values in enumerate(tfidf_values_list, start=1):
    print("Document", doc_number)
    for word, value in tfidf_values.items():
        print(f"{word}: {value:.4f}")
    print()

输出结果如下：

Document 1
Python: 0.1155
is: 0.0000
a: 0.0000
popular: 0.1155
programming: 0.1155
language: 0.0479

Document 2
Java: 0.1980
is: 0.0000
also: 0.1980
a: 0.0000
popular: 0.0990
programming: 0.0990
language: 0.0411

Document 3
Machine: 0.1733
learning: 0.0866
is: 0.0000
a: 0.0000
useful: 0.1733
skill: 0.1733
to: 0.1733
have: 0.1733

Document 4
Python: 0.0866
is: 0.0000
a: 0.0000
great: 0.1733
language: 0.0360
for: 0.1733
machine: 0.1733
learning: 0.0866