Example Python program: tokenization, stopword removal, and word-frequency counting
Sure! Below is an example program that tokenizes input text, removes stopwords, and counts word frequencies:
import jieba
from collections import Counter

# Load the stopword list
def load_stopwords(stopwords_file):
    with open(stopwords_file, 'r', encoding='utf-8') as f:
        stopwords = set(line.strip() for line in f)
    return stopwords

# Tokenize the text and remove stopwords
def process_text(text, stopwords):
    # Tokenize with jieba
    seg_list = jieba.cut(text)
    # Drop stopwords and single-character tokens
    filtered_words = [word for word in seg_list if word not in stopwords and len(word) > 1]
    return filtered_words

# Count word frequencies
def count_words(words):
    word_count = Counter(words)
    return word_count

# Read the input text file
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

# Main entry point
def main():
    # Load the stopword list
    stopwords = load_stopwords('stopwords.txt')
    # Read the input text
    text = read_text_file('input.txt')
    # Tokenize and remove stopwords
    filtered_words = process_text(text, stopwords)
    # Count word frequencies
    word_count = count_words(filtered_words)
    # Print words sorted by frequency, most frequent first
    for word, count in word_count.most_common():
        print(word, count)

if __name__ == "__main__":
    main()
Note that this is only a basic example covering tokenization, stopword removal, and word-frequency counting. Depending on your actual needs, you may have to adapt and extend it.
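One common adaptation is to report only the top N most frequent words, which `Counter.most_common` supports directly. Here is a minimal, self-contained sketch of that filtering and counting step; the inline word list and stopword set are made-up placeholders standing in for the output of `jieba.cut` and the contents of `stopwords.txt`:

```python
from collections import Counter

# Hypothetical pre-tokenized input, standing in for jieba.cut() output
words = ["自然", "语言", "处理", "语言", "模型", "语言"]
stopwords = {"模型"}

# Same filtering rule as in process_text: drop stopwords and single-character tokens
filtered = [w for w in words if w not in stopwords and len(w) > 1]

# Keep only the top 2 most frequent words
top_words = Counter(filtered).most_common(2)
print(top_words)
```

Passing an integer to `most_common(n)` returns the `n` highest-count `(word, count)` pairs, so the full loop in `main` could print `word_count.most_common(100)` instead of every word.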