Python：两个中文文件内容对比查重的高效代码

以下是一个简单的实现，可以比较两个中文文件的内容，并计算出重复部分所占的百分比。

import jieba
from collections import Counter

def compare_files(file1, file2):
    """Return the token-level overlap of two UTF-8 text files as a percentage.

    Both files are read in full and segmented with ``jieba.cut``. The token
    lists are compared as multisets: for every distinct token the smaller of
    its two counts goes into the numerator and the larger into the
    denominator, and the ratio is scaled to 0-100.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        The overlap as a float between 0 and 100. ``0.0`` is returned when
        neither file produces any token (for example, both are empty), since
        there is nothing that could be duplicated.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If either file is not valid UTF-8.
    """
    with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
        text1 = f1.read()
        text2 = f2.read()

    counter1 = Counter(jieba.cut(text1))
    counter2 = Counter(jieba.cut(text2))

    # Counter's `&` keeps the per-token minimum count and `|` the per-token
    # maximum, which is exactly the multiset intersection / union needed here.
    common_count = sum((counter1 & counter2).values())
    total_count = sum((counter1 | counter2).values())

    # No tokens on either side: avoid dividing by zero.
    if total_count == 0:
        return 0.0

    return common_count / total_count * 100

if __name__ == "__main__":
    # Run the sample comparison only when executed as a script, so that
    # importing this module for compare_files does not try to open the
    # hard-coded sample files.
    file1 = 'file1.txt'
    file2 = 'file2.txt'

    similarity = compare_files(file1, file2)
    print(f"重复部分的百分比：{similarity:.2f}%")

这段代码使用 jieba 库进行中文分词，再用 Counter 统计每个词的出现次数。重复百分比的计算方式为：对两个文件中出现过的每个词，取它在两个文件中出现次数的较小值求和作为分子，取较大值求和作为分母，二者相除后乘以 100。运行前请确保已经安装了 jieba 库（例如执行 pip install jieba）。