C语言实现文件夹遍历并分组相同内容的文件

这个问题可以分为两个部分来解决：首先需要遍历文件夹，然后需要找出内容完全相同的文件。

遍历文件夹可以使用 dirent.h 和 sys/stat.h 这两个头文件提供的接口来实现，具体步骤如下：

使用 opendir() 函数打开文件夹，获取该文件夹的目录流指针（DIR *）。
读取文件夹中的文件，使用 readdir() 函数来获取文件名。
对于每个文件，使用 stat() 函数来获取文件属性，判断是否为文件夹。
如果是文件夹，则递归调用自身来遍历该文件夹。
如果是文件，则将文件名和文件属性存储到一个数组中。

找出内容完全相同的文件可以使用 md5 算法来实现，具体步骤如下：

对于每个文件，使用 fopen() 函数打开文件，然后使用 fread() 函数读取文件内容。
使用 md5 算法计算文件内容的哈希值，可以使用 OpenSSL 库中的 MD5_Init()、MD5_Update()、MD5_Final() 函数分块计算（内容已全部在内存中时，也可以直接调用 MD5() 函数一次算完）。
将哈希值和文件名存储到一个表中（下面的示例代码为了简单，使用数组加线性查找；文件数量较多时可以换成真正的哈希表）。
遍历这个表，将哈希值相同的文件名分为一组。注意：哈希值相同只能说明内容极大概率相同，如果需要严格保证"完全相同"，还应对同组文件再做一次逐字节比较。

下面是一个简单的示例代码，可以遍历指定的文件夹，并找出内容完全相同的文件。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <dirent.h>
#include <sys/stat.h>

#include <openssl/md5.h>

/* Capacity of the file and digest-group arrays; traversal stops recording files beyond it. */
#define MAX_FILES 1000
/*
 * Size in bytes of the digest buffers used in this file.
 * NOTE(review): <openssl/md5.h> presumably defines this macro already --
 * confirm whether this redefinition is needed.
 */
#define MD5_DIGEST_LENGTH 16

/* One non-directory entry found during traversal. */
typedef struct {
    /* Path of the entry, built as "<directory>/<entry name>"; limited to 255 characters plus the terminator. */
    char name[256];
    /* Attributes of the entry as reported by stat(). */
    struct stat st;
} file_info;

/* A group of files that share the same MD5 digest. */
typedef struct {
    /* Digest shared by every file in the group. */
    unsigned char md5[MD5_DIGEST_LENGTH];
    /* Number of entries in `files`. */
    int count;
    /*
     * Heap-allocated array of `count` pointers. Each pointer refers to the
     * `name` buffer of a file_info entry; the strings are not copied.
     */
    char **files;
} md5_info;

/*
 * Report whether `path` names a directory.
 *
 * Returns the result of S_ISDIR() on the path's mode (non-zero for a
 * directory) when stat() succeeds, and 0 when stat() fails.
 */
int is_directory(const char *path) {
    struct stat info;

    if (stat(path, &info) != 0) {
        return 0;
    }
    return S_ISDIR(info.st_mode);
}

/*
 * Recursively record every non-directory entry found below `path`.
 *
 * path  - directory to scan.
 * files - caller-provided array with room for at least MAX_FILES entries.
 * count - in/out: number of entries already stored in `files`; incremented
 *         once for each entry recorded here.
 *
 * The following are skipped silently: directories that cannot be opened,
 * entries whose joined path does not fit in file_info.name, and entries
 * for which stat() fails. Once MAX_FILES entries have been stored, further
 * files are ignored.
 */
void traverse_directory(const char *path, file_info *files, int *count) {
    DIR *dir = opendir(path);
    if (dir == NULL) {
        return;
    }

    struct dirent *entry;
    while ((entry = readdir(dir)) != NULL) {
        /* Skip the self/parent links before doing any work on them. */
        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
            continue;
        }

        /* Sized like the destination so a path that fits here fits there. */
        char full_path[sizeof files[0].name];
        int written = snprintf(full_path, sizeof full_path, "%s/%s", path, entry->d_name);
        if (written < 0 || (size_t)written >= sizeof full_path) {
            continue; /* a truncated path would name the wrong file */
        }

        /* One stat() call serves both the directory test and the stored attributes. */
        struct stat st;
        if (stat(full_path, &st) != 0) {
            continue;
        }

        if (S_ISDIR(st.st_mode)) {
            traverse_directory(full_path, files, count);
        } else if (*count < MAX_FILES) {
            memcpy(files[*count].name, full_path, (size_t)written + 1);
            files[*count].st = st;
            (*count)++;
        }
    }
    closedir(dir);
}

void md5_file(const char *path, unsigned char *md5) {
    FILE *file = fopen(path, 'rb');
    if (file == NULL) {
        return;
    }
    MD5_CTX ctx;
    MD5_Init(&ctx);
    char buffer[1024];
    int len;
    while ((len = fread(buffer, 1, sizeof(buffer), file)) > 0) {
        MD5_Update(&ctx, buffer, len);
    }
    MD5_Final(md5, &ctx);
    fclose(file);
}

void group_files(file_info *files, int count, md5_info *md5s, int *md5_count) {
    for (int i = 0; i < count; i++) {
        unsigned char md5[MD5_DIGEST_LENGTH];
        md5_file(files[i].name, md5);
        int found = 0;
        for (int j = 0; j < *md5_count; j++) {
            if (memcmp(md5, md5s[j].md5, MD5_DIGEST_LENGTH) == 0) {
                found = 1;
                md5s[j].count++;
                md5s[j].files = realloc(md5s[j].files, sizeof(char*) * md5s[j].count);
                md5s[j].files[md5s[j].count - 1] = files[i].name;
                break;
            }
        }
        if (!found) {
            md5_info md5_info;
            memcpy(md5_info.md5, md5, MD5_DIGEST_LENGTH);
            md5_info.count = 1;
            md5_info.files = malloc(sizeof(char*));
            md5_info.files[0] = files[i].name;
            md5s[*md5_count] = md5_info;
            (*md5_count)++;
        }
    }
}

/*
 * Entry point: scan the directory given as argv[1] and print every group of
 * two or more files whose contents hash to the same MD5 digest.
 *
 * Returns 1 after printing a usage line when no directory is given,
 * otherwise 0.
 */
int main(int argc, char **argv) {
    if (argc < 2) {
        /* argv[0] is only guaranteed to be usable when argc > 0. */
        printf("Usage: %s <directory>\n", argc > 0 ? argv[0] : "program");
        return 1;
    }
    char *path = argv[1];

    /* Static storage keeps these large arrays off the stack. */
    static file_info files[MAX_FILES];
    static md5_info md5s[MAX_FILES];

    int count = 0;
    traverse_directory(path, files, &count);

    int md5_count = 0;
    group_files(files, count, md5s, &md5_count);

    for (int i = 0; i < md5_count; i++) {
        /* Only digests shared by more than one file are duplicates. */
        if (md5s[i].count > 1) {
            printf("Group %d:\n", i + 1);
            for (int j = 0; j < md5s[i].count; j++) {
                printf("%s\n", md5s[i].files[j]);
            }
        }
        /* The member arrays were heap-allocated by group_files. */
        free(md5s[i].files);
        md5s[i].files = NULL;
    }
    return 0;
}