C++ 英文文本单词统计：基础版和进阶版实现

本程序使用 C++ 编程语言实现英文文本单词统计，包含基础版和进阶版两种实现方式。

需求

现在需要统计若干段文字（英文）中的单词数量，并且还需统计每个单词出现的次数。

注 1: 单词之间以空格（1 个或多个空格）为间隔。 注 2: 忽略空行或者空格行。

版本说明

基础版: 统计时，区分字母大小写，且不删除指定标点符号。

进阶版:

统计前，需要从文字中删除指定标点符号 !.,:*?。注意：所谓的删除，就是用 1 个空格替换掉相应字符。
统计单词时需要忽略单词的大小写。

输入说明

若干行英文，最后以 !!!!! 为结束。

输出说明

单词数量
出现次数排名前 10 的单词及其出现次数（按出现次数降序排序；次数相同时，按单词本身的字典序升序排序）。

基础版实现

#include <iostream>
#include <string>
#include <map>
using namespace std;

// Basic version of the word counter.
//
// Reads lines from standard input until a line that is exactly "!!!!!" (or
// end of input). Each line is split on runs of one or more spaces; every
// resulting token counts as a word. Words are case-sensitive and punctuation
// is kept as part of the word.
//
// Prints the total number of word occurrences, then up to ten words with the
// highest counts (descending by count, ascending by word on ties).
int main() {
    std::string line;
    std::map<std::string, int> word_count;  // word -> number of occurrences
    int total_count = 0;                    // all occurrences, duplicates included

    // Read text until the "!!!!!" terminator line.
    while (std::getline(std::cin, line)) {
        if (line == "!!!!!") {
            break;
        }
        // Positions are kept as size_type so the comparison with npos is exact.
        // For an empty or all-space line the first search already yields npos,
        // so such lines are skipped without a separate check.
        std::string::size_type start = line.find_first_not_of(' ');
        while (start != std::string::npos) {
            std::string::size_type end = line.find(' ', start);
            if (end == std::string::npos) {
                end = line.size();
            }
            ++word_count[line.substr(start, end - start)];
            ++total_count;
            start = line.find_first_not_of(' ', end);
        }
    }

    // Report the total number of words.
    std::cout << "Total words: " << total_count << '\n';

    // Index the words by count, largest count first. A local comparator is
    // used so nothing beyond <iostream>, <string> and <map> is required.
    // Elements with equal keys stay in insertion order, and word_count is
    // traversed in ascending word order, so ties come out alphabetically.
    auto by_count_desc = [](int lhs, int rhs) { return lhs > rhs; };
    std::multimap<int, std::string, decltype(by_count_desc)> count_word(by_count_desc);
    for (const auto& entry : word_count) {
        count_word.insert({entry.second, entry.first});
    }

    // Print at most the ten most frequent words as "word: count".
    std::cout << "Top 10 words:" << '\n';
    int shown = 0;
    for (auto it = count_word.begin(); it != count_word.end() && shown < 10; ++it, ++shown) {
        std::cout << it->second << ": " << it->first << '\n';
    }

    return 0;
}

进阶版实现

#include <algorithm>
#include <cctype>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>
using namespace std;

// Replaces every occurrence of the punctuation characters ! . , : * ? in
// `str` with a single space, in place. The length of `str` is unchanged.
void remove_punct(std::string& str) {
    const std::string targets = "!.,:*?";
    for (char& ch : str) {
        if (targets.find(ch) != std::string::npos) {
            ch = ' ';
        }
    }
}

// Converts every character of `str` to lower case, in place.
//
// Each character is passed to std::tolower as an unsigned char: calling
// tolower with a negative char value (any byte >= 0x80 where char is signed)
// is undefined behaviour.
void to_lowercase(std::string& str) {
    std::transform(str.begin(), str.end(), str.begin(),
                   [](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });
}

// Ordering predicate for (word, count) pairs: a pair with a larger count
// sorts first; pairs with equal counts are ordered by ascending word.
bool cmp(const std::pair<std::string, int>& p1, const std::pair<std::string, int>& p2) {
    if (p1.second == p2.second) {
        return p1.first < p2.first;
    }
    return p2.second < p1.second;
}

// Advanced version of the word counter.
//
// Reads lines from standard input until a line that is exactly "!!!!!" (or
// end of input). Before counting, each line has the characters ! . , : * ?
// replaced by spaces (remove_punct) and is converted to lower case
// (to_lowercase). The line is then split on runs of one or more spaces.
//
// Prints the total number of word occurrences, then up to ten words with the
// highest counts, ordered by cmp (descending count, ascending word on ties).
int main() {
    std::string line;
    std::map<std::string, int> word_count;  // word -> number of occurrences
    int total_count = 0;                    // all occurrences, duplicates included

    // Read text until the "!!!!!" terminator line.
    while (std::getline(std::cin, line)) {
        // The terminator must be tested on the raw line, before remove_punct
        // turns its '!' characters into spaces.
        if (line == "!!!!!") {
            break;
        }
        // Strip punctuation and fold case before splitting.
        remove_punct(line);
        to_lowercase(line);

        // Positions are kept as size_type so the comparison with npos is exact.
        // A line that is empty or consists only of spaces (possibly after
        // punctuation removal) yields npos immediately and adds no words.
        std::string::size_type start = line.find_first_not_of(' ');
        while (start != std::string::npos) {
            std::string::size_type end = line.find(' ', start);
            if (end == std::string::npos) {
                end = line.size();
            }
            ++word_count[line.substr(start, end - start)];
            ++total_count;
            start = line.find_first_not_of(' ', end);
        }
    }

    // Report the total number of words.
    std::cout << "Total words: " << total_count << '\n';

    // Copy the (word, count) entries into a vector and rank them with cmp.
    std::vector<std::pair<std::string, int>> count_word(word_count.begin(), word_count.end());
    std::sort(count_word.begin(), count_word.end(), cmp);

    // Print at most the ten most frequent words as "word: count".
    std::cout << "Top 10 words:" << '\n';
    int shown = 0;
    for (auto it = count_word.begin(); it != count_word.end() && shown < 10; ++it, ++shown) {
        std::cout << it->first << ": " << it->second << '\n';
    }

    return 0;
}