C--语言词法分析器：基于GNU Flex的实现

本教程将指导您使用C语言和GNU Flex构建一个简单的C--语言词法分析器。该分析器将识别C--代码中的词法单元，并检测潜在的词法错误。

1.3.1 实验要求

您的程序需要识别以下类型的词法错误：

词法错误（错误类型 A）： 出现C--词法中未定义的字符，或者不符合C--词法单元定义的字符。例如，在C--代码中使用'~'字符。

1.3.2 输入格式

程序接受一个包含C--源代码的文本文件作为输入。您可以通过命令行参数指定输入文件名。例如：

```bash
./cc test1.c--
```

其中，'cc'是您的程序名称，'test1.c--'是包含C--代码的输入文件。

1.3.3 输出格式

1.3.3.1 错误处理

如果输入文件包含词法错误，程序将输出以下格式的错误信息：

Error type [错误类型] at Line [行号]: [说明文字]

例如：

Error type A at Line 4: Mysterious character '~'.

1.3.3.2 词法单元输出

如果没有词法错误，程序将输出每个词法单元的名称和对应的词素，格式如下：

[词法单元名称]: [词素]

例如：

KEYWORD: if
IDENTIFIER: myVariable
OPERATOR: +
CONSTANT: 10

1.3.4 词法单元示例

下表列出了一些常见的C--词法单元、它们的非正式描述以及示例词素：

| 词法单元 | 非正式描述 | 词素示例 |
|---|---|---|
| if | 字符 'i', 'f' | if |
| else | 字符 'e', 'l', 's', 'e' | else |
| comparison | '<' 或 '>' 或 '<=' 或 '>=' 或 '==' 或 '!=' | <=, != |
| id | 字母开头的字母/数字串 | Pi, score, D2 |
| number | 任何数字常量 | 3.14159, 0, 6.02e23 |
| literal | 在两个单引号之间，除单引号以外的任何字符 | 'core dumped' |

1.3.5 词法单元类别

以下类别涵盖了大多数程序设计语言中的常见词法单元：

1. 关键字： 例如 'if', 'else', 'while' 等。
2. 运算符： 例如 '+', '-', '*', '/', '==', '<=' 等等。
3. 标识符： 表示变量、函数和其他程序实体的名称。
4. 常量： 例如数字 (10, 3.14), 字符串 ('hello') 等等。
5. 界符： 例如 '(', ')', '{', '}', ';', ',' 等等。

1.3.6 代码示例

以下是使用C语言实现的简单词法分析器代码：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Size, in bytes, of the line buffer read by analyze() and of Token.lexeme. */
#define MAX_LEN 256

/*
 * Category assigned to a lexeme.
 *
 * The numeric values are written out explicitly; they are the same values
 * the compiler assigns implicitly (0 through 5, in declaration order).
 */
typedef enum {
    ERROR      = 0, /* lexeme could not be classified */
    KEYWORD    = 1, /* reserved word */
    OPERATOR   = 2, /* operator symbol */
    IDENTIFIER = 3, /* name starting with a letter */
    CONSTANT   = 4, /* literal starting with a digit */
    DELIMITER  = 5  /* punctuation */
} TokenType;

/* One classified lexeme: its category together with a copy of its text. */
typedef struct {
    TokenType type;            /* category of the lexeme */
    char      lexeme[MAX_LEN]; /* NUL-terminated text of the lexeme */
} Token;

/*
 * Print one lexical-error report on standard output, in the form
 *
 *     Error type <letter> at Line <lineNum>: <errorMsg>
 *
 * type     - error-class letter to print. The parameter is declared as
 *            TokenType, but the value is emitted with %c, so callers pass a
 *            character code such as 'A'.
 * lineNum  - line number to report, as counted by the caller.
 * errorMsg - human-readable description; NULL is printed as an empty string.
 */
void lexError(TokenType type, int lineNum, char *errorMsg) {
    /*
     * The format must be a double-quoted string literal (a single-quoted
     * sequence is a multi-character integer constant, not a string).
     * The cast gives %c the int argument it requires, and each report is
     * terminated by a newline so that consecutive reports do not run together.
     */
    printf("Error type %c at Line %d: %s\n",
           (int)type, lineNum, errorMsg != NULL ? errorMsg : "");
}

TokenType getTokenType(char *lexeme) { if (strcmp(lexeme, 'if') == 0 || strcmp(lexeme, 'else') == 0) { return KEYWORD; } else if (strcmp(lexeme, '<') == 0 || strcmp(lexeme, '>') == 0 || strcmp(lexeme, '<=') == 0 || strcmp(lexeme, '>=') == 0 || strcmp(lexeme, '==') == 0 || strcmp(lexeme, '!=') == 0) { return OPERATOR; } else if (isalpha(lexeme[0])) { return IDENTIFIER; } else if (isdigit(lexeme[0])) { return CONSTANT; } else { return DELIMITER; }}

/*
 * Print one token on standard output as "<CATEGORY>: <lexeme>", followed by
 * a newline.
 *
 * token - the token to print; token.lexeme must be NUL-terminated. Any
 *         token.type without its own case is printed as "ERROR".
 */
void printToken(Token token) {
    /* Points at a string literal, hence const. */
    const char *type;

    switch (token.type) {
    case KEYWORD:
        type = "KEYWORD";
        break;
    case OPERATOR:
        type = "OPERATOR";
        break;
    case IDENTIFIER:
        type = "IDENTIFIER";
        break;
    case CONSTANT:
        type = "CONSTANT";
        break;
    case DELIMITER:
        type = "DELIMITER";
        break;
    default:
        type = "ERROR";
        break;
    }

    /* One token per line. */
    printf("%s: %s\n", type, token.lexeme);
}

void analyze(char *filename) { FILE *file = fopen(filename, 'r'); if (file == NULL) { printf('File not found. '); return; }

char line[MAX_LEN];    int lineNum = 1;

while (fgets(line, MAX_LEN, file)) {        char *token = strtok(line, '

'); while (token != NULL) { Token currentToken; strcpy(currentToken.lexeme, token); currentToken.type = getTokenType(token);

        if (currentToken.type == ERROR) {                lexError('A', lineNum, 'Unknown character');            } else {                printToken(currentToken);            }

        token = strtok(NULL, '

'); } lineNum++; }

fclose(file);}

int main(int argc, char *argv[]) { if (argc != 2) { printf('Usage: %s ', argv[0]); return 0; }

analyze(argv[1]);

return 0