C 语言实战——词法分析器

介绍

词法分析：对源文件逐字符扫描，将字符序列归类并切分为一个个记号（token）。

运行效果

交互式的词法分析：

对 .c 文件的词法分析：

// hello.c
// Minimal sample program: prints "hello world" followed by a newline and
// returns 0.
// NOTE(review): printf is called without #include <stdio.h>; presumably the
// include is left out on purpose to keep the sample short — confirm.
int main(void) {
    printf("hello world\n");
    return 0;
}

token 枚举类型

// Token categories for the C lexer.
// Each enumerator carries its numeric value explicitly; the values run
// from 0 to 73 without gaps.
typedef enum {

    // single-character tokens
    TOKEN_LEFT_PAREN      = 0,   // '('
    TOKEN_RIGHT_PAREN     = 1,   // ')'
    TOKEN_LEFT_BRACKET    = 2,   // '['
    TOKEN_RIGHT_BRACKET   = 3,   // ']'
    TOKEN_LEFT_BRACE      = 4,   // '{'
    TOKEN_RIGHT_BRACE     = 5,   // '}'
    TOKEN_COMMA           = 6,   // ','
    TOKEN_DOT             = 7,   // '.'
    TOKEN_SEMICOLON       = 8,   // ';'
    TOKEN_TILDE           = 9,   // '~'

    // one or two character tokens
    TOKEN_PLUS            = 10,  // '+'
    TOKEN_PLUS_PLUS       = 11,  // '++'
    TOKEN_PLUS_EQUAL      = 12,  // '+='
    TOKEN_MINUS           = 13,  // '-'
    TOKEN_MINUS_MINUS     = 14,  // '--'
    TOKEN_MINUS_EQUAL     = 15,  // '-='
    TOKEN_MINUS_GREATER   = 16,  // '->'
    TOKEN_STAR            = 17,  // '*'
    TOKEN_STAR_EQUAL      = 18,  // '*='
    TOKEN_SLASH           = 19,  // '/'
    TOKEN_SLASH_EQUAL     = 20,  // '/='
    TOKEN_PERCENT         = 21,  // '%'
    TOKEN_PERCENT_EQUAL   = 22,  // '%='
    TOKEN_AMPER           = 23,  // '&'
    TOKEN_AMPER_EQUAL     = 24,  // '&='
    TOKEN_AMPER_AMPER     = 25,  // '&&'
    TOKEN_PIPE            = 26,  // '|'
    TOKEN_PIPE_EQUAL      = 27,  // '|='
    TOKEN_PIPE_PIPE       = 28,  // '||'
    TOKEN_HAT             = 29,  // '^'
    TOKEN_HAT_EQUAL       = 30,  // '^='
    TOKEN_EQUAL           = 31,  // '='
    TOKEN_EQUAL_EQUAL     = 32,  // '=='
    TOKEN_BANG            = 33,  // '!'
    TOKEN_BANG_EQUAL      = 34,  // '!='
    TOKEN_LESS            = 35,  // '<'
    TOKEN_LESS_EQUAL      = 36,  // '<='
    TOKEN_LESS_LESS       = 37,  // '<<'
    TOKEN_GREATER         = 38,  // '>'
    TOKEN_GREATER_EQUAL   = 39,  // '>='
    TOKEN_GREATER_GREATER = 40,  // '>>'

    // literal
    TOKEN_IDENTIFIER      = 41,
    TOKEN_CHARACTER       = 42,
    TOKEN_STRING          = 43,
    TOKEN_NUMBER          = 44,

    // keywords
    TOKEN_SIGNED          = 45,
    TOKEN_UNSIGNED        = 46,
    TOKEN_CHAR            = 47,
    TOKEN_SHORT           = 48,
    TOKEN_INT             = 49,
    TOKEN_LONG            = 50,
    TOKEN_FLOAT           = 51,
    TOKEN_DOUBLE          = 52,
    TOKEN_STRUCT          = 53,
    TOKEN_UNION           = 54,
    TOKEN_ENUM            = 55,
    TOKEN_VOID            = 56,
    TOKEN_IF              = 57,
    TOKEN_ELSE            = 58,
    TOKEN_SWITCH          = 59,
    TOKEN_CASE            = 60,
    TOKEN_DEFAULT         = 61,
    TOKEN_WHILE           = 62,
    TOKEN_DO              = 63,
    TOKEN_FOR             = 64,
    TOKEN_BREAK           = 65,
    TOKEN_CONTINUE        = 66,
    TOKEN_RETURN          = 67,
    TOKEN_GOTO            = 68,
    TOKEN_CONST           = 69,
    TOKEN_SIZEOF          = 70,
    TOKEN_TYPEDEF         = 71,

    // others
    TOKEN_ERROR           = 72,
    TOKEN_EOF             = 73

} TokenType;

Trie 树

用于区分 C 语言关键字和普通标识符（变量名、函数名等）

/*
 * Classify the lexeme spanning scanner.start .. scanner.current as a C
 * keyword or a plain identifier.
 *
 * The nested switches form a hand-written trie: the first character (and,
 * where several keywords share it, the second) selects a branch, and
 * checkKeyword() is then asked about the rest of the lexeme.
 *
 * NOTE(review): checkKeyword() is defined elsewhere. The calls below assume
 * its arguments are (offset of the remainder, length of the remainder,
 * remainder text, token type to report on a match), so the length argument
 * must equal strlen() of the remainder text — confirm against its definition.
 *
 * Returns the result of checkKeyword() for branches with a single candidate
 * keyword; for branches with two candidates, the matching keyword's type or
 * TOKEN_IDENTIFIER when neither matches; TOKEN_IDENTIFIER for any prefix
 * that starts no keyword.
 */
static TokenType identifierType() {
    int len = (int) (scanner.current - scanner.start);
    // A one-character lexeme has no second character; the blank placeholder
    // matches none of the inner case labels.
    char c1 = scanner.start[0], c2 = ' ';
    if (len > 1) {
        c2 = scanner.start[1];
    }
    switch (c1) {
        case 'b':
            return checkKeyword(1, 4, "reak", TOKEN_BREAK);
        case 'c':
            switch (c2) {
                case 'a':
                    return checkKeyword(2, 2, "se", TOKEN_CASE);
                case 'h':
                    return checkKeyword(2, 2, "ar", TOKEN_CHAR);
                case 'o':
                    // "const" and "continue" share the prefix "co".
                    if (checkKeyword(2, 3, "nst", TOKEN_CONST) == TOKEN_CONST) {
                        return TOKEN_CONST;
                    }
                    if (checkKeyword(2, 6, "ntinue", TOKEN_CONTINUE) ==
                        TOKEN_CONTINUE) {
                        return TOKEN_CONTINUE;
                    }
                    return TOKEN_IDENTIFIER;
                default:
                    return TOKEN_IDENTIFIER;
            }
        case 'd':
            switch (c2) {
                case 'e':
                    return checkKeyword(2, 5, "fault", TOKEN_DEFAULT);
                case 'o':
                    // "double" and "do" share the prefix "do"; "do" itself
                    // has an empty remainder.
                    if (checkKeyword(2, 4, "uble", TOKEN_DOUBLE) == TOKEN_DOUBLE) {
                        return TOKEN_DOUBLE;
                    }
                    if (checkKeyword(2, 0, "", TOKEN_DO) == TOKEN_DO) {
                        return TOKEN_DO;
                    }
                    return TOKEN_IDENTIFIER;
                default:
                    return TOKEN_IDENTIFIER;
            }
        case 'e':
            switch (c2) {
                case 'n':
                    return checkKeyword(2, 2, "um", TOKEN_ENUM);
                case 'l':
                    return checkKeyword(2, 2, "se", TOKEN_ELSE);
                default:
                    return TOKEN_IDENTIFIER;
            }
        case 'f':
            switch (c2) {
                case 'l':
                    return checkKeyword(2, 3, "oat", TOKEN_FLOAT);
                case 'o':
                    return checkKeyword(2, 1, "r", TOKEN_FOR);
                default:
                    return TOKEN_IDENTIFIER;
            }
        case 'g':
            return checkKeyword(1, 3, "oto", TOKEN_GOTO);
        case 'i':
            switch (c2) {
                case 'n':
                    return checkKeyword(2, 1, "t", TOKEN_INT);
                case 'f':
                    return checkKeyword(2, 0, "", TOKEN_IF);
                default:
                    return TOKEN_IDENTIFIER;
            }
        case 'l':
            return checkKeyword(1, 3, "ong", TOKEN_LONG);
        case 'r':
            return checkKeyword(1, 5, "eturn", TOKEN_RETURN);
        case 's':
            switch (c2) {
                case 'h':
                    return checkKeyword(2, 3, "ort", TOKEN_SHORT);
                case 't':
                    return checkKeyword(2, 4, "ruct", TOKEN_STRUCT);
                case 'w':
                    return checkKeyword(2, 4, "itch", TOKEN_SWITCH);
                case 'i':
                    // "signed" and "sizeof" share the prefix "si".
                    if (checkKeyword(2, 4, "gned", TOKEN_SIGNED) == TOKEN_SIGNED) {
                        return TOKEN_SIGNED;
                    }
                    // The remainder of "sizeof" after "si" is "zeof"
                    // (4 characters).
                    if (checkKeyword(2, 4, "zeof", TOKEN_SIZEOF) ==
                        TOKEN_SIZEOF) {
                        return TOKEN_SIZEOF;
                    }
                    return TOKEN_IDENTIFIER;
                default:
                    return TOKEN_IDENTIFIER;
            }
        case 't':
            return checkKeyword(1, 6, "ypedef", TOKEN_TYPEDEF);
        case 'v':
            return checkKeyword(1, 3, "oid", TOKEN_VOID);
        case 'w':
            return checkKeyword(1, 4, "hile", TOKEN_WHILE);
        case 'u':
            // "unsigned" and "union" share the prefix "un". The remainder of
            // "unsigned" after 'u' is "nsigned" (7 characters).
            if (checkKeyword(1, 7, "nsigned", TOKEN_UNSIGNED) == TOKEN_UNSIGNED) {
                return TOKEN_UNSIGNED;
            }
            if (checkKeyword(1, 4, "nion", TOKEN_UNION) == TOKEN_UNION) {
                return TOKEN_UNION;
            }
            return TOKEN_IDENTIFIER;
        default:
            return TOKEN_IDENTIFIER;
    }
}

刘司元 / C语言分词器

C 语言实战——词法分析器

介绍

运行效果

token 枚举类型

Trie 树

简介

发行版

贡献者

近期动态

刘司元 / C语言分词器

C 语言实战——词法分析器

介绍

运行效果

token 枚举类型

Trie 树

简介

发行版

贡献者

近期动态

搜索帮助

刘司元 / C语言分词器