如题,手写词法分析器。
总览
词法分析,就是将输入分解成一个个独立的词法符号,称为单词符号,即token。token可以理解成程序设计语言中具有独立含义的最小词法单位,在这个意义上和自然语言的单词含义有点像。但是,token不仅包括一般意义上的单词,还包括标点符号、操作符、分隔符等等。
我选择sql的常用语句作为输入语言,进行词法分析。主要的过程如下:
- 梳理语言内容,书写必要的正则表达式
- 将正则表达式转换成NFA
- 将NFA转换成DFA
- 用代码实现DFA
梳理语言内容,书写必要的正则表达式
分析所选的语言,整理其中的类型,书写对应的正则表达式。整理结果如下,正则表达式都比较简单,我就不说了。
语言内容的梳理结果
关键字:create, table, int, not, null, char, float, primary, key, index, on, drop, select, from, where, and, insert, into, values, delete, quit, execfile
符号:( ) , ; = <> < > <= >=
数字:123, 456.789
字符串: 'abcd\r\n\\ef23&*()'
标识符:[a-zA-Z_][a-zA-Z0-9_]*
注释: #开头 \n结尾
涵盖的sql语法
# DataType: int float char(n)
# (1<=n<=255)
# only one line
# OPType: = <> < > <= >=
create table [tablename] (
[attribute] int not null,
[attribute] char(20),
[attribute] float,
primary key([attribute])
);
# max : 32 attributes
create index [indexname] on [tablename(attribute)];
drop table [tablename];
drop index [indexname];
select [attribute],[attribute] from [tablename] where [attribute] > [number] and [attribute] < [number];
insert into [tablename] ([attribute list])values([values list]);
delete from [tablename] where [attribute] > [number];
quit;
execfile [filename];
将正则表达式转换成NFA
正则表达式是不能被计算机理解的,所以我们需要把它转换成DFA,中间以NFA作为过渡。这部分的主要知识是如何将正则表达式转换成NFA。
直接的做法和简单的理论,可以参考轮子哥的 《构建可配置的词法分析器》;
形式化的定义和相关的理论知识,可以参考 《Elements of the Theory of Computation》第二章前3节的内容。
将NFA转换成DFA
NFA转换成DFA可以参考《Elements of the Theory of Computation》第二章第2节的内容。
用代码实现DFA
我最后得到的有限自动机如下,可以发现,指向End状态我使用了虚线,这是因为End状态其实是不存在的,它就是Start状态。最下面的长短间隔虚线部分是NFA而不是DFA,是因为处理关键字和标识符的有限自动机有些繁琐,每一个关键字都需要关键字字符数个状态,代码写起来实在是太臃肿,我小小hack了一下,直接用了NFA来处理。
万事俱备了,只欠写代码了。具体的过程如下。
- 用代码定义图中的状态和类型
有些出口只有虚线的状态是不需要的,就像注释掉的几个状态。
// States of the lexer's finite automaton.
// The names left commented out are states of the diagram that are not
// represented in code.
enum class State {
    Start = 0,       // between tokens
    InInteger,       // reading the digits of an integer
    InFloat,         // reading the digits after a decimal point
    InStringHalf,    // inside a quoted string whose closing quote is still pending
    InEscape,        // directly after a backslash inside a string
    //InString,
    InIdentifier,    // reading an identifier
    InComment,       // after '#', up to the end of the line
    //InCommentEnd,
    //InOperator,
    //InKeyword,
    InKeyorId        // first letter may begin either a keyword or an identifier
};
// Category assigned to each token the lexer produces.
enum class TokenType {
    Unknown = 0,   // no category assigned
    Integer,       // e.g. 123
    Float,         // e.g. 456.789
    String,        // text between single quotes
    Identifier,    // [a-zA-Z_][a-zA-Z0-9_]*
    Comment,       // text after '#' up to the end of the line
    Operator,      // punctuation and comparison symbols
    Keyword        // reserved word of the SQL subset
};
- 定义Token的存储结构, 作为编译器,我们不仅需要得到最终的一个个token,还需要在报错时指出错误位置
// One lexical token together with the source position needed for error
// reporting.
struct Token {
    int line;           // line the token is on
    int column;         // column the token is on
    TokenType type;     // category of the token
    std::string value;  // text of the token

    // Builds a token whose text is the character range [b, e).
    Token(int _line, int _column, TokenType _type, const char* b, const char* e)
        : line(_line),
          column(_column),
          type(_type),
          value(b, e) {
    }
};
- 通过下面的方式,把状态图转成代码
// Skeleton of the loop that turns the state diagram into code.
// Assumed to be in scope:
//   char * pos   - points at the character currently being processed
//   State state  - the current state of the automaton
while(*pos){
    switch(state){
        case 某个状态:{              // placeholder: one case per state
            switch(*pos){
                case 某个字符:{      // placeholder: one case per input character
                    // act on this character as the state diagram prescribes
                    break;
                }
            }
            break;
        }
        ...
    }
    ++pos;  // step to the next character; without this the loop never terminates
}
运行实例
实例程序
// Lex a two-statement script and print the resulting tokens.
// NOTE(review): print and print4 are output helpers that are not shown in
// this snippet; the `return 0` implies the snippet lives inside main().
std::string s("create table tname(\nid int,\nmark float,\nname char(100));\n"
              "insert into tname (id,mark,name) values (1,15.6,'cmh');");
sql::Lex lex;
try{
    lex.parse(s);
}catch(const std::exception &e){   // the handler only reads the exception, so bind it as const
    print(e.what());
    return 0;
}
// Copy the tokens out before the lexer is cleared.
std::vector<sql::Token> res = lex.getTokens();
print4(res);
lex.clear();
运行结果:
Value Type Line Column
create Keyword 1 6
table Keyword 1 12
tname Identifier 1 18
( Operator 1 18
id Identifier 2 2
int Keyword 2 6
, Operator 2 6
mark Identifier 3 4
float Keyword 3 10
, Operator 3 10
name Identifier 4 4
char Keyword 4 9
( Operator 4 9
100 Integer 4 13
) Operator 4 13
) Operator 4 14
; Operator 4 15
insert Keyword 5 6
int Keyword 5 10
o Identifier 5 11
tname Identifier 5 17
( Operator 5 18
id Identifier 5 21
, Operator 5 21
mark Identifier 5 26
, Operator 5 26
name Identifier 5 31
) Operator 5 31
values Keyword 5 39
( Operator 5 40
1 Integer 5 42
, Operator 5 42
15.6 Float 5 47
, Operator 5 47
cmh String 5 52
) Operator 5 53
; Operator 5 54
完整代码
#pragma once
#include <iostream>
#include <string>
#include <vector>
#include <exception>
namespace sql{

/// States of the lexer's finite automaton.
/// The names left commented out are states of the diagram that are not
/// represented in code.
enum class State{
    Start,          // between tokens
    InInteger,      // reading the digits of an integer
    InFloat,        // reading the digits after a decimal point
    InStringHalf,   // inside a quoted string whose closing quote is still pending
    InEscape,       // directly after a backslash inside a string
    //InString,
    InIdentifier,   // reading an identifier or keyword
    InComment,      // after '#', up to the end of the line
    //InCommentEnd,
    //InOperator,
    //InKeyword,
    InKeyorId       // kept so existing enumerator values stay valid; Lex never enters it
};

/// Category assigned to each token.
enum class TokenType{
    Unknown,        // never produced by Lex
    Integer,
    Float,
    String,
    Identifier,
    Comment,
    Operator,
    Keyword
};

/// Error thrown by Lex::parse; what() returns the message given at construction.
class myexception: public std::exception{
    std::string s;
public:
    myexception(const std::string &_s):s(_s){ }
    const char* what() const noexcept override{
        return s.c_str();
    }
};

/// One lexical token plus the position needed for error reporting.
struct Token{
    int line;           // 1-based line the token is on
    int column;         // 1-based column of the last character of `value`
                        // (for an empty string literal: the column of its opening quote)
    TokenType type;     // category of the token
    std::string value;  // token text; strings exclude the quotes, comments exclude '#'

    /// Builds a token whose text is the character range [b, e).
    Token(int _line,int _column,TokenType _type,const char* b,const char* e)
        :line(_line),column(_column),type(_type),value(b,e){
    }
};

/// Hand-written lexer for a small SQL subset.
///
/// The scanner is a state machine driven one character at a time. Keywords
/// are recognised by scanning a complete identifier and then looking it up in
/// the keyword table, so a keyword that happens to be the prefix of a longer
/// word (`int` in `into`) is never split off.
class Lex{
    /// Scanning cursor: automaton state, position, and the pending token's bounds.
    struct CurrentState{
        State state;
        int line;
        int column;
        const char * tstart;    // first character of the token being read
        const char * tend;      // character currently being examined
        CurrentState(State s,int l,int c,const char *ts,const char *te)
            :state(s),line(l),column(c),tstart(ts),tend(te){
        }
    };
    std::vector<Token> tokens;
public:
    /// Discards every token collected so far.
    void clear(){
        tokens.clear();
    }

    /// Splits `source` into tokens and appends them to the tokens already held
    /// (call clear() first to start afresh). Scanning stops at the first NUL
    /// character of `source`.
    ///
    /// Tokens pushed before an error are kept.
    ///
    /// @throws myexception when a character cannot start a token, when a
    ///         string contains a raw newline or an unknown escape, or when
    ///         the input ends inside a string.
    void parse(const std::string & source){
        CurrentState current(State::Start,1,1,source.c_str(),source.c_str());
        while(*current.tend){
            const char c = *current.tend;
            switch(current.state){
                case State::Start:{
                    startToken(current);
                    break;
                }
                case State::InInteger:{
                    if(c=='.'){
                        current.state = State::InFloat;
                    }else if(!isDigit(c)){
                        push(current,TokenType::Integer);
                        continue;   // c belongs to the next token: re-examine it in Start
                    }
                    break;
                }
                case State::InFloat:{
                    if(!isDigit(c)){
                        push(current,TokenType::Float);
                        continue;
                    }
                    break;
                }
                case State::InIdentifier:{
                    if(!isIdentifierChar(c)){
                        pushWord(current);
                        continue;
                    }
                    break;
                }
                case State::InStringHalf:{
                    if(c=='\\'){
                        current.state = State::InEscape;
                    }else if(c=='\''){
                        // the closing quote itself is consumed by advance() below
                        push(current,TokenType::String);
                    }else if(c=='\n'){
                        throw myexception("unexpected \\n in string");
                    }
                    break;
                }
                case State::InEscape:{
                    // the escape is validated but kept verbatim in the token text
                    if(c!='n' && c!='r' && c!='\'' && c!='\\'){
                        throw myexception("undefined escape");
                    }
                    current.state = State::InStringHalf;
                    break;
                }
                case State::InComment:{
                    if(c=='\n'){
                        push(current,TokenType::Comment);
                    }
                    break;
                }
                default:{
                    throw myexception("undefined state");
                }
            }
            advance(current);
        }
        // The input may end in the middle of a token: emit it instead of
        // silently dropping it, and report a string that was never closed.
        switch(current.state){
            case State::InInteger:
                push(current,TokenType::Integer);
                break;
            case State::InFloat:
                push(current,TokenType::Float);
                break;
            case State::InIdentifier:
                pushWord(current);
                break;
            case State::InComment:
                push(current,TokenType::Comment);
                break;
            case State::InStringHalf:
            case State::InEscape:
                throw myexception("unexpected end of input in string");
            default:
                break;
        }
    }

    /// Tokens collected by all parse() calls since the last clear().
    const std::vector<Token> & getTokens() const{
        return tokens;
    }
private:
    static bool isDigit(char c){
        return c>='0' && c<='9';
    }
    static bool isIdentifierStart(char c){
        return (c>='a' && c<='z') || (c>='A' && c<='Z') || c=='_';
    }
    static bool isIdentifierChar(char c){
        return isIdentifierStart(c) || isDigit(c);
    }
    /// True when `word` is one of the reserved words (lower case only).
    static bool isKeyword(const std::string & word){
        static const char * const keywords[] = {
            "and","char","create","delete","drop","execfile","float","from",
            "index","insert","int","into","key","not","null","on","primary",
            "quit","select","table","values","where"
        };
        for(const char * keyword : keywords){
            if(word==keyword){
                return true;
            }
        }
        return false;
    }
    /// Consumes the current character and keeps line/column in step with it.
    static void advance(CurrentState & current){
        if(*current.tend=='\n'){
            current.line++;
            current.column = 1;
        }else{
            current.column++;
        }
        current.tend++;
    }
    /// Handles the character seen in the Start state: it either opens a
    /// multi-character token, is a complete operator, or is whitespace.
    void startToken(CurrentState & current){
        const char c = *current.tend;
        current.tstart = current.tend;
        if(isDigit(c)){
            current.state = State::InInteger;
            return;
        }
        if(isIdentifierStart(c)){
            current.state = State::InIdentifier;
            return;
        }
        switch(c){
            case '\'':{
                current.state = State::InStringHalf;
                current.tstart++;   // the opening quote is not part of the value
                break;
            }
            case '#':{
                current.state = State::InComment;
                current.tstart++;   // '#' is not part of the value
                break;
            }
            case '(':case ')':case ',':case ';':case '=':{
                pushOperator(current,1);
                break;
            }
            case '<':{
                // tend[1] is readable: *tend is not the terminating NUL
                const char next = *(current.tend+1);
                pushOperator(current,(next=='=' || next=='>')?2:1);
                break;
            }
            case '>':{
                pushOperator(current,*(current.tend+1)=='='?2:1);
                break;
            }
            case '\n':case ' ':case '\t':{
                break;
            }
            default:{
                throw myexception("error in start state");
            }
        }
    }
    /// Emits the token [tstart, tend) ending just before the current
    /// character and returns the automaton to Start.
    void push(CurrentState & current,TokenType type){
        tokens.push_back(Token(current.line, current.column-1, type, current.tstart, current.tend));
        current.state = State::Start;
    }
    /// Emits a finished word as Keyword when it is reserved, else as Identifier.
    void pushWord(CurrentState & current){
        push(current,TokenType::Identifier);
        Token & word = tokens.back();
        if(isKeyword(word.value)){
            word.type = TokenType::Keyword;
        }
    }
    /// Emits an operator of `length` characters starting at the cursor and
    /// leaves the cursor on its last character, which parse() then consumes.
    void pushOperator(CurrentState & current,int length){
        tokens.push_back(Token(current.line, current.column+length-1, TokenType::Operator,
                               current.tstart, current.tstart+length));
        current.tend += length-1;
        current.column += length-1;
    }
};
}