一、实验目的
通过本实验的编程实践,使学生了解词法分析的任务,掌握词法分析程序设计的原理和构造方法,使学生对编译的基本概念、原理和方法有完整的和清楚的理解,并能正确地、熟练地运用。
二、实验内容
用 VC++/VB/JAVA 语言实现对 C 语言子集的源程序进行词法分析。通过输入源程序从左到右对字符串进行扫描和分解,依次输出各个单词的内部编码及单词符号自身值;若遇到错误则显示“Error”,然后跳过错误部分继续显示 ;同时进行标识符登记符号表的管理。
三、程序描述
程序达到的状态转换图如下:
package exp1;
import java.util.*;
import java.io.IOException;
import java.io.StringReader;
import static java.lang.Character.isDigit;
import static java.lang.Character.isLetter;
import static java.lang.Character.isLetterOrDigit;
import static java.lang.Character.isWhitespace;
/**
* 词法分析器
*
* @version 2020-05-18
*/
public class Tokenizer {
/** 读入的字符序列 */
private final StringReader src;
/** Tokenizer的状态是否已经分析过 */
private boolean hasAnalysised;
/** 表示当前是否到达结尾 */
private boolean eof;
/** 字符当前所处的行数 */
private int line;
/** 字符当前行所处的位置 */
private int character;
/** 当前字符在字符序列中的位置,实际应和StringReader的私有属性next相同 */
private int index;
/** 当前字符的前一个字符 */
private char previous;
/** 标志当前是不是已经使用了前一个字符 */
private boolean usePrevious;
/** 上一行的长度 */
private int characterPreviousLine;
/** 关键词表 */
final String[] keywords = new String[] { "do", "end", "for", "if", "printf", "scanf", "then", "while" };
/** 关键词集合,为了快速判断 */
final Set<String> keywordsSet = new HashSet<>(Arrays.asList(keywords));
/** 分隔符起始字符的集合 */
final Set<Character> delimitersSet = new HashSet<>(Arrays.asList(',', ';', '(', ')', '[', ']'));
/** 算数运算符起始字符的集合 */
final Set<Character> operatorsASet = new HashSet<>(Arrays.asList('+', '-', '*', '/'));
/** 常量列表 */
final List<Number> constants = new ArrayList<>();
/** 标识符列表 */
final Set<String> identifiers = new LinkedHashSet<>();
/**
* 词法分析器的构造器
*
* @param src 源代码
*/
public Tokenizer(String src) {
this.src = new StringReader(src);
this.hasAnalysised = false;
this.line = this.character = 1;
this.index = this.characterPreviousLine = 0;
this.previous = 0;
this.eof = this.usePrevious = false;
}
/**
* 检查是否已经分析过
*
* @return 如果已经分析过,返回true
*/
public boolean hasAnalysised() {
return hasAnalysised;
}
/**
* 读取出一个字符串
*
* @return 读取出的字符串
* @throws IOException
*/
private String nextString() throws IOException {
StringBuilder sb = new StringBuilder();
char c;
for (;;) {
c = this.next();
/*
* 以字母开头不是关键词就是标识符,该方法读出一个符合正规式/[A-Za-z]([A-Za-z0-9]*)/的字符串
* 即以字母开头,字母和数字匹配零次或多次的字符串
*/
if (isLetterOrDigit(c)) {
sb.append(c);
} else if (c == 0) {
return sb.toString();
} else {
this.back();
return sb.toString();
}
}
}
/**
* 读取出一个数字字符串
*
* @return 读取出的数字字符串
* @throws IOException
*/
private String nextNumberString() throws IOException {
StringBuilder sb = new StringBuilder();
char c;
for (;;) {
c = this.next();
/*
* 以数字开头可能为常量,或者为出错的单词,此方法读取出一个符合正规式/([0-9])([0-9.]*)/,即以0-9开头,0-9和.匹配零次或多次的字符串
* 对于末尾是.的或者是出现多次.的,表示出现错误,交由numberAnalysis方法处理
*/
if (isDigit(c)) {
sb.append(c);
} else if (c == '.') {
sb.append(c);
} else if (c == 0) {
return sb.toString();
} else {
this.back();
return sb.toString();
}
}
}
/**
* 解析出字符串表示的数字
*
* @param numString 数字字符串
* @return 数字字符串代表的值
*/
private Number parseNumber(String numString) {
/* 传进来的字符串,不是标准的整数,就是标准的浮点数,不会发生NumberFormatException */
try {
return new Integer(numString);
} catch (NumberFormatException e) {
return new Double(numString);
}
}
/**
* 获取下一个非空白字符
*
* @return 到达末尾后返回0,下一个非空白字符
* @throws IOException
*/
private char nextClean() throws IOException {
for (;;) {
char c = this.next();
if (c == 0 || !isWhitespace(c)) {
return c;
}
}
}
/**
* 获取源代码的下一个字符
*
* @return 到达末尾后返回0,否则返回源代码的下一个字符
* @throws IOException
*/
private char next() throws IOException {
int c;
/* 如果回退过,返回回退的字符 */
if (this.usePrevious) {
this.usePrevious = false;
c = this.previous;
} else {
/* 否则,读取新的字符 */
c = this.src.read();
}
/* 判断是不是到达结尾 */
if (c <= 0) { // 到达末尾
this.eof = true;
return 0;
}
this.incrementIndexes(c);
this.previous = (char) c;
return this.previous;
}
/**
* 增加{@link #index}并处理换行,此方法仅应当在{@link #next()}中调用
*
* @param c 读入的字符
*/
private void incrementIndexes(int c) {
if (c > 0) {
this.index++;
/* 因为运行系统不同,行尾序列会有'\n' (*inx, Mac OS 10+), '\r\n' (Windows), '\r' (Mac OS 9-),需做处理*/
if (c == '\r') {
this.line++;
this.characterPreviousLine = this.character;
this.character = 0;
} else if (c == '\n') {
if (this.previous != '\r') {
this.line++;
this.characterPreviousLine = this.character;
}
this.character = 0;
} else {
this.character++;
}
}
}
/**
* 回退一个字符,用于实现超前扫描。
*
* @throws IOException 如果已经在字符串起始,或者已经回退过一次
*/
public void back() throws IOException {
if (this.usePrevious) {
throw new IOException("回退两次不允许");
} else if (this.index <= 0) {
throw new IOException("已经在开头位置,无法回退");
}
this.decrementIndexes();
this.usePrevious = true;
this.eof = false;
}
/**
* 回退{@link #index}并处理换行,此方法仅应当在{@link #back()}中调用
*/
private void decrementIndexes() {
this.index--;
if (this.previous == '\r' || this.previous == '\n') {
this.line--;
this.character = this.characterPreviousLine;
} else if (this.character > 0) {
this.character--;
}
}
/**
* 进行词法分析,返回分析列表
*
* @return 单词分析结果的列表
* @throws IOException
*/
public List<WordExp> analysis() throws IOException {
this.hasAnalysised = true;
List<WordExp> res = new ArrayList<>();
int c;
/* c为0表示已经分析到结尾,采用 nextClean 方法略过空白字符 */
while ((c = nextClean()) != 0) {
if (isLetter(c)) {
this.back();
this.analysisString(res); /* 处理字母开头 */
} else if (isDigit(c)) {
this.back();
this.analysisNumber(res); /* 处理数字开头 */
} else {
this.back();
this.analysisOther(res); /* 处理其他字符开头 */
}
}
return res;
}
/**
* 分析以字母开头的符号串
*
* @param res 分析结果列表
* @throws IOException
*/
private void analysisString(List<WordExp> res) throws IOException {
/* 读出一个以字母开头,之后含有字母和数字零次或多次的字符串 */
String cur = this.nextString();
/* 如果读出的字符串为关键词 */
if (keywordsSet.contains(cur)) {
res.add(new WordExp(cur, WordType.KEYWORD, this.line, this.character));
} else if (identifiers.contains(cur)) {
/* 如果读出的字符串已经存在在标识符列表中 */
res.add(new WordExp(cur, WordType.IDENTIFIER, this.line, this.character));
} else {
/* 如果不存在在标识符列表中,假如标识符列表 */
identifiers.add(cur);
res.add(new WordExp(cur, WordType.IDENTIFIER, this.line, this.character));
}
}
/**
* 分析以数字开头的符号串
*
* @param res 分析结果列表
* @throws IOException
*/
private void analysisNumber(List<WordExp> res) throws IOException {
/* 读出一个以数字开头,之后含有数字和.零次或多次的字符串 */
String numString = this.nextNumberString();
/* 超前扫描下一个字符 */
char c = this.next();
if (!isLetter(c)) {
if (c != 0)
this.back();
/* 如果以.结尾,表示出现了123.这种错误 */
if (numString.endsWith(".")) {
res.add(new WordExp(numString, WordType.ERROR, this.line, this.character));
return;
}
/* 如果字符串含有两个及以上的.,说明出现了类似123.123.123这种错误 */
else if (numString.indexOf('.') != numString.lastIndexOf('.')) {
res.add(new WordExp(numString, WordType.ERROR, this.line, this.character));
return;
}
/* 否则,是正常的常数串 */
constants.add(parseNumber(numString));
res.add(new WordExp(numString, WordType.CONSTANT, this.line, this.character));
return;
}
/* 如果下一个字符是字母的话,说明会出现形如123abc这种类似的错误*/
this.back();
/* 读取出后面的那个部分字符串 */
String cur = this.nextString();
cur = numString + cur;
res.add(new WordExp(cur, WordType.ERROR, this.line, this.character));
}
/**
* 分析以非字母和数字开头的符号串
*
* @param res 分析结果列表
* @throws IOException
*/
private void analysisOther(List<WordExp> res) throws IOException {
char c = this.next();
char c2;
if (c == '/') {
/* 如果以/开头,表示可能是注释,除法符号 */
c2 = this.next();
/* 如果下一个字符是/,表示为注释,而且注释到下一个回车或者结束,不用记录 */
if (c2 == '/') {
for (;;) {
c2 = this.next();
if (c2 == '\r' || c2 == '\n' || c2 == '0')
return;
}
/* 如果下一个字符是*,表示为注释,注释到下一个*和/,不用记录 */
} else if (c2 == '*') {
StringBuilder con = new StringBuilder("/*");
for (;;) {
c2 = this.next();
con.append(c2);
/* 一直读取,直到*和/ */
if (c2 == '*') {
c2 = this.next();
con.append(c2);
if (c2 == '/')
return;
else
continue;
/* 但是如果未找到匹配的*和/就达到了结尾,表示注释出错,要添加记录 */
} else if (c2 == 0) {
res.add(new WordExp(con.toString(), WordType.ERROR, this.line, this.character));
return;
}
}
/* 如果不是/和*,表示是除法符号 */
} else {
res.add(new WordExp("/", WordType.OPERATOR_A, this.line, this.character));
return;
}
/* 如果读取到的符号是分隔符 */
} else if (delimitersSet.contains(c)) {
res.add(new WordExp(c + "", WordType.DELIMITER, this.line, this.character));
return;
}
/* 如果读取到的符号是算数运算符 */
else if (operatorsASet.contains(c)) {
res.add(new WordExp(c + "", WordType.OPERATOR_A, this.line, this.character));
return;
/* 如果是逻辑运算符,因为逻辑运算符包含<=和>=以及<>,也要超前搜索 */
} else if (c == '<') {
c2 = this.next();
/* 如果是<> */
if (c2 == '>') {
res.add(new WordExp("<>", WordType.OPERATOR_L, this.line, this.character));
return;
/* 如果是<= */
} else if (c2 == '=') {
res.add(new WordExp("<=", WordType.OPERATOR_L, this.line, this.character));
return;
} else {
/* 如果是< */
this.back();
res.add(new WordExp(c + "", WordType.OPERATOR_L, this.line, this.character));
return;
}
} else if (c == '>') {
c2 = this.next();
/* 如果是>= */
if (c2 == '=') {
res.add(new WordExp(">=", WordType.OPERATOR_L, this.line, this.character));
return;
/** 如果是> */
} else {
this.back();
res.add(new WordExp(c + "", WordType.OPERATOR_L, this.line, this.character));
return;
}
/* 如果是= */
} else if (c == '=') {
res.add(new WordExp(c + "", WordType.OPERATOR_L, this.line, this.character));
return;
/* 如果什么也不是,出错 */
} else {
res.add(new WordExp(c + "", WordType.ERROR, this.line, this.character));
return;
}
}
public static void main(String[] args) throws IOException {
Scanner scanner = new Scanner(System.in);
scanner.useDelimiter("(\r\n\r\n)|(\n\n)|(\r\r)");
String src = scanner.next();
Tokenizer tokenizer = new Tokenizer(src);
List<WordExp> wordExps = tokenizer.analysis();
// System.out.println(wordExps);
System.out.printf("%-16s%-16s%-16s%-16s", "单词", "二元序列", "类 型", "位置(行,列)");
System.out.println();
System.out.printf("%-16s(单词种别,单词属性)", "");
System.out.println();
for (WordExp word : wordExps) {
if (word.type != WordType.ERROR) {
System.out.printf("%-16s%-16s%-16s%-16s", word.word, "(" + word.type.id + ", " + word.word + ")",
word.type.type, "(" + word.line + ", " + word.column + ")");
System.out.println();
} else {
System.out.printf("%-16s%-16s%-16s%-16s", word.word, "Error", "Error",
"(" + word.line + ", " + word.column + ")");
System.out.println();
}
}
scanner.close();
}
@Override
public String toString() {
return " at " + this.index + " [row " + this.line + " column " + this.character + "]";
}
}
enum WordType {
/** 关键词 */
KEYWORD(1, "关键词"),
/** 分隔符 */
DELIMITER(2, "分隔符"),
/** 算数运算符 */
OPERATOR_A(3, "算术运算符"),
/** 关系运算符 */
OPERATOR_L(4, "关系运算符"),
/** 常量 */
CONSTANT(5, "常量"),
/** 标识符 */
IDENTIFIER(6, "标识符"),
/** 错误 */
ERROR(7, "错误");
public int id;
public String type;
private WordType(int id, String type) {
this.id = id;
this.type = type;
}
@Override
public String toString() {
return this.type;
}
};
final class WordExp {
/** 单词 */
final String word;
/** 单词类型 */
final WordType type;
/** 单词所处的行 */
final int line;
/** 单词所处的列 */
final int column;
public WordExp(String word, WordType type, int line, int column) {
this.word = word;
this.type = type;
this.line = line;
this.column = column;
}
@Override
public String toString() {
return type.toString() + word + ",出现在 " + " (" + this.line + "," + this.column + ")";
}
}