编译原理实验一词法分析设计

一、实验目的

通过本实验的编程实践，使学生了解词法分析的任务，掌握词法分析程序设计的原理和构造方法，使学生对编译的基本概念、原理和方法有完整的和清楚的理解，并能正确地、熟练地运用。
二、实验内容

用 VC++/VB/JAVA 语言实现对 C 语言子集的源程序进行词法分析。通过输入源程序从左到右对字符串进行扫描和分解，依次输出各个单词的内部编码及单词符号自身值；若遇到错误则显示“Error”，然后跳过错误部分继续显示；同时进行标识符登记符号表的管理。
三、程序描述

程序达到的状态转换图如下：
状态转换图
package exp1;

import java.util.*;

import java.io.IOException;
import java.io.StringReader;
import static java.lang.Character.isDigit;
import static java.lang.Character.isLetter;
import static java.lang.Character.isLetterOrDigit;
import static java.lang.Character.isWhitespace;

/**
 * 词法分析器
 *
 * @version 2020-05-18
 */
public class Tokenizer {
    /** 读入的字符序列 */
    private final StringReader src;
    /** Tokenizer的状态是否已经分析过 */
    private boolean hasAnalysised;
    /** 表示当前是否到达结尾 */
    private boolean eof;
    /** 字符当前所处的行数 */
    private int line;
    /** 字符当前行所处的位置 */
    private int character;
    /** 当前字符在字符序列中的位置，实际应和StringReader的私有属性next相同 */
    private int index;
    /** 当前字符的前一个字符 */
    private char previous;
    /** 标志当前是不是已经使用了前一个字符 */
    private boolean usePrevious;
    /** 上一行的长度 */
    private int characterPreviousLine;
    /** 关键词表 */
    final String[] keywords = new String[] { "do", "end", "for", "if", "printf", "scanf", "then", "while" };
    /** 关键词集合，为了快速判断 */
    final Set<String> keywordsSet = new HashSet<>(Arrays.asList(keywords));
    /** 分隔符起始字符的集合 */
    final Set<Character> delimitersSet = new HashSet<>(Arrays.asList(',', ';', '(', ')', '[', ']'));
    /** 算数运算符起始字符的集合 */
    final Set<Character> operatorsASet = new HashSet<>(Arrays.asList('+', '-', '*', '/'));
    /** 常量列表 */
    final List<Number> constants = new ArrayList<>();
    /** 标识符列表 */
    final Set<String> identifiers = new LinkedHashSet<>();

    /**
     * 词法分析器的构造器
     *
     * @param src 源代码
     */
    public Tokenizer(String src) {
        this.src = new StringReader(src);
        this.hasAnalysised = false;
        this.line = this.character = 1;
        this.index = this.characterPreviousLine = 0;
        this.previous = 0;
        this.eof = this.usePrevious = false;
    }

    /**
     * 检查是否已经分析过
     *
     * @return 如果已经分析过，返回true
     */
    public boolean hasAnalysised() {
        return hasAnalysised;
    }

    /**
     * 读取出一个字符串
     *
     * @return 读取出的字符串
     * @throws IOException
     */
    private String nextString() throws IOException {
        StringBuilder sb = new StringBuilder();
        char c;
        for (;;) {
            c = this.next();
            /*
             * 以字母开头不是关键词就是标识符，该方法读出一个符合正规式/[A-Za-z]([A-Za-z0-9]*)/的字符串
             * 即以字母开头，字母和数字匹配零次或多次的字符串
             */
            if (isLetterOrDigit(c)) {
                sb.append(c);
            } else if (c == 0) {
                return sb.toString();
            } else {
                this.back();
                return sb.toString();
            }
        }
    }

    /**
     * 读取出一个数字字符串
     *
     * @return 读取出的数字字符串
     * @throws IOException
     */
    private String nextNumberString() throws IOException {
        StringBuilder sb = new StringBuilder();
        char c;
        for (;;) {
            c = this.next();
            /*
             * 以数字开头可能为常量，或者为出错的单词，此方法读取出一个符合正规式/([0-9])([0-9.]*)/，即以0-9开头，0-9和.匹配零次或多次的字符串
             * 对于末尾是.的或者是出现多次.的，表示出现错误，交由numberAnalysis方法处理
             */
            if (isDigit(c)) {
                sb.append(c);
            } else if (c == '.') {
                sb.append(c);
            } else if (c == 0) {
                return sb.toString();
            } else {
                this.back();
                return sb.toString();
            }
        }
    }

    /**
     * 解析出字符串表示的数字
     *
     * @param numString 数字字符串
     * @return 数字字符串代表的值
     */
    private Number parseNumber(String numString) {
        /* 传进来的字符串，不是标准的整数，就是标准的浮点数，不会发生NumberFormatException */
        try {
            return new Integer(numString);
        } catch (NumberFormatException e) {
            return new Double(numString);
        }
    }

    /**
     * 获取下一个非空白字符
     *
     * @return 到达末尾后返回0，下一个非空白字符
     * @throws IOException
     */
    private char nextClean() throws IOException {
        for (;;) {
            char c = this.next();
            if (c == 0 || !isWhitespace(c)) {
                return c;
            }
        }
    }

    /**
     * 获取源代码的下一个字符
     *
     * @return 到达末尾后返回0，否则返回源代码的下一个字符
     * @throws IOException
     */
    private char next() throws IOException {
        int c;
        /* 如果回退过，返回回退的字符 */
        if (this.usePrevious) {
            this.usePrevious = false;
            c = this.previous;
        } else {
            /* 否则，读取新的字符 */
            c = this.src.read();
        }
        /* 判断是不是到达结尾 */
        if (c <= 0) { // 到达末尾
            this.eof = true;
            return 0;
        }
        this.incrementIndexes(c);
        this.previous = (char) c;
        return this.previous;
    }

    /**
     * 增加{@link #index}并处理换行，此方法仅应当在{@link #next()}中调用
     *
     * @param c 读入的字符
     */
    private void incrementIndexes(int c) {
        if (c > 0) {
            this.index++;
            /* 因为运行系统不同，行尾序列会有'\n' (*inx, Mac OS 10+), '\r\n' (Windows), '\r' (Mac OS 9-)，需做处理*/
            if (c == '\r') {
                this.line++;
                this.characterPreviousLine = this.character;
                this.character = 0;
            } else if (c == '\n') {
                if (this.previous != '\r') {
                    this.line++;
                    this.characterPreviousLine = this.character;
                }
                this.character = 0;
            } else {
                this.character++;
            }
        }
    }

    /**
     * 回退一个字符，用于实现超前扫描。
     *
     * @throws IOException 如果已经在字符串起始，或者已经回退过一次
     */
    public void back() throws IOException {
        if (this.usePrevious) {
            throw new IOException("回退两次不允许");
        } else if (this.index <= 0) {
            throw new IOException("已经在开头位置，无法回退");
        }
        this.decrementIndexes();
        this.usePrevious = true;
        this.eof = false;
    }

    /**
     * 回退{@link #index}并处理换行，此方法仅应当在{@link #back()}中调用
     */
    private void decrementIndexes() {
        this.index--;
        if (this.previous == '\r' || this.previous == '\n') {
            this.line--;
            this.character = this.characterPreviousLine;
        } else if (this.character > 0) {
            this.character--;
        }
    }

    /**
     * 进行词法分析，返回分析列表
     *
     * @return 单词分析结果的列表
     * @throws IOException
     */
    public List<WordExp> analysis() throws IOException {
        this.hasAnalysised = true;
        List<WordExp> res = new ArrayList<>();
        int c;
        /* c为0表示已经分析到结尾，采用 nextClean 方法略过空白字符 */
        while ((c = nextClean()) != 0) {
            if (isLetter(c)) {
                this.back();
                this.analysisString(res); /* 处理字母开头 */
            } else if (isDigit(c)) {
                this.back();
                this.analysisNumber(res); /* 处理数字开头 */
            } else {
                this.back();
                this.analysisOther(res); /* 处理其他字符开头 */
            }
        }
        return res;
    }

    /**
     * 分析以字母开头的符号串
     *
     * @param res 分析结果列表
     * @throws IOException
     */
    private void analysisString(List<WordExp> res) throws IOException {
        /* 读出一个以字母开头，之后含有字母和数字零次或多次的字符串 */
        String cur = this.nextString();
        /* 如果读出的字符串为关键词 */
        if (keywordsSet.contains(cur)) {
            res.add(new WordExp(cur, WordType.KEYWORD, this.line, this.character));
        } else if (identifiers.contains(cur)) {
            /* 如果读出的字符串已经存在在标识符列表中 */
            res.add(new WordExp(cur, WordType.IDENTIFIER, this.line, this.character));
        } else {
            /* 如果不存在在标识符列表中，假如标识符列表 */
            identifiers.add(cur);
            res.add(new WordExp(cur, WordType.IDENTIFIER, this.line, this.character));
        }
    }

    /**
     * 分析以数字开头的符号串
     *
     * @param res 分析结果列表
     * @throws IOException
     */
    private void analysisNumber(List<WordExp> res) throws IOException {
        /* 读出一个以数字开头，之后含有数字和.零次或多次的字符串 */
        String numString = this.nextNumberString();
        /* 超前扫描下一个字符 */
        char c = this.next();
        if (!isLetter(c)) {
            if (c != 0)
                this.back();
            /* 如果以.结尾，表示出现了123.这种错误 */
            if (numString.endsWith(".")) {
                res.add(new WordExp(numString, WordType.ERROR, this.line, this.character));
                return;
            }
            /* 如果字符串含有两个及以上的.，说明出现了类似123.123.123这种错误 */
            else if (numString.indexOf('.') != numString.lastIndexOf('.')) {
                res.add(new WordExp(numString, WordType.ERROR, this.line, this.character));
                return;
            }
            /* 否则，是正常的常数串 */
            constants.add(parseNumber(numString));
            res.add(new WordExp(numString, WordType.CONSTANT, this.line, this.character));
            return;
        }
        /* 如果下一个字符是字母的话，说明会出现形如123abc这种类似的错误*/
        this.back();
        /* 读取出后面的那个部分字符串 */
        String cur = this.nextString();
        cur = numString + cur;
        res.add(new WordExp(cur, WordType.ERROR, this.line, this.character));
    }

    /**
     * 分析以非字母和数字开头的符号串
     *
     * @param res 分析结果列表
     * @throws IOException
     */
    private void analysisOther(List<WordExp> res) throws IOException {
        char c = this.next();
        char c2;
        if (c == '/') {
            /* 如果以/开头，表示可能是注释，除法符号 */
            c2 = this.next();
            /* 如果下一个字符是/，表示为注释，而且注释到下一个回车或者结束，不用记录 */
            if (c2 == '/') {
                for (;;) {
                    c2 = this.next();
                    if (c2 == '\r' || c2 == '\n' || c2 == '0')
                        return;
                }
                /* 如果下一个字符是*，表示为注释，注释到下一个*和/，不用记录 */
            } else if (c2 == '*') {
                StringBuilder con = new StringBuilder("/*");
                for (;;) {
                    c2 = this.next();
                    con.append(c2);
                    /* 一直读取，直到*和/ */
                    if (c2 == '*') {
                        c2 = this.next();
                        con.append(c2);
                        if (c2 == '/')
                            return;
                        else
                            continue;
                        /* 但是如果未找到匹配的*和/就达到了结尾，表示注释出错，要添加记录 */
                    } else if (c2 == 0) {
                        res.add(new WordExp(con.toString(), WordType.ERROR, this.line, this.character));
                        return;
                    }
                }
                /* 如果不是/和*，表示是除法符号 */
            } else {
                res.add(new WordExp("/", WordType.OPERATOR_A, this.line, this.character));
                return;
            }
            /* 如果读取到的符号是分隔符 */
        } else if (delimitersSet.contains(c)) {
            res.add(new WordExp(c + "", WordType.DELIMITER, this.line, this.character));
            return;
        }
        /* 如果读取到的符号是算数运算符 */
        else if (operatorsASet.contains(c)) {
            res.add(new WordExp(c + "", WordType.OPERATOR_A, this.line, this.character));
            return;
            /* 如果是逻辑运算符，因为逻辑运算符包含<=和>=以及<>，也要超前搜索 */
        } else if (c == '<') {
            c2 = this.next();
            /* 如果是<> */
            if (c2 == '>') {
                res.add(new WordExp("<>", WordType.OPERATOR_L, this.line, this.character));
                return;
                /* 如果是<= */
            } else if (c2 == '=') {
                res.add(new WordExp("<=", WordType.OPERATOR_L, this.line, this.character));
                return;
            } else {
                /* 如果是< */
                this.back();
                res.add(new WordExp(c + "", WordType.OPERATOR_L, this.line, this.character));
                return;
            }
        } else if (c == '>') {
            c2 = this.next();
            /* 如果是>= */
            if (c2 == '=') {
                res.add(new WordExp(">=", WordType.OPERATOR_L, this.line, this.character));
                return;
                /** 如果是> */
            } else {
                this.back();
                res.add(new WordExp(c + "", WordType.OPERATOR_L, this.line, this.character));
                return;
            }
            /* 如果是= */
        } else if (c == '=') {
            res.add(new WordExp(c + "", WordType.OPERATOR_L, this.line, this.character));
            return;
            /* 如果什么也不是，出错 */
        } else {
            res.add(new WordExp(c + "", WordType.ERROR, this.line, this.character));
            return;
        }
    }

    public static void main(String[] args) throws IOException {
        Scanner scanner = new Scanner(System.in);
        scanner.useDelimiter("(\r\n\r\n)|(\n\n)|(\r\r)");
        String src = scanner.next();
        Tokenizer tokenizer = new Tokenizer(src);
        List<WordExp> wordExps = tokenizer.analysis();
        // System.out.println(wordExps);
        System.out.printf("%-16s%-16s%-16s%-16s", "单词", "二元序列", "类 型", "位置（行，列）");
        System.out.println();
        System.out.printf("%-16s(单词种别，单词属性)", "");
        System.out.println();
        for (WordExp word : wordExps) {
            if (word.type != WordType.ERROR) {
                System.out.printf("%-16s%-16s%-16s%-16s", word.word, "(" + word.type.id + ", " + word.word + ")",
                        word.type.type, "(" + word.line + ", " + word.column + ")");
                System.out.println();
            } else {
                System.out.printf("%-16s%-16s%-16s%-16s", word.word, "Error", "Error",
                        "(" + word.line + ", " + word.column + ")");
                System.out.println();
            }
        }
        scanner.close();
    }

    @Override
    public String toString() {
        return " at " + this.index + " [row " + this.line + " column " + this.character + "]";
    }
}

enum WordType {
    /** 关键词 */
    KEYWORD(1, "关键词"),
    /** 分隔符 */
    DELIMITER(2, "分隔符"),
    /** 算数运算符 */
    OPERATOR_A(3, "算术运算符"),
    /** 关系运算符 */
    OPERATOR_L(4, "关系运算符"),
    /** 常量 */
    CONSTANT(5, "常量"),
    /** 标识符 */
    IDENTIFIER(6, "标识符"),
    /** 错误 */
    ERROR(7, "错误");

    public int id;
    public String type;

    private WordType(int id, String type) {
        this.id = id;
        this.type = type;
    }

    @Override
    public String toString() {
        return this.type;
    }
};

final class WordExp {
    /** 单词 */
    final String word;
    /** 单词类型 */
    final WordType type;
    /** 单词所处的行 */
    final int line;
    /** 单词所处的列 */
    final int column;

    public WordExp(String word, WordType type, int line, int column) {
        this.word = word;
        this.type = type;
        this.line = line;
        this.column = column;
    }

    @Override
    public String toString() {
        return type.toString() + word + "，出现在 " + " (" + this.line + "," + this.column + ")";
    }
}
编译原理实验一 词法分析设计

一、实验目的

二、实验内容

三、程序描述

友情链接更多精彩内容

编译原理实验一词法分析设计