簡單的C語言編譯器--詞法分析器


1. 定義詞法單元Tag

  首先要將可能出現的詞進行分類,可以有不同的分類方式。例如「多符一類」:將所有逗號、分號、括號等分界符都歸為同一類;或者「一符一類」:每一個符號各自歸為一類。我這里采用的是一符一類的方式。C++代碼(tag.h)如下:

    #ifndef TAG_H
    #define TAG_H
    
    // Integer category codes for lexical units. Each symbol gets its own code
    // ("one symbol, one class"). The spellings MUTIPLY and ASSIN are kept as-is
    // because other code refers to these names.
    namespace Tag {
    	// Reserved words.
    	const int INT = 1;
    	const int BOOL = 2;
    	const int MAIN = 3;
    	const int IF = 4;
    	const int ELSE = 5;
    	const int FOR = 6;
    	const int WHILE = 7;
    	const int FALSE = 8;
    	const int BREAK = 9;
    	const int RETURN = 10;
    	const int TRUE = 11;

    	// Operators.
    	const int NOT = 20;
    	const int NE = 21;
    	const int AUTOMINUS = 22;
    	const int MINUS = 23;
    	const int AUTOADD = 24;
    	const int ADD = 25;
    	const int OR = 26;
    	const int AND = 27;
    	const int MUTIPLY = 28;
    	const int DIVIDE = 29;
    	const int MOD = 30;
    	const int EQ = 31;
    	const int ASSIN = 32;
    	const int GE = 33;
    	const int GT = 34;
    	const int LE = 35;
    	const int LS = 36;

    	// Delimiters.
    	const int COMMA = 40;
    	const int SEMICOLON = 41;
    	const int LLBRACKET = 42;
    	const int RLBRACKET = 43;
    	const int LMBRACKET = 44;
    	const int RMBRACKET = 45;
    	const int LGBRACKET = 46;
    	const int RGBRACKET = 47;

    	// Integer constant.
    	const int NUM = 50;

    	// Identifier.
    	const int ID = 60;

    	// Nothing left to report.
    	const int EMPTY = 70;

    	// Invalid input.
    	const int ERROR = 404;
    }

    #endif

2. 具體步驟

  • 一個一個字符地掃描測試代碼,忽略空白字符;遇到換行符('\n')時,行號加1
  • 要進行區分標識符(即普通變量名字)和保留字
  • 因為將標識符和常數各自歸為一類,所以要有算法能夠識別出一個完整的常數和一個完整的標識符
  • 加入適當的非法詞檢測

3. 設計詞法分析類

  設計一個詞法分析器,當然要包括如何存儲一個詞法單元,如何掃描(scan)測試代碼等,直接上代碼:

myLexer.h

    #ifndef MYLEXER_H
    #define MYLEXER_H
    
    #include <fstream>
    #include <string>
    #include <unordered_map>
    #include "tag.h"
    
    
    /*
     * 主要是定義基本的詞法單元類,
     * 聲明了詞法分析類
     */
    
    // A single lexical unit: the matched text (lexeme) together with the
    // integer category code (tag) assigned to it.
    class Word {
    	public:
    		// Creates an empty word: empty lexeme and tag 0. The tag is given a
    		// definite value so that reading it from a default-constructed Word
    		// (e.g. one created by unordered_map::operator[]) is well defined.
    		Word() = default;

    		// Creates a word with lexeme `s` and category code `t`.
    		Word(const std::string &s, int t) : lexeme(s), tag(t) {}

    		// Returns a copy of the lexeme text.
    		std::string getLexeme() const { return lexeme; }

    		// Returns the category code.
    		int getTag() const { return tag; }

    		// Replaces the category code.
    		void setTag(int t) { tag = t; }

    		// Replaces the lexeme text.
    		void setLexeme(const std::string &s) { lexeme = s; }
    	private:
    		std::string lexeme;
    		int tag = 0;
    };
    
    // Lexical analyzer: turns the characters of an input file stream into a
    // sequence of Word objects, one per call to scan().
    class Lexer {
    	public:
    		// Registers the reserved words and resets the scanning state.
    		Lexer();

    		// Adds `w` to the keyword table, keyed by its lexeme.
    		void reserve(Word w);

    		// Reads the next character into the look-ahead and reports whether
    		// it equals `c`; used for two-character operators such as ">=".
    		bool readnext(char c, std::ifstream &in);

    		// Returns the next lexical unit read from `in`.
    		Word scan(std::ifstream &in);

    		// Returns the current line counter (starts at 1).
    		int getLine() const { return line; }
    	private:
    		// One-character look-ahead; a blank means "nothing pending".
    		char peek = ' ';
    		// Keyword table: lexeme -> reserved Word.
    		std::unordered_map<std::string, Word> words;
    		// Line counter, advanced by scan() when it meets '\n'.
    		int line = 1;
    };
    
    
    #endif

myLexer.cpp

    #include <cctype>
    #include <iostream>
    #include <limits>
    #include <sstream>
    #include <string>
    #include "myLexer.h"
    
    // Registers `w` in the keyword table under its lexeme. If an entry for
    // that lexeme already exists it is left untouched.
    void Lexer::reserve(Word w) {
    	words.emplace(w.getLexeme(), w);
    }
    
    // Starts with a blank look-ahead character and line number 1, then loads
    // the reserved words into the keyword table so that scan() can tell them
    // apart from ordinary identifiers.
    Lexer::Lexer() : peek(' '), line(1) {
    	struct Keyword {
    		const char *text;
    		int tag;
    	};

    	const Keyword keywords[] = {
    		{"int", Tag::INT},
    		{"bool", Tag::BOOL},
    		{"main", Tag::MAIN},
    		{"if", Tag::IF},
    		{"else", Tag::ELSE},
    		{"for", Tag::FOR},
    		{"while", Tag::WHILE},
    		{"break", Tag::BREAK},
    		{"return", Tag::RETURN},
    		{"true", Tag::TRUE},
    		{"false", Tag::FALSE},
    	};

    	for (const Keyword &entry : keywords)
    		reserve(Word(entry.text, entry.tag));
    }
    
    // Helper for two-character operators such as ">=" and "++".
    //
    // Reads the character that immediately follows the current one into `peek`
    // and reports whether it equals `c`.
    //   - match:        `peek` is reset to ' ' (both characters consumed), returns true
    //   - mismatch:     `peek` keeps the character just read for the next scan, returns false
    //   - end of input: `peek` is reset to ' ', returns false
    //
    // The read is unformatted (get), so whitespace is never skipped: "+ +" is
    // two ADD operators rather than "++", whatever the stream's skipws flag is.
    bool Lexer::readnext(char c, std::ifstream &in) {
    	if (!in.get(peek)) {
    		// A failed read leaves `peek` unmodified; clear it so the stale
    		// operator character is not mistaken for its own successor.
    		peek = ' ';
    		return false;
    	}
    	if (peek != c)
    		return false;
    	peek = ' ';
    	return true;
    }
    
    
    // Reads one raw character from `in`, whitespace included. Returns ' ' when
    // the stream cannot supply another character, so the caller never
    // re-examines a stale character at end of input.
    static char nextRawChar(std::ifstream &in) {
    	char c = ' ';
    	return in.get(c) ? c : ' ';
    }

    // Returns the next lexical unit from `in`.
    //
    // Whitespace (' ', '\t', '\r', '\n') is skipped and every '\n' advances the
    // line counter. The result is one of:
    //   - an operator or delimiter Word,
    //   - Word(<decimal value>, Tag::NUM) for an integer literal,
    //   - the reserved Word, or Word(<name>, Tag::ID), for an identifier
    //     (letters, digits and '_', not starting with a digit),
    //   - Word("error", Tag::ERROR) for a lone '|' or '&', a literal with a
    //     fractional part, a literal that does not fit in int, or any other
    //     unrecognized character,
    //   - Word("empty", Tag::EMPTY) once the input is exhausted (or unreadable).
    //
    // Every path consumes the input it reports, so repeated calls always make
    // progress and end with EMPTY.
    Word Lexer::scan(std::ifstream &in) {
    	// Skip whitespace. Unformatted reads are used so the blanks are seen
    	// even when the stream still has its default skipws flag set.
    	for (;;) {
    		if (peek == '\n')
    			++line;
    		else if (peek != ' ' && peek != '\t' && peek != '\r')
    			break;
    		if (!in.get(peek)) {
    			peek = ' ';
    			return Word("empty", Tag::EMPTY);
    		}
    	}

    	// Operators and delimiters.
    	switch (peek) {
    		case '!':
    			return readnext('=', in) ? Word("!=", Tag::NE) : Word("!", Tag::NOT);
    		case '-':
    			return readnext('-', in) ? Word("--", Tag::AUTOMINUS) : Word("-", Tag::MINUS);
    		case '+':
    			return readnext('+', in) ? Word("++", Tag::AUTOADD) : Word("+", Tag::ADD);
    		case '|':
    			// A single '|' is not part of the language.
    			return readnext('|', in) ? Word("||", Tag::OR) : Word("error", Tag::ERROR);
    		case '&':
    			// A single '&' is not part of the language.
    			return readnext('&', in) ? Word("&&", Tag::AND) : Word("error", Tag::ERROR);
    		case '=':
    			return readnext('=', in) ? Word("==", Tag::EQ) : Word("=", Tag::ASSIN);
    		case '>':
    			return readnext('=', in) ? Word(">=", Tag::GE) : Word(">", Tag::GT);
    		case '<':
    			return readnext('=', in) ? Word("<=", Tag::LE) : Word("<", Tag::LS);
    		case '*':
    			peek = nextRawChar(in);
    			return Word("*", Tag::MUTIPLY);
    		case '/':
    			peek = nextRawChar(in);
    			return Word("/", Tag::DIVIDE);
    		case '%':
    			peek = nextRawChar(in);
    			return Word("%", Tag::MOD);
    		case ',':
    			peek = nextRawChar(in);
    			return Word(",", Tag::COMMA);
    		case ';':
    			peek = nextRawChar(in);
    			return Word(";", Tag::SEMICOLON);
    		case '(':
    			peek = nextRawChar(in);
    			return Word("(", Tag::LLBRACKET);
    		case ')':
    			peek = nextRawChar(in);
    			return Word(")", Tag::RLBRACKET);
    		case '[':
    			peek = nextRawChar(in);
    			return Word("[", Tag::LMBRACKET);
    		case ']':
    			peek = nextRawChar(in);
    			return Word("]", Tag::RMBRACKET);
    		case '{':
    			peek = nextRawChar(in);
    			return Word("{", Tag::LGBRACKET);
    		case '}':
    			peek = nextRawChar(in);
    			return Word("}", Tag::RGBRACKET);
    		default:
    			break;
    	}

    	// Integer constants. The <cctype> classifiers require a value
    	// representable as unsigned char, hence the casts.
    	if (std::isdigit(static_cast<unsigned char>(peek))) {
    		const int maxValue = std::numeric_limits<int>::max();
    		int value = 0;
    		bool valid = true;
    		do {
    			const int digit = peek - '0';
    			// Reject instead of overflowing (signed overflow is undefined).
    			if (value > (maxValue - digit) / 10)
    				valid = false;
    			else
    				value = 10 * value + digit;
    			peek = nextRawChar(in);
    		} while (std::isdigit(static_cast<unsigned char>(peek)));

    		if (peek == '.') {
    			// Fractional literals are not supported: swallow the rest of
    			// the literal so the following scan resumes after it.
    			do {
    				peek = nextRawChar(in);
    			} while (std::isdigit(static_cast<unsigned char>(peek)));
    			valid = false;
    		}

    		return valid ? Word(std::to_string(value), Tag::NUM)
    		             : Word("error", Tag::ERROR);
    	}

    	// Identifiers and reserved words.
    	if (std::isalpha(static_cast<unsigned char>(peek)) || peek == '_') {
    		std::string text;
    		do {
    			text += peek;
    			peek = nextRawChar(in);
    		} while (std::isalnum(static_cast<unsigned char>(peek)) || peek == '_');

    		// Reserved words were registered by the constructor.
    		const auto keyword = words.find(text);
    		return keyword != words.end() ? keyword->second : Word(text, Tag::ID);
    	}

    	// Anything else is invalid. Consume it so the lexer does not report the
    	// same character forever.
    	peek = nextRawChar(in);
    	return Word("error", Tag::ERROR);
    }

  設計完成后,自己寫一個main函數,在while循環中調用scan函數,每次打印出返回的Word內容,就能夠得到測試代碼對應的詞法單元序列。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM