前不久编译原理学习了词法分析,自己实现了一个简单的c语言词法分析器,来加深对词法分析器原理,状态转换图,有限自动机的理解。
当我们想在电脑上运行一个c语言程序时,都要将源程序进行编译。编译简单来说就是将一个源程序转换为另一种形式的程序的过程。而编译中的词法分析就是将你所输入的各种元素按照一种规则分解整理成各种单词符号,包括关键字,标识符,运算符等。
在这个词法分析器中我把不同的单词符号分为了界符,运算符,数字,标识符,关键字五大类。
1、界符:左右中括号(‘[‘ 、‘]’) ,左右小括号(‘(‘、’)’), 分号(’;’), 空格(‘ ‘),逗号(‘,’), 大括号(‘{‘、’}’), 双引号(‘ “ ’),井号(‘#’)。
2、运算符:+,-, *, /, +=, ++, --, *=, -=, /=, ==, &, &&, |, ||
3、数字:所有数字组成的集合。
4、标识符:以下划线或数字为开头,其余由数字,下划线,字母任意组成的集合。
5、关键字:c语言本身定义的常用关键字集合。包括(int, long, float, double, short等等)。
根据有限自动机的概念用状态转换图将系统状态的转换和系统状态转换的事件表示出来。其中每一个节点代表一个状态。双圈代表终结状态。
我对每次从文件中读入的内容进行逐个字符扫描,根据读取的单词符号的不同进行判断将其转换为不同的状态,实现其在不同的状态间的跳转。
程序中总共使用了五个方法函数:
//判断接收的字符是否是数字
int isNum( char );
//判断接收的字符是否是字母
Int isLetter(char);
//读取指针所指的字符内容
char getLetter(char *);
//打印输出信息
void Print(char *, char *);
//实现有限自动机的功能
void Judge(char *);
主要代码如下:
void Judge(char *s){
char ch = NULL ;
char stringbuffer[100];
int state = 0;
int i = 0;
while(ch != '\n'){
switch(state){
case 0:
ch = getLetter(s++);
switch(ch){
case '[':
case ']':
case '(':
case ')':
case ';':
case ',':
case '{':
case '}':
case '#':
case '<':
case '>':
case '.':
case '"':
stringbuffer[i++] = ch;
stringbuffer[i] = '\0';
Print(stringbuffer, "界符");
state = 0;
i = 0;
break;
case ' ':
state = 0;
break;
case '\t':
state = 0;
break;
case '+':
stringbuffer[i++] = ch;
ch = getLetter(s++);
if(ch != '+' && ch != '='){
s--;
stringbuffer[i] = '\0';
i = 0;
Print(stringbuffer, "运算符");
}else{
stringbuffer[i++] = ch;
stringbuffer[i] = '\0';
Print(stringbuffer,"运算符");
i = 0;
state = 0;
}
break;
case '-' :
stringbuffer[i++] = ch;
ch = getLetter(s++);
if(ch != '-' && ch != '='){
s--;
stringbuffer[i] = '\0';
i = 0;
Print(stringbuffer, "运算符");
}else{
stringbuffer[i++] = ch;
stringbuffer[i] = '\0';
Print(stringbuffer,"运算符");
i = 0;
state = 0;
}
break;
case '=' :
stringbuffer[i++] = ch;
ch = getLetter(s++);
if(ch != '='){
s--;
stringbuffer[i] = '\0';
i = 0;
Print(stringbuffer, "运算符");
}else{
stringbuffer[i++] = ch;
stringbuffer[i] = '\0';
Print(stringbuffer,"运算符");
i = 0;
state = 0;
}
break;
case '&':
stringbuffer[i++] = ch;
ch = getLetter(s++);
if(ch != '&'){
s--;
stringbuffer[i] = '\0';
i = 0;
Print(stringbuffer, "运算符");
}else{
stringbuffer[i++] = ch;
stringbuffer[i] = '\0';
Print(stringbuffer,"运算符");
i = 0;
state = 0;
}
break;
case '|':
stringbuffer[i++] = ch;
ch = getLetter(s++);
if(ch != '|'){
s--;
stringbuffer[i] = '\0';
i = 0;
Print(stringbuffer, "运算符");
}else{
stringbuffer[i++] = ch;
stringbuffer[i] = '\0';
Print(stringbuffer,"运算符");
i = 0;
state = 0;
}
break;
case '*':
stringbuffer[i++] = ch;
ch = getLetter(s++);
if(ch != '='){
s--;
stringbuffer[i] = '\0';
i = 0;
Print(stringbuffer, "运算符");
}else{
stringbuffer[i++] = ch;
stringbuffer[i] = '\0';
Print(stringbuffer,"运算符");
i = 0;
state = 0;
}
break;
case '/':
stringbuffer[i++] = ch;
ch = getLetter(s++);
if(ch != '='){
s--;
stringbuffer[i] = '\0';
i = 0;
Print(stringbuffer, "运算符");
}else{
stringbuffer[i++] = ch;
stringbuffer[i] = '\0';
printf(stringbuffer,"运算符");
i = 0;
state = 0;
}
break;
default:
if((ch == '_') || (isLetter(ch == 0))){
stringbuffer[i++] = ch;
state = 1;
}else if(isNum(ch) == 0){
stringbuffer[i++] = ch;
state = 2;
}else{
printf("输入不符合规范!");
}
}
break;
case 1: //判断字母
ch = getLetter(s++);
if((ch == '_') || (isNum(ch) == 0) || (isLetter(ch) == 0)){
stringbuffer[i++] = ch;
state = 1;
}else{
s--;
stringbuffer[i] = '\0';
i = 0;
int j = 0;
for(j = 0; j < length; j++){
if((strcmp(stringbuffer, keyword[j] )) == 0){
Print(stringbuffer, "关键字");
state = 0;
break;
}else{
Print(stringbuffer, "标识符");
state = 0;
break;
}
}
}
break;
case 2: //判断数字
ch = getLetter(s++);
if(isNum(ch) == 0){
stringbuffer[i++] = ch;
}else if(ch == '.'){
stringbuffer[i++] = ch;
state = 3;
}else if(ch == 'E' || ch == 'e'){
stringbuffer[i++] = ch;
state = 4;
}else{
s--;
stringbuffer[i] = '\0';
Print(stringbuffer,"数字");
i = 0;
}
break;
case 3:
ch = getLetter(s++);
if(isNum(ch) == 0){
stringbuffer[i++] = ch;
state = 5;
}else{
printf("输入格式有错!");
}
break;
case 4:
ch = getLetter(s++);
if(isNum(ch) == 0){
stringbuffer[i++] = ch;
}else{
s--;
stringbuffer[i] = '\0';
Print(stringbuffer,"数字");
i = 0;
}
break;
case 5:
ch = getLetter(s++);
if(isNum(ch) == 0){
stringbuffer[i++] = ch;
}else if(ch == 'E' || ch == 'e'){
stringbuffer[i++] = ch;
state = 4;
}else{
s--;
stringbuffer[i] = '\0';
Print(stringbuffer, "数字");
i = 0;
}
break;
}
}
}
测试结果:
源程序:
结果如下: