开篇
所谓词法分析,就是将源代码按照构词规则分解成一系列单词符号。单词是语言中具有独立意义的最小单位,包括关键字、标识符、运算符、界符和常量等。
分析
源程序分解的单词可以归为以下几类:
1. 关键字
c语言关键字有不少,这里只列举其中一部分:
for,while,do,continue,if,else,
char,int,double,return
2. 变量名
变量名的规则为:
由字母数字以及下划线组成,开头必须为下划线或字母
用正规表达式描述为
letter = {a|b|...|z|A|B|...|Z}
digit = {0|1|...|9}
id = (letter|_ )(letter| _| digit)*
转化为NFA图为:
3. 数字常量
变量名的规则为:
由字母数字以及下划线组成,开头必须为下划线或字母
用正规表达式描述为
digit = {0|1|...|9}
id = digit((digit)*|.(digit)*)(ℇ | (E|e)digit(digit)*)
转化为NFA图为:
4. 界符和运算符
()[]{};+-*/等等
本人这里只加入了+-和=,因为*/以及&^%等都是类似的,这里只是试验,不需要全部写。
思路
根据有限自动机的概念,将整个源代码分解的规则转化为一个一个状态,用switch或if/else将状态之间的转移描述出来即可。这里省略了将常量以及变量名插入到符号表返回指针的那一步。
简单一个伪代码描述这个过程
初始化 状态=0
switch(状态):
{
case 0:
接收源码
if(符合1规则)
处理,状态=1
else if(符合2规则)
处理,状态=2
else
...
case 1:
case 2:
case ...
}
代码
#include <iostream>
#include <algorithm>
#include <vector>
#include <stack>
#include <cstdlib>
#include <cstdio>
using namespace std;
const int KEY_NUM = 11;
const char* KEY_SET[] = {
"for",
"while",
"do",
"continue",
"if",
"else",
"char",
"int",
"double",
"float",
"return"
};
const char* FILE_NAME = "/Users/shiyi/abc.c";
char strBuffer[1026];
char strBox[200];
void retract(char* &str)
{
str--;
}
char getChar(char* &str)
{
return *str++;
}
bool isNum(char ch)
{
if(ch >= '0' && ch <= '9')
return true;
return false;
}
bool isLetter(char ch)
{
if(ch >= 'a' && ch <= 'z')
return true;
if(ch >= 'A' && ch <= 'Z')
return true;
return false;
}
int findKey(char* ch)
{
for(int i=0; i<KEY_NUM; i++)
{
if(0 == strcmp(ch, KEY_SET[i]))
return i+1;
}
return 0;
}
void output(char* key, char* value)
{
printf("< %s, %s >\n", key, value);
}
//扫描
void scanner(char* str)
{
int state = 0;
char ch = ' ';
int pos = 0;
while(ch != '\0')
{
switch (state) {
case 0:
{
ch = getChar(str);
switch (ch) {
case ' ':
pos = 0;
break;
case '[':
case ']':
case '(':
case ')':
case '{':
case '}': {
char temp[2];
temp[0] = ch;
output(temp, "-");
pos = 0;
break;
}
case '\'': {
state = 0;
while((ch = getChar(str)) != '\'' && ch != '\0')
{
strBox[pos++] = ch;
}
if(ch == '\0')
{
//error
}
strBox[pos] = '\0';
output("string", strBox);
pos = 0;
break;
}
case '"': {
state = 0;
while((ch = getChar(str)) != '"' && ch != '\0')
{
strBox[pos++] = ch;
}
if(ch == '\0')
{
////
output("error", strBox);
}
else
{
strBox[pos] = '\0';
output("string", strBox);
pos = 0;
}
break;
}
case '+': {
state = 0;
ch = getChar(str);
switch (ch) {
case '+':
output("++", "-");
pos = 0;
break;
case '=':
output("+=", "-");
pos = 0;
break;
default:
retract(str);
output("+", "-");
pos = 0;
break;
}
}
case '-': {
state = 0;
ch = getChar(str);
switch (ch) {
case '-':
output("--", "-");
pos = 0;
break;
case '=':
output("-=", "-");
pos = 0;
break;
default:
retract(str);
output("-", "-");
pos = 0;
break;
}
}
case '=': {
state = 0;
ch = getChar(str);
switch (ch) {
case '=':
output("==", "-");
pos = 0;
break;
default:
retract(str);
output("=", "-");
pos = 0;
break;
}
break;
}
default: {
if(isNum(ch))
{
strBox[pos++] = ch;
state = 2;
}
else if(isLetter(ch) || ch == '_')
{
strBox[pos++] = ch;
state = 1;
}
break;
}
}
break;
}
case 1:
{
while(true)
{
ch = getChar(str);
if(isLetter(ch) || isNum(ch) || ch == '_')
strBox[pos++] = ch;
else
{
strBox[pos] = '\0';
int id = findKey(strBox);
if(id == 0)
output("id", strBox);
else
output("key", strBox);
retract(str);
pos = 0;
state = 0;
break;
}
}
break;
}
case 2:
{
while(true)
{
ch = getChar(str);
if(isNum(ch))
strBox[pos++] = ch;
else if(ch == '.')
{
strBox[pos++] = ch;
state = 3;
break;
}
else if(ch == 'E' || ch == 'e')
{
strBox[pos++] = ch;
state = 4;
break;
}
else
{
strBox[pos] = '\0';
output("number", strBox);
retract(str);
pos = 0;
state = 0;
break;
}
}
break;
}
case 3:
{
while(true)
{
ch = getChar(str);
if(isNum(ch))
strBox[pos++] = ch;
else if(ch == 'E' || ch == 'e')
{
strBox[pos++] = ch;
state = 4;
}
else
{
strBox[pos] = '\0';
output("number", strBox);
retract(str);
pos = 0;
state = 0;
break;
}
}
break;
}
case 4:
{
while(true)
{
ch = getChar(str);
if(isNum(ch))
strBox[pos++] = ch;
else
{
strBox[pos] = '\0';
output("number", strBox);
retract(str);
pos = 0;
state = 0;
break;
}
}
break;
}
default:
////
output("error", strBox);
break;
}
}
}
int main()
{
FILE* file = fopen(FILE_NAME, "r");
while(NULL != fgets(strBuffer, 1024, file))
{
scanner(strBuffer);
}
return 0;
}
测试结果
源程序
int main()
{
int a = 5;
for(int i=0; i<5; i++)
{
a++;
}
return 0;
}
输出结果
< key, int >
< id, main >
< (, - >
< ), - >
< {, - >
< key, int >
< id, a >
< =, - >
< number, 5 >
< key, for >
< (, - >
< key, int >
< id, i >
< =, - >
< number, 0 >
< id, i >
< number, 5 >
< id, i >
< ++, - >
< -, - >
< =, - >
< ), - >
< {, - >
< id, a >
< ++, - >
< -, - >
< =, - >
< }, - >
< key, return >
< number, 0 >
< }, - >
Program ended with exit code: 0