物联网设备上报的数据常常是结构体数据,即把结构体对象直接转成byte数组。我们已经实现了基于元数据动态解析这些结构体,但随之遇到了一个问题:每个结构体的字段信息很多,手工填写元数据成了一件很痛苦的事情。针对这个问题,我尝试通过解析C语言头文件来获取结构体的信息,减少人工操作。
下面涉及一部分编译原理的内容,但是不需要学习过编译原理的知识。
以下代码已放到GitHub上:
geange/cstruct-go(github.com/geange/cstruct-go)
词法分析(lexical analysis)
将头文件的数据转化成Token。
typedef struct
{
s32 Longitude;
s32 Latitude;
s16 Height;
u8 Speed;
u8 Type;
u8 HeadType;
u8 HeightType;
u16 Reserved2;
u8 Param0;
u8 Param1;
u8 Param2;
u8 Param3;
u32 Reserved3;
} MSG_Waypoint_t;
上面是一个C语言结构体,token可以看作'typedef','struct','s32',';','}'等等。(其实这部分是有一些区别的,学过编译原理的应该明白,但是没学过可以先不理会,个人觉得影响不大)。
如果要将这个结构体拆分成一个个Token,要怎么做?没错,就是遍历。因为头文件的语法超级简单,简单遍历即可(我目前没有对注释信息做处理,所以默认输入中是没有注释的)。
定义一个Scanner接口
// Scanner walks over the raw bytes of a header file and turns them into tokens.
type Scanner interface {
// Format is meant to normalize the input and strip comments (not written yet).
Format() error
// Fetch returns the byte at the current position.
Fetch() (byte, error)
// Next returns the byte at the current position and moves the cursor to the next byte.
Next() (byte, error)
// ScanDigit scans a numeric token.
ScanDigit() (Token, error)
// ScanLetter scans a word token.
ScanLetter() (Token, error)
// Scan scans the whole input.
Scan() ([]Token, error)
}
// NewScanner returns a Scanner over text with its cursor on the first byte.
// The slice is stored as given, not copied.
func NewScanner(text []byte) Scanner {
	sc := new(scanner)
	sc.bs = text
	sc.index = 0
	return sc
}
// NewScannerString returns a Scanner over the bytes of text with its cursor
// on the first byte.
func NewScannerString(text string) Scanner {
	sc := new(scanner)
	sc.bs = []byte(text)
	return sc
}
// scanner is the Scanner implementation returned by NewScanner and NewScannerString.
type scanner struct {
// bs is the input being scanned.
bs []byte
// index is the cursor: the offset in bs of the byte that Fetch/Next return.
index int
}
// Format is an unimplemented placeholder: it never returns and always
// panics with "implement me".
func (s *scanner) Format() error {
panic("implement me")
}
// Fetch peeks at the byte under the cursor without moving it.
// It returns io.EOF once the cursor has passed the last byte.
func (s *scanner) Fetch() (byte, error) {
	if s.index < len(s.bs) {
		return s.bs[s.index], nil
	}
	return 0, io.EOF
}
// Next returns the byte under the cursor and then advances the cursor by one.
// At the end of the input it returns io.EOF and leaves the cursor where it is.
func (s *scanner) Next() (byte, error) {
	pos := s.index
	if pos >= len(s.bs) {
		return 0, io.EOF
	}
	s.index = pos + 1
	return s.bs[pos], nil
}
// ScanDigit scans a numeric literal starting at the cursor and leaves the
// cursor on the first byte that is not part of it.
//
// A literal is a run of digits with at most one '.'. Without a '.', the
// result is a DigitToken of type TInteger carrying intValue; with one, it is
// a DigitToken of type TFloatingPoint carrying floatValue.
//
// Reaching the end of the input terminates the literal, so a number that is
// the very last thing in the input is still returned. An error is returned
// when a second '.' is met, when there is nothing to scan (io.EOF on empty
// remaining input), or when strconv rejects the collected text.
func (s *scanner) ScanDigit() (Token, error) {
	buf := make([]byte, 0, 10)
	isFloat := false

scan:
	for {
		b, err := s.Fetch()
		if err != nil {
			// End of input after at least one byte just ends the literal.
			if err == io.EOF && len(buf) > 0 {
				break
			}
			return nil, err
		}

		switch {
		case isDigit(b):
		case b == '.':
			if isFloat {
				return nil, errors.New("float number has not only one point")
			}
			isFloat = true
		default:
			// First byte that does not belong to the number.
			break scan
		}

		buf = append(buf, b)
		if _, err := s.Next(); err != nil {
			return nil, err
		}
	}

	if isFloat {
		num, err := strconv.ParseFloat(string(buf), 64)
		if err != nil {
			return nil, err
		}
		return &DigitToken{
			ttype:      TFloatingPoint,
			floatValue: num,
		}, nil
	}

	num, err := strconv.Atoi(string(buf))
	if err != nil {
		return nil, err
	}
	return &DigitToken{
		ttype:    TInteger,
		intValue: num,
	}, nil
}
// ScanLetter scans one word starting at the cursor and leaves the cursor on
// the first byte that is not part of it.
//
// A word is a letter followed by any mix of letters and digits; a word that
// would start with a digit is rejected. If the word is listed in
// ReserveWords and is one of the names handled below, a ReservedWordToken of
// the matching type is returned. Otherwise the result is a LetterToken of
// type TString carrying the word.
//
// Reaching the end of the input terminates the word, so a word that is the
// very last thing in the input is still returned. io.EOF is returned only
// when there is nothing left to scan at all.
func (s *scanner) ScanLetter() (Token, error) {
	buf := make([]byte, 0, 5)
	for {
		b, err := s.Fetch()
		if err != nil {
			// End of input after at least one byte just ends the word.
			if err == io.EOF && len(buf) > 0 {
				break
			}
			return nil, err
		}

		letter, digit := isLetter(b), isDigit(b)
		if !letter && digit && len(buf) == 0 {
			return nil, errors.New("number not allow to be use as the first of the word")
		}
		if !letter && !digit {
			// end of word
			break
		}

		buf = append(buf, b)
		if _, err := s.Next(); err != nil {
			return nil, err
		}
	}

	word := string(buf)
	if IN(word, ReserveWords...) {
		switch word {
		case "s64":
			return &ReservedWordToken{ttype: TInt64}, nil
		case "s32":
			return &ReservedWordToken{ttype: TInt32}, nil
		case "s16":
			return &ReservedWordToken{ttype: TInt16}, nil
		case "s8":
			return &ReservedWordToken{ttype: TInt8}, nil
		case "u64":
			return &ReservedWordToken{ttype: TUint64}, nil
		case "u32":
			return &ReservedWordToken{ttype: TUint32}, nil
		case "u16":
			return &ReservedWordToken{ttype: TUint16}, nil
		case "u8":
			return &ReservedWordToken{ttype: TUint8}, nil
		case "float":
			return &ReservedWordToken{ttype: TFloat32}, nil
		case "double":
			return &ReservedWordToken{ttype: TFloat64}, nil
		case "type":
			return &ReservedWordToken{ttype: TType}, nil
		case "typedef":
			return &ReservedWordToken{ttype: TTypedef}, nil
		case "struct":
			return &ReservedWordToken{ttype: TStruct}, nil
		case "byte":
			return &ReservedWordToken{ttype: TByte}, nil
		}
	}
	// Not reserved (or reserved without a dedicated token type): plain word.
	return &LetterToken{ttype: TString, value: word}, nil
}
// Scan tokenizes the input from the cursor to the end and returns the tokens
// in source order.
//
// Words are delegated to ScanLetter and numbers to ScanDigit. Spaces, tabs,
// carriage returns and newlines are skipped. The bytes ';', '[', ']', '{' and
// '}' become DelimiterTokens of type TLineEnd, TLeftBracket, TRightBracket,
// TLeftBrace and TRightBrace.
//
// Any other byte is reported as an error: silently ignoring it would leave
// the cursor in place and the loop would never terminate.
func (s *scanner) Scan() ([]Token, error) {
	tokens := make([]Token, 0, 10)
	for {
		b, err := s.Fetch()
		if err == io.EOF {
			return tokens, nil
		}
		if err != nil {
			return nil, err
		}

		switch {
		case isLetter(b):
			token, err := s.ScanLetter()
			if err != nil {
				return nil, err
			}
			tokens = append(tokens, token)
			continue
		case isDigit(b):
			token, err := s.ScanDigit()
			if err != nil {
				return nil, err
			}
			tokens = append(tokens, token)
			continue
		}

		// Everything below is a single-byte lexeme. Fetch just succeeded at
		// this position, so consuming that byte cannot fail.
		_, _ = s.Next()

		switch b {
		case ' ', '\n', '\t', '\r':
			// whitespace separates tokens and produces none
		case ';':
			tokens = append(tokens, &DelimiterToken{ttype: TLineEnd})
		case '[':
			tokens = append(tokens, &DelimiterToken{ttype: TLeftBracket})
		case ']':
			tokens = append(tokens, &DelimiterToken{ttype: TRightBracket})
		case '{':
			tokens = append(tokens, &DelimiterToken{ttype: TLeftBrace})
		case '}':
			tokens = append(tokens, &DelimiterToken{ttype: TRightBrace})
		default:
			// s.index already points past the offending byte.
			return nil, errors.New("unexpected character " + strconv.Quote(string([]byte{b})) +
				" at offset " + strconv.Itoa(s.index-1))
		}
	}
}
至此获取了全部的Token,下面可以解析结构体了。
这部分理解起来很简单:当获取到的一个Token是's32'时,下一个Token就是该字段的名字,这个时候需要分2种情况处理。继续获取Token,如果是分号,说明这不是数组类型;如果是中括号,就是数组类型了。内嵌结构体的话,需要在全局Map中记录结构体的信息;当遇到一个类型不是系统预留字(不是s32/s16/float/double这种)时,就检查是否已经有该结构体的解析结果,没有解析结果就报错。
// Lexer walks over the token list produced by a Scanner and assembles
// struct definitions from it.
type Lexer interface {
// Fetch returns the current token.
Fetch() (Token, error)
// Next returns the current token and moves the cursor to the next token.
Next() (Token, error)
// Index returns the token at position n.
Index(n int) (Token, error)
// CurrentIndex returns the current cursor position.
CurrentIndex() int
// Statement parses a declaration.
Statement() ([]Field, error)
// Structure parses a single struct.
Structure() (*CStruct, error)
// AllStructure parses the whole header file.
AllStructure() ([]*CStruct, error)
// ExistStructure reports whether a struct with the given name is known.
ExistStructure(name string) (*CStruct, bool)
}
// lexer holds a token list, a cursor into it, and the structs parsed so far.
type lexer struct {
// tokens is the token list being parsed.
tokens []Token
// index is the cursor: the position in tokens that Fetch/Next return.
index int
// structures maps a struct name to its parsed definition; AllStructure
// fills it and ExistStructure reads it.
structures map[string]CStruct
}
// Fetch peeks at the token under the cursor without moving it.
// It returns io.EOF once the cursor has passed the last token.
func (l *lexer) Fetch() (Token, error) {
	if l.index < len(l.tokens) {
		return l.tokens[l.index], nil
	}
	return nil, io.EOF
}
// Next returns the token under the cursor and then advances the cursor by
// one. At the end of the list it returns io.EOF and leaves the cursor alone.
func (l *lexer) Next() (Token, error) {
	pos := l.index
	if pos >= len(l.tokens) {
		return nil, io.EOF
	}
	l.index = pos + 1
	return l.tokens[pos], nil
}
// Index returns the token at absolute position n without touching the
// cursor. Any position outside the token list, including a negative one,
// yields io.EOF instead of an out-of-range panic.
func (l *lexer) Index(n int) (Token, error) {
	if n < 0 || n >= len(l.tokens) {
		return nil, io.EOF
	}
	return l.tokens[n], nil
}
// CurrentIndex returns the cursor position, i.e. the index of the token
// that Fetch and Next would return.
func (l *lexer) CurrentIndex() int {
return l.index
}
// Statement parses one field declaration at the cursor and returns the
// fields it expands to.
//
// It consumes the data-type token and the name token, then looks at the
// token that follows:
//
//   - ';' — a scalar declaration. A built-in type yields a single Field. A
//     previously parsed struct type is flattened into its own fields, each
//     renamed to "<name>_<subfield>". The ';' is consumed.
//   - '[' — an array declaration, delegated to arrayStatement, for built-in
//     types (including float) and for previously parsed struct types.
//   - anything else is an error.
//
// An error is also returned when the input ends early, when the name token
// does not carry a string, when the data type is neither built-in nor a
// word, or when a struct type has not been defined yet.
func (l *lexer) Statement() ([]Field, error) {
	dataTypeToken, err := l.Next()
	if err != nil {
		return nil, err
	}
	nameToken, err := l.Next()
	if err != nil {
		return nil, err
	}
	tToken, err := l.Fetch()
	if err != nil {
		return nil, err
	}

	// Map built-in type tokens onto field types; everything else is either a
	// struct name (TString) or unsupported.
	var dType FieldType
	primitive := true
	switch dataTypeToken.Type() {
	case TInt64:
		dType = Int64
	case TInt32:
		dType = Int32
	case TInt16:
		dType = Int16
	case TInt8:
		dType = Int8
	case TUint64:
		dType = UInt64
	case TUint32:
		dType = UInt32
	case TUint16:
		dType = UInt16
	case TUint8:
		dType = UInt8
	case TFloat32:
		dType = Float32
	case TFloat64:
		dType = Float64
	case TByte:
		dType = Hex
	default:
		primitive = false
	}
	isStruct := dataTypeToken.Type() == TString

	result := make([]Field, 0)
	switch tToken.Type() {
	case TLineEnd:
		// Scalar declaration.
		name, ok := nameToken.Value().(string)
		if !ok {
			return nil, errors.New("field name not found in declaration")
		}
		switch {
		case isStruct:
			// Embedded struct: flatten its fields into this one.
			structName, _ := dataTypeToken.Value().(string)
			cStruct, ok := l.ExistStructure(structName)
			if !ok {
				return nil, fmt.Errorf("%s struct not defined", structName)
			}
			fs, err := cStruct.ToStatement()
			if err != nil {
				return nil, errors.Wrap(err, "get statement from sub struct")
			}
			for i := range fs {
				fs[i].Name = fmt.Sprintf("%s_%s", name, fs[i].Name)
			}
			result = append(result, fs...)
		case primitive:
			result = append(result, Field{
				Name: name,
				Type: dType,
				Size: getFieldTypeSize(dType),
			})
		default:
			return nil, fmt.Errorf("unsupported data type for field %s", name)
		}
		// Consume the ';' that Fetch already showed to be present.
		if _, err := l.Next(); err != nil {
			return nil, err
		}
		return result, nil

	case TLeftBracket:
		// Array declaration.
		var fs []Field
		switch {
		case isStruct:
			structName, _ := dataTypeToken.Value().(string)
			cStruct, ok := l.ExistStructure(structName)
			if !ok {
				return nil, fmt.Errorf("%s struct not defined", structName)
			}
			fs, err = l.arrayStatement(dataTypeToken, nameToken, cStruct, true)
		case primitive:
			fs, err = l.arrayStatement(dataTypeToken, nameToken, nil, false)
		default:
			return nil, errors.New("unsupported data type for array field")
		}
		if err != nil {
			return nil, err
		}
		return append(result, fs...), nil
	}

	return nil, errors.New("';' or '[' not found after field name")
}
// Structure parses one struct definition at the cursor and returns it.
//
// Two spellings are accepted:
//
//	typedef struct { ...fields... } Name;
//	struct Name { ...fields... };
//
// The fields between the braces are parsed by structureInBrace. An error is
// returned when the input ends early, when the definition starts with any
// other token, when a required keyword, brace or name is missing, or when
// the closing ';' is missing.
func (l *lexer) Structure() (*CStruct, error) {
	token, err := l.Next()
	if err != nil {
		return nil, err
	}

	var (
		name string
		fs   []Field
	)
	switch token.Type() {
	case TTypedef:
		stToken, err := l.Next()
		if err != nil {
			return nil, err
		}
		if stToken.Type() != TStruct {
			return nil, errors.New("'struct' not found after typedef")
		}
		bToken, err := l.Fetch()
		if err != nil {
			return nil, err
		}
		if bToken.Type() != TLeftBrace {
			return nil, errors.New("'{' not found after typedef struct")
		}
		fs, err = l.structureInBrace()
		if err != nil {
			return nil, err
		}
		// In the typedef form the name follows the closing brace.
		nameToken, err := l.Next()
		if err != nil {
			return nil, err
		}
		if nameToken.Type() != TString {
			return nil, errors.New("structure type not found")
		}
		name = nameToken.Value().(string)

	case TStruct:
		// In the plain form the name precedes the opening brace.
		nameToken, err := l.Next()
		if err != nil {
			return nil, err
		}
		if nameToken.Type() != TString {
			return nil, errors.New("structure type not found")
		}
		fs, err = l.structureInBrace()
		if err != nil {
			return nil, err
		}
		name = nameToken.Value().(string)

	default:
		return nil, errors.New("struct format error")
	}

	endToken, err := l.Next()
	if err != nil {
		return nil, errors.Wrap(err, "';' not found after struct defined")
	}
	if endToken.Type() != TLineEnd {
		// There is no underlying error to wrap here; wrapping a nil error
		// would produce a nil error and hide the failure from the caller.
		return nil, errors.New("';' not found after struct defined")
	}

	return &CStruct{
		Name:     name,
		FieldSet: FieldSet(fs),
	}, nil
}
// AllStructure parses struct definitions one after another until the token
// list is exhausted and returns them in source order.
//
// Every parsed struct is also recorded by name in l.structures, so that
// later definitions can look it up through ExistStructure. The map is
// created on first use, which lets a lexer built without it work too.
//
// The first parse error aborts the whole run. An empty token list is
// reported as the io.EOF coming from Structure.
func (l *lexer) AllStructure() ([]*CStruct, error) {
	if l.structures == nil {
		// Writing to a nil map panics.
		l.structures = make(map[string]CStruct)
	}

	result := make([]*CStruct, 0)
	for {
		cStruct, err := l.Structure()
		if err != nil {
			return nil, err
		}
		if cStruct == nil {
			// Defensive: never dereference a missing result.
			return nil, errors.New("struct format error")
		}
		result = append(result, cStruct)
		l.structures[cStruct.Name] = *cStruct

		if _, err := l.Fetch(); err == io.EOF {
			break
		}
	}
	return result, nil
}
// ExistStructure looks up a previously recorded struct by name.
// The returned pointer refers to a copy of the stored definition. When the
// name is unknown, the boolean is false and the pointer refers to an empty
// CStruct rather than being nil.
func (l *lexer) ExistStructure(name string) (*CStruct, bool) {
	if found, ok := l.structures[name]; ok {
		return &found, true
	}
	return new(CStruct), false
}
// structureInBrace parses a brace-delimited struct body at the cursor and
// returns the fields of all declarations inside it, in order.
//
// The cursor must be on '{'. Declarations are parsed with Statement until
// the matching '}' is found, which is consumed as well. The closing brace is
// checked before each declaration, so an empty body "{}" yields no fields
// instead of being mistaken for a declaration.
//
// An error is returned when the '{' is missing, when a declaration fails to
// parse, or when the input ends before the '}'.
func (l *lexer) structureInBrace() ([]Field, error) {
	open, err := l.Next()
	if err != nil {
		return nil, err
	}
	if open.Type() != TLeftBrace {
		return nil, errors.New("'{' not found")
	}

	result := make([]Field, 0)
	for {
		token, err := l.Fetch()
		if err != nil {
			return nil, err
		}
		if token.Type() == TRightBrace {
			// Fetch just succeeded at this position, so Next cannot fail.
			_, _ = l.Next()
			return result, nil
		}

		fs, err := l.Statement()
		if err != nil {
			return nil, err
		}
		result = append(result, fs...)
	}
}
虽然也不指望各位能看完。。。