物联网设备的数据常常是一些结构体数据,即把结构体对象直接转成byte数组,我们实现了基于元数据动态解析这些结构体的功能。然后遇到了一个问题,每个结构体的字段信息很多,填写元数据成了一件很痛苦的事情。针对这个问题,我尝试通过解析C语言头文件来获取结构体的信息,减少人工的操作。

下面涉及一部分编译原理的内容,但是不需要学习过编译原理的知识。

以下代码放到GitHub上

geange/cstruct-go: https://github.com/geange/cstruct-go
98fe662591f27588446e01f4a920f9a9.png

词法分析(lexical analysis)

将头文件的数据转化成Token。
typedef struct
{
    s32 Longitude;
    s32 Latitude;
    s16 Height;
    u8  Speed;
    u8  Type;
    u8  HeadType;
    u8  HeightType;
    u16 Reserved2;
    u8  Param0;
    u8  Param1;
    u8  Param2;
    u8  Param3;
    u32 Reserved3;
} MSG_Waypoint_t;

上面是一个C语言结构体,token可以看作'typedef','struct','s32',';','}'等等。(其实这部分是有一些区别的,学过编译原理的应该明白,但是没学过可以先不理会,个人觉得影响不大)。

如果要将这一个结构体拆分成一个个Token,要怎么做?没错,就是遍历。因为头文件的语法超级简单,简单遍历即可(我目前没有对注释信息做处理,所以默认是没有注释的)。

定义一个Scanner接口

// Scanner turns the raw bytes of a C header into tokens.
type Scanner interface {
	// Format normalizes the input and strips comments (not written yet).
	Format() error
	// Fetch returns the byte at the current position.
	Fetch() (byte, error)
	// Next returns the byte at the current position and moves the cursor to
	// the next byte.
	Next() (byte, error)
	// ScanDigit scans a numeric token.
	ScanDigit() (Token, error)
	// ScanLetter scans a word token.
	ScanLetter() (Token, error)
	// Scan scans the whole input.
	Scan() ([]Token, error)
}

// NewScanner returns a Scanner positioned at the first byte of text.
// The slice is used as-is; it is not copied.
func NewScanner(text []byte) Scanner {
	s := new(scanner) // index starts at its zero value, 0
	s.bs = text
	return s
}

// NewScannerString returns a Scanner positioned at the first byte of text.
// The string is converted to a byte slice owned by the scanner.
func NewScannerString(text string) Scanner {
	s := &scanner{}
	s.bs = []byte(text)
	s.index = 0
	return s
}

// scanner holds the input bytes and the cursor used by the scanning methods.
type scanner struct {
	bs    []byte // input being scanned
	index int    // offset into bs of the current byte
}

// Format is meant to normalize the input and strip comments before scanning.
// It is not implemented yet. The method already has an error result, so it
// reports that through the error instead of panicking, which lets callers of
// the Scanner interface handle the situation.
func (s *scanner) Format() error {
	return errors.New("scanner: Format not implemented")
}

// Fetch peeks at the byte under the cursor without consuming it.
// It returns io.EOF once the cursor is at or past the end of the input.
func (s *scanner) Fetch() (byte, error) {
	if pos := s.index; pos < len(s.bs) {
		return s.bs[pos], nil
	}
	return 0, io.EOF
}

// Next consumes the byte under the cursor: it returns that byte and advances
// the cursor by one. At or past the end of the input it returns io.EOF and
// leaves the cursor where it is.
func (s *scanner) Next() (byte, error) {
	if s.index < len(s.bs) {
		cur := s.bs[s.index]
		s.index++
		return cur, nil
	}
	return 0, io.EOF
}

// ScanDigit scans a numeric literal starting at the cursor: a run of decimal
// digits containing at most one '.'. The cursor is left on the first byte
// that is not part of the literal.
//
// It returns a *DigitToken of type TInteger (value in intValue) when the
// literal has no '.', and one of type TFloatingPoint (value in floatValue)
// otherwise. A second '.' is an error, and so is text that strconv cannot
// convert. If no byte is available when ScanDigit is called, the error from
// Fetch is returned.
func (s *scanner) ScanDigit() (Token, error) {
	buf := make([]byte, 0, 10)
	seenPoint := false

scan:
	for {
		c, err := s.Fetch()
		if err != nil {
			if len(buf) == 0 {
				return nil, err
			}
			// The input ran out right after the literal. What has been
			// collected is still a complete number, so emit it; the cursor
			// has not moved, so the caller's next Fetch reports the same
			// condition again.
			break scan
		}

		switch {
		case isDigit(c):
		case c == '.' && !seenPoint:
			seenPoint = true
		case c == '.':
			return nil, errors.New("float number has not only one point")
		default:
			// First byte that does not belong to the number.
			break scan
		}

		buf = append(buf, c)
		if _, err := s.Next(); err != nil {
			return nil, err
		}
	}

	if seenPoint {
		num, err := strconv.ParseFloat(string(buf), 64)
		if err != nil {
			return nil, err
		}
		return &DigitToken{
			ttype:      TFloatingPoint,
			floatValue: num,
		}, nil
	}

	num, err := strconv.Atoi(string(buf))
	if err != nil {
		return nil, err
	}
	return &DigitToken{
		ttype:    TInteger,
		intValue: num,
	}, nil
}

// ScanLetter scans a word starting at the cursor: a run of letters and
// digits whose first byte is a letter. The cursor is left on the first byte
// that is not part of the word.
//
// A word that is listed in ReserveWords and is one of the recognized type or
// keyword names becomes a *ReservedWordToken carrying the matching token
// type. Every other word becomes a *LetterToken of type TString holding the
// word. A leading digit is an error. If no byte is available when ScanLetter
// is called, the error from Fetch is returned.
func (s *scanner) ScanLetter() (Token, error) {
	buf := make([]byte, 0, 5)

scan:
	for {
		c, err := s.Fetch()
		if err != nil {
			if len(buf) == 0 {
				return nil, err
			}
			// The input ran out right after the word. What has been
			// collected is still a complete token, so emit it; the cursor
			// has not moved, so the caller's next Fetch reports the same
			// condition again.
			break scan
		}

		switch {
		case isLetter(c):
		case isDigit(c):
			// Digits may continue a word but not start one.
			if len(buf) == 0 {
				return nil, errors.New("number not allow to be use as the first of the word")
			}
		default:
			// First byte that does not belong to the word.
			break scan
		}

		buf = append(buf, c)
		if _, err := s.Next(); err != nil {
			return nil, err
		}
	}

	word := string(buf)
	if IN(word, ReserveWords...) {
		switch word {
		case "s64":
			return &ReservedWordToken{ttype: TInt64}, nil
		case "s32":
			return &ReservedWordToken{ttype: TInt32}, nil
		case "s16":
			return &ReservedWordToken{ttype: TInt16}, nil
		case "s8":
			return &ReservedWordToken{ttype: TInt8}, nil
		case "u64":
			return &ReservedWordToken{ttype: TUint64}, nil
		case "u32":
			return &ReservedWordToken{ttype: TUint32}, nil
		case "u16":
			return &ReservedWordToken{ttype: TUint16}, nil
		case "u8":
			return &ReservedWordToken{ttype: TUint8}, nil
		case "float":
			return &ReservedWordToken{ttype: TFloat32}, nil
		case "double":
			return &ReservedWordToken{ttype: TFloat64}, nil
		case "type":
			return &ReservedWordToken{ttype: TType}, nil
		case "typedef":
			return &ReservedWordToken{ttype: TTypedef}, nil
		case "struct":
			return &ReservedWordToken{ttype: TStruct}, nil
		case "byte":
			return &ReservedWordToken{ttype: TByte}, nil
		}
	}
	// Not a reserved word: an identifier such as a field or struct name.
	return &LetterToken{ttype: TString, value: word}, nil
}

// Scan tokenizes the rest of the input, starting at the current cursor
// position, and returns the tokens in source order.
//
// Words and numbers are delegated to ScanLetter and ScanDigit. Spaces, tabs,
// carriage returns and newlines are skipped. Each of ';', '[', ']', '{' and
// '}' becomes a *DelimiterToken. Any other byte is reported as an error with
// its offset, leaving the cursor on that byte. Scanning ends when Fetch
// reports io.EOF; any other Fetch error is returned.
func (s *scanner) Scan() ([]Token, error) {
	tokens := make([]Token, 0, 10)
	for {
		b, err := s.Fetch()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}

		// token stays nil for input that produces no token (whitespace).
		var token Token
		switch {
		case isLetter(b):
			token, err = s.ScanLetter()
		case isDigit(b):
			token, err = s.ScanDigit()
		case b == ' ' || b == '\n' || b == '\t' || b == '\r':
			// Whitespace only separates tokens.
			_, err = s.Next()
		case b == ';':
			token = &DelimiterToken{ttype: TLineEnd}
			_, err = s.Next()
		case b == '[':
			token = &DelimiterToken{ttype: TLeftBracket}
			_, err = s.Next()
		case b == ']':
			token = &DelimiterToken{ttype: TRightBracket}
			_, err = s.Next()
		case b == '{':
			token = &DelimiterToken{ttype: TLeftBrace}
			_, err = s.Next()
		case b == '}':
			token = &DelimiterToken{ttype: TRightBrace}
			_, err = s.Next()
		default:
			// No case consumes this byte, so continuing would spin forever
			// on it; fail with its position instead.
			return nil, errors.New("unexpected character " + strconv.QuoteRune(rune(b)) +
				" at offset " + strconv.Itoa(s.index))
		}
		if err != nil {
			return nil, err
		}
		if token != nil {
			tokens = append(tokens, token)
		}
	}
	return tokens, nil
}

自此获取了全部的Token,下面可以解析结构体了。

这部分理解很简单,当获取一个Token是's32'时,获取下一个token是该字段的名字,这个时候就需要分2种情况处理。继续获取Token,如果是分号,说明这不是数组类型,如果获取的是中括号就是数组类型了。内嵌结构体的话需要在全局Map中记录结构体的信息,当遇到一个类型非系统预留字(不是s32/s16/float/double这种),那检查是否已经有结构体解析结果,没有解析结果就报错。
// Lexer walks a token sequence and assembles struct definitions from it.
type Lexer interface {
	// Fetch returns the current token.
	Fetch() (Token, error)
	// Next returns the current token and moves the cursor to the next token.
	Next() (Token, error)
	// Index returns the token at position n.
	Index(n int) (Token, error)
	// CurrentIndex returns the cursor position.
	CurrentIndex() int
	// Statement parses a declaration.
	Statement() ([]Field, error)
	// Structure parses a single struct.
	Structure() (*CStruct, error)
	// AllStructure parses all the structs of a header file.
	AllStructure() ([]*CStruct, error)
	// ExistStructure reports whether a struct with the given name exists.
	ExistStructure(name string) (*CStruct, bool)
}

// lexer holds the tokens being parsed, the cursor into them, and the structs
// recorded so far.
type lexer struct {
	tokens     []Token            // tokens to parse
	index      int                // position in tokens of the current token
	structures map[string]CStruct // recorded structs, keyed by struct name
}

// Fetch peeks at the token under the cursor without consuming it.
// It returns io.EOF once the cursor is at or past the end of the tokens.
func (l *lexer) Fetch() (Token, error) {
	if pos := l.index; pos < len(l.tokens) {
		return l.tokens[pos], nil
	}
	return nil, io.EOF
}

// Next consumes the token under the cursor: it returns that token and
// advances the cursor by one. At or past the end of the tokens it returns
// io.EOF and leaves the cursor where it is.
func (l *lexer) Next() (Token, error) {
	if l.index < len(l.tokens) {
		cur := l.tokens[l.index]
		l.index++
		return cur, nil
	}
	return nil, io.EOF
}

// Index returns the token at position n without touching the cursor.
// It returns io.EOF when n is outside the token slice. That includes a
// negative n, which would otherwise panic with an index out of range.
func (l *lexer) Index(n int) (Token, error) {
	if n < 0 || n >= len(l.tokens) {
		return nil, io.EOF
	}
	return l.tokens[n], nil
}

// CurrentIndex returns the cursor position, i.e. the index of the current
// token within the token slice.
func (l *lexer) CurrentIndex() int {
	return l.index
}

// Statement parses one field declaration at the cursor and returns the
// fields it expands to.
//
// It consumes the type token and the name token, then looks at the token
// that follows:
//
//   - ';' is a plain field. A primitive type yields a single Field. A type
//     of token type TString is treated as the name of a previously recorded
//     struct: its fields are returned with "<name>_" prefixed to their
//     names. The ';' is consumed.
//   - '[' is an array field, which is handed to arrayStatement.
//
// Errors are returned when the tokens run out, when the type is neither a
// primitive nor a struct name, when a referenced struct has not been
// recorded, and when the token after the name is neither ';' nor '['.
func (l *lexer) Statement() ([]Field, error) {
	dataTypeToken, err := l.Next()
	if err != nil {
		return nil, err
	}

	nameToken, err := l.Next()
	if err != nil {
		return nil, err
	}

	tToken, err := l.Fetch()
	if err != nil {
		return nil, err
	}

	// Map the type token onto a field type. primitive stays true only for
	// the token types listed here.
	var dType FieldType
	primitive := true
	switch dataTypeToken.Type() {
	case TInt64:
		dType = Int64
	case TInt32:
		dType = Int32
	case TInt16:
		dType = Int16
	case TInt8:
		dType = Int8
	case TUint64:
		dType = UInt64
	case TUint32:
		dType = UInt32
	case TUint16:
		dType = UInt16
	case TUint8:
		dType = UInt8
	case TFloat32:
		dType = Float32
	case TFloat64:
		dType = Float64
	case TByte:
		dType = Hex
	default:
		primitive = false
	}

	// A TString in type position names a struct.
	embedded := dataTypeToken.Type() == TString
	if !primitive && !embedded {
		// Report it rather than emit a field with a zero type.
		return nil, errors.New("unsupported field type")
	}

	result := make([]Field, 0)

	switch tToken.Type() {
	case TLineEnd:
		name, ok := nameToken.Value().(string)
		if !ok {
			return nil, errors.New("field name not found after field type")
		}

		if embedded {
			structName := dataTypeToken.Value().(string)
			cStruct, ok := l.ExistStructure(structName)
			if !ok {
				return nil, fmt.Errorf("%s struct not defined", structName)
			}
			fs, err := cStruct.ToStatement()
			if err != nil {
				return nil, errors.Wrap(err, "get statement from sub struct")
			}
			// Rename copies of the fields, so the slice returned by
			// ToStatement is left as it was.
			for _, f := range fs {
				f.Name = fmt.Sprintf("%s_%s", name, f.Name)
				result = append(result, f)
			}
		} else {
			result = append(result, Field{
				Name: name,
				Type: dType,
				Size: getFieldTypeSize(dType),
			})
		}

		// Consume the ';' that was only peeked at above.
		if _, err := l.Next(); err != nil {
			return nil, err
		}
		return result, nil

	case TLeftBracket:
		// NOTE(review): float (TFloat32) arrays were previously skipped
		// without consuming their tokens; they are now passed on like the
		// other primitives. Confirm arrayStatement handles TFloat32.
		var cStruct *CStruct
		if embedded {
			structName := dataTypeToken.Value().(string)
			found, ok := l.ExistStructure(structName)
			if !ok {
				return nil, fmt.Errorf("%s struct not defined", structName)
			}
			cStruct = found
		}
		fs, err := l.arrayStatement(dataTypeToken, nameToken, cStruct, embedded)
		if err != nil {
			return nil, err
		}
		return append(result, fs...), nil
	}

	// Neither ';' nor '[' follows the name. Stop here rather than return an
	// empty result and leave the caller to parse from the wrong token.
	return nil, errors.New("';' or '[' not found after field name")
}

// Structure parses one struct definition at the cursor. Two forms are
// accepted:
//
//	typedef struct { ... } Name;
//	struct Name { ... };
//
// The body between the braces is parsed by structureInBrace. An error is
// returned when the tokens run out, when the definition starts with neither
// 'typedef' nor 'struct', when '{' or the struct name is missing, and when
// the definition is not terminated by ';'.
func (l *lexer) Structure() (*CStruct, error) {
	token, err := l.Next()
	if err != nil {
		return nil, err
	}

	var (
		name string
		fs   []Field
	)

	switch token.Type() {
	case TTypedef:
		stToken, err := l.Next()
		if err != nil {
			return nil, err
		}
		if stToken.Type() != TStruct {
			return nil, errors.New("'struct' not found after typedef")
		}

		// Peek only: structureInBrace consumes the '{' itself.
		bToken, err := l.Fetch()
		if err != nil {
			return nil, err
		}
		if bToken.Type() != TLeftBrace {
			return nil, errors.New("'{' not found after typedef struct")
		}

		fs, err = l.structureInBrace()
		if err != nil {
			return nil, err
		}

		// In the typedef form the name follows the closing brace.
		nameToken, err := l.Next()
		if err != nil {
			return nil, err
		}
		if nameToken.Type() != TString {
			return nil, errors.New("structure type not found")
		}
		name = nameToken.Value().(string)

	case TStruct:
		// In the plain form the name precedes the opening brace.
		nameToken, err := l.Next()
		if err != nil {
			return nil, err
		}
		if nameToken.Type() != TString {
			return nil, errors.New("structure type not found")
		}
		name = nameToken.Value().(string)

		fs, err = l.structureInBrace()
		if err != nil {
			return nil, err
		}

	default:
		return nil, errors.New("struct format error")
	}

	endToken, err := l.Next()
	if err != nil {
		return nil, errors.Wrap(err, "';' not found after struct defined")
	}
	if endToken.Type() != TLineEnd {
		// err is nil on this path, so it must not be wrapped: wrapping a nil
		// error (as github.com/pkg/errors documents for Wrap) yields nil,
		// and the caller would receive a nil struct with a nil error.
		return nil, errors.New("';' not found after struct defined")
	}

	return &CStruct{
		Name:     name,
		FieldSet: FieldSet(fs),
	}, nil
}

// AllStructure parses struct definitions one after another until the tokens
// are exhausted, and returns them in source order. Each parsed struct is
// also recorded in l.structures under its name, which is what lets a later
// struct refer to an earlier one.
//
// The first error stops parsing and is returned. At least one definition is
// expected: with no tokens at all, the error from Structure is returned.
func (l *lexer) AllStructure() ([]*CStruct, error) {
	// Writing to a nil map panics, so make sure the registry exists.
	if l.structures == nil {
		l.structures = make(map[string]CStruct)
	}

	result := make([]*CStruct, 0)
	for {
		cStruct, err := l.Structure()
		if err != nil {
			return nil, err
		}
		if cStruct == nil {
			// Guard the dereference below against a nil struct returned
			// together with a nil error.
			return nil, errors.New("struct format error")
		}
		result = append(result, cStruct)
		l.structures[cStruct.Name] = *cStruct

		// Peek: stop cleanly at the end of the tokens, fail on anything else.
		if _, err := l.Fetch(); err != nil {
			if err == io.EOF {
				break
			}
			return nil, err
		}
	}
	return result, nil
}

// ExistStructure looks up a recorded struct by name. The boolean reports
// whether it was found. The returned pointer is never nil: it refers to a
// copy of the recorded value, or to a zero CStruct when the name is unknown.
func (l *lexer) ExistStructure(name string) (*CStruct, bool) {
	if found, ok := l.structures[name]; ok {
		return &found, true
	}
	return new(CStruct), false
}

// structureInBrace parses a struct body: a '{', any number of field
// declarations, and the closing '}'. Both braces are consumed. The fields of
// all declarations are returned in order; an empty body yields an empty
// slice.
//
// An error is returned when the first token is not '{', when a declaration
// fails to parse, and when the tokens run out before the closing '}'.
func (l *lexer) structureInBrace() ([]Field, error) {
	token, err := l.Next()
	if err != nil {
		return nil, err
	}
	if token.Type() != TLeftBrace {
		return nil, errors.New("'{' not found")
	}

	result := make([]Field, 0)

	for {
		// Look for the closing brace before parsing a declaration, so that
		// an empty body "{}" is not handed to Statement.
		token, err := l.Fetch()
		if err != nil {
			return nil, err
		}
		if token.Type() == TRightBrace {
			if _, err := l.Next(); err != nil {
				return nil, err
			}
			return result, nil
		}

		fs, err := l.Statement()
		if err != nil {
			return nil, err
		}
		result = append(result, fs...)
	}
}

虽然也不指望各位能看完。。。