Go 学习笔记（31）— 字符串 string

1. 字符串 string 类型

GoUTF-8runeUTF-8GoASCII

字符串是常量，可以通过类似数组索引访问其字节单元，但是不能修改某个字节的值；

var a string = "hello,world"
b := a[0]
a[1] = "a" // error

lenrune

s := "hello, world"
fmt.Println(len(s))     // "12"
fmt.Println(s[0], s[7]) // "104 119" ('h' and 'w')

ASCIIUTF8

[]byte(s)NULLC/C++

// runtime/string.go

// stringStruct is the runtime's internal layout of a string value, as quoted
// from runtime/string.go.
type stringStruct struct {
	// str points to the underlying byte array.
	str unsafe.Pointer
	// len is the length of that byte array.
	len int
}

stringsliceUnicode

a := "hello , 世界!"
b := []byte(a)
c :=[]rune(a)

len()lenGolanglen

var str string	//string 类型变量在定义后默认的初始值是空，不是 nil。
if str == "" {
    // str 为空
}

或者

var str string
if len(str) == 0 {
    // str 为空
}

对于简单而少量的拼接，使用运算符＋和＋＝的效果虽然很好，但随着拼接操作次数的增加，这种做法的效率并不高。如果需要在循环中拼接字符串，则使用空的字节缓冲区来拼接的效率更高。

// main appends "hello,world" to an in-memory byte buffer 500 times and then
// prints the accumulated text with a single call.
func main() {
	const repeats = 500

	var buf bytes.Buffer
	for n := 0; n < repeats; n++ {
		buf.WriteString("hello,world")
	}

	// String returns everything written to the buffer so far as a string.
	joined := buf.String()
	fmt.Println(joined)
}

&str[i]""

package main

import "fmt"

// main prints a multi-line raw string literal.
func main() {
	// Between back quotes, line breaks become part of the string and escape
	// sequences such as \r and \n are not interpreted, so the text is printed
	// exactly as it is written here.
	text := `
			first line,
			second line,
			third line,
			\r	
			\n
	`
	fmt.Println(text)
}

输出：

			first line,
			second line,
			third line,
			\r
			\n

示例代码如下：

package main

import "fmt"

// main demonstrates byte indexing, slicing, []byte and []rune conversion, len
// and concatenation on two ASCII strings, printing each result (and, for the
// first four, its dynamic type).
func main() {
	s, a := "hello", ",world"

	// Indexing yields a single byte.
	first := s[0]
	// Slicing yields a new string made of the first three bytes.
	prefix := s[:3]
	asBytes := []byte(s)
	asRunes := []rune(s)
	// len reports the number of bytes.
	sLength := len(s)
	joined := s + a

	fmt.Printf("s1 is %v, s1 type is %T\n", first, first)
	fmt.Printf("s2 is %v, s2 type is %T\n", prefix, prefix)
	fmt.Printf("s3 is %v, s3 type is %T\n", asBytes, asBytes)
	fmt.Printf("s4 is %v, s4 type is %T\n", asRunes, asRunes)
	fmt.Printf("sLength is %v \n", sLength)
	fmt.Printf("c is %v \n", joined)
}

输出结果：

s1 is 104, s1 type is uint8
s2 is hel, s2 type is string
s3 is [104 101 108 108 111], s3 type is []uint8
s4 is [104 101 108 108 111], s4 type is []int32
sLength is 5 
c is hello,world

字符串值也可以用字符串面值方式编写，只要将一系列字节序列包含在双引号内即可：

"hello, 世界"

GoUTF8GoUTF8Unicode

\ASCII

\a      响铃
\b      退格
\f      换页
\n      换行
\r      回车
\t      制表符
\v      垂直制表符
\'      单引号（只用在 '\'' 形式的rune符号面值中）
\"      双引号（只用在 "..." 形式的字符串面值中）
\\      反斜杠

可以通过十六进制或八进制转义在字符串面值中包含任意的字节。

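一个十六进制的转义形式是 \xhh，其中两个 h 表示十六进制数字（大写或小写都可以）。一个八进制转义形式是 \ooo，包含三个八进制的 o 数字（0 到 7），但是不能超过 \377（对应一个字节的范围，十进制为 255）。每一个单一的字节表达一个特定的值。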

len

对于切片、字典、数组、通道类型的变量，它们中每一个元素就是一个长度；
对于 string 类型变量，它们每一个字节是一个长度；
对于 rune 类型切片变量，它们每一个字符是一个长度；每个 rune 保存的是一个 Unicode 码点（int32，固定占 4 个字节），而不是 UTF-8 编码。同一个字符在 string 中以 UTF-8 编码存储时，可能占用 1 到 4 个字节；

2. 字符 rune 类型

GoGo

uint8bytebyteuint8ASCIIruneUTF-8runeruneint32

runeint32runeUTF-8

GobyteUTF-8byteUTF-8

GoruneUTF-8

package main

import "fmt"

// main walks the same two-character string twice: once as a rune slice and
// once as a plain string, so the two traversals can be compared.
func main() {
	text := "中国"
	runes := []rune(text)

	rangeRune(runes)
	rangeStr(text)
}

// rangeRune prints the number of runes in arg and then each rune, formatted
// as a character, together with its index.
func rangeRune(arg []rune) {
	fmt.Println("rune type arg length is ", len(arg))
	for idx, r := range arg {
		fmt.Printf("i is %d, value is %c\n", idx, r)
	}
}

// rangeStr prints the length of arg in bytes and then every single byte,
// formatted with %c, together with its byte index.
func rangeStr(arg string) {
	size := len(arg)
	fmt.Println("str type arg length is ", size)

	// Deliberately index byte by byte (not "range arg"): a multi-byte
	// character is therefore printed as several unrelated symbols.
	for pos := 0; pos < size; pos++ {
		b := arg[pos]
		fmt.Printf("i is %d, value is %c\n", pos, b)
	}
}

输出结果：

rune type arg length is  2
i is 0, value is 中
i is 1, value is 国
str type arg length is  6
i is 0, value is ä
i is 1, value is ¸
i is 2, value is 
i is 3, value is å
i is 4, value is ›
i is 5, value is ½

byterune

rune

3. 字节 byte 类型

byte

// b1 is declared as byte and holds the zero value 0.
var b1 byte
// No explicit type: 'c' is an untyped rune constant, so b2 gets the default
// type rune (int32), NOT byte.
var b2 = 'c'
// The explicit type makes b3 a byte holding 99, the code of 'c'.
var b3 byte = 'c'
// A short declaration also infers rune (int32) for b4, NOT byte.
b4 := 'c'

bytebyteuint8

// main prints "hello 世界" twice, one element per line: first as code points,
// then as UTF-8 bytes, with an empty line between the two listings.
func main() {
	s := "hello 世界"

	// Code-point view: 8 runes.
	// Prints 104 101 108 108 111 32 19990 30028.
	for _, r := range []rune(s) {
		fmt.Println(r)
	}
	fmt.Println()

	// Byte view: 12 bytes.
	// Prints 104 101 108 108 111 32 228 184 150 231 149 140.
	for _, b := range []byte(s) {
		fmt.Println(b)
	}
}

GoUTF-8rune[]runeruneUTF-8

[]bytebytebyte[]byte[]rune

runebyte

4. UTF-8 和 Unicode 有何区别

UnicodeASCII

UnicodeUnicodeUnicodeGoruneruneint32

int32UTF-32UCS-4UnicodeASCIIASCII

UTF8UnicodeUTF8UnicodeASCII

UTF-16编码存在一定的问题：无论是ASCII中定义的英文字符，还是常用的中文字符，它都至少采用2个字节来存储（基本多文种平面之外的字符则需要用4个字节的代理对表示）。这样一来，编码号比较小的字符（如英文字母）的许多高位都为0（如字母t：00000000 01110100），浪费了存储空间。

UTF-16、UTF-8、还有其他五花八门的编码存储方式，都是Unicode的底层存储实现。用编程范式的语言来描述：Unicode是接口，定义了有哪些映射规则；而UTF-8、UTF-16则是Unicode这个接口的实现，它们在计算机底层实现了这些映射规则。

ASCIIASCIIASCIIUnicode

0xxxxxxx                             runes 0-127    (ASCII)
110xxxxx 10xxxxxx                    128-2047       (values <128 unused)
1110xxxx 10xxxxxx 10xxxxxx           2048-65535     (values <2048 unused)
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  65536-0x10ffff (other values unused)

UTF8

UTF8ASCIIGBKUTF8UnicodeUTF8NULLNULL

GoUTF8GoUTF8unicoderuneunicode/utf8runeUTF8

GoUnicodeUnicode

Unicode 转义有两种形式：\uhhhh 对应 16bit 的码点值，\Uhhhhhhhh 对应 32bit 的码点值。

其中 h 是一个十六进制数字，一般很少需要使用 32bit 的形式。每一种转义形式都表示对应码点的 UTF8 编码。例如，下面的字符串面值都表示相同的值：

"世界"
"\xe4\xb8\x96\xe7\x95\x8c"
"\u4e16\u754c"
"\U00004e16\U0000754c"

上面三个转义序列都为第一个字符串提供替代写法，但是它们的值都是相同的。

Unicode 转义也可以使用在 rune 字符面值中。下面三个字符是等价的：

'世' '\u4e16' '\U00004e16'

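对于小于 256 的码点值，可以写在一个十六进制转义字节中，例如 \x41 对应字符 'A'；但是对于更大的码点，则必须使用 \u 或 \U 转义形式。因此，\xe4\xb8\x96 并不是一个合法的 rune 字符，虽然这三个字节对应一个有效的 UTF8 编码的码点。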

UTF8Unicode

import "unicode/utf8"
s := "Hello, 世界"
fmt.Println(len(s))                    // "13"
fmt.Println(utf8.RuneCountInString(s)) // "9"

为了逐个处理这些字符，我们需要一个 UTF8 解码器。unicode/utf8 包提供了该功能，我们可以这样使用：

for i := 0; i < len(s); {
    r, size := utf8.DecodeRuneInString(s[i:])
    fmt.Printf("%d\t%c\n", i, r)
    i += size
}

字符集为每个字符分配一个唯一的 ID，我们使用到的所有字符在 Unicode 字符集中都有一个唯一的 ID，例如上面例子中的 a 在 Unicode 与 ASCII 中的编码都是 97。汉字“你”在 Unicode 中的编码为 20320，在不同国家的字符集中，字符所对应的 ID 也会不同。而无论任何情况下，Unicode 中的字符的 ID 都是不会变化的。

UTF-8 是编码规则，将 Unicode 中字符的 ID 以某种方式进行编码。UTF-8 是一种变长编码规则，从 1 到 4 个字节不等。编码规则如下：

0xxxxxxx 表示文字符号 0～127，兼容 ASCII 字符集。
从 128 到 0x10ffff 表示其他字符。

根据这个规则，拉丁文语系的字符编码一般情况下每个字符占用一个字节，而中文每个字符占用 3 个字节。

广义的 Unicode 指的是一个标准，它定义了字符集及编码规则，即 Unicode 字符集和 UTF-8、UTF-16 编码等。

5. golang 中获取字符串长度

5.1 不同编码字符串定义

UNICODEUTF-8

因此，字符串长度的获得，不等于按字节数查找，而要根据不同字符编码查找。

5.2 获取字符串长度的方法

golanglen()len()

bytes.Count()strings.Count()[]runelenutf8.RuneCountInString()

len()

package main

import (
	"bytes"
	"fmt"
	"strings"
	"unicode/utf8"
)

// main prints the length of "hello,您好" measured six ways: twice in bytes
// (len of the string and len of its []byte form) and four times in characters
// (through the helpers f1, f2, f3 and f4).
func main() {
	s := "hello,您好"

	// Byte counts: 6 ASCII bytes plus 3 bytes for each Chinese character.
	sLength := len(s)
	fmt.Println(sLength)        // 12
	fmt.Println(len([]byte(s))) // 12

	// Character counts, one per counting technique.
	byteLength := f1(s)
	fmt.Println(byteLength) // 8

	stringLength := f2(s)
	fmt.Println(stringLength) // 8

	runeLength := f3(s)
	fmt.Println(runeLength) // 8

	utfLength := f4(s)
	fmt.Println(utfLength) // 8
}

// f1 returns the number of characters in s using bytes.Count. With an empty
// separator Count reports one more than the number of UTF-8 encoded code
// points, so one is subtracted.
func f1(s string) int {
	raw := []byte(s)
	boundaries := bytes.Count(raw, nil)
	return boundaries - 1
}

// f2 returns the number of characters in s using strings.Count. With an empty
// substring Count reports one more than the number of UTF-8 encoded code
// points, so one is subtracted.
func f2(s string) int {
	boundaries := strings.Count(s, "")
	return boundaries - 1
}

// f3 returns the number of characters in s by converting it to a rune slice
// and taking the length of that slice.
func f3(s string) int {
	runes := []rune(s)
	return len(runes)
}

// f4 returns the number of characters in s as reported by
// utf8.RuneCountInString.
func f4(s string) int {
	count := utf8.RuneCountInString(s)
	return count
}

6. 字符串和Byte切片

Gostringrunebyte

runeGoUnicodeUnicodeUTF-8Unicoderune

package main

import (
	"fmt"
)

// main prints the string "Go爱好者" in four forms: quoted, as quoted runes,
// as hexadecimal runes and as hexadecimal bytes.
func main() {
	str := "Go爱好者"
	runes := []rune(str)
	raw := []byte(str)

	// %q prints the value as a double-quoted Go string literal.
	fmt.Printf("The string: %q\n", str)
	// Each rune holds one Unicode code point; with %q every element is shown
	// as a single-quoted character literal.
	fmt.Printf("  => runes(char): %q\n", runes)
	// %x shows the same code points in hexadecimal.
	fmt.Printf("  => runes(hex): %x\n", runes)
	// The byte slice is the UTF-8 encoding of the string, so each non-ASCII
	// character is split into several bytes; "% x" separates them by spaces.
	fmt.Printf("  => bytes(hex): [% x]\n", raw)
}

输出结果：

The string: "Go爱好者"
  => runes(char): ['G' 'o' '爱' '好' '者']
  => runes(hex): [47 6f 7231 597d 8005]
  => bytes(hex): [47 6f e7 88 b1 e5 a5 bd e8 80 85]

UTF-8

这两种表示法展现出来的内容往往会很不一样。比如，对于中文字符“爱”来说，它的 Unicode 码点可以展现为单一的整数 0x7231，而它的 UTF-8 编码值则展现为三个字节，即：e7、88 和 b1。

stringUnicodeUnicoderuneUTF-8UTF-8

stringUTF-8

package main

import "fmt"

// main ranges over "Go爱好者" and prints, for every character, the byte offset
// at which it starts, the character itself and its UTF-8 bytes in hex.
func main() {
	const text = "Go爱好者"
	for offset, r := range text {
		// Converting the rune back to a string re-encodes it as UTF-8.
		encoded := []byte(string(r))
		fmt.Printf("%d: %q [% x]\n", offset, r, encoded)
	}
}

输出结果：

0: 'G' [47]
1: 'o' [6f]
2: '爱' [e7 88 b1]
5: '好' [e5 a5 bd]
8: '者' [e8 80 85]

forUnicodeUnicodeUnicodeUnicodeUTF-8

6.1 字符串和Byte 切换

string[]byteUTF-8

ASCIIUTF-8

string([]byte{'\xe4', '\xbd', '\xa0', '\xe5', '\xa5', '\xbd'}) // 你好

UTF-8

string[]runeUnicode

string([]rune{'\u4F60', '\u597D'}) // 你好

完整示例代码：

package main

import (
	"fmt"
)

// main shows the round trips string <-> []byte and string <-> []rune for the
// text "你好".
func main() {
	src := "你好"
	utf8Bytes := []byte{'\xe4', '\xbd', '\xa0', '\xe5', '\xa5', '\xbd'}
	codePoints := []rune{'\u4F60', '\u597D'}

	// The string: "你好"
	fmt.Printf("The string: %q\n", src)
	// The hex of "你好": e4bda0e5a5bd
	fmt.Printf("The hex of %q: %x\n", src, src)

	// string -> []byte exposes the UTF-8 bytes.
	// The byte slice of "你好": e4 bd a0 e5 a5 bd
	fmt.Printf("The byte slice of %q: % x\n", src, []byte(src))
	// []byte -> string interprets the bytes as UTF-8 again.
	// The string: "你好"
	fmt.Printf("The string: %q\n", string(utf8Bytes))

	// string -> []rune exposes the Unicode code points.
	// The rune slice of "你好": [U+4F60 U+597D]
	fmt.Printf("The rune slice of %q: %U\n", src, []rune(src))
	// []rune -> string encodes the code points as UTF-8.
	// The string: "你好"
	fmt.Printf("The string: %q\n", string(codePoints))
}

6.2 字符串处理标准包

bytesstringsstrconvunicode

stringsbytes[]bytebytes.BufferstrconvunicodeIsDigitIsLetterIsUpperIsLowerruneToUpperToLowerruneUnicodestringsToUpperToLower

basenameUnix shellbasename(s)

fmt.Println(basename("a/b/c.go")) // "c"
fmt.Println(basename("c.d.go"))   // "c.d"
fmt.Println(basename("abc"))      // "abc"

pathpath/filepathURL

path/filepathPOSIX/foo/barMicrosoft Windowsc:\foo\bar

sliceslice

s := "abc"
b := []byte(s)
s2 := string(b)

[]byte(s)bsslicestring(b)s2

bytesstringsstrings

func Contains(s, substr string) bool
func Count(s, sep string) int
func Fields(s string) []string
func HasPrefix(s, prefix string) bool
func Index(s, sep string) int
func Join(a []string, sep string) string

bytes

func Contains(b, subslice []byte) bool
func Count(s, sep []byte) int
func Fields(s []byte) [][]byte
func HasPrefix(s, prefix []byte) bool
func Index(s, sep []byte) int
func Join(s [][]byte, sep []byte) []byte

slice

bytes 包还提供了 Buffer 类型用于字节 slice 的缓存。一个 Buffer 开始是空的，但是随着 string、byte 或 []byte 等类型数据的写入可以动态增长。一个 bytes.Buffer 变量并不需要初始化，因为零值也是有效的：

// intsToString is like fmt.Sprint(values) but adds commas.
// intsToString formats values as a bracketed list whose elements are
// separated by ", ", for example "[1, 2, 3]". An empty or nil slice yields
// "[]".
func intsToString(values []int) string {
	var out bytes.Buffer
	out.WriteByte('[')
	for i := range values {
		// Every element except the first is preceded by a separator.
		if i != 0 {
			out.WriteString(", ")
		}
		fmt.Fprintf(&out, "%d", values[i])
	}
	out.WriteByte(']')
	return out.String()
}
// main prints the formatted form of a three-element slice.
func main() {
	values := []int{1, 2, 3}
	fmt.Println(intsToString(values)) // "[1, 2, 3]"
}

当向 bytes.Buffer 添加任意字符的 UTF8 编码时，最好使用 bytes.Buffer 的 WriteRune 方法；但是 WriteByte 方法对于写入类似 '[' 和 ']' 等 ASCII 字符则会更加有效。

7. 字符串和数字的转换

除了字符串、字符、字节之间的转换，字符串和数值之间的转换也比较常见，这类转换功能由 strconv 包提供。

将一个整数转为字符串，一种方法是用 fmt.Sprintf 返回一个格式化的字符串；另一种方法是用 strconv.Itoa(“整数到ASCII”)：

x := 123
y := fmt.Sprintf("%d", x)
fmt.Println(y, strconv.Itoa(x)) // "123 123"

FormatInt 和 FormatUint 函数可以用不同的进制来格式化数字：

fmt.Println(strconv.FormatInt(int64(x), 2)) // "1111011"

fmt.Printf 函数的 %b、%d、%o 和 %x 等参数提供的功能往往比 strconv 包的 Format 函数方便很多，特别是在需要包含附加信息的时候：

s := fmt.Sprintf("x=%b", x) // "x=1111011"

如果要将一个字符串解析为整数，可以使用 strconv 包的 Atoi 或 ParseInt 函数，还有用于解析无符号整数的 ParseUint 函数：

x, err := strconv.Atoi("123")             // x is an int
y, err := strconv.ParseInt("123", 10, 64) // base 10, up to 64 bits

ParseInt 函数的第三个参数用于指定整型数的大小；例如 16 表示 int16，0 则表示 int。在任何情况下，返回的结果 y 总是 int64 类型，可以通过强制类型转换将它转为更小的整数类型。

有时候也会使用 fmt.Scanf 来解析输入的字符串和数字，特别是当字符串和数字混合在一行的时候，它可以灵活处理不完整或不规则的输入。

参考：
Go 语言圣经