实验:对比 100000 个 goroutines、CPU 核数个 goroutines 和单线程三种方式完成同样次数原子加法的耗时

package main

import (
    "fmt"
    "runtime"
    "sync"
    "sync/atomic"
    "time"
)

// addConcurrent starts num goroutines, each of which atomically increments a
// shared counter exactly once, waits until all of them have finished, and
// prints the elapsed wall-clock time to standard output.
//
// When num is zero or negative no goroutine is started and the elapsed time
// is printed right away.
func addConcurrent(num int) {
    var c int32
    var wg sync.WaitGroup

    start := time.Now()

    // WaitGroup.Add panics on a negative delta, so only register real work.
    if num > 0 {
        wg.Add(num)
    }
    for i := 0; i < num; i++ {
        go func() {
            defer wg.Done()
            atomic.AddInt32(&c, 1)
        }()
    }

    // Block until every increment has happened. Waiting on the WaitGroup
    // replaces a busy loop that read c without synchronization: that plain
    // read raced with the atomic writes and burned a core while spinning.
    wg.Wait()
    fmt.Println(time.Since(start))
}

// addCPUNum performs num atomic increments of a shared counter using one
// goroutine per logical CPU (runtime.NumCPU), waits for all of them, and
// prints the elapsed wall-clock time to standard output.
//
// The increments are split as evenly as possible: every worker gets
// num/core of them and the first num%core workers get one more, so exactly
// num increments are executed even when num is not a multiple of the CPU
// count. A zero or negative num results in no increments.
func addCPUNum(num int) {
    var c int32
    var wg sync.WaitGroup
    core := runtime.NumCPU()
    start := time.Now()

    share, extra := num/core, num%core
    wg.Add(core)
    for i := 0; i < core; i++ {
        n := share
        if i < extra {
            // Hand out the remainder one increment at a time so the total
            // is not truncated by the integer division.
            n++
        }
        go func(n int) {
            defer wg.Done()
            for j := 0; j < n; j++ {
                atomic.AddInt32(&c, 1)
            }
        }(n)
    }
    wg.Wait()
    fmt.Println(time.Since(start))
}

// addOneThread performs num atomic increments of a local counter on the
// calling goroutine and prints the elapsed wall-clock time to standard
// output. A zero or negative num performs no increments.
func addOneThread(num int) {
    var counter int32
    begin := time.Now()

    // Count down instead of up; the number of iterations is the same.
    for left := num; left > 0; left-- {
        atomic.AddInt32(&counter, 1)
    }

    elapsed := time.Since(begin)
    fmt.Println(elapsed)
}

// main runs the three counting strategies in order, each with the same
// number of increments: one goroutine per increment, one goroutine per CPU,
// and a single goroutine. Each strategy prints its own elapsed time.
func main() {
    const total = 100000

    strategies := []func(int){addConcurrent, addCPUNum, addOneThread}
    for _, run := range strategies {
        run(total)
    }
}

运行结果:

GOROOT=/usr/local/opt/go/libexec #gosetup
GOPATH=/Users/mar/go #gosetup
/usr/local/opt/go/libexec/bin/go build -o /private/var/folders/4b/65x09q517lj_6byhlcjbk4_m0000gn/T/GoLand/___go_build_main_go /Users/mar/Work/go/demo/main.go #gosetup
/private/var/folders/4b/65x09q517lj_6byhlcjbk4_m0000gn/T/GoLand/___go_build_main_go
32.04965ms
2.13698ms
616.344µs

1. 100000 个 goroutines 的耗时最长,主要原因是创建和调度 goroutine(包括上下文切换)有额外的延迟代价。显然用 100000 个 goroutines 处理这种 cpu-bound 的工作很不利。io-bound 的任务可以在 io wait 的时候切换到别的线程去做其他事情,但是 cpu-bound 的任务会一直占用 CPU 处理 work,这时线程切换只会损害性能。
2. CPU 核数个 goroutines 的耗时也比单线程长,主要原因是 cache 行争用:每个 core 都在原子地修改同一个变量 c,c 所在的 cache 行会在各个 core 的缓存之间反复失效和同步(cache 和主存之间的数据往返),频繁操作 c 会导致内存抖动。地址相邻的不同变量落在同一个 cache 行时也会出现同样的问题,即 false sharing(cache 伪共享)。在 golang 程序中需要避免这类由 cache 行共享导致的内存抖动,尽量避免多个线程频繁操作同一个变量或者地址相邻的变量。