经过上一回的努力,我们终于将 GPU 计算的时间缩减到同 CPU 一个数量级,但是发现内存和显存之间的 memcpy 成了最主要的性能损耗。

一、固定内存

cudaMallocHost
const size_t NTB = 256;
const size_t EXT = 8;
#define divCeil(a, b) (((a) + (b) - 1) / (b))

struct Ctx {
    float *x, *y;
    float *xd, *yd, *rd;
    size_t n;
};

extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {
    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));
    ctx->n = n;
    size_t sz = sizeof(float) * n;
    cudaMallocHost(&(ctx->x), sz);
    cudaMallocHost(&(ctx->y), sz);
    cudaMalloc(&(ctx->xd), sz);
    cudaMalloc(&(ctx->yd), sz);
    cudaMallocManaged(&(ctx->rd), sizeof(float) * divCeil(n, NTB) / EXT);
    *p = ctx;
}

extern "C" __declspec(dllexport) void getInputs(Ctx *ctx, float **px, float **py) {
    *px = ctx->x;
    *py = ctx->y;
}

extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {
    cudaFreeHost(ctx->x);
    cudaFreeHost(ctx->y);
    cudaFree(ctx->xd);
    cudaFree(ctx->yd);
    cudaFree(ctx->rd);
    free(ctx);
}

__global__ void devDot(float *x, float *y, size_t n, float *r) {
    __shared__ float rb[NTB];
    size_t itb = threadIdx.x;
    size_t i = blockIdx.x * blockDim.x * EXT + itb;
    float s = 0.0;
    for (size_t j = 0; j < EXT && i < n; j++, i += blockDim.x) {
        s += x[i] * y[i];
    }

    rb[itb] = s;
    __syncthreads();
    for (size_t i = NTB >> 1; i != 0; i >>= 1) {
        if (itb < i) rb[itb] += rb[itb + i];
        __syncthreads();
    }
    if (0 == itb) r[blockIdx.x] = rb[0];
}

extern "C" __declspec(dllexport) void dot(Ctx *ctx, float *r) {
    size_t sz = sizeof(float) * ctx->n;
    cudaMemcpy(ctx->xd, ctx->x, sz, cudaMemcpyHostToDevice);
    cudaMemcpy(ctx->yd, ctx->y, sz, cudaMemcpyHostToDevice);
    size_t nb = divCeil(ctx->n, NTB) / EXT;
    float *rd = ctx->rd;
    devDot<<<nb, NTB>>>(ctx->xd, ctx->yd, ctx->n, rd);
    cudaDeviceSynchronize();
    float s = 0.0;
    for (size_t i = 0; i < nb; i++) s += rd[i];
    *r = s;
}
package main

import (
    "math/rand"
    "reflect"
    "syscall"
    "time"
    "unsafe"
)

const N = 1 << 20

type Lib struct {
    dll        *syscall.DLL
    deinitProc *syscall.Proc
    dotProc    *syscall.Proc
    handler    uintptr
    X, Y       []float32
}

func LoadLib() (*Lib, error) {
    l := &Lib{}
    var err error
    defer func() {
        if nil != err {
            l.Release()
        }
    }()

    if l.dll, err = syscall.LoadDLL("cuda.dll"); nil != err {
        return nil, err
    }
    if l.deinitProc, err = l.dll.FindProc("deinit"); nil != err {
        return nil, err
    }
    if l.dotProc, err = l.dll.FindProc("dot"); nil != err {
        return nil, err
    }

    proc, err := l.dll.FindProc("init")
    if nil != err {
        return nil, err
    }
    proc.Call(uintptr(unsafe.Pointer(&l.handler)), uintptr(N))

    proc, err = l.dll.FindProc("getInputs")
    if nil != err {
        return nil, err
    }
    xh := (*reflect.SliceHeader)(unsafe.Pointer(&l.X))
    yh := (*reflect.SliceHeader)(unsafe.Pointer(&l.Y))
    xh.Len, xh.Cap, yh.Len, yh.Cap = N, N, N, N
    proc.Call(l.handler,
        uintptr(unsafe.Pointer(&xh.Data)), uintptr(unsafe.Pointer(&yh.Data)))

    return l, nil
}

func (l *Lib) Release() {
    if nil != l.deinitProc && 0 != l.handler {
        l.deinitProc.Call(l.handler)
    }
    if nil != l.dll {
        l.dll.Release()
    }
}

func (l *Lib) Dot() float32 {
    var r float32
    l.dotProc.Call(l.handler, uintptr(unsafe.Pointer(&r)))
    return r
}

func main() {
    lib, err := LoadLib()
    if nil != err {
        println(err.Error())
        return
    }
    defer lib.Release()

    rand.Seed(time.Now().Unix())
    x, y := lib.X, lib.Y
    for i := 0; i < N; i++ {
        x[i], y[i] = rand.Float32(), rand.Float32()
    }

    t := time.Now()
    var r float32
    for i := 0; i < 100; i++ {
        r = 0
        for i := 0; i < N; i++ {
            r += x[i] * y[i]
        }
    }
    println(time.Now().Sub(t).Microseconds())
    println(r)

    t = time.Now()
    for i := 0; i < 100; i++ {
        r = lib.Dot()
    }
    println(time.Now().Sub(t).Microseconds())
    println(r)
}
cudaMemcpy

二、内存映射

cudaHostAlloc

c 端有改动的代码如下:

struct Ctx {
    float *x, *y, *r;
    size_t n;
};

extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {
    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));
    ctx->n = n;
    size_t sz = sizeof(float) * n;
    cudaHostAlloc(&(ctx->x), sz, cudaHostAllocMapped);
    cudaHostAlloc(&(ctx->y), sz, cudaHostAllocMapped);
    cudaMallocManaged(&(ctx->r), sizeof(float) * divCeil(n, NTB) / EXT);
    *p = ctx;
}

extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {
    cudaFreeHost(ctx->x);
    cudaFreeHost(ctx->y);
    cudaFree(ctx->r);
    free(ctx);
}

extern "C" __declspec(dllexport) void dot(Ctx *ctx, float *r) {
    size_t nb = divCeil(ctx->n, NTB) / EXT;
    float *rd = ctx->r;
    devDot<<<nb, NTB>>>(ctx->x, ctx->y, ctx->n, rd);
    cudaDeviceSynchronize();
    float s = 0.0;
    for (size_t i = 0; i < nb; i++) s += rd[i];
    *r = s;
}
cudaMemcpydevDot

三、统一寻址

cudaMallocManaged

c 端有改动的代码如下:

extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {
    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));
    ctx->n = n;
    size_t sz = sizeof(float) * n;
    cudaMallocManaged(&(ctx->x), sz);
    cudaMallocManaged(&(ctx->y), sz);
    cudaMallocManaged(&(ctx->r), sizeof(float) * divCeil(n, NTB) / EXT);
    *p = ctx;
}

extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {
    cudaFree(ctx->x);
    cudaFree(ctx->y);
    cudaFree(ctx->r);
    free(ctx);
}
devDotcudaMemcpycudaLaunchKernel