在了解完boltDB如何使用后,我们开始详细分析boltdb的源码,我们从创建实例函数bolt.Open开始,它的源码位于db.go,它的第一个参数是文件名称,第二个参数是权限信息,第三个参数是创建数据库实例的可选参数,具体定义如下:
// Options collects the optional settings a caller may pass when opening a
// database.
type Options struct {
	// Timeout bounds how long to wait for the file lock. A zero value
	// waits indefinitely. Only available on Darwin and Linux.
	Timeout time.Duration

	// NoGrowSync is applied to DB.NoGrowSync before the file is memory
	// mapped.
	NoGrowSync bool

	// ReadOnly opens the database in read-only mode. On UNIX a shared
	// lock is taken with flock(..., LOCK_SH|LOCK_NB).
	ReadOnly bool

	// MmapFlags is applied to DB.MmapFlags before the file is memory
	// mapped.
	MmapFlags int

	// InitialMmapSize is the initial mmap size of the database, in bytes.
	// When it is large enough to hold the database mmap size, read
	// transactions won't block the write transaction (see DB.Begin for
	// more information).
	//
	// A value <= 0 means an initial map size of 0. A value smaller than
	// the previous database size has no effect.
	InitialMmapSize int
}
包括超时时间,mmap的初始大小等参数,返回的是一个db对象:
// DB is a collection of buckets persisted to a single file on disk.
// Every data access goes through a transaction obtained from the DB.
// Functions on DB return ErrDatabaseNotOpen when used before Open() has
// been called.
type DB struct {
	// StrictMode makes the database run a Check() after every commit and
	// panic if it is found in an inconsistent state. It has a large
	// performance impact and is meant for debugging only.
	StrictMode bool

	// NoSync makes the database skip the fsync() call after each commit.
	// That can help when bulk loading data, provided the load can simply
	// be restarted after a system failure or database corruption. Do not
	// set it for normal use.
	//
	// The value is ignored when the package-level IgnoreNoSync constant
	// is true; see the comment on that constant for details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// NoGrowSync skips the truncate call when the database grows. It is
	// only safe on non-ext3/ext4 systems. Skipping truncation avoids
	// preallocating hard drive space and bypasses a truncate() and
	// fsync() syscall on remapping.
	//
	// https://github.com/boltdb/bolt/issues/284
	NoGrowSync bool

	// MmapFlags may be set to syscall.MAP_POPULATE on Linux 2.6.23+ to get
	// sequential read-ahead when the whole database is to be read fast.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Open copies the
	// default from DefaultMaxBatchSize.
	//
	// A value <= 0 disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts. Open
	// copies the default from DefaultMaxBatchDelay.
	//
	// A value <= 0 effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database needs
	// to create new pages. Allocating in such steps amortizes the cost of
	// truncate() and fsync() while the data file grows.
	AllocSize int

	path     string
	file     *os.File
	lockfile *os.File // used on Windows only
	dataref  []byte   // mmap'ed read-only; a write throws SEGV
	data     *[maxMapSize]byte
	datasz   int
	filesz   int // current size of the file on disk

	meta0, meta1 *meta

	pageSize int
	opened   bool
	rwtx     *Tx
	txs      []*Tx
	freelist *freelist
	stats    Stats

	pagePool sync.Pool

	batchMu sync.Mutex
	batch   *batch

	rwlock   sync.Mutex   // admits a single writer at a time
	metalock sync.Mutex   // guards access to the meta pages
	mmaplock sync.RWMutex // guards mmap access while remapping
	statlock sync.RWMutex // guards access to stats

	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// readOnly marks the database as opened in read-only mode. When true,
	// Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}
它的核心数据包含了两个meta信息,一个写事务的引用,多个读事务的引用,以及空闲页列表的引用:
meta0 *meta
meta1 *meta
pageSize int
opened bool
rwtx *Tx
txs []*Tx
freelist *freelist
并且通过批量锁batchMu保护当前的批量操作batch,通过rwlock(互斥锁)保证一次只能有一个写事务,通过metalock保护meta页的访问,通过mmaplock(读写锁)保护重新映射期间对内存映射的访问,通过statlock(读写锁)保证统计信息的并发安全:
batchMu sync.Mutex
batch *batch
rwlock sync.Mutex // Allows only one writer at a time.
metalock sync.Mutex // Protects meta page access.
mmaplock sync.RWMutex // Protects mmap access during remapping.
statlock sync.RWMutex // Protects stats access.
Open函数先创建db对象,然后根据options初始化一系列参数,打开(必要时创建)内存映射对应的文件,加文件锁,读取文件信息。如果文件大小为0(即新建的文件),则通过init函数来进行db文件的初始化;否则读取文件开头的0x1000字节(4KB),验证meta页的合法性。之后通过内存映射的方式把文件和内存关联起来,并根据freelist页信息来重新构建空闲页列表的缓存信息(以下代码为节选):
func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
var db = &DB{opened: true}
if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil {
_ = db.close()
if err := flock(db, mode, !db.readOnly, options.Timeout); err != nil {
_ = db.close()
if info, err := db.file.Stat(); err != nil {
} else if info.Size() == 0 {
// Initialize new files with meta pages.
if err := db.init(); err != nil {
var buf [0x1000]byte
if _, err := db.file.ReadAt(buf[:], 0); err == nil {
m := db.pageInBuffer(buf[:], 0).meta()
if err := m.validate(); err != nil {
db.pageSize = os.Getpagesize()
db.pagePool = sync.Pool{
New: func() interface{} {
return make([]byte, db.pageSize)
},
}
if err := db.mmap(options.InitialMmapSize); err != nil {
_ = db.close()
db.freelist = newFreelist()
db.freelist.read(db.page(db.meta().freelist))
// pageInBuffer retrieves a page reference from a given byte array based on the current page size.
其中把已读入内存buf的数据解释为页面(page)的过程就是纯指针操作:
// pageInBuffer returns a page reference into the byte slice b for page id.
// The byte offset is id multiplied by the database's current page size; the
// address of that byte is reinterpreted as a *page, so no data is copied.
func (db *DB) pageInBuffer(b []byte, id pgid) *page {
	offset := id * pgid(db.pageSize)
	start := unsafe.Pointer(&b[offset])
	return (*page)(start)
}
meta信息的校验主要验证了魔数、版本、以及checksum的有效性:
func (m *meta) validate() error {
if m.magic != magic {
return ErrInvalid
} else if m.version != version {
return ErrVersionMismatch
} else if m.checksum != 0 && m.checksum != m.sum64() {
return ErrChecksum
mmap映射文件后,会把meta0和meta1指向映射内存中的第0页和第1页,并对它们进行校验:
// mmap opens the underlying memory-mapped file and initializes the meta references.
// minsz is the minimum size that the new mmap can be.
func (db *DB) mmap(minsz int) error {
db.mmaplock.Lock()
defer db.mmaplock.Unlock()
info, err := db.file.Stat()
var size = int(info.Size())
size, err = db.mmapSize(size)
// Dereference all mmap references before unmapping.
if db.rwtx != nil {
db.rwtx.root.dereference()
}
// Unmap existing data before continuing.
if err := db.munmap(); err != nil {
return err
}
// Memory-map the data file as a byte slice.
if err := mmap(db, size); err != nil {
return err
}
// Save references to the meta pages.
db.meta0 = db.page(0).meta()
db.meta1 = db.page(1).meta()
err0 := db.meta0.validate()
err1 := db.meta1.validate()
内存映射的时候需要找出最接近的满足大小需求的2的幂次方的内存大小:
// mmapSize determines the appropriate size for the mmap given the current size
// of the database. The minimum size is 32KB and doubles until it reaches 1GB.
// Returns an error if the new mmap size is greater than the max allowed.
func (db *DB) mmapSize(size int) (int, error) {
// Double the size from 32KB until 1GB.
for i := uint(15); i <= 30; i++ {
if size <= 1<<i {
return 1 << i, nil
}
}
munmap的过程是相反的,其中根据pageId获取页信息也是纯内存操作,因为page信息代表了磁盘上的存储结构:
// munmap unmaps the data file from memory.
func (db *DB) munmap() error {
if err := munmap(db); err != nil {
// page returns a page reference for page id, located inside db.data at byte
// offset id multiplied by the current page size. The address of that byte is
// reinterpreted as a *page, so no data is copied.
func (db *DB) page(id pgid) *page {
	return (*page)(unsafe.Pointer(&db.data[id*pgid(db.pageSize)]))
}
其中文件锁的逻辑定义在bolt_unix.go,本质上还是系统调用
func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error {
for {
} else if timeout > 0 && time.Since(t) > timeout {
return ErrTimeout
flag := syscall.LOCK_SH
if exclusive {
flag = syscall.LOCK_EX
}
err := syscall.Flock(int(db.file.Fd()), flag|syscall.LOCK_NB)
if err == nil {
return nil
} else if err != syscall.EWOULDBLOCK {
return err
}
time.Sleep(50 * time.Millisecond)
内存映射和解除映射同样是系统函数调用:
func munmap(db *DB) error {
// Ignore the unmap if we have no mapped data.
if db.dataref == nil {
return nil
}
// Unmap using the original byte slice.
err := syscall.Munmap(db.dataref)
// mmap memory maps a DB's data file.
func mmap(db *DB, sz int) error {
b, err := syscall.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags)
if err := madvise(b, syscall.MADV_RANDOM); err != nil {
func madvise(b []byte, advice int) (err error) {
_, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), uintptr(len(b)), uintptr(advice))
获取meta信息的过程也是纯指针操作,函数定义在page.go
// meta returns a pointer to the meta section of the page, obtained by
// reinterpreting the address of the page's ptr field as a *meta.
func (p *page) meta() *meta {
	body := unsafe.Pointer(&p.ptr)
	return (*meta)(body)
}