在了解完boltDB如何使用后,我们开始详细分析boltdb的源码,我们从创建实例函数bolt.Open开始,它的源码位于db.go,它的第一个参数是文件名称,第二个参数是权限信息,第三个参数是创建数据库实例的可选参数,具体定义如下:

// Options represents the options that can be set when opening a database.
// It is passed as the third argument to Open.
type Options struct {
  // Timeout is the amount of time to wait to obtain a file lock.
  // When set to zero it will wait indefinitely. This option is only
  // available on Darwin and Linux.
  Timeout time.Duration


  // NoGrowSync sets the DB.NoGrowSync flag before memory mapping the file.
  NoGrowSync bool


  // ReadOnly opens the database in read-only mode. Uses
  // flock(..., LOCK_SH|LOCK_NB) to grab a shared lock (UNIX).
  ReadOnly bool


  // MmapFlags sets the DB.MmapFlags flag before memory mapping the file.
  MmapFlags int


  // InitialMmapSize is the initial mmap size of the database
  // in bytes. Read transactions won't block write transaction
  // if the InitialMmapSize is large enough to hold database mmap
  // size. (See DB.Begin for more information)
  //
  // If <=0, the initial map size is 0.
  // If InitialMmapSize is smaller than the previous database size,
  // it takes no effect.
  InitialMmapSize int
}

其中包括获取文件锁的超时时间、mmap的初始大小等参数。Open返回的是一个DB对象:

// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return an ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
  // When enabled, the database will perform a Check() after every commit.
  // A panic is issued if the database is in an inconsistent state. This
  // flag has a large performance impact so it should only be used for
  // debugging purposes.
  StrictMode bool


  // Setting the NoSync flag will cause the database to skip fsync()
  // calls after each commit. This can be useful when bulk loading data
  // into a database and you can restart the bulk load in the event of
  // a system failure or database corruption. Do not set this flag for
  // normal use.
  //
  // If the package global IgnoreNoSync constant is true, this value is
  // ignored.  See the comment on that constant for more details.
  //
  // THIS IS UNSAFE. PLEASE USE WITH CAUTION.
  NoSync bool


  // When true, skips the truncate call when growing the database.
  // Setting this to true is only safe on non-ext3/ext4 systems.
  // Skipping truncation avoids preallocation of hard drive space and
  // bypasses a truncate() and fsync() syscall on remapping.
  //
  // https://github.com/boltdb/bolt/issues/284
  NoGrowSync bool


  // If you want to read the entire database fast, you can set MmapFlags to
  // syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
  // The value is OR'ed into the flags passed to the mmap system call.
  MmapFlags int


  // MaxBatchSize is the maximum size of a batch. Default value is
  // copied from DefaultMaxBatchSize in Open.
  //
  // If <=0, disables batching.
  //
  // Do not change concurrently with calls to Batch.
  MaxBatchSize int


  // MaxBatchDelay is the maximum delay before a batch starts.
  // Default value is copied from DefaultMaxBatchDelay in Open.
  //
  // If <=0, effectively disables batching.
  //
  // Do not change concurrently with calls to Batch.
  MaxBatchDelay time.Duration


  // AllocSize is the amount of space allocated when the database
  // needs to create new pages. This is done to amortize the cost
  // of truncate() and fsync() when growing the data file.
  AllocSize int


  path     string // path handed to os.OpenFile when the data file is opened
  file     *os.File // handle of the data file; locked, stat'ed and mmap'ed through its fd
  lockfile *os.File // windows only
  dataref  []byte   // mmap'ed readonly, write throws SEGV
  data     *[maxMapSize]byte // array view of the mapping; page() indexes into it by byte offset
  datasz   int // NOTE(review): presumably the size of the current mapping — confirm
  filesz   int // current on disk file size
  meta0    *meta // meta reference taken from page 0 after mapping
  meta1    *meta // meta reference taken from page 1 after mapping
  pageSize int // page size in bytes; set from os.Getpagesize() in Open
  opened   bool // set to true when Open constructs the DB
  rwtx     *Tx // the current write transaction, if any; dereferenced before remapping
  txs      []*Tx // NOTE(review): presumably the open read transactions — confirm
  freelist *freelist // free page list, read from the page named by the meta's freelist field
  stats    Stats // statistics; access is guarded by statlock


  // pagePool hands out byte slices of length pageSize (see its New func in Open).
  pagePool sync.Pool


  // NOTE(review): batchMu presumably guards batch — confirm against Batch().
  batchMu sync.Mutex
  batch   *batch


  rwlock   sync.Mutex   // Allows only one writer at a time.
  metalock sync.Mutex   // Protects meta page access.
  mmaplock sync.RWMutex // Protects mmap access during remapping.
  statlock sync.RWMutex // Protects stats access.


  // ops holds replaceable I/O operations.
  // NOTE(review): presumably an indirection so writes can be stubbed in tests — confirm.
  ops struct {
    writeAt func(b []byte, off int64) (n int, err error)
  }


  // Read only mode.
  // When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
  readOnly bool
}

它的核心数据包含了两个meta信息的引用,一个写事务的引用,多个读事务的引用,以及空闲页列表的引用:

meta0    *meta
  meta1    *meta
  pageSize int
  opened   bool
  rwtx     *Tx
  txs      []*Tx
  freelist *freelist

并且通过批量锁batchMu保证一次只能执行一个批量操作,通过互斥锁rwlock保证一次只能有一个写事务,通过metalock保证一次只能有一个事务访问meta信息,通过读写锁mmaplock保证重新映射期间内存映射访问的安全性,通过读写锁statlock保证统计信息的并发安全:

  batchMu sync.Mutex
  batch   *batch

  rwlock   sync.Mutex   // Allows only one writer at a time.
  metalock sync.Mutex   // Protects meta page access.
  mmaplock sync.RWMutex // Protects mmap access during remapping.
  statlock sync.RWMutex // Protects stats access.

Open函数先创建db对象,然后根据option初始化一系列参数,打开(必要时创建)内存映射对应的文件,加文件锁,读取文件信息。如果文件是新创建的(大小为0),则通过init函数来进行db文件的初始化,函数内部会通过内存映射方式关联文件和内存模块;否则读取文件开头的4KB(0x1000字节),验证meta页的合法性。最后根据freelist页信息来重新构建空闲页列表的缓存信息:

func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
  var db = &DB{opened: true}
          if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil {
    _ = db.close()
          if err := flock(db, mode, !db.readOnly, options.Timeout); err != nil {
    _ = db.close()
        if info, err := db.file.Stat(); err != nil {
          } else if info.Size() == 0 {
    // Initialize new files with meta pages.
    if err := db.init(); err != nil {
        var buf [0x1000]byte
    if _, err := db.file.ReadAt(buf[:], 0); err == nil {
      m := db.pageInBuffer(buf[:], 0).meta()
      if err := m.validate(); err != nil {
        db.pageSize = os.Getpagesize()
          db.pagePool = sync.Pool{
    New: func() interface{} {
      return make([]byte, db.pageSize)
    },
  }
        if err := db.mmap(options.InitialMmapSize); err != nil {
    _ = db.close()
          db.freelist = newFreelist()
  db.freelist.read(db.page(db.meta().freelist))
      // pageInBuffer retrieves a page reference from a given byte array based on the current page size.

其中将页面读入内存buf后,从buf中取得页面引用的过程就是纯指针操作:

// pageInBuffer returns a *page that points into b at the byte offset of page
// id, i.e. id multiplied by the current page size. Nothing is copied: the
// returned page aliases b's memory. Indexing past the end of b panics.
func (db *DB) pageInBuffer(b []byte, id pgid) *page {
  offset := id * pgid(db.pageSize)
  start := &b[offset]
  return (*page)(unsafe.Pointer(start))
}

meta信息的校验主要验证了魔数、版本、以及checksum的有效性:

func (m *meta) validate() error {
  if m.magic != magic {
    return ErrInvalid
  } else if m.version != version {
    return ErrVersionMismatch
  } else if m.checksum != 0 && m.checksum != m.sum64() {
    return ErrChecksum

mmap映射文件后,会把映射区域中的第0页和第1页保存为meta0和meta1的引用,并对它们进行校验:

      // mmap opens the underlying memory-mapped file and initializes the meta references.
// minsz is the minimum size that the new mmap can be.
func (db *DB) mmap(minsz int) error {
  db.mmaplock.Lock()
  defer db.mmaplock.Unlock()
        info, err := db.file.Stat()
        var size = int(info.Size())
        size, err = db.mmapSize(size)
          // Dereference all mmap references before unmapping.
  if db.rwtx != nil {
    db.rwtx.root.dereference()
  }
          // Unmap existing data before continuing.
  if err := db.munmap(); err != nil {
    return err
  }
          // Memory-map the data file as a byte slice.
  if err := mmap(db, size); err != nil {
    return err
  }
          // Save references to the meta pages.
  db.meta0 = db.page(0).meta()
  db.meta1 = db.page(1).meta()
          err0 := db.meta0.validate()
  err1 := db.meta1.validate()

内存映射的时候需要找出最接近的满足大小需求的2的幂次方的内存大小:

    // mmapSize determines the appropriate size for the mmap given the current size
// of the database. The minimum size is 32KB and doubles until it reaches 1GB.
// Returns an error if the new mmap size is greater than the max allowed.
func (db *DB) mmapSize(size int) (int, error) {
          // Double the size from 32KB until 1GB.
  for i := uint(15); i <= 30; i++ {
    if size <= 1<<i {
      return 1 << i, nil
    }
  }

munmap的过程是相反的,其中根据pageId获取页信息也是纯内存操作,因为page信息代表了磁盘上的存储结构:

    // munmap unmaps the data file from memory.
func (db *DB) munmap() error {
  if err := munmap(db); err != nil {
// page returns a *page that points into the mapped data at the byte offset of
// page id, i.e. id multiplied by the current page size. Nothing is copied:
// the result aliases db.data.
func (db *DB) page(id pgid) *page {
  return (*page)(unsafe.Pointer(&db.data[id*pgid(db.pageSize)]))
}

其中文件锁的逻辑定义在bolt_unix.go,本质上还是系统调用

func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error {
        for {
          } else if timeout > 0 && time.Since(t) > timeout {
      return ErrTimeout
          flag := syscall.LOCK_SH
    if exclusive {
      flag = syscall.LOCK_EX
    }
          err := syscall.Flock(int(db.file.Fd()), flag|syscall.LOCK_NB)
    if err == nil {
      return nil
    } else if err != syscall.EWOULDBLOCK {
      return err
    }
          time.Sleep(50 * time.Millisecond)

内存映射和解除映射同样是系统函数调用:

func munmap(db *DB) error {
  // Ignore the unmap if we have no mapped data.
  if db.dataref == nil {
    return nil
  }


  // Unmap using the original byte slice.
  err := syscall.Munmap(db.dataref)  
// mmap memory maps a DB's data file.
func mmap(db *DB, sz int) error {
        b, err := syscall.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags)
        if err := madvise(b, syscall.MADV_RANDOM); err != nil {
func madvise(b []byte, advice int) (err error) {
  _, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), uintptr(len(b)), uintptr(advice))

获取meta信息的过程也是纯指针操作,函数定义在page.go

// meta reinterprets the memory starting at the page's ptr field as a meta
// structure and returns a pointer to it. Nothing is copied: the result
// aliases the page's memory.
func (p *page) meta() *meta {
  body := unsafe.Pointer(&p.ptr)
  return (*meta)(body)
}