
Reading the Golang Map Source Code

Source: https://jiajunhuang.com/articles/2017_07_27-golang_map.md.html

We know that hash tables generally resolve collisions either with open addressing or with separate chaining. Go's map belongs to the chaining camp, but the implementation is quite different from the textbook version. That makes it an interesting read, much like Python's deque, which is also implemented very differently from the textbook description, and very efficiently.

In the textbook version, entries whose hashes collide are strung together into a linked list: you first locate the right slot from the hash, then walk the list. The drawback is that you never know in advance how long that list will be.

Go's implementation sidesteps that problem by using buckets: each slot is followed by a fixed-length run of key/value pairs, as shown in the sketch below.
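
For contrast, a textbook chained table might look roughly like this. The sketch is purely illustrative; the names chainedMap and entry are mine, not the runtime's.

// entry is one node in a textbook-style collision chain.
type entry struct {
    key, value string
    next       *entry // unbounded list of keys that hashed to the same slot
}

// chainedMap resolves collisions by hanging a linked list off every slot.
type chainedMap struct {
    slots []*entry
    hash  func(string) uint32
}

func (m *chainedMap) get(key string) (string, bool) {
    // Locate the slot, then walk a list whose length is unpredictable.
    for e := m.slots[m.hash(key)%uint32(len(m.slots))]; e != nil; e = e.next {
        if e.key == key {
            return e.value, true
        }
    }
    return "", false
}

With that in mind, here are the core structures Go actually uses: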

// A header for a Go map.
type hmap struct {
    // Note: the format of the Hmap is encoded in ../../cmd/internal/gc/reflect.go and
    // ../reflect/type.go. Don't change this structure without also changing that code!
    count     int // # live cells == size of map.  Must be first (used by len() builtin)
    flags     uint8
    B         uint8  // log_2 of # of buckets (can hold up to loadFactor * 2^B items)
    noverflow uint16 // approximate number of overflow buckets; see incrnoverflow for details
    hash0     uint32 // hash seed

    buckets    unsafe.Pointer // array of 2^B Buckets. may be nil if count==0.
    oldbuckets unsafe.Pointer // previous bucket array of half the size, non-nil only when growing
    nevacuate  uintptr        // progress counter for evacuation (buckets less than this have been evacuated)

    extra *mapextra // optional fields
}

// mapextra holds fields that are not present on all maps.
type mapextra struct {
    // If both key and value do not contain pointers and are inline, then we mark bucket
    // type as containing no pointers. This avoids scanning such maps.
    // However, bmap.overflow is a pointer. In order to keep overflow buckets
    // alive, we store pointers to all overflow buckets in hmap.overflow.
    // Overflow is used only if key and value do not contain pointers.
    // overflow[0] contains overflow buckets for hmap.buckets.
    // overflow[1] contains overflow buckets for hmap.oldbuckets.
    // The indirection allows to store a pointer to the slice in hiter.
    overflow [2]*[]*bmap

    // nextOverflow holds a pointer to a free overflow bucket.
    nextOverflow *bmap
}

// A bucket for a Go map.
type bmap struct {
    // tophash generally contains the top byte of the hash value
    // for each key in this bucket. If tophash[0] < minTopHash,
    // tophash[0] is a bucket evacuation state instead.
    tophash [bucketCnt]uint8
    // Followed by bucketCnt keys and then bucketCnt values.
    // NOTE: packing all the keys together and then all the values together makes the
    // code a bit more complicated than alternating key/value/key/value/... but it allows
    // us to eliminate padding which would be needed for, e.g., map[int64]int8.
    // Followed by an overflow pointer.
}

Reading this far I have to complain a little: although Go's standard library is very thoughtfully designed, a lot of the names in it are far too terse to remember easily. Probably only the people who wrote them can parse them at a glance. Take for example B uint8 // log_2 of # of buckets (can hold up to loadFactor * 2^B items). If it were me, I would pick a slightly longer, more memorable name.

buckets is a pointer to an array of bmap buckets. Each bucket stores its tophash bytes first, then bucketCnt (8) keys, then 8 values, and finally an overflow pointer. So what happens when more than 8 keys land in the same bucket? That overflow pointer links to another bmap, so heavily colliding buckets chain together much like a linked list. (The [2]*[]*bmap slices in mapextra do not hold the chain themselves; as the comment above explains, they only keep overflow buckets alive for the GC when keys and values contain no pointers.)
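
To make that layout concrete, here is a heavily simplified sketch of a bucket chain and how it gets searched. It ignores the unsafe pointer arithmetic and incremental growth; the names are illustrative, and only bucketCnt (8) matches the runtime.

const bucketCnt = 8 // each bucket holds up to 8 key/value pairs

// simpleBucket mirrors the idea behind bmap: 8 tophash bytes, 8 keys,
// 8 values, and a pointer to an overflow bucket once all slots are taken.
type simpleBucket struct {
    tophash  [bucketCnt]uint8
    keys     [bucketCnt]string
    values   [bucketCnt]int
    overflow *simpleBucket
}

// lookup walks one bucket and its overflow chain, comparing the cheap
// tophash byte first and only then the full key.
func lookup(b *simpleBucket, key string, top uint8) (int, bool) {
    for ; b != nil; b = b.overflow {
        for i := 0; i < bucketCnt; i++ {
            if b.tophash[i] != top {
                continue // fast reject without touching the key
            }
            if b.keys[i] == key {
                return b.values[i], true
            }
        }
    }
    return 0, false
}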

Next, let's look at how lookups work. One of Go's gotchas is that indexing a map always returns something whether the key is present or not; by default you get the zero object of the value type. So you have to use the a, ok := m["hello"] form and then check ok. Annoying.
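
As a quick reminder of what that looks like at the language level:

package main

import "fmt"

func main() {
    m := map[string]int{"hello": 1}

    // A missing key silently yields the zero value (0 for int)...
    fmt.Println(m["world"]) // 0

    // ...so the comma-ok form is needed to tell "absent" from "stored zero".
    v, ok := m["world"]
    fmt.Println(v, ok) // 0 false
}

The single-result lookup is what mapaccess1 below implements: it never returns nil, only a pointer to the value or to a shared zero object.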

// mapaccess1 returns a pointer to h[key].  Never returns nil, instead
// it will return a reference to the zero object for the value type if
// the key is not in the map.
// NOTE: The returned pointer may keep the whole map live, so don't
// hold onto it for very long.
func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
    if raceenabled && h != nil {
        callerpc := getcallerpc(unsafe.Pointer(&t))
        pc := funcPC(mapaccess1)
        racereadpc(unsafe.Pointer(h), callerpc, pc)
        raceReadObjectPC(t.key, key, callerpc, pc)
    }
    if msanenabled && h != nil {
        msanread(key, t.key.size)
    }
    // If the map is nil or empty, return the zero object right away
    if h == nil || h.count == 0 {
        return unsafe.Pointer(&zeroVal[0])
    }
    // So the flags field in hmap is used to mark read/write state
    if h.flags&hashWriting != 0 {
        throw("concurrent map read and map write")
    }
    // t is the map's type descriptor. Go is compiled, so the key type and its
    // hash algorithm are fixed at compile time and can be used directly here
    alg := t.key.alg
    hash := alg.hash(key, uintptr(h.hash0))
    m := uintptr(1)<<h.B - 1
    b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
    if c := h.oldbuckets; c != nil {
        if !h.sameSizeGrow() {
            // There used to be half as many buckets; mask down one more power of two.
            m >>= 1
        }
        oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize)))
        if !evacuated(oldb) {
            b = oldb
        }
    }
    // Take the top byte of the hash as a cheap per-slot filter; the bucket
    // itself was already selected above by masking the low B bits of the hash
    top := uint8(hash >> (sys.PtrSize*8 - 8))
    if top < minTopHash {
        top += minTopHash
    }
    // Walk the bucket and its overflow chain looking for the key
    for {
        for i := uintptr(0); i < bucketCnt; i++ {
            if b.tophash[i] != top {
                continue
            }
            k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
            if t.indirectkey {
                k = *((*unsafe.Pointer)(k))
            }
            if alg.equal(key, k) {
                v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize))
                if t.indirectvalue {
                    v = *((*unsafe.Pointer)(v))
                }
                return v
            }
        }
        b = b.overflow(t)
        if b == nil {
            return unsafe.Pointer(&zeroVal[0])
        }
    }
}
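
Two lines above are worth unpacking: the bucket index comes from masking the low B bits of the hash (m = 2^B - 1, equivalent to hash mod 2^B), while top is the hash's high byte, later bumped past minTopHash because small values are reserved as evacuation markers. A rough illustration with made-up numbers:

package main

import "fmt"

func main() {
    var B uint8 = 5                      // say the map currently has 2^5 = 32 buckets
    var hash uint64 = 0xA1B2C3D4E5F60718 // pretend this came from alg.hash

    mask := uint64(1)<<B - 1       // 0x1F: keep only the low B bits
    bucket := hash & mask          // 0x18 & 0x1F = 24: which bucket to search
    top := uint8(hash >> (64 - 8)) // 0xA1: high byte, the cheap per-slot filter

    fmt.Printf("bucket=%d top=%#x\n", bucket, top) // bucket=24 top=0xa1
}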

There is also a mapaccess2, which is the same lookup but additionally returns a bool reporting whether the key was found; that is what backs the a, ok := form above. mapaccessK, per its comment, is used by the iterator. Next up is assignment, which is really a lookup first: overwrite the value if the key is found, allocate a new slot if not.
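
From the caller's point of view that is simply:

package main

import "fmt"

func main() {
    m := map[string]int{}
    m["a"] = 1          // key absent: a new slot is allocated and filled
    m["a"] = 2          // key present: the existing slot is overwritten
    fmt.Println(len(m)) // 1
}

The runtime side of that is mapassign: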

// Like mapaccess, but allocates a slot for the key if it is not present in the map.
func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
    if h == nil {
        panic(plainError("assignment to entry in nil map"))
    }
    if raceenabled {
        callerpc := getcallerpc(unsafe.Pointer(&t))
        pc := funcPC(mapassign)
        racewritepc(unsafe.Pointer(h), callerpc, pc)
        raceReadObjectPC(t.key, key, callerpc, pc)
    }
    if msanenabled {
        msanread(key, t.key.size)
    }
    if h.flags&hashWriting != 0 {
        throw("concurrent map writes")
    }
    alg := t.key.alg
    hash := alg.hash(key, uintptr(h.hash0))

    // Set hashWriting after calling alg.hash, since alg.hash may panic,
    // in which case we have not actually done a write.
    h.flags |= hashWriting

    if h.buckets == nil {
        h.buckets = newarray(t.bucket, 1)
    }

again:
    bucket := hash & (uintptr(1)<<h.B - 1)
    if h.growing() {
        growWork(t, h, bucket)
    }
    b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
    top := uint8(hash >> (sys.PtrSize*8 - 8))
    if top < minTopHash {
        top += minTopHash
    }

    var inserti *uint8
    var insertk unsafe.Pointer
    var val unsafe.Pointer
    for {
        for i := uintptr(0); i < bucketCnt; i++ {
            if b.tophash[i] != top {
                if b.tophash[i] == empty && inserti == nil {
                    inserti = &b.tophash[i]
                    insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
                    val = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize))
                }
                continue
            }
            k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
            if t.indirectkey {
                k = *((*unsafe.Pointer)(k))
            }
            if !alg.equal(key, k) {
                continue
            }
            // Found it:
            // already have a mapping for key. Update it.
            if t.needkeyupdate {
                typedmemmove(t.key, k, key)
            }
            val = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize))
            goto done
        }
        ovf := b.overflow(t)
        if ovf == nil {
            break
        }
        b = ovf
    }

    // Walked everything without finding the key, so insert a new entry;
    // inserti/insertk above remember the first free slot we saw, if any
    // Did not find mapping for key. Allocate new cell & add entry.

    // If we hit the max load factor or we have too many overflow buckets,
    // and we're not already in the middle of growing, start growing.
    if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
        hashGrow(t, h)
        goto again // Growing the table invalidates everything, so try again
    }

    if inserti == nil {
        // all current buckets are full, allocate a new one.
        newb := h.newoverflow(t, b)
        inserti = &newb.tophash[0]
        insertk = add(unsafe.Pointer(newb), dataOffset)
        val = add(insertk, bucketCnt*uintptr(t.keysize))
    }

    // store new key/value at insert position
    if t.indirectkey {
        kmem := newobject(t.key)
        *(*unsafe.Pointer)(insertk) = kmem
        insertk = kmem
    }
    if t.indirectvalue {
        vmem := newobject(t.elem)
        *(*unsafe.Pointer)(val) = vmem
    }
    typedmemmove(t.key, insertk, key)
    *inserti = top
    h.count++

done:
    if h.flags&hashWriting == 0 {
        throw("concurrent map writes")
    }
    h.flags &^= hashWriting
    if t.indirectvalue {
        val = *((*unsafe.Pointer)(val))
    }
    return val
}
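
One detail from the code above deserves a highlight: before chaining yet another overflow bucket, mapassign checks whether the whole table should grow instead. The real checks are overLoadFactor and tooManyOverflowBuckets; a simplified restatement of the load-factor half might look roughly like this (6.5 is the maximum average bucket load documented in this version of the runtime, the function name tooFull is mine):

// tooFull mirrors the spirit of overLoadFactor: once the map averages more
// than loadFactor entries per bucket (and is past trivially small sizes),
// it is time to grow. 6.5 is the threshold documented in hashmap.go.
func tooFull(count int64, B uint8) bool {
    const loadFactor = 6.5
    return count > 8 && float64(count) > loadFactor*float64(uint64(1)<<B)
}

hashGrow then either doubles the number of buckets or, when the problem is too many overflow buckets rather than too many entries, reuses the same bucket count (the sameSizeGrow seen in mapaccess1), and old buckets are evacuated a little at a time via the growWork calls in mapassign and mapdelete.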

Deletion works much the same way: look the key up first, clear the slot if found, do nothing if not:

func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) {
    if raceenabled && h != nil {
        callerpc := getcallerpc(unsafe.Pointer(&t))
        pc := funcPC(mapdelete)
        racewritepc(unsafe.Pointer(h), callerpc, pc)
        raceReadObjectPC(t.key, key, callerpc, pc)
    }
    if msanenabled && h != nil {
        msanread(key, t.key.size)
    }
    if h == nil || h.count == 0 {
        return
    }
    if h.flags&hashWriting != 0 {
        throw("concurrent map writes")
    }

    alg := t.key.alg
    hash := alg.hash(key, uintptr(h.hash0))

    // Set hashWriting after calling alg.hash, since alg.hash may panic,
    // in which case we have not actually done a write (delete).
    h.flags |= hashWriting

    bucket := hash & (uintptr(1)<<h.B - 1)
    if h.growing() {
        growWork(t, h, bucket)
    }
    b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
    top := uint8(hash >> (sys.PtrSize*8 - 8))
    if top < minTopHash {
        top += minTopHash
    }
    for {
        for i := uintptr(0); i < bucketCnt; i++ {
            if b.tophash[i] != top {
                continue
            }
            k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
            k2 := k
            if t.indirectkey {
                k2 = *((*unsafe.Pointer)(k2))
            }
            if !alg.equal(key, k2) {
                continue
            }
            if t.indirectkey {
                *(*unsafe.Pointer)(k) = nil
            } else {
                typedmemclr(t.key, k)
            }
            v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*uintptr(t.keysize) + i*uintptr(t.valuesize))
            if t.indirectvalue {
                *(*unsafe.Pointer)(v) = nil
            } else {
                typedmemclr(t.elem, v)
            }
            // Mark the slot as empty
            b.tophash[i] = empty
            h.count--
            goto done
        }
        b = b.overflow(t)
        if b == nil {
            goto done
        }
    }

done:
    if h.flags&hashWriting == 0 {
        throw("concurrent map writes")
    }
    h.flags &^= hashWriting
}
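
From the caller's side this is the built-in delete, which is deliberately a no-op for missing keys:

package main

import "fmt"

func main() {
    m := map[string]int{"a": 1}
    delete(m, "a")      // found: the slot's tophash is set to empty, count drops
    delete(m, "b")      // not found: nothing happens, no error, no panic
    fmt.Println(len(m)) // 0
}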

To sum up: Go's map cannot be read and written concurrently. The runtime merely sets a flag and throws if it happens to notice a conflicting access; that is detection, not protection, so it is not safe at all. This is why sync.Map exists, which can be read and written concurrently without worry.
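
For example, sync.Map's Store and Load can be called from many goroutines at once, where the same pattern on a plain map would sooner or later hit the "concurrent map writes" throw seen above:

package main

import (
    "fmt"
    "sync"
)

func main() {
    var m sync.Map
    var wg sync.WaitGroup

    for i := 0; i < 10; i++ {
        wg.Add(1)
        go func(i int) {
            defer wg.Done()
            m.Store(i, i*i) // concurrent writes are safe here
        }(i)
    }
    wg.Wait()

    if v, ok := m.Load(3); ok {
        fmt.Println(v) // 9
    }
}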

