查看Ruby Under a Microscope的源代码

=== 2.4 st_table 阅读心得 ===
* st_features 定义了 table 的属性：
** entry_power  entries数组的大小，2的幂指数。
** bin_power     bins数组大小，同样是2的幂指数
** size_ind        根据 table 大小，选择 bins 对应的元素的大小，可能是  8-bit, 16-bit etc。
** bins_words   bins按照word计算的大小。
** 根据 SIZEOF_ST_INDEX_T 枚举了一堆 table 属性。
* 因为 bins 的大小都是 2 的次幂，因此计算哈希值对应的 bin 可以直接用位运算：
<pre>
/* Return mask for a bin index in table TAB.  */
static inline st_index_t
bins_mask(const st_table *tab)
{
    return get_bins_num(tab) - 1;
}

/* Return the index of table TAB bin corresponding to
   HASH_VALUE.  */
static inline st_index_t
hash_bin(st_hash_t hash_value, st_table *tab)
{
    return hash_value & bins_mask(tab);
}
</pre>
* 最小的  table 大小是 4， 由 MINIMAL_POWER2 决定。最大的  table 大小是 2 的 30 次方（非 8 位平台），8 位平台上是 2 的 62 次方。
* 对于小于等于 16 个元素的 table,不创建 bins 数组，直接存储在 entries 数组，线性探测，无需进行 hash 计算和查找。
* rebuild_table 可能有两种： compact 现有的，或者创建新的。当 entries_bound 到达上限的时候，开始 rebuild。
** 当已有 entries 数组的大小在现有元素大小的 2 倍到 4 倍(REBUILD_THRESHOLD)之间，或者元素数量小于 16 个，进入压缩流程,直接使用原来 table 作为新 new_tab
** 否则，进入新建 table 作为 new_tab
** 小技巧 , prefetch 指令，预加载下个元素，在遍历 entries  的时候用到。
<pre>
#define PREFETCH(addr, write_p) __builtin_prefetch(addr, write_p)
PREFETCH(entries + i + 1, 0);
</pre>
** rehash 其实很简单了，遍历已有的 entries 数组，跳过已经删除的，设置到新的 table 里，同时设置 bins：
<pre>
    bins = new_tab->bins;
    size_ind = get_size_ind(new_tab);
    for (i = tab->entries_start; i < bound; i++) {
        curr_entry_ptr = &entries[i];
	PREFETCH(entries + i + 1, 0);
	if (EXPECT(DELETED_ENTRY_P(curr_entry_ptr), 0))
	    continue;
	if (&new_entries[ni] != curr_entry_ptr)
	    new_entries[ni] = *curr_entry_ptr;
	if (EXPECT(bins != NULL, 1)) {
	    bin_ind = find_table_bin_ind_direct(new_tab, curr_entry_ptr->hash,
						curr_entry_ptr->key);
	    st_assert(bin_ind != UNDEFINED_BIN_IND
		      && (tab == new_tab || new_tab->rebuilds_num == 0)
		      && IND_EMPTY_BIN_P(new_tab, bin_ind));
	    set_bin(bins, size_ind, bin_ind, ni + ENTRY_BASE);
	}
	new_tab->num_entries++;
	ni++;
    }

</pre>

* 开放地址法，遇到哈希冲突，采用二次哈希，次级哈希函数如下：
<pre>
/* Return the next secondary hash index for table TAB using previous
   index IND and PERTERB.  Finally modulo of the function becomes a
   full *cycle linear congruential generator*, in other words it
   guarantees traversing all table bins in extreme case.

   According the Hull-Dobell theorem a generator
   "Xnext = (a*Xprev + c) mod m" is a full cycle generator iff
     o m and c are relatively prime
     o a-1 is divisible by all prime factors of m
     o a-1 is divisible by 4 if m is divisible by 4.

   For our case a is 5, c is 1, and m is a power of two.  */
static inline st_index_t
secondary_hash(st_index_t ind, st_table *tab, st_index_t *perterb)
{
    *perterb >>= 11;
    ind = (ind << 2) + ind + *perterb + 1;
    return hash_bin(ind, tab);
}

</pre>

* find_entry 线性探测，find_table_entry_ind 是二次哈希查找，但是可以选择是否启用二次探测：
<pre>
/* Use the quadratic probing.  The method has a better data locality
   but more collisions than the current approach.  In average it
   results in a bit slower search.  */
/*#define QUADRATIC_PROBE*/

    ind = hash_bin(hash_value, tab);
#ifdef QUADRATIC_PROBE
    d = 1;
#else
    peterb = hash_value;
#endif
    FOUND_BIN;
    for (;;) {
        bin = get_bin(tab->bins, get_size_ind(tab), ind);
        //找到相等的。
        if (! EMPTY_OR_DELETED_BIN_P(bin)
            && PTR_EQUAL(tab, &entries[bin - ENTRY_BASE], hash_value, key))
            break;
        //或者找到空的。
        else if (EMPTY_BIN_P(bin))
            return UNDEFINED_ENTRY_IND;
#ifdef QUADRATIC_PROBE
        //启用了二次探测，计算下一个探测位置。
	ind = hash_bin(ind + d, tab);
	d++;
#else
        //或者二次哈希
        ind = secondary_hash(ind, tab, &peterb);
#endif
        COLLISION;
    }
    return bin;
</pre>

默认采用二次哈希。

* 查找过程简单明了：

<pre>
/* Find an entry with KEY in table TAB.  Return non-zero if we found
   it.  Set up *RECORD to the found entry record.  */
int
st_lookup(st_table *tab, st_data_t key, st_data_t *value)
{
    st_index_t bin;
   //计算哈希
    st_hash_t hash = do_hash(key, tab);

    if (tab->bins == NULL) {
        //线性查找，对于少于等于 16 个元素的table
        bin = find_entry(tab, hash, key);
	if (bin == UNDEFINED_ENTRY_IND)
	    return 0;
    } else {
        //进入哈希查找。
        bin = find_table_entry_ind(tab, hash, key);
	if (bin == UNDEFINED_ENTRY_IND)
	    return 0;
	bin -= ENTRY_BASE;
    }
    //赋值，返回
    if (value != 0)
        *value = tab->entries[bin].record;
    return 1;
}

</pre>

* 删除是标记删除，开放地址法必须这么做，否则冲突的时候，前面的删除，就找不到后面的元素了：
<pre>
	bin = get_bin(tab->bins, get_size_ind(tab), bin_ind) - ENTRY_BASE;
	MARK_BIN_DELETED(tab, bin_ind);
    }
    entry = &tab->entries[bin];
    *key = entry->key;
    if (value != 0) *value = entry->record;
    MARK_ENTRY_DELETED(entry);
</pre>