高并发内存池(1)

2025-01-03

框架设计

现代很多的开发环境都是多核多线程，在申请内存的场景下，必然存在激烈的锁竞争问题。malloc本身其实已经很优秀，那么我们项目的原型tcmalloc就是在多线程高并发的场景下更胜一筹，所以这次我们实现的内存池需要考虑以下几方面的问题。

性能问题。
多线程环境下，锁竞争问题。
内存碎片问题。

thread cache，线程缓存，小于258kb，每个线程独享一个cache，线程从这里申请空间不需要加锁

central cache，中心缓存所有线程共享的，thread cache 从这里获取对象。但是这里加锁是桶锁，锁i竞争不是很激烈。均衡调度，会从thread cache 回收内存。

page cache，页缓存，是在central cache 之上的一层缓存，一页是4K或8K。当一个span的几个跨度页的对象都被回收之后，page cache 会回收central cache 满足条件的span对象，合并相邻的页，组成更大的页，缓解内存碎片问题

thread cache

由于这里的内存不是定长的，自由链表的设计就会有些麻烦

假设有一个自由链表，其中的内存块大小从很小的字节数到很大的字节数都有。当程序请求分配一个较小的内存块时，需要遍历整个自由链表去查找大小合适的内存块

所以我们设计成哈希桶结构，每个桶按位置映射大小的内存对象的自由链表，8 - 16 - 24，这就是导致内内存碎片。 外内存碎片是一些空闲的连续内存区域大小，但是空间不连续，合计内存足够但是不能满足一些内存分配要求。

先设计一个公共头文件，实现一下FreeList类。

#pragma once
#include <iostream>
#include <vector>
#include <time.h>
#include <assert.h>
using std::cout;
using std::endl;

void*& NextObj(void* obj) {
	return *(void**)obj;
}

// 管理切分好的小对象的自由链表
class FreeList {
public:
	void Push(void* obj) {
		assert(obj);
		//头插
		NextObj(obj) = _freeList;
		_freeList = obj;
	}
	void* Pop() {
		void* obj = _freeList;
		_freeList = NextObj(obj);
	}
private:
	void* _freeList;
};

设计思想

// 整体控制在最多10%左右的内碎片浪费
 // [1,128] 8byte对齐       freelist[0,16)
 // [128+1,1024] 16byte对齐   freelist[16,72)
 // [1024+1,8*1024] 128byte对齐   freelist[72,128)
 // [8*1024+1,64*1024] 1024byte对齐     freelist[128,184)
 // [64*1024+1,256*1024] 8*1024byte对齐   freelist[184,208)

对齐代码

//朴实的方法
//size_t _RoundUp(size_t size, size_t AlignNum) {
//	size_t alignSize;
//	if (size % AlignNum) {
//		alignSize = (size / AlignNum + 1) * AlignNum;
//	}
//	else {
//		alignSize = AlignNum;
//	}
//	return alignSize;
//}

//巧妙的方法， 找规律 位运算。
// 这个要用的频率较大，计算机对位运算的速度更快，处于对性能的考虑
static inline size_t _RoundUp(size_t bytes, size_t AlignNum) {
    return ((bytes + AlignNum - 1) & ~(AlignNum - 1));
}

static size_t RoundUp(size_t size) {
    if (size <= 128) {
        return _RoundUp(size, 8);
    }
    else if (size <= 1024) {
        return _RoundUp(size, 16);
    }
    else if (size <= 8 * 1024) {
        return _RoundUp(size, 128);
    }
    else if (size <= 64 * 1024) {
        return _RoundUp(size, 1024);
    }
    else if (size <= 128 * 1024) {
        return _RoundUp(size, 8 * 1024);
    }
    else {
        return 0;
    }
}

获取下标代码

static inline size_t _Index(size_t bytes, size_t align_shift)
{
    return ((bytes + (1 << align_shift) - 1) >> align_shift) - 1;
}
static inline size_t Index(size_t bytes)
{
    assert(bytes <= MAX_BYTES);
    // 每个区间有多少个链
    static int group_array[4] = { 16, 56, 56, 56 };
    if (bytes <= 128) {
        return _Index(bytes, 3);
    }
    else if (bytes <= 1024) {
        return _Index(bytes - 128, 4) + group_array[0];
    }
    else if (bytes <= 8 * 1024) {
        return _Index(bytes - 1024, 7) + group_array[1] + group_array[0];
    }
    else if (bytes <= 64 * 1024) {
        return _Index(bytes - 8 * 1024, 10) + group_array[2] + group_array[1]
            + group_array[0];
    }
    else if (bytes <= 256 * 1024) {
        return _Index(bytes - 64 * 1024, 13) + group_array[3] +
            group_array[2] + group_array[1] + group_array[0];
    }
    else {
        assert(false);
    }
    return -1;
}

就可以使用了

void* ThreadCache::Allocate(size_t size) {
	assert(size <= MAX_BYTES);
	size_t alignSize = SizeClass::RoundUp(size);
	size_t index = SizeClass::Index(size);

	if (!_freeLists[index].Empty()) {
		return _freeLists[index].Pop();
	}
	// 如果为空，就需要想central cache 申请内存里
	else {
		return FetchFromCentralCache(index, alignSize);
	}
}

还要保证每个线程独享一个cache，就不用考虑锁的问题了

1
2
3

// TLS thread local storage
static _declspec(thread) ThreadCache* pTLSThreadCache = nullptr;

// 判断是否已经存在TLS
static void* ConcurrentAlloc(size_t size) {
	if (pTLSThreadCache == nullptr) {
		pTLSThreadCache = new ThreadCache;
	}
    //打印线程号和线程id 验证（每个线程独享一个cache）
    cout << std::this_thread::get_id() << ":" << pTLSThreadCache << endl;
	return pTLSThreadCache->Allocate(size);
}

阶段代码完成

Common.h

#pragma once
#include <iostream>
#include <vector>
#include <time.h>
#include <assert.h>
#include <thread>
using std::cout;
using std::endl;
static const size_t MAX_BYTES = 256 * 1024;
static const size_t NFREELIST = 208;

inline void*& NextObj(void* obj) {
	return *((void**)obj);
}

// 管理切分好的小对象的自由链表
class FreeList {
public:
	void Push(void* obj) {
		assert(obj);
		//头插
		NextObj(obj) = _freeList;
		_freeList = obj;
	}
	void* Pop() {
		void* obj = _freeList;
		_freeList = NextObj(obj);

		return obj;
	}

	bool Empty() {
		return _freeList == nullptr;
	}
private:
	void* _freeList = nullptr;
};

//j计算对象大小对齐规则
// 整体控制在最多10%左右的内碎片浪费
// [1,128] 8byte对齐       freelist[0,16)
// [128+1,1024] 16byte对齐   freelist[16,72)
// [1024+1,8*1024] 128byte对齐   freelist[72,128)
// [8*1024+1,64*1024] 1024byte对齐     freelist[128,184)
// [64*1024+1,256*1024] 8*1024byte对齐   freelist[184,208)
class SizeClass {
public:
	//size_t _RoundUp(size_t size, size_t AlignNum) {
	//	size_t alignSize;
	//	if (size % AlignNum) {
	//		alignSize = (size / AlignNum + 1) * AlignNum;
	//	}
	//	else {
	//		alignSize = AlignNum;
	//	}
	//	return alignSize;
	//}
	
	//巧妙的方法
	static inline size_t _RoundUp(size_t bytes, size_t AlignNum) {
		return ((bytes + AlignNum - 1) & ~(AlignNum - 1));
	}

	static size_t RoundUp(size_t size) {
		if (size <= 128) {
			return _RoundUp(size, 8);
		}
		else if (size <= 1024) {
			return _RoundUp(size, 16);
		}
		else if (size <= 8 * 1024) {
			return _RoundUp(size, 128);
		}
		else if (size <= 64 * 1024) {
			return _RoundUp(size, 1024);
		}
		else if (size <= 128 * 1024) {
			return _RoundUp(size, 8 * 1024);
		}
		else {
			return 0;
		}
	}

	static inline size_t _Index(size_t bytes, size_t align_shift)
	{
		return ((bytes + (1 << align_shift) - 1) >> align_shift) - 1;
	}
	static inline size_t Index(size_t bytes)
	{
		assert(bytes <= MAX_BYTES);
		// 每个区间有多少个链
		static int group_array[4] = { 16, 56, 56, 56 };
		if (bytes <= 128) {
			return _Index(bytes, 3);
		}
		else if (bytes <= 1024) {
			return _Index(bytes - 128, 4) + group_array[0];
		}
		else if (bytes <= 8 * 1024) {
			return _Index(bytes - 1024, 7) + group_array[1] + group_array[0];
		}
		else if (bytes <= 64 * 1024) {
			return _Index(bytes - 8 * 1024, 10) + group_array[2] + group_array[1]
				+ group_array[0];
		}
		else if (bytes <= 256 * 1024) {
			return _Index(bytes - 64 * 1024, 13) + group_array[3] +
				group_array[2] + group_array[1] + group_array[0];
		}
		else {
			assert(false);
		}
		return -1;
	}
};

ConcurrentAlloc.h

#pragma once

#include "Common.h"
#include "ThreadCache.h"

static void* ConcurrentAlloc(size_t size) {
	if (pTLSThreadCache == nullptr) {
		pTLSThreadCache = new ThreadCache;
	}
	cout << std::this_thread::get_id() << ":" << pTLSThreadCache << endl;
	return pTLSThreadCache->Allocate(size);
}

static void ConcurrentFree(void* ptr) {

}

ThreadCache.h

#pragma once

#include "Common.h"

class ThreadCache {
public:

	void* Allocate(size_t size);

	void Deallocate(void* ptr, size_t size);

	void* FetchFromCentralCache(size_t index, size_t size);


private:
	FreeList _freeLists[NFREELIST];
};

// TLS thread local storage
static _declspec(thread) ThreadCache* pTLSThreadCache = nullptr;

Thread_Cache.cpp

#include "ThreadCache.h"

void* ThreadCache::Allocate(size_t size) {
	assert(size <= MAX_BYTES);
	size_t alignSize = SizeClass::RoundUp(size);
	size_t index = SizeClass::Index(size);

	if (!_freeLists[index].Empty()) {
		return _freeLists[index].Pop();
	}
	// 如果为空，就需要想central cache 申请内存里
	else {
		return FetchFromCentralCache(index, alignSize);
	}
}

void ThreadCache::Deallocate(void* ptr, size_t size) {
}


void* ThreadCache::FetchFromCentralCache(size_t index, size_t size) {
	return nullptr;
}

test.cpp

#include "ConcurrentAlloc.h"

void Alloc1() {
	for (size_t i = 0; i < 5; i++) {
		void* ptr = ConcurrentAlloc(6);
	}
}
void Alloc2() {
	for (size_t i = 0; i < 5; i++) {
		void* ptr = ConcurrentAlloc(7);
	}
}

void TLSTest() {
	std::thread t1(Alloc1);
	std::thread t2(Alloc1);
	t1.join();
	t2.join();
}

int main() {
	//TestObjectPool();
	TLSTest();
}