Framework design
Many modern development environments are multi-core and multi-threaded, so memory allocation inevitably runs into heavy lock contention. malloc itself is already quite good, but tcmalloc, the prototype for this project, does even better under highly concurrent multi-threaded workloads. The memory pool we build here therefore has to address the following issues:
- Performance.
- Lock contention in multi-threaded environments.
- Memory fragmentation.
The pool is organized into three layers:
- thread cache: a per-thread cache that serves requests smaller than 256KB. Each thread owns its own cache, so allocating from it requires no locking.
- central cache: a cache shared by all threads, from which thread caches fetch objects. It is protected by per-bucket locks, so lock contention is mild. It also balances memory across threads and reclaims memory from thread caches.
- page cache: the cache layer above the central cache, which manages memory in pages (4KB or 8KB each). When all the objects carved out of a span (a run of several pages) have been returned, the page cache takes that span back from the central cache and merges adjacent pages into larger ones, relieving memory fragmentation.
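A rough structural sketch of the three layers follows. This is only a sketch: the stage code in this article implements ThreadCache alone, and the CentralCache/PageCache classes below are placeholders that restate the description above, not a finished interface.

```cpp
// Sketch of the three layers and their locking, as described above.
// Only ThreadCache is actually implemented in this stage's code.

class ThreadCache   // one instance per thread (TLS); no lock needed
{
    // serves requests <= 256KB out of per-size free-list buckets
};

class CentralCache  // one shared instance; one lock per bucket ("bucket lock")
{
    // hands batches of objects to thread caches and takes them back
};

class PageCache     // one shared instance; works in whole pages (4KB/8KB)
{
    // hands spans of pages to the central cache and merges adjacent
    // free pages into larger spans to relieve fragmentation
};
```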
thread cache
Because the blocks managed here are not all one fixed size, a free-list design takes a little more care.
Suppose we used a single free list whose blocks ranged from a few bytes up to very large sizes. Every time the program asked for a small block, we would have to walk the whole list looking for one of a suitable size.
So instead we use a hash-bucket structure: each bucket holds a free list of objects of one aligned size, with the bucket's position determining that size (8, 16, 24, ...). Rounding a request up to the bucket size is what produces internal fragmentation; for example, a 6-byte request is served from the 8-byte bucket and wastes 2 bytes. External fragmentation, by contrast, is when the free memory consists of scattered, non-contiguous regions: the total free space is large enough, but no single region can satisfy a given allocation request.
First, set up a common header and implement the FreeList class.
```cpp
#pragma once
#include <iostream>
#include <vector>
#include <time.h>
#include <assert.h>

using std::cout;
using std::endl;

// A free object stores the address of the next free object in its first
// pointer-sized bytes; NextObj exposes that field for reading and writing.
void*& NextObj(void* obj)
{
    return *(void**)obj;
}

class FreeList
{
public:
    void Push(void* obj)
    {
        assert(obj);
        // Head-insert the returned object.
        NextObj(obj) = _freeList;
        _freeList = obj;
    }

    void* Pop()
    {
        // Head-remove one object.
        void* obj = _freeList;
        _freeList = NextObj(obj);
        return obj;
    }

private:
    void* _freeList = nullptr;
};
```
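A quick sanity check of FreeList (throwaway test code, assuming the header above is saved as Common.h, as in the listings later in this article):

```cpp
#include "Common.h"

int main()
{
    // Dummy "objects"; each must be at least one pointer wide, because the
    // free list stores the next pointer inside the object itself.
    char a[16], b[16];

    FreeList list;
    list.Push(a);
    list.Push(b);

    // LIFO order: b was pushed last, so it pops first.
    void* first = list.Pop();
    void* second = list.Pop();
    cout << (first == b) << " " << (second == a) << endl; // prints: 1 1
    return 0;
}
```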
Design idea
The whole range [1, 256KB] cannot use a single alignment: one small alignment would need an enormous number of buckets, and one large alignment would waste too much memory on small requests. Instead, different size ranges align to different granularities, which keeps internal fragmentation for all but the smallest requests to roughly 10% or less while using only 208 buckets in total:
- [1, 128] bytes: align to 8 → 16 buckets
- (128, 1024]: align to 16 → 56 buckets
- (1024, 8KB]: align to 128 → 56 buckets
- (8KB, 64KB]: align to 1024 → 56 buckets
- (64KB, 256KB]: align to 8KB → 24 buckets
Alignment code
```cpp
// Round bytes up to a multiple of AlignNum (AlignNum must be a power of two).
static inline size_t _RoundUp(size_t bytes, size_t AlignNum)
{
    return ((bytes + AlignNum - 1) & ~(AlignNum - 1));
}

// Round a request size up to its bucket size, range by range.
static size_t RoundUp(size_t size)
{
    if (size <= 128)
    {
        return _RoundUp(size, 8);
    }
    else if (size <= 1024)
    {
        return _RoundUp(size, 16);
    }
    else if (size <= 8 * 1024)
    {
        return _RoundUp(size, 128);
    }
    else if (size <= 64 * 1024)
    {
        return _RoundUp(size, 1024);
    }
    else if (size <= 256 * 1024)
    {
        return _RoundUp(size, 8 * 1024);
    }
    else
    {
        return 0;
    }
}
```
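A few spot checks of the rounding (assuming RoundUp lives in the SizeClass class, as in the full Common.h listing later):

```cpp
#include "Common.h"

int main()
{
    assert(SizeClass::RoundUp(6) == 8);        // [1,128]      aligned to 8
    assert(SizeClass::RoundUp(129) == 144);    // (128,1024]   aligned to 16
    assert(SizeClass::RoundUp(1025) == 1152);  // (1024,8KB]   aligned to 128
    return 0;
}
```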
Computing the bucket index
```cpp
// Index within a single alignment group; align_shift is log2 of the
// alignment (3 for 8-byte alignment, 4 for 16-byte alignment, ...).
static inline size_t _Index(size_t bytes, size_t align_shift)
{
    return ((bytes + (1 << align_shift) - 1) >> align_shift) - 1;
}

// Map a request size to its bucket index in the free-list array.
static inline size_t Index(size_t bytes)
{
    assert(bytes <= MAX_BYTES);

    // Number of buckets used by each preceding range.
    static int group_array[4] = { 16, 56, 56, 56 };

    if (bytes <= 128)
    {
        return _Index(bytes, 3);
    }
    else if (bytes <= 1024)
    {
        return _Index(bytes - 128, 4) + group_array[0];
    }
    else if (bytes <= 8 * 1024)
    {
        return _Index(bytes - 1024, 7) + group_array[1] + group_array[0];
    }
    else if (bytes <= 64 * 1024)
    {
        return _Index(bytes - 8 * 1024, 10) + group_array[2] + group_array[1] + group_array[0];
    }
    else if (bytes <= 256 * 1024)
    {
        return _Index(bytes - 64 * 1024, 13) + group_array[3] + group_array[2] + group_array[1] + group_array[0];
    }
    else
    {
        assert(false);
    }

    return -1;
}
```
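And the corresponding spot checks for Index (same assumption, that Index is a member of SizeClass):

```cpp
#include "Common.h"

int main()
{
    // Bucket 0 holds 8-byte objects, bucket 1 holds 16-byte objects, ...
    assert(SizeClass::Index(6) == 0);     // rounds to 8  -> bucket 0
    assert(SizeClass::Index(9) == 1);     // rounds to 16 -> bucket 1
    assert(SizeClass::Index(129) == 16);  // first bucket of the 16-byte-aligned range
    return 0;
}
```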
With RoundUp and Index in place, ThreadCache::Allocate can be written:
```cpp
void* ThreadCache::Allocate(size_t size)
{
    assert(size <= MAX_BYTES);
    size_t alignSize = SizeClass::RoundUp(size);
    size_t index = SizeClass::Index(size);

    if (!_freeLists[index].Empty())
    {
        return _freeLists[index].Pop();
    }
    else
    {
        return FetchFromCentralCache(index, alignSize);
    }
}
```
We also have to guarantee that each thread owns its own cache, so that no locking is needed. Thread-local storage (TLS) gives us exactly that:
```cpp
static _declspec(thread) ThreadCache* pTLSThreadCache = nullptr;
```
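Note that `_declspec(thread)` is MSVC-specific syntax. For reference, a portable sketch using the standard C++11 keyword instead (not the form used in this project's Windows-targeted code):

```cpp
// Portable C++11 equivalent of the MSVC-specific declaration above.
static thread_local ThreadCache* pTLSThreadCache = nullptr;
```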
```cpp
static void* ConcurrentAlloc(size_t size)
{
    // Lazily create this thread's ThreadCache on its first allocation.
    if (pTLSThreadCache == nullptr)
    {
        pTLSThreadCache = new ThreadCache;
    }

    // Debug print: which thread is using which ThreadCache.
    cout << std::this_thread::get_id() << ":" << pTLSThreadCache << endl;

    return pTLSThreadCache->Allocate(size);
}
```
Completed code for this stage
Common.h
```cpp
#pragma once
#include <iostream>
#include <vector>
#include <time.h>
#include <assert.h>
#include <thread>

using std::cout;
using std::endl;

// The thread cache serves requests up to 256KB, spread over 208 buckets.
static const size_t MAX_BYTES = 256 * 1024;
static const size_t NFREELIST = 208;

inline void*& NextObj(void* obj)
{
    return *((void**)obj);
}

class FreeList
{
public:
    void Push(void* obj)
    {
        assert(obj);
        NextObj(obj) = _freeList;
        _freeList = obj;
    }

    void* Pop()
    {
        void* obj = _freeList;
        _freeList = NextObj(obj);

        return obj;
    }

    bool Empty()
    {
        return _freeList == nullptr;
    }

private:
    void* _freeList = nullptr;
};

class SizeClass
{
public:
    static inline size_t _RoundUp(size_t bytes, size_t AlignNum)
    {
        return ((bytes + AlignNum - 1) & ~(AlignNum - 1));
    }

    static size_t RoundUp(size_t size)
    {
        if (size <= 128)
        {
            return _RoundUp(size, 8);
        }
        else if (size <= 1024)
        {
            return _RoundUp(size, 16);
        }
        else if (size <= 8 * 1024)
        {
            return _RoundUp(size, 128);
        }
        else if (size <= 64 * 1024)
        {
            return _RoundUp(size, 1024);
        }
        else if (size <= 256 * 1024)
        {
            return _RoundUp(size, 8 * 1024);
        }
        else
        {
            return 0;
        }
    }

    static inline size_t _Index(size_t bytes, size_t align_shift)
    {
        return ((bytes + (1 << align_shift) - 1) >> align_shift) - 1;
    }

    static inline size_t Index(size_t bytes)
    {
        assert(bytes <= MAX_BYTES);

        static int group_array[4] = { 16, 56, 56, 56 };

        if (bytes <= 128)
        {
            return _Index(bytes, 3);
        }
        else if (bytes <= 1024)
        {
            return _Index(bytes - 128, 4) + group_array[0];
        }
        else if (bytes <= 8 * 1024)
        {
            return _Index(bytes - 1024, 7) + group_array[1] + group_array[0];
        }
        else if (bytes <= 64 * 1024)
        {
            return _Index(bytes - 8 * 1024, 10) + group_array[2] + group_array[1] + group_array[0];
        }
        else if (bytes <= 256 * 1024)
        {
            return _Index(bytes - 64 * 1024, 13) + group_array[3] + group_array[2] + group_array[1] + group_array[0];
        }
        else
        {
            assert(false);
        }

        return -1;
    }
};
```
ConcurrentAlloc.h
```cpp
#pragma once

#include "Common.h"
#include "ThreadCache.h"

static void* ConcurrentAlloc(size_t size)
{
    if (pTLSThreadCache == nullptr)
    {
        pTLSThreadCache = new ThreadCache;
    }

    cout << std::this_thread::get_id() << ":" << pTLSThreadCache << endl;

    return pTLSThreadCache->Allocate(size);
}

static void ConcurrentFree(void* ptr)
{
    // Not implemented at this stage.
}
```
ThreadCache.h
```cpp
#pragma once

#include "Common.h"

class ThreadCache
{
public:
    // Allocate and free objects no larger than MAX_BYTES.
    void* Allocate(size_t size);
    void Deallocate(void* ptr, size_t size);

    // Fetch a batch of objects from the central cache when a bucket is empty.
    void* FetchFromCentralCache(size_t index, size_t size);

private:
    FreeList _freeLists[NFREELIST];
};

// TLS: each thread gets its own ThreadCache pointer, so no locking is needed.
static _declspec(thread) ThreadCache* pTLSThreadCache = nullptr;
```
Thread_Cache.cpp
| #include "ThreadCache.h"
void* ThreadCache::Allocate(size_t size) { assert(size <= MAX_BYTES); size_t alignSize = SizeClass::RoundUp(size); size_t index = SizeClass::Index(size);
if (!_freeLists[index].Empty()) { return _freeLists[index].Pop(); } // 如果为空,就需要想central cache 申请内存里 else { return FetchFromCentralCache(index, alignSize); } }
void ThreadCache::Deallocate(void* ptr, size_t size) { }
void* ThreadCache::FetchFromCentralCache(size_t index, size_t size) { return nullptr; }
|
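Deallocate and FetchFromCentralCache are still stubs. As a rough sketch of where Deallocate is headed in a later stage (this is an assumption about the eventual shape, not this stage's code; returning surplus memory to the central cache is deliberately left out), freeing an object just means pushing it back into the bucket it came from:

```cpp
// Sketch only: the eventual shape of ThreadCache::Deallocate.
void ThreadCache::Deallocate(void* ptr, size_t size)
{
    assert(ptr);
    assert(size <= MAX_BYTES);

    // Find the bucket this object belongs to and head-insert it back.
    size_t index = SizeClass::Index(size);
    _freeLists[index].Push(ptr);

    // A later stage will also hand memory back to the central cache
    // when a bucket's list grows too long.
}
```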
test.cpp
| #include "ConcurrentAlloc.h"
void Alloc1() { for (size_t i = 0; i < 5; i++) { void* ptr = ConcurrentAlloc(6); } } void Alloc2() { for (size_t i = 0; i < 5; i++) { void* ptr = ConcurrentAlloc(7); } }
void TLSTest() { std::thread t1(Alloc1); std::thread t2(Alloc1); t1.join(); t2.join(); }
int main() { TLSTest(); }
|
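When this test runs, the debug line in ConcurrentAlloc prints each thread's id alongside the address of its ThreadCache; the two threads should show two different addresses, confirming that the TLS pointer really is per-thread (the exact ids and addresses will differ from run to run).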