最近在一個(gè)模塊里面使用了boost::pool<>,結(jié)果卻遇到嚴(yán)重的性能問題:
#include <boost/pool/pool.hpp>
// Variable-size allocator backed by a boost::pool<> whose requested chunk
// size is 1, so a request for `size` bytes is served as `size` pool units
// via ordered_malloc(size).
//
// A block obtained from ordered_malloc(n) has to be handed back with
// ordered_free(ptr, n). The single-argument ordered_free(ptr) releases only
// one chunk, which leaks the remainder of the block and leaves isolated
// chunks on the free list that later contiguous-run searches must walk past.
class Allocator
{
public:
    // Creates the underlying pool with a requested chunk size of 1.
    Allocator()
        : m_pool(1)
    {
    }

    // Returns a block of at least `size` bytes, or a null pointer when the
    // pool returns none (including size == 0).
    void* Allocate(size_t size)
    {
        return m_pool.ordered_malloc(size);
    }

    // Returns a block to the pool.
    //   ptr  - pointer previously returned by Allocate(); null is ignored.
    //   size - the size that was passed to Allocate() for this block. The
    //          default of 1 releases a single chunk, which matches the
    //          earlier one-argument behaviour and is only correct for
    //          blocks that fit in one chunk.
    void Free(void* ptr, size_t size = 1)
    {
        // Allocate() can return null; the pool's free routines require a
        // non-null chunk.
        if (!ptr)
            return;
        m_pool.ordered_free(ptr, size);
    }

private:
    boost::pool<> m_pool;
};
在我們的使用場(chǎng)景中,每次會(huì)請(qǐng)求32B~1024B不等的內(nèi)存塊,通過調(diào)試發(fā)現(xiàn),malloc_n()和try_malloc_n()中關(guān)于尋找n塊連續(xù)chunk的邏輯會(huì)消耗大量的時(shí)間,導(dǎo)致性能瓶頸。
// Excerpt of pool<UserAllocator>::ordered_malloc; the "..." lines stand for
// code omitted from this listing.
template <typename UserAllocator>
void * pool<UserAllocator>::ordered_malloc(const size_type n)
{
...
// This is the bottleneck: the search for num_chunks contiguous chunks.
void * ret = store().malloc_n(num_chunks, partition_size);
...
}
// Removes a run of n address-contiguous free chunks (each partition_size bytes
// apart) from the free list and returns the first chunk of that run.
// Returns 0 when n is 0 or when the list is exhausted without finding a run.
template <typename SizeType>
void * simple_segregated_storage<SizeType>::malloc_n(const size_type n, const size_type partition_size)
{
BOOST_POOL_VALIDATE_INTERNALS
if(n == 0)
return 0;
// start is the node whose successor begins the candidate run; begin at the list head.
void * start = &first;
void * iter;
do
{
// Nothing follows start: the scan reached the end without success.
if (nextof(start) == 0)
return 0;
// On failure try_malloc_n returns 0 and moves start to the chunk where the
// run broke, so the next iteration resumes scanning from there.
iter = try_malloc_n(start, n, partition_size);
} while (iter == 0);
// The run spans nextof(start) .. iter; read its head before relinking.
void * const ret = nextof(start);
// Unlink the whole run by pointing start past its last chunk.
nextof(start) = nextof(iter);
BOOST_POOL_VALIDATE_INTERNALS
return ret;
}
// Checks whether the n list entries following start are contiguous in memory.
// Returns the last chunk of the run on success. On failure returns 0 and sets
// start (passed by reference) to the chunk at which the check stopped.
template <typename SizeType>
void * simple_segregated_storage<SizeType>::try_malloc_n(void * & start, size_type n, const size_type partition_size)
{
void * iter = nextof(start);
// n chunks need n - 1 adjacent links; the visible caller malloc_n rejects
// n == 0 before calling this function.
while (--n != 0)
{
void * next = nextof(iter);
// Contiguous only if the next list entry sits exactly partition_size bytes after iter.
if (next != static_cast<char *>(iter) + partition_size)
{
// next == 0 (end-of-list) or non-contiguous chunk found
start = iter;
return 0;
}
iter = next;
}
return iter;
}
由于我們?cè)跇?gòu)造m_pool的時(shí)候指定的“chunk size”為1,所以懷疑是不是指定的“chunk size”過小導(dǎo)致了大量的內(nèi)存碎片,使得每次尋找連續(xù)chunk的時(shí)候效率過低,于是考慮調(diào)大指定的“chunk size”:
// Variable-size allocator backed by a boost::pool<> whose requested chunk
// size is CHUNK_SIZE bytes. A request is rounded up to a whole number of
// chunks and served with ordered_malloc(numChunks).
//
// A block obtained from ordered_malloc(n) has to be handed back with
// ordered_free(ptr, n); the single-argument ordered_free(ptr) releases only
// one chunk, so it is only correct for requests of at most CHUNK_SIZE bytes.
class Allocator
{
public:
    // Creates the underlying pool with a requested chunk size of CHUNK_SIZE.
    Allocator()
        : m_pool(CHUNK_SIZE)
    {
    }

    // Returns a block of at least `size` bytes, or a null pointer when the
    // pool returns none (including size == 0).
    void* Allocate(size_t size)
    {
        return m_pool.ordered_malloc(ChunksFor(size));
    }

    // Returns a block to the pool.
    //   ptr  - pointer previously returned by Allocate(); null is ignored.
    //   size - the size that was passed to Allocate() for this block. The
    //          default releases exactly one chunk, which matches the earlier
    //          one-argument behaviour and is correct for any request of at
    //          most CHUNK_SIZE bytes. Larger blocks must pass their size,
    //          otherwise every chunk after the first is leaked.
    void Free(void* ptr, size_t size = CHUNK_SIZE)
    {
        // Allocate() can return null; the pool's free routines require a
        // non-null chunk.
        if (!ptr)
            return;
        m_pool.ordered_free(ptr, ChunksFor(size));
    }

private:
    // Number of CHUNK_SIZE-byte chunks needed to hold `size` bytes.
    // Written as divide-plus-remainder so that a very large `size` cannot
    // wrap around the way (size + CHUNK_SIZE - 1) would.
    static size_t ChunksFor(size_t size)
    {
        return size / CHUNK_SIZE + (size % CHUNK_SIZE != 0 ? 1 : 0);
    }

    // Requested chunk size given to the pool: 1024 bytes.
    static const uint32_t CHUNK_SIZE = 1024;
    boost::pool<> m_pool;
};
最后發(fā)現(xiàn)將“chunk size”改為1024,可以解決困擾我們的性能問題,于是打住,便沒有再繼續(xù)深究下去。