h1

ultra performance go!!!!!

November 3, 2008

This is awesome!!!  I just now implemented a memory block manager for all the stuff that gets allocated / deallocated hundreds if not thousands of times an iteration, which essentially recycles all blocks of that size’s cache deallocated until cleared.  There is a list of optimal sizes which all go evenly in to the chunks, which are propagated on out of free blocks.  It is basically just never deallocating anything ’till the end, but since we recycle everything, there are no leaks and it’s faster.  Twenty times faster, according to Monsieur Framerate, and we don’t argue with him.  If you want the full code for all my memory management used in Rat Physics, I’m going to put up articles about it.  But in the meantime, here’s the plain ol’ code.  But wait, credit where due: Erin Catto is responsible for his idea to use both stack allocators and block recyclers in Box2D.  That is where I directly ripped the idea off from.  The code, however, is my own.  Here it is now:

memory.h

#ifndef RAT_PHYSICS_MEMORY
#define RAT_PHYSICS_MEMORY

#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

// main alloc/dealloc functions, just calls dlmalloc|malloc/dlfree|free

void rat_set_alloc_fn(void *(*alloc_fn)(size_t));
void rat_set_realloc_fn(void *(*realloc_fn)(void *,size_t));
void rat_set_dealloc_fn(void (*dealloc_fn)(void *));

void *rat_alloc(size_t size);
void *rat_realloc(void *mem,size_t newsize);
void rat_dealloc(void *mem);

unsigned int rat_blocks_alloced();
unsigned int rat_mem_alloced();

void rat_mem_alloced_string(char *str);


// a simple stack allocator
// must push/pop memory allocations; faster than
// arbitrary malloc/free by a int shot.

struct rat_stack_allocator;
typedef struct rat_stack_allocator rat_stack_allocator;

extern const size_t rat_max_stack_cells;
extern const size_t rat_stack_size;

rat_stack_allocator *rat_stack_allocator_create();
void rat_stack_allocator_destroy(rat_stack_allocator *sal);

void *rat_stack_alloc(rat_stack_allocator *sal,size_t size);
void rat_stack_dealloc(rat_stack_allocator *sal,void *mem);

unsigned int rat_stack_most_alloced(rat_stack_allocator *sal);


// a simple small block allocator for objects of about
// the same size being alloced/dealloced on a regular
// basis.  it is very fast at recycling

struct rat_block_allocator;
typedef struct rat_block_allocator rat_block_allocator;

extern const unsigned int rat_chunk_size;
extern const unsigned int rat_max_block_size;
extern const unsigned int rat_block_sizes;
extern const unsigned int rat_chunk_arr_inc;

rat_block_allocator *rat_block_allocator_create();
void rat_block_allocator_destroy(rat_block_allocator *bal);
void rat_block_allocator_clear(rat_block_allocator *bal);

void *rat_block_alloc(rat_block_allocator *bal,size_t size);
void *rat_block_realloc(rat_block_allocator *bal,void *mem,size_t oldsize,size_t size);
void rat_block_dealloc(rat_block_allocator *bal,void *mem,size_t size);

#ifdef __cplusplus
}
#endif

#endif

memory.c

#include "common.h"

#define RAT_MEMORY_GUARD_WORD 0x52415453 // spells "RATS" in ascii 😛

#include "memory.h"
#ifndef RAT_PHYSICS_MSVC
#	include "dlmalloc.h"
#endif

static unsigned int blocksalloced=0;
static unsigned int amtmemalloced=0;
static unsigned int extrafootprint=0;

#ifdef RAT_PHYSICS_MSVC
	static void *(*rat_alloc_fn)(size_t)=malloc;
	static void *(*rat_realloc_fn)(void *,size_t)=realloc;
	static void (*rat_dealloc_fn)(void *)=free;
#else
	static void *(*rat_alloc_fn)(size_t)=dlmalloc;
	static void *(*rat_realloc_fn)(void *,size_t)=dlrealloc;
	static void (*rat_dealloc_fn)(void *)=dlfree;
#endif

void rat_set_alloc_fn(void *(*alloc_fn)(size_t))
{
	rat_alloc_fn=alloc_fn;
}

void rat_set_realloc_fn(void *(*realloc_fn)(void *,size_t))
{
	rat_realloc_fn=realloc_fn;
}

void rat_set_dealloc_fn(void (*dealloc_fn)(void *))
{
	rat_dealloc_fn=dealloc_fn;
}

void *rat_alloc(size_t size)
{
	size_t *ptr;
	void *mem=rat_alloc_fn(size+sizeof(size_t)*2);
	if (mem==NULL) return NULL;
	ptr=mem;

	ptr[0]=RAT_MEMORY_GUARD_WORD;
	ptr[1]=size;
	amtmemalloced+=size;

	blocksalloced++;
	extrafootprint=blocksalloced*sizeof(size_t)*2;
	return (void *)(ptr+2);
}

void *rat_realloc(void *mem,size_t newsize)
{
	size_t *ptr=mem;
	if (mem==NULL) return rat_alloc(newsize);
	if (ptr[-2]!=RAT_MEMORY_GUARD_WORD) return NULL;

	amtmemalloced+=newsize-ptr[-1];
	ptr=(size_t *)rat_realloc_fn((void *)(ptr-2),newsize+sizeof(size_t)*2);

	ptr[0]=RAT_MEMORY_GUARD_WORD;
	ptr[1]=newsize;
	return (void *)(ptr+2);
}

void rat_dealloc(void *mem)
{
	size_t *ptr=mem;
	if (mem==NULL) return;
	if (ptr[-2]!=RAT_MEMORY_GUARD_WORD) return;
	amtmemalloced-=ptr[-1];
	blocksalloced--;
	extrafootprint=blocksalloced*sizeof(size_t)*2;
	rat_dealloc_fn((void *)(ptr-2));
}

unsigned int rat_blocks_alloced()	{return blocksalloced;}
unsigned int rat_mem_alloced()		{return amtmemalloced+extrafootprint;}

void rat_mem_alloced_string(char *str)
{
	size_t malc=amtmemalloced+extrafootprint;
	if (malc<1024)
		sprintf(str,"%04u Bytes",malc);
	else if (malc>=1024&&malc<1048576)
	{
		rat_real kb=(float)malc/1024.0f;
		sprintf(str,"%04.4f KB",kb);
	}
	else if (malc>=1048576&&malc<1073741824)
	{
		rat_real mb=(float)malc/1048576.0f;
		sprintf(str,"%04.4f MB",mb);
	}
	else
		sprintf(str,"overflowed");
}

// stack allocator...

const size_t rat_max_stack_cells=128;	// 128 stack cells
const size_t rat_stack_size=512*1024;	// 512KB total memory

typedef struct rat_stack_cell
{
	char *data;
	size_t size;
	int nonstackalloc;
} rat_stack_cell;

struct rat_stack_allocator
{
	unsigned int idx;
	char *data;

	unsigned int alloced;
	unsigned int most_alloced;

	unsigned int numcells;
	rat_stack_cell *cells;
};

rat_stack_allocator *rat_stack_allocator_create()
{
	rat_stack_allocator *sal=(rat_stack_allocator *)rat_alloc_fn(sizeof(rat_stack_allocator));
	memset(sal,0,sizeof(rat_stack_allocator));

	sal->alloced=0;
	sal->most_alloced=0;
	sal->idx=0;
	sal->numcells=0;

	sal->data=(char *)rat_alloc_fn(rat_stack_size);
	sal->cells=(rat_stack_cell *)rat_alloc_fn(sizeof(rat_stack_cell)*rat_max_stack_cells);

	return sal;
}

void rat_stack_allocator_destroy(rat_stack_allocator *sal)
{
	if (sal->idx||sal->numcells)
	{
		fprintf(stderr,"stack destroyed with memory still allocated!!!\n");
		fflush(stderr);
	}

	rat_dealloc_fn((void *)sal->data);
	rat_dealloc_fn((void *)sal->cells);
	rat_dealloc_fn((void *)sal);
}

void *rat_stack_alloc(rat_stack_allocator *sal,size_t size)
{
	rat_stack_cell *cell;

	if (sal->numcells>=rat_max_stack_cells)
	{
		fprintf(stderr,"stack allocator overflow!!! only %u cells allowed.\n",rat_max_stack_cells);
		fflush(stderr);
		return NULL;
	}

	cell=sal->cells+sal->numcells;
	cell->size=size;
	if (sal->idx+size>rat_stack_size)
	{
		cell->data=(char *)rat_alloc_fn(size);
		cell->nonstackalloc=1; // we did not use memory from the stack
	}
	else
	{
		cell->data=sal->data+sal->idx;
		cell->nonstackalloc=0; // we used memory from the stack
		sal->idx+=cell->size;
	}

	sal->alloced+=size;
	sal->most_alloced=sal->alloced>sal->most_alloced?sal->alloced:sal->most_alloced;
	sal->numcells++;

	amtmemalloced+=size;
	blocksalloced++;

	return (void *)cell->data;
}

void rat_stack_dealloc(rat_stack_allocator *sal,void *mem)
{
	rat_stack_cell *cell;

	if (sal->numcells==0)
	{
		fprintf(stderr,"there's nothing left to deallocate!!!\n");
		fflush(stderr);
		return;
	}

	cell=sal->cells+sal->numcells-1;
	if (mem!=(void *)cell->data)
	{
		fprintf(stderr,
			"memory at 0x%08x is not at top of stack; deallocate fail! 0x%08x is next.\n",
			(unsigned int)mem,(unsigned int)cell->data);
		fflush(stderr);
		return;
	}

	if (cell->nonstackalloc)
		rat_dealloc_fn((void *)mem);
	else
		sal->idx-=cell->size;

	sal->alloced-=cell->size;
	sal->numcells--;

	amtmemalloced-=cell->size;
	blocksalloced--;
}

unsigned int rat_stack_most_alloced(rat_stack_allocator *sal)
{
	return sal->most_alloced;
}

// block allocator...

const unsigned int rat_chunk_size=32768;
const unsigned int rat_max_block_size=16384;
const unsigned int rat_block_sizes=20;
const unsigned int rat_chunk_arr_inc=128;

typedef struct rat_block
{
	struct rat_block *next;
} rat_block;

typedef struct rat_chunk
{
	unsigned int block_size;
	rat_block *blocks;
} rat_chunk;

static unsigned int block_sizes[]={16,32,64,96,128,160,192,224,256,320,384,448,512,640,1024,2048,4096,8192,16384};

static unsigned char *block_size_lookup;
static int is_block_size_lookup_init=0;

struct rat_block_allocator
{
	unsigned int numchunks;
	rat_chunk *chunks;

	unsigned int chunk_space;
	rat_block **recycling_lists;
};

void free_bsl(void)	// we use (void) to conform to that retarded standard
					// and get rid of the associated warning message
{
	free((void *)block_size_lookup);
}

rat_block_allocator *rat_block_allocator_create()
{
	rat_block_allocator *bal=(rat_block_allocator *)rat_alloc(sizeof(rat_block_allocator));
	memset(bal,0,sizeof(rat_block_allocator));

	bal->chunk_space=rat_chunk_arr_inc;
	
	bal->numchunks=0;
	bal->chunks=(rat_chunk *)rat_alloc(sizeof(rat_chunk)*bal->chunk_space);
	bal->recycling_lists=(rat_block **)rat_alloc(sizeof(rat_block *)*rat_block_sizes);
	
	memset(bal->chunks,0,sizeof(rat_chunk)*bal->chunk_space);
	memset(bal->recycling_lists,0,sizeof(rat_block *)*rat_block_sizes);

	if (!is_block_size_lookup_init)
	{
		register unsigned int i,j=0;
		block_size_lookup=(char *)malloc(rat_max_block_size+1);
		atexit(free_bsl);
		
		for (i=1; i<rat_max_block_size+1; i++)
		{
			if (j>=rat_block_sizes)
			{
				fprintf(stderr,"blocks size list exceeds block sizes!\n");
				fflush(stderr);
			}
			if (i<=block_sizes&#91;j&#93;)
				block_size_lookup&#91;i&#93;=(unsigned char)j;
			else
				block_size_lookup&#91;i&#93;=(unsigned char)++j;
		}
		is_block_size_lookup_init=1;
	}

	return bal;
}

void rat_block_allocator_destroy(rat_block_allocator *bal)
{
	register unsigned int i;

	for (i=0; i<bal->numchunks; i++)
		rat_dealloc((void *)bal->chunks[i].blocks);
	
	rat_dealloc((void *)bal->chunks);
	rat_dealloc((void *)bal->recycling_lists);
	rat_dealloc((void *)bal);
}

void rat_block_allocator_clear(rat_block_allocator *bal)
{
	register unsigned int i;

	for (i=0; i<bal->numchunks; i++)
		rat_dealloc((void *)bal->chunks[i].blocks);
	bal->numchunks=0;

	memset(bal->chunks,0,sizeof(rat_chunk)*bal->chunk_space);
	memset(bal->recycling_lists,0,sizeof(rat_block *)*rat_block_sizes);
}

void *rat_block_alloc(rat_block_allocator *bal,size_t size)
{
	register unsigned int i,idx;

	if (size==0) return NULL;
	if (size>rat_max_block_size)
	{
		fprintf(stderr,"this block is too fucking big! you can't allocate more than %u bytes!\n",rat_max_block_size);
		fflush(stderr);
		return NULL;
	}

	idx=block_size_lookup[size];
	if (idx<0||rat_block_sizes<idx)
	{
		fprintf(stderr,"block size lookup failed!\n");
		fflush(stderr);
		return NULL;
	}

	if (bal->recycling_lists[idx]) // this is our fast memory recycler!!! look at that ultra speed!!!
	{
		rat_block *block=bal->recycling_lists[idx];
		bal->recycling_lists[idx]=block->next;

		return block;
	}
	else
	{
		register unsigned int block_size,numblocks;
		rat_chunk *chunk;
		rat_block *last;

		if (bal->numchunks==bal->chunk_space)
		{
			rat_chunk *old_chunks=bal->chunks;

			bal->chunk_space+=rat_chunk_arr_inc;
			bal->chunks=(rat_chunk *)rat_alloc(sizeof(rat_chunk)*bal->chunk_space);

			memcpy(bal->chunks,old_chunks,sizeof(rat_chunk)*bal->numchunks);
			memset(bal->chunks+bal->numchunks,0,sizeof(rat_chunk)*rat_chunk_arr_inc);
			rat_dealloc((void *)old_chunks);
		}

		chunk=bal->chunks+bal->numchunks;
		chunk->blocks=(rat_block *)rat_alloc(rat_chunk_size);

		block_size=block_sizes[idx];
		chunk->block_size=block_size;
		numblocks=rat_chunk_size/block_size;

		if (numblocks*block_size>rat_chunk_size)
		{
			fprintf(stderr,"internal memory block listing error!\n");
			fflush(stderr);
			return NULL;
		}

		for (i=0; i<numblocks-1; i++)
		{
			rat_block *thisblock=(rat_block *)((char *)chunk->blocks+block_size*i);
			rat_block *nextblock=(rat_block *)((char *)chunk->blocks+block_size*(i+1));
			thisblock->next=nextblock;
		}
		last=(rat_block *)((char *)chunk->blocks+block_size*(numblocks-1));
		last->next=NULL;

		bal->recycling_lists[idx]=chunk->blocks->next;
		bal->numchunks++;

		return chunk->blocks;
	}
}

void *rat_block_realloc(rat_block_allocator *bal,void *mem,size_t oldsize,size_t size)
{
	void *newmem;
	size_t roldsize=block_sizes[block_size_lookup[oldsize]];
	size_t rsize=block_sizes[block_size_lookup[size]];

	if (rsize==roldsize)
		return mem;
	else if (rsize>roldsize)
	{
		newmem=rat_block_alloc(bal,size);
		memcpy(newmem,mem,roldsize);
		rat_block_dealloc(bal,mem,oldsize);
	}
	else
	{
		newmem=rat_block_alloc(bal,size);
		memcpy(newmem,mem,rsize);
		rat_block_dealloc(bal,mem,oldsize);
	}

	return newmem;
}

void rat_block_dealloc(rat_block_allocator *bal,void *mem,size_t size)
{
	register unsigned int idx;
	rat_block *block;

	if (size==0||mem==NULL) return;
	if (size<0||size>rat_max_block_size)
	{
		fprintf(stderr,"block size out of range; impossible dealloc size!\n");
		fflush(stderr);
		return;
	}

	idx=block_size_lookup[size];
	if (idx<0||rat_block_sizes<idx)
	{
		fprintf(stderr,"block size lookup failed!\n");
		fflush(stderr);
		return;
	}

	block=(rat_block *)mem;
	block->next=bal->recycling_lists[idx];
	bal->recycling_lists[idx]=block;
}
Advertisements

5 comments

  1. WOW that’s sweet. Damn.

    I’ve never done a custom memory allocator. I don’t even know how that works…

    Could you briefly explain the whys?


  2. I think I need to really read exceptional C++ and more exceptional C++. It’s beennnnnnn soooooooo long since I last read a c++ book. Mind you I never formally took a course in c++ while in college.


  3. Well, I do know memory allocation algorithms. I need to research this some more.


  4. Anyways, I think I know the whys. To reduce fragmentation is one of the reason.

    Actually, I’ve implemented a “free block list” thingy during one of my game dev. interviews awhile back.


  5. I’ve never taken a formal course in anything technical (yet) and haven’t paid attention in any courses I’ve been forced through. I have no formal education in any of this either. The machine is not for me. I do want to go to college, though, and it’s my current goal. It will fill the gaps in my knowledge, I hope.

    Anyway, heap allocation is a generic method for allocation, but bogs down under extreme alloc/dealloc circumstances. For example, a thousand alloc/dealloc pairs in each iteration. You don’t really need to manipulate or even look at the heap every time. In fact, if it’s just a few blocks of the same approximate small size (like contact arrays in my case) being allocated and deallocated a shit load of times, we can really just reuse the memory blocks. Which is why the aggressive recycler model works well for this. It grows the block list whenever necessary, in those cases giving out the new blocks. When you deallocate, it really just pushes that block’s address on the recycle stack, and pops it right back off next allocation, so you’ve wasted no time with real deallocation and reallocation. Then, every iteration it is cleared, (empty the recycle bin hurr hurr,) preventing a memory hoard to build up.

    The stack allocator basically assumes all alloc/dealloc pairs are nested, so it can store all information on a simple stack instead of having to create any kind of map. This also has obvious speed advantages.



Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: