
Linux Memory Management: The Slab Mechanism (Creating a Cache)

Published 2014-11-30 10:24:39 | Source: linux website | Author: bullbat

In the Linux kernel, creating a cache node is implemented by kmem_cache_create().

The function's execution flow:

1. Obtain a cache descriptor from the global cache_cache by calling kmem_cache_zalloc(&cache_cache, gfp). Because cache_cache is initialized with an object size of sizeof(struct kmem_cache), the returned pointer is exactly a cache descriptor.

2. Determine the leftover (fragment) space of a slab; this is done by calculate_slab_order().

3. Compute and initialize the cache's attributes. For an off-slab cache, kmem_find_general_cachep(slab_size, 0u) selects cachep->slabp_cache, the cache used to hold the struct slab object and the kmem_bufctl_t[] array.

4. Set up the local cache on each CPU via setup_cpu_cache().

5. With the cache fully created, add it to the global list of slab caches.
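
Before stepping into the kernel code, here is a minimal usage sketch from a caller's point of view. The struct my_record, the cache name and the module functions are invented for illustration; only the kmem_cache_create()/kmem_cache_alloc()/kmem_cache_free()/kmem_cache_destroy() calls are the real API, with the signature used by the kernel version quoted below.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

struct my_record {                  /* hypothetical object type */
	int id;
	char payload[100];
};

static struct kmem_cache *my_cache;

static int __init my_record_init(void)
{
	struct my_record *r;

	/* One cache per object type; SLAB_HWCACHE_ALIGN asks for objects to
	 * start on hardware cache-line boundaries. */
	my_cache = kmem_cache_create("my_record", sizeof(struct my_record),
				     0, SLAB_HWCACHE_ALIGN, NULL);
	if (!my_cache)
		return -ENOMEM;

	/* No slab exists yet: the first allocation triggers cache_grow(),
	 * which takes pages from the buddy allocator and builds a slab. */
	r = kmem_cache_alloc(my_cache, GFP_KERNEL);
	if (!r) {
		kmem_cache_destroy(my_cache);
		return -ENOMEM;
	}
	kmem_cache_free(my_cache, r);
	return 0;
}

static void __exit my_record_exit(void)
{
	/* The cache must be destroyed before the module is unloaded,
	 * since the name string belongs to the module. */
	kmem_cache_destroy(my_cache);
}

module_init(my_record_init);
module_exit(my_record_exit);
MODULE_LICENSE("GPL");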


I. Main implementation

/**
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
* @size: The size of objects to be created in this cache.
* @align: The required alignment for the objects.
* @flags: SLAB flags
* @ctor: A constructor for the objects.
*
* Returns a ptr to the cache on success, NULL on failure.
* Cannot be called within a int, but can be interrupted.
* The @ctor is run when new pages are allocated by the cache.
*
* @name must be valid until the cache is destroyed. This implies that
* the module calling this has to destroy the cache before getting unloaded.
* Note that kmem_cache_name() is not guaranteed to return the same pointer,
* therefore applications must manage it themselves.
*
* The flags are
*
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
* to catch references to uninitialised memory.
*
* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
* for buffer overruns.
*
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline.  This can be beneficial if you're counting cycles as closely
* as davem.
*/ 
/* Create a top-level cache node in the slab system. After creation the cache
   contains no slabs or objects; a new slab is built only when an object is
   requested and the cache has no free objects left. */
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
size_t left_over, slab_size, ralign;
struct kmem_cache *cachep = NULL, *pc;
gfp_t gfp;
 
/*
* Sanity checks... these are all serious usage bugs.
*/
/* sanity checks */
if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
size > KMALLOC_MAX_SIZE) {
printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
name);
BUG();
}
 
/*
* We use cache_chain_mutex to ensure a consistent view of
* cpu_online_mask as well.  Please see cpuup_callback
*/
/* Is the slab allocator up yet? During kernel boot only one CPU runs the
   slab initialization, so no locking is needed; otherwise take the lock. */
if (slab_is_available()) {
get_online_cpus();
mutex_lock(&cache_chain_mutex);
}

/* walk the cache chain and do some validation */
list_for_each_entry(pc, &cache_chain, next) {
char tmp; 
int res; 
 
/*
* This happens when the module gets unloaded and doesn't
* destroy its slab cache and no-one else reuses the vmalloc
* area of the module.  Print a warning.
*/ 
/* check that every cache on the chain still has a readable name */
res = probe_kernel_address(pc->name, tmp);
if (res) { /* the name is gone, warn about it */
printk(KERN_ERR
   "SLAB: cache with size %d has lost its name\n",
   pc->buffer_size);
continue;
}

/* check whether a cache with the same name already exists */
if (!strcmp(pc->name, name)) {
printk(KERN_ERR
   "kmem_cache_create: duplicate cache %s\n", name);
dump_stack();
goto oops;
}
}
 
#if DEBUG  
WARN_ON(strchr(name, ' ')); /* It confuses parsers*/ 
#if FORCED_DEBUG  
/*
* Enable redzoning and last user accounting, except for caches with
* large objects, if the increased size would increase the object size
* above the next power of two: caches with object sizes just above a
* power of two have a significant amount of internal fragmentation.
*/ 
if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 
2* sizeof(unsigned long long))) 
flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 
if (!(flags & SLAB_DESTROY_BY_RCU)) 
flags |= SLAB_POISON; 
#endif  
if (flags & SLAB_DESTROY_BY_RCU) 
BUG_ON(flags & SLAB_POISON); 
#endif  
/*
* Always checks flags, a caller might be expecting debug support which
* isn't available.
*/ 
BUG_ON(flags & ~CREATE_MASK); 
 
/*
* Check that size is in terms of words.  This is needed to avoid
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*/ 
if (size & (BYTES_PER_WORD - 1)) {
size += (BYTES_PER_WORD - 1);
size &= ~(BYTES_PER_WORD - 1);
}
 
/* calculate the final buffer alignment:*/ 
 
/* 1) arch recommendation: can be overridden for debug*/ 
if (flags & SLAB_HWCACHE_ALIGN) { 
/*
* Default alignment: as specified by the arch code.  Except if
* an object is really small, then squeeze multiple objects into
* one cacheline.
*/ 
ralign = cache_line_size(); 
while (size <= ralign / 2) 
ralign /= 2; 
} else {
ralign = BYTES_PER_WORD;
}
 
/*
* Redzoning and user store require word alignment or possibly larger.
* Note this will be overridden by architecture or caller mandated
* alignment if either is greater than BYTES_PER_WORD.
*/ 
if (flags & SLAB_STORE_USER) 
ralign = BYTES_PER_WORD; 
 
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
* aligned, by adjusting the object size accordingly.*/
size += REDZONE_ALIGN - 1;
size &= ~(REDZONE_ALIGN - 1);
}
 
/* 2) arch mandated alignment*/
if (ralign < ARCH_SLAB_MINALIGN) {
ralign = ARCH_SLAB_MINALIGN;
}
/* 3) caller mandated alignment*/
if (ralign < align) {
ralign = align;
}
/* disable debug if necessary*/ 
if (ralign > __alignof__(unsigned long long)) 
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 
/*
* 4) Store it.
*/ 
align = ralign; 
/* is the slab allocator available yet? */
if (slab_is_available())
gfp = GFP_KERNEL;
else
/* Before the slab allocator is fully initialized we must not block,
   and may only allocate from low memory. */
gfp = GFP_NOWAIT;

/* Get cache's description obj.*/
/* Obtain a struct kmem_cache object. Why does an allocation from this cache
   yield a kmem_cache structure? Because the object size of the global
   cache_cache is exactly sizeof(struct kmem_cache). */
cachep = kmem_cache_zalloc(&cache_cache, gfp); 
if (!cachep) 
goto oops; 
 
#if DEBUG  
cachep->obj_size = size; 
 
/*
* Both debugging options require word-alignment which is calculated
* into align above.
*/ 
if (flags & SLAB_RED_ZONE) {
/* add space for red zone words*/
cachep->obj_offset += sizeof(unsigned long long);
size += 2 * sizeof(unsigned long long);
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
* the real object. But if the second red zone needs to be
* aligned to 64 bits, we must allow that much space.
*/
if (flags & SLAB_RED_ZONE)
size += REDZONE_ALIGN;
else
size += BYTES_PER_WORD;
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)  
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
&& cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
cachep->obj_offset += PAGE_SIZE - size;
size = PAGE_SIZE;
}
#endif  
#endif  
 
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
* it too early on.)
*/ 
/* Decide how the slab management structure is stored: on-slab or off-slab.
   Typically objects of 512 bytes or more (PAGE_SIZE >> 3 with 4 KiB pages)
   use off-slab management. During early initialization the on-slab form is
   used; see slab_early_init in kmem_cache_init(). */
if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/ 
flags |= CFLGS_OFF_SLAB; 
 
size = ALIGN(size, align); 
/* compute the leftover (fragment) space in each slab */
left_over = calculate_slab_order(cachep, size, align, flags);
/* cachep->num is the number of objects per slab; 0 means creating the cache
   for objects of this size failed. */
if (!cachep->num) {
printk(KERN_ERR
   "kmem_cache_create: couldn't create cache %s.\n", name);
kmem_cache_free(&cache_cache, cachep);
cachep = NULL;
goto oops;
}
/* Size of the slab management structure: struct slab plus the
   kmem_bufctl_t array. */
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
  + sizeof(struct slab), align);
 
/*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*/ 
 
/* If this is an off-slab cache and the leftover space is at least as large
   as the slab management structure, move the management structure onto the
   slab, turning it into an on-slab cache. */
if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
/* clear the off-slab flag */
flags &= ~CFLGS_OFF_SLAB;
/* update the leftover space */
left_over -= slab_size;
}
 
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment*/
/* The alignment applies to the slab objects themselves. When the management
   structure is stored off-slab it does not influence where the objects are
   placed, so it needs no alignment. */
slab_size =
cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);

#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
* poisoning, then it's going to smash the contents of
* the redzone and userword anyhow, so switch them off.
*/
if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
}
/* the colouring unit of this cache */
cachep->colour_off = cache_line_size();
/* Offset must be a multiple of the alignment.*/
/* the colour offset must be a multiple of the required object alignment */
if (cachep->colour_off < align)
cachep->colour_off = align;
/* number of colour units that fit in the leftover space */
cachep->colour = left_over / cachep->colour_off;
/* size of the slab management structure */
cachep->slab_size = slab_size;
cachep->flags = flags;
cachep->gfpflags = 0;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->gfpflags |= GFP_DMA;
/* size of a slab object */
cachep->buffer_size = size;
/* used when computing an object's index within its slab; see obj_to_index() */
cachep->reciprocal_buffer_size = reciprocal_value(size);
 
if (flags & CFLGS_OFF_SLAB) {
/* Find a general cache of size slab_size to hold the slab management
   structures and remember it in slabp_cache. When a slab is created for an
   off-slab cache, the struct slab object is allocated from this cache,
   followed by the kmem_bufctl_t[] array. For on-slab caches this pointer
   stays NULL. */
cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
/*
* This is a possibility for one of the malloc_sizes caches.
* But since we go off slab only for object size greater than
* PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
* this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
cachep->ctor = ctor;
cachep->name = name;
/* set up the per-CPU local caches */
if (setup_cpu_cache(cachep, gfp)) {
__kmem_cache_destroy(cachep);
cachep = NULL;
goto oops;
}

/* cache setup completed, link it into the list*/
/* the cache is fully created; add it to the global slab cache list */
list_add(&cachep->next, &cache_chain);
oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
  name);
if (slab_is_available()) {
mutex_unlock(&cache_chain_mutex);
put_online_cpus();
}
return cachep;
}
Here cache_cache is defined as:

/* internal cache of cache description objs*/ 
static struct kmem_cache cache_cache = { 
.batchcount = 1, 
.limit = BOOT_CPUCACHE_ENTRIES, 
.shared = 1, 
.buffer_size = sizeof(struct kmem_cache), /* the object size is that of a cache descriptor, hence the name cache_cache */
.name = "kmem_cache", 
};
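
The reciprocal_buffer_size field assigned in kmem_cache_create() exists so that obj_to_index() can convert an object pointer into its index inside the slab without an integer division. A sketch of that helper, modelled on mm/slab.c of the same kernel generation (treat it as illustrative rather than authoritative):

/* Offset of the object from the start of the slab's object area (s_mem),
 * divided by buffer_size via the multiply-and-shift reciprocal precomputed
 * by reciprocal_value() (see <linux/reciprocal_div.h>). */
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
					const struct slab *slabp, void *obj)
{
	u32 offset = (obj - slabp->s_mem);
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}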


II. Computing the leftover (fragment) space of a slab

/**
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
* @size: size of objects to be created in this cache.
* @align: required alignment for the objects.
* @flags: slab allocation flags
*
* Also calculates the number of objects per slab.
*
* This could be made much more intelligent.  For now, try to avoid using
* high order pages for slabs.  When the gfp() functions are more friendly
* towards high-order requests, this should be changed.
*/ 
/* Compute how many pages make up a slab, and how many objects each slab holds. */
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
unsigned long offslab_limit; 
size_t left_over = 0; 
int gfporder; 
 
for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { 
unsigned int num; 
size_t remainder; 
/* how many objects fit in a slab of this order */
cache_estimate(gfporder, size, align, flags, &remainder, &num);
/* num == 0 means not even one object fits at this order; try the next one */
if (!num)
continue;
 
if (flags & CFLGS_OFF_SLAB) {
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
/* For an off-slab cache, the slab management structure (struct slab plus the
   kmem_bufctl_t array) is itself allocated like an ordinary object. The
   allocation path is:
   kmem_cache_alloc -> __cache_alloc -> __do_cache_alloc -> ____cache_alloc
   -> cache_alloc_refill -> cache_grow -> alloc_slabmgmt
   -> kmem_cache_alloc_node -> kmem_cache_alloc
   So a loop is possible, and the key is alloc_slabmgmt(): recursion only
   arises when the management structure is itself stored off-slab, which
   happens when the slab holds many objects and the kmem_bufctl_t array,
   and hence the whole management structure, is large. To break the loop we
   bound the number of kmem_bufctl_t entries. The bound below is rough:
   since an object of this size is stored off-slab, assume the management
   structure is also of this size and compute how many kmem_bufctl_t entries
   it can hold; an array that large would certainly force off-slab
   management. It is rough because a smaller array does not guarantee
   on-slab management, but that causes no harm: slab_break_gfp_order
   (usually 1, i.e. at most two pages per slab) also limits slab size, and
   off-slab caches hold large objects (> 512 bytes), so a slab never
   contains many of them and the kmem_bufctl_t array stays small. A rough
   check is enough. */
offslab_limit = size - sizeof(struct slab);
offslab_limit /= sizeof(kmem_bufctl_t);
/* If the object count exceeds the limit, stop here instead of trying a
   larger order, so a slab never holds too many objects. The count computed
   in this iteration is still valid, so one extra pass does no harm. */
if (num > offslab_limit)
break;
}
 
/* Found something acceptable - save it away*/
/* number of objects per slab */
cachep->num = num;
/* the slab's order, i.e. how many pages it spans */
cachep->gfporder = gfporder;
/* size of the unused (fragment) space left in the slab */
left_over = remainder;
 
/*
* A VFS-reclaimable slab tends to have most allocations
* as GFP_NOFS and we really don't want to have to be allocating
* higher-order pages when we are unable to shrink dcache.
*/ 
/* SLAB_RECLAIM_ACCOUNT marks the slab's pages as reclaimable: they are
   counted when the kernel checks whether enough pages can be freed to
   satisfy user-space demand, and kmem_freepages() can release the page
   frames given to the slab. Since the pages are reclaimable, the
   fragmentation checks below are unnecessary. */
if (flags & SLAB_RECLAIM_ACCOUNT)
break;
 
/*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/ 
/* slab_break_gfp_order caps the number of pages per slab: beyond this
   threshold, higher orders are not tried regardless of fragmentation. */
if (gfporder >= slab_break_gfp_order)
break;
 
/*
* Acceptable internal fragmentation?
*/ 
/* If the slab is at least eight times larger than its leftover space,
   page utilisation is good enough; accept this order. */
if (left_over * 8 <= (PAGE_SIZE << gfporder))
break;
}
/* return the leftover (fragment) size */
return left_over;
}
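
To make the loop above concrete, the stand-alone user-space sketch below mimics the on-slab estimate done by cache_estimate() and applies the same one-eighth fragmentation test. The page size, sizeof(struct slab) and sizeof(kmem_bufctl_t) values are assumptions picked only for illustration, and the alignment fix-up performed by the real code is left out.

#include <stdio.h>

/* Illustrative re-implementation of the on-slab estimate: the objects, their
 * kmem_bufctl_t entries and one struct slab header must fit in
 * (PAGE_SIZE << order) bytes. All constants are assumptions. */
#define DEMO_PAGE_SIZE   4096UL
#define DEMO_SLAB_HDR      32UL   /* assumed sizeof(struct slab)   */
#define DEMO_BUFCTL_SIZE    4UL   /* assumed sizeof(kmem_bufctl_t) */

int main(void)
{
	unsigned long obj_size = 200;   /* hypothetical object size */
	int order;

	for (order = 0; order <= 2; order++) {
		unsigned long slab_bytes = DEMO_PAGE_SIZE << order;
		unsigned long num = (slab_bytes - DEMO_SLAB_HDR) /
				    (obj_size + DEMO_BUFCTL_SIZE);
		unsigned long used = DEMO_SLAB_HDR +
				     num * (obj_size + DEMO_BUFCTL_SIZE);
		unsigned long left_over = slab_bytes - used;

		/* same acceptance test as the kernel loop: leftover may be
		 * at most one eighth of the slab */
		printf("order %d: %lu objs, %lu bytes left over (%s)\n",
		       order, num, left_over,
		       left_over * 8 <= slab_bytes ? "acceptable" : "too wasteful");
	}
	return 0;
}

With these numbers, order 0 already yields 19 objects and only 188 leftover bytes, so the loop would stop at a single-page slab.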
 

III. Finding a general cache of a given size

/* Look up the general (kmalloc) cache whose objects are large enough for the
   given size. Simply forwards to __find_general_cachep(). */
static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
return __find_general_cachep(size, gfpflags);
}
 

static inline struct kmem_cache *__find_general_cachep(size_t size,
gfp_t gfpflags)
{
struct cache_sizes *csizep = malloc_sizes;
 
#if DEBUG  
/* This happens if someone tries to call
* kmem_cache_create(), or __kmalloc(), before
* the generic caches are initialized.
*/ 
BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 
#endif  
if (!size) 
return ZERO_SIZE_PTR; 
/* find the smallest size class that can hold the request */
while (size > csizep->cs_size)
csizep++;
 
/*
* Really subtle: The last entry with cs->cs_size==ULONG_MAX
* has cs_{dma,}cachep==NULL. Thus no special case
* for large kmalloc calls required.
*/ 
#ifdef CONFIG_ZONE_DMA  
if (unlikely(gfpflags & GFP_DMA)) 
return csizep->cs_dmacachep; 
#endif  
/* return the cache for this size class */
return csizep->cs_cachep;
}
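
The table walked here is the array of generic size classes generated from kmalloc_sizes.h. The small user-space sketch below reproduces only the lookup logic; the size list is an assumption modelled on the usual class sizes, and the real entries hold kmem_cache pointers rather than bare sizes.

#include <stdio.h>
#include <stddef.h>

/* Assumed list of size classes, ending in a huge sentinel just like the real
 * table's ULONG_MAX entry (whose cachep pointers are NULL). */
static const size_t malloc_sizes_demo[] = {
	32, 64, 96, 128, 192, 256, 512, 1024, 2048, 4096, (size_t)-1
};

static size_t find_class(size_t size)
{
	const size_t *csizep = malloc_sizes_demo;

	/* stop at the first class large enough for the request */
	while (size > *csizep)
		csizep++;
	return *csizep;
}

int main(void)
{
	printf("kmalloc(100) would come from the %zu-byte cache\n", find_class(100));
	printf("kmalloc(600) would come from the %zu-byte cache\n", find_class(600));
	return 0;
}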


IV. Setting up the per-CPU local caches

/* Set up the local (per-CPU) caches and the three slab lists. */
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
/* the general caches are fully initialized: set up each CPU's local cache */
if (g_cpucache_up == FULL)
return enable_cpucache(cachep, gfp);
/* Otherwise we are still in system initialization. g_cpucache_up records how
   far general-cache setup has progressed: PARTIAL_AC means the cache holding
   struct array_cache has been created, PARTIAL_L3 means the cache holding
   struct kmem_list3 has been created (note the order in which the two are
   created). During initialization only the boot CPU's local cache and the
   three slab lists need to be configured. */
if (g_cpucache_up == NONE) {
/*
* Note: the first kmem_cache_create must create the cache
* that's used by kmalloc(24), otherwise the creation of
* further caches will BUG().
*/ 
/* We get here while creating the cache for struct array_cache during
   initialization: the general cache that will hold struct array_cache does
   not exist yet, so the statically allocated local cache initarray_generic
   must be used. */
cachep->array[smp_processor_id()] = &initarray_generic.cache;
 
/*
* If the cache that's used by kmalloc(sizeof(kmem_list3)) is
* the first cache, then we need to set up all its list3s,
* otherwise the creation of further caches will BUG().
*/ 
/* The cache for struct kmem_list3 is created after the one for
   struct array_cache, so it cannot exist yet either; use the static global
   lists instead. */
set_up_list3s(cachep, SIZE_AC);
/* At this point the cache holding struct array_cache is fully created. If
   struct kmem_list3 and struct array_cache live in the same general cache,
   it will not be created again, so g_cpucache_up advances one step further. */
if (INDEX_AC == INDEX_L3)
g_cpucache_up = PARTIAL_L3;
else
g_cpucache_up = PARTIAL_AC;
} else {
/* We get here once g_cpucache_up is at least PARTIAL_AC: the general cache
   holding struct array_cache exists, so kmalloc can be used. */
cachep->array[smp_processor_id()] =
kmalloc(sizeof(struct arraycache_init), gfp);

if (g_cpucache_up == PARTIAL_AC) {
/* The cache for struct kmem_list3 is not ready yet; keep using the static
   global slab lists. */
set_up_list3s(cachep, SIZE_L3);
/* This path is taken only while creating the cache that holds
   struct kmem_list3 (see kmem_cache_init(), analysed later). Once the code
   above has run, that cache is usable, so advance g_cpucache_up. */
g_cpucache_up = PARTIAL_L3;
} else {
int node;
for_each_online_node(node) {
/* allocate the struct kmem_list3 objects with kmalloc */
cachep->nodelists[node] =
kmalloc_node(sizeof(struct kmem_list3),
gfp, node);
BUG_ON(!cachep->nodelists[node]);
/* initialize the three slab lists */
kmem_list3_init(cachep->nodelists[node]);
}
}
}
/* set the time of the next reap */
cachep->nodelists[numa_node_id()]->next_reap = 
jiffies + REAPTIMEOUT_LIST3 + 
((unsigned long)cachep) % REAPTIMEOUT_LIST3; 
 
cpu_cache_get(cachep)->avail = 0; 
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 
cpu_cache_get(cachep)->batchcount = 1; 
cpu_cache_get(cachep)->touched = 0; 
cachep->batchcount = 1; 
cachep->limit = BOOT_CPUCACHE_ENTRIES; 
return 0; 
}
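
As a compact summary of the bootstrap staging described in the comments above, the states that g_cpucache_up steps through can be pictured as follows. This is only a descriptive sketch: the real enum lives in mm/slab.c, and some kernel versions insert extra intermediate states.

/* Descriptive sketch of the bootstrap progression handled by setup_cpu_cache(). */
enum cpucache_progress {
	NONE,        /* no general cache yet: use static initarray_generic and static list3s */
	PARTIAL_AC,  /* cache for struct array_cache exists: array_cache can be kmalloc'd   */
	PARTIAL_L3,  /* cache for struct kmem_list3 also exists: list3s can be kmalloc_node'd */
	FULL         /* bootstrap finished: enable_cpucache() configures per-CPU caches      */
};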