
Memory Manager (16): Kernel Memory Management - Slab Design and Implementation, Part 2 (Initialization and Creation)

Preface

The previous post covered the slab allocator's design and its main data structures, but it left many questions unanswered.

 

Slab implementation

First, a usage example

I originally wanted to start with initialization, but diving straight into the implementation is not very intuitive. It is better to first learn how to use the interface and then come back to see how it is implemented; that is a natural and sound way to learn.

As usual, we will use our favorite tool: a kernel module.

First, we define an example structure of our own in our kernel development tree:


#ifndef _TEST_H
#define _TEST_H

typedef struct TEST {
        int num;
        char *name;
} TEST;

#endif


Now let's look at the kernel module itself:

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/mm_types.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/test.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("zmrlinux");
MODULE_DESCRIPTION("something about slab_create");
static struct kmem_cache *test;

static int __init slab_create(void)
{
        struct TEST *pos;       /* three example objects */
        struct TEST *temp;
        struct TEST *p;

        printk("\n\n");
        /* kmem_cache_create() creates the cache for our structure. */
        test = kmem_cache_create("TEST", sizeof(struct TEST), 8,
                                 SLAB_RED_ZONE, NULL);
        if (!test) {
                printk("create failed\n");
                return -1;
        }
        printk("I created a kmem_cache and it is ok, I feel good\n");

        printk("create a test slab of TEST\n");
        /* Allocate an object from our cache. */
        pos = kmem_cache_alloc(test, GFP_KERNEL);
        if (NULL == pos) {
                printk("alloc of new object failed\n");
                return -1;
        }
        printk("create pos is ok\n");
        printk("pos :%p\n", pos);

        temp = kmem_cache_alloc(test, GFP_KERNEL);
        if (NULL == temp) {
                printk("alloc of new object failed\n");
                return -1;
        }
        printk("create temp is ok\n");
        printk("temp :%p\n", temp);

        p = kmem_cache_alloc(test, GFP_KERNEL);
        if (NULL == p) {
                printk("alloc of new object failed\n");
                return -1;
        }
        printk("create p is ok\n");
        printk("p :%p\n", p);

        /* Print the distance between the two objects, in bytes. */
        printk("between size is %ld\n",
               (long)((char *)temp - (char *)pos));

        kmem_cache_free(test, pos);
        kmem_cache_free(test, temp);
        kmem_cache_free(test, p);

        return 0;
}

static void __exit slab_out(void)
{
        /* All objects are free, so the cache itself can go away too. */
        kmem_cache_destroy(test);
        printk("BYE BYE :)\n");
}

module_init(slab_create);
module_exit(slab_out);

The output looks like this:

[Screenshot of the dmesg output, taken 2015-11-09 21:45:59]

As you can see, we successfully created a cache and obtained object instances from it.

Next, let's look at the implementation.

Initialization: kmem_cache_init()

Let me point out a problem first: while initializing the slab system itself, the kernel needs a number of memory blocks smaller than a full page. These are exactly what kmalloc is best at, but kmalloc only works once the slab system is up. So some bootstrapping tricks are needed, as sketched below before we read the real code.
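Here is a minimal sketch of that trick, with made-up names (my_cache, my_cache_boot); the kernel's real counterparts are kmem_cache and kmem_cache_boot in mm/slab.c. The very first cache descriptor cannot itself come from a slab, so it lives in static storage and bootstraps everything else:

#include <stddef.h>

/* Hypothetical, simplified stand-in for struct kmem_cache. */
struct my_cache {
        const char *name;
        size_t object_size;
        /* ... management data ... */
};

/* Statically allocated, hence usable before any allocator exists. */
static struct my_cache my_cache_boot = {
        .name        = "my_cache",
        .object_size = sizeof(struct my_cache),
};

static struct my_cache *my_cache;       /* the "cache of caches" */

void my_cache_init(void)
{
        /* Step 1: point at the static descriptor so allocation can work at all. */
        my_cache = &my_cache_boot;

        /*
         * Steps 2..n: once the allocator is minimally alive, every later
         * descriptor (and replacements for this bootstrap data) can be
         * carved out of slabs that my_cache itself manages.
         */
}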


/*
* Initialisation. Called after the page allocator have been initialised and
* before smp_init().
*/

/* Called after the buddy (page) allocator is up and before the secondary CPUs are started, so only the boot CPU is running at this point. With the timing clear, we can take the next step. */
void __init kmem_cache_init(void)
{
        int i;

        BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
                        sizeof(struct rcu_head));
        kmem_cache = &kmem_cache_boot;  /* the first cache: the statically allocated bootstrap one */

        if (num_possible_nodes() == 1)  /* only one possible node (checked via a bitmap), */
                use_alien_caches = 0;   /* so alien caches are not needed */

        for (i = 0; i < NUM_INIT_LISTS; i++)
                kmem_cache_node_init(&init_kmem_cache_node[i]);
        /*
         * This loop initializes the statically allocated bootstrap
         * kmem_cache_node structures, one set per node. It mainly matters
         * for NUMA; an SMP system has only a single node to cover.
         */
        /*
         * Fragmentation resistance on low memory - only use bigger
         * page orders on machines with more than 32MB of memory if
         * not overridden on the command line.
         */
        /* determine the maximum slab order */
        if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
                slab_max_order = SLAB_MAX_ORDER_HI;

        /*
         * Bootstrap is tricky, because several objects are allocated
         * from caches that do not exist yet:
         * 1) initialize the kmem_cache cache: it contains the struct
         *    kmem_cache structures of all caches, except kmem_cache itself:
         *    kmem_cache is statically allocated.
         *    Initially an __init data area is used for the head array and the
         *    kmem_cache_node structures, it's replaced with a kmalloc allocated
         *    array at the end of the bootstrap.
         * 2) Create the first kmalloc cache. kmalloc itself cannot be used
         *    yet at this point; the goal is to make it usable.
         *    The struct kmem_cache for the new cache is allocated normally.
         *    An __init data area is used for the head array.
         * 3) Create the remaining kmalloc caches, with minimally sized
         *    head arrays.
         * 4) Replace the __init data head arrays for kmem_cache and the first
         *    kmalloc cache with kmalloc allocated arrays.
         * 5) Replace the __init data for kmem_cache_node for kmem_cache and
         *    the other cache's with kmalloc allocated memory.
         * 6) Resize the head arrays of the kmalloc caches to their final sizes.
         */
        /* in short: replace the static bootstrap data step by step */

        /* 1) create the kmem_cache */

        /*
         * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
         */

        /*
         * Create the very first cache instance. Much of it is set up
         * statically; its size depends on the number of nodes and CPUs.
         */
        create_boot_cache(kmem_cache, "kmem_cache",
                          offsetof(struct kmem_cache, node) +
                                nr_node_ids * sizeof(struct kmem_cache_node *),
                          SLAB_HWCACHE_ALIGN);
        /* create_boot_cache() ends up calling __kmem_cache_create(), described below */
        list_add(&kmem_cache->list, &slab_caches);      /* put kmem_cache on the slab_caches list */
        slab_state = PARTIAL;   /* the first cache exists: advance the bootstrap state */

        /*
         * Initialize the caches that provide memory for the kmem_cache_node
         * structures first. Without this, further allocations will bug.
         */
        /* i.e. a kmalloc cache sized for kmem_cache_node structures */
        kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
                                        kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
        slab_state = PARTIAL_NODE;      /* advance the state marker again */

        slab_early_init = 0;

        /* 5) Replace the bootstrap kmem_cache_node */
        {       /* swap the boot-time kmem_cache_node structures for real ones */
                int nid;

                for_each_online_node(nid) {
                        init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);

                        init_list(kmalloc_caches[INDEX_NODE],
                                  &init_kmem_cache_node[SIZE_NODE + nid], nid);
                }
        }

        create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
}
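The slab_state assignments above (and the UP/FULL ones in kmem_cache_init_late() below) walk a small bootstrap state machine shared by the slab allocators. On kernels of this era it is defined in mm/slab.h roughly as:

enum slab_state {
        DOWN,                   /* No slab functionality yet */
        PARTIAL,                /* SLUB: kmem_cache_node available */
        PARTIAL_NODE,           /* SLAB: kmalloc size for node struct available */
        UP,                     /* Slab caches usable but not all extras yet */
        FULL                    /* Everything is working */
};

kmem_cache_init() only climbs as far as PARTIAL_NODE; kmem_cache_init_late() finishes the climb to FULL.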

 


void __init kmem_cache_init_late(void)
{
        struct kmem_cache *cachep;      /* iterator over every cache */

        slab_state = UP;

        /* 6) resize the head arrays to their final sizes */
        mutex_lock(&slab_mutex);        /* take the lock, then walk every cache and resize */
        list_for_each_entry(cachep, &slab_caches, list) /* its per-CPU arrays; failure here is fatal */
                if (enable_cpucache(cachep, GFP_NOWAIT))
                        BUG();
        mutex_unlock(&slab_mutex);      /* release the lock */

        /* Done! */
        slab_state = FULL;      /* initialization complete */

        /*
         * Register a cpu startup notifier callback that initializes
         * cpu_cache_get for all new cpus
         */
        register_cpu_notifier(&cpucache_notifier);

#ifdef CONFIG_NUMA
        /*
         * Register a memory hotplug callback that initializes and frees
         * node.
         */
        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif

        /*
         * The reap timers are started later, with a module init call: That part
         * of the kernel is not yet operational.
         */
}

static int __init cpucache_init(void)   /* start the per-CPU reapers that return unneeded pages */
{
        int cpu;

        /*
         * Register the timers that return unneeded pages to the page allocator
         */
        for_each_online_cpu(cpu)
                start_cpu_timer(cpu);

        /* Done! */
        slab_state = FULL;
        return 0;
}

Now for creating a cache, i.e. the kmem_cache_create() we used above:


/*
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 * (The parameters are: name, object size, alignment, SLAB flags, and a
 * constructor, which is rarely used these days.)
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns; out-of-bounds accesses become detectable.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline. This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
                  unsigned long flags, void (*ctor)(void *))
{
        struct kmem_cache *s;           /* the cache we will return */
        const char *cache_name;         /* our copy of the cache's name */
        int err;

        get_online_cpus();      /* pin CPU hotplug (takes a reference) so the online CPU set cannot change under us */
        get_online_mems();      /* same for memory hotplug */
        memcg_get_cache_ids();  /* stabilize the memcg cache-id array */

        mutex_lock(&slab_mutex);        /* take the global slab lock */

        err = kmem_cache_sanity_check(name, size);      /* check for name conflicts */
        if (err) {
                s = NULL; /* suppress uninit var warning */
                goto out_unlock;
        }

        /*
         * Some allocators will constraint the set of valid flags to a subset
         * of all flags. We expect them to define CACHE_CREATE_MASK in this
         * case, and we'll just provide them with a sanitized version of the
         * passed flags.
         */
        flags &= CACHE_CREATE_MASK;     /* with the flags sanitized, the allocation work can begin */

        s = __kmem_cache_alias(name, size, align, flags, ctor);  /* first, try to reuse an existing cache; detailed below */
        if (s)  /* found one: return it after the cleanup work below */
                goto out_unlock;

        cache_name = kstrdup_const(name, GFP_KERNEL);   /* make our own copy of the name string */
        if (!cache_name) {
                /*
                 * If copying the name fails, bail out. Note that kstrdup_const()
                 * goes through __do_kmalloc(), which shows that kmalloc is
                 * already usable at this point.
                 */
                err = -ENOMEM;
                goto out_unlock;
        }

        s = do_kmem_cache_create(cache_name, size, size,
                                 calculate_alignment(flags, align, size),
                                 flags, ctor, NULL, NULL);      /* create a new cache and initialize it */
        if (IS_ERR(s)) {
                err = PTR_ERR(s);
                kfree_const(cache_name);
        }

out_unlock:
        mutex_unlock(&slab_mutex);      /* release the cache-list lock */
        memcg_put_cache_ids();          /* drop the references taken above */
        put_online_mems();
        put_online_cpus();

        if (err) {
                if (flags & SLAB_PANIC)
                        panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
                              name, err);
                else {
                        printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
                               name, err);
                        dump_stack();
                }
                return NULL;
        }
        return s;
}
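One remark on the @ctor parameter from the comment above: it runs when the cache grows by a new slab page, not on every kmem_cache_alloc(), so objects come out of the allocator pre-initialized. A minimal sketch of a module using one (my_obj, my_obj_ctor and my_obj_cache_init are names made up here):

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_obj {
        spinlock_t lock;
        int refcnt;
};

/* Runs once per object whenever a new slab page is populated. */
static void my_obj_ctor(void *addr)
{
        struct my_obj *obj = addr;

        spin_lock_init(&obj->lock);
        obj->refcnt = 0;
}

static struct kmem_cache *my_obj_cache;

static int my_obj_cache_init(void)
{
        my_obj_cache = kmem_cache_create("my_obj", sizeof(struct my_obj),
                                         0, SLAB_HWCACHE_ALIGN, my_obj_ctor);
        return my_obj_cache ? 0 : -ENOMEM;
}

Note that, as find_mergeable() below shows, passing a ctor makes a cache unmergeable: slabs cannot be shared between caches whose objects need different construction.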

__kmem_cache_alias: the first allocation attempt in detail


struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
                   unsigned long flags, void (*ctor)(void *))
{
        struct kmem_cache *cachep;

        cachep = find_mergeable(size, align, flags, name, ctor);  /* try to find an existing, compatible cache on the list; detailed below */
        if (cachep) {   /* found one: bump its refcount and adjust its final size */
                cachep->refcount++;

                /*
                 * Adjust the object sizes so that we clear
                 * the complete object on kzalloc.
                 */
                cachep->object_size = max_t(int, cachep->object_size, size);
        }
        return cachep;
}


find_mergeable() normalizes the parameters and then tries to pick something usable out of the caches that already exist:

struct kmem_cache *find_mergeable(size_t size, size_t align,
                unsigned long flags, const char *name, void (*ctor)(void *))
{
        struct kmem_cache *s;

        if (slab_nomerge || (flags & SLAB_NEVER_MERGE))
                return NULL;

        if (ctor)
                return NULL;

        size = ALIGN(size, sizeof(void *));     /* round the size up to pointer alignment */
        /*
         * Compute the effective alignment. Even with a default value, the
         * kernel keeps dividing the alignment by 2 so as to pack as many
         * objects into each cache line as possible.
         */
        align = calculate_alignment(flags, align, size);
        /*
         * This macro is wicked: ALIGN(x, a) uses mask = (typeof(x))(a) - 1
         * and expands to (((x) + (mask)) & ~(mask)). Its usage stunned me;
         * see the worked example after this function.
         */
        size = ALIGN(size, align);

        flags = kmem_cache_flags(size, flags, name, NULL);
        /* try to pick a usable kmem_cache off the slab_caches list */
        list_for_each_entry_reverse(s, &slab_caches, list) {
                if (slab_unmergeable(s))        /* skip caches that refuse merging */
                        continue;

                if (size > s->size)     /* its size must be at least what we need */
                        continue;

                if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
                        continue;       /* the relevant flags must match */
                /*
                 * Check if alignment is compatible.
                 * Courtesy of Adrian Drzewiecki
                 */
                if ((s->size & ~(align - 1)) != s->size)
                        continue;       /* its size must be a multiple of our alignment */

                if (s->size - size >= sizeof(void *))
                        continue;       /* reject if it wastes a pointer's worth or more per object */

                if (IS_ENABLED(CONFIG_SLAB) && align &&
                    (align > s->align || s->align % align))
                        continue;

                return s;
        }
        return NULL;
}
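Since the comments above single out the ALIGN macro, here is the same arithmetic worked through in a tiny userspace program (ALIGN_UP is my own name, to avoid clashing with the kernel macro):

#include <stdio.h>

/* Same arithmetic as the kernel's ALIGN(): round x up to the next
 * multiple of a, where a must be a power of two. */
#define ALIGN_UP(x, a) (((x) + ((a) - 1)) & ~((a) - 1))

int main(void)
{
        /* ALIGN_UP(13, 8): (13 + 7) & ~7 -> 20 & ~7 -> 16 */
        printf("%lu\n", ALIGN_UP(13UL, 8UL));   /* prints 16 */
        printf("%lu\n", ALIGN_UP(16UL, 8UL));   /* already aligned: prints 16 */
        return 0;
}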

__kmem_cache_create(s, flags)

Now let's look at this function, which is what the initialization path ends up calling; before reaching it, create_boot_cache() is actually called first. Below I removed some of the DEBUG code to make the structure easier to see.

int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{       /* first, align to multiples of the processor word size */
        size_t left_over, freelist_size;
        size_t ralign = BYTES_PER_WORD;
        gfp_t gfp;
        int err;
        size_t size = cachep->size;

        /*
         * Check that size is in terms of words. This is needed to avoid
         * unaligned accesses for some archs when redzoning is used, and makes
         * sure any on-slab bufctl's are also correctly aligned.
         */
        if (size & (BYTES_PER_WORD - 1)) {
                size += (BYTES_PER_WORD - 1);
                size &= ~(BYTES_PER_WORD - 1);
        }

        if (flags & SLAB_RED_ZONE) {
                ralign = REDZONE_ALIGN;
                /* If redzoning, ensure that the second redzone is suitably
                 * aligned, by adjusting the object size accordingly. */
                size += REDZONE_ALIGN - 1;
                size &= ~(REDZONE_ALIGN - 1);
        }

        /* 3) caller mandated alignment */
        if (ralign < cachep->align) {
                ralign = cachep->align;
        }
        /* disable debug if necessary */
        if (ralign > __alignof__(unsigned long long))
                flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
        /*
         * 4) Store it. The computed alignment value is saved here.
         */
        cachep->align = ralign;

        if (slab_is_available())
                gfp = GFP_KERNEL;
        else
                gfp = GFP_NOWAIT;

        /*
         * Determine if the slab management is 'on' or 'off' slab.
         * (bootstrapping cannot cope with offslab caches so don't do
         * it too early on. Always use on-slab management when
         * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
         */
        if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
            !(flags & SLAB_NOLEAKTRACE))
                /*
                 * Size is large, assume best to place the slab management obj
                 * off-slab (should allow better packing of objs).
                 */
                flags |= CFLGS_OFF_SLAB;

        size = ALIGN(size, cachep->align);
        /*
         * We should restrict the number of objects in a slab to implement
         * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
         */
        if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
                size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);

        left_over = calculate_slab_order(cachep, size, cachep->align, flags);
        /*
         * Work out the ideal slab length: too small and the management
         * overhead grows; too large and the buddy allocator's effectiveness
         * is comparatively reduced. This is an iterative search.
         */
        if (!cachep->num)
                return -E2BIG;

        freelist_size = calculate_freelist_size(cachep->num, cachep->align);

        /*
         * If the slab has been placed off-slab, and we have enough space then
         * move it on-slab. This is at the expense of any extra colouring.
         */
        /* decide whether the slab management head lives outside or inside the slab */
        if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
                flags &= ~CFLGS_OFF_SLAB;
                left_over -= freelist_size;
        }

        if (flags & CFLGS_OFF_SLAB) {
                /* really off slab. No need for manual alignment */
                freelist_size = calculate_freelist_size(cachep->num, 0);

#ifdef CONFIG_PAGE_POISONING
                /* If we're going to use the generic kernel_map_pages()
                 * poisoning, then it's going to smash the contents of
                 * the redzone and userword anyhow, so switch them off.
                 */
                if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
                        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
        }

        cachep->colour_off = cache_line_size();
        /* Offset must be a multiple of the alignment. */
        if (cachep->colour_off < cachep->align)
                cachep->colour_off = cachep->align;
        cachep->colour = left_over / cachep->colour_off;
        cachep->freelist_size = freelist_size;
        cachep->flags = flags;
        cachep->allocflags = __GFP_COMP;
        if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
                cachep->allocflags |= GFP_DMA;
        cachep->size = size;
        cachep->reciprocal_buffer_size = reciprocal_value(size);

        if (flags & CFLGS_OFF_SLAB) {
                cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
                /*
                 * This is a possibility for one of the kmalloc_{dma,}_caches.
                 * But since we go off slab only for object size greater than
                 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
                 * in ascending order, this should not happen at all.
                 * But leave a BUG_ON for some lucky dude.
                 */
                BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
        }

        err = setup_cpu_cache(cachep, gfp);
        if (err) {
                __kmem_cache_shutdown(cachep);
                return err;
        }
        /* the cache is now fully created */
        return 0;
}
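To make the calculate_slab_order() trade-off concrete, here is a simplified userspace sketch of the iteration it performs (my own illustration; the real function also accounts for the freelist, alignment, and the slab_max_order cap):

#include <stdio.h>

#define MY_PAGE_SIZE 4096UL

int main(void)
{
        unsigned long size = 520;       /* a deliberately awkward object size */
        unsigned int order;

        for (order = 0; order <= 3; order++) {
                unsigned long slab_bytes = MY_PAGE_SIZE << order;
                unsigned long num = slab_bytes / size;          /* objects per slab */
                unsigned long waste = slab_bytes - num * size;  /* leftover bytes */

                printf("order %u: %lu objects, %lu bytes wasted\n",
                       order, num, waste);
        }
        return 0;
}

For this size, order 0 wastes 456 bytes per slab while order 3 wastes only 8; but high-order pages are exactly what fragments the buddy allocator, so the search settles for an acceptable waste instead of chasing the minimum.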

A note on likely and unlikely:
In the Linux kernel we often come across likely() and unlikely(); they are in fact two macros.

#define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
#define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))

They are optimization hints for the compiler: they state which outcome of a condition is more probable, so the compiler can lay out the most likely path immediately after the branch. The goal is that the common case runs fastest, with no taken jump, as the example below shows.
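When the ftrace branch profiler is disabled, __branch_check__ reduces to GCC's __builtin_expect. A typical use looks like this (process() is a made-up function for illustration):

/* Without the branch profiler, the macros boil down to: */
#define likely(x)       __builtin_expect(!!(x), 1)
#define unlikely(x)     __builtin_expect(!!(x), 0)

int process(const void *p)
{
        if (unlikely(!p))       /* rare error path, kept off the hot straight-line code */
                return -1;

        /* the common case falls straight through with no taken branch */
        return 0;
}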
