
Memory Manager (17): Kernel Memory Management - slab Design and Implementation (Allocating an Object)

Preface

The previous article covered slab initialization and the creation of a cache. This one analyzes how slab allocates an object.

void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) is the function we will analyze.
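Before diving into the internals, here is a minimal usage sketch of the API this article analyzes (my own example, not from the kernel tree; struct my_obj and the cache name "my_obj_cache" are made up): create a cache, allocate an object from it, free it, and destroy the cache.

/* A minimal, hypothetical usage sketch of the API analyzed in this article.
 * "my_obj" and "my_obj_cache" are made-up names, not from the kernel tree. */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct my_obj {
    int id;
    char name[32];
};

static struct kmem_cache *my_cachep;

static int __init my_slab_demo_init(void)
{
    struct my_obj *obj;

    my_cachep = kmem_cache_create("my_obj_cache", sizeof(struct my_obj),
                                  0, SLAB_HWCACHE_ALIGN, NULL);
    if (!my_cachep)
        return -ENOMEM;

    obj = kmem_cache_alloc(my_cachep, GFP_KERNEL);   /* the function analyzed below */
    if (obj) {
        obj->id = 1;
        kmem_cache_free(my_cachep, obj);             /* give the object back */
    }
    return 0;
}

static void __exit my_slab_demo_exit(void)
{
    kmem_cache_destroy(my_cachep);
}

module_init(my_slab_demo_init);
module_exit(my_slab_demo_exit);
MODULE_LICENSE("GPL");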

__start


/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().  (Not a very helpful comment, but I already went
 *         through the gfp flags in the earlier article on the data
 *         structures; go read that one if you have not.)
 *
 * Allocate an object from this cache. The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
   void *ret = slab_alloc(cachep, flags, _RET_IP_);

   trace_kmem_cache_alloc(_RET_IP_, ret,
       cachep->object_size, cachep->size, flags);
   /* tracing hook, only relevant when tracing/debugging */
   return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc);


static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
    unsigned long save_flags;
    void *objp; /* the object pointer we will return */

    flags &= gfp_allowed_mask;

    lockdep_trace_alloc(flags); /* lockdep tracing hook */

    if (slab_should_failslab(cachep, flags)) /* fault injection: should this allocation be forced to fail? */
        return NULL;

    cachep = memcg_kmem_get_cache(cachep, flags); /* see the first function analyzed below */

    cache_alloc_debugcheck_before(cachep, flags); /* two jobs: might_sleep_if() when __GFP_WAIT is set, and flag checks when DEBUG is enabled */
    local_irq_save(save_flags);              /* save the flags and disable local interrupts */
    objp = __do_cache_alloc(cachep, flags);  /* the real allocation work, analyzed below */
    local_irq_restore(save_flags);           /* restore local interrupts */
    objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); /* DEBUG */
    kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, flags);
    prefetchw(objp);

    if (likely(objp)) {
        kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
        if (unlikely(flags & __GFP_ZERO))
            memset(objp, 0, cachep->object_size);
    }

    memcg_kmem_put_cache(cachep);
    return objp;
}
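A small caller-side note on the __GFP_ZERO branch at the end of slab_alloc(): callers normally do not OR the flag in by hand; the kmem_cache_zalloc() wrapper does it for them. A tiny sketch (cachep is assumed to be a cache created elsewhere):

/* Caller-side sketch: two equivalent ways to get a zero-filled object.
 * "cachep" is assumed to have been created elsewhere. */
#include <linux/slab.h>

static void *alloc_zeroed_pair(struct kmem_cache *cachep)
{
    void *a = kmem_cache_alloc(cachep, GFP_KERNEL | __GFP_ZERO);
    void *b = kmem_cache_zalloc(cachep, GFP_KERNEL); /* same effect as above */

    kmem_cache_free(cachep, b);
    return a;
}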

Analysis of the individual functions

1. memcg_kmem_get_cache()


/**
 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
 * @cachep: the original global kmem cache (a node on the global cache list;
 *          it is a circular doubly linked list, so any node reaches any other)
 * @gfp: allocation flags.
 *
 * All memory allocated from a per-memcg cache is charged to the owner memcg.
 */
static __always_inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
    if (!memcg_kmem_enabled()) /* the four checks below all return the original cache directly */
        return cachep;
    if (gfp & __GFP_NOFAIL)
        return cachep;
    if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
        return cachep;
    if (unlikely(fatal_signal_pending(current)))
        return cachep;

    return __memcg_kmem_get_cache(cachep);
}

If none of these early-return cases applies, execution falls through to the function in the final return statement:

mm/memcontrol.c
/*
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, if we are the first user of it,
 * we either create it immediately, if possible, or create it asynchronously
 * in a workqueue.
 * In the latter case, we will let the current allocation go through with
 * the original cache.
 *
 * Can't be called in interrupt context or from kernel threads.
 * This function needs to be called with rcu_read_lock() held.
 * (RCU read locks were added in 2.6; they target workloads with many
 * readers and very few writers.)
 */
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
{
    struct mem_cgroup *memcg;   /* see the mem_cgroup data structure below */
    struct kmem_cache *memcg_cachep;
    int kmemcg_id;

    VM_BUG_ON(!is_root_cache(cachep));  /* BUG check */

    if (current->memcg_kmem_skip_account)
        return cachep;

    memcg = get_mem_cgroup_from_mm(current->mm); /* get the memcg of the current task */
    kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
    /*
     * Another interesting macro:
     *   #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
     * ACCESS_ONCE reads the variable exactly once. Because it goes through a
     * volatile-qualified pointer, every access re-reads the value from memory
     * instead of reusing a cached copy. To the CPU itself the macro changes
     * nothing; its purpose is to keep the value consistent between
     * process-level code and IRQ/interrupt code.
     */
    if (kmemcg_id < 0)
        goto out;

    memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); /* analyzed below */
    if (likely(memcg_cachep))
        return memcg_cachep;

    /*
     * If we are in a safe context (can wait, and not in interrupt
     * context), we could be predictable and return right away.
     * This would guarantee that the allocation being performed
     * already belongs in the new cache.
     *
     * However, there are some clashes that can arrive from locking.
     * For instance, because we acquire the slab_mutex while doing
     * memcg_create_kmem_cache, this means no further allocation
     * could happen with the slab_mutex held. So it's better to
     * defer everything.
     */
    memcg_schedule_kmem_cache_create(memcg, cachep); /* analyzed below */
out:
    css_put(&memcg->css);  /* drop the memcg css reference */
    return cachep;
}
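The ACCESS_ONCE() macro mentioned above is easy to play with in user space. A minimal sketch (ordinary user-space C, not kernel code) showing the idea: the volatile cast forces the compiler to reload the value on every access instead of keeping a cached copy in a register.

/* User-space sketch of the ACCESS_ONCE() idea described above:
 * casting through a volatile pointer forces the compiler to re-load
 * the value from memory instead of reusing a cached register copy. */
#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

static int shared_flag;   /* imagine another thread or an IRQ handler updates this */

static void wait_for_flag(void)
{
    /* Without ACCESS_ONCE the compiler may hoist the load out of the loop. */
    while (!ACCESS_ONCE(shared_flag))
        ;   /* spin */
}

int main(void)
{
    shared_flag = 1;   /* set up front so this demo terminates immediately */
    wait_for_flag();
    printf("flag observed: %d\n", ACCESS_ONCE(shared_flag));
    return 0;
}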

The mem_cgroup data structure

/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
*
* TODO: Add a water mark for the memory controller. Reclaim will begin when
* we hit the water mark. May be even add a low water mark, such that
* no reclaim occurs from a cgroup at it’s low water mark, this is
* a feature that will be implemented much later in the future.
*/

A paraphrase of the comment above: this is the memory controller's control structure; it controls both the page cache and the RSS of every cgroup, and the long-term goal is to provide statistics that help the administrator decide which knobs to tune.

TODO (per the comment): add a watermark to the memory controller so that reclaim starts once we hit it, and perhaps a low watermark as well, so that no reclaim happens in a cgroup sitting at its low watermark; this is a feature for much later.
struct mem_cgroup

cache_from_memcg_idx(struct kmem_cache *s, int idx)


/*
* Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
* That said the caller must assure the memcg's cache won't go away by either
* taking a css reference to the owner cgroup, or holding the slab_mutex.
*/

Note: the RCU lock here only protects the memcg_caches array itself, not the per-memcg caches. That means the caller must make sure the memcg's cache does not go away, either by taking a css reference on the owning cgroup (cgroups themselves are a topic for another post) or by holding slab_mutex, the global slab lock.
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
        struct kmem_cache *cachep;
        struct memcg_cache_array *arr;

        rcu_read_lock();  /* enter the RCU read-side critical section */
        arr = rcu_dereference(s->memcg_params.memcg_caches);  /* the array of per-memcg caches */

        /*
         * Make sure we will access the up-to-date value. The code updating
         * memcg_caches issues a write barrier to match this (see
         * memcg_create_kmem_cache()).
         */
        cachep = lockless_dereference(arr->entries[idx]);
        rcu_read_unlock();

        return cachep;  /* the cache for this memcg index (may be NULL) */
}
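For reference, the read-side RCU pattern that cache_from_memcg_idx() follows looks like this in general. The sketch below is schematic kernel-style code with made-up names (struct cfg, global_cfg); the only point is the rcu_read_lock()/rcu_dereference()/rcu_read_unlock() shape.

/* Schematic RCU read-side pattern, as used by cache_from_memcg_idx().
 * "struct cfg" and "global_cfg" are made-up names for illustration. */
#include <linux/rcupdate.h>

struct cfg {
    int value;
};

static struct cfg __rcu *global_cfg;   /* updated elsewhere with rcu_assign_pointer() */

static int read_cfg_value(void)
{
    struct cfg *c;
    int v = -1;

    rcu_read_lock();                   /* mark the read-side critical section */
    c = rcu_dereference(global_cfg);   /* safe, ordered load of the pointer */
    if (c)
        v = c->value;
    rcu_read_unlock();

    return v;
}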


memcg_schedule_kmem_cache_create()


static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
                                             struct kmem_cache *cachep)
{
/*
* We need to stop accounting when we kmalloc, because if the
* corresponding kmalloc cache is not yet created, the first allocation
* in __memcg_schedule_kmem_cache_create will recurse.
*
* However, it is better to enclose the whole function. Depending on
* the debugging options enabled, INIT_WORK(), for instance, can
* trigger an allocation. This too, will make us recurse. Because at
* this point we can't allow ourselves back into memcg_kmem_get_cache,
* the safest choice is to do it like this, wrapping the whole function.
*/
    current->memcg_kmem_skip_account = 1;
    __memcg_schedule_kmem_cache_create(memcg, cachep);  /*看下边*/
    current->memcg_kmem_skip_account = 0;
}

/*
 * Enqueue the creation of a per-memcg kmem_cache.
 */
static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
                                               struct kmem_cache *cachep)
{
    struct memcg_kmem_cache_create_work *cw;
    /*
     * The structure being filled in here:
     *   struct memcg_kmem_cache_create_work {
     *           struct mem_cgroup *memcg;
     *           struct kmem_cache *cachep;
     *           struct work_struct work;
     *   };
     */

    cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
    if (!cw)
        return;

    css_get(&memcg->css);

    cw->memcg = memcg;
    cw->cachep = cachep;
    INIT_WORK(&cw->work, memcg_kmem_cache_create_func);

    schedule_work(&cw->work); /* queue the work item on the system workqueue */
}
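The deferral used here is the plain workqueue API. Below is a minimal sketch of the same INIT_WORK()/schedule_work() pattern with made-up names (my_create_work, my_create_func); it loosely mirrors what the memcg code does, allocating the work item with GFP_NOWAIT and letting the handler free it later in process context.

/* Minimal sketch of the INIT_WORK()/schedule_work() deferral pattern used
 * above. "my_create_work" and "my_create_func" are made-up names. */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_create_work {
    int payload;                 /* stands in for the memcg/cachep pointers */
    struct work_struct work;
};

static void my_create_func(struct work_struct *w)
{
    struct my_create_work *cw = container_of(w, struct my_create_work, work);

    /* runs later in process context, where it is safe to sleep and allocate */
    pr_info("deferred work: payload=%d\n", cw->payload);
    kfree(cw);
}

static void schedule_my_create(int payload)
{
    struct my_create_work *cw = kmalloc(sizeof(*cw), GFP_NOWAIT);

    if (!cw)
        return;                  /* best effort, just like the memcg code */
    cw->payload = payload;
    INIT_WORK(&cw->work, my_create_func);
    schedule_work(&cw->work);    /* queue on the system workqueue */
}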

static __always_inline void *__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
This is the heavyweight function that does the actual object allocation.

static __always_inline void *
__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
    void *objp;

    if (current->mempolicy || cpuset_do_slab_mem_spread()) { /* NUMA policy handling */
        objp = alternate_node_alloc(cache, flags);
        if (objp)
            goto out;
        /* this branch only matters on NUMA; on non-NUMA builds it just returns NULL */
    }
    objp = ____cache_alloc(cache, flags); /* the workhorse of the allocation; analyzed in detail below */

    /*
     * We may just have run out of memory on the local node.
     * ____cache_alloc_node() knows how to locate memory on other nodes
     */
    if (!objp)
        objp = ____cache_alloc_node(cache, flags, numa_mem_id());

out:
    return objp;
}

<mm/slab.c>  /* the workhorse function that actually hands out the object */
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)

/* Note up front: this is the non-NUMA version of the function. */
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *objp;                 /* the object pointer we will return */
    struct array_cache *ac;     /* the per-CPU array cache of this kmem_cache */
    bool force_refill = false;  /* set to true once we know the per-CPU array must be refilled; we do not know yet, so start with false */

    check_irq_off();            /* sanity check: local interrupts must be disabled here */

    ac = cpu_cache_get(cachep); /* get this CPU's array_cache; built from per-cpu macros that end up in assembly */
    if (likely(ac->avail)) {    /* avail is the number of free objects currently cached; likely() optimizes the hot path */
        ac->touched = 1;        /* mark the array as recently used whenever we take an object from it */
        objp = ac_get_obj(cachep, ac, flags, false);
        /* essentially objp = ac->entry[--ac->avail]: pop an object from the per-CPU array */

        /*
         * Allow for the possibility all avail objects are not allowed
         * by the current flags
         */
        /* ...so the pop can still come back empty because of the flags */
        if (objp) {                      /* fast-path hit: we are done */
            STATS_INC_ALLOCHIT(cachep);  /* bump the allocation-hit counter */
            goto out;
        }
        force_refill = true;    /* nothing usable in the array: it has to be refilled */
    }

    STATS_INC_ALLOCMISS(cachep);         /* bump the allocation-miss counter; now refill the per-CPU array */
    objp = cache_alloc_refill(cachep, flags, force_refill); /* analyzed below */
    /*
     * the 'ac' may be updated by cache_alloc_refill(),
     * and kmemleak_erase() requires its correct value.
     */
    ac = cpu_cache_get(cachep);

out:
    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);
    return objp;
}
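Stripped of locking, statistics and the per-CPU machinery, the fast path above is just a LIFO array of free object pointers. Here is a user-space toy model (my own simplification, not kernel code) of what the avail/entry[] bookkeeping does:

/* User-space toy model of the per-CPU array_cache fast path in
 * ____cache_alloc(): objects are popped LIFO from entry[--avail].
 * This is a simplification for illustration, not kernel code. */
#include <stdio.h>
#include <stdlib.h>

#define AC_LIMIT 8

struct toy_array_cache {
    unsigned int avail;      /* number of cached free object pointers */
    unsigned int limit;      /* capacity of entry[] */
    int touched;             /* set whenever an object is taken */
    void *entry[AC_LIMIT];   /* the cached free objects */
};

static void *toy_ac_get_obj(struct toy_array_cache *ac)
{
    if (!ac->avail)
        return NULL;                     /* would trigger cache_alloc_refill() */
    ac->touched = 1;
    return ac->entry[--ac->avail];       /* LIFO pop: hottest object first */
}

static void toy_ac_put_obj(struct toy_array_cache *ac, void *obj)
{
    if (ac->avail < ac->limit)
        ac->entry[ac->avail++] = obj;    /* LIFO push on free */
}

int main(void)
{
    struct toy_array_cache ac = { .avail = 0, .limit = AC_LIMIT };
    void *obj = malloc(64);
    void *got;

    toy_ac_put_obj(&ac, obj);            /* simulate a free into the array */
    got = toy_ac_get_obj(&ac);           /* fast-path hit */
    printf("hit:  %p (same object: %s)\n", got, got == obj ? "yes" : "no");
    printf("miss: %p (the real code would now call cache_alloc_refill())\n",
           toy_ac_get_obj(&ac));
    free(obj);
    return 0;
}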

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, bool force_refill)

/* Refill the per-CPU array cache with objects taken from the slab lists. */

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
                                bool force_refill)
{
    int batchcount;             /* how many objects to pull into the per-CPU array in one batch */
    struct kmem_cache_node *n;  /* the per-node slab lists of this cache */
    struct array_cache *ac;     /* the per-CPU array cache */
    int node;                   /* NUMA node id; ignore the NUMA details for now */

    check_irq_off();            /* local interrupts must still be disabled */
    node = numa_mem_id();       /* the node backing this CPU; on NUMA each node has its own lists */
    if (unlikely(force_refill)) /* the caller already knows the array is unusable: grow straight away */
        goto force_grow;
retry:  /* the previous refill attempt did not stick */
    ac = cpu_cache_get(cachep);      /* (re)load the per-CPU array cache */
    batchcount = ac->batchcount;     /* how many objects one refill normally transfers */
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill. Otherwise we could generate
         * refill bouncing.
         */
        /* the cache has not been used recently, so limit the refill to BATCHREFILL_LIMIT objects */
        batchcount = BATCHREFILL_LIMIT;
    }
    n = get_node(cachep, node);      /* the kmem_cache_node holding the slab lists */

    BUG_ON(ac->avail > 0 || !n);
    spin_lock(&n->list_lock);        /* protect the per-node slab lists */

    /* See if we can refill from the shared array */
    /* if the node has a shared array, try to transfer objects from it first */
    if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
        n->shared->touched = 1;
        goto alloc_done;
    }

    while (batchcount > 0) {         /* still objects left to transfer */
        struct list_head *entry;
        struct page *page;
        /* Get slab alloc is to come from. */
        /* scan the partially used slabs first, then the free slabs */
        entry = n->slabs_partial.next;
        if (entry == &n->slabs_partial) {
            n->free_touched = 1;
            entry = n->slabs_free.next;
            if (entry == &n->slabs_free)
                goto must_grow;
            /* both lists are circular, so hitting the list head again means the list is empty and the cache must grow */
        }
        page = list_entry(entry, struct page, lru);
        /* the page that backs this slab */
        check_spinlock_acquired(cachep);
        /* make sure we really hold the list lock */
        /*
         * The slab was either on partial or free list so
         * there must be at least one object available for
         * allocation.
         */
        BUG_ON(page->active >= cachep->num);

        while (page->active < cachep->num && batchcount--) {
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);
            /* update the statistics counters */
            ac_put_obj(cachep, ac, slab_get_obj(cachep, page, node));
        }

        /* move slabp to correct slabp list: */
        /* put the slab back on the list that matches its new state */
        list_del(&page->lru);
        if (page->active == cachep->num)
            list_add(&page->lru, &n->slabs_full);
        else
            list_add(&page->lru, &n->slabs_partial);
    }

must_grow:
    n->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&n->list_lock);

    if (unlikely(!ac->avail)) {
        int x;
force_grow:  /* the cache has to grow */
        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
        /* cache_grow() is analyzed in detail below */

        /* cache_grow can reenable interrupts, then ac could change. */
        ac = cpu_cache_get(cachep);   /* reload this CPU's array cache */
        node = numa_mem_id();         /* and the node id */

        /* no objects in sight? abort */
        if (!x && (ac->avail == 0 || force_refill))
            return NULL;

        if (!ac->avail)  /* objects refilled by interrupt? */
            goto retry;  /* we still have no cached object: try again */
    }
    ac->touched = 1;

    return ac_get_obj(cachep, ac, flags, force_refill);
}
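The "refill from the shared array" step near the top of cache_alloc_refill() is essentially a bounded copy of object pointers from the per-node shared array into the per-CPU one. Here is a user-space sketch of that transfer logic (my own simplification of transfer_objects(), without the list_lock or any per-CPU machinery):

/* User-space sketch of the transfer_objects() step in cache_alloc_refill():
 * move up to "batchcount" cached pointers from the per-node shared array
 * into the (empty) per-CPU array. Simplified for illustration. */
#include <stdio.h>
#include <string.h>

#define LIMIT 16

struct toy_array_cache {
    unsigned int avail;
    unsigned int limit;
    int touched;
    void *entry[LIMIT];
};

/* returns the number of objects actually moved (0 means: go scan the slab lists) */
static unsigned int toy_transfer_objects(struct toy_array_cache *to,
                                         struct toy_array_cache *from,
                                         unsigned int batchcount)
{
    unsigned int space = to->limit - to->avail;
    unsigned int nr = from->avail;

    if (nr > batchcount)
        nr = batchcount;
    if (nr > space)
        nr = space;
    if (!nr)
        return 0;

    /* take the newest "nr" pointers from the tail of the source array */
    from->avail -= nr;
    memcpy(to->entry + to->avail, from->entry + from->avail,
           nr * sizeof(void *));
    to->avail += nr;
    from->touched = 1;           /* mirrors n->shared->touched = 1 */
    return nr;
}

int main(void)
{
    static int objs[4];
    struct toy_array_cache shared = { .avail = 4, .limit = LIMIT,
        .entry = { &objs[0], &objs[1], &objs[2], &objs[3] } };
    struct toy_array_cache percpu = { .avail = 0, .limit = LIMIT };
    unsigned int moved = toy_transfer_objects(&percpu, &shared, 3);

    printf("moved %u, per-CPU avail=%u, shared avail=%u\n",
           moved, percpu.avail, shared.avail);
    return 0;
}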

<mm/slab.c>
static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid, struct page *page)

/*
 * Grow (by 1) the number of slabs within a cache. This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
/* In other words: when the kmem_cache_alloc() path finds no usable object left in the cache, this function grows the cache by one slab. */
static int cache_grow(struct kmem_cache *cachep,
                      gfp_t flags, int nodeid, struct page *page)
{
    void *freelist;
    size_t offset;
    gfp_t local_flags;
    struct kmem_cache_node *n;

    /*
     * Be lazy and only check for valid flags here, keeping it out of the
     * critical path in kmem_cache_alloc().
     */
    /* only the flag sanity check lives here, to keep it off the kmem_cache_alloc() hot path */
    if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
        pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
        BUG();
    }
    /* report invalid flags */
    local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
    /* keep only the constraint- and reclaim-related flags */

    /* Take the node list lock to change the colour_next on this node */
    /* take the node list lock so we can advance this node's colour_next */
    check_irq_off();              /* local interrupts must already be off here */
    n = get_node(cachep, nodeid); /* the per-node slab lists of this cache */
    spin_lock(&n->list_lock);     /* protect colour_next */

    /* Get colour for the slab, and cal the next value. */
    /* pick the colour for this slab and advance colour_next */
    offset = n->colour_next;
    n->colour_next++;
    if (n->colour_next >= cachep->colour)
        n->colour_next = 0;
    spin_unlock(&n->list_lock);   /* colour chosen, drop the lock */

    offset *= cachep->colour_off; /* turn the colour index into a byte offset */

    if (local_flags & __GFP_WAIT) /* if the allocation may sleep, re-enable local interrupts */
        local_irq_enable();

    /*
     * The test for missing atomic flag is performed here, rather than
     * the more obvious place, simply to reduce the critical path length
     * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
     * will eventually be caught here (where it matters).
     */
    kmem_flagcheck(cachep, flags); /* flag consistency check, e.g. GFP_DMA vs. the cache's DMA setting */

    /*
     * Get mem for the objs. Attempt to allocate a physical page from
     * 'nodeid'.
     */
    /* get the backing pages for the objects from node 'nodeid' */
    if (!page)
        page = kmem_getpages(cachep, local_flags, nodeid);
    if (!page)  /* page allocation failed: bail out */
        goto failed;

    /* Get slab management. */
    /* allocate the slab management data (the freelist) */
    freelist = alloc_slabmgmt(cachep, page, offset,
                              local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
    if (!freelist)
        goto opps1;

    slab_map_pages(cachep, page, freelist); /* record the cache and the freelist in the page, so frees can find them */

    cache_init_objs(cachep, page); /* initialize (construct) the objects in the new slab */

    if (local_flags & __GFP_WAIT)  /* if we enabled interrupts above, disable them again */
        local_irq_disable();
    check_irq_off();               /* interrupts are off again */
    spin_lock(&n->list_lock);      /* take the node list lock */

    /* Make slab active. */
    list_add_tail(&page->lru, &(n->slabs_free)); /* add the new slab to the free list */
    STATS_INC_GROWN(cachep);        /* statistics: one more grown slab */
    n->free_objects += cachep->num; /* account the new free objects */
    spin_unlock(&n->list_lock);     /* drop the list lock */
    return 1;
opps1:
    kmem_freepages(cachep, page);   /* release the pages again */
failed:
    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    return 0;
}
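The colouring arithmetic in cache_grow() is worth seeing on its own: each new slab starts its objects at offset = colour_next * colour_off, and colour_next wraps at cachep->colour, so consecutive slabs stagger their objects across different cache lines. A tiny user-space demonstration (the values colour = 4 and colour_off = 64 are made up; the kernel derives them from the object layout and the L1 cache line size):

/* User-space demo of the slab colouring logic in cache_grow():
 * offset = colour_next * colour_off, with colour_next wrapping at "colour".
 * The values below are made up for illustration only. */
#include <stdio.h>

struct toy_cache {
    unsigned int colour;       /* number of distinct colours */
    unsigned int colour_off;   /* offset step, typically the cache line size */
    unsigned int colour_next;  /* colour to use for the next slab */
};

static unsigned int next_slab_colour_offset(struct toy_cache *c)
{
    unsigned int offset = c->colour_next;

    c->colour_next++;
    if (c->colour_next >= c->colour)
        c->colour_next = 0;             /* wrap, same as cache_grow() */
    return offset * c->colour_off;      /* the first object starts this far into the slab */
}

int main(void)
{
    struct toy_cache c = { .colour = 4, .colour_off = 64, .colour_next = 0 };
    int i;

    for (i = 0; i < 6; i++)
        printf("slab %d: colour offset %u bytes\n",
               i, next_slab_colour_offset(&c));
    return 0;
}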
