Linux Kernel Design and Implementation (2)

Overview: Linux Kernel Design and Implementation

Linux Kernel Design and Implementation (1): https://developer.aliyun.com/article/1597346

6. Mutexes

(1) Structure

// include/linux/mutex.h
/*
 * Simple, straightforward mutexes with strict semantics:
 *
 * - only one task can hold the mutex at a time
 * - only the owner can unlock the mutex
 * - multiple unlocks are not permitted
 * - recursive locking is not permitted
 * - a mutex object must be initialized via the API
 * - a mutex object must not be initialized via memset or copying
 * - task may not exit with mutex held
 * - memory areas where held locks reside must not be freed
 * - held mutexes must not be reinitialized
 * - mutexes may not be used in hardware or software interrupt
 *   contexts such as tasklets and timers
 *
 * These semantics are fully enforced when DEBUG_MUTEXES is
 * enabled. Furthermore, besides enforcing the above rules, the mutex
 * debugging code also implements a number of additional features
 * that make lock debugging easier and faster:
 *
 * - uses symbolic names of mutexes, whenever they are printed in debug output
 * - point-of-acquire tracking, symbolic lookup of function names
 * - list of all locks held in the system, printout of them
 * - owner tracking
 * - detects self-recursing locks and prints out all relevant info
 * - detects multi-task circular deadlocks and prints out all affected
 *   locks and tasks (and only those tasks)
 */
struct mutex {
  /* 1: unlocked, 0: locked, negative: locked, possible waiters */
  atomic_t    count;
  spinlock_t    wait_lock;
  struct list_head  wait_list;
#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
  struct thread_info  *owner;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
  const char    *name;
  void      *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
  struct lockdep_map  dep_map;
#endif
};

(2) Method overview

   The functions are defined in include/linux/mutex.h.

(3) Analysis of mutex_lock

  1. The mutex_lock function
// include/linux/mutex.h
extern void mutex_lock(struct mutex *lock);

// kernel/mutex.c
/***
 * mutex_lock - acquire the mutex
 * @lock: the mutex to be acquired
 *
 * Lock the mutex exclusively for this task. If the mutex is not
 * available right now, it will sleep until it can get it.
 *
 * The mutex must later on be released by the same task that
 * acquired it. Recursive locking is not allowed. The task
 * may not exit without first unlocking the mutex. Also, kernel
 * memory where the mutex resides must not be freed with
 * the mutex still locked. The mutex must first be initialized
 * (or statically defined) before it can be locked. memset()-ing
 * the mutex to 0 is not allowed.
 *
 * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
 *   checks that will enforce the restrictions and will also do
 *   deadlock debugging. )
 *
 * This function is similar to (but not equivalent to) down().
 */
void __sched mutex_lock(struct mutex *lock)
{
  might_sleep();
  /*
   * The locking fastpath is the 1->0 transition from
   * 'unlocked' into 'locked' state.
   */
  __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
  mutex_set_owner(lock);
}

EXPORT_SYMBOL(mutex_lock);
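
As a rough usage sketch (not from the book or the kernel source): DEFINE_MUTEX() statically defines and initializes a mutex, and the lock/unlock pair must run in the same task, in process context.

// example: minimal mutex usage sketch
#include <linux/mutex.h>

static DEFINE_MUTEX(my_mutex);   /* statically defined and initialized */
static int shared_counter;       /* data protected by my_mutex */

static void example_update(void)
{
  mutex_lock(&my_mutex);         /* may sleep; never use in interrupt context */
  shared_counter++;
  mutex_unlock(&my_mutex);       /* only the owner task may unlock */
}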

(4) Usage comparison

7. Completion Variables

(1) Structure

// include/linux/completion.h
/**
 * struct completion - structure used to maintain state for a "completion"
 *
 * This is the opaque structure used to maintain the state for a "completion".
 * Completions currently use a FIFO to queue threads that have to wait for
 * the "completion" event.
 *
 * See also:  complete(), wait_for_completion() (and friends _timeout,
 * _interruptible, _interruptible_timeout, and _killable), init_completion(),
 * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
 * INIT_COMPLETION().
 */
struct completion {
  unsigned int done;
  wait_queue_head_t wait;
};


// include/linux/wait.h
struct __wait_queue_head {
  spinlock_t lock;
  struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;

struct __wait_queue {
  unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
  void *private;
  wait_queue_func_t func;
  struct list_head task_list;
};

(2) Method overview

   The functions are defined in include/linux/completion.h.

(3) Analysis of complete

  1. The complete function
// kernel/sched.c
void complete(struct completion *x)
{
  unsigned long flags;

  spin_lock_irqsave(&x->wait.lock, flags);
  x->done++;
  __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
  spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
  2. The __wake_up_common function
// kernel/sched.c
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
      int nr_exclusive, int wake_flags, void *key)
{
  wait_queue_t *curr, *next;

  list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
    unsigned flags = curr->flags;

    if (curr->func(curr, mode, wake_flags, key) &&
        (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
      break;
  }
}
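
To illustrate the pattern (a sketch, not taken from the kernel): one task blocks in wait_for_completion() while another signals the event with complete(); because done is incremented under the wait-queue lock, the wakeup is not lost even if complete() runs first.

// example: completion usage sketch
#include <linux/completion.h>

static DECLARE_COMPLETION(my_comp);  /* statically initialized completion */

static void waiter(void)
{
  wait_for_completion(&my_comp);     /* sleep until done > 0 */
}

static void signaller(void)
{
  complete(&my_comp);                /* done++, wake one waiting task */
}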

8. BKL: The Big Kernel Lock

(1) Method overview

   The functions are defined in include/linux/smp_lock.h.

(2) Analysis of lock_kernel

  1. The lock_kernel function
// include/linux/smp_lock.h
#define lock_kernel() do {          \
  _lock_kernel(__func__, __FILE__, __LINE__);   \
} while (0)
  2. The _lock_kernel function
// lib/kernel_lock.c
void __lockfunc _lock_kernel(const char *func, const char *file, int line)
{
  int depth = current->lock_depth + 1;

  trace_lock_kernel(func, file, line);

  if (likely(!depth)) {
    might_sleep();
    __lock_kernel();
  }
  current->lock_depth = depth;
}
  3. The __lock_kernel function
// lib/kernel_lock.c
static inline void __lock_kernel(void)
{
  do_raw_spin_lock(&kernel_flag);
}
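
Usage is symmetric with unlock_kernel(). Because lock_depth is tracked per task, the BKL (unlike a mutex) may be acquired recursively: only the outermost lock_kernel() actually takes the spinlock. A minimal sketch:

// example: BKL usage sketch (legacy code only)
#include <linux/smp_lock.h>

static void legacy_section(void)
{
  lock_kernel();      /* nests: only depth 0 takes the underlying spinlock */
  /* ... code still serialized by the Big Kernel Lock ... */
  unlock_kernel();
}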

9. Sequential Locks (seqlocks)

(1) Structure

// include/linux/seqlock.h
typedef struct {
  unsigned sequence;
  spinlock_t lock;
} seqlock_t;

(2) Method overview

// include/linux/seqlock.h
/*
 * These macros triggered gcc-3.x compile-time problems.  We think these are
 * OK now.  Be cautious.
 */
#define __SEQLOCK_UNLOCKED(lockname) \
     { 0, __SPIN_LOCK_UNLOCKED(lockname) }

#define SEQLOCK_UNLOCKED \
     __SEQLOCK_UNLOCKED(old_style_seqlock_init)

#define seqlock_init(x)         \
  do {            \
    (x)->sequence = 0;      \
    spin_lock_init(&(x)->lock);   \
  } while (0)

#define DEFINE_SEQLOCK(x) \
    seqlock_t x = __SEQLOCK_UNLOCKED(x)

/* Lock out other writers and update the count.
 * Acts like a normal spin_lock/unlock.
 * Don't need preempt_disable() because that is in the spin_lock already.
 */
static inline void write_seqlock(seqlock_t *sl)
{
  spin_lock(&sl->lock);
  ++sl->sequence;
  smp_wmb();
}

static inline void write_sequnlock(seqlock_t *sl)
{
  smp_wmb();
  sl->sequence++;
  spin_unlock(&sl->lock);
}

static inline int write_tryseqlock(seqlock_t *sl)
{
  int ret = spin_trylock(&sl->lock);

  if (ret) {
    ++sl->sequence;
    smp_wmb();
  }
  return ret;
}

/* Start of read calculation -- fetch last complete writer token */
static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
{
  unsigned ret;

repeat:
  ret = sl->sequence;
  smp_rmb();
  if (unlikely(ret & 1)) {
    cpu_relax();
    goto repeat;
  }

  return ret;
}

/*
 * Test if reader processed invalid data.
 *
 * If sequence value changed then writer changed data while in section.
 */
static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start)
{
  smp_rmb();

  return (sl->sequence != start);
}
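
A reader/writer sketch (hypothetical foo_lock and data, not from the source): the writer takes the spinlock and leaves the sequence odd while updating; readers run lock-free and retry whenever read_seqretry() reports that a writer intervened. This is essentially how the 64-bit jiffies value is read on 32-bit machines.

// example: seqlock usage sketch
static DEFINE_SEQLOCK(foo_lock);
static unsigned long foo_a, foo_b;

static void foo_write(unsigned long a, unsigned long b)
{
  write_seqlock(&foo_lock);    /* exclusive; sequence becomes odd */
  foo_a = a;
  foo_b = b;
  write_sequnlock(&foo_lock);  /* sequence becomes even again */
}

static void foo_read(unsigned long *a, unsigned long *b)
{
  unsigned seq;
  do {
    seq = read_seqbegin(&foo_lock);        /* spins until sequence is even */
    *a = foo_a;
    *b = foo_b;
  } while (read_seqretry(&foo_lock, seq)); /* retry if a writer ran */
}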


/*
 * Version using sequence counter only.
 * This can be used when code has its own mutex protecting the
 * updating starting before the write_seqcount_begin() and ending
 * after the write_seqcount_end().
 */

typedef struct seqcount {
  unsigned sequence;
} seqcount_t;

#define SEQCNT_ZERO { 0 }
#define seqcount_init(x)  do { *(x) = (seqcount_t) SEQCNT_ZERO; } while (0)

/* Start of read using pointer to a sequence counter only.  */
static inline unsigned read_seqcount_begin(const seqcount_t *s)
{
  unsigned ret;

repeat:
  ret = s->sequence;
  smp_rmb();
  if (unlikely(ret & 1)) {
    cpu_relax();
    goto repeat;
  }
  return ret;
}

/*
 * Test if reader processed invalid data because sequence number has changed.
 */
static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
{
  smp_rmb();

  return s->sequence != start;
}


/*
 * Sequence counter only version assumes that callers are using their
 * own mutexing.
 */
static inline void write_seqcount_begin(seqcount_t *s)
{
  s->sequence++;
  smp_wmb();
}

static inline void write_seqcount_end(seqcount_t *s)
{
  smp_wmb();
  s->sequence++;
}

/*
 * Possible sw/hw IRQ protected versions of the interfaces.
 */
#define write_seqlock_irqsave(lock, flags)        \
  do { local_irq_save(flags); write_seqlock(lock); } while (0)
#define write_seqlock_irq(lock)           \
  do { local_irq_disable();   write_seqlock(lock); } while (0)
#define write_seqlock_bh(lock)            \
        do { local_bh_disable();    write_seqlock(lock); } while (0)

#define write_sequnlock_irqrestore(lock, flags)       \
  do { write_sequnlock(lock); local_irq_restore(flags); } while(0)
#define write_sequnlock_irq(lock)         \
  do { write_sequnlock(lock); local_irq_enable(); } while(0)
#define write_sequnlock_bh(lock)          \
  do { write_sequnlock(lock); local_bh_enable(); } while(0)

#define read_seqbegin_irqsave(lock, flags)        \
  ({ local_irq_save(flags);   read_seqbegin(lock); })

#define read_seqretry_irqrestore(lock, iv, flags)     \
  ({                \
    int ret = read_seqretry(lock, iv);      \
    local_irq_restore(flags);       \
    ret;              \
  })

10. Preemption Disabling

11. Ordering and Barriers

VIII. Timers and Time Management

1. The Tick Rate: HZ

// include/linux/param.h

#ifndef _LINUX_PARAM_H
#define _LINUX_PARAM_H

#include <asm/param.h>

#endif

// arch/x86/include/asm/param.h
#include <asm-generic/param.h>

// include/asm-generic/param.h
#ifdef __KERNEL__
# define HZ   CONFIG_HZ /* Internal kernel timer frequency */
# define USER_HZ  100   /* some user interfaces are */
# define CLOCKS_PER_SEC (USER_HZ)       /* in "ticks" like times() */
#endif

#ifndef HZ
#define HZ 100
#endif
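
HZ is simply the number of timer interrupts per second, so durations expressed in ticks scale with it. A trivial sketch, valid for any HZ value:

// example: expressing durations in ticks
unsigned long half_second = HZ / 2;   /* 0.5 s worth of ticks */
unsigned long two_seconds = 2 * HZ;   /* 2 s worth of ticks */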

2. jiffies

(1) Internal representation of jiffies

// include/linux/jiffies.h
extern u64 __jiffy_data jiffies_64;
extern unsigned long volatile __jiffy_data jiffies;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void);
#else
static inline u64 get_jiffies_64(void)
{
  return (u64)jiffies;
}
#endif


jiffies, an unsigned long, is 32 bits wide on 32-bit architectures and 64 bits wide on 64-bit architectures.

(2) jiffies wraparound

#define time_after(a,b)   \
  (typecheck(unsigned long, a) && \
   typecheck(unsigned long, b) && \
   ((long)(b) - (long)(a) < 0))
#define time_before(a,b)  time_after(b,a)

#define time_after_eq(a,b)  \
  (typecheck(unsigned long, a) && \
   typecheck(unsigned long, b) && \
   ((long)(a) - (long)(b) >= 0))
#define time_before_eq(a,b) time_after_eq(b,a)

/*
 * Calculate whether a is in the range of [b, c].
 */
#define time_in_range(a,b,c) \
  (time_after_eq(a,b) && \
   time_before_eq(a,c))

/*
 * Calculate whether a is in the range of [b, c).
 */
#define time_in_range_open(a,b,c) \
  (time_after_eq(a,b) && \
   time_before(a,c))

/* Same as above, but does so with platform independent 64bit types.
 * These must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64() */
#define time_after64(a,b) \
  (typecheck(__u64, a) && \
   typecheck(__u64, b) && \
   ((__s64)(b) - (__s64)(a) < 0))
#define time_before64(a,b)  time_after64(b,a)

#define time_after_eq64(a,b)  \
  (typecheck(__u64, a) && \
   typecheck(__u64, b) && \
   ((__s64)(a) - (__s64)(b) >= 0))
#define time_before_eq64(a,b) time_after_eq64(b,a)

/*
 * These four macros compare jiffies and 'a' for convenience.
 */

/* time_is_before_jiffies(a) return true if a is before jiffies */
#define time_is_before_jiffies(a) time_after(jiffies, a)

/* time_is_after_jiffies(a) return true if a is after jiffies */
#define time_is_after_jiffies(a) time_before(jiffies, a)

/* time_is_before_eq_jiffies(a) return true if a is before or equal to jiffies*/
#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)

/* time_is_after_eq_jiffies(a) return true if a is after or equal to jiffies*/
#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)

/*
 * Have the 32 bit jiffies value wrap 5 minutes after boot
 * so jiffies wrap bugs show up earlier.
 */
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
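
Because jiffies wraps around, deadlines must be compared with these macros rather than with plain < or >; the (long) casts make the comparison come out right even across a wrap. A typical sketch:

// example: wrap-safe timeout check
unsigned long timeout = jiffies + HZ / 2;  /* deadline 0.5 s from now */

/* ... do some work ... */

if (time_after(jiffies, timeout)) {
  /* the deadline has passed, even if jiffies wrapped in the meantime */
}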

(3) User space and HZ

  In kernels before 2.6, changing the kernel's HZ value produced anomalous results in some user-space programs. The kernel exports this value to user space as ticks per second, and after the interface had been stable for a long time, applications came to depend on the particular value of HZ. Changing HZ in the kernel therefore breaks the constant that user space relies on, because user space is never told the new value: a program might conclude the system had been up for 20 hours when it had really been up for only two.

  To avoid this kind of error, the kernel must scale every tick count it exports. It therefore defines USER_HZ to represent the HZ value user space expects. On x86, HZ was historically always 100, so USER_HZ is defined as 100. The kernel uses jiffies_to_clock_t() to convert a tick count expressed in HZ into a tick count expressed in USER_HZ.

// kernel/time.c
clock_t jiffies_to_clock_t(long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
  return x * (USER_HZ / HZ);
# else
  return x / (HZ / USER_HZ);
# endif
#else
  return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
#endif
}
EXPORT_SYMBOL(jiffies_to_clock_t);

// include/linux/types.h
typedef __kernel_clock_t  clock_t;

// arch/x86/include/asm/posix_types_64.h
typedef long    __kernel_clock_t;
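
A worked example (assuming CONFIG_HZ=1000 and USER_HZ=100, so TICK_NSEC is 1000000 and NSEC_PER_SEC/USER_HZ is 10000000): 1000000 % 10000000 is nonzero, so the div_u64 branch runs, and jiffies_to_clock_t(1000) = 1000 * 1000000 / 10000000 = 100. One second of kernel ticks becomes exactly the 100 USER_HZ ticks that a times()-era application expects.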

3. Timers

// include/linux/timer.h
struct timer_list {
  struct list_head entry;       /* entry in the timer list */
  unsigned long expires;        /* expiration time, in jiffies */
  void (*function)(unsigned long);  /* the timer handler function */
  unsigned long data;         /* long argument passed to the handler */
  struct tvec_base *base;       /* internal timer field; do not touch */
#ifdef CONFIG_TIMER_STATS
  void *start_site;
  char start_comm[16];
  int start_pid;
#endif
#ifdef CONFIG_LOCKDEP
  struct lockdep_map lockdep_map;
#endif
};
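
A usage sketch with the era-appropriate helpers (my_timer and my_timer_handler are hypothetical names): setup_timer() binds the handler and its data argument, mod_timer() activates or re-arms the timer, and del_timer_sync() is used on teardown.

// example: kernel timer usage sketch
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_timer;

static void my_timer_handler(unsigned long data)
{
  /* runs in softirq (timer) context after expiry; must not sleep */
}

static void arm_timer(void)
{
  setup_timer(&my_timer, my_timer_handler, 0);  /* bind handler + data */
  mod_timer(&my_timer, jiffies + 2 * HZ);       /* fire in ~2 seconds */
}

/* on teardown: del_timer_sync(&my_timer); */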

4. Delaying Execution

(1) Busy waiting

/* Spin for ten ticks, hogging the processor the whole time. */
unsigned long timeout = jiffies + 10;
while (time_before(jiffies, timeout))
  ;

/* A friendlier five-second delay: yield the processor while waiting. */
unsigned long timeout = jiffies + 5 * HZ;
while (time_before(jiffies, timeout))
  cond_resched();

(2) Short delays

(a) Method overview
void udelay(unsigned long usecs);
void ndelay(unsigned long nsecs);
void mdelay(unsigned long msecs);


(b) Analysis of mdelay
  1. The mdelay function
// include/linux/delay.h
#ifndef mdelay
#define mdelay(n) (\
  (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
  ({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
#endif

#ifndef ndelay
static inline void ndelay(unsigned long x)
{
  udelay(DIV_ROUND_UP(x, 1000));
}
#define ndelay(x) ndelay(x)
#endif

extern unsigned long lpj_fine;
void calibrate_delay(void);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);

static inline void ssleep(unsigned int seconds)
{
  msleep(seconds * 1000);
}
  2. The udelay function
// arch/x86/include/asm/delay.h
/* 0x10c7 is 2**32 / 1000000 (rounded up) */
#define udelay(n) (__builtin_constant_p(n) ? \
  ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
  __udelay(n))

/* 0x5 is 2**32 / 1000000000 (rounded up) */
#define ndelay(n) (__builtin_constant_p(n) ? \
  ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
  __ndelay(n))
  3. The __const_udelay function
// arch/x86/lib/delay.c
inline void __const_udelay(unsigned long xloops)
{
  int d0;

  xloops *= 4;
  asm("mull %%edx"
    :"=d" (xloops), "=&a" (d0)
    :"1" (xloops), "0"
    (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));

  __delay(++xloops);
}
EXPORT_SYMBOL(__const_udelay);

void __udelay(unsigned long usecs)
{
  __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
  __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);
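
The arithmetic here is 32.32 fixed point. udelay(n) passes xloops = n * 0x10c7, where 0x10c7 is roughly 2^32/10^6, i.e. n microseconds expressed as a fraction of a second scaled by 2^32. __const_udelay() multiplies that (times 4, compensating for the HZ/4 factor) by loops_per_jiffy * HZ/4, the calibrated number of delay loops per second, and the mull instruction leaves the high 32 bits of the 64-bit product in %edx, which is the full product divided by 2^32. The net result is approximately n * loops_per_jiffy * HZ / 10^6 iterations of the delay loop.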

(3) The schedule_timeout function

// kernel/timer.c
signed long __sched schedule_timeout(signed long timeout)
{
  struct timer_list timer;
  unsigned long expire;

  switch (timeout)
  {
  case MAX_SCHEDULE_TIMEOUT:
    /*
     * These two special cases are useful to be comfortable
     * in the caller. Nothing more. We could take
     * MAX_SCHEDULE_TIMEOUT from one of the negative value
     * but I' d like to return a valid offset (>=0) to allow
     * the caller to do everything it want with the retval.
     */
    schedule();
    goto out;
  default:
    /*
     * Another bit of PARANOID. Note that the retval will be
     * 0 since no piece of kernel is supposed to do a check
     * for a negative retval of schedule_timeout() (since it
     * should never happens anyway). You just have the printk()
     * that will tell you if something is gone wrong and where.
     */
    if (timeout < 0) {
      printk(KERN_ERR "schedule_timeout: wrong timeout "
        "value %lx\n", timeout);
      dump_stack();
      current->state = TASK_RUNNING;
      goto out;
    }
  }

  expire = timeout + jiffies;

  setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
  __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
  schedule();
  del_singleshot_timer_sync(&timer);

  /* Remove the timer from the object tracker */
  destroy_timer_on_stack(&timer);

  timeout = expire - jiffies;

 out:
  return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
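
The caller must put itself into a sleeping state first; otherwise schedule_timeout() will not actually sleep. A minimal sketch:

// example: sleeping with schedule_timeout
#include <linux/sched.h>

static void sleep_two_seconds(void)
{
  set_current_state(TASK_INTERRUPTIBLE);  /* required before the call */
  schedule_timeout(2 * HZ);               /* returns 0 once the timer expires */
}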

IX. Memory Management

1. Pages

// include/linux/mm_types.h
struct page {
  unsigned long flags;    /* Atomic flags, some possibly
           * updated asynchronously */
  atomic_t _count;    /* Usage count, see below. */
  union {
    atomic_t _mapcount; /* Count of ptes mapped in mms,
           * to show when page is mapped
           * & limit reverse map searches.
           */
    struct {    /* SLUB */
      u16 inuse;
      u16 objects;
    };
  };
  union {
      struct {
    unsigned long private;    /* Mapping-private opaque data:
             * usually used for buffer_heads
             * if PagePrivate set; used for
             * swp_entry_t if PageSwapCache;
             * indicates order in the buddy
             * system if PG_buddy is set.
             */
    struct address_space *mapping;  /* If low bit clear, points to
             * inode address_space, or NULL.
             * If page mapped as anonymous
             * memory, low bit is set, and
             * it points to anon_vma object:
             * see PAGE_MAPPING_ANON below.
             */
      };
#if USE_SPLIT_PTLOCKS
      spinlock_t ptl;
#endif
      struct kmem_cache *slab;  /* SLUB: Pointer to slab */
      struct page *first_page;  /* Compound tail pages */
  };
  union {
    pgoff_t index;    /* Our offset within mapping. */
    void *freelist;   /* SLUB: freelist req. slab lock */
  };
  struct list_head lru;   /* Pageout list, eg. active_list
           * protected by zone->lru_lock !
           */
  /*
   * On machines where all RAM is mapped into kernel address space,
   * we can simply calculate the virtual address. On machines with
   * highmem some memory is mapped into kernel virtual memory
   * dynamically, so we need a place to store that address.
   * Note that this field could be 16 bits on x86 ... ;)
   *
   * Architectures with slow multiplication can define
   * WANT_PAGE_VIRTUAL in asm/page.h
   */
#if defined(WANT_PAGE_VIRTUAL)
  void *virtual;      /* Kernel virtual address (NULL if
             not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
  unsigned long debug_flags;  /* Use atomic bitops on this */
#endif

#ifdef CONFIG_KMEMCHECK
  /*
   * kmemcheck wants to track the status of each byte in a page; this
   * is a pointer to such a status block. NULL if not tracked.
   */
  void *shadow;
#endif
};

2. Zones

3. Page Operations

   The functions are defined in include/linux/gfp.h.

(1) Allocating pages

(2) Freeing pages

extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
extern void free_hot_cold_page(struct page *page, int cold);

#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr),0)
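
A sketch pairing allocation and release (the order passed to __free_pages() must match the one used at allocation time):

// example: allocating and freeing pages
#include <linux/gfp.h>
#include <linux/mm.h>

static void page_example(void)
{
  struct page *page;
  void *addr;

  page = alloc_pages(GFP_KERNEL, 2);  /* 2^2 = 4 contiguous pages */
  if (!page)
    return;
  addr = page_address(page);          /* kernel virtual address (lowmem) */
  /* ... use the memory at addr ... */
  __free_pages(page, 2);              /* order must match the allocation */
}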

4. The kmalloc Function

    kmalloc() is very similar to the user-space malloc() family of functions, except that it takes an additional flags parameter. It returns a region of kernel memory whose size is given in bytes.

// include/linux/slab.h
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
#include <linux/slab_def.h>
#endif

static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
{
  if (size != 0 && n > ULONG_MAX / size)
    return NULL;
  return __kmalloc(n * size, flags | __GFP_ZERO);
}
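
A typical call pairs kmalloc() with kfree(); a sketch (the 128-byte size is an arbitrary example):

// example: kmalloc usage sketch
#include <linux/slab.h>
#include <linux/string.h>

static void *alloc_buf(void)
{
  char *buf;

  buf = kmalloc(128, GFP_KERNEL);   /* may sleep; use GFP_ATOMIC in IRQ context */
  if (!buf)
    return NULL;                    /* always check for failure */
  memset(buf, 0, 128);              /* or pass __GFP_ZERO / use kzalloc() */
  return buf;                       /* caller releases with kfree(buf) */
}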

(1) gfp_t flags

  1. Action modifiers
  2. Zone modifiers
  3. Type flags

(2) Analysis of kmalloc

  1. The kmalloc function
// include/linux/slab_def.h
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
  struct kmem_cache *cachep;
  void *ret;

  if (__builtin_constant_p(size)) {
    int i = 0;

    if (!size)
      return ZERO_SIZE_PTR;

#define CACHE(x) \
    if (size <= x) \
      goto found; \
    else \
      i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
    return NULL;
found:
#ifdef CONFIG_ZONE_DMA
    if (flags & GFP_DMA)
      cachep = malloc_sizes[i].cs_dmacachep;
    else
#endif
      cachep = malloc_sizes[i].cs_cachep;

    ret = kmem_cache_alloc_notrace(cachep, flags);

    trace_kmalloc(_THIS_IP_, ret,
            size, slab_buffer_size(cachep), flags);

    return ret;
  }
  return __kmalloc(size, flags);
}
  2. The __kmalloc function
// mm/slab.c
void *__kmalloc(size_t size, gfp_t flags)
{
  return __do_kmalloc(size, flags, NULL);
}
EXPORT_SYMBOL(__kmalloc);
  3. The __do_kmalloc function
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
            void *caller)
{
  struct kmem_cache *cachep;
  void *ret;

  /* If you want to save a few bytes .text space: replace
   * __ with kmem_.
   * Then kmalloc uses the uninlined functions instead of the inline
   * functions.
   */
  cachep = __find_general_cachep(size, flags);
  if (unlikely(ZERO_OR_NULL_PTR(cachep)))
    return cachep;
  ret = __cache_alloc(cachep, flags, caller);

  trace_kmalloc((unsigned long) caller, ret,
          size, cachep->buffer_size, flags);

  return ret;
}
  4. The __cache_alloc function

static __always_inline void *
__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
{
  unsigned long save_flags;
  void *objp;

  flags &= gfp_allowed_mask;

  lockdep_trace_alloc(flags);

  if (slab_should_failslab(cachep, flags))
    return NULL;

  cache_alloc_debugcheck_before(cachep, flags);
  local_irq_save(save_flags);
  objp = __do_cache_alloc(cachep, flags);
  local_irq_restore(save_flags);
  objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
  kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
         flags);
  prefetchw(objp);

  if (likely(objp))
    kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));

  if (unlikely((flags & __GFP_ZERO) && objp))
    memset(objp, 0, obj_size(cachep));

  return objp;
}

Linux Kernel Design and Implementation (3): https://developer.aliyun.com/article/1597350
