Linux Kernel Design and Implementation (Part 2)

Introduction: Linux Kernel Design and Implementation

Linux Kernel Design and Implementation (Part 1): https://developer.aliyun.com/article/1597346

6. Mutexes

(1) Structure

// include/linux/mutex.h
/*
 * Simple, straightforward mutexes with strict semantics:
 *
 * - only one task can hold the mutex at a time
 * - only the owner can unlock the mutex
 * - multiple unlocks are not permitted
 * - recursive locking is not permitted
 * - a mutex object must be initialized via the API
 * - a mutex object must not be initialized via memset or copying
 * - task may not exit with mutex held
 * - memory areas where held locks reside must not be freed
 * - held mutexes must not be reinitialized
 * - mutexes may not be used in hardware or software interrupt
 *   contexts such as tasklets and timers
 *
 * These semantics are fully enforced when DEBUG_MUTEXES is
 * enabled. Furthermore, besides enforcing the above rules, the mutex
 * debugging code also implements a number of additional features
 * that make lock debugging easier and faster:
 *
 * - uses symbolic names of mutexes, whenever they are printed in debug output
 * - point-of-acquire tracking, symbolic lookup of function names
 * - list of all locks held in the system, printout of them
 * - owner tracking
 * - detects self-recursing locks and prints out all relevant info
 * - detects multi-task circular deadlocks and prints out all affected
 *   locks and tasks (and only those tasks)
 */
struct mutex {
  /* 1: unlocked, 0: locked, negative: locked, possible waiters */
  atomic_t    count;
  spinlock_t    wait_lock;
  struct list_head  wait_list;
#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
  struct thread_info  *owner;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
  const char    *name;
  void      *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
  struct lockdep_map  dep_map;
#endif
};

(2) API overview

   The functions are declared in include/linux/mutex.h.
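
A minimal usage sketch (illustrative, module-style code; my_lock and shared_counter are hypothetical names):

// Illustrative sketch only -- typical mutex usage.
#include <linux/mutex.h>

static DEFINE_MUTEX(my_lock);    /* statically defined, initially unlocked */
static int shared_counter;       /* data protected by my_lock */

static void update_counter(void)
{
  mutex_lock(&my_lock);          /* may sleep; not usable in interrupt context */
  shared_counter++;              /* critical section */
  mutex_unlock(&my_lock);        /* must be unlocked by the same task */
}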

(3) Analysis of mutex_lock

  1. The mutex_lock function
// include/linux/mutex.h
extern void mutex_lock(struct mutex *lock);

// kernel/mutex.c
/***
 * mutex_lock - acquire the mutex
 * @lock: the mutex to be acquired
 *
 * Lock the mutex exclusively for this task. If the mutex is not
 * available right now, it will sleep until it can get it.
 *
 * The mutex must later on be released by the same task that
 * acquired it. Recursive locking is not allowed. The task
 * may not exit without first unlocking the mutex. Also, kernel
 * memory where the mutex resides must not be freed with
 * the mutex still locked. The mutex must first be initialized
 * (or statically defined) before it can be locked. memset()-ing
 * the mutex to 0 is not allowed.
 *
 * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
 *   checks that will enforce the restrictions and will also do
 *   deadlock debugging. )
 *
 * This function is similar to (but not equivalent to) down().
 */
void __sched mutex_lock(struct mutex *lock)
{
  might_sleep();
  /*
   * The locking fastpath is the 1->0 transition from
   * 'unlocked' into 'locked' state.
   */
  __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
  mutex_set_owner(lock);
}

EXPORT_SYMBOL(mutex_lock);

(4) Usage comparison

  As a rule of thumb: use a spinlock when lock hold times are short or when the lock must be taken from interrupt context; use a mutex when hold times are long or when the holder may need to sleep. A mutex can only be used in process context, and only a mutex may be held while the task sleeps.

7. Completion Variables

(1) Structure

// include/linux/completion.h
/**
 * struct completion - structure used to maintain state for a "completion"
 *
 * This is the opaque structure used to maintain the state for a "completion".
 * Completions currently use a FIFO to queue threads that have to wait for
 * the "completion" event.
 *
 * See also:  complete(), wait_for_completion() (and friends _timeout,
 * _interruptible, _interruptible_timeout, and _killable), init_completion(),
 * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
 * INIT_COMPLETION().
 */
struct completion {
  unsigned int done;
  wait_queue_head_t wait;
};


// include/linux/wait.h
struct __wait_queue_head {
  spinlock_t lock;
  struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;

struct __wait_queue {
  unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
  void *private;
  wait_queue_func_t func;
  struct list_head task_list;
};

(2) API overview

   The functions are declared in include/linux/completion.h.
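
A minimal usage sketch (illustrative names; one task sleeps until another signals completion):

// Illustrative sketch only -- waiting on and signalling a completion.
#include <linux/completion.h>

static DECLARE_COMPLETION(my_done);   /* statically initialized completion */

static int waiter_thread(void *unused)
{
  wait_for_completion(&my_done);      /* sleep until complete() is called */
  return 0;
}

static void work_finished(void)
{
  complete(&my_done);                 /* wake up one waiter */
}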

(3) Analysis of complete

  1. The complete function
// kernel/sched.c
void complete(struct completion *x)
{
  unsigned long flags;

  spin_lock_irqsave(&x->wait.lock, flags);
  x->done++;
  __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
  spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
  2. The __wake_up_common function
// kernel/sched.c
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
      int nr_exclusive, int wake_flags, void *key)
{
  wait_queue_t *curr, *next;

  list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
    unsigned flags = curr->flags;

    if (curr->func(curr, mode, wake_flags, key) &&
        (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
      break;
  }
}

8. BKL: The Big Kernel Lock

(1) API overview

   The functions are declared in include/linux/smp_lock.h.

(2) Analysis of lock_kernel

  1. The lock_kernel macro
// include/linux/smp_lock.h
#define lock_kernel() do {          \
  _lock_kernel(__func__, __FILE__, __LINE__);   \
} while (0)
  2. The _lock_kernel function
// lib/kernel_lock.c
void __lockfunc _lock_kernel(const char *func, const char *file, int line)
{
  int depth = current->lock_depth + 1;

  trace_lock_kernel(func, file, line);

  if (likely(!depth)) {
    might_sleep();
    __lock_kernel();
  }
  current->lock_depth = depth;
}
  3. The __lock_kernel function
// lib/kernel_lock.c
static inline void __lock_kernel(void)
{
  do_raw_spin_lock(&kernel_flag);
}

9. Sequential Locks (Seqlocks)

(1) Structure

// include/linux/seqlock.h
typedef struct {
  unsigned sequence;
  spinlock_t lock;
} seqlock_t;

(2) API overview

// include/linux/seqlock.h
/*
 * These macros triggered gcc-3.x compile-time problems.  We think these are
 * OK now.  Be cautious.
 */
#define __SEQLOCK_UNLOCKED(lockname) \
     { 0, __SPIN_LOCK_UNLOCKED(lockname) }

#define SEQLOCK_UNLOCKED \
     __SEQLOCK_UNLOCKED(old_style_seqlock_init)

#define seqlock_init(x)         \
  do {            \
    (x)->sequence = 0;      \
    spin_lock_init(&(x)->lock);   \
  } while (0)

#define DEFINE_SEQLOCK(x) \
    seqlock_t x = __SEQLOCK_UNLOCKED(x)

/* Lock out other writers and update the count.
 * Acts like a normal spin_lock/unlock.
 * Don't need preempt_disable() because that is in the spin_lock already.
 */
static inline void write_seqlock(seqlock_t *sl)
{
  spin_lock(&sl->lock);
  ++sl->sequence;
  smp_wmb();
}

static inline void write_sequnlock(seqlock_t *sl)
{
  smp_wmb();
  sl->sequence++;
  spin_unlock(&sl->lock);
}

static inline int write_tryseqlock(seqlock_t *sl)
{
  int ret = spin_trylock(&sl->lock);

  if (ret) {
    ++sl->sequence;
    smp_wmb();
  }
  return ret;
}

/* Start of read calculation -- fetch last complete writer token */
static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
{
  unsigned ret;

repeat:
  ret = sl->sequence;
  smp_rmb();
  if (unlikely(ret & 1)) {
    cpu_relax();
    goto repeat;
  }

  return ret;
}

/*
 * Test if reader processed invalid data.
 *
 * If sequence value changed then writer changed data while in section.
 */
static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start)
{
  smp_rmb();

  return (sl->sequence != start);
}


/*
 * Version using sequence counter only.
 * This can be used when code has its own mutex protecting the
 * updating starting before the write_seqcount_begin() and ending
 * after the write_seqcount_end().
 */

typedef struct seqcount {
  unsigned sequence;
} seqcount_t;

#define SEQCNT_ZERO { 0 }
#define seqcount_init(x)  do { *(x) = (seqcount_t) SEQCNT_ZERO; } while (0)

/* Start of read using pointer to a sequence counter only.  */
static inline unsigned read_seqcount_begin(const seqcount_t *s)
{
  unsigned ret;

repeat:
  ret = s->sequence;
  smp_rmb();
  if (unlikely(ret & 1)) {
    cpu_relax();
    goto repeat;
  }
  return ret;
}

/*
 * Test if reader processed invalid data because sequence number has changed.
 */
static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
{
  smp_rmb();

  return s->sequence != start;
}


/*
 * Sequence counter only version assumes that callers are using their
 * own mutexing.
 */
static inline void write_seqcount_begin(seqcount_t *s)
{
  s->sequence++;
  smp_wmb();
}

static inline void write_seqcount_end(seqcount_t *s)
{
  smp_wmb();
  s->sequence++;
}

/*
 * Possible sw/hw IRQ protected versions of the interfaces.
 */
#define write_seqlock_irqsave(lock, flags)        \
  do { local_irq_save(flags); write_seqlock(lock); } while (0)
#define write_seqlock_irq(lock)           \
  do { local_irq_disable();   write_seqlock(lock); } while (0)
#define write_seqlock_bh(lock)            \
        do { local_bh_disable();    write_seqlock(lock); } while (0)

#define write_sequnlock_irqrestore(lock, flags)       \
  do { write_sequnlock(lock); local_irq_restore(flags); } while(0)
#define write_sequnlock_irq(lock)         \
  do { write_sequnlock(lock); local_irq_enable(); } while(0)
#define write_sequnlock_bh(lock)          \
  do { write_sequnlock(lock); local_bh_enable(); } while(0)

#define read_seqbegin_irqsave(lock, flags)        \
  ({ local_irq_save(flags);   read_seqbegin(lock); })

#define read_seqretry_irqrestore(lock, iv, flags)     \
  ({                \
    int ret = read_seqretry(lock, iv);      \
    local_irq_restore(flags);       \
    ret;              \
  })
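
Putting it together, the canonical reader/writer pattern looks like this (a sketch; my_seqlock and my_value are illustrative names, but this is the same pattern the kernel uses to protect jiffies_64):

// Illustrative sketch only -- the seqlock read/write pattern.
#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(my_seqlock);
static u64 my_value;

/* Writer: exclusive, behaves like a spin lock. */
static void set_value(u64 v)
{
  write_seqlock(&my_seqlock);
  my_value = v;
  write_sequnlock(&my_seqlock);
}

/* Reader: lock-free; retries if a writer raced with the read. */
static u64 get_value(void)
{
  u64 v;
  unsigned seq;

  do {
    seq = read_seqbegin(&my_seqlock);
    v = my_value;
  } while (read_seqretry(&my_seqlock, seq));

  return v;
}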

10. Preemption Disabling
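
The preemption interface nests via preempt_count; a minimal sketch of the usual per-CPU-data pattern (the function name is illustrative):

// Illustrative sketch only -- disabling preemption around per-CPU data.
#include <linux/preempt.h>

static void touch_this_cpu_data(void)
{
  preempt_disable();   /* nestable: increments preempt_count */
  /* this task can no longer be preempted or migrated to another CPU,
   * so this CPU's data can be used safely here */
  preempt_enable();    /* decrements preempt_count; reschedules if needed */
}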

11. Ordering and Barriers
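
The classic two-CPU example: pair a write barrier in the producer with a read barrier in the consumer so that the stores become visible in order (a sketch; a and b are illustrative variables, and the barrier macros come from the architecture headers of this kernel era):

// Illustrative sketch only -- pairing smp_wmb() with smp_rmb().
static int a, b;

/* CPU 0: publish a before b. */
static void producer(void)
{
  a = 3;
  smp_wmb();   /* the store to a is visible before the store to b */
  b = 4;
}

/* CPU 1: a reader that sees the new b is guaranteed to see the new a. */
static void consumer(int *pa, int *pb)
{
  *pb = b;
  smp_rmb();   /* read b before reading a */
  *pa = a;
}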

VIII. Timers and Time Management

1. The Tick Rate: HZ

// include/linux/param.h

#ifndef _LINUX_PARAM_H
#define _LINUX_PARAM_H

#include <asm/param.h>

#endif

// arch/x86/include/asm/param.h
#include <asm-generic/param.h>

// include/asm-generic/param.h
#ifdef __KERNEL__
# define HZ   CONFIG_HZ /* Internal kernel timer frequency */
# define USER_HZ  100   /* some user interfaces are */
# define CLOCKS_PER_SEC (USER_HZ)       /* in "ticks" like times() */
#endif

#ifndef HZ
#define HZ 100
#endif
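
Converting between ticks and seconds is then plain arithmetic, in the style of the busy-wait snippets later in this section (illustrative):

unsigned long five_sec_from_now = jiffies + 5 * HZ;  /* seconds -> ticks */
unsigned long uptime_in_seconds = jiffies / HZ;      /* ticks -> seconds */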

2. jiffies

(1) Internal representation of jiffies

// include/linux/jiffies.h
extern u64 __jiffy_data jiffies_64;
extern unsigned long volatile __jiffy_data jiffies;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void);
#else
static inline u64 get_jiffies_64(void)
{
  return (u64)jiffies;
}
#endif


Because jiffies is an unsigned long, it is 32 bits wide on 32-bit architectures and 64 bits wide on 64-bit architectures; jiffies_64 is always 64 bits.

(2) Wraparound of jiffies

#define time_after(a,b)   \
  (typecheck(unsigned long, a) && \
   typecheck(unsigned long, b) && \
   ((long)(b) - (long)(a) < 0))
#define time_before(a,b)  time_after(b,a)

#define time_after_eq(a,b)  \
  (typecheck(unsigned long, a) && \
   typecheck(unsigned long, b) && \
   ((long)(a) - (long)(b) >= 0))
#define time_before_eq(a,b) time_after_eq(b,a)

/*
 * Calculate whether a is in the range of [b, c].
 */
#define time_in_range(a,b,c) \
  (time_after_eq(a,b) && \
   time_before_eq(a,c))

/*
 * Calculate whether a is in the range of [b, c).
 */
#define time_in_range_open(a,b,c) \
  (time_after_eq(a,b) && \
   time_before(a,c))

/* Same as above, but does so with platform independent 64bit types.
 * These must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64() */
#define time_after64(a,b) \
  (typecheck(__u64, a) && \
   typecheck(__u64, b) && \
   ((__s64)(b) - (__s64)(a) < 0))
#define time_before64(a,b)  time_after64(b,a)

#define time_after_eq64(a,b)  \
  (typecheck(__u64, a) && \
   typecheck(__u64, b) && \
   ((__s64)(a) - (__s64)(b) >= 0))
#define time_before_eq64(a,b) time_after_eq64(b,a)

/*
 * These four macros compare jiffies and 'a' for convenience.
 */

/* time_is_before_jiffies(a) return true if a is before jiffies */
#define time_is_before_jiffies(a) time_after(jiffies, a)

/* time_is_after_jiffies(a) return true if a is after jiffies */
#define time_is_after_jiffies(a) time_before(jiffies, a)

/* time_is_before_eq_jiffies(a) return true if a is before or equal to jiffies*/
#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)

/* time_is_after_eq_jiffies(a) return true if a is after or equal to jiffies*/
#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)

/*
 * Have the 32 bit jiffies value wrap 5 minutes after boot
 * so jiffies wrap bugs show up earlier.
 */
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
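
A typical wraparound-safe timeout check built on these macros (a sketch; work_pending_hw() is a hypothetical polling condition):

// Illustrative sketch only -- polling with a wraparound-safe timeout.
unsigned long timeout = jiffies + HZ / 2;   /* give up after half a second */

while (work_pending_hw()) {                 /* hypothetical hardware poll */
  if (time_after(jiffies, timeout)) {
    /* timed out: even if jiffies wrapped, the signed subtraction
     * inside time_after() still gives the correct answer */
    break;
  }
  cpu_relax();
}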

(3) User space and HZ

  In kernels before 2.6, changing the kernel's HZ value caused anomalous results for some user-space programs. This is because the kernel exported timing values to user space in units of ticks per second, and once that interface had been stable for a long time, applications came to rely on the specific value of HZ. Changing the kernel's definition of HZ therefore breaks a constant that user space depends on, and user space does not know the new value: a program might conclude the system has been up for 20 hours when it has actually been up for only two.

  To avoid such errors, the kernel must scale all exported tick counts. It therefore defines USER_HZ to represent the HZ value that user space expects. On x86, HZ was historically always 100, so USER_HZ is defined as 100. The kernel uses jiffies_to_clock_t() to convert a tick count expressed in HZ into a tick count expressed in USER_HZ.

// kernel/time.c
clock_t jiffies_to_clock_t(long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
  return x * (USER_HZ / HZ);
# else
  return x / (HZ / USER_HZ);
# endif
#else
  return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
#endif
}
EXPORT_SYMBOL(jiffies_to_clock_t);

// include/linux/types.h
typedef __kernel_clock_t  clock_t;

// arch/x86/include/asm/posix_types_64.h
typedef long    __kernel_clock_t;

3. Timers

// include/linux/timer.h
struct timer_list {
  struct list_head entry;       /* entry in the timer list */
  unsigned long expires;        /* expiration time, in jiffies */
  void (*function)(unsigned long);  /* the timer handler function */
  unsigned long data;         /* long argument passed to the handler */
  struct tvec_base *base;       /* internal timer data; do not use */
#ifdef CONFIG_TIMER_STATS
  void *start_site;
  char start_comm[16];
  int start_pid;
#endif
#ifdef CONFIG_LOCKDEP
  struct lockdep_map lockdep_map;
#endif
};
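
Typical usage of this 2.6-era API, following the book (my_timer and my_function are illustrative names):

// Illustrative sketch only -- creating and activating a kernel timer.
#include <linux/timer.h>

static void my_function(unsigned long data)
{
  /* runs in softirq (bottom-half) context on expiry; must not sleep */
}

static struct timer_list my_timer;

static void start_my_timer(void)
{
  init_timer(&my_timer);
  my_timer.expires = jiffies + 2 * HZ;   /* fire two seconds from now */
  my_timer.data = 0;                     /* argument passed to the handler */
  my_timer.function = my_function;
  add_timer(&my_timer);                  /* activate the timer */
}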

4. Delaying Execution

(1) Busy waiting

/* spin until 10 ticks have elapsed */
unsigned long timeout = jiffies + 10;
while (time_before(jiffies, timeout))
  ;

/* spin for 5 seconds, but allow rescheduling while waiting */
unsigned long timeout = jiffies + 5 * HZ;
while (time_before(jiffies, timeout))
  cond_resched();

(2) Short delays

(a) API overview
void udelay(unsigned long usecs);
void ndelay(unsigned long nsecs);
void mdelay(unsigned long msecs);
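
For example (illustrative values; these spin the CPU, so prefer msleep() whenever sleeping is acceptable):

udelay(150);   /* busy-wait for 150 microseconds, e.g. a device handshake */
mdelay(2);     /* busy-wait for 2 milliseconds */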


(b) Analysis of mdelay
  1. The mdelay macro
// include/linux/delay.h
#ifndef mdelay
#define mdelay(n) (\
  (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
  ({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
#endif

#ifndef ndelay
static inline void ndelay(unsigned long x)
{
  udelay(DIV_ROUND_UP(x, 1000));
}
#define ndelay(x) ndelay(x)
#endif

extern unsigned long lpj_fine;
void calibrate_delay(void);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);

static inline void ssleep(unsigned int seconds)
{
  msleep(seconds * 1000);
}
  2. The udelay macro
// arch/x86/include/asm/delay.h
/* 0x10c7 is 2**32 / 1000000 (rounded up) */
#define udelay(n) (__builtin_constant_p(n) ? \
  ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
  __udelay(n))

/* 0x5 is 2**32 / 1000000000 (rounded up) */
#define ndelay(n) (__builtin_constant_p(n) ? \
  ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
  __ndelay(n))
  3. The __const_udelay function
// arch/x86/lib/delay.c
inline void __const_udelay(unsigned long xloops)
{
  int d0;

  xloops *= 4;
  asm("mull %%edx"
    :"=d" (xloops), "=&a" (d0)
    :"1" (xloops), "0"
    (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));

  __delay(++xloops);
}
EXPORT_SYMBOL(__const_udelay);

void __udelay(unsigned long usecs)
{
  __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
  __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);

(3) The schedule_timeout function

// kernel/timer.c
signed long __sched schedule_timeout(signed long timeout)
{
  struct timer_list timer;
  unsigned long expire;

  switch (timeout)
  {
  case MAX_SCHEDULE_TIMEOUT:
    /*
     * These two special cases are useful to be comfortable
     * in the caller. Nothing more. We could take
     * MAX_SCHEDULE_TIMEOUT from one of the negative value
     * but I' d like to return a valid offset (>=0) to allow
     * the caller to do everything it want with the retval.
     */
    schedule();
    goto out;
  default:
    /*
     * Another bit of PARANOID. Note that the retval will be
     * 0 since no piece of kernel is supposed to do a check
     * for a negative retval of schedule_timeout() (since it
     * should never happens anyway). You just have the printk()
     * that will tell you if something is gone wrong and where.
     */
    if (timeout < 0) {
      printk(KERN_ERR "schedule_timeout: wrong timeout "
        "value %lx\n", timeout);
      dump_stack();
      current->state = TASK_RUNNING;
      goto out;
    }
  }

  expire = timeout + jiffies;

  setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
  __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
  schedule();
  del_singleshot_timer_sync(&timer);

  /* Remove the timer from the object tracker */
  destroy_timer_on_stack(&timer);

  timeout = expire - jiffies;

 out:
  return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
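
The caller must put itself in a sleeping state first, or schedule_timeout() will not actually sleep; the usual pattern is:

/* sleep for two seconds (interruptible by signals) */
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(2 * HZ);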

IX. Memory Management

1. Pages

// include/linux/mm_types.h
struct page {
  unsigned long flags;    /* Atomic flags, some possibly
           * updated asynchronously */
  atomic_t _count;    /* Usage count, see below. */
  union {
    atomic_t _mapcount; /* Count of ptes mapped in mms,
           * to show when page is mapped
           * & limit reverse map searches.
           */
    struct {    /* SLUB */
      u16 inuse;
      u16 objects;
    };
  };
  union {
      struct {
    unsigned long private;    /* Mapping-private opaque data:
             * usually used for buffer_heads
             * if PagePrivate set; used for
             * swp_entry_t if PageSwapCache;
             * indicates order in the buddy
             * system if PG_buddy is set.
             */
    struct address_space *mapping;  /* If low bit clear, points to
             * inode address_space, or NULL.
             * If page mapped as anonymous
             * memory, low bit is set, and
             * it points to anon_vma object:
             * see PAGE_MAPPING_ANON below.
             */
      };
#if USE_SPLIT_PTLOCKS
      spinlock_t ptl;
#endif
      struct kmem_cache *slab;  /* SLUB: Pointer to slab */
      struct page *first_page;  /* Compound tail pages */
  };
  union {
    pgoff_t index;    /* Our offset within mapping. */
    void *freelist;   /* SLUB: freelist req. slab lock */
  };
  struct list_head lru;   /* Pageout list, eg. active_list
           * protected by zone->lru_lock !
           */
  /*
   * On machines where all RAM is mapped into kernel address space,
   * we can simply calculate the virtual address. On machines with
   * highmem some memory is mapped into kernel virtual memory
   * dynamically, so we need a place to store that address.
   * Note that this field could be 16 bits on x86 ... ;)
   *
   * Architectures with slow multiplication can define
   * WANT_PAGE_VIRTUAL in asm/page.h
   */
#if defined(WANT_PAGE_VIRTUAL)
  void *virtual;      /* Kernel virtual address (NULL if
             not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
  unsigned long debug_flags;  /* Use atomic bitops on this */
#endif

#ifdef CONFIG_KMEMCHECK
  /*
   * kmemcheck wants to track the status of each byte in a page; this
   * is a pointer to such a status block. NULL if not tracked.
   */
  void *shadow;
#endif
};

2. Zones

  Because of hardware limitations, the kernel cannot treat all pages as identical, so it groups them into zones. On x86-32 the principal zones are ZONE_DMA (memory below 16 MB, usable for ISA DMA), ZONE_NORMAL (16 MB to 896 MB, permanently mapped), and ZONE_HIGHMEM (above 896 MB, not permanently mapped into the kernel address space).

3. Page Operations

   The functions are declared in include/linux/gfp.h.

(1) Getting pages
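
The allocation-side interfaces, shown here as book-style prototypes (some of these are actually macros in gfp.h):

struct page *alloc_pages(gfp_t gfp_mask, unsigned int order);  /* 2^order contiguous pages */
struct page *alloc_page(gfp_t gfp_mask);                       /* a single page */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
unsigned long __get_free_page(gfp_t gfp_mask);
unsigned long get_zeroed_page(gfp_t gfp_mask);                 /* returns a zero-filled page */
void *page_address(struct page *page);                         /* struct page -> logical address */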

(2) Releasing pages

extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
extern void free_hot_cold_page(struct page *page, int cold);

#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr),0)

4. The kmalloc Function

    kmalloc() is very similar to the user-space malloc() family of functions, except for the extra flags parameter. It returns a chunk of kernel memory measured in bytes; a usage sketch follows the definitions below.

// include/linux/slab.h
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
#include <linux/slab_def.h>
#endif

static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
{
  if (size != 0 && n > ULONG_MAX / size)
    return NULL;
  return __kmalloc(n * size, flags | __GFP_ZERO);
}
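
Typical usage, always paired with kfree() (a sketch; struct dog follows the book's example):

// Illustrative sketch only -- allocating and freeing with kmalloc()/kfree().
struct dog *p;

p = kmalloc(sizeof(struct dog), GFP_KERNEL);  /* may sleep */
if (!p) {
  /* handle allocation failure */
}

/* ... use p ... */

kfree(p);   /* free exactly once; kfree(NULL) is a harmless no-op */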

(1) gfp_t flags

  1. Action modifiers: control how the allocator may behave, for example whether it is allowed to sleep.

  2. Zone modifiers: specify which memory zone to allocate from, for example __GFP_DMA or __GFP_HIGHMEM.

  3. Type flags: predefined combinations of action and zone modifiers for common situations, such as GFP_KERNEL (process context, may sleep), GFP_ATOMIC (interrupt or other atomic context, may not sleep), and GFP_DMA.
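
In practice the choice usually reduces to two type flags (buf and BUF_SIZE are illustrative names):

buf = kmalloc(BUF_SIZE, GFP_KERNEL);  /* process context; may sleep */
buf = kmalloc(BUF_SIZE, GFP_ATOMIC);  /* atomic context; never sleeps, fails more readily */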




(2) Analysis of kmalloc

  1. The kmalloc function
// include/linux/slab_def.h
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
  struct kmem_cache *cachep;
  void *ret;

  if (__builtin_constant_p(size)) {
    int i = 0;

    if (!size)
      return ZERO_SIZE_PTR;

#define CACHE(x) \
    if (size <= x) \
      goto found; \
    else \
      i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
    return NULL;
found:
#ifdef CONFIG_ZONE_DMA
    if (flags & GFP_DMA)
      cachep = malloc_sizes[i].cs_dmacachep;
    else
#endif
      cachep = malloc_sizes[i].cs_cachep;

    ret = kmem_cache_alloc_notrace(cachep, flags);

    trace_kmalloc(_THIS_IP_, ret,
            size, slab_buffer_size(cachep), flags);

    return ret;
  }
  return __kmalloc(size, flags);
}
  2. The __kmalloc function
// mm/slab.c
void *__kmalloc(size_t size, gfp_t flags)
{
  return __do_kmalloc(size, flags, NULL);
}
EXPORT_SYMBOL(__kmalloc);
  3. The __do_kmalloc function
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
            void *caller)
{
  struct kmem_cache *cachep;
  void *ret;

  /* If you want to save a few bytes .text space: replace
   * __ with kmem_.
   * Then kmalloc uses the uninlined functions instead of the inline
   * functions.
   */
  cachep = __find_general_cachep(size, flags);
  if (unlikely(ZERO_OR_NULL_PTR(cachep)))
    return cachep;
  ret = __cache_alloc(cachep, flags, caller);

  trace_kmalloc((unsigned long) caller, ret,
          size, cachep->buffer_size, flags);

  return ret;
}
  4. The __cache_alloc function

static __always_inline void *
__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
{
  unsigned long save_flags;
  void *objp;

  flags &= gfp_allowed_mask;

  lockdep_trace_alloc(flags);

  if (slab_should_failslab(cachep, flags))
    return NULL;

  cache_alloc_debugcheck_before(cachep, flags);
  local_irq_save(save_flags);
  objp = __do_cache_alloc(cachep, flags);
  local_irq_restore(save_flags);
  objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
  kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
         flags);
  prefetchw(objp);

  if (likely(objp))
    kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));

  if (unlikely((flags & __GFP_ZERO) && objp))
    memset(objp, 0, obj_size(cachep));

  return objp;
}

Linux Kernel Design and Implementation (Part 3): https://developer.aliyun.com/article/1597350
