Linux Kernel Design and Implementation, Part 2

Introduction: Linux Kernel Design and Implementation

Linux Kernel Design and Implementation, Part 1: https://developer.aliyun.com/article/1597346

6. Mutexes

(1) Structure

// include/linux/mutex.h
/*
 * Simple, straightforward mutexes with strict semantics:
 *
 * - only one task can hold the mutex at a time
 * - only the owner can unlock the mutex
 * - multiple unlocks are not permitted
 * - recursive locking is not permitted
 * - a mutex object must be initialized via the API
 * - a mutex object must not be initialized via memset or copying
 * - task may not exit with mutex held
 * - memory areas where held locks reside must not be freed
 * - held mutexes must not be reinitialized
 * - mutexes may not be used in hardware or software interrupt
 *   contexts such as tasklets and timers
 *
 * These semantics are fully enforced when DEBUG_MUTEXES is
 * enabled. Furthermore, besides enforcing the above rules, the mutex
 * debugging code also implements a number of additional features
 * that make lock debugging easier and faster:
 *
 * - uses symbolic names of mutexes, whenever they are printed in debug output
 * - point-of-acquire tracking, symbolic lookup of function names
 * - list of all locks held in the system, printout of them
 * - owner tracking
 * - detects self-recursing locks and prints out all relevant info
 * - detects multi-task circular deadlocks and prints out all affected
 *   locks and tasks (and only those tasks)
 */
struct mutex {
  /* 1: unlocked, 0: locked, negative: locked, possible waiters */
  atomic_t    count;
  spinlock_t    wait_lock;
  struct list_head  wait_list;
#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
  struct thread_info  *owner;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
  const char    *name;
  void      *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
  struct lockdep_map  dep_map;
#endif
};

(2) Interface overview

   The functions are declared in include/linux/mutex.h.
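A minimal usage sketch of the core interface (my_mutex, shared_count, and update_count are illustrative names, not kernel API):

/* Minimal usage sketch; the names are illustrative. */
static DEFINE_MUTEX(my_mutex);    /* statically define an unlocked mutex */
static int shared_count;

static void update_count(void)
{
  mutex_lock(&my_mutex);    /* may sleep; never use in interrupt context */
  shared_count++;
  mutex_unlock(&my_mutex);  /* must be released by the task that locked it */
}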

(3) Analysis of mutex_lock

  1. The mutex_lock function
// include/linux/mutex.h
extern void mutex_lock(struct mutex *lock);

// kernel/mutex.c
/***
 * mutex_lock - acquire the mutex
 * @lock: the mutex to be acquired
 *
 * Lock the mutex exclusively for this task. If the mutex is not
 * available right now, it will sleep until it can get it.
 *
 * The mutex must later on be released by the same task that
 * acquired it. Recursive locking is not allowed. The task
 * may not exit without first unlocking the mutex. Also, kernel
 * memory where the mutex resides mutex must not be freed with
 * the mutex still locked. The mutex must first be initialized
 * (or statically defined) before it can be locked. memset()-ing
 * the mutex to 0 is not allowed.
 *
 * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
 *   checks that will enforce the restrictions and will also do
 *   deadlock debugging. )
 *
 * This function is similar to (but not equivalent to) down().
 */
void __sched mutex_lock(struct mutex *lock)
{
  might_sleep();
  /*
   * The locking fastpath is the 1->0 transition from
   * 'unlocked' into 'locked' state.
   */
  __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
  mutex_set_owner(lock);
}

EXPORT_SYMBOL(mutex_lock);
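The fastpath itself is architecture-specific. For reference, the asm-generic variant of this kernel era (include/asm-generic/mutex-dec.h) atomically decrements the count and only falls into the slowpath when the result shows contention:

// include/asm-generic/mutex-dec.h
static inline void
__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
  if (unlikely(atomic_dec_return(count) < 0))
    fail_fn(count);   /* count went negative: contended, take the slowpath */
}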

(4) Usage comparison

Guidelines for choosing between a spin lock and a mutex (following the book's summary table):

  Requirement                              Recommended lock
  Low overhead locking                     Spin lock is preferred
  Short lock hold time                     Spin lock is preferred
  Long lock hold time                      Mutex is preferred
  Need to lock from interrupt context      Spin lock is required
  Need to sleep while holding the lock     Mutex is required

7. Completion Variables

(1) Structure

// include/linux/completion.h
/**
 * struct completion - structure used to maintain state for a "completion"
 *
 * This is the opaque structure used to maintain the state for a "completion".
 * Completions currently use a FIFO to queue threads that have to wait for
 * the "completion" event.
 *
 * See also:  complete(), wait_for_completion() (and friends _timeout,
 * _interruptible, _interruptible_timeout, and _killable), init_completion(),
 * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
 * INIT_COMPLETION().
 */
struct completion {
  unsigned int done;
  wait_queue_head_t wait;
};


// include/linux/wait.h
struct __wait_queue_head {
  spinlock_t lock;
  struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;

struct __wait_queue {
  unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
  void *private;
  wait_queue_func_t func;
  struct list_head task_list;
};

(2) Interface overview

   The functions are declared in include/linux/completion.h.
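The typical pattern has one task wait while another signals. A minimal sketch (setup_done and the function names are illustrative):

/* Minimal usage sketch; the names are illustrative. */
static DECLARE_COMPLETION(setup_done);

static int waiter_thread(void *unused)
{
  wait_for_completion(&setup_done);  /* sleep until complete() is called */
  return 0;
}

static void finisher(void)
{
  /* ... finish the shared work ... */
  complete(&setup_done);  /* wake up one waiting task */
}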

(3) Analysis of complete

  1. The complete function
// kernel/sched.c
void complete(struct completion *x)
{
  unsigned long flags;

  spin_lock_irqsave(&x->wait.lock, flags);
  x->done++;
  __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
  spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
  2. The __wake_up_common function
// kernel/sched.c
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
      int nr_exclusive, int wake_flags, void *key)
{
  wait_queue_t *curr, *next;

  list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
    unsigned flags = curr->flags;

    if (curr->func(curr, mode, wake_flags, key) &&
        (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
      break;
  }
}

8. BKL: The Big Kernel Lock

(1) Interface overview

   The functions are declared in include/linux/smp_lock.h.
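Usage is a plain lock/unlock pair; as the code below shows, the lock is recursive per task (via lock_depth). A minimal sketch (note the BKL was removed entirely in kernel 2.6.39):

/* Minimal usage sketch of the Big Kernel Lock. */
lock_kernel();
/* ... critical section; the BKL is dropped and re-acquired
 * around schedule() if the holder sleeps ... */
unlock_kernel();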

(2) Analysis of lock_kernel

  1. The lock_kernel function
// include/linux/smp_lock.h
#define lock_kernel() do {          \
  _lock_kernel(__func__, __FILE__, __LINE__);   \
} while (0)
  2. The _lock_kernel function
// lib/kernel_lock.c
void __lockfunc _lock_kernel(const char *func, const char *file, int line)
{
  int depth = current->lock_depth + 1;

  trace_lock_kernel(func, file, line);

  if (likely(!depth)) {
    might_sleep();
    __lock_kernel();
  }
  current->lock_depth = depth;
}
  3. The __lock_kernel function
// lib/kernel_lock.c
static inline void __lock_kernel(void)
{
  do_raw_spin_lock(&kernel_flag);
}

9. Sequential Locks (seqlocks)

(1) Structure

// include/linux/seqlock.h
typedef struct {
  unsigned sequence;
  spinlock_t lock;
} seqlock_t;

(2) Interface overview

// include/linux/seqlock.h
/*
 * These macros triggered gcc-3.x compile-time problems.  We think these are
 * OK now.  Be cautious.
 */
#define __SEQLOCK_UNLOCKED(lockname) \
     { 0, __SPIN_LOCK_UNLOCKED(lockname) }

#define SEQLOCK_UNLOCKED \
     __SEQLOCK_UNLOCKED(old_style_seqlock_init)

#define seqlock_init(x)         \
  do {            \
    (x)->sequence = 0;      \
    spin_lock_init(&(x)->lock);   \
  } while (0)

#define DEFINE_SEQLOCK(x) \
    seqlock_t x = __SEQLOCK_UNLOCKED(x)

/* Lock out other writers and update the count.
 * Acts like a normal spin_lock/unlock.
 * Don't need preempt_disable() because that is in the spin_lock already.
 */
static inline void write_seqlock(seqlock_t *sl)
{
  spin_lock(&sl->lock);
  ++sl->sequence;
  smp_wmb();
}

static inline void write_sequnlock(seqlock_t *sl)
{
  smp_wmb();
  sl->sequence++;
  spin_unlock(&sl->lock);
}

static inline int write_tryseqlock(seqlock_t *sl)
{
  int ret = spin_trylock(&sl->lock);

  if (ret) {
    ++sl->sequence;
    smp_wmb();
  }
  return ret;
}

/* Start of read calculation -- fetch last complete writer token */
static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
{
  unsigned ret;

repeat:
  ret = sl->sequence;
  smp_rmb();
  if (unlikely(ret & 1)) {
    cpu_relax();
    goto repeat;
  }

  return ret;
}

/*
 * Test if reader processed invalid data.
 *
 * If sequence value changed then writer changed data while in section.
 */
static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start)
{
  smp_rmb();

  return (sl->sequence != start);
}


/*
 * Version using sequence counter only.
 * This can be used when code has its own mutex protecting the
 * updating starting before the write_seqcount_begin() and ending
 * after the write_seqcount_end().
 */

typedef struct seqcount {
  unsigned sequence;
} seqcount_t;

#define SEQCNT_ZERO { 0 }
#define seqcount_init(x)  do { *(x) = (seqcount_t) SEQCNT_ZERO; } while (0)

/* Start of read using pointer to a sequence counter only.  */
static inline unsigned read_seqcount_begin(const seqcount_t *s)
{
  unsigned ret;

repeat:
  ret = s->sequence;
  smp_rmb();
  if (unlikely(ret & 1)) {
    cpu_relax();
    goto repeat;
  }
  return ret;
}

/*
 * Test if reader processed invalid data because sequence number has changed.
 */
static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
{
  smp_rmb();

  return s->sequence != start;
}


/*
 * Sequence counter only version assumes that callers are using their
 * own mutexing.
 */
static inline void write_seqcount_begin(seqcount_t *s)
{
  s->sequence++;
  smp_wmb();
}

static inline void write_seqcount_end(seqcount_t *s)
{
  smp_wmb();
  s->sequence++;
}

/*
 * Possible sw/hw IRQ protected versions of the interfaces.
 */
#define write_seqlock_irqsave(lock, flags)        \
  do { local_irq_save(flags); write_seqlock(lock); } while (0)
#define write_seqlock_irq(lock)           \
  do { local_irq_disable();   write_seqlock(lock); } while (0)
#define write_seqlock_bh(lock)            \
        do { local_bh_disable();    write_seqlock(lock); } while (0)

#define write_sequnlock_irqrestore(lock, flags)       \
  do { write_sequnlock(lock); local_irq_restore(flags); } while(0)
#define write_sequnlock_irq(lock)         \
  do { write_sequnlock(lock); local_irq_enable(); } while(0)
#define write_sequnlock_bh(lock)          \
  do { write_sequnlock(lock); local_bh_enable(); } while(0)

#define read_seqbegin_irqsave(lock, flags)        \
  ({ local_irq_save(flags);   read_seqbegin(lock); })

#define read_seqretry_irqrestore(lock, iv, flags)     \
  ({                \
    int ret = read_seqretry(lock, iv);      \
    local_irq_restore(flags);       \
    ret;              \
  })
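On the read side, the canonical pattern retries until it observes a stable, even sequence value. A minimal sketch (foo_lock and foo are illustrative names):

/* Minimal sketch of the canonical seqlock read loop. */
static DEFINE_SEQLOCK(foo_lock);
static unsigned long foo;

static unsigned long read_foo(void)
{
  unsigned long seq, val;

  do {
    seq = read_seqbegin(&foo_lock);   /* spins while a write is in progress */
    val = foo;                        /* read the shared data */
  } while (read_seqretry(&foo_lock, seq)); /* retry if a writer intervened */

  return val;
}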

10. Preemption Disabling
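Kernel preemption is disabled and re-enabled with a nesting pair of calls backed by the per-task preempt_count. A minimal sketch:

/* Minimal sketch: protect per-CPU data by disabling kernel preemption.
 * The calls nest: preemption is re-enabled only when the count drops to 0. */
preempt_disable();
/* ... manipulate per-CPU data safely ... */
preempt_enable();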

11. Ordering and Barriers
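The kernel's barrier primitives, rmb(), wmb(), mb(), their smp_* variants, and the compiler barrier barrier(), enforce ordering of memory accesses. A minimal sketch of the classic producer/consumer pairing (data and ready are illustrative variables):

/* Writer: make the store to data visible before the store to ready. */
data = 42;     /* illustrative */
smp_wmb();
ready = 1;

/* Reader: pair a read barrier with the writer's write barrier. */
if (ready) {
  smp_rmb();
  /* ... data is guaranteed to be visible here ... */
}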

VIII. Timers and Time Management

1. The Tick Rate: HZ

// include/linux/param.h

#ifndef _LINUX_PARAM_H
#define _LINUX_PARAM_H

#include <asm/param.h>

#endif

// arch/x86/include/asm/param.h
#include <asm-generic/param.h>

// include/asm-generic/param.h
#ifdef __KERNEL__
# define HZ   CONFIG_HZ /* Internal kernel timer frequency */
# define USER_HZ  100   /* some user interfaces are */
# define CLOCKS_PER_SEC (USER_HZ)       /* in "ticks" like times() */
#endif

#ifndef HZ
#define HZ 100
#endif

2. jiffies

(1) Internal representation of jiffies

// include/linux/jiffies.h
extern u64 __jiffy_data jiffies_64;
extern unsigned long volatile __jiffy_data jiffies;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void);
#else
static inline u64 get_jiffies_64(void)
{
  return (u64)jiffies;
}
#endif


  jiffies is an unsigned long, so it is 32 bits wide on 32-bit architectures and 64 bits wide on 64-bit architectures (jiffies_64 is always 64 bits).

(2) jiffies wraparound

#define time_after(a,b)   \
  (typecheck(unsigned long, a) && \
   typecheck(unsigned long, b) && \
   ((long)(b) - (long)(a) < 0))
#define time_before(a,b)  time_after(b,a)

#define time_after_eq(a,b)  \
  (typecheck(unsigned long, a) && \
   typecheck(unsigned long, b) && \
   ((long)(a) - (long)(b) >= 0))
#define time_before_eq(a,b) time_after_eq(b,a)

/*
 * Calculate whether a is in the range of [b, c].
 */
#define time_in_range(a,b,c) \
  (time_after_eq(a,b) && \
   time_before_eq(a,c))

/*
 * Calculate whether a is in the range of [b, c).
 */
#define time_in_range_open(a,b,c) \
  (time_after_eq(a,b) && \
   time_before(a,c))

/* Same as above, but does so with platform independent 64bit types.
 * These must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64() */
#define time_after64(a,b) \
  (typecheck(__u64, a) && \
   typecheck(__u64, b) && \
   ((__s64)(b) - (__s64)(a) < 0))
#define time_before64(a,b)  time_after64(b,a)

#define time_after_eq64(a,b)  \
  (typecheck(__u64, a) && \
   typecheck(__u64, b) && \
   ((__s64)(a) - (__s64)(b) >= 0))
#define time_before_eq64(a,b) time_after_eq64(b,a)

/*
 * These four macros compare jiffies and 'a' for convenience.
 */

/* time_is_before_jiffies(a) return true if a is before jiffies */
#define time_is_before_jiffies(a) time_after(jiffies, a)

/* time_is_after_jiffies(a) return true if a is after jiffies */
#define time_is_after_jiffies(a) time_before(jiffies, a)

/* time_is_before_eq_jiffies(a) return true if a is before or equal to jiffies*/
#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)

/* time_is_after_eq_jiffies(a) return true if a is after or equal to jiffies*/
#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)

/*
 * Have the 32 bit jiffies value wrap 5 minutes after boot
 * so jiffies wrap bugs show up earlier.
 */
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
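A minimal sketch of a wraparound-safe timeout check using these macros:

/* Wraparound-safe timeout check; the interval and action are illustrative. */
unsigned long timeout = jiffies + HZ / 2;  /* half a second from now */

/* ... do some work ... */

if (time_after(jiffies, timeout))
  printk(KERN_WARNING "timed out\n");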

(3) User space and HZ

  In kernels before 2.6, changing the kernel's HZ value caused anomalous results for some user-space programs. This is because the kernel exports timing values to user space in units of ticks per second, and once that interface had been stable for a long time, applications came to depend on the specific value of HZ. Changing the definition of HZ therefore breaks the constant user space relies on, since user space is not told the new value: a program might believe the system has been up for 20 hours when it has really been up for only two.

  To avoid this problem, the kernel must scale every jiffies value it exports. It therefore defines USER_HZ to represent the HZ value that user space sees. On the x86 architecture, HZ was historically 100, so USER_HZ is defined as 100. The kernel uses jiffies_to_clock_t() to convert a tick count expressed in HZ into a tick count expressed in USER_HZ.

// kernel/time.c
clock_t jiffies_to_clock_t(long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
  return x * (USER_HZ / HZ);
# else
  return x / (HZ / USER_HZ);
# endif
#else
  return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
#endif
}
EXPORT_SYMBOL(jiffies_to_clock_t);

// include/linux/types.h
typedef __kernel_clock_t  clock_t;

// arch/x86/include/asm/posix_types_64.h
typedef long    __kernel_clock_t;

3. Timers

// include/linux/timer.h
struct timer_list {
  struct list_head entry;       /* entry in the timer list */
  unsigned long expires;        /* expiration time, in jiffies */
  void (*function)(unsigned long);  /* the timer handler function */
  unsigned long data;         /* long argument passed to the handler */
  struct tvec_base *base;       /* internal timer field; do not touch */
#ifdef CONFIG_TIMER_STATS
  void *start_site;
  char start_comm[16];
  int start_pid;
#endif
#ifdef CONFIG_LOCKDEP
  struct lockdep_map lockdep_map;
#endif
};
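A minimal usage sketch of this timer API (my_timer, my_handler, and start_my_timer are illustrative names):

/* Minimal sketch of the classic timer API; names are illustrative. */
static void my_handler(unsigned long data)
{
  /* runs in softirq context after the timer expires */
}

static struct timer_list my_timer;

static void start_my_timer(void)
{
  init_timer(&my_timer);
  my_timer.expires = jiffies + 2 * HZ;  /* fire two seconds from now */
  my_timer.data = 0;                    /* passed to the handler */
  my_timer.function = my_handler;
  add_timer(&my_timer);
  /* cancel with del_timer_sync(&my_timer) before the timer memory goes away */
}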

4. Delayed Execution

(1) Busy waiting

/* Spin until 10 ticks have gone by */
unsigned long timeout = jiffies + 10;
while (time_before(jiffies, timeout))
  ;

/* Spin for 5 seconds, yielding the processor to other tasks meanwhile */
unsigned long timeout = jiffies + 5 * HZ;
while (time_before(jiffies, timeout))
  cond_resched();

(2) Short delays

(a) Interface overview
void udelay(unsigned long usecs);
void ndelay(unsigned long nsecs);
void mdelay(unsigned long msecs);


(b) The mdelay function
  1. The mdelay function
// include/linux/delay.h
#ifndef mdelay
#define mdelay(n) (\
  (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
  ({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
#endif

#ifndef ndelay
static inline void ndelay(unsigned long x)
{
  udelay(DIV_ROUND_UP(x, 1000));
}
#define ndelay(x) ndelay(x)
#endif

extern unsigned long lpj_fine;
void calibrate_delay(void);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);

static inline void ssleep(unsigned int seconds)
{
  msleep(seconds * 1000);
}
  2. The udelay function
// arch/x86/include/asm/delay.h
/* 0x10c7 is 2**32 / 1000000 (rounded up) */
#define udelay(n) (__builtin_constant_p(n) ? \
  ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
  __udelay(n))

/* 0x5 is 2**32 / 1000000000 (rounded up) */
#define ndelay(n) (__builtin_constant_p(n) ? \
  ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
  __ndelay(n))
  3. The __const_udelay function
// arch/x86/lib/delay.c
inline void __const_udelay(unsigned long xloops)
{
  int d0;

  xloops *= 4;
  asm("mull %%edx"
    :"=d" (xloops), "=&a" (d0)
    :"1" (xloops), "0"
    (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));

  __delay(++xloops);
}
EXPORT_SYMBOL(__const_udelay);

void __udelay(unsigned long usecs)
{
  __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
  __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);

(3) The schedule_timeout function

// kernel/timer.c
signed long __sched schedule_timeout(signed long timeout)
{
  struct timer_list timer;
  unsigned long expire;

  switch (timeout)
  {
  case MAX_SCHEDULE_TIMEOUT:
    /*
     * These two special cases are useful to be comfortable
     * in the caller. Nothing more. We could take
     * MAX_SCHEDULE_TIMEOUT from one of the negative value
     * but I' d like to return a valid offset (>=0) to allow
     * the caller to do everything it want with the retval.
     */
    schedule();
    goto out;
  default:
    /*
     * Another bit of PARANOID. Note that the retval will be
     * 0 since no piece of kernel is supposed to do a check
     * for a negative retval of schedule_timeout() (since it
     * should never happens anyway). You just have the printk()
     * that will tell you if something is gone wrong and where.
     */
    if (timeout < 0) {
      printk(KERN_ERR "schedule_timeout: wrong timeout "
        "value %lx\n", timeout);
      dump_stack();
      current->state = TASK_RUNNING;
      goto out;
    }
  }

  expire = timeout + jiffies;

  setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
  __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
  schedule();
  del_singleshot_timer_sync(&timer);

  /* Remove the timer from the object tracker */
  destroy_timer_on_stack(&timer);

  timeout = expire - jiffies;

 out:
  return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
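A minimal usage sketch: put the current task to sleep for at least s seconds (the caller must set the task state first, otherwise schedule_timeout() will not sleep):

/* Minimal usage sketch; s is an illustrative variable. */
set_current_state(TASK_INTERRUPTIBLE);  /* or TASK_UNINTERRUPTIBLE */
schedule_timeout(s * HZ);               /* sleeps for at least s seconds */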

IX. Memory Management

1. Pages

// include/linux/mm_types.h
struct page {
  unsigned long flags;    /* Atomic flags, some possibly
           * updated asynchronously */
  atomic_t _count;    /* Usage count, see below. */
  union {
    atomic_t _mapcount; /* Count of ptes mapped in mms,
           * to show when page is mapped
           * & limit reverse map searches.
           */
    struct {    /* SLUB */
      u16 inuse;
      u16 objects;
    };
  };
  union {
      struct {
    unsigned long private;    /* Mapping-private opaque data:
             * usually used for buffer_heads
             * if PagePrivate set; used for
             * swp_entry_t if PageSwapCache;
             * indicates order in the buddy
             * system if PG_buddy is set.
             */
    struct address_space *mapping;  /* If low bit clear, points to
             * inode address_space, or NULL.
             * If page mapped as anonymous
             * memory, low bit is set, and
             * it points to anon_vma object:
             * see PAGE_MAPPING_ANON below.
             */
      };
#if USE_SPLIT_PTLOCKS
      spinlock_t ptl;
#endif
      struct kmem_cache *slab;  /* SLUB: Pointer to slab */
      struct page *first_page;  /* Compound tail pages */
  };
  union {
    pgoff_t index;    /* Our offset within mapping. */
    void *freelist;   /* SLUB: freelist req. slab lock */
  };
  struct list_head lru;   /* Pageout list, eg. active_list
           * protected by zone->lru_lock !
           */
  /*
   * On machines where all RAM is mapped into kernel address space,
   * we can simply calculate the virtual address. On machines with
   * highmem some memory is mapped into kernel virtual memory
   * dynamically, so we need a place to store that address.
   * Note that this field could be 16 bits on x86 ... ;)
   *
   * Architectures with slow multiplication can define
   * WANT_PAGE_VIRTUAL in asm/page.h
   */
#if defined(WANT_PAGE_VIRTUAL)
  void *virtual;      /* Kernel virtual address (NULL if
             not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
  unsigned long debug_flags;  /* Use atomic bitops on this */
#endif

#ifdef CONFIG_KMEMCHECK
  /*
   * kmemcheck wants to track the status of each byte in a page; this
   * is a pointer to such a status block. NULL if not tracked.
   */
  void *shadow;
#endif
};

2. Zones

   Because of hardware limitations, the kernel groups physical pages into zones. On x86-32 these are ZONE_DMA (memory below 16 MB, usable by legacy DMA devices), ZONE_NORMAL (16 MB to 896 MB, permanently mapped into the kernel address space) and ZONE_HIGHMEM (memory above 896 MB, not permanently mapped).

3. Page Operations

   The functions are declared in include/linux/gfp.h.

(1) Getting pages
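For reference, the allocation side of this interface in the same header looks approximately as follows (paraphrased from the same kernel era; the NUMA variants are omitted):

struct page *alloc_pages(gfp_t gfp_mask, unsigned int order);
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
#define __get_free_page(gfp_mask) __get_free_pages((gfp_mask), 0)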

(2) Freeing pages

extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
extern void free_hot_cold_page(struct page *page, int cold);

#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr),0)

4. The kmalloc Function

    The kmalloc() function is very similar to user space's malloc() family, except that it takes an additional flags parameter. It allocates a region of kernel memory whose size is given in bytes.
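A minimal usage sketch (struct dog is an illustrative type):

/* Minimal usage sketch; struct dog is illustrative. */
struct dog *p;

p = kmalloc(sizeof(struct dog), GFP_KERNEL);
if (!p)
  return -ENOMEM;  /* allocation can fail; always check */
/* ... use p ... */
kfree(p);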

// include/linux/slab.h
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
#include <linux/slab_def.h>
#endif

static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
{
  if (size != 0 && n > ULONG_MAX / size)
    return NULL;
  return __kmalloc(n * size, flags | __GFP_ZERO);
}

(1) gfp_t flags

  1. Action modifiers: control how the allocation behaves, e.g. whether it may sleep or start I/O (__GFP_WAIT, __GFP_IO, __GFP_FS).

  2. Zone modifiers: control which zone the memory is allocated from (__GFP_DMA, __GFP_HIGHMEM).

  3. Type flags: predefined combinations of action and zone modifiers for common situations, e.g. GFP_KERNEL (process context, may sleep) and GFP_ATOMIC (interrupt context, may not sleep).
(2) Analysis of kmalloc

  1. The kmalloc function
// include/linux/slab_def.h
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
  struct kmem_cache *cachep;
  void *ret;

  if (__builtin_constant_p(size)) {
    int i = 0;

    if (!size)
      return ZERO_SIZE_PTR;

#define CACHE(x) \
    if (size <= x) \
      goto found; \
    else \
      i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
    return NULL;
found:
#ifdef CONFIG_ZONE_DMA
    if (flags & GFP_DMA)
      cachep = malloc_sizes[i].cs_dmacachep;
    else
#endif
      cachep = malloc_sizes[i].cs_cachep;

    ret = kmem_cache_alloc_notrace(cachep, flags);

    trace_kmalloc(_THIS_IP_, ret,
            size, slab_buffer_size(cachep), flags);

    return ret;
  }
  return __kmalloc(size, flags);
}
  2. The __kmalloc function
// mm/slab.c
void *__kmalloc(size_t size, gfp_t flags)
{
  return __do_kmalloc(size, flags, NULL);
}
EXPORT_SYMBOL(__kmalloc);
  3. The __do_kmalloc function
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
            void *caller)
{
  struct kmem_cache *cachep;
  void *ret;

  /* If you want to save a few bytes .text space: replace
   * __ with kmem_.
   * Then kmalloc uses the uninlined functions instead of the inline
   * functions.
   */
  cachep = __find_general_cachep(size, flags);
  if (unlikely(ZERO_OR_NULL_PTR(cachep)))
    return cachep;
  ret = __cache_alloc(cachep, flags, caller);

  trace_kmalloc((unsigned long) caller, ret,
          size, cachep->buffer_size, flags);

  return ret;
}
  4. The __cache_alloc function

static __always_inline void *
__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
{
  unsigned long save_flags;
  void *objp;

  flags &= gfp_allowed_mask;

  lockdep_trace_alloc(flags);

  if (slab_should_failslab(cachep, flags))
    return NULL;

  cache_alloc_debugcheck_before(cachep, flags);
  local_irq_save(save_flags);
  objp = __do_cache_alloc(cachep, flags);
  local_irq_restore(save_flags);
  objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
  kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
         flags);
  prefetchw(objp);

  if (likely(objp))
    kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));

  if (unlikely((flags & __GFP_ZERO) && objp))
    memset(objp, 0, obj_size(cachep));

  return objp;
}

Linux Kernel Design and Implementation, Part 3: https://developer.aliyun.com/article/1597350
