Linux Kernel Design and Implementation (2)

Introduction: Linux Kernel Design and Implementation

Linux Kernel Design and Implementation (1): https://developer.aliyun.com/article/1597346

6. Mutexes

(1) Structure

// include/linux/mutex.h
/*
 * Simple, straightforward mutexes with strict semantics:
 *
 * - only one task can hold the mutex at a time
 * - only the owner can unlock the mutex
 * - multiple unlocks are not permitted
 * - recursive locking is not permitted
 * - a mutex object must be initialized via the API
 * - a mutex object must not be initialized via memset or copying
 * - task may not exit with mutex held
 * - memory areas where held locks reside must not be freed
 * - held mutexes must not be reinitialized
 * - mutexes may not be used in hardware or software interrupt
 *   contexts such as tasklets and timers
 *
 * These semantics are fully enforced when DEBUG_MUTEXES is
 * enabled. Furthermore, besides enforcing the above rules, the mutex
 * debugging code also implements a number of additional features
 * that make lock debugging easier and faster:
 *
 * - uses symbolic names of mutexes, whenever they are printed in debug output
 * - point-of-acquire tracking, symbolic lookup of function names
 * - list of all locks held in the system, printout of them
 * - owner tracking
 * - detects self-recursing locks and prints out all relevant info
 * - detects multi-task circular deadlocks and prints out all affected
 *   locks and tasks (and only those tasks)
 */
struct mutex {
  /* 1: unlocked, 0: locked, negative: locked, possible waiters */
  atomic_t    count;
  spinlock_t    wait_lock;
  struct list_head  wait_list;
#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
  struct thread_info  *owner;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
  const char    *name;
  void      *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
  struct lockdep_map  dep_map;
#endif
};

(2) Methods

   The mutex API is declared in include/linux/mutex.h.

(3) Analysis of the mutex_lock function

  1. The mutex_lock function
// include/linux/mutex.h
extern void mutex_lock(struct mutex *lock);

// kernel/mutex.c
/***
 * mutex_lock - acquire the mutex
 * @lock: the mutex to be acquired
 *
 * Lock the mutex exclusively for this task. If the mutex is not
 * available right now, it will sleep until it can get it.
 *
 * The mutex must later on be released by the same task that
 * acquired it. Recursive locking is not allowed. The task
 * may not exit without first unlocking the mutex. Also, kernel
 * memory where the mutex resides must not be freed with
 * the mutex still locked. The mutex must first be initialized
 * (or statically defined) before it can be locked. memset()-ing
 * the mutex to 0 is not allowed.
 *
 * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
 *   checks that will enforce the restrictions and will also do
 *   deadlock debugging. )
 *
 * This function is similar to (but not equivalent to) down().
 */
void __sched mutex_lock(struct mutex *lock)
{
  might_sleep();
  /*
   * The locking fastpath is the 1->0 transition from
   * 'unlocked' into 'locked' state.
   */
  __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
  mutex_set_owner(lock);
}

EXPORT_SYMBOL(mutex_lock);

(4) Usage comparison

   As a rule of thumb: start with a mutex unless one of its constraints (no recursion, only the owner may unlock, no use in interrupt context) rules it out. Prefer a spinlock when the lock hold time is short or the code cannot sleep, and a mutex when the hold time is long or the task needs to sleep while holding the lock.
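
A minimal usage sketch (the function and mutex names are illustrative; DEFINE_MUTEX(), mutex_lock(), mutex_trylock() and mutex_unlock() are the real API from include/linux/mutex.h):

#include <linux/mutex.h>

static DEFINE_MUTEX(my_mutex);      /* statically initialized mutex */

static void update_shared_state(void)
{
  mutex_lock(&my_mutex);            /* sleeps until the mutex is acquired */
  /* ... touch data protected by my_mutex ... */
  mutex_unlock(&my_mutex);          /* must be called by the acquiring task */

  /* non-blocking variant: returns 1 on success, 0 if already held */
  if (mutex_trylock(&my_mutex)) {
    /* ... */
    mutex_unlock(&my_mutex);
  }
}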

7. Completion Variables

(1) Structure

// include/linux/completion.h
/**
 * struct completion - structure used to maintain state for a "completion"
 *
 * This is the opaque structure used to maintain the state for a "completion".
 * Completions currently use a FIFO to queue threads that have to wait for
 * the "completion" event.
 *
 * See also:  complete(), wait_for_completion() (and friends _timeout,
 * _interruptible, _interruptible_timeout, and _killable), init_completion(),
 * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
 * INIT_COMPLETION().
 */
struct completion {
  unsigned int done;
  wait_queue_head_t wait;
};


// include/linux/wait.h
struct __wait_queue_head {
  spinlock_t lock;
  struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;

struct __wait_queue {
  unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
  void *private;
  wait_queue_func_t func;
  struct list_head task_list;
};

(2) Methods

   The completion API is declared in include/linux/completion.h.

(3) Analysis of the complete function

  1. The complete function
// kernel/sched.c
void complete(struct completion *x)
{
  unsigned long flags;

  spin_lock_irqsave(&x->wait.lock, flags);
  x->done++;
  __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
  spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
  2. The __wake_up_common function
// kernel/sched.c
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
      int nr_exclusive, int wake_flags, void *key)
{
  wait_queue_t *curr, *next;

  list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
    unsigned flags = curr->flags;

    if (curr->func(curr, mode, wake_flags, key) &&
        (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
      break;
  }
}
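
A minimal usage sketch of a completion (the task roles are illustrative; DECLARE_COMPLETION(), wait_for_completion() and complete() are the real interfaces from include/linux/completion.h):

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);  /* statically initialized */

/* task A: blocks until the event is signalled */
static void consumer(void)
{
  wait_for_completion(&setup_done);
  /* ... setup is now guaranteed to have finished ... */
}

/* task B: signals the event, waking one exclusive waiter */
static void producer(void)
{
  /* ... perform the setup work ... */
  complete(&setup_done);
}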

8. BKL: The Big Kernel Lock

(1) Methods

   The BKL interface is declared in include/linux/smp_lock.h.

(2) Analysis of lock_kernel

  1. The lock_kernel macro
// include/linux/smp_lock.h
#define lock_kernel() do {          \
  _lock_kernel(__func__, __FILE__, __LINE__);   \
} while (0)
  2. The _lock_kernel function
// lib/kernel_lock.c
void __lockfunc _lock_kernel(const char *func, const char *file, int line)
{
  int depth = current->lock_depth + 1;

  trace_lock_kernel(func, file, line);

  if (likely(!depth)) {
    might_sleep();
    __lock_kernel();
  }
  current->lock_depth = depth;
}
  3. The __lock_kernel function
// lib/kernel_lock.c
static inline void __lock_kernel(void)
{
  do_raw_spin_lock(&kernel_flag);
}
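
A minimal usage sketch (as the lock_depth handling above shows, the BKL is recursive per task; lock_kernel() and unlock_kernel() are the real interface, long since removed from modern kernels):

#include <linux/smp_lock.h>

static void legacy_code_path(void)
{
  lock_kernel();    /* may sleep; re-acquisition by the same task only bumps lock_depth */
  /* ... code serialized against all other BKL holders ... */
  unlock_kernel();
}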

9. Sequential Locks (Seqlocks)

(1) Structure

// include/linux/seqlock.h
typedef struct {
  unsigned sequence;
  spinlock_t lock;
} seqlock_t;

(2) Methods

// include/linux/seqlock.h
/*
 * These macros triggered gcc-3.x compile-time problems.  We think these are
 * OK now.  Be cautious.
 */
#define __SEQLOCK_UNLOCKED(lockname) \
     { 0, __SPIN_LOCK_UNLOCKED(lockname) }

#define SEQLOCK_UNLOCKED \
     __SEQLOCK_UNLOCKED(old_style_seqlock_init)

#define seqlock_init(x)         \
  do {            \
    (x)->sequence = 0;      \
    spin_lock_init(&(x)->lock);   \
  } while (0)

#define DEFINE_SEQLOCK(x) \
    seqlock_t x = __SEQLOCK_UNLOCKED(x)

/* Lock out other writers and update the count.
 * Acts like a normal spin_lock/unlock.
 * Don't need preempt_disable() because that is in the spin_lock already.
 */
static inline void write_seqlock(seqlock_t *sl)
{
  spin_lock(&sl->lock);
  ++sl->sequence;
  smp_wmb();
}

static inline void write_sequnlock(seqlock_t *sl)
{
  smp_wmb();
  sl->sequence++;
  spin_unlock(&sl->lock);
}

static inline int write_tryseqlock(seqlock_t *sl)
{
  int ret = spin_trylock(&sl->lock);

  if (ret) {
    ++sl->sequence;
    smp_wmb();
  }
  return ret;
}

/* Start of read calculation -- fetch last complete writer token */
static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
{
  unsigned ret;

repeat:
  ret = sl->sequence;
  smp_rmb();
  if (unlikely(ret & 1)) {
    cpu_relax();
    goto repeat;
  }

  return ret;
}

/*
 * Test if reader processed invalid data.
 *
 * If sequence value changed then writer changed data while in section.
 */
static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start)
{
  smp_rmb();

  return (sl->sequence != start);
}


/*
 * Version using sequence counter only.
 * This can be used when code has its own mutex protecting the
 * updating starting before the write_seqcount_begin() and ending
 * after the write_seqcount_end().
 */

typedef struct seqcount {
  unsigned sequence;
} seqcount_t;

#define SEQCNT_ZERO { 0 }
#define seqcount_init(x)  do { *(x) = (seqcount_t) SEQCNT_ZERO; } while (0)

/* Start of read using pointer to a sequence counter only.  */
static inline unsigned read_seqcount_begin(const seqcount_t *s)
{
  unsigned ret;

repeat:
  ret = s->sequence;
  smp_rmb();
  if (unlikely(ret & 1)) {
    cpu_relax();
    goto repeat;
  }
  return ret;
}

/*
 * Test if reader processed invalid data because sequence number has changed.
 */
static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
{
  smp_rmb();

  return s->sequence != start;
}


/*
 * Sequence counter only version assumes that callers are using their
 * own mutexing.
 */
static inline void write_seqcount_begin(seqcount_t *s)
{
  s->sequence++;
  smp_wmb();
}

static inline void write_seqcount_end(seqcount_t *s)
{
  smp_wmb();
  s->sequence++;
}

/*
 * Possible sw/hw IRQ protected versions of the interfaces.
 */
#define write_seqlock_irqsave(lock, flags)        \
  do { local_irq_save(flags); write_seqlock(lock); } while (0)
#define write_seqlock_irq(lock)           \
  do { local_irq_disable();   write_seqlock(lock); } while (0)
#define write_seqlock_bh(lock)            \
        do { local_bh_disable();    write_seqlock(lock); } while (0)

#define write_sequnlock_irqrestore(lock, flags)       \
  do { write_sequnlock(lock); local_irq_restore(flags); } while(0)
#define write_sequnlock_irq(lock)         \
  do { write_sequnlock(lock); local_irq_enable(); } while(0)
#define write_sequnlock_bh(lock)          \
  do { write_sequnlock(lock); local_bh_enable(); } while(0)

#define read_seqbegin_irqsave(lock, flags)        \
  ({ local_irq_save(flags);   read_seqbegin(lock); })

#define read_seqretry_irqrestore(lock, iv, flags)     \
  ({                \
    int ret = read_seqretry(lock, iv);      \
    local_irq_restore(flags);       \
    ret;              \
  })
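
The canonical seqlock user in this kernel generation is the 64-bit jiffies counter: a 32-bit reader must retry if the timer interrupt updates jiffies_64 mid-read. A sketch modeled on kernel/time/jiffies.c and the tick code (xtime_lock is the seqlock that protects jiffies_64 here):

// kernel/time/jiffies.c (reader side)
u64 get_jiffies_64(void)
{
  unsigned long seq;
  u64 ret;

  do {
    seq = read_seqbegin(&xtime_lock);        /* snapshot; spins while odd */
    ret = jiffies_64;
  } while (read_seqretry(&xtime_lock, seq)); /* retry if a writer intervened */
  return ret;
}

// writer side (tick handling, simplified)
write_seqlock(&xtime_lock);
jiffies_64 += 1;
write_sequnlock(&xtime_lock);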

10. Disabling Preemption
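
The preemption API in include/linux/preempt.h nests via a per-task count. A minimal sketch of protecting per-CPU data (the per-CPU variable is illustrative; preempt_disable(), preempt_enable(), get_cpu() and put_cpu() are the real interfaces):

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, my_counter);  /* illustrative per-CPU variable */

static void touch_percpu_data(void)
{
  int cpu;

  preempt_disable();              /* bumps preempt_count; calls nest */
  __get_cpu_var(my_counter)++;    /* safe: no preemption, no CPU migration */
  preempt_enable();               /* drops the count and may reschedule */

  cpu = get_cpu();                /* disables preemption, returns current CPU */
  /* ... use per-CPU data belonging to cpu ... */
  put_cpu();                      /* re-enables preemption */
}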

11. Ordering and Barriers
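
The barrier primitives used throughout the seqlock code above (mb(), rmb(), wmb(), their smp_* variants, and the compiler-only barrier()) enforce ordering between memory operations. A minimal sketch of the classic publication pattern, with illustrative variable names:

static int data;
static int ready;

/* writer */
static void publish(void)
{
  data = 42;
  smp_wmb();        /* order the store to data before the store to ready */
  ready = 1;
}

/* reader */
static int consume(void)
{
  if (ready) {
    smp_rmb();      /* pairs with the writer's smp_wmb() */
    return data;    /* guaranteed to observe data == 42 */
  }
  return -1;
}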

VIII. Timers and Time Management

1. The Tick Rate: HZ

// include/linux/param.h

#ifndef _LINUX_PARAM_H
#define _LINUX_PARAM_H

#include <asm/param.h>

#endif

// arch/x86/include/asm/param.h
#include <asm-generic/param.h>

// include/asm-generic/param.h
#ifdef __KERNEL__
# define HZ   CONFIG_HZ /* Internal kernel timer frequency */
# define USER_HZ  100   /* some user interfaces are */
# define CLOCKS_PER_SEC (USER_HZ)       /* in "ticks" like times() */
#endif

#ifndef HZ
#define HZ 100
#endif

2. jiffies

(1) Internal representation of jiffies

// include/linux/jiffies.h
extern u64 __jiffy_data jiffies_64;
extern unsigned long volatile __jiffy_data jiffies;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void);
#else
static inline u64 get_jiffies_64(void)
{
  return (u64)jiffies;
}
#endif


The jiffies variable is an unsigned long: 32 bits on 32-bit architectures, 64 bits on 64-bit ones. (jiffies_64 is always 64 bits wide; on 32-bit machines jiffies aliases its lower 32 bits, so both declarations above name the same counter.)

(2) Wraparound of jiffies

#define time_after(a,b)   \
  (typecheck(unsigned long, a) && \
   typecheck(unsigned long, b) && \
   ((long)(b) - (long)(a) < 0))
#define time_before(a,b)  time_after(b,a)

#define time_after_eq(a,b)  \
  (typecheck(unsigned long, a) && \
   typecheck(unsigned long, b) && \
   ((long)(a) - (long)(b) >= 0))
#define time_before_eq(a,b) time_after_eq(b,a)

/*
 * Calculate whether a is in the range of [b, c].
 */
#define time_in_range(a,b,c) \
  (time_after_eq(a,b) && \
   time_before_eq(a,c))

/*
 * Calculate whether a is in the range of [b, c).
 */
#define time_in_range_open(a,b,c) \
  (time_after_eq(a,b) && \
   time_before(a,c))

/* Same as above, but does so with platform independent 64bit types.
 * These must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64() */
#define time_after64(a,b) \
  (typecheck(__u64, a) && \
   typecheck(__u64, b) && \
   ((__s64)(b) - (__s64)(a) < 0))
#define time_before64(a,b)  time_after64(b,a)

#define time_after_eq64(a,b)  \
  (typecheck(__u64, a) && \
   typecheck(__u64, b) && \
   ((__s64)(a) - (__s64)(b) >= 0))
#define time_before_eq64(a,b) time_after_eq64(b,a)

/*
 * These four macros compare jiffies and 'a' for convenience.
 */

/* time_is_before_jiffies(a) return true if a is before jiffies */
#define time_is_before_jiffies(a) time_after(jiffies, a)

/* time_is_after_jiffies(a) return true if a is after jiffies */
#define time_is_after_jiffies(a) time_before(jiffies, a)

/* time_is_before_eq_jiffies(a) return true if a is before or equal to jiffies*/
#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)

/* time_is_after_eq_jiffies(a) return true if a is after or equal to jiffies*/
#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)

/*
 * Have the 32 bit jiffies value wrap 5 minutes after boot
 * so jiffies wrap bugs show up earlier.
 */
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
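
A typical timeout check built on these macros, safe across a jiffies wrap (the half-second value is illustrative):

unsigned long timeout = jiffies + HZ / 2;  /* time out in 0.5 seconds */

/* ... do some work ... */

if (time_after(jiffies, timeout)) {
  /* timed out: the signed subtraction inside time_after() keeps this
   * correct even if jiffies wrapped around past timeout */
}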

(3) User space and HZ

  In kernels prior to 2.6, changing the kernel's HZ value produced anomalous results in certain user-space programs, because the kernel exported time to user space in units of ticks per second. Once that interface had remained stable for a long while, applications came to depend on the specific value of HZ. Redefining HZ therefore broke the constant that user space relied on, and user space was never told the new value: it might conclude that the system had been up for 20 hours when it had really been running for only two.

  To avoid such errors, the kernel must scale every tick count it exports. It defines USER_HZ as the HZ value user space expects; on x86, HZ was historically 100, so USER_HZ is defined as 100. The kernel uses jiffies_to_clock_t() to convert a tick count expressed in HZ into a tick count expressed in USER_HZ.

// kernel/time.c
clock_t jiffies_to_clock_t(long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
  return x * (USER_HZ / HZ);
# else
  return x / (HZ / USER_HZ);
# endif
#else
  return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
#endif
}
EXPORT_SYMBOL(jiffies_to_clock_t);

// include/linux/types.h
typedef __kernel_clock_t  clock_t;

// arch/x86/include/asm/posix_types_64.h
typedef long    __kernel_clock_t;

3. Timers

// include/linux/timer.h
struct timer_list {
  struct list_head entry;       /* entry in the timer list */
  unsigned long expires;        /* expiration time, in jiffies */
  void (*function)(unsigned long);  /* timer handler function */
  unsigned long data;         /* argument passed to the handler */
  struct tvec_base *base;       /* internal timer bookkeeping; do not touch */
#ifdef CONFIG_TIMER_STATS
  void *start_site;
  char start_comm[16];
  int start_pid;
#endif
#ifdef CONFIG_LOCKDEP
  struct lockdep_map lockdep_map;
#endif
};
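
A minimal usage sketch of the dynamic timer API (handler and delays are illustrative; init_timer(), add_timer(), mod_timer() and del_timer_sync() are the real interfaces from include/linux/timer.h):

#include <linux/jiffies.h>
#include <linux/timer.h>

static void my_timer_handler(unsigned long data)
{
  /* runs in softirq context when the timer expires; must not sleep */
}

static struct timer_list my_timer;

static void start_my_timer(void)
{
  init_timer(&my_timer);
  my_timer.expires = jiffies + 2 * HZ;   /* fire in two seconds */
  my_timer.data = 0;                     /* passed to the handler */
  my_timer.function = my_timer_handler;
  add_timer(&my_timer);

  /* later: push the expiry back, or cancel safely even on SMP */
  mod_timer(&my_timer, jiffies + 5 * HZ);
  del_timer_sync(&my_timer);
}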

4. Delaying Execution

(1) Busy waiting

/* spin for ten ticks */
unsigned long timeout = jiffies + 10;
while (time_before(jiffies, timeout))
  ;

/* spin for five seconds, but yield the processor while waiting */
unsigned long timeout = jiffies + 5 * HZ;
while (time_before(jiffies, timeout))
  cond_resched();

(2) Short delays

(a) Methods
void udelay(unsigned long usecs);
void ndelay(unsigned long nsecs);
void mdelay(unsigned long msecs);
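
For example, to busy-wait for 150 microseconds (the duration is illustrative):

udelay(150);  /* spins; keep such delays short and use mdelay() for millisecond-scale waits */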


(b) Analysis of mdelay
  1. The mdelay macro
// include/linux/delay.h
#ifndef mdelay
#define mdelay(n) (\
  (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
  ({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
#endif

#ifndef ndelay
static inline void ndelay(unsigned long x)
{
  udelay(DIV_ROUND_UP(x, 1000));
}
#define ndelay(x) ndelay(x)
#endif

extern unsigned long lpj_fine;
void calibrate_delay(void);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);

static inline void ssleep(unsigned int seconds)
{
  msleep(seconds * 1000);
}
  2. The udelay macro
// arch/x86/include/asm/delay.h
/* 0x10c7 is 2**32 / 1000000 (rounded up) */
#define udelay(n) (__builtin_constant_p(n) ? \
  ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
  __udelay(n))

/* 0x5 is 2**32 / 1000000000 (rounded up) */
#define ndelay(n) (__builtin_constant_p(n) ? \
  ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
  __ndelay(n))
  3. The __const_udelay function
// arch/x86/lib/delay.c
inline void __const_udelay(unsigned long xloops)
{
  int d0;

  xloops *= 4;
  asm("mull %%edx"
    :"=d" (xloops), "=&a" (d0)
    :"1" (xloops), "0"
    (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));

  __delay(++xloops);
}
EXPORT_SYMBOL(__const_udelay);

void __udelay(unsigned long usecs)
{
  __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
  __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);

(3) The schedule_timeout function

// kernel/timer.c
signed long __sched schedule_timeout(signed long timeout)
{
  struct timer_list timer;
  unsigned long expire;

  switch (timeout)
  {
  case MAX_SCHEDULE_TIMEOUT:
    /*
     * These two special cases are useful to be comfortable
     * in the caller. Nothing more. We could take
     * MAX_SCHEDULE_TIMEOUT from one of the negative values
     * but I'd like to return a valid offset (>=0) to allow
     * the caller to do everything it wants with the retval.
     */
    schedule();
    goto out;
  default:
    /*
     * Another bit of PARANOID. Note that the retval will be
     * 0 since no piece of kernel is supposed to do a check
     * for a negative retval of schedule_timeout() (since it
     * should never happen anyway). You just have the printk()
     * that will tell you if something has gone wrong and where.
     */
    if (timeout < 0) {
      printk(KERN_ERR "schedule_timeout: wrong timeout "
        "value %lx\n", timeout);
      dump_stack();
      current->state = TASK_RUNNING;
      goto out;
    }
  }

  expire = timeout + jiffies;

  setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
  __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
  schedule();
  del_singleshot_timer_sync(&timer);

  /* Remove the timer from the object tracker */
  destroy_timer_on_stack(&timer);

  timeout = expire - jiffies;

 out:
  return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
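
Note that schedule_timeout() does not set the task state itself; the caller must do so first, or the call returns without sleeping. The canonical pattern:

/* sleep for five seconds, waking early if a signal arrives */
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);

/* or sleep uninterruptibly */
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(5 * HZ);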

IX. Memory Management

1. Pages

// include/linux/mm_types.h
struct page {
  unsigned long flags;    /* Atomic flags, some possibly
           * updated asynchronously */
  atomic_t _count;    /* Usage count, see below. */
  union {
    atomic_t _mapcount; /* Count of ptes mapped in mms,
           * to show when page is mapped
           * & limit reverse map searches.
           */
    struct {    /* SLUB */
      u16 inuse;
      u16 objects;
    };
  };
  union {
      struct {
    unsigned long private;    /* Mapping-private opaque data:
             * usually used for buffer_heads
             * if PagePrivate set; used for
             * swp_entry_t if PageSwapCache;
             * indicates order in the buddy
             * system if PG_buddy is set.
             */
    struct address_space *mapping;  /* If low bit clear, points to
             * inode address_space, or NULL.
             * If page mapped as anonymous
             * memory, low bit is set, and
             * it points to anon_vma object:
             * see PAGE_MAPPING_ANON below.
             */
      };
#if USE_SPLIT_PTLOCKS
      spinlock_t ptl;
#endif
      struct kmem_cache *slab;  /* SLUB: Pointer to slab */
      struct page *first_page;  /* Compound tail pages */
  };
  union {
    pgoff_t index;    /* Our offset within mapping. */
    void *freelist;   /* SLUB: freelist req. slab lock */
  };
  struct list_head lru;   /* Pageout list, eg. active_list
           * protected by zone->lru_lock !
           */
  /*
   * On machines where all RAM is mapped into kernel address space,
   * we can simply calculate the virtual address. On machines with
   * highmem some memory is mapped into kernel virtual memory
   * dynamically, so we need a place to store that address.
   * Note that this field could be 16 bits on x86 ... ;)
   *
   * Architectures with slow multiplication can define
   * WANT_PAGE_VIRTUAL in asm/page.h
   */
#if defined(WANT_PAGE_VIRTUAL)
  void *virtual;      /* Kernel virtual address (NULL if
             not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
  unsigned long debug_flags;  /* Use atomic bitops on this */
#endif

#ifdef CONFIG_KMEMCHECK
  /*
   * kmemcheck wants to track the status of each byte in a page; this
   * is a pointer to such a status block. NULL if not tracked.
   */
  void *shadow;
#endif
};

2. Zones

   Because of hardware limitations, the kernel cannot treat all pages identically, so it groups them into zones (include/linux/mmzone.h). On x86-32 these are ZONE_DMA (pages usable for legacy DMA, below 16 MB), ZONE_NORMAL (regularly mapped pages) and ZONE_HIGHMEM (pages not permanently mapped into the kernel address space).

3. Page Operations

   The page allocation and freeing functions are declared in include/linux/gfp.h.

(1) Getting pages
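
The core allocation interfaces, declared in the same header (shown in simplified form; the exact definition of alloc_pages() depends on CONFIG_NUMA):

// include/linux/gfp.h (simplified)
extern struct page *alloc_pages(gfp_t gfp_mask, unsigned int order); /* 2^order pages */
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); /* returns a logical address */
#define __get_free_page(gfp_mask) __get_free_pages((gfp_mask), 0)

extern unsigned long get_zeroed_page(gfp_t gfp_mask); /* one zero-filled page */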

(2) Releasing pages

extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
extern void free_hot_cold_page(struct page *page, int cold);

#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr),0)

4. The kmalloc Function

    kmalloc() works much like the user-space malloc() family, except that it takes an additional flags parameter. It allocates a region of kernel memory sized in bytes.

// include/linux/slab.h
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
#include <linux/slab_def.h>
#endif

static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
{
  if (size != 0 && n > ULONG_MAX / size)
    return NULL;
  return __kmalloc(n * size, flags | __GFP_ZERO);
}
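
A typical call sequence (struct dog is the book's illustrative type; kmalloc(), kfree() and GFP_KERNEL are the real interfaces):

struct dog *p;

p = kmalloc(sizeof(struct dog), GFP_KERNEL);
if (!p)
  return -ENOMEM;   /* allocation can fail: always check */

/* ... use p ... */

kfree(p);           /* free exactly once; kfree(NULL) is a no-op */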

(1) gfp_t flags

  1. Action modifiers: specify how the allocator may behave, e.g. __GFP_WAIT (may sleep), __GFP_IO (may start disk I/O), __GFP_FS (may start filesystem I/O), __GFP_HIGH (may access emergency pools).

  2. Zone modifiers: specify which memory zone to allocate from, e.g. __GFP_DMA, __GFP_DMA32, __GFP_HIGHMEM.

  3. Type flags: predefined combinations of action and zone modifiers for common situations, e.g. GFP_KERNEL (process context, may sleep), GFP_ATOMIC (interrupt context or other code that cannot sleep), GFP_NOIO, GFP_NOFS, GFP_USER, GFP_DMA.

(2) Analysis of the kmalloc function

  1. The kmalloc function
// include/linux/slab_def.h
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
  struct kmem_cache *cachep;
  void *ret;

  if (__builtin_constant_p(size)) {
    int i = 0;

    if (!size)
      return ZERO_SIZE_PTR;

#define CACHE(x) \
    if (size <= x) \
      goto found; \
    else \
      i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
    return NULL;
found:
#ifdef CONFIG_ZONE_DMA
    if (flags & GFP_DMA)
      cachep = malloc_sizes[i].cs_dmacachep;
    else
#endif
      cachep = malloc_sizes[i].cs_cachep;

    ret = kmem_cache_alloc_notrace(cachep, flags);

    trace_kmalloc(_THIS_IP_, ret,
            size, slab_buffer_size(cachep), flags);

    return ret;
  }
  return __kmalloc(size, flags);
}
  2. The __kmalloc function
// mm/slab.c
void *__kmalloc(size_t size, gfp_t flags)
{
  return __do_kmalloc(size, flags, NULL);
}
EXPORT_SYMBOL(__kmalloc);
  3. The __do_kmalloc function
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
            void *caller)
{
  struct kmem_cache *cachep;
  void *ret;

  /* If you want to save a few bytes .text space: replace
   * __ with kmem_.
   * Then kmalloc uses the uninlined functions instead of the inline
   * functions.
   */
  cachep = __find_general_cachep(size, flags);
  if (unlikely(ZERO_OR_NULL_PTR(cachep)))
    return cachep;
  ret = __cache_alloc(cachep, flags, caller);

  trace_kmalloc((unsigned long) caller, ret,
          size, cachep->buffer_size, flags);

  return ret;
}
  4. The __cache_alloc function

static __always_inline void *
__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
{
  unsigned long save_flags;
  void *objp;

  flags &= gfp_allowed_mask;

  lockdep_trace_alloc(flags);

  if (slab_should_failslab(cachep, flags))
    return NULL;

  cache_alloc_debugcheck_before(cachep, flags);
  local_irq_save(save_flags);
  objp = __do_cache_alloc(cachep, flags);
  local_irq_restore(save_flags);
  objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
  kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
         flags);
  prefetchw(objp);

  if (likely(objp))
    kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));

  if (unlikely((flags & __GFP_ZERO) && objp))
    memset(objp, 0, obj_size(cachep));

  return objp;
}

Linux Kernel Design and Implementation (3): https://developer.aliyun.com/article/1597350
