Linux 内核设计与实现1

简介: Linux 内核设计与实现

一、前言

    本章主要用于摘录《Linux 内核设计与实现》一书中的学习知识点,其基于 Linux 2.6.34

二、进程管理

1、task_struct

// include/linux/sched.h

/*
 * Process descriptor (include/linux/sched.h, Linux 2.6.34).
 *
 * One task_struct exists per task (process or thread).  It is
 * allocated from the slab allocator; a struct thread_info at the
 * bottom of the kernel stack points back to it.
 */
struct task_struct {
  volatile long state;  /* -1 unrunnable, 0 runnable, >0 stopped */
  void *stack;
  atomic_t usage;
  unsigned int flags; /* per process flags, defined below */
  unsigned int ptrace;

  int lock_depth;   /* BKL lock depth */

#ifdef CONFIG_SMP
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
  int oncpu;
#endif
#endif

  /* scheduler state: priorities, scheduling class, per-policy entities */
  int prio, static_prio, normal_prio;
  unsigned int rt_priority;
  const struct sched_class *sched_class;
  struct sched_entity se;
  struct sched_rt_entity rt;

#ifdef CONFIG_PREEMPT_NOTIFIERS
  /* list of struct preempt_notifier: */
  struct hlist_head preempt_notifiers;
#endif

  /*
   * fpu_counter contains the number of consecutive context switches
   * that the FPU is used. If this is over a threshold, the lazy fpu
   * saving becomes unlazy to save the trap. This is an unsigned char
   * so that after 256 times the counter wraps and the behavior turns
   * lazy again; this to deal with bursty apps that only use FPU for
   * a short time
   */
  unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
  unsigned int btrace_seq;
#endif

  unsigned int policy;
  cpumask_t cpus_allowed;

#ifdef CONFIG_TREE_PREEMPT_RCU
  int rcu_read_lock_nesting;
  char rcu_read_unlock_special;
  struct rcu_node *rcu_blocked_node;
  struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
  struct sched_info sched_info;
#endif

  /* linkage in the global list of all tasks */
  struct list_head tasks;
  struct plist_node pushable_tasks;

  /* memory descriptors (mm may be NULL for kernel threads) */
  struct mm_struct *mm, *active_mm;
#if defined(SPLIT_RSS_COUNTING)
  struct task_rss_stat  rss_stat;
#endif
/* task state */
  int exit_state;
  int exit_code, exit_signal;
  int pdeath_signal;  /*  The signal sent when the parent dies  */
  /* ??? */
  unsigned int personality;
  unsigned did_exec:1;
  unsigned in_execve:1; /* Tell the LSMs that the process is doing an
         * execve */
  unsigned in_iowait:1;


  /* Revert to default priority/policy when forking */
  unsigned sched_reset_on_fork:1;

  /* process id and thread group id (the group leader's pid) */
  pid_t pid;
  pid_t tgid;

#ifdef CONFIG_CC_STACKPROTECTOR
  /* Canary value for the -fstack-protector gcc feature */
  unsigned long stack_canary;
#endif

  /* 
   * pointers to (original) parent process, youngest child, younger sibling,
   * older sibling, respectively.  (p->father can be replaced with 
   * p->real_parent->pid)
   */
  struct task_struct *real_parent; /* real parent process */
  struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
  /*
   * children/sibling forms the list of my natural children
   */
  struct list_head children;  /* list of my children */
  struct list_head sibling; /* linkage in my parent's children list */
  struct task_struct *group_leader; /* threadgroup leader */

  /*
   * ptraced is the list of tasks this task is using ptrace on.
   * This includes both natural children and PTRACE_ATTACH targets.
   * p->ptrace_entry is p's link on the p->parent->ptraced list.
   */
  struct list_head ptraced;
  struct list_head ptrace_entry;

  /*
   * This is the tracer handle for the ptrace BTS extension.
   * This field actually belongs to the ptracer task.
   */
  struct bts_context *bts;

  /* PID/PID hash table linkage. */
  struct pid_link pids[PIDTYPE_MAX];
  struct list_head thread_group;

  struct completion *vfork_done;    /* for vfork() */
  int __user *set_child_tid;    /* CLONE_CHILD_SETTID */
  int __user *clear_child_tid;    /* CLONE_CHILD_CLEARTID */

  /* user/system CPU-time accounting */
  cputime_t utime, stime, utimescaled, stimescaled;
  cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
  cputime_t prev_utime, prev_stime;
#endif
  unsigned long nvcsw, nivcsw; /* context switch counts */
  struct timespec start_time;     /* monotonic time */
  struct timespec real_start_time;  /* boot based time */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
  unsigned long min_flt, maj_flt;

  struct task_cputime cputime_expires;
  struct list_head cpu_timers[3];

/* process credentials */
  const struct cred *real_cred; /* objective and real subjective task
           * credentials (COW) */
  const struct cred *cred;  /* effective (overridable) subjective task
           * credentials (COW) */
  struct mutex cred_guard_mutex;  /* guard against foreign influences on
           * credential calculations
           * (notably. ptrace) */
  struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */

  char comm[TASK_COMM_LEN]; /* executable name excluding path
             - access with [gs]et_task_comm (which lock
               it with task_lock())
             - initialized normally by setup_new_exec */
/* file system info */
  int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
  struct sysv_sem sysvsem;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
  unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
  struct thread_struct thread;
/* filesystem information */
  struct fs_struct *fs;
/* open file information */
  struct files_struct *files;
/* namespaces */
  struct nsproxy *nsproxy;
/* signal handlers */
  struct signal_struct *signal;
  struct sighand_struct *sighand;

  sigset_t blocked, real_blocked;
  sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
  struct sigpending pending;

  /* alternate signal stack (sigaltstack) */
  unsigned long sas_ss_sp;
  size_t sas_ss_size;
  int (*notifier)(void *priv);
  void *notifier_data;
  sigset_t *notifier_mask;
  struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
  uid_t loginuid;
  unsigned int sessionid;
#endif
  seccomp_t seccomp;

/* Thread group tracking */
    u32 parent_exec_id;
    u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
 * mempolicy */
  spinlock_t alloc_lock;

#ifdef CONFIG_GENERIC_HARDIRQS
  /* IRQ handler threads */
  struct irqaction *irqaction;
#endif

  /* Protection of the PI data structures: */
  raw_spinlock_t pi_lock;

#ifdef CONFIG_RT_MUTEXES
  /* PI waiters blocked on a rt_mutex held by this task */
  struct plist_head pi_waiters;
  /* Deadlock detection and priority inheritance handling */
  struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
  /* mutex deadlock detection */
  struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
  unsigned int irq_events;
  unsigned long hardirq_enable_ip;
  unsigned long hardirq_disable_ip;
  unsigned int hardirq_enable_event;
  unsigned int hardirq_disable_event;
  int hardirqs_enabled;
  int hardirq_context;
  unsigned long softirq_disable_ip;
  unsigned long softirq_enable_ip;
  unsigned int softirq_disable_event;
  unsigned int softirq_enable_event;
  int softirqs_enabled;
  int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
  u64 curr_chain_key;
  int lockdep_depth;
  unsigned int lockdep_recursion;
  struct held_lock held_locks[MAX_LOCK_DEPTH];
  gfp_t lockdep_reclaim_gfp;
#endif

/* journalling filesystem info */
  void *journal_info;

/* stacked block device info */
  struct bio_list *bio_list;

/* VM state */
  struct reclaim_state *reclaim_state;

  struct backing_dev_info *backing_dev_info;

  struct io_context *io_context;

  unsigned long ptrace_message;
  siginfo_t *last_siginfo; /* For ptrace use.  */
  struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
  u64 acct_rss_mem1;  /* accumulated rss usage */
  u64 acct_vm_mem1; /* accumulated virtual memory usage */
  cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
  nodemask_t mems_allowed;  /* Protected by alloc_lock */
  int cpuset_mem_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
  /* Control Group info protected by css_set_lock */
  struct css_set *cgroups;
  /* cg_list protected by css_set_lock and tsk->alloc_lock */
  struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
  struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
  struct compat_robust_list_head __user *compat_robust_list;
#endif
  struct list_head pi_state_list;
  struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
  struct perf_event_context *perf_event_ctxp;
  struct mutex perf_event_mutex;
  struct list_head perf_event_list;
#endif
#ifdef CONFIG_NUMA
  struct mempolicy *mempolicy;  /* Protected by alloc_lock */
  short il_next;
#endif
  atomic_t fs_excl; /* holding fs exclusive resources */
  struct rcu_head rcu;

  /*
   * cache last used pipe for splice
   */
  struct pipe_inode_info *splice_pipe;
#ifdef  CONFIG_TASK_DELAY_ACCT
  struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
  int make_it_fail;
#endif
  struct prop_local_single dirties;
#ifdef CONFIG_LATENCYTOP
  int latency_record_count;
  struct latency_record latency_record[LT_SAVECOUNT];
#endif
  /*
   * time slack values; these are used to round up poll() and
   * select() etc timeout values. These are in nanoseconds.
   */
  unsigned long timer_slack_ns;
  unsigned long default_timer_slack_ns;

  struct list_head  *scm_work_list;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
  /* Index of current stored address in ret_stack */
  int curr_ret_stack;
  /* Stack of return addresses for return function tracing */
  struct ftrace_ret_stack *ret_stack;
  /* time stamp for last schedule */
  unsigned long long ftrace_timestamp;
  /*
   * Number of functions that haven't been traced
   * because of depth overrun.
   */
  atomic_t trace_overrun;
  /* Pause for the tracing */
  atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
  /* state flags for use by tracers */
  unsigned long trace;
  /* bitmask of trace recursion */
  unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
  struct memcg_batch_info {
    int do_batch; /* incremented when batch uncharge started */
    struct mem_cgroup *memcg; /* target memcg of uncharge */
    unsigned long bytes;    /* uncharged usage */
    unsigned long memsw_bytes; /* uncharged mem+swap usage */
  } memcg_batch;
#endif
};

    进程描述符 task_struct 包含了一个具体进程的所有信息。其相对较大,在 32 位机器上,它大约有 1.7KB

    Linux 通过 slab 分配器分配 task_struct 结构。在 2.6 以前的内核中,各个进程的

task_struct 存放在它们内核栈的尾端。这样做是为了让那些像 x86 那样寄存器较少的硬件体系结构只要通过栈指针就能计算出它的位置,而避免使用额外的寄存器专门记录。由于现在用 slab 分配器动态生成 task_struct ,所以只需要在栈底创建一个新的结构 struct thread_info

2、thread_info

// arch/x86/include/asm/thread_info.h

/*
 * Low-level per-task data (arch/x86/include/asm/thread_info.h,
 * Linux 2.6.34).  Lives at the bottom of the kernel stack so it can
 * be located from the stack pointer; its first field links back to
 * the full task_struct.
 */
struct thread_info {
  struct task_struct  *task;    /* main task structure */
  struct exec_domain  *exec_domain; /* execution domain */
  __u32     flags;    /* low level flags */
  __u32     status;   /* thread synchronous flags */
  __u32     cpu;    /* current CPU */
  int     preempt_count;  /* 0 => preemptable,
               <0 => BUG */
  mm_segment_t    addr_limit;
  struct restart_block    restart_block;
  void __user   *sysenter_return;
#ifdef CONFIG_X86_32
  unsigned long           previous_esp;   /* ESP of the previous stack in
               case of nested (IRQ) stacks
            */
  __u8      supervisor_stack[0];
#endif
  int     uaccess_err;
};

三、调度

CFS 调度

四、系统调用

   用户空间的程序无法直接执行内核代码。它们不能直接调用内核空间中的函数,因为内核驻留在受保护的地址空间上。

   x86 系统通过 int $0x80 指令触发系统调用。系统调用号是通过 eax 寄存器传递给内核的。其响应函数 system_call() 函数通过将给定的系统调用号与 NR_syscalls 做比较来检查其有效性。如果它大于或者等于 NR_syscalls ,该函数就返回 -ENOSYS 。否则,就执行相应的系统调用:

call *sys_call_table(, %rax, 8)

   由于系统调用表中的表项是以 64 位(8字节)类型存放的,所以内核需要将给定的系统调用号乘以 8,然后用所得的结果在该表中查询其位置。

   除了系统调用号以外,大部分系统调用都还需要一些外部的参数输入。所以,在发生陷入的时候,应该把这些参数从用户空间传给内核。在 x86-32 系统上,ebx、ecx、edx、esi、edi 按照顺序存放前五个参数。需要六个或六个以上参数则应该用一个单独的寄存器存放指向所有这些参数在用户空间地址的指针。

五、内核数据结构

1、kfifo

// kernel/kfifo.c

/**
 * Create and initialize a kfifo queue backed by caller-supplied memory.
 *
 * @param fifo   the kfifo queue
 * @param buffer pointer to the backing memory
 * @param size   size of the memory; must be a power of 2
 */
void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size);

/**
 * Dynamically allocate and initialize a kfifo queue.
 *
 * @param fifo     the kfifo queue
 * @param size     size of the queue to create
 * @param gfp_mask allocation flags
 * @return
 */
int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask);

/**
 * Push data into the queue.  Copies the len bytes pointed to by from
 * into the queue fifo and, on success, returns the number of bytes
 * pushed.  If the queue has fewer than len bytes free, at most the
 * available space is copied, so the return value may be less than len.
 */
unsigned int kfifo_in(struct kfifo *fifo, const void *from, unsigned int len);

/**
 * Copy up to len bytes out of the queue fifo into the buffer to.
 * On success returns the number of bytes copied; if the queue holds
 * fewer than len bytes, the amount copied is necessarily smaller
 * than requested.
 */
unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len);

/**
 * Like kfifo_out, but reads the data without removing it.  offset is
 * an index into the queue; when it is 0 the read starts at the queue
 * head, exactly like kfifo_out.
 */
unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, unsigned offset);

2、映射

// include/linux/idr.h

/* idr: map small integer IDs to pointers */
void *idr_find(struct idr *idp, int id);   /* return pointer bound to id */
int idr_pre_get(struct idr *idp, gfp_t gfp_mask);  /* pre-allocate memory for a later idr_get_new */
int idr_get_new(struct idr *idp, void *ptr, int *id);  /* allocate a new id bound to ptr */
int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);  /* as above, id >= starting_id */
int idr_for_each(struct idr *idp,
     int (*fn)(int id, void *p, void *data), void *data);  /* call fn for every entry */
void *idr_get_next(struct idr *idp, int *nextid);  /* next populated entry at or after *nextid */
void *idr_replace(struct idr *idp, void *ptr, int id);  /* swap the pointer bound to an existing id */
void idr_remove(struct idr *idp, int id);  /* remove one id */
void idr_remove_all(struct idr *idp);      /* remove every id */
void idr_destroy(struct idr *idp);         /* release the idr's cached memory */
void idr_init(struct idr *idp);            /* initialize an idr */

3、二叉树

// include/linux/rbtree.h

/*
 * Red-black tree node.  The parent pointer and the node colour are
 * packed into rb_parent_color: the low bit holds the colour
 * (RB_RED/RB_BLACK) and the remaining bits hold the parent pointer,
 * which is why the structure must be aligned to sizeof(long).
 */
struct rb_node
{
  unsigned long  rb_parent_color;
#define RB_RED    0
#define RB_BLACK  1
  struct rb_node *rb_right;
  struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
    /* The alignment might seem pointless, but allegedly CRIS needs it */

struct rb_root
{
  struct rb_node *rb_node;  /* tree root; NULL when the tree is empty */
};


/* mask off the two low (colour) bits to recover the parent pointer */
#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
#define rb_color(r)   ((r)->rb_parent_color & 1)
#define rb_is_red(r)   (!rb_color(r))
#define rb_is_black(r) rb_color(r)
#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)

/* store a new parent pointer, preserving the colour bits */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
  rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
}
/* store a new colour, preserving the parent pointer */
static inline void rb_set_color(struct rb_node *rb, int color)
{
  rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
}

#define RB_ROOT (struct rb_root) { NULL, }
#define rb_entry(ptr, type, member) container_of(ptr, type, member)

#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
/* an unlinked node is marked by being its own parent */
#define RB_EMPTY_NODE(node) (rb_parent(node) == node)
#define RB_CLEAR_NODE(node) (rb_set_parent(node, node))

/* rebalance after linking a new node with rb_link_node() */
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);

/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);

/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
          struct rb_root *root);

/* link node at *rb_link below parent; caller must follow up with rb_insert_color() */
static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
        struct rb_node ** rb_link)
{
  node->rb_parent_color = (unsigned long )parent;  /* colour bits left 0 == RB_RED */
  node->rb_left = node->rb_right = NULL;

  *rb_link = node;
}

六、中断

1、软中断

// include/linux/interrupt.h

/**
 * struct irqaction - per interrupt action descriptor
 * @handler:  interrupt handler function
 * @flags:  flags (see IRQF_* above)
 * @name: name of the device
 * @dev_id: cookie to identify the device
 * @next: pointer to the next irqaction for shared interrupts
 * @irq:  interrupt number
 * @dir:  pointer to the proc/irq/NN/name entry
 * @thread_fn:  interrupt handler function for threaded interrupts
 * @thread: thread pointer for threaded interrupts
 * @thread_flags: flags related to @thread
 */
struct irqaction {
  irq_handler_t handler;
  unsigned long flags;
  const char *name;
  void *dev_id;
  struct irqaction *next;
  int irq;
  struct proc_dir_entry *dir;
  irq_handler_t thread_fn;
  struct task_struct *thread;
  unsigned long thread_flags;
};

/* one registered softirq handler; invoked via its action callback */
struct softirq_action
{
  void  (*action)(struct softirq_action *);
};


2、tasklet

// include/linux/interrupt.h

/* Tasklets --- multithreaded analogue of BHs.

   Main feature differing them of generic softirqs: tasklet
   is running only on one CPU simultaneously.

   Main feature differing them of BHs: different tasklets
   may be run simultaneously on different CPUs.

   Properties:
   * If tasklet_schedule() is called, then tasklet is guaranteed
     to be executed on some cpu at least once after this.
   * If the tasklet is already scheduled, but its execution is still not
     started, it will be executed only once.
   * If this tasklet is already running on another CPU (or schedule is called
     from tasklet itself), it is rescheduled for later.
   * Tasklet is strictly serialized wrt itself, but not
     wrt another tasklets. If client needs some intertask synchronization,
     he makes it with spinlocks.
 */

struct tasklet_struct
{
  struct tasklet_struct *next;  /* next tasklet in the scheduling list */
  unsigned long state;          /* scheduling/run state flags */
  atomic_t count;               /* enable/disable count */
  void (*func)(unsigned long);  /* handler invoked when the tasklet runs */
  unsigned long data;           /* argument passed to func */
};

3、工作队列

// kernel/workqueue.c

/*
 * Per-CPU half of a workqueue: the list of pending work items and
 * the worker thread that drains it.
 */
struct cpu_workqueue_struct {

  spinlock_t lock;

  struct list_head worklist;          /* pending work_structs */
  wait_queue_head_t more_work;        /* worker waits here for new work */
  struct work_struct *current_work;   /* item currently being executed */

  struct workqueue_struct *wq;        /* owning workqueue */
  struct task_struct *thread;         /* worker thread for this CPU */
} ____cacheline_aligned;

/*
 * The externally visible workqueue abstraction is an array of
 * per-CPU workqueues:
 */
struct workqueue_struct {
  struct cpu_workqueue_struct *cpu_wq;
  struct list_head list;
  const char *name;
  int singlethread;
  int freezeable;   /* Freeze threads during suspend */
  int rt;
#ifdef CONFIG_LOCKDEP
  struct lockdep_map lockdep_map;
#endif
};

4、下半部机制的选择

5、下半部禁止与使能

七、内核同步方法

1、原子操作

   在 x86 上,实现在 arch/x86/include/asm/atomic.h 文件中。

2、自旋锁

(1)结构体

// include/linux/spinlock_types.h

/*
 * Generic spinlock: a thin wrapper around raw_spinlock.  The union
 * overlays the optional lockdep map on top of the raw lock's padding
 * when CONFIG_DEBUG_LOCK_ALLOC is enabled.
 */
typedef struct spinlock {
  union {
    struct raw_spinlock rlock;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
    struct {
      u8 __padding[LOCK_PADSIZE];
      struct lockdep_map dep_map;
    };
#endif
  };
} spinlock_t;


/* raw spinlock: the arch lock word plus optional debug fields */
typedef struct raw_spinlock {
  arch_spinlock_t raw_lock;
#ifdef CONFIG_GENERIC_LOCKBREAK
  unsigned int break_lock;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
  unsigned int magic, owner_cpu;
  void *owner;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
  struct lockdep_map dep_map;
#endif
} raw_spinlock_t;

// arch/x86/include/asm/spinlock_types.h
/* x86 lock word used by the ticket-spinlock implementation */
typedef struct arch_spinlock {
  unsigned int slock;
} arch_spinlock_t;

(2)方法说明

   函数定义在文件 include/linux/spinlock.h

(3)raw_spin_lock 函数分析

  1. raw_spin_lock 函数
// include/linux/spinlock.h

/* spin_lock() forwards straight to the raw-spinlock layer */
#define raw_spin_lock(lock) _raw_spin_lock(lock)

static inline void spin_lock(spinlock_t *lock)
{
  raw_spin_lock(&lock->rlock);
}
  1. _raw_spin_lock 函数
// kernel/spinlock.c

#ifndef CONFIG_INLINE_SPIN_LOCK
/* out-of-line entry point, compiled only when the lock ops are not inlined */
void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
{
  __raw_spin_lock(lock);
}
EXPORT_SYMBOL(_raw_spin_lock);
#endif
  1. __raw_spin_lock 函数
// include/linux/spinlock_api_smp.h

/*
 * Disable preemption, record the acquisition for lockdep, then take
 * the lock (falling into do_raw_spin_lock on contention).
 */
static inline void __raw_spin_lock(raw_spinlock_t *lock)
{
  preempt_disable();
  spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
  LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
}
  1. do_raw_spin_lock 函数
// include/linux/spinlock.h

static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
{
  __acquire(lock);  /* static-analysis (sparse) annotation only */
  arch_spin_lock(&lock->raw_lock);  /* architecture-specific spin loop */
}
  1. arch_spin_lock 函数
// arch/x86/include/asm/spinlock.h

static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
{
  __ticket_spin_lock(lock);
}

/*
 * Ticket spinlock: the high byte of slock holds the next ticket to
 * hand out, the low byte holds the ticket currently being served.
 * xaddw atomically takes a ticket (adds 0x0100) and returns the old
 * value; the CPU then spins until the owner byte equals its ticket.
 */
static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
{
  short inc = 0x0100;

  asm volatile (
    LOCK_PREFIX "xaddw %w0, %1\n"   /* grab our ticket, bump next-ticket */
    "1:\t"
    "cmpb %h0, %b0\n\t"             /* is our ticket being served? */
    "je 2f\n\t"
    "rep ; nop\n\t"                 /* pause while spinning */
    "movb %1, %b0\n\t"              /* reload current owner byte */
    /* don't need lfence here, because loads are in-order */
    "jmp 1b\n"
    "2:"
    : "+Q" (inc), "+m" (lock->slock)
    :
    : "memory", "cc");
}

3、读写自旋锁

(1)结构体

// include/linux/rwlock_types.h

/* reader-writer spinlock: arch lock word plus optional debug fields */
typedef struct {
  arch_rwlock_t raw_lock;
#ifdef CONFIG_GENERIC_LOCKBREAK
  unsigned int break_lock;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
  unsigned int magic, owner_cpu;
  void *owner;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
  struct lockdep_map dep_map;
#endif
} rwlock_t;


// arch/x86/include/asm/spinlock_types.h
/* x86 reader-writer lock word */
typedef struct {
  unsigned int lock;
} arch_rwlock_t;

(2)方法说明

   函数定义在文件 include/linux/rwlock.h

(3)read_lock函数分析

  1. read_lock 函数
// include/linux/rwlock.h

/* read_lock() forwards straight to the raw implementation */
#define read_lock(lock)   _raw_read_lock(lock)
  1. _raw_read_lock 函数
// include/linux/rwlock_api_smp.h
#ifdef CONFIG_INLINE_READ_LOCK
#define _raw_read_lock(lock) __raw_read_lock(lock)
#endif

// include/linux/rwlock_api_smp.h
/*
 * Disable preemption, record the read acquisition for lockdep, then
 * take the lock for reading (do_raw_read_lock on contention).
 */
static inline void __raw_read_lock(rwlock_t *lock)
{
  preempt_disable();
  rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
  LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock);
}
  1. do_raw_read_lock 函数
// include/linux/rwlock.h
/* sparse annotation followed by the architecture-level read lock */
# define do_raw_read_lock(rwlock) do {__acquire(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0)
  1. arch_read_lock 函数
// arch/x86/include/asm/spinlock.h
/*
 * Atomically decrement the lock word; if the result is still
 * non-negative (jns) the read lock was taken, otherwise fall into
 * the __read_lock_failed slow path.
 */
static inline void arch_read_lock(arch_rwlock_t *rw)
{
  asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
         "jns 1f\n"
         "call __read_lock_failed\n\t"
         "1:\n"
         ::LOCK_PTR_REG (rw) : "memory");
}

4、信号量

(1)结构体

/*
 * Counting semaphore: count is the number of available permits, lock
 * protects count and wait_list, and wait_list queues sleeping tasks.
 */
struct semaphore {
  spinlock_t    lock;
  unsigned int    count;
  struct list_head  wait_list;
};

(2)方法说明

   函数定义在文件 include/linux/semaphore.h 中,其允许进入睡眠。不能用在中断上下文,只能用在进程上下文。

(3)down_interruptible 函数分析

  1. down_interruptible 函数
// include/linux/semaphore.h
extern int __must_check down_interruptible(struct semaphore *sem);


// kernel/semaphore.c
/**
 * down_interruptible - acquire the semaphore unless interrupted
 * @sem: the semaphore to be acquired
 *
 * Attempts to acquire the semaphore.  If no more tasks are allowed to
 * acquire the semaphore, calling this function will put the task to sleep.
 * If the sleep is interrupted by a signal, this function will return -EINTR.
 * If the semaphore is successfully acquired, this function returns 0.
 */
int down_interruptible(struct semaphore *sem)
{
  unsigned long flags;
  int result = 0;

  spin_lock_irqsave(&sem->lock, flags);
  if (likely(sem->count > 0))
    sem->count--;   /* fast path: take an available permit */
  else
    result = __down_interruptible(sem);  /* slow path: sleep */
  spin_unlock_irqrestore(&sem->lock, flags);

  return result;
}
EXPORT_SYMBOL(down_interruptible);
  1. __down_interruptible 函数
// kernel/semaphore.c
/* sleep interruptibly, without a timeout, until the semaphore is released */
static noinline int __sched __down_interruptible(struct semaphore *sem)
{
  return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
  1. __down_common 函数
// kernel/semaphore.c
/*
 * Because this function is inlined, the 'state' parameter will be
 * constant, and thus optimised away by the compiler.  Likewise the
 * 'timeout' parameter for the cases without timeouts.
 */
/* common slow path; called with sem->lock held (see down_interruptible) */
static inline int __sched __down_common(struct semaphore *sem, long state,
                long timeout)
{
  struct task_struct *task = current;
  struct semaphore_waiter waiter;

  /* queue ourselves on the semaphore's wait list */
  list_add_tail(&waiter.list, &sem->wait_list);
  waiter.task = task;
  waiter.up = 0;

  for (;;) {
    if (signal_pending_state(state, task))
      goto interrupted;   /* a signal woke us (interruptible states only) */
    if (timeout <= 0)
      goto timed_out;
    __set_task_state(task, state);
    spin_unlock_irq(&sem->lock);   /* drop the lock while sleeping */
    timeout = schedule_timeout(timeout);
    spin_lock_irq(&sem->lock);
    if (waiter.up)
      return 0;           /* up() handed us the semaphore */
  }

 timed_out:
  list_del(&waiter.list);
  return -ETIME;

 interrupted:
  list_del(&waiter.list);
  return -EINTR;
}

5、读写信号量

(1)结构体

   rw_semaphore 结构体定义在各个体系结构下的 rwsem.h 文件中

 // arch/x86/include/asm/rwsem.h
/*
 * Reader-writer semaphore: count encodes the reader/writer state,
 * wait_lock protects wait_list, which queues blocked tasks.
 */
struct rw_semaphore {
  rwsem_count_t   count;
  spinlock_t    wait_lock;
  struct list_head  wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
  struct lockdep_map dep_map;
#endif
};

(2)方法说明

   读写信号量函数定义

// include/linux/rwsem.h
/* reader-writer semaphore API: down_* acquire, up_* release */
/*
 * lock for reading
 */
extern void down_read(struct rw_semaphore *sem);

/*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */
extern int down_read_trylock(struct rw_semaphore *sem);

/*
 * lock for writing
 */
extern void down_write(struct rw_semaphore *sem);

/*
 * trylock for writing -- returns 1 if successful, 0 if contention
 */
extern int down_write_trylock(struct rw_semaphore *sem);

/*
 * release a read lock
 */
extern void up_read(struct rw_semaphore *sem);

/*
 * release a write lock
 */
extern void up_write(struct rw_semaphore *sem);

/*
 * downgrade write lock to read lock
 */
extern void downgrade_write(struct rw_semaphore *sem);

(3)down_read 函数分析

  1. down_read 函数
// kernel/rwsem.c
/*
 * lock for reading
 */
void __sched down_read(struct rw_semaphore *sem)
{
  might_sleep();  /* may block, so must not be called from atomic context */
  rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);  /* lockdep bookkeeping */

  LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
}

EXPORT_SYMBOL(down_read);
  1. __down_read 函数
 // arch/x86/include/asm/rwsem.h
/*
 * lock for reading: atomically increment count; if the result is
 * non-negative (jns) the read lock is held, otherwise enter the
 * contended slow path.
 */
static inline void __down_read(struct rw_semaphore *sem)
{
  asm volatile("# beginning down_read\n\t"
         LOCK_PREFIX _ASM_INC "(%1)\n\t"
         /* adds 0x00000001, returns the old value */
         "  jns        1f\n"
         "  call call_rwsem_down_read_failed\n"
         "1:\n\t"
         "# ending down_read\n\t"
         : "+m" (sem->count)
         : "a" (sem)
         : "memory", "cc");
}


Linux 内核设计与实现2:https://developer.aliyun.com/article/1597349

目录
相关文章
|
14天前
|
算法 Linux 调度
深入理解Linux内核调度器:从基础到优化####
本文旨在通过剖析Linux操作系统的心脏——内核调度器,为读者揭开其高效管理CPU资源的神秘面纱。不同于传统的摘要概述,本文将直接以一段精简代码片段作为引子,展示一个简化版的任务调度逻辑,随后逐步深入,详细探讨Linux内核调度器的工作原理、关键数据结构、调度算法演变以及性能调优策略,旨在为开发者与系统管理员提供一份实用的技术指南。 ####
52 4
|
18天前
|
缓存 算法 Linux
深入理解Linux内核调度器:公平性与性能的平衡####
真知灼见 本文将带你深入了解Linux操作系统的核心组件之一——完全公平调度器(CFS),通过剖析其设计原理、工作机制以及在实际系统中的应用效果,揭示它是如何在众多进程间实现资源分配的公平性与高效性的。不同于传统的摘要概述,本文旨在通过直观且富有洞察力的视角,让读者仿佛亲身体验到CFS在复杂系统环境中游刃有余地进行任务调度的过程。 ####
39 6
|
3天前
|
缓存 网络协议 Linux
深入探索Linux操作系统的内核优化策略####
本文旨在探讨Linux操作系统内核的优化方法,通过分析当前主流的几种内核优化技术,结合具体案例,阐述如何有效提升系统性能与稳定性。文章首先概述了Linux内核的基本结构,随后详细解析了内核优化的必要性及常用手段,包括编译优化、内核参数调整、内存管理优化等,最后通过实例展示了这些优化技巧在实际场景中的应用效果,为读者提供了一套实用的Linux内核优化指南。 ####
13 1
|
9天前
|
算法 Linux 开发者
Linux内核中的锁机制:保障并发控制的艺术####
本文深入探讨了Linux操作系统内核中实现的多种锁机制,包括自旋锁、互斥锁、读写锁等,旨在揭示这些同步原语如何高效地解决资源竞争问题,保证系统的稳定性和性能。通过分析不同锁机制的工作原理及应用场景,本文为开发者提供了在高并发环境下进行有效并发控制的实用指南。 ####
|
17天前
|
缓存 资源调度 安全
深入探索Linux操作系统的心脏——内核配置与优化####
本文作为一篇技术性深度解析文章,旨在引领读者踏上一场揭秘Linux内核配置与优化的奇妙之旅。不同于传统的摘要概述,本文将以实战为导向,直接跳入核心内容,探讨如何通过精细调整内核参数来提升系统性能、增强安全性及实现资源高效利用。从基础概念到高级技巧,逐步揭示那些隐藏在命令行背后的强大功能,为系统管理员和高级用户打开一扇通往极致性能与定制化体验的大门。 --- ###
46 9
|
16天前
|
缓存 负载均衡 Linux
深入理解Linux内核调度器
本文探讨了Linux操作系统核心组件之一——内核调度器的工作原理和设计哲学。不同于常规的技术文章,本摘要旨在提供一种全新的视角来审视Linux内核的调度机制,通过分析其对系统性能的影响以及在多核处理器环境下的表现,揭示调度器如何平衡公平性和效率。文章进一步讨论了完全公平调度器(CFS)的设计细节,包括它如何处理不同优先级的任务、如何进行负载均衡以及它是如何适应现代多核架构的挑战。此外,本文还简要概述了Linux调度器的未来发展方向,包括对实时任务支持的改进和对异构计算环境的适应性。
37 6
|
17天前
|
缓存 Linux 开发者
Linux内核中的并发控制机制:深入理解与应用####
【10月更文挑战第21天】 本文旨在为读者提供一个全面的指南,探讨Linux操作系统中用于实现多线程和进程间同步的关键技术——并发控制机制。通过剖析互斥锁、自旋锁、读写锁等核心概念及其在实际场景中的应用,本文将帮助开发者更好地理解和运用这些工具来构建高效且稳定的应用程序。 ####
35 5
|
17天前
|
算法 Unix Linux
深入理解Linux内核调度器:原理与优化
本文探讨了Linux操作系统的心脏——内核调度器(Scheduler)的工作原理,以及如何通过参数调整和代码优化来提高系统性能。不同于常规摘要仅概述内容,本摘要旨在激发读者对Linux内核调度机制深层次运作的兴趣,并简要介绍文章将覆盖的关键话题,如调度算法、实时性增强及节能策略等。
|
18天前
|
存储 监控 安全
Linux内核调优的艺术:从基础到高级###
本文深入探讨了Linux操作系统的心脏——内核的调优方法。文章首先概述了Linux内核的基本结构与工作原理,随后详细阐述了内核调优的重要性及基本原则。通过具体的参数调整示例(如sysctl、/proc/sys目录中的设置),文章展示了如何根据实际应用场景优化系统性能,包括提升CPU利用率、内存管理效率以及I/O性能等关键方面。最后,介绍了一些高级工具和技术,如perf、eBPF和SystemTap,用于更深层次的性能分析和问题定位。本文旨在为系统管理员和高级用户提供实用的内核调优策略,以最大化Linux系统的效率和稳定性。 ###
|
17天前
|
Java Linux Android开发
深入探索Android系统架构:从Linux内核到应用层
本文将带领读者深入了解Android操作系统的复杂架构,从其基于Linux的内核到丰富多彩的应用层。我们将探讨Android的各个关键组件,包括硬件抽象层(HAL)、运行时环境、以及核心库等,揭示它们如何协同工作以支持广泛的设备和应用。通过本文,您将对Android系统的工作原理有一个全面的认识,理解其如何平衡开放性与安全性,以及如何在多样化的设备上提供一致的用户体验。