Linux Kernel Development, Part 1: https://developer.aliyun.com/article/1597346
6. Mutexes
(1) Structure
// include/linux/mutex.h

/*
 * Simple, straightforward mutexes with strict semantics:
 *
 * - only one task can hold the mutex at a time
 * - only the owner can unlock the mutex
 * - multiple unlocks are not permitted
 * - recursive locking is not permitted
 * - a mutex object must be initialized via the API
 * - a mutex object must not be initialized via memset or copying
 * - task may not exit with mutex held
 * - memory areas where held locks reside must not be freed
 * - held mutexes must not be reinitialized
 * - mutexes may not be used in hardware or software interrupt
 *   contexts such as tasklets and timers
 *
 * These semantics are fully enforced when DEBUG_MUTEXES is
 * enabled. Furthermore, besides enforcing the above rules, the mutex
 * debugging code also implements a number of additional features
 * that make lock debugging easier and faster:
 *
 * - uses symbolic names of mutexes, whenever they are printed in debug output
 * - point-of-acquire tracking, symbolic lookup of function names
 * - list of all locks held in the system, printout of them
 * - owner tracking
 * - detects self-recursing locks and prints out all relevant info
 * - detects multi-task circular deadlocks and prints out all affected
 *   locks and tasks (and only those tasks)
 */
struct mutex {
    /* 1: unlocked, 0: locked, negative: locked, possible waiters */
    atomic_t            count;
    spinlock_t          wait_lock;
    struct list_head    wait_list;
#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
    struct thread_info  *owner;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
    const char          *name;
    void                *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
    struct lockdep_map  dep_map;
#endif
};
(2) Function overview
The functions are defined in the file include/linux/mutex.h.
(3) Analysis of mutex_lock()
- The mutex_lock() function
// include/linux/mutex.h
extern void mutex_lock(struct mutex *lock);

// kernel/mutex.c
/***
 * mutex_lock - acquire the mutex
 * @lock: the mutex to be acquired
 *
 * Lock the mutex exclusively for this task. If the mutex is not
 * available right now, it will sleep until it can get it.
 *
 * The mutex must later on be released by the same task that
 * acquired it. Recursive locking is not allowed. The task
 * may not exit without first unlocking the mutex. Also, kernel
 * memory where the mutex resides mutex must not be freed with
 * the mutex still locked. The mutex must first be initialized
 * (or statically defined) before it can be locked. memset()-ing
 * the mutex to 0 is not allowed.
 *
 * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
 *   checks that will enforce the restrictions and will also do
 *   deadlock debugging. )
 *
 * This function is similar to (but not equivalent to) down().
 */
void __sched mutex_lock(struct mutex *lock)
{
    might_sleep();
    /*
     * The locking fastpath is the 1->0 transition from
     * 'unlocked' into 'locked' state.
     */
    __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
    mutex_set_owner(lock);
}
EXPORT_SYMBOL(mutex_lock);
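As a usage reference, here is a minimal sketch of the typical pattern: define the mutex statically with DEFINE_MUTEX(), then pair mutex_lock() with mutex_unlock() in the same task. The mutex name and the protected counter below are made-up illustrative names, not kernel code.

// Hypothetical example: protecting a shared counter with a mutex.
// DEFINE_MUTEX(), mutex_lock() and mutex_unlock() are the real interfaces;
// "my_mutex" and "shared_count" are illustrative names only.
#include <linux/mutex.h>

static DEFINE_MUTEX(my_mutex);
static int shared_count;

static void update_count(void)
{
    mutex_lock(&my_mutex);      /* may sleep, so never call from interrupt context */
    shared_count++;
    mutex_unlock(&my_mutex);    /* must be released by the same task that locked it */
}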
(4) Usage comparison
7. Completion Variables
(1) Structure
// include/linux/completion.h
/**
 * struct completion - structure used to maintain state for a "completion"
 *
 * This is the opaque structure used to maintain the state for a "completion".
 * Completions currently use a FIFO to queue threads that have to wait for
 * the "completion" event.
 *
 * See also: complete(), wait_for_completion() (and friends _timeout,
 * _interruptible, _interruptible_timeout, and _killable), init_completion(),
 * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
 * INIT_COMPLETION().
 */
struct completion {
    unsigned int done;
    wait_queue_head_t wait;
};

// include/linux/wait.h
struct __wait_queue_head {
    spinlock_t lock;
    struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;

struct __wait_queue {
    unsigned int flags;
#define WQ_FLAG_EXCLUSIVE   0x01
    void *private;
    wait_queue_func_t func;
    struct list_head task_list;
};
(2) Function overview
The functions are defined in the file include/linux/completion.h.
(3) Analysis of complete()
- The complete() function
// kernel/sched.c
void complete(struct completion *x)
{
    unsigned long flags;

    spin_lock_irqsave(&x->wait.lock, flags);
    x->done++;
    __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
    spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
- The __wake_up_common() function
// kernel/sched.c
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;

        if (curr->func(curr, mode, wake_flags, key) &&
                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}
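To show how the pieces fit together, here is a hedged sketch of the usual pattern: one task blocks in wait_for_completion() while another signals the event with complete(). DECLARE_COMPLETION(), wait_for_completion() and complete() are the real interfaces; the surrounding function names are invented for illustration.

// Hypothetical example: one task waits for work that another task finishes.
#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);

static int waiter_thread(void *unused)
{
    /* sleeps until complete() increments ->done and wakes the queue */
    wait_for_completion(&setup_done);
    /* ... the event has occurred, continue ... */
    return 0;
}

static void finisher(void)
{
    /* ... finish the setup work ... */
    complete(&setup_done);      /* wake up one waiter on the completion */
}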
8. BKL: The Big Kernel Lock
(1) Function overview
The functions are defined in the file include/linux/smp_lock.h.
(2) Analysis of lock_kernel()
- The lock_kernel() function
// include/linux/smp_lock.h
#define lock_kernel() do {                          \
    _lock_kernel(__func__, __FILE__, __LINE__);     \
} while (0)
- The _lock_kernel() function
// lib/kernel_lock.c
void __lockfunc _lock_kernel(const char *func, const char *file, int line)
{
    int depth = current->lock_depth + 1;

    trace_lock_kernel(func, file, line);

    if (likely(!depth)) {
        might_sleep();
        __lock_kernel();
    }
    current->lock_depth = depth;
}
- The __lock_kernel() function
// lib/kernel_lock.c
static inline void __lock_kernel(void)
{
    do_raw_spin_lock(&kernel_flag);
}
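For completeness, a purely illustrative sketch of how legacy code used this interface; the BKL is deprecated (and was later removed entirely), so this is only a reading aid, and the function wrapping the calls is made up.

// Illustrative use of the legacy BKL; lock_kernel()/unlock_kernel() are real,
// the surrounding function is hypothetical.
#include <linux/smp_lock.h>

static void legacy_ioctl_body(void)
{
    lock_kernel();      /* recursive per task; dropped if the task sleeps */
    /* ... touch data still protected only by the BKL ... */
    unlock_kernel();
}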
9. Sequential Locks
(1) Structure
// include/linux/seqlock.h
typedef struct {
    unsigned sequence;
    spinlock_t lock;
} seqlock_t;
(2) Function overview
// include/linux/seqlock.h

/*
 * These macros triggered gcc-3.x compile-time problems. We think these are
 * OK now. Be cautious.
 */
#define __SEQLOCK_UNLOCKED(lockname) \
         { 0, __SPIN_LOCK_UNLOCKED(lockname) }

#define SEQLOCK_UNLOCKED \
         __SEQLOCK_UNLOCKED(old_style_seqlock_init)

#define seqlock_init(x)                 \
    do {                                \
        (x)->sequence = 0;              \
        spin_lock_init(&(x)->lock);     \
    } while (0)

#define DEFINE_SEQLOCK(x) \
        seqlock_t x = __SEQLOCK_UNLOCKED(x)

/* Lock out other writers and update the count.
 * Acts like a normal spin_lock/unlock.
 * Don't need preempt_disable() because that is in the spin_lock already.
 */
static inline void write_seqlock(seqlock_t *sl)
{
    spin_lock(&sl->lock);
    ++sl->sequence;
    smp_wmb();
}

static inline void write_sequnlock(seqlock_t *sl)
{
    smp_wmb();
    sl->sequence++;
    spin_unlock(&sl->lock);
}

static inline int write_tryseqlock(seqlock_t *sl)
{
    int ret = spin_trylock(&sl->lock);

    if (ret) {
        ++sl->sequence;
        smp_wmb();
    }
    return ret;
}

/* Start of read calculation -- fetch last complete writer token */
static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
{
    unsigned ret;

repeat:
    ret = sl->sequence;
    smp_rmb();
    if (unlikely(ret & 1)) {
        cpu_relax();
        goto repeat;
    }

    return ret;
}

/*
 * Test if reader processed invalid data.
 *
 * If sequence value changed then writer changed data while in section.
 */
static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start)
{
    smp_rmb();

    return (sl->sequence != start);
}

/*
 * Version using sequence counter only.
 * This can be used when code has its own mutex protecting the
 * updating starting before the write_seqcount_beqin() and ending
 * after the write_seqcount_end().
 */
typedef struct seqcount {
    unsigned sequence;
} seqcount_t;

#define SEQCNT_ZERO { 0 }
#define seqcount_init(x)    do { *(x) = (seqcount_t) SEQCNT_ZERO; } while (0)

/* Start of read using pointer to a sequence counter only.  */
static inline unsigned read_seqcount_begin(const seqcount_t *s)
{
    unsigned ret;

repeat:
    ret = s->sequence;
    smp_rmb();
    if (unlikely(ret & 1)) {
        cpu_relax();
        goto repeat;
    }
    return ret;
}

/*
 * Test if reader processed invalid data because sequence number has changed.
 */
static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
{
    smp_rmb();

    return s->sequence != start;
}

/*
 * Sequence counter only version assumes that callers are using their
 * own mutexing.
 */
static inline void write_seqcount_begin(seqcount_t *s)
{
    s->sequence++;
    smp_wmb();
}

static inline void write_seqcount_end(seqcount_t *s)
{
    smp_wmb();
    s->sequence++;
}

/*
 * Possible sw/hw IRQ protected versions of the interfaces.
 */
#define write_seqlock_irqsave(lock, flags)              \
    do { local_irq_save(flags); write_seqlock(lock); } while (0)
#define write_seqlock_irq(lock)                         \
    do { local_irq_disable();   write_seqlock(lock); } while (0)
#define write_seqlock_bh(lock)                          \
    do { local_bh_disable();    write_seqlock(lock); } while (0)

#define write_sequnlock_irqrestore(lock, flags)         \
    do { write_sequnlock(lock); local_irq_restore(flags); } while(0)
#define write_sequnlock_irq(lock)                       \
    do { write_sequnlock(lock); local_irq_enable(); } while(0)
#define write_sequnlock_bh(lock)                        \
    do { write_sequnlock(lock); local_bh_enable(); } while(0)

#define read_seqbegin_irqsave(lock, flags)              \
    ({ local_irq_save(flags);   read_seqbegin(lock); })

#define read_seqretry_irqrestore(lock, iv, flags)       \
    ({                                                  \
        int ret = read_seqretry(lock, iv);              \
        local_irq_restore(flags);                       \
        ret;                                            \
    })
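A typical reader/writer pairing looks like the sketch below, modelled loosely on how the kernel protects jiffies_64. The seqlock name and the protected variable are illustrative assumptions, not kernel code.

// Hypothetical reader/writer pair protected by a seqlock.
#include <linux/seqlock.h>

static DEFINE_SEQLOCK(my_seqlock);
static u64 my_time;

static void writer_update(u64 now)
{
    write_seqlock(&my_seqlock);     /* exclusive among writers, sequence becomes odd */
    my_time = now;
    write_sequnlock(&my_seqlock);   /* sequence becomes even again */
}

static u64 reader_get(void)
{
    u64 val;
    unsigned seq;

    do {
        seq = read_seqbegin(&my_seqlock);   /* snapshot an even sequence value */
        val = my_time;
    } while (read_seqretry(&my_seqlock, seq)); /* retry if a writer ran meanwhile */

    return val;
}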
10. Preemption Disabling
11. Ordering and Barriers
VIII. Timers and Time Management
1. The Tick Rate: HZ
// include/linux/param.h
#ifndef _LINUX_PARAM_H
#define _LINUX_PARAM_H

#include <asm/param.h>

#endif

// arch/x86/include/asm/param.h
#include <asm-generic/param.h>

// include/asm-generic/param.h
#ifdef __KERNEL__
# define HZ             CONFIG_HZ   /* Internal kernel timer frequency */
# define USER_HZ        100         /* some user interfaces are */
# define CLOCKS_PER_SEC (USER_HZ)   /* in "ticks" like times() */
#endif

#ifndef HZ
#define HZ 100
#endif
2. jiffies
(1) Internal representation of jiffies
// include/linux/jiffies.h
extern u64 __jiffy_data jiffies_64;
extern unsigned long volatile __jiffy_data jiffies;

#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void);
#else
static inline u64 get_jiffies_64(void)
{
    return (u64)jiffies;
}
#endif
The jiffies variable is an unsigned long, so it is 32 bits wide on 32-bit architectures and 64 bits wide on 64-bit architectures.
(2) Jiffies wraparound
#define time_after(a,b)     \
    (typecheck(unsigned long, a) && \
     typecheck(unsigned long, b) && \
     ((long)(b) - (long)(a) < 0))
#define time_before(a,b)    time_after(b,a)

#define time_after_eq(a,b)  \
    (typecheck(unsigned long, a) && \
     typecheck(unsigned long, b) && \
     ((long)(a) - (long)(b) >= 0))
#define time_before_eq(a,b) time_after_eq(b,a)

/*
 * Calculate whether a is in the range of [b, c].
 */
#define time_in_range(a,b,c) \
    (time_after_eq(a,b) && \
     time_before_eq(a,c))

/*
 * Calculate whether a is in the range of [b, c).
 */
#define time_in_range_open(a,b,c) \
    (time_after_eq(a,b) && \
     time_before(a,c))

/* Same as above, but does so with platform independent 64bit types.
 * These must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64() */
#define time_after64(a,b)   \
    (typecheck(__u64, a) && \
     typecheck(__u64, b) && \
     ((__s64)(b) - (__s64)(a) < 0))
#define time_before64(a,b)  time_after64(b,a)

#define time_after_eq64(a,b)    \
    (typecheck(__u64, a) && \
     typecheck(__u64, b) && \
     ((__s64)(a) - (__s64)(b) >= 0))
#define time_before_eq64(a,b)   time_after_eq64(b,a)

/*
 * These four macros compare jiffies and 'a' for convenience.
 */

/* time_is_before_jiffies(a) return true if a is before jiffies */
#define time_is_before_jiffies(a) time_after(jiffies, a)

/* time_is_after_jiffies(a) return true if a is after jiffies */
#define time_is_after_jiffies(a) time_before(jiffies, a)

/* time_is_before_eq_jiffies(a) return true if a is before or equal to jiffies*/
#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)

/* time_is_after_eq_jiffies(a) return true if a is after or equal to jiffies*/
#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)

/*
 * Have the 32 bit jiffies value wrap 5 minutes after boot
 * so jiffies wrap bugs show up earlier.
 */
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
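These macros exist because a naive "jiffies >= timeout" comparison gives the wrong answer once the counter wraps back to zero. A small sketch of the intended usage, with an illustrative local variable:

/* Wraparound-safe timeout check; "timeout" is an illustrative local. */
unsigned long timeout = jiffies + HZ / 2;   /* half a second from now */

/* ... do some work ... */

/* correct even if jiffies has wrapped in the meantime */
if (time_after(jiffies, timeout)) {
    /* the deadline has passed */
} else {
    /* still within the deadline */
}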
(3) User space and HZ
In kernels prior to 2.6, changing the value of HZ could produce anomalous results for some user-space programs. The kernel exports timing values to user space in units of ticks per second, and because that interface had been stable for a long time, applications came to depend on the particular value of HZ. Changing HZ in the kernel therefore breaks the constant that user space relies on, since user space knows nothing about the new HZ value: a program might conclude, for example, that the system has been up for 20 hours when it has actually been up for only two.
To avoid this kind of error, the kernel must scale all exported jiffies values. It therefore defines USER_HZ to represent the HZ value that user space sees. On x86, because HZ was historically 100, USER_HZ is defined as 100. The kernel can use the function jiffies_to_clock_t() to convert a tick count expressed in terms of HZ into a tick count expressed in terms of USER_HZ.
// kernel/time.c
clock_t jiffies_to_clock_t(long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
    return x * (USER_HZ / HZ);
# else
    return x / (HZ / USER_HZ);
# endif
#else
    return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
#endif
}
EXPORT_SYMBOL(jiffies_to_clock_t);

// include/linux/types.h
typedef __kernel_clock_t    clock_t;

// arch/x86/include/asm/posix_types_64.h
typedef long    __kernel_clock_t;
3. Timers
// include/linux/timer.h
struct timer_list {
    struct list_head entry;             /* entry in the timer list */
    unsigned long expires;              /* expiration value, in jiffies */
    void (*function)(unsigned long);    /* timer handler function */
    unsigned long data;                 /* lone unsigned long argument passed to the handler */
    struct tvec_base *base;             /* internal timer field, do not touch */
#ifdef CONFIG_TIMER_STATS
    void *start_site;
    char start_comm[16];
    int start_pid;
#endif
#ifdef CONFIG_LOCKDEP
    struct lockdep_map lockdep_map;
#endif
};
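As a reading aid, here is a minimal sketch of the usual dynamic-timer life cycle using setup_timer(), mod_timer() and del_timer_sync(); the handler and the timer name are illustrative assumptions.

// Hypothetical dynamic-timer example.
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_timer;

static void my_timer_handler(unsigned long data)
{
    /* runs in softirq context when the timer expires */
    pr_info("timer fired, data=%lu\n", data);
}

static void start_my_timer(void)
{
    setup_timer(&my_timer, my_timer_handler, 0);
    mod_timer(&my_timer, jiffies + 2 * HZ);     /* expire about two seconds from now */
}

static void stop_my_timer(void)
{
    del_timer_sync(&my_timer);  /* also waits for a running handler to finish */
}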
4. Delaying Execution
(1) Busy looping
unsigned long timeout = jiffies + 10;

while (time_before(jiffies, timeout))
    ;
unsigned long timeout = jiffies + 5 * HZ;

while (time_before(jiffies, timeout))
    cond_resched();
(2) Small delays
(a) Function overview
void udelay(unsigned long usecs);
void ndelay(unsigned long nsecs);
void mdelay(unsigned long msecs);
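These three busy-wait rather than sleep, so they are only appropriate for very short waits. A tiny illustrative sketch:

/* Illustrative short delays; udelay()/mdelay() busy-wait and do not sleep. */
udelay(150);    /* e.g. wait 150 microseconds for hypothetical hardware to settle */
mdelay(2);      /* 2 ms busy-wait; prefer msleep(2) when sleeping is allowed */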
(b) Analysis of mdelay()
- The mdelay() function
// include/linux/delay.h
#ifndef mdelay
#define mdelay(n) (\
    (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
    ({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
#endif

#ifndef ndelay
static inline void ndelay(unsigned long x)
{
    udelay(DIV_ROUND_UP(x, 1000));
}
#define ndelay(x) ndelay(x)
#endif

extern unsigned long lpj_fine;
void calibrate_delay(void);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);

static inline void ssleep(unsigned int seconds)
{
    msleep(seconds * 1000);
}
- The udelay() function
// arch/x86/include/asm/delay.h

/* 0x10c7 is 2**32 / 1000000 (rounded up) */
#define udelay(n) (__builtin_constant_p(n) ? \
    ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
    __udelay(n))

/* 0x5 is 2**32 / 1000000000 (rounded up) */
#define ndelay(n) (__builtin_constant_p(n) ? \
    ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
    __ndelay(n))
- The __const_udelay() function
// arch/x86/lib/delay.c
inline void __const_udelay(unsigned long xloops)
{
    int d0;

    xloops *= 4;
    asm("mull %%edx"
        :"=d" (xloops), "=&a" (d0)
        :"1" (xloops), "0"
        (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));

    __delay(++xloops);
}
EXPORT_SYMBOL(__const_udelay);

void __udelay(unsigned long usecs)
{
    __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
    __const_udelay(nsecs * 0x00005);    /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);
(3) The schedule_timeout() function
// kernel/timer.c
signed long __sched schedule_timeout(signed long timeout)
{
    struct timer_list timer;
    unsigned long expire;

    switch (timeout)
    {
    case MAX_SCHEDULE_TIMEOUT:
        /*
         * These two special cases are useful to be comfortable
         * in the caller. Nothing more. We could take
         * MAX_SCHEDULE_TIMEOUT from one of the negative value
         * but I' d like to return a valid offset (>=0) to allow
         * the caller to do everything it want with the retval.
         */
        schedule();
        goto out;
    default:
        /*
         * Another bit of PARANOID. Note that the retval will be
         * 0 since no piece of kernel is supposed to do a check
         * for a negative retval of schedule_timeout() (since it
         * should never happens anyway). You just have the printk()
         * that will tell you if something is gone wrong and where.
         */
        if (timeout < 0) {
            printk(KERN_ERR "schedule_timeout: wrong timeout "
                "value %lx\n", timeout);
            dump_stack();
            current->state = TASK_RUNNING;
            goto out;
        }
    }

    expire = timeout + jiffies;

    setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
    __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
    schedule();
    del_singleshot_timer_sync(&timer);

    /* Remove the timer from the object tracker */
    destroy_timer_on_stack(&timer);

    timeout = expire - jiffies;

 out:
    return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
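The caller is expected to put itself into a sleeping state before calling schedule_timeout(); otherwise the task does not actually sleep. A minimal sketch of the common idiom:

/* Sleep for about five seconds; requires <linux/sched.h>. */
set_current_state(TASK_INTERRUPTIBLE);  /* or TASK_UNINTERRUPTIBLE to ignore signals */
schedule_timeout(5 * HZ);               /* returns early if a signal wakes the task */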
IX. Memory Management
1. Pages
// include/linux/mm_types.h
struct page {
    unsigned long flags;        /* Atomic flags, some possibly
                                 * updated asynchronously */
    atomic_t _count;            /* Usage count, see below. */
    union {
        atomic_t _mapcount;     /* Count of ptes mapped in mms,
                                 * to show when page is mapped
                                 * & limit reverse map searches.
                                 */
        struct {                /* SLUB */
            u16 inuse;
            u16 objects;
        };
    };
    union {
        struct {
            unsigned long private;          /* Mapping-private opaque data:
                                             * usually used for buffer_heads
                                             * if PagePrivate set; used for
                                             * swp_entry_t if PageSwapCache;
                                             * indicates order in the buddy
                                             * system if PG_buddy is set.
                                             */
            struct address_space *mapping;  /* If low bit clear, points to
                                             * inode address_space, or NULL.
                                             * If page mapped as anonymous
                                             * memory, low bit is set, and
                                             * it points to anon_vma object:
                                             * see PAGE_MAPPING_ANON below.
                                             */
        };
#if USE_SPLIT_PTLOCKS
        spinlock_t ptl;
#endif
        struct kmem_cache *slab;    /* SLUB: Pointer to slab */
        struct page *first_page;    /* Compound tail pages */
    };
    union {
        pgoff_t index;      /* Our offset within mapping. */
        void *freelist;     /* SLUB: freelist req. slab lock */
    };
    struct list_head lru;   /* Pageout list, eg. active_list
                             * protected by zone->lru_lock !
                             */
    /*
     * On machines where all RAM is mapped into kernel address space,
     * we can simply calculate the virtual address. On machines with
     * highmem some memory is mapped into kernel virtual memory
     * dynamically, so we need a place to store that address.
     * Note that this field could be 16 bits on x86 ... ;)
     *
     * Architectures with slow multiplication can define
     * WANT_PAGE_VIRTUAL in asm/page.h
     */
#if defined(WANT_PAGE_VIRTUAL)
    void *virtual;          /* Kernel virtual address (NULL if
                               not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
    unsigned long debug_flags;  /* Use atomic bitops on this */
#endif
#ifdef CONFIG_KMEMCHECK
    /*
     * kmemcheck wants to track the status of each byte in a page; this
     * is a pointer to such a status block. NULL if not tracked.
     */
    void *shadow;
#endif
};
2. Zones
3. Page Operations
The functions are defined in the file include/linux/gfp.h.
(1) Getting pages
(2) Freeing pages
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
extern void free_hot_cold_page(struct page *page, int cold);

#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)
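Putting the allocation and release sides together, here is a hedged sketch that grabs four physically contiguous pages and later returns them. alloc_pages(), page_address() and __free_pages() are the real interfaces; the wrapper functions and variables are illustrative.

// Hypothetical example: allocate and free 2^2 = 4 contiguous pages.
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/errno.h>

static struct page *pages;
static void *buf;

static int grab_pages(void)
{
    pages = alloc_pages(GFP_KERNEL, 2);     /* order 2: four contiguous pages */
    if (!pages)
        return -ENOMEM;
    buf = page_address(pages);  /* logical address; not valid for highmem pages */
    return 0;
}

static void drop_pages(void)
{
    __free_pages(pages, 2);     /* the order must match the allocation */
}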
4. The kmalloc() Function
The kmalloc() function is very similar to the user-space malloc() family, except that it takes an extra flags parameter. It returns a chunk of kernel memory measured in bytes.
// include/linux/slab.h
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
#include <linux/slab_def.h>
#endif

static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
{
    if (size != 0 && n > ULONG_MAX / size)
        return NULL;
    return __kmalloc(n * size, flags | __GFP_ZERO);
}
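Before looking at the flags, here is a minimal sketch of the usual kmalloc()/kfree() pairing with GFP_KERNEL; the structure and function names are illustrative assumptions.

// Hypothetical kmalloc()/kfree() pairing; GFP_KERNEL may sleep, so use
// GFP_ATOMIC instead when allocating from interrupt context.
#include <linux/slab.h>

struct my_record {
    unsigned long id;
    char name[32];
};

static struct my_record *make_record(void)
{
    struct my_record *rec;

    rec = kmalloc(sizeof(*rec), GFP_KERNEL);
    if (!rec)
        return NULL;    /* always check for allocation failure */
    /* ... initialize rec ... */
    return rec;
}

static void destroy_record(struct my_record *rec)
{
    kfree(rec);
}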
(1) The gfp_t flags
- Action modifiers
- Zone modifiers
- Type flags
(2) Analysis of kmalloc()
- The kmalloc() function
// include/linux/slab_def.h
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
    struct kmem_cache *cachep;
    void *ret;

    if (__builtin_constant_p(size)) {
        int i = 0;

        if (!size)
            return ZERO_SIZE_PTR;

#define CACHE(x) \
        if (size <= x) \
            goto found; \
        else \
            i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
        return NULL;
found:
#ifdef CONFIG_ZONE_DMA
        if (flags & GFP_DMA)
            cachep = malloc_sizes[i].cs_dmacachep;
        else
#endif
            cachep = malloc_sizes[i].cs_cachep;

        ret = kmem_cache_alloc_notrace(cachep, flags);

        trace_kmalloc(_THIS_IP_, ret,
                      size, slab_buffer_size(cachep), flags);

        return ret;
    }
    return __kmalloc(size, flags);
}
- The __kmalloc() function
// mm/slab.c
void *__kmalloc(size_t size, gfp_t flags)
{
    return __do_kmalloc(size, flags, NULL);
}
EXPORT_SYMBOL(__kmalloc);
- The __do_kmalloc() function
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
                                          void *caller)
{
    struct kmem_cache *cachep;
    void *ret;

    /* If you want to save a few bytes .text space: replace
     * __ with kmem_.
     * Then kmalloc uses the uninlined functions instead of the inline
     * functions.
     */
    cachep = __find_general_cachep(size, flags);
    if (unlikely(ZERO_OR_NULL_PTR(cachep)))
        return cachep;
    ret = __cache_alloc(cachep, flags, caller);

    trace_kmalloc((unsigned long) caller, ret,
                  size, cachep->buffer_size, flags);

    return ret;
}
- The __cache_alloc() function
static __always_inline void *
__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
{
    unsigned long save_flags;
    void *objp;

    flags &= gfp_allowed_mask;

    lockdep_trace_alloc(flags);

    if (slab_should_failslab(cachep, flags))
        return NULL;

    cache_alloc_debugcheck_before(cachep, flags);
    local_irq_save(save_flags);
    objp = __do_cache_alloc(cachep, flags);
    local_irq_restore(save_flags);
    objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
    kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
                             flags);
    prefetchw(objp);

    if (likely(objp))
        kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));

    if (unlikely((flags & __GFP_ZERO) && objp))
        memset(objp, 0, obj_size(cachep));

    return objp;
}
Linux Kernel Development, Part 3: https://developer.aliyun.com/article/1597350