为什么通过clear_refs可以使进程触发缺页?

简介: 为什么通过clear_refs可以使进程触发缺页?

平台

ARM64

Linux 6.10

作者

pengdonglin137@163.com

背景

最近在学习Linux的缺页异常时突然奇想,在不进行内存换出的情况下,如何让进程再次触发缺页?

基于对ARMv8的理解,它的MMU的页表项中有个AF位,当AF为0时,当访问到对应的虚拟页时,会触发缺页。

如果AF位为0,当访问到对应的虚拟页时,会触发MMU的Access flags fault。然后软件需要将这个AF位置1,之后再次访问时就不会触发这个异常了,而Linux中会使用下面的接口来清除和设置AF位:

// 清除
pmdp_test_and_clear_young
ptep_test_and_clear_young
// 设置
pte_mkyoung

ptep_test_and_clear_young为例:

static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
              unsigned long address,
              pte_t *ptep)
{
  pte_t pte = ptep_get(ptep);
  int r = 1;
  if (!pte_young(pte))
    r = 0;
  else
    set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte));
  return r;
}
#define pte_young(pte)    (!!(pte_val(pte) & PTE_AF))
static inline pte_t pte_mkold(pte_t pte)
{
  return clear_pte_bit(pte, __pgprot(PTE_AF));
}

这个接口用于清除PTE页表项的AF位,当再次访问时,会在缺页处理中设置AF位:

static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
  pte_t entry;
  if (unlikely(pmd_none(*vmf->pmd))) {
    /*
     * Leave __pte_alloc() until later: because vm_ops->fault may
     * want to allocate huge page, and if we expose page table
     * for an instant, it will be difficult to retract from
     * concurrent faults and from rmap lookups.
     */
    vmf->pte = NULL;
    vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
  } else {
    /*
     * A regular pmd is established and it can't morph into a huge
     * pmd by anon khugepaged, since that takes mmap_lock in write
     * mode; but shmem or file collapse to THP could still morph
     * it into a huge pmd: just retry later if so.
     */
    vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
             vmf->address, &vmf->ptl);
    if (unlikely(!vmf->pte))
      return 0;
    vmf->orig_pte = ptep_get_lockless(vmf->pte);
    vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
    if (pte_none(vmf->orig_pte)) {
      pte_unmap(vmf->pte);
      vmf->pte = NULL;
    }
  }
  // 如果还没有映射物理页,其中在填充页表的时候会设置AF位,可以参考vm_get_page_prot
  if (!vmf->pte)
    return do_pte_missing(vmf);
  // 如果已经被交换出去
  if (!pte_present(vmf->orig_pte))
    return do_swap_page(vmf);
  // 用于执行NUMA平衡,实现内存迁移。它会周期地把部分虚拟页对应PTE设置位PROT_NONE,读和写都会触发异常
  // 然后在处理缺页的时候处理内存迁移
  if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
    return do_numa_page(vmf);
  spin_lock(vmf->ptl);
  entry = vmf->orig_pte;
  // 通过其他路径已经设置了页表项
  if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
    update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
    goto unlock;
  }
  if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
    if (!pte_write(entry)) // 写时复制
      return do_wp_page(vmf);
    else if (likely(vmf->flags & FAULT_FLAG_WRITE))
      entry = pte_mkdirty(entry);
  }
  // 对于AF位触发的缺页,上面的条件不会满足,会走这里,设置AF位
  entry = pte_mkyoung(entry);
  if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
        vmf->flags & FAULT_FLAG_WRITE)) {
    update_mmu_cache_range(vmf, vmf->vma, vmf->address,
        vmf->pte, 1);
  } else {
    /* Skip spurious TLB flush for retried page fault */
    if (vmf->flags & FAULT_FLAG_TRIED)
      goto unlock;
    /*
     * This is needed only for protection faults but the arch code
     * is not yet telling us if this is a protection fault or not.
     * This still avoids useless tlb flushes for .text page faults
     * with threads.
     */
    if (vmf->flags & FAULT_FLAG_WRITE)
      flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
                 vmf->pte);
  }
unlock:
  pte_unmap_unlock(vmf->pte, vmf->ptl);
  return 0;
}

而clear_refs的实现就利用了这一点,这里是关于这个节点的用法:/proc/pid/clear_refs

# 清除进程所有虚拟区域的 Access/PG_reference
# DEFINE: CLEAR_REFS_ALL 1
echo 1 > /proc/PID/clear_refs
# 清除进程所有匿名映射区域的 Access/PG_reference
# DEFINE: CLEAR_REFS_ANON 2
echo 2 > /proc/PID/clear_refs
# 清除进程所有文件映射区域的 Access/PG_reference
# DEFINE: CLEAR_REFS_MAPPED 3
echo 3 > /proc/PID/clear_refs
# 清除进程所有软脏页标志
# DEFINE: CLEAR_REFS_SOFT_DIRTY 4
echo 4 > /proc/PID/clear_refs
# 重置进程的 Hiwater_rss
# DEFINE: CLEAR_REFS_MM_HIWATER_RSS 5
echo 5 > /proc/PID/clear_refs

实现

当向clear_refs写入数值时,函数clear_refs_write被回调,这个函数中会调用:

walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);

其中walk_page_range负责遍历页表,在遍历的过程中会回调clear_refs_walk_ops中的函数:

static const struct mm_walk_ops clear_refs_walk_ops = {
  .pmd_entry    = clear_refs_pte_range,
  .test_walk    = clear_refs_test_walk,
  .walk_lock    = PGWALK_WRLOCK,
};
  • test_walk回调:用于判断是否跳过当前vma,返回0表示需要遍历当前vma,返回-1表示结束遍历,返回1表示跳过当前vma
  • pmd_entry回调:处理一个非空的PMD entry

先看一下如何判断是否遍历当前vma的实现:

static int clear_refs_test_walk(unsigned long start, unsigned long end,
        struct mm_walk *walk)
{
  struct clear_refs_private *cp = walk->private;
  struct vm_area_struct *vma = walk->vma;
  // 不是通过struct page来映射的
  if (vma->vm_flags & VM_PFNMAP)
    return 1;
  /*
   * Writing 1 to /proc/pid/clear_refs affects all pages.
   * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
   * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
   * Writing 4 to /proc/pid/clear_refs affects all pages.
   */
  // 如果要清除的是匿名页,但是当前vma映射到的是文件,那么跳过当前vma
  if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
    return 1;
  // 如果要清除的是文件页,但是当前vma是匿名的,那么跳过当前vma
  if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
    return 1;
  // 处理当前vma
  return 0;
}

接下来看看如何清除页表项的AF位:

static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
        unsigned long end, struct mm_walk *walk)
{
  struct clear_refs_private *cp = walk->private;
  struct vm_area_struct *vma = walk->vma;
  pte_t *pte, ptent;
  spinlock_t *ptl;
  struct folio *folio;
  ptl = pmd_trans_huge_lock(pmd, vma);
  if (ptl) {  // 如果是PMD映射的巨型页
    if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
      clear_soft_dirty_pmd(vma, addr, pmd);
      goto out;
    }
    if (!pmd_present(*pmd))  // 如果被swap出去了,跳过
      goto out;
    folio = pmd_folio(*pmd);
    /* Clear accessed and referenced bits. */
    pmdp_test_and_clear_young(vma, addr, pmd);  // 清除PMD页表项的AF位
    folio_test_clear_young(folio);
    folio_clear_referenced(folio);
out:
    spin_unlock(ptl);
    return 0;
  }
  pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  if (!pte) {
    walk->action = ACTION_AGAIN;
    return 0;
  }
  for (; addr != end; pte++, addr += PAGE_SIZE) {
    ptent = ptep_get(pte);
    if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
      clear_soft_dirty(vma, addr, pte);
      continue;
    }
    if (!pte_present(ptent))  // 如果被swap出去了,跳过
      continue;
    folio = vm_normal_folio(vma, addr, ptent);
    if (!folio)
      continue;
    /* Clear accessed and referenced bits. */
    ptep_test_and_clear_young(vma, addr, pte);  // 清除PTE页表项的AF位
    folio_test_clear_young(folio);
    folio_clear_referenced(folio);
  }
  pte_unmap_unlock(pte - 1, ptl);
  cond_resched();
  return 0;
}

实验

下面通过实验来观察和学习:

思路是:

  • 进程通过malloc申请一块匿名内存,然后通过memset或者mlock等接口事先分配好物理页。接着反复去访问这段内存
  • 通过向clear_refs写入2来清除匿名页的AF位
  • 通过各种工具来观察缺页

测试程序

leak2.c

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/mman.h>
char *addr;
// 16MB
int size = 0x1000*0x1000;
int func3(void)
{
static int i = 0;
char *access;
int ret = 0;
printf("%s enter.\n", __func__);
        access = addr + 0x1000*i;
printf("p: %d, s: %p, e: %p, %s access addr: %p\n",
                getpid(), addr, addr + size,
                i&0x1 ? "write" : "read",
                access);
if (i & 0x1)
                *access = 0x5a;  // 触发写访问缺页
else
                ret = *access;   // 触发读访问缺页
        sleep(1);
        i++;
if (i >= 0x1000)
                i = 0;
return ret;
}
int func2(void)
{
printf("%s enter.\n", __func__);
return func3();
}
int  func1(void)
{
printf("%s enter.\n", __func__);
return func2();
}
int main(void)
{
int ret;
printf("%s enter.\n", __func__);
        addr = malloc(size);
if (!addr) {
printf("alloc buf failed\n");
return -1;
        }
/*
          为了测试方便,使更容易观察到缺页,不使用THP,即不使用透明巨型页映射。
          需要注意的是,不能把将THP的策略配置为always,否则总是会按照2MB的巨型页去映射
          root@arm64:/sys/kernel/mm/transparent_hugepage# cat enabled
          always [madvise] never
         */
        ret = madvise((void *)((unsigned long)addr & ~(0x1000 - 1)), size, MADV_NOHUGEPAGE);
if (ret < 0) {
                perror("set nohugepage failed");
return -1;
        }
// 这个区域如果发生缺页的话,一次只映射一个page,由于下面用了mlockall,这步可以不做
        ret = madvise((void *)((unsigned long)addr & ~(0x1000 - 1)), size, MADV_RANDOM);
if (ret < 0) {
                perror("set random failed\n");
return -1;
        }
// 预先给这片区域映射物理页
// memset(addr, 0, size);
if (mlockall(MCL_CURRENT | MCL_FUTURE) < 0) {
                perror("mlockall failed");
return -1;
        }
while (1)
                func1();
return 0;
}

开始运行后,可以看到如下日志:

func1 enter.
func2 enter.
func3 enter.
p: 2058, s: 0xffff9a600010, e: 0xffff9b600010,  write access addr: 0xffff9a611010
func1 enter.
func2 enter.
func3 enter.
p: 2058, s: 0xffff9a600010, e: 0xffff9b600010,   read access addr: 0xffff9a612010

查看映射

root@arm64:/sys/kernel/mm/transparent_hugepage# pmap -x `pidof leak2`
2058:   ./leak2
Address           Kbytes     RSS   Dirty Mode  Mapping
0000aaaab9690000       4       4       0 r-x-- leak2
0000aaaab96a1000       4       4       4 r---- leak2
0000aaaab96a2000       4       4       4 rw--- leak2
0000aaaae9238000     132     132     132 rw---   [ anon ]
> 0000ffff9a600000   16384   16384   16384 rw---   [ anon ]
0000ffff9b600000       4       4       4 rw---   [ anon ]
0000ffff9b796000    1388    1388       0 r-x-- libc-2.31.so
0000ffff9b8f1000      60       0       0 ----- libc-2.31.so
0000ffff9b900000      16      16      16 r---- libc-2.31.so
0000ffff9b904000       8       8       8 rw--- libc-2.31.so
0000ffff9b906000      12      12      12 rw---   [ anon ]
0000ffff9b909000     132     132       0 r-x-- ld-2.31.so
0000ffff9b92b000       8       8       8 rw---   [ anon ]
0000ffff9b937000       8       0       0 r----   [ anon ]
0000ffff9b939000       4       4       0 r-x--   [ anon ]
0000ffff9b93a000       4       4       4 r---- ld-2.31.so
0000ffff9b93b000       8       8       8 rw--- ld-2.31.so
0000ffffd0312000     132     132     132 rw---   [ stack ]
---------------- ------- ------- -------
total kB           18312   18244   16716

使用crash的vtop命令确认一下是否为按4KB的物理页映射的:

crash> vtop ffff9a600000
VIRTUAL     PHYSICAL
ffff9a600000  138ae7000
PAGE DIRECTORY: ffff0000d719b000
   PGD: ffff0000d719bff8 => 800000116e70003
   PUD: ffff0000d6e70ff0 => 800000116f6d003
   PMD: ffff0000d6f6d698 => 800000116b4d003
   PTE: ffff0000d6b4d000 => e8000138ae7f43
  PAGE: 138ae7000
     PTE        PHYSICAL   FLAGS
e8000138ae7f43  138ae7000  (VALID|USER|SHARED|AF|NG|PXN|UXN|DIRTY)
      VMA           START       END     FLAGS FILE
ffff0000d739d768 ffff9a600000 ffff9b600000 40112073
      PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
fffffdffc3e2b9c0 138ae7000 ffff0000cc765cc9 ffff9a600  1 bfffe00001d0028 uptodate,lru,mappedtodisk,swapbacked,unevictable,mlocked

上面PTE这行就是虚拟地址ffff9a600000对用的PTE页表项的内容的解析。

上面指示的区域就是malloc申请的16MB的匿名页内存区域,RSS大小也是16MB,意味着这块虚拟内存已经全部映射到了物理页。

缺页次数

top - 14:59:55 up 28 min,  4 users,  load average: 0.08, 0.33, 0.50
Tasks:   1 total,   0 running,   1 sleeping,   0 stopped,   0 zombie
%Cpu(s):  0.0 us,  2.3 sy,  0.0 ni, 97.7 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
MiB Mem :   3658.2 total,   2607.2 free,    612.3 used,    438.7 buff/cache
MiB Swap:      0.0 total,      0.0 free,      0.0 used.   2904.3 avail Mem
nMaj nMin     PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
   0 4271    2058 pengdl    20   0   18312  18100   1460 S   0.0   0.5   0:01.67 leak2

通过top命令统计leak2进程的发生的缺页次数,主要是nMin(次缺页)。

正常情况下,nMin是恒定的,当执行下面的命令后:

echo 2 > /proc/`pidof leak2`/clear_refs

然后可以看到nMin会每秒增加1。用pidstat也可以佐证:

root@arm64:~# pidstat -r 1 -p `pidof leak2`
Linux 6.10.0+ (arm64)   08/23/24        _aarch64_       (4 CPU)
15:16:33      UID       PID  minflt/s  majflt/s     VSZ     RSS   %MEM  Command
15:16:34     1000      2058      0.99      0.00   18312   18100   0.48  leak2
15:16:35     1000      2058      1.00      0.00   18312   18100   0.48  leak2
15:16:36     1000      2058      1.00      0.00   18312   18100   0.48  leak2
15:16:37     1000      2058      0.99      0.00   18312   18100   0.48  leak2
15:16:38     1000      2058      1.00      0.00   18312   18100   0.48  leak2
15:16:39     1000      2058      1.00      0.00   18312   18100   0.48  leak2
15:16:40     1000      2058      1.00      0.00   18312   18100   0.48  leak2

内核是如何统计nMaj和nMin的呢?可以参考mm_account_fault。nMaj表示在处理缺页的时候需要从后备存储(如文件、swap设备、块设备等)读取数据到page,然后进行映射。而nMin表示数据已经在内存里了,只需要修改一下页表映射,相比之下nMin的开销要比nMaj小很多。

上面写完clear_refs,可以用crash再次查看一下第一个虚拟页的PTE映射属性:

VIRTUAL     PHYSICAL
ffff9a600000  138ae7000
PAGE DIRECTORY: ffff0000d719b000
   PGD: ffff0000d719bff8 => 800000116e70003
   PUD: ffff0000d6e70ff0 => 800000116f6d003
   PMD: ffff0000d6f6d698 => 800000116b4d003
   PTE: ffff0000d6b4d000 => e8000138ae7b43
  PAGE: 138ae7000
     PTE        PHYSICAL   FLAGS
e8000138ae7b43  138ae7000  (VALID|USER|SHARED|NG|PXN|UXN|DIRTY)
      VMA           START       END     FLAGS FILE
ffff0000d739d768 ffff9a600000 ffff9b600000 40112073
      PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
fffffdffc3e2b9c0 138ae7000 ffff0000cc765cc9 ffff9a600  1 bfffe00001d0028 uptodate,lru,mappedtodisk,swapbacked,unevictable,mlocked

可以看到,AF位已经已经清除了。

使用perf观察缺页,并且记录调用栈

perf支持缺页事件:

# perf list
...
  major-faults                                       [Software event]
  minor-faults                                       [Software event]
  page-faults OR faults                              [Software event]
...

可以参考内核代码,其实上面两个事件也是在mm_account_fault中进行记录的:

/**
 * mm_account_fault - Do page fault accounting
 * @mm: mm from which memcg should be extracted. It can be NULL.
 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
 *        of perf event counters, but we'll still do the per-task accounting to
 *        the task who triggered this page fault.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * This will take care of most of the page fault accounting.  Meanwhile, it
 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
 * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
 * still be in per-arch page fault handlers at the entry of page fault.
 */
static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
            unsigned long address, unsigned int flags,
            vm_fault_t ret)
{
  bool major;
  /* Incomplete faults will be accounted upon completion. */
  if (ret & VM_FAULT_RETRY)
    return;
  /*
   * To preserve the behavior of older kernels, PGFAULT counters record
   * both successful and failed faults, as opposed to perf counters,
   * which ignore failed cases.
   */
  count_vm_event(PGFAULT);
  count_memcg_event_mm(mm, PGFAULT);
  /*
   * Do not account for unsuccessful faults (e.g. when the address wasn't
   * valid).  That includes arch_vma_access_permitted() failing before
   * reaching here. So this is not a "this many hardware page faults"
   * counter.  We should use the hw profiling for that.
   */
  if (ret & VM_FAULT_ERROR)
    return;
  /*
   * We define the fault as a major fault when the final successful fault
   * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
   * handle it immediately previously).
   */
  major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
  if (major)
    current->maj_flt++;
  else
    current->min_flt++;
  /*
   * If the fault is done for GUP, regs will be NULL.  We only do the
   * accounting for the per thread fault counters who triggered the
   * fault, and we skip the perf event updates.
   */
  if (!regs)
    return;
  if (major)
    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
  else
    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}

执行如下命令:

# perf record -e minor-faults -g -p `pidof leak2`
# perf script
root@arm64:~# perf script
leak2    2058  4140.293988:          1 minor-faults:
            aaaab9690a60 func3+0xe4 (/home/pengdl/demo/kmemleak/leak2)
            aaaab9690ae4 func2+0x20 (/home/pengdl/demo/kmemleak/leak2)
            aaaab9690b0c func1+0x20 (/home/pengdl/demo/kmemleak/leak2)
            aaaab9690c54 main+0x140 (/home/pengdl/demo/kmemleak/leak2)
            ffff9b7b6e10 __libc_start_main+0xe8 (/usr/lib/aarch64-linux-gnu/libc-2.31.so)
            aaaab96908a4 _start+0x34 (/home/pengdl/demo/kmemleak/leak2)
leak2    2058  4141.307048:          1 minor-faults:
            aaaab9690a6c func3+0xf0 (/home/pengdl/demo/kmemleak/leak2)
            aaaab9690ae4 func2+0x20 (/home/pengdl/demo/kmemleak/leak2)
            aaaab9690b0c func1+0x20 (/home/pengdl/demo/kmemleak/leak2)
            aaaab9690c54 main+0x140 (/home/pengdl/demo/kmemleak/leak2)
            ffff9b7b6e10 __libc_start_main+0xe8 (/usr/lib/aarch64-linux-gnu/libc-2.31.so)
            aaaab96908a4 _start+0x34 (/home/pengdl/demo/kmemleak/leak2)
...

使用mem_abort事件

trace_event

内核导出了下面的trace point:

root@arm64:/sys/kernel/debug/tracing/events/exceptions# ls -l
total 0
-rw-r----- 1 root root 0 Aug 23 15:43 enable
-rw-r----- 1 root root 0 Aug 23 15:43 filter
drwxr-xr-x 1 root root 0 Aug 23 15:36 mem_abort_kernel
drwxr-xr-x 1 root root 0 Aug 23 15:36 mem_abort_user

从名字可以看到,当进程在用户态触发了mem abort,那么会触发mem_abort_user事件:

root@arm64:/sys/kernel/debug/tracing/events/exceptions/mem_abort_user# cat format
name: mem_abort_user
ID: 32
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;
        field:unsigned long address;    offset:8;       size:8; signed:0;
        field:unsigned long ip; offset:16;      size:8; signed:0;
        field:unsigned long error_code; offset:24;      size:8; signed:0;
print fmt: "address=%ps ip=%ps error_code=0x%lx", (void *)REC->address, (void *)REC->ip, REC->error_code

可以使用这个事件进行测试:

# cd /sys/kernel/debug/tracing/events/exceptions/mem_abort_user
# echo 'comm ~ "leak2"' > filter
# echo 1 > enable
# echo 1 > /sys/kernel/tracing/tracing_on

可以看到如下日志:

root@arm64:/sys/kernel/debug/tracing/events/exceptions/mem_abort_user# cat /sys/kernel/tracing/trace_pipe
           leak2-2058    [001] .....  5312.321453: mem_abort_user: address=0xffff9a71e010 ip=0xaaaab9690a6c error_code=0x9200000b
           leak2-2058    [001] .....  5313.327391: mem_abort_user: address=0xffff9a71f010 ip=0xaaaab9690a60 error_code=0x9200004b
           leak2-2058    [001] .....  5314.331440: mem_abort_user: address=0xffff9a720010 ip=0xaaaab9690a6c error_code=0x9200000b
           leak2-2058    [001] .....  5315.337957: mem_abort_user: address=0xffff9a721010 ip=0xaaaab9690a60 error_code=0x9200004b

此外,也可以对trace event进行配置,当记录事件的时候把内核栈和用户栈也一并记录下来:

# cd /sys/kernel/debug/tracing/options
# echo 1 > userstacktrace
# echo 1 > stacktrace
# echo 1 > sym-userobj

此时看到的日志如下:

leak2-2058    [001] .....  5537.394169: mem_abort_user: address=0xffff9a7fe010 ip=0xaaaab9690a6c error_code=0x9200000b
           leak2-2058    [001] .....  5537.394650: <stack trace>
 => do_mem_abort
 => el0_da
 => el0t_64_sync_handler
 => el0t_64_sync
           leak2-2058    [001] .....  5537.394672: <user stack trace>
 => /home/pengdl/demo/kmemleak/leak2[+0xa6c]
 => /home/pengdl/demo/kmemleak/leak2[+0xae4]
 => /home/pengdl/demo/kmemleak/leak2[+0xb0c]
 => /home/pengdl/demo/kmemleak/leak2[+0xc54]
 => /usr/lib/aarch64-linux-gnu/libc-2.31.so[+0x20e10]
 => /home/pengdl/demo/kmemleak/leak2[+0x8a4]
           leak2-2058    [001] .....  5538.399253: mem_abort_user: address=0xffff9a7ff010 ip=0xaaaab9690a60 error_code=0x9200004b
           leak2-2058    [001] .....  5538.401479: <stack trace>
 => do_mem_abort
 => el0_da
 => el0t_64_sync_handler
 => el0t_64_sync
           leak2-2058    [001] .....  5538.401545: <user stack trace>
 => /home/pengdl/demo/kmemleak/leak2[+0xa60]
 => /home/pengdl/demo/kmemleak/leak2[+0xae4]
 => /home/pengdl/demo/kmemleak/leak2[+0xb0c]
 => /home/pengdl/demo/kmemleak/leak2[+0xc54]
 => /usr/lib/aarch64-linux-gnu/libc-2.31.so[+0x20e10]
 => /home/pengdl/demo/kmemleak/leak2[+0x8a4]

使用perf

内核导出了mem_abort事件:

root@arm64:~# perf list | grep mem_abort
  exceptions:mem_abort_kernel                        [Tracepoint event]
  exceptions:mem_abort_user                          [Tracepoint event]

然后使用下面的命令记录:

# perf record -e exceptions:mem_abort_user -g -p `pidof leak2`

解析抓到的数据:

root@arm64:~# perf script
leak2    2058 [000]  4855.336563: exceptions:mem_abort_user: address=0xffff9b557010 ip=0xaaaab9690a60 error_code=0x9200004b
        ffff80008002aae0 do_mem_abort+0xc8 ([kernel.kallsyms])
        ffff80008002aae0 do_mem_abort+0xc8 ([kernel.kallsyms])
        ffff800080c73380 el0_da+0x38 ([kernel.kallsyms])
        ffff800080c74504 el0t_64_sync_handler+0xe4 ([kernel.kallsyms])
        ffff80008001150c el0t_64_sync+0x14c ([kernel.kallsyms])
            aaaab9690a60 func3+0xe4 (/home/pengdl/demo/kmemleak/leak2)
            aaaab9690ae4 func2+0x20 (/home/pengdl/demo/kmemleak/leak2)
            aaaab9690b0c func1+0x20 (/home/pengdl/demo/kmemleak/leak2)
            aaaab9690c54 main+0x140 (/home/pengdl/demo/kmemleak/leak2)
            ffff9b7b6e10 __libc_start_main+0xe8 (/usr/lib/aarch64-linux-gnu/libc-2.31.so)
            aaaab96908a4 _start+0x34 (/home/pengdl/demo/kmemleak/leak2)

此外,因为是基于trace point,所以还可以对数据进行筛选和过滤,比如:

root@arm64:~# perf record -e exceptions:mem_abort_user -g --filter 'address <= 0xffff9b600010 && address >= 0xffff9a600010 && common_pid == 2058 && comm ~ "leak2"'


相关文章
|
前端开发 API
Electron 渲染进程之间互相通信 创建窗口时触发
Electron 渲染进程之间互相通信 创建窗口时触发
|
前端开发
Electron 渲染进程与渲染进程之间的实时通信 (实时触发及接收消息)
Electron 渲染进程与渲染进程之间的实时通信 (实时触发及接收消息)
龙芯2K驱动开发——使用中断触发读取GPIO电平值上传给读取进程
龙芯2K驱动开发——使用中断触发读取GPIO电平值上传给读取进程
447 0
龙芯2K驱动开发——使用中断触发读取GPIO电平值上传给读取进程
|
3月前
|
监控 Linux 应用服务中间件
探索Linux中的`ps`命令:进程监控与分析的利器
探索Linux中的`ps`命令:进程监控与分析的利器
|
2月前
|
运维 关系型数据库 MySQL
掌握taskset:优化你的Linux进程,提升系统性能
在多核处理器成为现代计算标准的今天,运维人员和性能调优人员面临着如何有效利用这些处理能力的挑战。优化进程运行的位置不仅可以提高性能,还能更好地管理和分配系统资源。 其中,taskset命令是一个强大的工具,它允许管理员将进程绑定到特定的CPU核心,减少上下文切换的开销,从而提升整体效率。
掌握taskset:优化你的Linux进程,提升系统性能
|
2月前
|
弹性计算 Linux 区块链
Linux系统CPU异常占用(minerd 、tplink等挖矿进程)
Linux系统CPU异常占用(minerd 、tplink等挖矿进程)
66 4
Linux系统CPU异常占用(minerd 、tplink等挖矿进程)
|
1月前
|
算法 Linux 调度
探索进程调度:Linux内核中的完全公平调度器
【8月更文挑战第2天】在操作系统的心脏——内核中,进程调度算法扮演着至关重要的角色。本文将深入探讨Linux内核中的完全公平调度器(Completely Fair Scheduler, CFS),一个旨在提供公平时间分配给所有进程的调度器。我们将通过代码示例,理解CFS如何管理运行队列、选择下一个运行进程以及如何对实时负载进行响应。文章将揭示CFS的设计哲学,并展示其如何在现代多任务计算环境中实现高效的资源分配。
|
2月前
|
存储 缓存 安全
【Linux】冯诺依曼体系结构与操作系统及其进程
【Linux】冯诺依曼体系结构与操作系统及其进程
141 1
|
2月前
|
小程序 Linux
【编程小实验】利用Linux fork()与文件I/O:父进程与子进程协同实现高效cp命令(前半文件与后半文件并行复制)
这个小程序是在文件IO的基础上去结合父子进程的一个使用,利用父子进程相互独立的特点实现对数据不同的操作
|
2月前
|
SQL 自然语言处理 网络协议
【Linux开发实战指南】基于TCP、进程数据结构与SQL数据库:构建在线云词典系统(含注册、登录、查询、历史记录管理功能及源码分享)
TCP(Transmission Control Protocol)连接是互联网上最常用的一种面向连接、可靠的、基于字节流的传输层通信协议。建立TCP连接需要经过著名的“三次握手”过程: 1. SYN(同步序列编号):客户端发送一个SYN包给服务器,并进入SYN_SEND状态,等待服务器确认。 2. SYN-ACK:服务器收到SYN包后,回应一个SYN-ACK(SYN+ACKnowledgment)包,告诉客户端其接收到了请求,并同意建立连接,此时服务器进入SYN_RECV状态。 3. ACK(确认字符):客户端收到服务器的SYN-ACK包后,发送一个ACK包给服务器,确认收到了服务器的确
168 1