Environment: Linux kernel v5.15

Buddy Allocator

Memory Allocation

__alloc_pages()

mm/page_alloc.c

This function contains the core logic of the buddy allocator. It mainly calls the following three functions (a trimmed skeleton showing how they fit together follows the list):

  • prepare_alloc_pages(): fills in struct alloc_context from the allocation request parameters.
  • get_page_from_freelist(): the fast path of physical-memory allocation.
  • __alloc_pages_slowpath(): the slow path of physical-memory allocation.
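A trimmed skeleton of __alloc_pages() (non-essential steps replaced with comments; see mm/page_alloc.c in v5.15 for the full code) shows how the three calls fit together:

struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
                           nodemask_t *nodemask)
{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW;
    gfp_t alloc_gfp;
    struct alloc_context ac = { };

    /* ... sanity checks and gfp-mask adjustments ... */

    /* 1. Fill in the alloc_context from the request parameters */
    if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
                             &alloc_gfp, &alloc_flags))
        return NULL;

    /* 2. Fast path: first allocation attempt */
    page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
    if (likely(page))
        goto out;

    /* ... restore the original gfp mask and nodemask for the slow path ... */

    /* 3. Slow path: may involve reclaim, compaction and the OOM killer */
    page = __alloc_pages_slowpath(alloc_gfp, order, &ac);

out:
    /* ... memcg accounting and tracing ... */
    return page;
}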

prepare_alloc_pages()

mm/page_alloc.c

struct alloc_context is the context that is passed around as a function argument throughout the physical-memory allocation flow; the main task of prepare_alloc_pages() is to fill in this structure from the allocation parameters. Its definition:

struct alloc_context {
    /* If preferred_zoneref cannot satisfy the request, keep trying the zones in this list */
    struct zonelist *zonelist;
    /* Restricts which nodes this allocation may use */
    nodemask_t *nodemask;
    /* Allocate physical memory from this zone first */
    struct zoneref *preferred_zoneref;
    /* Migrate type of this allocation */
    int migratetype;

    /*
     * highest_zoneidx represents highest usable zone index of
     * the allocation request. Due to the nature of the zone,
     * memory on lower zone than the highest_zoneidx will be
     * protected by lowmem_reserve[highest_zoneidx].
     *
     * highest_zoneidx is also used by reclaim/compaction to limit
     * the target zone since higher zone than this index cannot be
     * usable for this allocation request.
     */
    /* Highest allowed zone index; this allocation must not come from a higher zone */
    enum zone_type highest_zoneidx;
    /* Whether to perform dirty-page balancing */
    bool spread_dirty_pages;
};

prepare_alloc_pages() itself:
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
        int preferred_nid, nodemask_t *nodemask,
        struct alloc_context *ac, gfp_t *alloc_gfp,
        unsigned int *alloc_flags)
{
    ac->highest_zoneidx = gfp_zone(gfp_mask);
    ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    ac->nodemask = nodemask;
    ac->migratetype = gfp_migratetype(gfp_mask);

    if (cpusets_enabled()) {
        *alloc_gfp |= __GFP_HARDWALL;
        /*
         * When we are in the interrupt context, it is irrelevant
         * to the current task context. It means that any node ok.
         */
        if (in_task() && !ac->nodemask)
            ac->nodemask = &cpuset_current_mems_allowed;
        else
            *alloc_flags |= ALLOC_CPUSET;
    }

    fs_reclaim_acquire(gfp_mask);
    fs_reclaim_release(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

    if (should_fail_alloc_page(gfp_mask, order))
        return false;

    *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);

    /* Dirty zone balancing only done in the fast path */
    ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

    /*
     * The preferred zone is used for statistics but crucially it is
     * also used as the starting point for the zonelist iterator. It
     * may get reset for allocations that ignore memory policies.
     */
    ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                    ac->highest_zoneidx, ac->nodemask);

    return true;
}

Notes:

  • ac->zonelist is taken from node_zonelists, a field of struct pglist_data, the structure that describes a node. On UMA systems each node has only one zonelist, ZONELIST_FALLBACK, which lists all usable zones; on NUMA systems each node has two, ZONELIST_FALLBACK and ZONELIST_NOFALLBACK, where the former lists the zones of all nodes (local zones first, other nodes as fallback) and the latter lists only the zones of the node itself. This split exists to support __GFP_THISNODE, i.e. allocation requests that must not fall back to other nodes (a trimmed excerpt of the relevant definitions follows this list).
  • The __GFP_WRITE flag means the caller intends to write to the page. If it is set, ac->spread_dirty_pages is set so that dirty pages are balanced across zones, i.e. no single zone accumulates too many dirty pages.
  • If no nodemask is given, first_zones_zonelist() usually returns the first entry of ac->zonelist, which becomes ac->preferred_zoneref.
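The relevant definitions, lightly trimmed from include/linux/mmzone.h:

/* Identifiers of the per-node zonelists */
enum {
    ZONELIST_FALLBACK,  /* zonelist with fallback */
#ifdef CONFIG_NUMA
    /*
     * The NUMA zonelists are doubled because we need zonelists that
     * restrict the allocations to a single node for __GFP_THISNODE.
     */
    ZONELIST_NOFALLBACK,    /* zonelist without fallback (__GFP_THISNODE) */
#endif
    MAX_ZONELISTS
};

struct zonelist {
    struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

typedef struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];
    struct zonelist node_zonelists[MAX_ZONELISTS];
    /* ... */
} pg_data_t;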

get_page_from_freelist()

This function implements the fast path. Starting from ac->preferred_zoneref, it walks ac->zonelist, skipping zones that fail the checks (dirty-page limit, watermarks, etc.), until it finds a suitable zone, on which it then calls rmqueue().
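A greatly simplified sketch of that scan (the iterator starts at z = ac->preferred_zoneref; dirty-limit caching, NUMA statistics, node reclaim and atomic-reserve handling are omitted, variable names follow the real function):

/* Simplified sketch of the zonelist scan in get_page_from_freelist() */
for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, ac->nodemask) {
    struct page *page;
    unsigned long mark;

    /* Skip nodes that already hold too many dirty pages */
    if (ac->spread_dirty_pages && !node_dirty_ok(zone->zone_pgdat))
        continue;

    /* Skip zones whose free pages are below the required watermark */
    mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
    if (!zone_watermark_fast(zone, order, mark, ac->highest_zoneidx,
                             alloc_flags, gfp_mask))
        continue;

    /* Found a usable zone: actually take pages off its free lists */
    page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                   gfp_mask, alloc_flags, ac->migratetype);
    if (page)
        return page;
}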

rmqueue()

/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 */
static inline
struct page *rmqueue(struct zone *preferred_zone,
            struct zone *zone, unsigned int order,
            gfp_t gfp_flags, unsigned int alloc_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;

    if (likely(pcp_allowed_order(order))) {
        /*
         * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
         * we need to skip it when CMA area isn't allowed.
         */
        if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
                migratetype != MIGRATE_MOVABLE) {
            page = rmqueue_pcplist(preferred_zone, zone, order,
                    gfp_flags, migratetype, alloc_flags);
            goto out;
        }
    }

    /*
     * We most definitely don't want callers attempting to
     * allocate greater than order-1 page units with __GFP_NOFAIL.
     */
    WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
    spin_lock_irqsave(&zone->lock, flags);

    do {
        page = NULL;
        /*
         * order-0 request can reach here when the pcplist is skipped
         * due to non-CMA allocation context. HIGHATOMIC area is
         * reserved for high-order atomic allocation, so order-0
         * request should skip it.
         */
        if (order > 0 && alloc_flags & ALLOC_HARDER) {
            page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
            if (page)
                trace_mm_page_alloc_zone_locked(page, order, migratetype);
        }
        if (!page)
            page = __rmqueue(zone, order, migratetype, alloc_flags);
    } while (page && check_new_pages(page, order));
    if (!page)
        goto failed;

    __mod_zone_freepage_state(zone, -(1 << order),
                  get_pcppage_migratetype(page));
    spin_unlock_irqrestore(&zone->lock, flags);

    __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
    zone_statistics(preferred_zone, zone, 1);

out:
    /* Separate test+clear to avoid unnecessary atomics */
    if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
        clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
        wakeup_kswapd(zone, 0, 0, zone_idx(zone));
    }

    VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
    return page;

failed:
    spin_unlock_irqrestore(&zone->lock, flags);
    return NULL;
}

For small-order requests (order not larger than PAGE_ALLOC_COSTLY_ORDER, which defaults to 3), the function calls rmqueue_pcplist() to allocate from the per-CPU page lists; otherwise it takes zone->lock and allocates via __rmqueue_smallest() (only for higher-order ALLOC_HARDER requests, from the MIGRATE_HIGHATOMIC reserve) and then __rmqueue().
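That threshold comes from pcp_allowed_order(); in v5.15 it reads roughly:

static inline bool pcp_allowed_order(unsigned int order)
{
    /* PAGE_ALLOC_COSTLY_ORDER is 3 by default */
    if (order <= PAGE_ALLOC_COSTLY_ORDER)
        return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    /* THP-sized allocations may also go through the pcplists */
    if (order == pageblock_order)
        return true;
#endif
    return false;
}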

rmqueue_pcplist()

This function allocates pages from the per-CPU page lists (pcplists). The pcplists are balanced dynamically: when they grow too large, pages are returned to the zone; when they run low, pages are refilled from the zone.
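A trimmed sketch of the refill side, based on __rmqueue_pcplist() (which rmqueue_pcplist() calls with the pcp lock held); the draining side lives in free_unref_page()/free_pcppages_bulk():

static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
            int migratetype, unsigned int alloc_flags,
            struct per_cpu_pages *pcp, struct list_head *list)
{
    struct page *page;

    do {
        if (list_empty(list)) {
            /* pcplist is empty: grab a batch of pages from the zone */
            int batch = READ_ONCE(pcp->batch);
            int alloced;

            if (batch > 1)
                batch = max(batch >> order, 2);
            alloced = rmqueue_bulk(zone, order, batch, list,
                                   migratetype, alloc_flags);

            pcp->count += alloced << order;
            if (unlikely(list_empty(list)))
                return NULL;
        }

        /* Pop one page off the per-CPU list */
        page = list_first_entry(list, struct page, lru);
        list_del(&page->lru);
        pcp->count -= 1 << order;
    } while (check_new_pcp(page));

    return page;
}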

__rmqueue()

This function calls __rmqueue_smallest() to do the actual allocation. If that fails, it uses __rmqueue_fallback() to steal pages from the free lists of other migrate types in zone->free_area (each migrate type has its own free list) and then retries.
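Its structure in v5.15 is roughly the following (the heuristic that prefers CMA for movable allocations is abridged):

static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
                        unsigned int alloc_flags)
{
    struct page *page;

    /* ... optionally allocate movable pages from the CMA area first ... */
retry:
    page = __rmqueue_smallest(zone, order, migratetype);
    if (unlikely(!page)) {
        if (alloc_flags & ALLOC_CMA)
            page = __rmqueue_cma_fallback(zone, order);

        /* Steal a block from another migratetype, then try again */
        if (!page && __rmqueue_fallback(zone, order, migratetype,
                        alloc_flags))
            goto retry;
    }
    if (page)
        trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
}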

__rmqueue_smallest()

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area *area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        page = get_page_from_free_area(area, migratetype);
        if (!page)
            continue;
        del_page_from_free_list(page, zone, current_order);
        expand(zone, page, order, current_order, migratetype);
        set_pcppage_migratetype(page, migratetype);
        return page;
    }

    return NULL;
}

Starting from the smallest order that could satisfy the request, this function walks up zone->free_area until it finds a free block that is large enough. If the block found is larger than requested, expand() splits off the surplus into smaller blocks and inserts them into the free lists of the corresponding orders.
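For reference, expand() in v5.15 (comments shortened): each iteration halves the block and returns the upper half to the free list of the next lower order.

static inline void expand(struct zone *zone, struct page *page,
    int low, int high, int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        high--;
        size >>= 1;
        VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

        /* Debugging aid: may turn the split-off half into guard pages */
        if (set_page_guard(zone, &page[size], high, migratetype))
            continue;

        /* Put the upper half back as a free block of order 'high' */
        add_to_free_list(&page[size], zone, high, migratetype);
        set_buddy_order(&page[size], high);
    }
}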

__alloc_pages_slowpath()

In the slow path, the kernel tries to obtain more free memory by waking kswapd for asynchronous reclaim, doing direct reclaim (reclaiming memory that is already allocated), running compaction (to reduce fragmentation), and, as a last resort, invoking the OOM killer, while repeatedly retrying get_page_from_freelist().
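The outline below is a heavily abridged, illustrative summary of that flow; __alloc_pages_slowpath_outline is a made-up name, and the real function wraps these steps in retry accounting, costly-order heuristics and memory-policy resets:

/* Illustrative outline only; not the actual __alloc_pages_slowpath() body */
static struct page *
__alloc_pages_slowpath_outline(gfp_t gfp_mask, unsigned int order,
                               struct alloc_context *ac)
{
    unsigned int alloc_flags;
    struct page *page;

    /* Relax the watermark requirements compared with the fast path */
    alloc_flags = gfp_to_alloc_flags(gfp_mask);

    /* Wake kswapd so background reclaim runs in parallel */
    if (alloc_flags & ALLOC_KSWAPD)
        wake_all_kswapds(order, gfp_mask, ac);

    /* Retry the freelist scan with the relaxed flags */
    page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    if (page)
        return page;

    /*
     * Then, in a retry loop (details omitted):
     *   - __alloc_pages_direct_compact(): defragment memory and retry
     *   - __alloc_pages_direct_reclaim(): synchronously reclaim and retry
     *   - __alloc_pages_may_oom():        as a last resort, kill a process
     */
    return NULL;
}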

Memory Freeing

__free_pages()

The entry point for freeing physical pages. In the common case it decrements the page's reference count, and if the count drops to zero it calls free_the_page().
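For reference, the v5.15 implementation (annotations added):

void __free_pages(struct page *page, unsigned int order)
{
    /* Drop our reference; if it was the last one, free the whole block */
    if (put_page_testzero(page))
        free_the_page(page, order);
    else if (!PageHead(page))
        /*
         * Someone else still holds a speculative reference to the
         * first page: free the remaining sub-blocks ourselves, the
         * first page is freed when that reference is dropped.
         */
        while (order-- > 0)
            free_the_page(page + (1 << order), order);
}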

free_the_page()

Pages whose order is eligible for the per-CPU pcplists are freed via free_unref_page(); all other pages are freed via __free_pages_ok().
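The v5.15 implementation, where the decision is purely order-based:

static inline void free_the_page(struct page *page, unsigned int order)
{
    if (pcp_allowed_order(order))       /* Via pcp */
        free_unref_page(page, order);
    else
        __free_pages_ok(page, order, FPI_NONE);
}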

__free_pages_ok()

After validation in free_pages_prepare(), it calls __free_one_page().

__free_one_page()

The main logic: insert the page being freed into the free list of its zone, attempting to merge it with its buddy blocks along the way.
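A greatly simplified sketch of the merging loop (guard pages, memory isolation and the FPI flags are ignored; helper names follow mm/page_alloc.c):

/* Simplified sketch of the buddy-merging loop in __free_one_page() */
while (order < MAX_ORDER - 1) {
    unsigned long buddy_pfn = __find_buddy_pfn(pfn, order);
    unsigned long combined_pfn;
    struct page *buddy = page + (buddy_pfn - pfn);

    /* Stop if the buddy block is not a free block of the same order */
    if (!page_is_buddy(page, buddy, order))
        break;

    /* Remove the buddy from its free list and merge it with this block */
    del_page_from_free_list(buddy, zone, order);
    combined_pfn = buddy_pfn & pfn;
    page = page + (combined_pfn - pfn);
    pfn = combined_pfn;
    order++;
}

/* Insert the (possibly merged) block at its final order */
set_buddy_order(page, order);
add_to_free_list(page, zone, order, migratetype);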

Function Call Graph

