Environment: Linux kernel v5.15

Buddy Allocator

Memory Allocation

__alloc_pages()

mm/page_alloc.c

This function contains the core logic of the buddy allocator. It mainly calls the following three functions (a trimmed skeleton showing how they fit together follows the list):

  • prepare_alloc_pages(): fills in struct alloc_context from the allocation request parameters.
  • get_page_from_freelist(): the fast path of physical-memory allocation.
  • __alloc_pages_slowpath(): the slow path of physical-memory allocation.
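A trimmed skeleton of __alloc_pages() (non-essential steps replaced with comments; see mm/page_alloc.c in v5.15 for the full code) shows how the three calls fit together:

struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
                           nodemask_t *nodemask)
{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW;
    gfp_t alloc_gfp;
    struct alloc_context ac = { };

    /* ... sanity checks and gfp-mask adjustments ... */

    /* 1. Fill in the alloc_context from the request parameters */
    if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
                             &alloc_gfp, &alloc_flags))
        return NULL;

    /* 2. Fast path: first allocation attempt */
    page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
    if (likely(page))
        goto out;

    /* ... restore the original gfp mask and nodemask for the slow path ... */

    /* 3. Slow path: may involve reclaim, compaction and the OOM killer */
    page = __alloc_pages_slowpath(alloc_gfp, order, &ac);

out:
    /* ... memcg accounting and tracing ... */
    return page;
}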

prepare_alloc_pages()

mm/page_alloc.c

struct alloc_context is the context that is passed around as a function argument throughout the physical-memory allocation flow; the main task of prepare_alloc_pages() is to fill in this structure from the allocation parameters. Its definition:

struct alloc_context {
    /* If preferred_zoneref cannot satisfy the request, keep trying the zones in this list */
    struct zonelist *zonelist;
    /* Restricts which nodes this allocation may use */
    nodemask_t *nodemask;
    /* Allocate physical memory from this zone first */
    struct zoneref *preferred_zoneref;
    /* Migrate type of this allocation */
    int migratetype;

    /*
     * highest_zoneidx represents highest usable zone index of
     * the allocation request. Due to the nature of the zone,
     * memory on lower zone than the highest_zoneidx will be
     * protected by lowmem_reserve[highest_zoneidx].
     *
     * highest_zoneidx is also used by reclaim/compaction to limit
     * the target zone since higher zone than this index cannot be
     * usable for this allocation request.
     */
    /* Highest allowed zone index; this allocation must not come from a higher zone */
    enum zone_type highest_zoneidx;
    /* Whether to perform dirty-page balancing */
    bool spread_dirty_pages;
};

prepare_alloc_pages() itself:
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
        int preferred_nid, nodemask_t *nodemask,
        struct alloc_context *ac, gfp_t *alloc_gfp,
        unsigned int *alloc_flags)
{
    ac->highest_zoneidx = gfp_zone(gfp_mask);
    ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    ac->nodemask = nodemask;
    ac->migratetype = gfp_migratetype(gfp_mask);

    if (cpusets_enabled()) {
        *alloc_gfp |= __GFP_HARDWALL;
        /*
         * When we are in the interrupt context, it is irrelevant
         * to the current task context. It means that any node ok.
         */
        if (in_task() && !ac->nodemask)
            ac->nodemask = &cpuset_current_mems_allowed;
        else
            *alloc_flags |= ALLOC_CPUSET;
    }

    fs_reclaim_acquire(gfp_mask);
    fs_reclaim_release(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

    if (should_fail_alloc_page(gfp_mask, order))
        return false;

    *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);

    /* Dirty zone balancing only done in the fast path */
    ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

    /*
     * The preferred zone is used for statistics but crucially it is
     * also used as the starting point for the zonelist iterator. It
     * may get reset for allocations that ignore memory policies.
     */
    ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                    ac->highest_zoneidx, ac->nodemask);

    return true;
}

Notes:

  • ac->zonelist is taken from node_zonelists, a field of struct pglist_data, the structure that describes a node. On UMA systems each node has only one zonelist, ZONELIST_FALLBACK, which lists all usable zones; on NUMA systems each node has two, ZONELIST_FALLBACK and ZONELIST_NOFALLBACK, where the former lists the zones of all nodes (local zones first, other nodes as fallback) and the latter lists only the zones of the node itself. This split exists to support __GFP_THISNODE, i.e. allocation requests that must not fall back to other nodes (a trimmed excerpt of the relevant definitions follows this list).
  • The __GFP_WRITE flag means the caller intends to write to the page. If it is set, ac->spread_dirty_pages is set so that dirty pages are balanced across zones, i.e. no single zone accumulates too many dirty pages.
  • If no nodemask is given, first_zones_zonelist() usually returns the first entry of ac->zonelist, which becomes ac->preferred_zoneref.
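The relevant definitions, lightly trimmed from include/linux/mmzone.h:

/* Identifiers of the per-node zonelists */
enum {
    ZONELIST_FALLBACK,  /* zonelist with fallback */
#ifdef CONFIG_NUMA
    /*
     * The NUMA zonelists are doubled because we need zonelists that
     * restrict the allocations to a single node for __GFP_THISNODE.
     */
    ZONELIST_NOFALLBACK,    /* zonelist without fallback (__GFP_THISNODE) */
#endif
    MAX_ZONELISTS
};

struct zonelist {
    struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

typedef struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];
    struct zonelist node_zonelists[MAX_ZONELISTS];
    /* ... */
} pg_data_t;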

get_page_from_freelist()

This function implements the fast path. Starting from ac->preferred_zoneref, it walks ac->zonelist, skipping zones that fail the checks (dirty-page limit, watermarks, etc.), until it finds a suitable zone, on which it then calls rmqueue().
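A greatly simplified sketch of that scan (the iterator starts at z = ac->preferred_zoneref; dirty-limit caching, NUMA statistics, node reclaim and atomic-reserve handling are omitted, variable names follow the real function):

/* Simplified sketch of the zonelist scan in get_page_from_freelist() */
for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, ac->nodemask) {
    struct page *page;
    unsigned long mark;

    /* Skip nodes that already hold too many dirty pages */
    if (ac->spread_dirty_pages && !node_dirty_ok(zone->zone_pgdat))
        continue;

    /* Skip zones whose free pages are below the required watermark */
    mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
    if (!zone_watermark_fast(zone, order, mark, ac->highest_zoneidx,
                             alloc_flags, gfp_mask))
        continue;

    /* Found a usable zone: actually take pages off its free lists */
    page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                   gfp_mask, alloc_flags, ac->migratetype);
    if (page)
        return page;
}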

rmqueue()

/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 */
static inline
struct page *rmqueue(struct zone *preferred_zone,
            struct zone *zone, unsigned int order,
            gfp_t gfp_flags, unsigned int alloc_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;

    if (likely(pcp_allowed_order(order))) {
        /*
         * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
         * we need to skip it when CMA area isn't allowed.
         */
        if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
                migratetype != MIGRATE_MOVABLE) {
            page = rmqueue_pcplist(preferred_zone, zone, order,
                    gfp_flags, migratetype, alloc_flags);
            goto out;
        }
    }

    /*
     * We most definitely don't want callers attempting to
     * allocate greater than order-1 page units with __GFP_NOFAIL.
     */
    WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
    spin_lock_irqsave(&zone->lock, flags);

    do {
        page = NULL;
        /*
         * order-0 request can reach here when the pcplist is skipped
         * due to non-CMA allocation context. HIGHATOMIC area is
         * reserved for high-order atomic allocation, so order-0
         * request should skip it.
         */
        if (order > 0 && alloc_flags & ALLOC_HARDER) {
            page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
            if (page)
                trace_mm_page_alloc_zone_locked(page, order, migratetype);
        }
        if (!page)
            page = __rmqueue(zone, order, migratetype, alloc_flags);
    } while (page && check_new_pages(page, order));
    if (!page)
        goto failed;

    __mod_zone_freepage_state(zone, -(1 << order),
                  get_pcppage_migratetype(page));
    spin_unlock_irqrestore(&zone->lock, flags);

    __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
    zone_statistics(preferred_zone, zone, 1);

out:
    /* Separate test+clear to avoid unnecessary atomics */
    if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
        clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
        wakeup_kswapd(zone, 0, 0, zone_idx(zone));
    }

    VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
    return page;

failed:
    spin_unlock_irqrestore(&zone->lock, flags);
    return NULL;
}

For small-order requests (order not larger than PAGE_ALLOC_COSTLY_ORDER, which defaults to 3), the function calls rmqueue_pcplist() to allocate from the per-CPU page lists; otherwise it takes zone->lock and allocates via __rmqueue_smallest() (only for higher-order ALLOC_HARDER requests, from the MIGRATE_HIGHATOMIC reserve) and then __rmqueue().
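That threshold comes from pcp_allowed_order(); in v5.15 it reads roughly:

static inline bool pcp_allowed_order(unsigned int order)
{
    /* PAGE_ALLOC_COSTLY_ORDER is 3 by default */
    if (order <= PAGE_ALLOC_COSTLY_ORDER)
        return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    /* THP-sized allocations may also go through the pcplists */
    if (order == pageblock_order)
        return true;
#endif
    return false;
}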

rmqueue_pcplist()

This function allocates pages from the per-CPU page lists (pcplists). The pcplists are balanced dynamically: when they grow too large, pages are returned to the zone; when they run low, pages are refilled from the zone.
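A trimmed sketch of the refill side, based on __rmqueue_pcplist() (which rmqueue_pcplist() calls with the pcp lock held); the draining side lives in free_unref_page()/free_pcppages_bulk():

static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
            int migratetype, unsigned int alloc_flags,
            struct per_cpu_pages *pcp, struct list_head *list)
{
    struct page *page;

    do {
        if (list_empty(list)) {
            /* pcplist is empty: grab a batch of pages from the zone */
            int batch = READ_ONCE(pcp->batch);
            int alloced;

            if (batch > 1)
                batch = max(batch >> order, 2);
            alloced = rmqueue_bulk(zone, order, batch, list,
                                   migratetype, alloc_flags);

            pcp->count += alloced << order;
            if (unlikely(list_empty(list)))
                return NULL;
        }

        /* Pop one page off the per-CPU list */
        page = list_first_entry(list, struct page, lru);
        list_del(&page->lru);
        pcp->count -= 1 << order;
    } while (check_new_pcp(page));

    return page;
}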

__rmqueue()

This function calls __rmqueue_smallest() to do the actual allocation. If that fails, it uses __rmqueue_fallback() to steal pages from the free lists of other migrate types in zone->free_area (each migrate type has its own free list) and then retries.
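Its structure in v5.15 is roughly the following (the heuristic that prefers CMA for movable allocations is abridged):

static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
                        unsigned int alloc_flags)
{
    struct page *page;

    /* ... optionally allocate movable pages from the CMA area first ... */
retry:
    page = __rmqueue_smallest(zone, order, migratetype);
    if (unlikely(!page)) {
        if (alloc_flags & ALLOC_CMA)
            page = __rmqueue_cma_fallback(zone, order);

        /* Steal a block from another migratetype, then try again */
        if (!page && __rmqueue_fallback(zone, order, migratetype,
                        alloc_flags))
            goto retry;
    }
    if (page)
        trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
}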

__rmqueue_smallest()

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area *area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        page = get_page_from_free_area(area, migratetype);
        if (!page)
            continue;
        del_page_from_free_list(page, zone, current_order);
        expand(zone, page, order, current_order, migratetype);
        set_pcppage_migratetype(page, migratetype);
        return page;
    }

    return NULL;
}

Starting from the smallest order that could satisfy the request, this function walks up zone->free_area until it finds a free block that is large enough. If the block found is larger than requested, expand() splits off the surplus into smaller blocks and inserts them into the free lists of the corresponding orders.
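For reference, expand() in v5.15 (comments shortened): each iteration halves the block and returns the upper half to the free list of the next lower order.

static inline void expand(struct zone *zone, struct page *page,
    int low, int high, int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        high--;
        size >>= 1;
        VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

        /* Debugging aid: may turn the split-off half into guard pages */
        if (set_page_guard(zone, &page[size], high, migratetype))
            continue;

        /* Put the upper half back as a free block of order 'high' */
        add_to_free_list(&page[size], zone, high, migratetype);
        set_buddy_order(&page[size], high);
    }
}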

__alloc_pages_slowpath()

In the slow path, the kernel tries to obtain more free memory by waking kswapd for asynchronous reclaim, doing direct reclaim (reclaiming memory that is already allocated), running compaction (to reduce fragmentation), and, as a last resort, invoking the OOM killer, while repeatedly retrying get_page_from_freelist().
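The outline below is a heavily abridged, illustrative summary of that flow; __alloc_pages_slowpath_outline is a made-up name, and the real function wraps these steps in retry accounting, costly-order heuristics and memory-policy resets:

/* Illustrative outline only; not the actual __alloc_pages_slowpath() body */
static struct page *
__alloc_pages_slowpath_outline(gfp_t gfp_mask, unsigned int order,
                               struct alloc_context *ac)
{
    unsigned int alloc_flags;
    struct page *page;

    /* Relax the watermark requirements compared with the fast path */
    alloc_flags = gfp_to_alloc_flags(gfp_mask);

    /* Wake kswapd so background reclaim runs in parallel */
    if (alloc_flags & ALLOC_KSWAPD)
        wake_all_kswapds(order, gfp_mask, ac);

    /* Retry the freelist scan with the relaxed flags */
    page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    if (page)
        return page;

    /*
     * Then, in a retry loop (details omitted):
     *   - __alloc_pages_direct_compact(): defragment memory and retry
     *   - __alloc_pages_direct_reclaim(): synchronously reclaim and retry
     *   - __alloc_pages_may_oom():        as a last resort, kill a process
     */
    return NULL;
}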

Memory Freeing

__free_pages()

The entry point for freeing physical pages. In the common case it decrements the page's reference count, and if the count drops to zero it calls free_the_page().
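For reference, the v5.15 implementation (annotations added):

void __free_pages(struct page *page, unsigned int order)
{
    /* Drop our reference; if it was the last one, free the whole block */
    if (put_page_testzero(page))
        free_the_page(page, order);
    else if (!PageHead(page))
        /*
         * Someone else still holds a speculative reference to the
         * first page: free the remaining sub-blocks ourselves, the
         * first page is freed when that reference is dropped.
         */
        while (order-- > 0)
            free_the_page(page + (1 << order), order);
}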

free_the_page()

Pages whose order is eligible for the per-CPU pcplists are freed via free_unref_page(); all other pages are freed via __free_pages_ok().
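The v5.15 implementation, where the decision is purely order-based:

static inline void free_the_page(struct page *page, unsigned int order)
{
    if (pcp_allowed_order(order))       /* Via pcp */
        free_unref_page(page, order);
    else
        __free_pages_ok(page, order, FPI_NONE);
}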

__free_pages_ok()

After validation in free_pages_prepare(), it calls __free_one_page().

__free_one_page()

The main logic: insert the page being freed into the free list of its zone, attempting to merge it with its buddy blocks along the way.
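A greatly simplified sketch of the merging loop (guard pages, memory isolation and the FPI flags are ignored; helper names follow mm/page_alloc.c):

/* Simplified sketch of the buddy-merging loop in __free_one_page() */
while (order < MAX_ORDER - 1) {
    unsigned long buddy_pfn = __find_buddy_pfn(pfn, order);
    unsigned long combined_pfn;
    struct page *buddy = page + (buddy_pfn - pfn);

    /* Stop if the buddy block is not a free block of the same order */
    if (!page_is_buddy(page, buddy, order))
        break;

    /* Remove the buddy from its free list and merge it with this block */
    del_page_from_free_list(buddy, zone, order);
    combined_pfn = buddy_pfn & pfn;
    page = page + (combined_pfn - pfn);
    pfn = combined_pfn;
    order++;
}

/* Insert the (possibly merged) block at its final order */
set_buddy_order(page, order);
add_to_free_list(page, zone, order, migratetype);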

Function Call Graph

