Environment: Linux kernel v5.15

Memory models

__pfn_to_page(pfn) and __page_to_pfn(pg) convert between a pfn (page frame number) and the struct page * that describes the corresponding page frame.

The earliest flat memory model assumes that all physical pages are contiguous, so a single struct page array (reachable through the global pointer mem_map) can describe all of physical memory. Converting between a pfn and a struct page * is then plain pointer arithmetic.

If physical memory has holes, the flat memory model wastes a lot of space, because the holes must still be represented in mem_map. Hence the discontiguous memory model: each NUMA node gets its own struct page array (recorded in the node's struct pglist_data). To convert between pfn and struct page *, we now need a way to determine which NUMA node a pfn belongs to.

However, "whether physical memory is contiguous" and "whether the machine is NUMA" are not causally related: nothing guarantees that the physical memory under one NUMA node is contiguous. The discontiguous memory model nevertheless piggybacks on the per-node struct pglist_data to describe the memory layout, assuming contiguity within each node. The sparse memory model was introduced to decouple the two, and it has essentially replaced the discontiguous memory model.

The sparse memory model describes memory with a new structure: a two-level array of struct mem_section (the global struct mem_section **mem_section). Each struct mem_section carries a pointer to a struct page array, section_mem_map (logically, at least; the stored value is encoded, see sparse_init_one_section()). The benefit of making mem_section a two-dimensional array is obvious: for holes in the physical address space, the first-level pointer can simply be left NULL, shrinking the footprint of mem_section itself.
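
For reference, the two-level lookup (with CONFIG_SPARSEMEM_EXTREME, the dynamically allocated variant) looks like this in include/linux/mmzone.h, lightly trimmed:

/* include/linux/mmzone.h (v5.15, trimmed) */
#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT	(PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT	1
#endif

#define SECTION_NR_TO_ROOT(sec)	((sec) / SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK	(SECTIONS_PER_ROOT - 1)

static inline struct mem_section *__nr_to_section(unsigned long nr)
{
	unsigned long root = SECTION_NR_TO_ROOT(nr);

	if (unlikely(root >= NR_SECTION_ROOTS))
		return NULL;

#ifdef CONFIG_SPARSEMEM_EXTREME
	if (!mem_section || !mem_section[root])
		return NULL;
#endif
	return &mem_section[root][nr & SECTION_ROOT_MASK];
}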

One mem_section covers 128MB of physical memory on x86-64 (set by SECTION_SIZE_BITS = 27), and PFN_SECTION_SHIFT is defined as SECTION_SIZE_BITS - PAGE_SHIFT:

#define SECTION_SIZE_BITS  27 /* matt - 128 is convenient right now */

#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)

The low PFN_SECTION_SHIFT bits of a pfn are thus the offset within a single mem_section, while pfn >> PFN_SECTION_SHIFT is the section number, which the macro __pfn_to_section turns into the struct mem_section * the pfn belongs to. With that, __pfn_to_page(pfn) and __page_to_pfn(pg) can be written as follows:

/* include/asm-generic/memory_model.h */

#define __page_to_pfn(pg)					\
({	const struct page *__pg = (pg);				\
	int __sec = page_to_section(__pg);			\
	(unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec)));	\
})

#define __pfn_to_page(pfn)					\
({	unsigned long __pfn = (pfn);				\
	struct mem_section *__sec = __pfn_to_section(__pfn);	\
	__section_mem_map_addr(__sec) + __pfn;			\
})
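
As a quick sanity check on the arithmetic, here is a tiny userspace sketch (assuming x86-64's PAGE_SHIFT = 12; this only models the macros above, it is not kernel code):

#include <stdio.h>

#define PAGE_SHIFT		12	/* 4 KB pages, as on x86-64 */
#define SECTION_SIZE_BITS	27	/* 128 MB sections */
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)	/* = 15 */
#define PAGES_PER_SECTION	(1UL << PFN_SECTION_SHIFT)		/* = 32768 */

int main(void)
{
	unsigned long pfn = 0x123456;

	/* High bits select the mem_section, low bits index into its mem_map. */
	printf("section nr = %lu, offset = %lu\n",
	       pfn >> PFN_SECTION_SHIFT,
	       pfn & (PAGES_PER_SECTION - 1));
	return 0;
}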

SPARSEMEM_VMEMMAP, an improvement over plain SPARSEMEM, lets us convert between pfn and struct page * with simple offset arithmetic again:

/* include/asm-generic/memory_model.h, with CONFIG_SPARSEMEM_VMEMMAP */
#define __pfn_to_page(pfn)	(vmemmap + (pfn))
#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)
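
This works because the kernel maps the mem_maps of all present sections into one virtually contiguous struct page array at a fixed virtual base. On x86-64, for example, vmemmap is just a constant pointer:

/* arch/x86/include/asm/pgtable_64.h */
#define vmemmap ((struct page *)VMEMMAP_START)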

Sparse memory model initialization

Call graph:

  • sparse_init()

    • memblocks_present()
      • memory_present()
    • sparse_init_nid()
      • sparse_init_one_section()

sparse_init()

/* mm/sparse.c */

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long pnum_end, pnum_begin, map_count = 1;
	int nid_begin;

	memblocks_present();

	pnum_begin = first_present_section_nr();
	nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid == nid_begin) {
			map_count++;
			continue;
		}
		/* Init node with sections in range [pnum_begin, pnum_end) */
		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
		nid_begin = nid;
		pnum_begin = pnum_end;
		map_count = 1;
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}

sparse_init() is the entry point of sparse memory model initialization.

sparse_init() first calls memblocks_present() to allocate space for the global struct mem_section **mem_section. It then calls sparse_init_nid() once per node; that function initializes the mem_sections numbered [pnum_begin, pnum_end) (the range may contain holes), all of which belong to the same node.
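
For reference, the iterator used by sparse_init() is defined in mm/sparse.c and is essentially a walk over the numbers of sections marked present:

/* mm/sparse.c */
#define for_each_present_section_nr(start, section_nr)		\
	for (section_nr = next_present_section_nr(start - 1);	\
	     ((section_nr != -1) &&				\
	      (section_nr <= __highest_present_section_nr));	\
	     section_nr = next_present_section_nr(section_nr))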

memblocks_present() and memory_present()

/* mm/sparse.c */

/*
 * Mark all memblocks as present using memory_present().
 * This is a convenience function that is useful to mark all of the systems
 * memory as present during initialization.
 */
static void __init memblocks_present(void)
{
	unsigned long start, end;
	int i, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
		memory_present(nid, start, end);
}

/* Record a memory area against a node. */
static void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc(size, align);
		if (!mem_section)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, size, align);
	}
#endif

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (!ms->section_mem_map) {
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_IS_ONLINE;
			__section_mark_present(ms, section);
		}
	}
}

memblocks_present() calls memory_present() on every memblock region to allocate space for mem_section.

On its first invocation, memory_present() allocates the first-level struct mem_section * array; on every invocation it allocates the second-level struct mem_section entries on demand (in sparse_index_init()).

memory_present() also stashes the owning node id in struct mem_section's section_mem_map for sparse_init() to use later. The nid is only needed during early init, so reusing section_mem_map avoids a dedicated field and saves space.
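
The nid lives in the bits of section_mem_map above the flag bits; the helpers in mm/sparse.c are essentially:

/* mm/sparse.c */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return ((unsigned long)nid << SECTION_NID_SHIFT);
}

static inline int sparse_early_nid(struct mem_section *section)
{
	return (section->section_mem_map >> SECTION_NID_SHIFT);
}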

sparse_init_nid()

/* mm/sparse.c */

/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * And number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	struct mem_section_usage *usage;
	unsigned long pnum;
	struct page *map;

	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
			mem_section_usage_size() * map_count);
	if (!usage) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		if (pnum >= pnum_end)
			break;

		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
				nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
			       __func__, nid);
			pnum_begin = pnum;
			sparse_buffer_fini();
			goto failed;
		}
		check_usemap_section_nr(nid, usage);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
				SECTION_IS_EARLY);
		usage = (void *) usage + mem_section_usage_size();
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

The struct mem_sections themselves were already allocated in memory_present(); sparse_init_nid()'s job is to initialize each struct mem_section.

sparse_early_usemaps_alloc_pgdat_section() allocates space for struct mem_section::usage.
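
usage holds per-section metadata: the pageblock migratetype bitmap consumed by the buddy allocator, plus (under SPARSEMEM_VMEMMAP) a bitmap tracking which subsections are present:

/* include/linux/mmzone.h */
struct mem_section_usage {
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
	/* See declaration of similar field in struct zone */
	unsigned long pageblock_flags[0];
};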

__populate_section_memmap() allocates space for struct mem_section::section_mem_map, which is where the struct page array is actually stored.

sparse_init_one_section()

/* mm/sparse.c */

static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		struct mem_section_usage *usage, unsigned long flags)
{
	ms->section_mem_map &= ~SECTION_MAP_MASK;
	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
		| SECTION_HAS_MEM_MAP | flags;
	ms->usage = usage;
}

/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
}

sparse_init_one_section() initializes a single struct mem_section.

ms->section_mem_map does not store the raw mem_map pointer, but the raw value minus this mem_section's offset within the physical address space (measured in pfns). The point is that subtracting ms->section_mem_map from a struct page * yields the page's pfn within the whole physical address space rather than within this mem_section, which simplifies __page_to_pfn() and __pfn_to_page(). The low bits of ms->section_mem_map are additionally used to store flag bits.
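
The decode side simply masks off the flag bits; for reference:

/* include/linux/mmzone.h */
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
	unsigned long map = section->section_mem_map;
	map &= SECTION_MAP_MASK;
	return (struct page *)map;
}

With the encoding above, __section_mem_map_addr(__pfn_to_section(pfn)) + pfn lands directly on the right struct page, which is exactly what __pfn_to_page() computes.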

Physical memory management

level   struct               desc
node    struct pglist_data   Under NUMA, physical memory is split into nodes; under UMA there is a single node, the global struct pglist_data contig_page_data.
zone    struct zone          The memory within each node is split into zones reflecting different uses; the whole system has only a handful of struct zones.
page    struct page          Describes one physical page (page frame). Every page frame gets a struct page, so every byte of it counts.

The Node-Zone-Page article in [5] points out the relationship between nodes and zones. Rather than a top-down containment hierarchy, they are better seen as two different views of physical memory: nodes view it from the angle of NUMA memory affinity, zones from the angle of usage, as in the figure:

Memory

0    16M                     3G       4G                     6G
[DMA |       ZONE_DMA32              |     ZONE_NORMAL       ]
|<--------- Node0 --------->|<----------- Node1 ------------>|

Not every node has to contain every type of zone.

node

Each NUMA node corresponds to one struct pglist_data. Some important members:

name                 type                              desc
node_zones           struct zone[MAX_NR_ZONES]         The zones of this node.
node_zonelists       struct zonelist[MAX_ZONELISTS]    Zone fallback lists for cross-node allocation.
nr_zones             int                               Number of populated zones in this node.
node_start_pfn       unsigned long                     Starting pfn of this node.
node_present_pages   unsigned long                     Pages physically present in this node (holes excluded).
node_spanned_pages   unsigned long                     Pages spanned by this node's range (holes included).

zone

The physical memory of each NUMA node is divided into zones. Common zone types:

  • ZONE_DMA: some devices have few address lines (e.g. 24 bits) and can only DMA to low addresses (e.g. below 16MB); ZONE_DMA keeps those low pages from being handed out for other purposes, which would leave such devices unable to DMA.
  • ZONE_DMA32: on 64-bit systems, separates DMA devices limited to 32-bit addresses from those limited to 24-bit addresses.
  • ZONE_NORMAL: where ordinary memory lives.
  • ZONE_HIGHMEM: on 32-bit systems the virtual address space is small, with only 1GB for the kernel, so at most about 1GB of physical memory can be mapped permanently; the highmem zone provides temporary mappings so the kernel can address more physical memory. 64-bit architectures have a large enough virtual address space and do not need this zone.

A zone describes the physical address space it manages through pfns and a few counters (a sketch that dumps them follows the list):

  • zone_start_pfn: the first pfn of the space this zone describes.
  • spanned_pages: the number of physical pages the whole space spans, holes included; equals zone_end_pfn - zone_start_pfn.
  • present_pages: the number of pages actually present, i.e. spanned_pages minus the pages falling into holes.
  • managed_pages: the number of pages managed by the buddy system, i.e. present_pages minus the pages the zone reserves for other purposes.
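
A minimal sketch of how these counters could be dumped for every populated zone (dump_zone_pfn_counters() is a hypothetical debug helper; for_each_populated_zone(), zone_to_nid() and zone_managed_pages() are real v5.15 helpers):

#include <linux/mmzone.h>
#include <linux/printk.h>

/* Hypothetical debug helper: print the pfn counters of each populated zone. */
static void dump_zone_pfn_counters(void)
{
	struct zone *zone;

	for_each_populated_zone(zone) {
		pr_info("node %d %-8s start_pfn=%lu spanned=%lu present=%lu managed=%lu\n",
			zone_to_nid(zone), zone->name,
			zone->zone_start_pfn,
			zone->spanned_pages,
			zone->present_pages,
			zone_managed_pages(zone));
	}
}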

A zone also has the important member struct free_area free_area[MAX_ORDER]: the per-zone free lists used by the buddy system. Free blocks are linked into these lists through the struct page of their first page.
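
For reference, the per-order entry is tiny; free blocks are threaded onto these lists via the lru field of their head struct page:

/* include/linux/mmzone.h */
struct free_area {
	struct list_head	free_list[MIGRATE_TYPES];
	unsigned long		nr_free;
};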

Node and zone initialization

Allocating space for the node structure struct pglist_data:

  • numa_init(dummy_numa_init)

    • dummy_numa_init()

    • numa_register_memblks()

Initializing nodes and zones:

  • paging_init()
    • zone_sizes_init()
      • free_area_init()
        • free_area_init_node()
          • get_pfn_range_for_nid()
          • calculate_node_totalpages()
          • free_area_init_core()
        • memmap_init()

numa_init()

/* arch/x86/mm/numa.c */

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
				  MAX_NUMNODES));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
				  MAX_NUMNODES));
	/* In case that parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction
	 * here because if we configured ACPI_NUMA, we have
	 * parsed SRAT in init_func(). It is ok to have the
	 * reset here even if we did't configure ACPI_NUMA
	 * or acpi numa init fails and fallbacks to dummy
	 * numa init.
	 */
	memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

numa_init() receives a function as its argument and invokes it as init_func(). On a UMA machine (or when no NUMA information is available) this function is dummy_numa_init(), which treats all of memory as a single dummy node with nid = 0.

numa_register_memblks() allocates the struct pglist_data for each node.
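
On x86 this happens in alloc_node_data(), which carves the pg_data_t out of memblock memory, preferring the node it describes (abridged from arch/x86/mm/numa.c):

/* arch/x86/mm/numa.c (abridged) */
static void __init alloc_node_data(int nid)
{
	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	u64 nd_pa;
	void *nd;

	/* Allocate node data.  Try node-local memory and then any node. */
	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa) {
		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
		       nd_size, nid);
		return;
	}
	nd = __va(nd_pa);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

	node_set_online(nid);
}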

paging_init() and zone_sizes_init()

/* arch/x86/mm/init_64.c */

void __init paging_init(void)
{
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_MEMORY);
	node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}

/* arch/x86/mm/init.c */

void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
#endif
	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
#endif

	free_area_init(max_zone_pfns);
}

paging_init() calls sparse_init() to initialize the sparse memory model, then zone_sizes_init() to initialize every node and zone.

zone_sizes_init() specifies the upper pfn bound of each zone across the whole address space and calls free_area_init().
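
On x86-64 the per-zone caps fed into free_area_init() are fixed constants; as a worked example (assuming 4 KB pages, PAGE_SHIFT = 12):

MAX_DMA_PFN   = 16 MB >> 12 = 0x1000     ->  ZONE_DMA    capped at pfn 0x1000   (16 MB)
MAX_DMA32_PFN =  4 GB >> 12 = 0x100000   ->  ZONE_DMA32  capped at pfn 0x100000 (4 GB)
max_low_pfn                              ->  ZONE_NORMAL capped at the top of physical memory

This matches the "Zone ranges" boot log shown further below.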

free_area_init()

/* mm/page_alloc.c */

void __init free_area_init(unsigned long *max_zone_pfn)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, zone;
	bool descending;

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));

	start_pfn = find_min_pfn_with_active_regions();
	descending = arch_has_descending_max_zone_pfns();

	/* Determine each zone's pfn range (independent of nodes). */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (descending)
			zone = MAX_NR_ZONES - i - 1;
		else
			zone = i;

		if (zone == ZONE_MOVABLE)
			continue;

		end_pfn = max(max_zone_pfn[zone], start_pfn);
		arch_zone_lowest_possible_pfn[zone] = start_pfn;
		arch_zone_highest_possible_pfn[zone] = end_pfn;

		start_pfn = end_pfn;
	}

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes();

	/* Print out the zone ranges */
	pr_info("Zone ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		pr_info("  %-8s ", zone_names[i]);
		if (arch_zone_lowest_possible_pfn[i] ==
				arch_zone_highest_possible_pfn[i])
			pr_cont("empty\n");
		else
			pr_cont("[mem %#018Lx-%#018Lx]\n",
				(u64)arch_zone_lowest_possible_pfn[i]
					<< PAGE_SHIFT,
				((u64)arch_zone_highest_possible_pfn[i]
					<< PAGE_SHIFT) - 1);
	}

	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	pr_info("Movable zone start for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			pr_info("  Node %d: %#018Lx\n", i,
				(u64)zone_movable_pfn[i] << PAGE_SHIFT);
	}

	/*
	 * Print out the early node map, and initialize the
	 * subsection-map relative to active online memory ranges to
	 * enable future "sub-section" extensions of the memory map.
	 */
	/*
	 * Print the per-node pfn ranges reported by memblock;
	 * one node may span several ranges.
	 */
	pr_info("Early memory node ranges\n");
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
			(u64)start_pfn << PAGE_SHIFT,
			((u64)end_pfn << PAGE_SHIFT) - 1);
		subsection_map_init(start_pfn, end_pfn - start_pfn);
	}

	/* Initialise every node */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();
	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		free_area_init_node(nid);

		/* Any memory on that node */
		if (pgdat->node_present_pages)
			node_set_state(nid, N_MEMORY);
		check_for_memory(pgdat, nid);
	}

	memmap_init();
}

free_area_init() prints the layout of physical memory from both the zone view and the node view, then calls free_area_init_node(nid) for every node.

After node and zone initialization completes, free_area_init() also calls memmap_init(), which uses __pfn_to_page() to locate and initialize the struct page of every pfn.
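
Conceptually, memmap_init() comes down to a per-pfn loop over every present range of every zone; a simplified sketch (not the verbatim kernel code path, which goes through memmap_init_range()):

/* Simplified sketch of memmap_init() -> memmap_init_range() */
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
	struct page *page = pfn_to_page(pfn);	/* i.e. __pfn_to_page(pfn) */

	/* Zero the struct page and set its zone/node links and initial state. */
	__init_single_page(page, pfn, zone_idx, nid);
}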

[    0.041272] Zone ranges:
[    0.041314]   DMA      [mem 0x0000000000001000-0x0000000000ffffff]
[    0.041387]   DMA32    [mem 0x0000000001000000-0x00000000ffffffff]
[    0.041404]   Normal   [mem 0x0000000100000000-0x000000023fffffff]
[    0.041417]   Device   empty
[    0.041441] Movable zone start for each node
[    0.041476] Early memory node ranges
[    0.041503]   node   0: [mem 0x0000000000001000-0x000000000009efff]
[    0.041649]   node   0: [mem 0x0000000000100000-0x00000000bffdffff]
[    0.041682]   node   0: [mem 0x0000000100000000-0x000000013fffffff]
[    0.041695]   node   1: [mem 0x0000000140000000-0x000000023fffffff]

The log above is an example in which node0 and node1 are both 4GB NUMA nodes.

"Zone ranges" is the zone view of the physical address space. The low 4GB serves ZONE_DMA and ZONE_DMA32; everything above belongs to ZONE_NORMAL. With less than 4GB of usable physical memory there would be no ZONE_NORMAL.

"Early memory node ranges" is the memory layout reported by memblock. node0 is split into three ranges while node1 occupies a single range. Summing each node's ranges indeed gives 4GB per node.

Note: although there are 8GB of physical memory in total, the physical address space tops out at 0x240000000 (9GB). This is because node0 is fragmented, leaving address space for the BIOS, devices, and so on.

free_area_init_node() and free_area_init_core()

/* mm/page_alloc.c */

static void __init free_area_init_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = 0;
	unsigned long end_pfn = 0;

	/* pg_data_t should be reset to zero when it's allocated */
	WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);

	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

	pgdat->node_id = nid;
	pgdat->node_start_pfn = start_pfn;
	pgdat->per_cpu_nodestats = NULL;

	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
		(u64)start_pfn << PAGE_SHIFT,
		end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
	calculate_node_totalpages(pgdat, start_pfn, end_pfn);

	alloc_node_mem_map(pgdat);
	pgdat_set_deferred_range(pgdat);

	free_area_init_core(pgdat);
}

static void __init free_area_init_core(struct pglist_data *pgdat)
{
	enum zone_type j;
	int nid = pgdat->node_id;

	pgdat_init_internals(pgdat);
	pgdat->per_cpu_nodestats = &boot_nodestats;

	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, freesize, memmap_pages;

		size = zone->spanned_pages;
		freesize = zone->present_pages;

		/*
		 * Adjust freesize so that it accounts for how much memory
		 * is used by this zone for memmap. This affects the watermark
		 * and per-cpu initialisations
		 */
		memmap_pages = calc_memmap_size(size, freesize);
		if (!is_highmem_idx(j)) {
			if (freesize >= memmap_pages) {
				freesize -= memmap_pages;
				if (memmap_pages)
					pr_debug("  %s zone: %lu pages used for memmap\n",
						 zone_names[j], memmap_pages);
			} else
				pr_warn("  %s zone: %lu memmap pages exceeds freesize %lu\n",
					zone_names[j], memmap_pages, freesize);
		}

		/* Account for reserved pages */
		if (j == 0 && freesize > dma_reserve) {
			freesize -= dma_reserve;
			pr_debug("  %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
		}

		if (!is_highmem_idx(j))
			nr_kernel_pages += freesize;
		/* Charge for highmem memmap if there are enough kernel pages */
		else if (nr_kernel_pages > memmap_pages * 2)
			nr_kernel_pages -= memmap_pages;
		nr_all_pages += freesize;

		/*
		 * Set an approximate value for lowmem here, it will be adjusted
		 * when the bootmem allocator frees pages into the buddy system.
		 * And all highmem pages will be managed by the buddy system.
		 */
		zone_init_internals(zone, j, nid, freesize);

		if (!size)
			continue;

		set_pageblock_order();
		setup_usemap(zone);
		init_currently_empty_zone(zone, zone->zone_start_pfn, size);
	}
}

free_area_init_node() computes the address range covered by the current node with get_pfn_range_for_nid(), then uses calculate_node_totalpages() to compute spanned_pages and present_pages for each zone in the node.

free_area_init_core() iterates over the zones, computes each zone's managed_pages (the freesize passed to zone_init_internals()), and initializes each zone structure. managed_pages is essentially present_pages minus the pages backing the mem_map (the struct page array) and the DMA reserve.

free_area_init_core() also maintains the two global counters nr_all_pages and nr_kernel_pages: as the code above shows, nr_all_pages accumulates freesize across every zone, while nr_kernel_pages only accumulates the non-highmem portion.


  1. LZT's Zhihu column "深入理解Linux内存管理" (In-Depth Understanding of Linux Memory Management) (primary reference; series)

  2. "Linux中的memory model" (reference; figure source; series)

  3. "Linux内存模型之Sparse Memory Model" (reference; figure source; series)

  4. "Memory: the flat, the discontiguous, and the sparse" and its Chinese translation (reference)

  5. Kernel Exploring (reference; figure source)