Kernel address space

The main function calls kinit1 and kvmalloc to initialize the kernel address space and set up the kernel page table. xv6 uses the memory from end up to PHYSTOP as its free memory pool. Here end is defined in the linker script (kernel.ld): it is the address just past the bss section, i.e., just past the end of the kernel ELF image loaded into memory.
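
For orientation, the relevant calls in main look like this (excerpted from xv6's main.c; unrelated initialization is omitted):

int
main(void)
{
  kinit1(end, P2V(4*1024*1024)); // phys page allocator: free [end, KERNBASE+4MB) for now
  kvmalloc();                    // build kpgdir and load it into CR3
  // ... device and CPU initialization, startothers() ...
  kinit2(P2V(4*1024*1024), P2V(PHYSTOP)); // free the rest of physical memory
  userinit();                    // first user process
  mpmain();                      // finish this processor's setup
}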

kinit1

kinit1 calls freerange to initialize the physical memory in [end, KERNBASE+4MB]. Only this range is usable at first because the initial page table built earlier in entrypgdir maps nothing beyond KERNBASE+4MB. Once this range is initialized, kalloc has free pages to hand out, and kvmalloc needs kalloc to obtain free pages while building the page table. (The rest of physical memory, up to PHYSTOP, is freed later by kinit2, after the kernel page table is in place.)

void
kinit1(void *vstart, void *vend)
{
  initlock(&kmem.lock, "kmem");
  kmem.use_lock = 0;
  freerange(vstart, vend);
}

void
freerange(void *vstart, void *vend)
{
  char *p;
  p = (char*)PGROUNDUP((uint)vstart);
  for(; p + PGSIZE <= (char*)vend; p += PGSIZE)
    kfree(p);
}

"Initializing" the range means calling kfree on each page (4KB) of physical memory in it. kfree records the page in the freelist maintained by the global struct kmem. The freelist is a linked list of struct run nodes stored in the free pages themselves: kfree writes the next pointer into the head of the page being freed. This use of kfree is the one sanctioned exception; normally kfree may only be applied to memory returned by kalloc.

struct run {
  struct run *next;
};

struct {
  struct spinlock lock;
  int use_lock;
  struct run *freelist;
} kmem;

// Free the page of physical memory pointed at by v,
// which normally should have been returned by a
// call to kalloc().  (The exception is when
// initializing the allocator; see kinit above.)
void
kfree(char *v)
{
  struct run *r;

  if((uint)v % PGSIZE || v < end || V2P(v) >= PHYSTOP)
    panic("kfree");

  // Fill with junk to catch dangling refs.
  memset(v, 1, PGSIZE);

  if(kmem.use_lock)
    acquire(&kmem.lock);
  r = (struct run*)v;
  r->next = kmem.freelist;
  kmem.freelist = r;
  if(kmem.use_lock)
    release(&kmem.lock);
}
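
kalloc, referenced throughout but never shown, is the mirror image of kfree: it pops the first page off the freelist (from xv6's kalloc.c):

// Allocate one 4096-byte page of physical memory.
// Returns a pointer that the kernel can use.
// Returns 0 if the memory cannot be allocated.
char*
kalloc(void)
{
  struct run *r;

  if(kmem.use_lock)
    acquire(&kmem.lock);
  r = kmem.freelist;
  if(r)
    kmem.freelist = r->next;
  if(kmem.use_lock)
    release(&kmem.lock);
  return (char*)r;
}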

kvmalloc

kvmalloc builds the kernel part of a page table and loads the new page table into the CR3 register.

// Allocate one page table for the machine for the kernel address
// space for scheduler processes.
void
kvmalloc(void)
{
  kpgdir = setupkvm();
  switchkvm();
}

The kernel part of the page table is described by the kmap table. Here data, like end earlier, is defined in the linker script; it marks the start of the kernel's data section.

// This table defines the kernel's mappings, which are present in
// every process's page table.
static struct kmap {
  void *virt;
  uint phys_start;
  uint phys_end;
  int perm;
} kmap[] = {
 { (void*)KERNBASE, 0,             EXTMEM,    PTE_W}, // I/O space
 { (void*)KERNLINK, V2P(KERNLINK), V2P(data), 0},     // kern text+rodata
 { (void*)data,     V2P(data),     PHYSTOP,   PTE_W}, // kern data+memory
 { (void*)DEVSPACE, DEVSPACE,      0,         PTE_W}, // more devices
};
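
The constants referenced in kmap are defined in xv6's memlayout.h; their concrete values make the resulting layout explicit:

// Values from xv6's memlayout.h:
#define EXTMEM   0x100000            // start of extended memory
#define PHYSTOP  0xE000000           // top of physical memory
#define DEVSPACE 0xFE000000          // other devices are at high addresses
#define KERNBASE 0x80000000          // first kernel virtual address
#define KERNLINK (KERNBASE+EXTMEM)   // address where the kernel is linked
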
setupkvm()

setupkvm builds the kernel part of the page table via mappages, doing the bulk of kvmalloc's work.

// Set up kernel part of a page table.
pde_t*
setupkvm(void)
{
  pde_t *pgdir;
  struct kmap *k;

  if((pgdir = (pde_t*)kalloc()) == 0)
    return 0;
  memset(pgdir, 0, PGSIZE);
  if (P2V(PHYSTOP) > (void*)DEVSPACE)
    panic("PHYSTOP too high");
  for(k = kmap; k < &kmap[NELEM(kmap)]; k++)
    if(mappages(pgdir, k->virt, k->phys_end - k->phys_start,
                (uint)k->phys_start, k->perm) < 0) {
      freevm(pgdir);
      return 0;
    }
  return pgdir;
}

switchkvm()

In general a page table has both a kernel part and a user part. The page table kvmalloc gets back from setupkvm is stored in the global kpgdir; it is special in that it has only a kernel part and no user part. switchkvm loads this kpgdir into the CR3 register, signifying that no user process is currently running on the CPU (at this point the CPU is still initializing).

// Switch h/w page table register to the kernel-only page table,
// for when no process is running.
void
switchkvm(void)
{
  lcr3(V2P(kpgdir));   // switch to the kernel page table
}

Utility functions

mappages establishes page-by-page mappings from the size-byte virtual address range starting at va to the physical address range starting at pa.

// Create PTEs for virtual addresses starting at va that refer to
// physical addresses starting at pa. va and size might not
// be page-aligned.
static int
mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm)
{
  char *a, *last;
  pte_t *pte;

  a = (char*)PGROUNDDOWN((uint)va);
  last = (char*)PGROUNDDOWN(((uint)va) + size - 1);
  for(;;){
    if((pte = walkpgdir(pgdir, a, 1)) == 0)
      return -1;
    if(*pte & PTE_P)
      panic("remap");
    *pte = pa | perm | PTE_P;
    if(a == last)
      break;
    a += PGSIZE;
    pa += PGSIZE;
  }
  return 0;
}

walkpgdir walks the page directory to find the page table entry for va. If the page directory entry pointing at the relevant page table does not yet exist, it is created (when alloc is nonzero). A PDE holds the physical address of a page table, which is why the P2V and V2P conversions appear here.

// Return the address of the PTE in page table pgdir
// that corresponds to virtual address va.  If alloc!=0,
// create any required page table pages.
static pte_t *
walkpgdir(pde_t *pgdir, const void *va, int alloc)
{
  pde_t *pde;
  pte_t *pgtab;

  pde = &pgdir[PDX(va)];
  if(*pde & PTE_P){
    pgtab = (pte_t*)P2V(PTE_ADDR(*pde));
  } else {
    if(!alloc || (pgtab = (pte_t*)kalloc()) == 0)
      return 0;
    // Make sure all those PTE_P bits are zero.
    memset(pgtab, 0, PGSIZE);
    // The permissions here are overly generous, but they can
    // be further restricted by the permissions in the page table
    // entries, if necessary.
    *pde = V2P(pgtab) | PTE_P | PTE_W | PTE_U;
  }
  return &pgtab[PTX(va)];
}
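
As a worked example of the two-level lookup, here is how the PDX and PTX macros (from xv6's mmu.h) carve up a 32-bit virtual address:

// Address decomposition used by walkpgdir (macros from xv6's mmu.h):
//   #define PDX(va)  (((uint)(va) >> 22) & 0x3FF)   // page directory index
//   #define PTX(va)  (((uint)(va) >> 12) & 0x3FF)   // page table index
//
// Example: va = 0x80101234
//   PDX(va) = 0x200  -> pgdir[0x200] holds the PDE for this 4MB region
//   PTX(va) = 0x101  -> pgtab[0x101] holds the PTE for this 4KB page
//   offset  = 0x234  -> low 12 bits, byte offset within the page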

User address space

There are two ways a process comes into existence:

fork()

fork uses allocproc to allocate the process's bookkeeping structures and copyuvm to duplicate the parent's address space. To the parent, fork is an ordinary system call that returns through the normal syscall path. To the child, fork returns 0 (the np->tf->eax = 0 assignment below), and the child first runs forkret before returning from the system call exactly as the parent does (possible because the trap frame is copied from the parent).

// Create a new process copying p as the parent.
// Sets up stack to return as if from system call.
// Caller must set state of returned proc to RUNNABLE.
int
fork(void)
{
  int i, pid;
  struct proc *np;
  struct proc *curproc = myproc();

  // Allocate process.
  if((np = allocproc()) == 0){
    return -1;
  }

  // Copy process state from proc.
  if((np->pgdir = copyuvm(curproc->pgdir, curproc->sz)) == 0){
    kfree(np->kstack);
    np->kstack = 0;
    np->state = UNUSED;
    return -1;
  }
  np->sz = curproc->sz;
  np->parent = curproc;
  *np->tf = *curproc->tf;

  // Clear %eax so that fork returns 0 in the child.
  np->tf->eax = 0;

  for(i = 0; i < NOFILE; i++)
    if(curproc->ofile[i])
      np->ofile[i] = filedup(curproc->ofile[i]);
  np->cwd = idup(curproc->cwd);

  safestrcpy(np->name, curproc->name, sizeof(curproc->name));

  pid = np->pid;

  acquire(&ptable.lock);

  np->state = RUNNABLE;

  release(&ptable.lock);

  return pid;
}

allocproc()

allocproc claims a slot in the process table for the new process and sets up its kernel stack. Three things must be placed on the kernel stack:

  1. The initial context. This is what the scheduler switches to the first time the new process runs; %eip is set to forkret, so execution begins in that function.
  2. The return address for forkret. forkret is not entered through a normal call instruction, so when it returns it uses whatever value %esp points at, which is the slot just above the context. allocproc stores the address of trapret there, so forkret returns into trapret.
  3. The trap frame. trapret restores it from the stack (the region just above the trapret slot on the kstack) and returns to user mode. The caller of allocproc is responsible for filling it in; fork, for example, copies it verbatim from the parent (except the return value, 0 in the child), so the child resumes user mode in the same state as the parent. The resulting layout is sketched after this list.
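
A sketch of the kernel stack allocproc produces (addresses grow downward from p->kstack + KSTACKSIZE):

// Kernel stack as laid out by allocproc (sketch):
//
//   p->kstack + KSTACKSIZE -> +--------------------+
//                             |     trap frame     | <- p->tf, restored by trapret
//                             +--------------------+
//                             | address of trapret | <- "return address" consumed
//                             +--------------------+    when forkret returns
//                             |      context       | <- p->context, eip = forkret,
//                             +--------------------+    restored by the scheduler's swtch
//                             |      (unused)      |
//   p->kstack              -> +--------------------+
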
// Look in the process table for an UNUSED proc.
// If found, change state to EMBRYO and initialize
// state required to run in the kernel.
// Otherwise return 0.
static struct proc*
allocproc(void)
{
  struct proc *p;
  char *sp;

  acquire(&ptable.lock);

  for(p = ptable.proc; p < &ptable.proc[NPROC]; p++)
    if(p->state == UNUSED)
      goto found;

  release(&ptable.lock);
  return 0;

found:
  p->state = EMBRYO;
  p->pid = nextpid++;

  release(&ptable.lock);

  // Allocate kernel stack.
  if((p->kstack = kalloc()) == 0){
    p->state = UNUSED;
    return 0;
  }
  sp = p->kstack + KSTACKSIZE;

  // Leave room for trap frame.
  sp -= sizeof *p->tf;
  p->tf = (struct trapframe*)sp;

  // Set up new context to start executing at forkret,
  // which returns to trapret.
  sp -= 4;
  *(uint*)sp = (uint)trapret;

  sp -= sizeof *p->context;
  p->context = (struct context*)sp;
  memset(p->context, 0, sizeof *p->context);
  p->context->eip = (uint)forkret;

  return p;
}

copyuvm()

copyuvm does not implement copy-on-write: it first uses setupkvm to build the kernel part of the child's page table, then simply copies the parent's user space page by page.

// Given a parent process's page table, create a copy
// of it for a child.
pde_t*
copyuvm(pde_t *pgdir, uint sz)
{
  pde_t *d;
  pte_t *pte;
  uint pa, i, flags;
  char *mem;

  if((d = setupkvm()) == 0)
    return 0;
  for(i = 0; i < sz; i += PGSIZE){
    if((pte = walkpgdir(pgdir, (void *) i, 0)) == 0)
      panic("copyuvm: pte should exist");
    if(!(*pte & PTE_P))
      panic("copyuvm: page not present");
    pa = PTE_ADDR(*pte);
    flags = PTE_FLAGS(*pte);
    if((mem = kalloc()) == 0)
      goto bad;
    memmove(mem, (char*)P2V(pa), PGSIZE);
    if(mappages(d, (void*)i, PGSIZE, V2P(mem), flags) < 0) {
      kfree(mem);
      goto bad;
    }
  }
  return d;

bad:
  freevm(d);
  return 0;
}

userinit()

userinit creates the first process. It likewise uses setupkvm to prepare the kernel page table, but there is no parent to copy a user part from, so it uses inituvm to map the initcode binary (which will exec /init) into the user part of the address space.

void
userinit(void)
{
  struct proc *p;
  extern char _binary_initcode_start[], _binary_initcode_size[];

  p = allocproc();

  initproc = p;
  if((p->pgdir = setupkvm()) == 0)
    panic("userinit: out of memory?");
  inituvm(p->pgdir, _binary_initcode_start, (int)_binary_initcode_size);
  p->sz = PGSIZE;
  memset(p->tf, 0, sizeof(*p->tf));
  p->tf->cs = (SEG_UCODE << 3) | DPL_USER;
  p->tf->ds = (SEG_UDATA << 3) | DPL_USER;
  p->tf->es = p->tf->ds;
  p->tf->ss = p->tf->ds;
  p->tf->eflags = FL_IF;
  p->tf->esp = PGSIZE;
  p->tf->eip = 0;  // beginning of initcode.S

  safestrcpy(p->name, "initcode", sizeof(p->name));
  p->cwd = namei("/");

  // this assignment to p->state lets other cores
  // run this process. the acquire forces the above
  // writes to be visible, and the lock is also needed
  // because the assignment might not be atomic.
  acquire(&ptable.lock);

  p->state = RUNNABLE;

  release(&ptable.lock);
}

inituvm()
// Load the initcode into address 0 of pgdir.
// sz must be less than a page.
void
inituvm(pde_t *pgdir, char *init, uint sz)
{
  char *mem;

  if(sz >= PGSIZE)
    panic("inituvm: more than a page");
  mem = kalloc();
  memset(mem, 0, PGSIZE);
  mappages(pgdir, 0, PGSIZE, V2P(mem), PTE_W|PTE_U);
  memmove(mem, init, sz);
}

exec()

exec loads the named ELF file into memory. After reading in the segments it uses allocuvm to allocate two more pages and clearpteu to clear the user bit on the first page's PTE. The first page becomes an inaccessible guard page and the second the user stack. exec then builds main's argc and argv on the user stack and pushes a fake return address, so that the stack looks exactly as if main had just been called.

  // Allocate two pages at the next page boundary.
  // Make the first inaccessible.  Use the second as the user stack.
  sz = PGROUNDUP(sz);
  if((sz = allocuvm(pgdir, sz, sz + 2*PGSIZE)) == 0)
    goto bad;
  clearpteu(pgdir, (char*)(sz - 2*PGSIZE));
  sp = sz;

  // Push argument strings, prepare rest of stack in ustack.
  for(argc = 0; argv[argc]; argc++) {
    if(argc >= MAXARG)
      goto bad;
    sp = (sp - (strlen(argv[argc]) + 1)) & ~3;
    if(copyout(pgdir, sp, argv[argc], strlen(argv[argc]) + 1) < 0)
      goto bad;
    ustack[3+argc] = sp;
  }
  ustack[3+argc] = 0;

  ustack[0] = 0xffffffff;  // fake return PC
  ustack[1] = argc;
  ustack[2] = sp - (argc+1)*4;  // argv pointer

  sp -= (3+argc+1) * 4;
  if(copyout(pgdir, sp, ustack, (3+argc+1)*4) < 0)
    goto bad;
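
To make this concrete, here is the stack this code would build for argv = {"echo", "hi"} (a sketch; higher addresses at the top, stack top at sz):

// User stack after exec's argument setup, for argv = {"echo", "hi"} (sketch):
//
//   (high addresses)
//   "echo\0"            <- pushed first, word-aligned via the & ~3 mask
//   "hi\0"              <- pushed second
//   0                   <- ustack[5]: argv[argc] terminator
//   address of "hi"     <- ustack[4]: argv[1]
//   address of "echo"   <- ustack[3]: argv[0]
//   address of argv[0]  <- ustack[2]: the argv pointer main receives
//   2                   <- ustack[1]: argc
//   0xffffffff          <- ustack[0]: fake return PC; %esp points here
//   (low addresses)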

Finally exec commits: it installs the new address space into the current process's state and calls switchuvm to make the takeover effective.

  // Commit to the user image.
  oldpgdir = curproc->pgdir;
  curproc->pgdir = pgdir;
  curproc->sz = sz;
  curproc->tf->eip = elf.entry;  // main
  curproc->tf->esp = sp;
  switchuvm(curproc);
  freevm(oldpgdir);

switchuvm()

switchuvm records the kernel stack in the TSS (the ss0 and esp0 fields), which is where the hardware gets the kernel stack pointer when the process traps from user mode into the kernel. Only one TSS is needed per CPU: on each process switch, its ss0/esp0 are simply replaced with the new process's kernel stack. Finally, switchuvm loads p's pgdir into CR3, completing the switch of the user address space.

// Switch TSS and h/w page table to correspond to process p.
void
switchuvm(struct proc *p)
{
  if(p == 0)
    panic("switchuvm: no process");
  if(p->kstack == 0)
    panic("switchuvm: no kstack");
  if(p->pgdir == 0)
    panic("switchuvm: no pgdir");

  pushcli();
  mycpu()->gdt[SEG_TSS] = SEG16(STS_T32A, &mycpu()->ts,
                                sizeof(mycpu()->ts)-1, 0);
  mycpu()->gdt[SEG_TSS].s = 0;
  mycpu()->ts.ss0 = SEG_KDATA << 3;
  mycpu()->ts.esp0 = (uint)p->kstack + KSTACKSIZE;
  // setting IOPL=0 in eflags *and* iomb beyond the tss segment limit
  // forbids I/O instructions (e.g., inb and outb) from user space
  mycpu()->ts.iomb = (ushort) 0xFFFF;
  ltr(SEG_TSS << 3);
  lcr3(V2P(p->pgdir));  // switch to process's address space
  popcli();
}