/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2014, 2015.
This file is part of Sortix.
Sortix is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.
Sortix is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.
You should have received a copy of the GNU General Public License along with
Sortix. If not, see <http://www.gnu.org/licenses/>.
x86-family/memorymanagement.cpp
Handles memory for the x86 family of architectures.
*******************************************************************************/
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <sortix/mman.h>
#include <sortix/kernel/kernel.h>
#include <sortix/kernel/kthread.h>
#include <sortix/kernel/memorymanagement.h>
#include <sortix/kernel/panic.h>
#include <sortix/kernel/pat.h>
#include <sortix/kernel/syscall.h>
#include "multiboot.h"
#include "memorymanagement.h"
#include "msr.h"
namespace Sortix {
extern size_t end;
} // namespace Sortix
namespace Sortix {
namespace Page {
void InitPushRegion(addr_t position, size_t length);
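// Physical page frames are tracked on a stack of free page addresses
// (STACK). stackused counts the entries on the stack, stackreserved counts
// entries promised to callers through Reserve() below, and pages that do
// not fit on the stack are merely counted in pagesnotonstack.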
size_t pagesnotonstack = 0;
size_t stackused = 0;
size_t stackreserved = 0;
size_t stacklength = 4096 / sizeof(addr_t);
size_t totalmem = 0;
size_t page_usage_counts[PAGE_USAGE_NUM_KINDS];
kthread_mutex_t pagelock = KTHREAD_MUTEX_INITIALIZER;
} // namespace Page
} // namespace Sortix
namespace Sortix {
namespace Memory {
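// Table translating each PAT memory type to the page table entry flags
// selecting it; filled in by Init() below, either from the real PAT
// feature or approximated with the classic cache control bits.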
addr_t PAT2PMLFlags[PAT_NUM];
void Init(multiboot_info_t* bootinfo)
{
addr_t kernelend = Page::AlignUp((addr_t) &end);
if ( !(bootinfo->flags & MULTIBOOT_INFO_MEM_MAP) )
Panic("The memory map flag was't set in the multiboot structure.");
// If supported, setup the Page Attribute Table feature that allows
// us to control the memory type (caching) of memory more precisely.
if ( IsPATSupported() )
{
InitializePAT();
for ( addr_t i = 0; i < PAT_NUM; i++ )
PAT2PMLFlags[i] = EncodePATAsPMLFlag(i);
}
// Otherwise, reroute all requests to the backwards compatible scheme.
// TODO: Not all early 32-bit x86 CPUs support these values.
else
{
PAT2PMLFlags[PAT_UC] = PML_WRTHROUGH | PML_NOCACHE;
PAT2PMLFlags[PAT_WC] = PML_WRTHROUGH | PML_NOCACHE; // Approx.
PAT2PMLFlags[2] = 0; // No such flag.
PAT2PMLFlags[3] = 0; // No such flag.
PAT2PMLFlags[PAT_WT] = PML_WRTHROUGH;
PAT2PMLFlags[PAT_WP] = PML_WRTHROUGH; // Approx.
PAT2PMLFlags[PAT_WB] = 0;
PAT2PMLFlags[PAT_UCM] = PML_NOCACHE;
}
typedef const multiboot_memory_map_t* mmap_t;
// Loop over every detected memory region.
for (
mmap_t mmap = (mmap_t) (addr_t) bootinfo->mmap_addr;
(addr_t) mmap < bootinfo->mmap_addr + bootinfo->mmap_length;
mmap = (mmap_t) ((addr_t) mmap + mmap->size + sizeof(mmap->size))
)
{
// Check that we can use this kind of RAM.
if ( mmap->type != 1 )
continue;
// The kernel's code may split this memory area into multiple pieces.
addr_t base = (addr_t) mmap->addr;
size_t length = Page::AlignDown(mmap->len);
#if defined(__i386__)
// Figure out if the memory area is addressable (are our pointers big enough?)
if ( 0xFFFFFFFFULL < mmap->addr )
continue;
if ( 0xFFFFFFFFULL < mmap->addr + mmap->len )
length = 0x100000000ULL - mmap->addr;
#endif
// Count the amount of usable RAM (even if reserved for kernel).
Page::totalmem += length;
// Give all the physical memory to the physical memory allocator
// but make sure not to give it things we already use.
addr_t regionstart = base;
addr_t regionend = base + length;
addr_t processed = regionstart;
while ( processed < regionend )
{
addr_t lowest = processed;
addr_t highest = regionend;
// Don't allocate the kernel.
if ( lowest < kernelend )
{
processed = kernelend;
continue;
}
// Don't give any of our modules to the physical page
// allocator, we'll need them.
bool continuing = false;
uint32_t* modules = (uint32_t*) (addr_t) bootinfo->mods_addr;
for ( uint32_t i = 0; i < bootinfo->mods_count; i++ )
{
size_t modsize = (size_t) (modules[2*i+1] - modules[2*i+0]);
addr_t modstart = (addr_t) modules[2*i+0];
addr_t modend = modstart + modsize;
if ( modstart <= processed && processed < modend )
{
processed = modend;
continuing = true;
break;
}
if ( lowest <= modstart && modstart < highest )
highest = modstart;
}
if ( continuing )
continue;
if ( highest <= lowest )
break;
// Now that we have a contiguous area not used by anything,
// let's forward it to the physical page allocator.
lowest = Page::AlignUp(lowest);
highest = Page::AlignUp(highest);
size_t size = highest - lowest;
Page::InitPushRegion(lowest, size);
processed = highest;
}
}
// Prepare the non-forkable kernel PMLs such that forking the kernel address
// space will always keep the kernel mapped.
for ( size_t i = ENTRIES / 2; i < ENTRIES; i++ )
{
PML* const pml = PMLS[TOPPMLLEVEL];
if ( pml->entry[i] & PML_PRESENT )
continue;
addr_t page = Page::Get(PAGE_USAGE_PAGING_OVERHEAD);
if ( !page )
Panic("Out of memory allocating boot PMLs.");
pml->entry[i] = page | PML_WRITABLE | PML_PRESENT;
// Invalidate the new PML and reset it to zeroes.
addr_t pmladdr = (addr_t) (PMLS[TOPPMLLEVEL-1] + i);
InvalidatePage(pmladdr);
memset((void*) pmladdr, 0, sizeof(PML));
}
}
void Statistics(size_t* amountused, size_t* totalmem)
{
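// Each stack entry is a 4 KiB page frame, hence the shift by 12 to
// convert the number of free pages into bytes.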
size_t memfree = (Page::stackused - Page::stackreserved) << 12UL;
size_t memused = Page::totalmem - memfree;
if ( amountused )
*amountused = memused;
if ( totalmem )
*totalmem = Page::totalmem;
}
} // namespace Memory
} // namespace Sortix
namespace Sortix {
namespace Page {
void PageUsageRegisterUse(addr_t where, enum page_usage usage)
{
if ( PAGE_USAGE_NUM_KINDS <= usage )
return;
(void) where;
page_usage_counts[usage]++;
}
void PageUsageRegisterFree(addr_t where, enum page_usage usage)
{
if ( PAGE_USAGE_NUM_KINDS <= usage )
return;
(void) where;
assert(page_usage_counts[usage] != 0);
page_usage_counts[usage]--;
}
void ExtendStack()
{
// This call will always succeed: this function is only called when the
// stack is full, so there are plenty of free pages to take from it.
addr_t page = GetUnlocked(PAGE_USAGE_PHYSICAL);
// This call will also succeed, since plenty of physical pages are
// available in case it needs to allocate page table levels.
addr_t virt = (addr_t) (STACK + stacklength);
if ( !Memory::Map(page, virt, PROT_KREAD | PROT_KWRITE) )
Panic("Unable to extend page stack, which should have worked");
// TODO: This may not be needed during the boot process!
//Memory::InvalidatePage((addr_t) (STACK + stacklength));
stacklength += 4096UL / sizeof(addr_t);
}
void InitPushRegion(addr_t position, size_t length)
{
// Align our entries on page boundaries.
addr_t newposition = Page::AlignUp(position);
length = Page::AlignDown((position + length) - newposition);
position = newposition;
while ( length )
{
if ( unlikely(stackused == stacklength) )
{
if ( stackused == MAXSTACKLENGTH )
{
pagesnotonstack += length / 4096UL;
return;
}
ExtendStack();
}
addr_t* stackentry = &(STACK[stackused++]);
*stackentry = position;
length -= 4096UL;
position += 4096UL;
}
}
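// Reserve between least and ideal pages for later GetReserved() calls,
// failing with ENOMEM if even `least` pages cannot be set aside. The
// reserved pages remain on the stack but are moved into its reserved
// portion and accounted for in the caller's counter.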
bool ReserveUnlocked(size_t* counter, size_t least, size_t ideal)
{
assert(least <= ideal);
size_t available = stackused - stackreserved;
if ( available < least )
return errno = ENOMEM, false;
if ( available < ideal )
ideal = available;
stackreserved += ideal;
*counter += ideal;
return true;
}
bool Reserve(size_t* counter, size_t least, size_t ideal)
{
ScopedLock lock(&pagelock);
return ReserveUnlocked(counter, least, ideal);
}
bool ReserveUnlocked(size_t* counter, size_t amount)
{
return ReserveUnlocked(counter, amount, amount);
}
bool Reserve(size_t* counter, size_t amount)
{
ScopedLock lock(&pagelock);
return ReserveUnlocked(counter, amount);
}
addr_t GetReservedUnlocked(size_t* counter, enum page_usage usage)
{
if ( !*counter )
return 0;
assert(stackused); // After all, we did _reserve_ the memory.
addr_t result = STACK[--stackused];
assert(result == AlignDown(result));
stackreserved--;
(*counter)--;
PageUsageRegisterUse(result, usage);
return result;
}
addr_t GetReserved(size_t* counter, enum page_usage usage)
{
ScopedLock lock(&pagelock);
return GetReservedUnlocked(counter, usage);
}
addr_t GetUnlocked(enum page_usage usage)
{
assert(stackreserved <= stackused);
if ( unlikely(stackreserved == stackused) )
return errno = ENOMEM, 0;
addr_t result = STACK[--stackused];
assert(result == AlignDown(result));
PageUsageRegisterUse(result, usage);
return result;
}
addr_t Get(enum page_usage usage)
{
ScopedLock lock(&pagelock);
return GetUnlocked(usage);
}
// TODO: This competes with the normal allocation for precious 32-bit pages, we
// should use different pools for this, and preferably preallocate some
// 32-bit pages exclusively for driver usage. Also, get proper hardware
// without these issues.
addr_t Get32BitUnlocked(enum page_usage usage)
{
assert(stackreserved <= stackused);
if ( unlikely(stackreserved == stackused) )
return errno = ENOMEM, 0;
for ( size_t ii = stackused; 0 < ii; ii-- )
{
size_t i = ii - 1;
addr_t result = STACK[i];
assert(result == AlignDown(result));
if ( 4 < sizeof(void*) && UINT32_MAX < result )
continue;
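// Move the found 32-bit page to the top of the stack by swapping it,
// so that decrementing stackused below pops exactly this entry.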
if ( i + 1 != stackused )
{
STACK[i] = STACK[stackused - 1];
STACK[stackused - 1] = result;
}
stackused--;
PageUsageRegisterUse(result, usage);
return result;
}
return errno = ENOMEM, 0;
}
addr_t Get32Bit(enum page_usage usage)
{
ScopedLock lock(&pagelock);
return Get32BitUnlocked(usage);
}
void PutUnlocked(addr_t page, enum page_usage usage)
{
assert(page == AlignDown(page));
if ( unlikely(stackused == stacklength) )
{
if ( stackused == MAXSTACKLENGTH )
{
pagesnotonstack++;
return;
}
ExtendStack();
}
STACK[stackused++] = page;
PageUsageRegisterFree(page, usage);
}
void Put(addr_t page, enum page_usage usage)
{
ScopedLock lock(&pagelock);
PutUnlocked(page, usage);
}
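// Typical allocation pattern (sketch; `virt` stands for some page-aligned
// virtual address chosen by the caller):
//
//   addr_t frame = Page::Get(PAGE_USAGE_PHYSICAL);
//   if ( !frame )
//       return false; // errno is ENOMEM.
//   Memory::Map(frame, virt, PROT_KREAD | PROT_KWRITE);
//   // ... use the mapping ...
//   Page::Put(Memory::Unmap(virt), PAGE_USAGE_PHYSICAL);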
void Lock()
{
kthread_mutex_lock(&pagelock);
}
void Unlock()
{
kthread_mutex_unlock(&pagelock);
}
} // namespace Page
} // namespace Sortix
namespace Sortix {
namespace Memory {
addr_t ProtectionToPMLFlags(int prot)
{
addr_t result = PML_NX;
if ( prot & PROT_EXEC )
{
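// x86 paging has no execute-only permission; an executable page is
// also readable, hence PML_USERSPACE is set here as well.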
result |= PML_USERSPACE;
result &= ~PML_NX;
}
if ( prot & PROT_READ )
result |= PML_USERSPACE;
if ( prot & PROT_WRITE )
result |= PML_USERSPACE | PML_WRITABLE;
if ( prot & PROT_KEXEC )
result &= ~PML_NX;
if ( prot & PROT_KREAD )
result |= 0;
if ( prot & PROT_KWRITE )
result |= PML_WRITABLE;
if ( prot & PROT_FORK )
result |= PML_FORK;
return result;
}
int PMLFlagsToProtection(addr_t flags)
{
int prot = PROT_KREAD;
if ( (flags & PML_USERSPACE) && !(flags & PML_NX) )
prot |= PROT_EXEC;
if ( (flags & PML_USERSPACE) )
prot |= PROT_READ;
if ( (flags & PML_USERSPACE) && (flags & PML_WRITABLE) )
prot |= PROT_WRITE;
if ( !(flags & PML_NX) )
prot |= PROT_KEXEC;
if ( flags & PML_WRITABLE )
prot |= PROT_KWRITE;
if ( flags & PML_FORK )
prot |= PROT_FORK;
return prot;
}
int ProvidedProtection(int prot)
{
return PMLFlagsToProtection(ProtectionToPMLFlags(prot));
}
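// The page tables are fractally (recursively) mapped: the last entry of
// the top level PML points back at the top PML itself (see Fork() below),
// so every PML of the current address space is visible at a fixed virtual
// address through the PMLS[] windows used in the functions below.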
bool LookUp(addr_t mapto, addr_t* physical, int* protection)
{
// Translate the virtual address into PML indexes.
const size_t MASK = (1<<TRANSBITS)-1;
size_t pmlchildid[TOPPMLLEVEL + 1];
for ( size_t i = 1; i <= TOPPMLLEVEL; i++ )
pmlchildid[i] = mapto >> (12 + (i-1) * TRANSBITS) & MASK;
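// For example, with 4-level paging (TOPPMLLEVEL = 4, TRANSBITS = 9, as
// on x86_64), pmlchildid[4] is bits 39-47 of the address, pmlchildid[3]
// bits 30-38, pmlchildid[2] bits 21-29, and pmlchildid[1] bits 12-20;
// bits 0-11 are the offset into the 4 KiB page.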
int prot = PROT_USER | PROT_KERNEL | PROT_FORK;
// For each PML level, make sure it exists.
size_t offset = 0;
for ( size_t i = TOPPMLLEVEL; i > 1; i-- )
{
size_t childid = pmlchildid[i];
PML* pml = PMLS[i] + offset;
addr_t entry = pml->entry[childid];
if ( !(entry & PML_PRESENT) )
return false;
addr_t entryflags = entry & ~PML_ADDRESS;
int entryprot = PMLFlagsToProtection(entryflags);
prot &= entryprot;
// Find the index of the next PML in the fractal mapped memory.
offset = offset * ENTRIES + childid;
}
addr_t entry = (PMLS[1] + offset)->entry[pmlchildid[1]];
if ( !(entry & PML_PRESENT) )
return false;
addr_t entryflags = entry & ~PML_ADDRESS;
int entryprot = PMLFlagsToProtection(entryflags);
prot &= entryprot;
addr_t phys = entry & PML_ADDRESS;
if ( physical )
*physical = phys;
if ( protection )
*protection = prot;
return true;
}
void InvalidatePage(addr_t /*addr*/)
{
// TODO: Actually just call the instruction.
Flush();
}
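// A single-page invalidation would avoid the full TLB flush above; a
// sketch, with the parameter renamed back to addr:
//
//   asm volatile ( "invlpg (%0)" : : "r"(addr) : "memory" );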
addr_t GetAddressSpace()
{
addr_t result;
asm ( "mov %%cr3, %0" : "=r"(result) );
return result;
}
addr_t SwitchAddressSpace(addr_t addrspace)
{
assert(Page::IsAligned(addrspace));
addr_t previous = GetAddressSpace();
asm volatile ( "mov %0, %%cr3" : : "r"(addrspace) );
return previous;
}
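// Reloading CR3 flushes all non-global entries from the TLB.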
void Flush()
{
addr_t previous;
asm ( "mov %%cr3, %0" : "=r"(previous) );
asm volatile ( "mov %0, %%cr3" : : "r"(previous) );
}
bool MapRange(addr_t where, size_t bytes, int protection, enum page_usage usage)
{
for ( addr_t page = where; page < where + bytes; page += 4096UL )
{
addr_t physicalpage = Page::Get(usage);
if ( physicalpage == 0 )
{
while ( where < page )
{
page -= 4096UL;
physicalpage = Unmap(page);
Page::Put(physicalpage, usage);
}
return false;
}
Map(physicalpage, page, protection);
}
return true;
}
bool UnmapRange(addr_t where, size_t bytes, enum page_usage usage)
{
for ( addr_t page = where; page < where + bytes; page += 4096UL )
{
addr_t physicalpage = Unmap(page);
Page::Put(physicalpage, usage);
}
return true;
}
static bool MapInternal(addr_t physical, addr_t mapto, int prot, addr_t extraflags = 0)
{
addr_t flags = ProtectionToPMLFlags(prot) | PML_PRESENT;
// Translate the virtual address into PML indexes.
const size_t MASK = (1<<TRANSBITS)-1;
size_t pmlchildid[TOPPMLLEVEL + 1];
for ( size_t i = 1; i <= TOPPMLLEVEL; i++ )
pmlchildid[i] = mapto >> (12 + (i-1) * TRANSBITS) & MASK;
// For each PML level, make sure it exists.
size_t offset = 0;
for ( size_t i = TOPPMLLEVEL; i > 1; i-- )
{
size_t childid = pmlchildid[i];
PML* pml = PMLS[i] + offset;
addr_t& entry = pml->entry[childid];
// Find the index of the next PML in the fractal mapped memory.
size_t childoffset = offset * ENTRIES + childid;
if ( !(entry & PML_PRESENT) )
{
// TODO: Possible memory leak when page allocation fails.
addr_t page = Page::Get(PAGE_USAGE_PAGING_OVERHEAD);
if ( !page )
return false;
addr_t pmlflags = PML_PRESENT | PML_WRITABLE | PML_USERSPACE
| PML_FORK;
entry = page | pmlflags;
// Invalidate the new PML and reset it to zeroes.
addr_t pmladdr = (addr_t) (PMLS[i-1] + childoffset);
InvalidatePage(pmladdr);
memset((void*) pmladdr, 0, sizeof(PML));
}
offset = childoffset;
}
// Actually map the physical page to the virtual page.
const addr_t entry = physical | flags | extraflags;
(PMLS[1] + offset)->entry[pmlchildid[1]] = entry;
return true;
}
bool Map(addr_t physical, addr_t mapto, int prot)
{
return MapInternal(physical, mapto, prot);
}
void PageProtect(addr_t mapto, int protection)
{
addr_t phys;
if ( !LookUp(mapto, &phys, NULL) )
return;
Map(phys, mapto, protection);
}
void PageProtectAdd(addr_t mapto, int protection)
{
addr_t phys;
int prot;
if ( !LookUp(mapto, &phys, &prot) )
return;
prot |= protection;
Map(phys, mapto, prot);
}
void PageProtectSub(addr_t mapto, int protection)
{
addr_t phys;
int prot;
if ( !LookUp(mapto, &phys, &prot) )
return;
prot &= ~protection;
Map(phys, mapto, prot);
}
addr_t Unmap(addr_t mapto)
{
// Translate the virtual address into PML indexes.
const size_t MASK = (1<<TRANSBITS)-1;
size_t pmlchildid[TOPPMLLEVEL + 1];
for ( size_t i = 1; i <= TOPPMLLEVEL; i++ )
{
pmlchildid[i] = mapto >> (12 + (i-1) * TRANSBITS) & MASK;
}
// For each PML level, make sure it exists.
size_t offset = 0;
for ( size_t i = TOPPMLLEVEL; i > 1; i-- )
{
size_t childid = pmlchildid[i];
PML* pml = PMLS[i] + offset;
addr_t& entry = pml->entry[childid];
if ( !(entry & PML_PRESENT) )
PanicF("Attempted to unmap virtual page 0x%jX, but the virtual"
" page was wasn't mapped. This is a bug in the code "
"code calling this function", (uintmax_t) mapto);
// Find the index of the next PML in the fractal mapped memory.
offset = offset * ENTRIES + childid;
}
addr_t& entry = (PMLS[1] + offset)->entry[pmlchildid[1]];
addr_t result = entry & PML_ADDRESS;
entry = 0;
// TODO: If all the entries in PML[N] are not-present, then who
// unmaps its entry from PML[N-1]?
return result;
}
bool MapPAT(addr_t physical, addr_t mapto, int prot, addr_t mtype)
{
addr_t extraflags = PAT2PMLFlags[mtype];
return MapInternal(physical, mapto, prot, extraflags);
}
void ForkCleanup(size_t i, size_t level)
{
PML* destpml = FORKPML + level;
if ( !i )
return;
for ( size_t n = 0; n < i-1; n++ )
{
addr_t entry = destpml->entry[n];
if ( !(entry & PML_FORK ) )
continue;
addr_t phys = entry & PML_ADDRESS;
if ( 1 < level )
{
addr_t destaddr = (addr_t) (FORKPML + level-1);
Map(phys, destaddr, PROT_KREAD | PROT_KWRITE);
InvalidatePage(destaddr);
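// ForkCleanup's loop runs for n < i-1, so passing ENTRIES+1 here makes
// the recursive call visit every entry of the child PML.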
ForkCleanup(ENTRIES+1UL, level-1);
}
enum page_usage usage = 1 < level ? PAGE_USAGE_PAGING_OVERHEAD
: PAGE_USAGE_USER_SPACE;
Page::Put(phys, usage);
}
}
// TODO: Copying every frame is endlessly useless in many uses. It'd be
// nice to upgrade this to a copy-on-write algorithm.
bool Fork(size_t level, size_t pmloffset)
{
PML* destpml = FORKPML + level;
for ( size_t i = 0; i < ENTRIES; i++ )
{
addr_t entry = (PMLS[level] + pmloffset)->entry[i];
// Link the entry if it isn't supposed to be forked.
if ( !(entry & PML_PRESENT) || !(entry & PML_FORK ) )
{
destpml->entry[i] = entry;
continue;
}
enum page_usage usage = 1 < level ? PAGE_USAGE_PAGING_OVERHEAD
: PAGE_USAGE_USER_SPACE;
addr_t phys = Page::Get(usage);
if ( unlikely(!phys) )
{
ForkCleanup(i, level);
return false;
}
addr_t flags = entry & PML_FLAGS;
destpml->entry[i] = phys | flags;
// Map the destination page.
addr_t destaddr = (addr_t) (FORKPML + level-1);
Map(phys, destaddr, PROT_KREAD | PROT_KWRITE);
InvalidatePage(destaddr);
size_t offset = pmloffset * ENTRIES + i;
if ( 1 < level )
{
if ( !Fork(level-1, offset) )
{
Page::Put(phys, usage);
ForkCleanup(i, level);
return false;
}
continue;
}
// Determine the source page's address.
const void* src = (const void*) (offset * 4096UL);
// Determine the destination page's address.
void* dest = (void*) (FORKPML + level - 1);
memcpy(dest, src, 4096UL);
}
return true;
}
bool Fork(addr_t dir, size_t level, size_t pmloffset)
{
PML* destpml = FORKPML + level;
// This call always succeeds.
Map(dir, (addr_t) destpml, PROT_KREAD | PROT_KWRITE);
InvalidatePage((addr_t) destpml);
return Fork(level, pmloffset);
}
// Create an exact copy of the current address space.
addr_t Fork()
{
addr_t dir = Page::Get(PAGE_USAGE_PAGING_OVERHEAD);
if ( dir == 0 )
return 0;
if ( !Fork(dir, TOPPMLLEVEL, 0) )
{
Page::Put(dir, PAGE_USAGE_PAGING_OVERHEAD);
return 0;
}
// Now, the new top pml needs to have its fractal memory fixed.
const addr_t flags = PML_PRESENT | PML_WRITABLE;
addr_t mapto;
addr_t childaddr;
(FORKPML + TOPPMLLEVEL)->entry[ENTRIES-1] = dir | flags;
childaddr = (FORKPML + TOPPMLLEVEL)->entry[ENTRIES-2] & PML_ADDRESS;
for ( size_t i = TOPPMLLEVEL-1; i > 0; i-- )
{
mapto = (addr_t) (FORKPML + i);
Map(childaddr, mapto, PROT_KREAD | PROT_KWRITE);
InvalidatePage(mapto);
(FORKPML + i)->entry[ENTRIES-1] = dir | flags;
childaddr = (FORKPML + i)->entry[ENTRIES-2] & PML_ADDRESS;
}
return dir;
}
} // namespace Memory
} // namespace Sortix