Diffstat (limited to 'kernel/mm')
-rw-r--r--   kernel/mm/memcheck.py     158
-rw-r--r--   kernel/mm/mobj.c          313
-rw-r--r--   kernel/mm/page.c          658
-rw-r--r--   kernel/mm/page.py          47
-rw-r--r--   kernel/mm/pagecache.c      23
-rw-r--r--   kernel/mm/pagetable.c     873
-rw-r--r--   kernel/mm/pagetable.gdb    25
-rw-r--r--   kernel/mm/pframe.c         59
-rw-r--r--   kernel/mm/slab.c          550
-rw-r--r--   kernel/mm/slab.py          55
10 files changed, 2761 insertions, 0 deletions
diff --git a/kernel/mm/memcheck.py b/kernel/mm/memcheck.py
new file mode 100644
index 0000000..49f40fd
--- /dev/null
+++ b/kernel/mm/memcheck.py
@@ -0,0 +1,158 @@
+import gdb
+
+import weenix
+import weenix.kmem
+import weenix.stack
+
+
+class SlabAllocation:
+ def __init__(self, addr, stack, allocator, initialization):
+ self.addr = addr
+ self.stack = stack
+ self.allocator = allocator
+ self.initialization = initialization
+
+
+class PageAllocation:
+ def __init__(self, addr, stack, npages, slabdata, initialization):
+ self.addr = addr
+ self.stack = stack
+ self.npages = npages
+ self.slabdata = slabdata
+ self.initialization = initialization
+
+
+class Memcheck:
+ def __init__(self):
+ self._slab_alloc_count = 0
+ self._slab_free_count = 0
+ self._slab_invalid_free = 0
+ self._slab_allocated = {}
+ self._page_alloc_count = 0
+ self._page_free_count = 0
+ self._page_invalid_free = 0
+ self._page_allocated = {}
+ self._initialized = False
+ weenix.Hook("slab_obj_alloc", self._slab_alloc_callback)
+ weenix.Hook("slab_obj_free", self._slab_free_callback)
+ weenix.Hook("page_alloc", self._page_alloc_callback)
+ weenix.Hook("page_free", self._page_free_callback)
+ weenix.Hook("initialized", self._initialized_callback)
+ weenix.Hook("shutdown", self._shutdown_callback)
+
+ def _slab_alloc_callback(self, args):
+ addr = args["addr"]
+ if int(addr, 16) == 0:
+ return False
+ stack = weenix.stack.Stack(gdb.newest_frame().older())
+ allocator = weenix.kmem.SlabAllocator(
+ gdb.Value(int(args["allocator"].split(" ")[0], 16)).cast(
+ gdb.lookup_type("void").pointer()
+ )
+ )
+ self._slab_allocated[addr] = SlabAllocation(
+ addr, stack, allocator, not self._initialized
+ )
+ if self._initialized:
+ self._slab_alloc_count += 1
+ return False
+
+ def _slab_free_callback(self, args):
+ if not args["addr"] in self._slab_allocated:
+ self._slab_invalid_free += 1
+ print("Invalid free of address " + args["addr"] + ":")
+ print(weenix.stack.Stack(gdb.newest_frame().older()))
+ else:
+ if not self._slab_allocated[args["addr"]].initialization:
+ self._slab_free_count += 1
+ del self._slab_allocated[args["addr"]]
+ return False
+
+ def _page_alloc_callback(self, args):
+ addr = args["addr"]
+ if int(addr, 16) == 0:
+ return False
+ stack = weenix.stack.Stack(gdb.newest_frame().older())
+ slabdata = stack.contains("_slab_allocator_grow")
+ self._page_allocated[addr] = PageAllocation(
+ addr, stack, int(args["npages"]), slabdata, not self._initialized
+ )
+ if self._initialized and not slabdata:
+ self._page_alloc_count += 1
+ return False
+
+ def _page_free_callback(self, args):
+ if not args["addr"] in self._page_allocated:
+ self._page_invalid_free += 1
+ print("Invalid free of address " + args["addr"] + ":")
+ print(weenix.stack.Stack(gdb.newest_frame().older()))
+ elif self._page_allocated[args["addr"]].npages != int(args["npages"]):
+ self._page_invalid_free += 1
+ print(
+ "Address "
+ + args["addr"]
+ + " allocated for "
+ + str(self._page_allocated[args["addr"]].npages)
+ + " pages:"
+ )
+ print(self._page_allocated[args["addr"]].stack)
+ print("but freed with " + args["npages"] + " pages:")
+ print(weenix.stack.Stack(gdb.newest_frame().older()))
+ del self._page_allocated[args["addr"]]
+ else:
+ if (
+ not self._page_allocated[args["addr"]].initialization
+ and not self._page_allocated[args["addr"]].slabdata
+ ):
+ self._page_free_count += 1
+ del self._page_allocated[args["addr"]]
+ return False
+
+ def _initialized_callback(self, args):
+ self._initialized = True
+ return False
+
+ def _shutdown_callback(self, args):
+ size = 0
+ for alloc in self._slab_allocated.values():
+ if not alloc.initialization:
+ size += alloc.allocator.size()
+ print(
+ 'Leaked {0} bytes from "{1}" at {2}:'.format(
+ alloc.allocator.size(), alloc.allocator.name(), alloc.addr
+ )
+ )
+ print(alloc.stack)
+ npages = 0
+ for page in self._page_allocated.values():
+ if not page.initialization and not page.slabdata:
+ npages += page.npages
+ print("Leaked {0} pages at {1}:".format(page.npages, page.addr))
+ print(page.stack)
+ print(
+ "{0} slab allocs, {1} frees ({2} bytes leaked)".format(
+ self._slab_alloc_count, self._slab_free_count, size
+ )
+ )
+ print(
+ "{0} page allocs, {1} frees ({2} pages leaked)".format(
+ self._page_alloc_count, self._page_free_count, npages
+ )
+ )
+ print("{0} invalid slab frees".format(self._slab_invalid_free))
+ print("{0} invalid page frees".format(self._page_invalid_free))
+ return False
+
+
+class MemcheckFlag(weenix.Flag):
+ def __init__(self):
+ weenix.Flag.__init__(self, "memcheck", gdb.COMMAND_DATA)
+
+ def callback(self, value):
+ if value:
+ Memcheck()
+
+
+MemcheckFlag()
diff --git a/kernel/mm/mobj.c b/kernel/mm/mobj.c
new file mode 100644
index 0000000..4b9c80f
--- /dev/null
+++ b/kernel/mm/mobj.c
@@ -0,0 +1,313 @@
+#include "errno.h"
+
+#include "mm/mobj.h"
+#include "mm/pframe.h"
+
+#include "util/debug.h"
+#include <util/string.h>
+
+/*
+ * Initialize o according to type and ops. If ops do not specify a
+ * get_pframe function, set it to the default, mobj_default_get_pframe.
+ * Do the same with the destructor function pointer.
+ *
+ * Upon return, the refcount of the mobj should be 1.
+ */
+void mobj_init(mobj_t *o, long type, mobj_ops_t *ops)
+{
+ o->mo_type = type;
+
+ memcpy(&o->mo_ops, ops, sizeof(mobj_ops_t));
+
+ if (!o->mo_ops.get_pframe)
+ {
+ o->mo_ops.get_pframe = mobj_default_get_pframe;
+ KASSERT(o->mo_ops.fill_pframe);
+ KASSERT(o->mo_ops.flush_pframe);
+ }
+ if (!o->mo_ops.destructor)
+ {
+ o->mo_ops.destructor = mobj_default_destructor;
+ }
+
+ kmutex_init(&o->mo_mutex);
+
+ o->mo_refcount = ATOMIC_INIT(1);
+ list_init(&o->mo_pframes);
+}
+
+/*
+ * Lock the mobj's mutex
+ */
+inline void mobj_lock(mobj_t *o) { kmutex_lock(&o->mo_mutex); }
+
+/*
+ * Unlock the mobj's mutex
+ */
+inline void mobj_unlock(mobj_t *o) { kmutex_unlock(&o->mo_mutex); }
+
+/*
+ * Increment refcount
+ */
+void mobj_ref(mobj_t *o)
+{
+ atomic_inc(&o->mo_refcount);
+}
+
+void mobj_put_locked(mobj_t **op)
+{
+ mobj_unlock(*op);
+ mobj_put(op);
+}
+
+/*
+ * Decrement refcount, and set *op = NULL.
+ * If the refcount drops to 0, call the destructor; otherwise, unlock the mobj.
+ */
+void mobj_put(mobj_t **op)
+{
+ mobj_t *o = *op;
+ KASSERT(o->mo_refcount);
+ *op = NULL;
+
+ dbg(DBG_ERROR, "count: %d\n", o->mo_refcount);
+ if (atomic_dec_and_test(&o->mo_refcount))
+ {
+ dbg(DBG_ERROR, "count: %d\n", o->mo_refcount);
+
+ KASSERT(!kmutex_owns_mutex(&o->mo_mutex));
+ o->mo_ops.destructor(o);
+ }
+ else
+ {
+ dbg(DBG_ERROR, "count: %d\n", o->mo_refcount);
+ }
+}
+
+/*
+ * Find a pframe that already exists in the memory object's mo_pframes list.
+ * If a pframe is found, it must be locked upon return from this function using
+ * pf_mutex.
+ */
+void mobj_find_pframe(mobj_t *o, uint64_t pagenum, pframe_t **pfp)
+{
+ KASSERT(kmutex_owns_mutex(&o->mo_mutex));
+ list_iterate(&o->mo_pframes, pf, pframe_t, pf_link)
+ {
+ if (pf->pf_pagenum == pagenum)
+ {
+ kmutex_lock(&pf->pf_mutex);
+ *pfp = pf;
+ return;
+ }
+ }
+ *pfp = NULL;
+}
+
+/*
+ * Wrapper around the memory object's get_pframe function
+ * Assert a sane state of the world surrounding the call to get_pframe
+ */
+long mobj_get_pframe(mobj_t *o, uint64_t pagenum, long forwrite,
+ pframe_t **pfp)
+{
+ KASSERT(kmutex_owns_mutex(&o->mo_mutex));
+ *pfp = NULL;
+ long ret = o->mo_ops.get_pframe(o, pagenum, forwrite, pfp);
+ KASSERT((!*pfp && ret) || kmutex_owns_mutex(&(*pfp)->pf_mutex));
+ return ret;
+}
+
+/*
+ * Create and initialize a pframe and add it to the mobj's mo_pframes list.
+ * Upon successful return, the pframe's pf_mutex is locked.
+ */
+#ifdef OLD
+static void mobj_create_pframe(mobj_t *o, uint64_t pagenum, pframe_t **pfp)
+#endif
+void mobj_create_pframe(mobj_t *o, uint64_t pagenum, uint64_t loc, pframe_t **pfp)
+{
+ KASSERT(kmutex_owns_mutex(&o->mo_mutex));
+ pframe_t *pf = pframe_create();
+ if (pf)
+ {
+ kmutex_lock(&pf->pf_mutex);
+
+ pf->pf_pagenum = pagenum;
+ pf->pf_loc = loc;
+ list_insert_tail(&o->mo_pframes, &pf->pf_link);
+ }
+ KASSERT(!pf || kmutex_owns_mutex(&pf->pf_mutex));
+ *pfp = pf;
+}
+
+/*
+ * The default get pframe that is at the center of the mobj/pframe subsystem.
+ * This is the routine that is used when the memory object does not have a
+ * get_pframe function associated with it (or that is called, in the case of
+ * shadow objects, when the forwrite flag is set).
+ *
+ * First, check if a pframe already exists in the mobj, creating one as
+ * necessary. Then, ensure that the pframe's contents are loaded: i.e. that
+ * pf->pf_addr is non-null. You will want to use page_alloc() and the mobj's
+ * fill_pframe function pointer. Finally, if forwrite is true, mark the pframe
+ * as dirtied. The resulting pframe should be set in *pfp.
+ *
+ * Note that upon failure, *pfp MUST be null. As always, make sure you clean up
+ * properly in all error cases (especially if fill_pframe fails).
+ *
+ * Upon successful return, *pfp refers to the found pframe and MUST be locked.
+ *
+ * Error cases mobj_default_get_pframe is responsible for generating:
+ * - ENOMEM: either cannot create the pframe or cannot allocate memory for
+ * the pframe's contents
+ */
+long mobj_default_get_pframe(mobj_t *o, uint64_t pagenum, long forwrite,
+ pframe_t **pfp)
+{
+ KASSERT(kmutex_owns_mutex(&o->mo_mutex));
+ *pfp = NULL;
+ pframe_t *pf = NULL;
+ mobj_find_pframe(o, pagenum, &pf);
+ if (!pf)
+ {
+ mobj_create_pframe(o, pagenum, 0, &pf); // XXX is zero correct???
+ }
+ if (!pf)
+ {
+ return -ENOMEM;
+ }
+ KASSERT(kmutex_owns_mutex(&pf->pf_mutex));
+ if (!pf->pf_addr)
+ {
+ KASSERT(!pf->pf_dirty &&
+ "dirtied page doesn't have a physical address");
+ pf->pf_addr = page_alloc();
+ if (!pf->pf_addr)
+ {
+ kmutex_unlock(&pf->pf_mutex); // don't leak a locked, empty pframe
+ return -ENOMEM;
+ }
+
+ dbg(DBG_PFRAME, "filling pframe 0x%p (mobj 0x%p page %lu)\n", pf, o,
+ pf->pf_pagenum);
+ KASSERT(o->mo_ops.fill_pframe);
+ long ret = o->mo_ops.fill_pframe(o, pf);
+ if (ret)
+ {
+ page_free(pf->pf_addr);
+ pf->pf_addr = NULL;
+ kmutex_unlock(&pf->pf_mutex);
+ return ret;
+ }
+ }
+ pf->pf_dirty |= forwrite;
+ *pfp = pf;
+ return 0;
+}
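The comments above describe a locking protocol that is easiest to see from the caller's side. The fragment below is a minimal sketch, not part of this change: my_read_page is a hypothetical caller and only uses routines defined in this file and in pframe.c (mobj_lock, mobj_get_pframe, mobj_unlock, pframe_release).

/* Hypothetical caller: copy one page of a memory object into buf. */
static long my_read_page(mobj_t *o, uint64_t pagenum, void *buf)
{
    pframe_t *pf;
    mobj_lock(o);                                   /* o must be locked around get_pframe */
    long ret = mobj_get_pframe(o, pagenum, 0, &pf); /* on success, pf->pf_mutex is held   */
    mobj_unlock(o);
    if (ret)
    {
        return ret;                                 /* on failure, pf was set to NULL     */
    }
    memcpy(buf, pf->pf_addr, PAGE_SIZE);            /* contents are loaded at pf_addr     */
    pframe_release(&pf);                            /* drops pf_mutex and NULLs pf        */
    return 0;
}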
+
+/*
+ * If the pframe is dirty, call the mobj's flush_pframe; if flush_pframe returns
+ * successfully, clear pf_dirty flag and return 0. Otherwise, return what
+ * flush_pframe returned.
+ *
+ * Both o and pf must be locked when calling this function
+ */
+long mobj_flush_pframe(mobj_t *o, pframe_t *pf)
+{
+ KASSERT(kmutex_owns_mutex(&o->mo_mutex));
+ KASSERT(kmutex_owns_mutex(&pf->pf_mutex));
+ KASSERT(pf->pf_addr && "cannot flush a frame not in memory!");
+ dbg(DBG_PFRAME, "pf 0x%p, mobj 0x%p, page %lu\n", pf, o, pf->pf_pagenum);
+ if (pf->pf_dirty)
+ {
+ KASSERT(o->mo_ops.flush_pframe);
+ long ret = o->mo_ops.flush_pframe(o, pf);
+ if (ret)
+ return ret;
+ pf->pf_dirty = 0;
+ }
+ KASSERT(!pf->pf_dirty);
+ return 0;
+}
+
+/*
+ * Iterate through the pframes of the mobj and try to flush each one.
+ * If any of them fail, let that reflect in the return value.
+ *
+ * The mobj o must be locked when calling this function
+ */
+long mobj_flush(mobj_t *o)
+{
+ long ret = 0;
+ KASSERT(kmutex_owns_mutex(&o->mo_mutex));
+ list_iterate(&o->mo_pframes, pf, pframe_t, pf_link)
+ {
+ kmutex_lock(&pf->pf_mutex); // get the pframe (lock it)
+ if (pf->pf_addr)
+ {
+ ret |= mobj_flush_pframe(o, pf);
+ }
+ pframe_release(&pf);
+ }
+ return ret;
+}
+
+/*
+ * Attempt to flush the pframe. If the flush succeeds, then free the pframe's
+ * contents (pf->pf_addr) using page_free, remove the pframe from the mobj's
+ * list and call pframe_free.
+ *
+ * Upon successful return, *pfp MUST be null. If the function returns an error
+ * code, *pfp must be unchanged.
+ */
+long mobj_free_pframe(mobj_t *o, pframe_t **pfp)
+{
+ pframe_t *pf = *pfp;
+
+ if (pf->pf_addr)
+ {
+ long ret = mobj_flush_pframe(o, pf);
+ if (ret)
+ return ret;
+
+ // [+] TODO REMOVE THIS SECTION WHEN FLUSH DOES IT (I.E. WHEN WE HAVE
+ // SUPPORT FOR FREEING PFRAME'S IN USE BY UNMAPPING THEM FROM PAGE
+ // TABLES THAT USE THEM)
+ if (pf->pf_addr)
+ {
+ page_free(pf->pf_addr);
+ pf->pf_addr = NULL;
+ }
+ }
+ *pfp = NULL;
+ list_remove(&pf->pf_link);
+ pframe_free(&pf);
+ return 0;
+}
+
+/*
+ * Flush and free each of the mobj's pframes.
+ */
+void mobj_default_destructor(mobj_t *o)
+{
+ mobj_lock(o);
+ KASSERT(kmutex_owns_mutex(&o->mo_mutex));
+
+ long ret = 0;
+ list_iterate(&o->mo_pframes, pf, pframe_t, pf_link)
+ {
+ kmutex_lock(&pf->pf_mutex); // get the pframe (lock it)
+ ret |= mobj_free_pframe(o, &pf);
+ }
+
+ if (ret)
+ {
+ dbg(DBG_MM,
+ "WARNING: flushing pframes in mobj destructor failed for one or "
+ "more frames\n"
+ "This means the memory for the pframe will be leaked!");
+ }
+
+ KASSERT(!kmutex_has_waiters(&o->mo_mutex));
+ mobj_unlock(o);
+}
diff --git a/kernel/mm/page.c b/kernel/mm/page.c
new file mode 100644
index 0000000..b42dca4
--- /dev/null
+++ b/kernel/mm/page.c
@@ -0,0 +1,658 @@
+// SMP.1 + SMP.3
+// spinlocks + mask interrupts
+#include "kernel.h"
+#include "types.h"
+#include <boot/multiboot_macros.h>
+
+#include "boot/config.h"
+
+#include "mm/mm.h"
+#include "mm/page.h"
+
+#include "util/debug.h"
+#include "util/gdb.h"
+#include "util/string.h"
+
+#include "multiboot.h"
+
+// BTREE === Binary Tree (not an actual B-Tree)
+
+// Algorithmic optimization ideas
+// have a "min free idx" pointer for each order (have a "count free" at each
+// order) delay cascading availability bits up the tree until needed; would
+// prevent state "thrashing"
+// can do this with a cascaded_order flag that equals the highest order
+// which we have cascaed up to. For a given allocation, if the required
+// order is > cascaded_order, then we cascade up to the required order
+
+// get ready for bit manipulation heaven :)
+
+typedef uintptr_t btree_word;
+
+#define BTREE_ROW_START_INDEX(order) \
+ (((uintptr_t)1 << (max_order - (order))) - 1)
+#define BTREE_ROW_END_INDEX(order) ((BTREE_ROW_START_INDEX(order) << 1) | 1)
+#define BTREE_INDEX_TO_ADDR(idx, order) \
+ (((1 << (order)) * ((idx)-BTREE_ROW_START_INDEX(order))) << PAGE_SHIFT)
+#define BTREE_ADDR_TO_INDEX(addr, order) \
+ (BTREE_ROW_START_INDEX(order) + \
+ ((((uintptr_t)(addr)) >> PAGE_SHIFT) / (1 << (order))))
+
+#define BTREE_LEAF_START_INDEX BTREE_ROW_START_INDEX(0)
+#define BTREE_ADDR_TO_LEAF_INDEX(addr) BTREE_ADDR_TO_INDEX(addr, 0)
+#define BTREE_LEAF_INDEX_TO_ADDR(idx) BTREE_INDEX_TO_ADDR(idx, 0)
+
+#define BTREE_NUM_BITS (sizeof(btree_word) << 3)
+#define BTREE_WORD_POS(idx) ((idx) / BTREE_NUM_BITS)
+#define BTREE_BIT_POS(idx) ((idx) & (BTREE_NUM_BITS - 1))
+#define BTREE_AVAILABILITY_MASK(idx) \
+ ((uintptr_t)1 << (BTREE_NUM_BITS - 1 - BTREE_BIT_POS(idx)))
+
+// we really don't want branching here (predictor would be quite bad and
+// branches are slowwwww)
+#define BTREE_SIBLING(idx) ((idx)-1 + (((idx)&1) << 1))
+// uintptr_t btree_sibling(uintptr_t idx) {
+// // in a 0-indexed binary tree, a sibling of an odd node is its right
+// neighbor --> add 1
+// // and the sibling of an even node is its left neighbor --> subtract 1
+// // so we need: (idx % 2) ? (idx + 1) : (idx - 1);
+// uintptr_t odd_addend = idx & 1; // 1 if odd, 0 if even
+// uintptr_t even_addend = (uintptr_t) -1 + odd_addend; // 0 if odd, -1 if
+// even return idx + odd_addend + even_addend; return idx + (idx & 1) +
+// ((uintptr_t) -1 + (idx & 1)); return idx - 1 + ((idx & 1) << 1);
+// // now it looks like: always subtract 1, add 2 if odd. which works :)
+// }
+
+// get the left sibling (odd index) of a buddy pair; idx may already be the left
+// sibling, or it may be the right sibling (even index). Subtract 1 from idx only
+// if it is even: ((idx & 1) - 1) is 0 when idx is odd and -1 when idx is even.
+#define BTREE_LEFT_SIBLING(idx) ((idx) + (((idx)&1) - 1))
+
+#define BTREE_PARENT(idx) (((idx)-1) >> 1)
+#define BTREE_LEFT_CHILD(idx) (((idx) << 1) + 1)
+#define BTREE_RIGHT_CHILD(idx) (((idx) + 1) << 1)
+#define BTREE_IS_LEFT_CHILD(idx) ((idx)&1)
+#define BTREE_IS_RIGHT_CHILD(idx) (!BTREE_IS_LEFT_CHILD(idx))
+
+#define BTREE_IS_AVAILABLE(idx) \
+ (btree[BTREE_WORD_POS(idx)] & BTREE_AVAILABILITY_MASK(idx))
+#define BTREE_MARK_AVAILABLE(idx) \
+ (btree[BTREE_WORD_POS(idx)] |= BTREE_AVAILABILITY_MASK(idx))
+#define BTREE_MARK_UNAVAILABLE(idx) \
+ (btree[BTREE_WORD_POS(idx)] &= ~BTREE_AVAILABILITY_MASK(idx))
+
+// potential optimization: use these when clearing pairs. something about the
+// following is apparently buggy though (causes fault) #define
+// BTREE_SIBLING_AVAILABILITY_MASK(idx) (BTREE_AVAILABILITY_MASK(idx) |
+// BTREE_IS_AVAILABLE(BTREE_SIBLING(idx))) #define
+// BTREE_MARK_SIBLINGS_AVAILABLE(idx) (btree[BTREE_WORD_POS(idx)] |=
+// BTREE_SIBLING_AVAILABILITY_MASK(idx)) #define
+// BTREE_MARK_SIBLINGS_UNAVAILABLE(idx) (btree[BTREE_WORD_POS(idx)] &=
+// ~BTREE_SIBLING_AVAILABILITY_MASK(idx))
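To make the index arithmetic encoded by these macros concrete, here is a small standalone sketch (hypothetical tree with max_order = 5, i.e. 32 leaf pages, and 4 KiB pages); it re-derives the same formulas rather than using the kernel's globals.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define MAX_ORDER 5 /* hypothetical: 2^5 = 32 leaf pages */

#define ROW_START(order)      ((((uintptr_t)1) << (MAX_ORDER - (order))) - 1)
#define ADDR_TO_INDEX(a, ord) (ROW_START(ord) + (((uintptr_t)(a) >> PAGE_SHIFT) >> (ord)))
#define INDEX_TO_ADDR(i, ord) ((((i)-ROW_START(ord)) << (ord)) << PAGE_SHIFT)
#define SIBLING(i)            ((i)-1 + (((i)&1) << 1))
#define PARENT(i)             (((i)-1) >> 1)

int main(void)
{
    /* The order-0 (leaf) row starts at index 2^5 - 1 = 31. */
    printf("leaf row starts at %lu\n", (unsigned long)ROW_START(0));
    /* Physical address 0x3000 is page 3, so its leaf index is 31 + 3 = 34. */
    printf("0x3000 -> leaf index %lu\n", (unsigned long)ADDR_TO_INDEX(0x3000, 0));
    /* 34 is even, hence a right child; its buddy is 33 and their parent is 16. */
    printf("sibling = %lu, parent = %lu\n", (unsigned long)SIBLING(34),
           (unsigned long)PARENT(34));
    /* Parent 16 sits in the order-1 row (which starts at 15) and covers 2 pages
     * starting at page 2, i.e. physical address 0x2000. */
    printf("order-1 index 16 -> 0x%lx\n", (unsigned long)INDEX_TO_ADDR(16, 1));
    return 0;
}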
+
+GDB_DEFINE_HOOK(page_alloc, void *addr, size_t npages)
+
+GDB_DEFINE_HOOK(page_free, void *addr, size_t npages)
+
+static size_t page_freecount;
+
+// if you rename these variables, update them in the macros above
+static size_t
+ max_pages; // max number of pages as determined by RAM, NOT max_order
+static size_t max_order; // max depth of binary tree
+
+static btree_word *btree;
+static uintptr_t *min_available_idx_by_order;
+static size_t *count_available_by_order;
+
+static char *type_strings[] = {"ERROR: type = 0", "Available", "Reserved",
+ "ACPI Reclaimable", "ACPI NVS", "GRUB Bad Ram"};
+static size_t type_count = sizeof(type_strings) / sizeof(type_strings[0]);
+
+inline void *physmap_start() { return (void *)PHYS_OFFSET; }
+
+inline void *physmap_end()
+{
+ return (void *)(PHYS_OFFSET + (max_pages << PAGE_SHIFT));
+}
+
+#undef DEBUG_PHYSICAL_PAGING
+
+static inline void _btree_expensive_sanity_check()
+{
+#ifdef DEBUG_PHYSICAL_PAGING
+ size_t available = 0;
+ for (unsigned order = 0; order <= max_order; order++)
+ {
+ long checked_first = 0;
+ unsigned order_count = 0;
+ uintptr_t max = BTREE_ROW_END_INDEX(order);
+
+ for (uintptr_t idx = BTREE_ROW_START_INDEX(order); idx < max; idx++)
+ {
+ if (BTREE_IS_AVAILABLE(idx))
+ {
+ if (!checked_first)
+ {
+ KASSERT(min_available_idx_by_order[order] == idx);
+ checked_first = 1;
+ }
+ available += (1 << order);
+ order_count++;
+ KASSERT(BTREE_INDEX_TO_ADDR(idx + 1, order) <= physmap_end());
+ }
+ }
+ if (!checked_first)
+ {
+ KASSERT(min_available_idx_by_order[order] == max);
+ }
+ KASSERT(count_available_by_order[order] == order_count);
+ }
+ KASSERT(available == page_freecount);
+#endif
+}
+
+void page_init()
+{
+ uintptr_t ram = 0;
+ uintptr_t memory_available_for_use = 0;
+
+ // detect amount of RAM and memory available for use immediately after
+ // kernel before any reserved region
+
+ KASSERT(PAGE_ALIGNED(mb_tag) && (uintptr_t)mb_tag == KERNEL_PHYS_END);
+
+ for (struct multiboot_tag *tag = mb_tag + 1;
+ tag->type != MULTIBOOT_TAG_TYPE_END; tag += TAG_SIZE(tag->size))
+ {
+ if (tag->type != MULTIBOOT_TAG_TYPE_MMAP)
+ {
+ continue;
+ }
+ struct multiboot_tag_mmap *mmap = (struct multiboot_tag_mmap *)tag;
+ dbg(DBG_PAGEALLOC, "Physical memory map (%d entries):\n",
+ mmap->size / mmap->entry_size);
+ for (unsigned i = 0; i < mmap->size / mmap->entry_size; i++)
+ {
+ struct multiboot_mmap_entry *entry = &mmap->entries[i];
+ dbgq(DBG_MM, "\t[0x%p-0x%p) (%llu bytes): %s\n",
+ (void *)entry->addr, (void *)(entry->addr + entry->len),
+ entry->len,
+ entry->type < type_count ? type_strings[entry->type]
+ : "Unknown");
+ if (entry->type != MULTIBOOT_MEMORY_AVAILABLE)
+ {
+ continue;
+ }
+
+ if (entry->addr < KERNEL_PHYS_END &&
+ entry->addr + entry->len > KERNEL_PHYS_END)
+ {
+ memory_available_for_use =
+ entry->addr + entry->len - KERNEL_PHYS_END;
+ }
+
+ if (entry->addr + entry->len > ram)
+ {
+ ram = entry->addr + entry->len;
+ }
+ }
+ }
+
+ // check we have enough space available following the kernel to map in all
+ // of RAM detected
+ max_pages = ram >> PAGE_SHIFT;
+ max_order = 0;
+ size_t npages = max_pages;
+ while (npages)
+ {
+ max_order++;
+ npages >>= 1;
+ }
+
+ // we may have more RAM than we can map in with the single memory hole
+ // after the kernel; keep shrinking the maximum order until we find a size
+ // that fits (this can obviously be done more intelligently, but this also
+ // works)
+ size_t btree_size;
+ size_t metadata_size;
+ while (max_order)
+ {
+ // the tree has roughly 2^(max_order+1) nodes and one bit tracks each node,
+ // so we need 2^(max_order+1) / 8 = 2^(max_order-2) bytes for the binary tree
+ btree_size = 1UL << (max_order - 2);
+ metadata_size = sizeof(uintptr_t) * (max_order + 1) +
+ sizeof(size_t) * (max_order + 1);
+
+ if (memory_available_for_use >= btree_size + metadata_size)
+ {
+ break;
+ }
+ if (max_pages ==
+ (ram >> PAGE_SHIFT))
+ { // only print first time we shrink
+ dbg(DBG_PAGEALLOC,
+ "Warning! Need 0x%p B of memory to map in 0x%p B of RAM, but "
+ "only have 0x%p available!",
+ (void *)(btree_size + metadata_size), (void *)ram,
+ (void *)memory_available_for_use);
+ }
+ max_order--;
+ max_pages = 1UL << max_order;
+ }
+ if (max_pages !=
+ (ram >> PAGE_SHIFT))
+ { // only print if we shrank available RAM
+ dbg(DBG_PAGEALLOC, "Supporting only up to 0x%p B of RAM!",
+ (void *)(max_pages << PAGE_SHIFT));
+ }
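For a feel of the sizes this loop settles on: with 4 KiB pages, 4 GiB of RAM is 2^20 pages, the order computation above yields max_order = 21, and one bit per tree node then costs 2^19 bytes (512 KiB) plus a few hundred bytes of per-order metadata. A standalone sketch of the same arithmetic (the 4 GiB figure is purely illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t ram = 4ULL << 30;      /* hypothetical: 4 GiB of RAM        */
    uint64_t max_pages = ram >> 12; /* 2^20 pages of 4 KiB               */
    uint64_t max_order = 0;
    for (uint64_t n = max_pages; n; n >>= 1)
        max_order++;                /* 21 when max_pages is exactly 2^20 */
    uint64_t btree_size = 1ULL << (max_order - 2); /* one bit per node   */
    uint64_t metadata = 2 * sizeof(uint64_t) * (max_order + 1);
    printf("max_order=%llu bitmap=%llu KiB metadata=%llu bytes\n",
           (unsigned long long)max_order,
           (unsigned long long)(btree_size >> 10),
           (unsigned long long)metadata);
    return 0; /* prints: max_order=21 bitmap=512 KiB metadata=352 bytes */
}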
+
+ btree = (btree_word
+ *)(KERNEL_PHYS_END +
+ PAGE_SIZE); // 1 page padding for the multiboot information
+ memset(btree, 0, btree_size);
+
+ min_available_idx_by_order = (uintptr_t *)((uintptr_t)btree + btree_size);
+ for (unsigned order = 0; order <= max_order; order++)
+ {
+ min_available_idx_by_order[order] = BTREE_ROW_END_INDEX(order);
+ }
+
+ count_available_by_order =
+ min_available_idx_by_order + sizeof(uintptr_t) * (max_order + 1);
+ memset(count_available_by_order, 0, sizeof(size_t) * (max_order + 1));
+
+ page_freecount = 0;
+
+ uintptr_t reserved_ram_start = KERNEL_PHYS_BASE;
+ uintptr_t reserved_ram_end =
+ KERNEL_PHYS_END + PAGE_SIZE + btree_size + metadata_size;
+
+ for (struct multiboot_tag *tag = mb_tag + 1;
+ tag->type != MULTIBOOT_TAG_TYPE_END; tag += TAG_SIZE(tag->size))
+ {
+ if (tag->type != MULTIBOOT_TAG_TYPE_MMAP)
+ {
+ continue;
+ }
+ struct multiboot_tag_mmap *mmap = (struct multiboot_tag_mmap *)tag;
+ for (unsigned i = 0; i < mmap->size / mmap->entry_size; i++)
+ {
+ struct multiboot_mmap_entry *entry = &mmap->entries[i];
+ if (entry->type != MULTIBOOT_MEMORY_AVAILABLE)
+ {
+ continue;
+ }
+ uintptr_t addr = entry->addr;
+ uintptr_t len = entry->len;
+
+ if (addr >= reserved_ram_start && addr < reserved_ram_end)
+ {
+ if (len <= reserved_ram_end - addr)
+ {
+ continue;
+ }
+ len -= reserved_ram_end - addr;
+ addr = reserved_ram_end;
+ }
+ if (addr < reserved_ram_start && addr + len > reserved_ram_start)
+ {
+ len = reserved_ram_start - addr;
+ }
+
+ // TODO [+] see why removing this crashes SMP
+ if (addr < reserved_ram_start)
+ {
+ continue;
+ }
+
+ page_add_range((void *)addr, (void *)(addr + len));
+ }
+ }
+
+ page_mark_reserved(0); // don't allocate the first page of memory
+
+ size_t bytes = page_freecount << PAGE_SHIFT;
+ size_t gigabytes = (bytes >> 30);
+ bytes -= (gigabytes << 30);
+ size_t megabytes = (bytes >> 20);
+ bytes -= (megabytes << 20);
+ size_t kilobytes = (bytes >> 10);
+ bytes -= (kilobytes << 10);
+ KASSERT(bytes == 0);
+
+ dbg(DBG_PAGEALLOC,
+ "Amount of physical memory available for use: %lu GB, %lu MB, and %lu "
+ "KB; [0x%p, 0x%p)\n",
+ gigabytes, megabytes, kilobytes, physmap_start(), physmap_end());
+ _btree_expensive_sanity_check();
+}
+
+void page_init_finish()
+{
+ btree = (btree_word *)((uintptr_t)btree + PHYS_OFFSET);
+ min_available_idx_by_order =
+ (uintptr_t *)((uintptr_t)min_available_idx_by_order + PHYS_OFFSET);
+ count_available_by_order =
+ (uintptr_t *)((uintptr_t)count_available_by_order + PHYS_OFFSET);
+}
+
+static void _btree_update_metadata_after_removal(size_t order, size_t idx)
+{
+ // [+] TODO Intel-specific optimizations, see BSF, BSR, REPE CMPS/SCAS
+ if (count_available_by_order[order])
+ {
+ if (idx == min_available_idx_by_order[order])
+ {
+ uintptr_t word_idx = BTREE_WORD_POS(idx);
+ if (btree[word_idx] &&
+ word_idx == BTREE_WORD_POS(BTREE_ROW_START_INDEX(order)))
+ {
+ // mask off bits to the left of BTREE_BIT_POS(idx); i.e.
+ // consider only positions > than BTREE_BIT_POS(idx) in
+ // btree[word_idx] when idx is the old index of the first
+ // available node for the given order. This is to avoid setting
+ // min available for an order x to be an index that actually
+ // belongs to order (x + 1) (in the row above).
+ btree_word copy =
+ btree[word_idx] &
+ ((1UL << (BTREE_NUM_BITS - BTREE_BIT_POS(idx))) - 1);
+ unsigned bit_idx = BTREE_NUM_BITS;
+ while (copy != 0 && bit_idx > BTREE_BIT_POS(idx))
+ {
+ bit_idx--;
+ copy = copy >> 1;
+ }
+ if (BTREE_IS_AVAILABLE(word_idx * BTREE_NUM_BITS + bit_idx))
+ {
+ min_available_idx_by_order[order] =
+ word_idx * BTREE_NUM_BITS + bit_idx;
+ return;
+ }
+ word_idx++;
+ }
+ while (!btree[word_idx])
+ word_idx++;
+ btree_word copy = btree[word_idx];
+ unsigned bit_idx = BTREE_NUM_BITS;
+ while (copy != 0)
+ {
+ bit_idx--;
+ copy = copy >> 1;
+ }
+ uintptr_t min_available = word_idx * BTREE_NUM_BITS + bit_idx;
+ if (min_available > BTREE_ROW_END_INDEX(order))
+ {
+ min_available = BTREE_ROW_END_INDEX(order);
+ }
+ min_available_idx_by_order[order] = min_available;
+ }
+ }
+ else
+ {
+ min_available_idx_by_order[order] = BTREE_ROW_END_INDEX(order);
+ }
+}
+
+static void _btree_mark_available(uintptr_t idx, size_t order)
+{
+ KASSERT(!BTREE_IS_AVAILABLE(idx));
+ BTREE_MARK_AVAILABLE(idx);
+
+ uintptr_t start = BTREE_INDEX_TO_ADDR(idx, order);
+ uintptr_t end = BTREE_INDEX_TO_ADDR(idx + 1, order);
+ dbg(DBG_MM, "marking available (0x%p, 0x%p)\n", (void *)start, (void *)end);
+ KASSERT(!(0xb1000 >= start && 0xb1000 < end));
+
+ count_available_by_order[order]++;
+ if (idx < min_available_idx_by_order[order])
+ {
+ min_available_idx_by_order[order] = idx;
+ }
+
+ while (idx > 0 && BTREE_IS_AVAILABLE(BTREE_SIBLING(idx)))
+ {
+ BTREE_MARK_UNAVAILABLE(idx);
+ BTREE_MARK_UNAVAILABLE(BTREE_SIBLING(idx));
+
+ count_available_by_order[order] -= 2;
+ _btree_update_metadata_after_removal(order, BTREE_LEFT_SIBLING(idx));
+
+ idx = BTREE_PARENT(idx);
+ order++;
+ BTREE_MARK_AVAILABLE(idx);
+ count_available_by_order[order]++;
+ if (idx < min_available_idx_by_order[order])
+ {
+ min_available_idx_by_order[order] = idx;
+ }
+ }
+}
+
+static void _btree_mark_range_available(uintptr_t leaf_idx, size_t npages)
+{
+ // could be optimized further so that we don't need to keep traversing from
+ // leaf to max order: we could instead jump to the parent's (right) sibling
+ // when we are a right child, and jump to the left child while npages > what
+ // would be allocated. But for now, this works and is fast enough it seems...
+ // TODO potential optimization
+ while (npages)
+ {
+ uintptr_t idx = leaf_idx;
+ size_t order = 0;
+ while (BTREE_IS_LEFT_CHILD(idx) && (2UL << order) <= npages)
+ {
+ idx = BTREE_PARENT(idx);
+ order++;
+ }
+ _btree_mark_available(idx, order);
+ npages -= 1 << order;
+ leaf_idx += 1 << order;
+ }
+}
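The loop above greedily peels off the largest block that is both left-aligned in the tree and no bigger than what is left to mark. Assuming the starting leaf is aligned to an 8-page boundary (so the left-child test never cuts a climb short), freeing 7 pages decomposes into 4 + 2 + 1; the standalone sketch below reproduces just the npages bound from that loop.

#include <stdio.h>

int main(void)
{
    unsigned long npages = 7; /* hypothetical range length */
    while (npages)
    {
        unsigned order = 0;
        while ((2UL << order) <= npages)
            order++; /* climb while a larger block still fits */
        printf("mark one order-%u node (%lu pages)\n", order, 1UL << order);
        npages -= 1UL << order;
    }
    return 0; /* prints order 2, then order 1, then order 0: 4 + 2 + 1 pages */
}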
+
+void page_add_range(void *start, void *end)
+{
+ dbg(DBG_MM, "Page system adding range [0x%p, 0x%p)\n", start, end);
+ KASSERT(end > start);
+ if (start == 0)
+ {
+ start = PAGE_ALIGN_UP(1);
+ if (end <= start)
+ {
+ return;
+ }
+ }
+ start = PAGE_ALIGN_UP(start);
+ end = PAGE_ALIGN_DOWN(end);
+ size_t npages = ((uintptr_t)end - (uintptr_t)start) >> PAGE_SHIFT;
+ _btree_mark_range_available(BTREE_ADDR_TO_LEAF_INDEX(start), npages);
+ page_freecount += npages;
+ _btree_expensive_sanity_check();
+}
+
+void *page_alloc() { return page_alloc_n(1); }
+
+void *page_alloc_bounded(void *max_paddr)
+{
+ return page_alloc_n_bounded(1, max_paddr);
+}
+
+void page_free(void *addr) { page_free_n(addr, 1); }
+
+static void *_btree_alloc(size_t npages, uintptr_t idx, size_t smallest_order,
+ size_t actual_order)
+{
+ while (actual_order != smallest_order)
+ {
+ BTREE_MARK_UNAVAILABLE(idx);
+ count_available_by_order[actual_order]--;
+ _btree_update_metadata_after_removal(actual_order, idx);
+
+ idx = BTREE_LEFT_CHILD(idx);
+ BTREE_MARK_AVAILABLE(idx);
+ BTREE_MARK_AVAILABLE(BTREE_SIBLING(idx));
+ actual_order--;
+
+ count_available_by_order[actual_order] += 2;
+ if (idx < min_available_idx_by_order[actual_order])
+ {
+ min_available_idx_by_order[actual_order] = idx;
+ }
+ _btree_expensive_sanity_check();
+ }
+
+ // actually allocate the 2^smallest_order pages by marking them unavailable
+ BTREE_MARK_UNAVAILABLE(idx);
+ count_available_by_order[actual_order]--;
+ _btree_update_metadata_after_removal(actual_order, idx);
+
+ uintptr_t allocated_idx = idx;
+ size_t allocated_order = actual_order;
+ while (allocated_order-- > 0)
+ allocated_idx = BTREE_LEFT_CHILD(allocated_idx);
+
+ KASSERT(BTREE_LEAF_INDEX_TO_ADDR(allocated_idx));
+
+ // we allocated some 2^smallest_order of pages; it is possible they asked
+ // for fewer than 2^smallest_order pages; make sure we mark as available the
+ // remaining (2^smallest_order - npages) pages.
+ _btree_mark_range_available(allocated_idx + npages,
+ (1 << smallest_order) - npages);
+
+ // while (over_allocated > 0 && (1 << reclaimed_order) < over_allocated
+ // && next_leaf_to_reclaim < max_reclaim_idx) {
+ // BTREE_MARK_AVAILABLE(idx);
+ // count_available_by_order[reclaimed_order]++;
+ // if (idx < min_available_idx_by_order[reclaimed_order]) {
+ // min_available_idx_by_order[reclaimed_order] = idx;
+ // }
+ // over_allocated -= (1 << reclaimed_order);
+ // next_leaf_to_reclaim += (2 << reclaimed_order);
+ // idx = BTREE_SIBLING(BTREE_PARENT(idx));
+ // reclaimed_order++;
+ // }
+
+ page_freecount -= npages;
+
+ uintptr_t addr = BTREE_LEAF_INDEX_TO_ADDR(allocated_idx);
+ dbgq(DBG_MM, "page_alloc_n(%lu): [0x%p, 0x%p)\t\t%lu pages remain\n",
+ npages, (void *)(PHYS_OFFSET + addr),
+ (void *)(PHYS_OFFSET + addr + (npages << PAGE_SHIFT)), page_freecount);
+ _btree_expensive_sanity_check();
+ return (void *)(addr + PHYS_OFFSET);
+}
+
+void *page_alloc_n(size_t npages)
+{
+ return page_alloc_n_bounded(npages, (void *)~0UL);
+}
+
+// this is really only used for setting up the initial page tables;
+// this memory will be immediately overwritten, so no need to poison the memory
+void *page_alloc_n_bounded(size_t npages, void *max_paddr)
+{
+ KASSERT(npages > 0 && npages <= (1UL << max_order));
+ if (npages > page_freecount)
+ {
+ return 0;
+ }
+ // a note on max_pages: so long as we never mark a page that is beyond our
+ // RAM as available, we will never allocate it. So put all those checks at
+ // the free and map functions
+
+ // find the smallest order that will fit npages
+ uintptr_t max_page_number =
+ ((uintptr_t)max_paddr >> PAGE_SHIFT) - npages + 1;
+
+ // [+] TODO intel-specific optimization possible here?
+ size_t smallest_order = 0;
+ while ((1UL << smallest_order) < npages)
+ smallest_order++;
+
+ for (size_t actual_order = smallest_order; actual_order <= max_order;
+ actual_order++)
+ {
+ if (!count_available_by_order[actual_order])
+ {
+ continue;
+ }
+ uintptr_t idx = min_available_idx_by_order[actual_order];
+ KASSERT(idx >= BTREE_ROW_START_INDEX(actual_order) &&
+ idx < BTREE_ROW_END_INDEX(actual_order));
+ if ((idx - BTREE_ROW_START_INDEX(actual_order)) * (1 << actual_order) <
+ max_page_number)
+ {
+ KASSERT((idx - BTREE_ROW_START_INDEX(actual_order)) *
+ (1 << actual_order) <
+ max_pages);
+
+ void *ret = _btree_alloc(npages, idx, smallest_order, actual_order);
+ KASSERT(((uintptr_t)ret + (npages << PAGE_SHIFT)) <=
+ (uintptr_t)physmap_end());
+ return ret;
+ }
+ }
+ return 0;
+}
+
+void page_free_n(void *addr, size_t npages)
+{
+ dbgq(DBG_MM, "page_free_n(%lu): [0x%p, 0x%p)\t\t%lu pages remain\n", npages,
+ addr, (void *)((uintptr_t)addr + (npages << PAGE_SHIFT)),
+ page_freecount);
+ GDB_CALL_HOOK(page_free, addr, npages);
+ KASSERT(npages > 0 && npages <= (1UL << max_order) && PAGE_ALIGNED(addr));
+ uintptr_t idx = BTREE_ADDR_TO_LEAF_INDEX((uintptr_t)addr - PHYS_OFFSET);
+ KASSERT(idx + npages - BTREE_LEAF_START_INDEX <= max_pages);
+ _btree_mark_range_available(idx, npages);
+ page_freecount += npages;
+ _btree_expensive_sanity_check();
+}
+
+void page_mark_reserved(void *paddr)
+{
+ if ((uintptr_t)paddr > (max_pages << PAGE_SHIFT))
+ return;
+
+ dbgq(DBG_MM, "page_mark_reserved(0x%p): [0x%p, 0x%p)\n",
+ (void *)((uintptr_t)paddr + PHYS_OFFSET),
+ (void *)((uintptr_t)paddr + PHYS_OFFSET),
+ (void *)((uintptr_t)paddr + PHYS_OFFSET + PAGE_SIZE));
+
+ KASSERT(PAGE_ALIGNED(paddr));
+ uintptr_t idx = BTREE_ADDR_TO_LEAF_INDEX(paddr);
+ size_t order = 0;
+ while (idx && !BTREE_IS_AVAILABLE(idx))
+ {
+ idx = BTREE_PARENT(idx);
+ order++;
+ }
+ if (!BTREE_IS_AVAILABLE(idx))
+ {
+ return; // can sometimes be a part of reserved RAM anyway
+ }
+
+ BTREE_MARK_UNAVAILABLE(idx);
+ count_available_by_order[order]--;
+ _btree_update_metadata_after_removal(order, idx);
+
+ uintptr_t unavailable_leaf_idx = BTREE_ADDR_TO_LEAF_INDEX(paddr);
+ uintptr_t still_available_leaf_idx_start =
+ BTREE_ADDR_TO_LEAF_INDEX(BTREE_INDEX_TO_ADDR(idx, order));
+ uintptr_t still_available_leaf_idx_end =
+ BTREE_ADDR_TO_LEAF_INDEX(BTREE_INDEX_TO_ADDR(idx + 1, order));
+
+ _btree_mark_range_available(
+ still_available_leaf_idx_start,
+ unavailable_leaf_idx - still_available_leaf_idx_start);
+ _btree_mark_range_available(
+ unavailable_leaf_idx + 1,
+ still_available_leaf_idx_end - unavailable_leaf_idx - 1);
+
+ page_freecount--;
+
+ _btree_expensive_sanity_check();
+}
+
+size_t page_free_count() { return page_freecount; }
diff --git a/kernel/mm/page.py b/kernel/mm/page.py
new file mode 100644
index 0000000..9dfedf0
--- /dev/null
+++ b/kernel/mm/page.py
@@ -0,0 +1,47 @@
+import gdb
+
+import weenix
+import weenix.kmem
+
+
+class PageCommand(weenix.Command):
+ def __init__(self):
+ weenix.Command.__init__(self, "page", gdb.COMMAND_DATA, gdb.COMPLETE_NONE)
+
+ def invoke(self, args, tty):
+ total = 0
+ print("pagesize: {0}".format(weenix.kmem.pagesize()))
+
+ names = list()
+ blobs = list()
+ pages = list()
+ bytes = list()
+
+ for order, count in weenix.kmem.freepages().items():
+ pcount = count * (1 << order)
+ bcount = pcount * weenix.kmem.pagesize()
+ names.append("freepages[{0}]:".format(order))
+ blobs.append("{0} blob{1}".format(count, " " if (count == 1) else "s"))
+ pages.append("{0} page{1}".format(pcount, " " if (pcount == 1) else "s"))
+ bytes.append("{0} byte{1}".format(bcount, " " if (bcount == 1) else "s"))
+ total += count * (1 << order)
+
+ names.append("total:")
+ blobs.append("")
+ pages.append("{0} page{1}".format(total, " " if (total == 1) else "s"))
+ bytes.append("{0} bytes".format(total * weenix.kmem.pagesize()))
+
+ namewidth = max(list(map(lambda x: len(x), names)))
+ blobwidth = max(list(map(lambda x: len(x), blobs)))
+ pagewidth = max(list(map(lambda x: len(x), pages)))
+ bytewidth = max(list(map(lambda x: len(x), bytes)))
+
+ for name, blob, page, byte in zip(names, blobs, pages, bytes):
+ print(
+ "{1:<{0}} {3:>{2}} {5:>{4}} {7:>{6}}".format(
+ namewidth, name, blobwidth, blob, pagewidth, page, bytewidth, byte
+ )
+ )
+
+
+PageCommand()
diff --git a/kernel/mm/pagecache.c b/kernel/mm/pagecache.c
new file mode 100644
index 0000000..b1763ba
--- /dev/null
+++ b/kernel/mm/pagecache.c
@@ -0,0 +1,23 @@
+#include "errno.h"
+#include "globals.h"
+#include "kernel.h"
+#include "util/debug.h"
+
+#include "mm/pframe.h"
+
+long pagecache_get_page(pframe_t *pf) {
+ if (pf->pf_addr) {
+ // all set
+ return 1;
+ }
+ //Somehow load the page
+ KASSERT(0 && "page not in pagecache");
+ return 0;
+}
+
+#ifdef NO
+void pagecache_newsource(pframe_t pf, blockdev_t *dev, long loc) {
+ pf->pf_srcdev.pf_dev = dev;
+ pf->pf_loc = loc;
+}
+#endif
\ No newline at end of file
diff --git a/kernel/mm/pagetable.c b/kernel/mm/pagetable.c
new file mode 100644
index 0000000..daf49ef
--- /dev/null
+++ b/kernel/mm/pagetable.c
@@ -0,0 +1,873 @@
+#include "errno.h"
+#include "globals.h"
+#include "kernel.h"
+#include "types.h"
+
+#include "mm/mm.h"
+#include "mm/pframe.h"
+#include "mm/mobj.h"
+
+#include "util/debug.h"
+#include "util/string.h"
+
+#include "vm/pagefault.h"
+
+typedef enum
+{
+ UNMAPPED,
+ PAGE_4KB,
+ PAGE_2MB,
+ PAGE_1GB
+} vaddr_map_status;
+
+static pml4_t *global_kernel_only_pml4;
+
+void pt_set(pml4_t *pml4)
+{
+ KASSERT((void *)pml4 >= physmap_start());
+ uintptr_t phys_addr = pt_virt_to_phys((uintptr_t)pml4);
+ __asm__ volatile("movq %0, %%cr3" ::"r"(phys_addr)
+ : "memory");
+}
+
+/*
+ * Don't use this for proc_create. You want each new proc to have a copy
+ * of the current page table (see pt_create).
+ *
+ * Returns a pointer to the current pagetable (a virtual address).
+ */
+inline pml4_t *pt_get()
+{
+ uintptr_t pml4;
+ __asm__ volatile("movq %%cr3, %0"
+ : "=r"(pml4));
+ return (pml4_t *)(pml4 + PHYS_OFFSET);
+}
+
+vaddr_map_status _vaddr_status(pml4_t *pml4, uintptr_t vaddr)
+{
+ uint64_t idx;
+ pml4_t *table = pml4;
+
+ idx = PML4E(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ return UNMAPPED;
+ }
+ table = (pdp_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PDP (1GB pages)
+ idx = PDPE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ return UNMAPPED;
+ }
+ if (IS_1GB_PAGE(table->phys[idx]))
+ {
+ return PAGE_1GB;
+ }
+ table = (pd_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PD (2MB pages)
+ idx = PDE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ return UNMAPPED;
+ }
+ if (IS_2MB_PAGE(table->phys[idx]))
+ {
+ return PAGE_2MB;
+ }
+ table = (pt_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PT (4KB pages)
+ idx = PTE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ return UNMAPPED;
+ }
+ return PAGE_4KB;
+}
+
+uintptr_t pt_virt_to_phys_helper(pml4_t *table, uintptr_t vaddr)
+{
+ if (vaddr >= (uintptr_t)physmap_start() &&
+ vaddr < (uintptr_t)physmap_end())
+ {
+ return vaddr - PHYS_OFFSET;
+ }
+
+ uint64_t idx;
+
+ // PML4
+ idx = PML4E(vaddr);
+ KASSERT(IS_PRESENT(table->phys[idx]));
+ table = (pdp_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PDP (1GB pages)
+ idx = PDPE(vaddr);
+ KASSERT(IS_PRESENT(table->phys[idx]));
+ if (USE_1GB_PAGES && IS_1GB_PAGE(table->phys[idx]))
+ {
+ return PAGE_ALIGN_DOWN_1GB(table->phys[idx]) + PAGE_OFFSET_1GB(vaddr);
+ }
+ table = (pd_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PD (2MB pages)
+ idx = PDE(vaddr);
+ KASSERT(IS_PRESENT(table->phys[idx]));
+ if (USE_2MB_PAGES && IS_2MB_PAGE(table->phys[idx]))
+ {
+ return PAGE_ALIGN_DOWN_2MB(table->phys[idx]) + PAGE_OFFSET_2MB(vaddr);
+ }
+ table = (pt_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PT (4KB pages)
+ idx = PTE(vaddr);
+
+ KASSERT(IS_PRESENT(table->phys[idx]));
+
+ return (uintptr_t)PAGE_ALIGN_DOWN(table->phys[idx]) + PAGE_OFFSET(vaddr);
+}
+
+uintptr_t pt_virt_to_phys(uintptr_t vaddr)
+{
+ if (vaddr >= (uintptr_t)physmap_start() &&
+ vaddr < (uintptr_t)physmap_end())
+ {
+ // if the address is within the PHYS_MAP region, then subtract the
+ // PHYS_OFFSET to get the physical address. There is a one-to-one mapping
+ // between virtual and physical addresses in this region.
+ return vaddr - PHYS_OFFSET;
+ }
+ return pt_virt_to_phys_helper(pt_get(), vaddr);
+}
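The table walk above peels successive 9-bit index fields off the virtual address. A standalone illustration of that decomposition, assuming the conventional x86-64 4-level layout (bits 47:39, 38:30, 29:21 and 20:12, plus a 12-bit page offset) that the PML4E/PDPE/PDE/PTE macros are expected to encode; the address itself is made up.

#include <stdio.h>
#include <stdint.h>

#define IDX(vaddr, shift) (((vaddr) >> (shift)) & 0x1ffUL) /* one 9-bit field */

int main(void)
{
    uint64_t vaddr = 0xffff800001234567ULL; /* hypothetical kernel address */
    printf("PML4E=%lu PDPE=%lu PDE=%lu PTE=%lu offset=0x%lx\n",
           (unsigned long)IDX(vaddr, 39), (unsigned long)IDX(vaddr, 30),
           (unsigned long)IDX(vaddr, 21), (unsigned long)IDX(vaddr, 12),
           (unsigned long)(vaddr & 0xfff));
    return 0; /* prints: PML4E=256 PDPE=0 PDE=9 PTE=52 offset=0x567 */
}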
+
+void _fill_pt(pt_t *pt, uintptr_t paddr, uintptr_t vaddr, uintptr_t vmax)
+{
+ for (uintptr_t idx = PTE(vaddr); idx < PT_ENTRY_COUNT && vaddr < vmax;
+ idx++, paddr += PAGE_SIZE, vaddr += PAGE_SIZE)
+ {
+ pt->phys[idx] = (uintptr_t)paddr | PT_PRESENT | PT_WRITE;
+ }
+}
+
+long _fill_pd(pd_t *pd, uintptr_t paddr, uintptr_t vaddr, uintptr_t vmax,
+ uintptr_t max_paddr)
+{
+ for (uintptr_t idx = PDE(vaddr); idx < PT_ENTRY_COUNT && vaddr < vmax;
+ idx++, paddr += PT_VADDR_SIZE, vaddr += PT_VADDR_SIZE)
+ {
+ KASSERT(!IS_PRESENT(pd->phys[idx]));
+#if USE_2MB_PAGES
+ if (vmax - vaddr >= PT_VADDR_SIZE)
+ {
+ pd->phys[idx] = paddr | PT_PRESENT | PT_WRITE | PT_SIZE;
+ continue;
+ }
+#endif
+
+ uintptr_t pt = (uintptr_t)page_alloc_bounded((void *)max_paddr);
+ if (!pt)
+ {
+ return 1;
+ }
+ pt -= PHYS_OFFSET;
+
+ memset((void *)pt, 0, PAGE_SIZE);
+ pd->phys[idx] = pt | PT_PRESENT | PT_WRITE;
+ _fill_pt((pt_t *)pt, paddr, vaddr, vmax);
+ }
+ return 0;
+}
+
+long _fill_pdp(pdp_t *pdp, uintptr_t paddr, uintptr_t vaddr, uintptr_t vmax,
+ uintptr_t max_paddr)
+{
+ for (uintptr_t idx = PDPE(vaddr); idx < PT_ENTRY_COUNT && vaddr < vmax;
+ idx++, paddr += PD_VADDR_SIZE, vaddr += PD_VADDR_SIZE)
+ {
+ KASSERT(!IS_PRESENT(pdp->phys[idx]));
+#if USE_1GB_PAGES
+ if (vmax - vaddr >= PD_VADDR_SIZE)
+ {
+ pdp->phys[idx] = paddr | PT_PRESENT | PT_WRITE | PT_SIZE;
+ continue;
+ }
+#endif
+
+ uintptr_t pd = (uintptr_t)page_alloc_bounded((void *)max_paddr);
+ if (!pd)
+ {
+ return 1;
+ }
+ pd -= PHYS_OFFSET;
+
+ memset((void *)pd, 0, PAGE_SIZE);
+ pdp->phys[idx] = pd | PT_PRESENT | PT_WRITE;
+ if (_fill_pd((pd_t *)pd, paddr, vaddr, vmax, max_paddr))
+ {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+long _fill_pml4(pml4_t *pml4, uintptr_t paddr, uintptr_t vaddr, uintptr_t vmax,
+ uintptr_t max_paddr)
+{
+ for (uintptr_t idx = PML4E(vaddr); idx < PT_ENTRY_COUNT && vaddr < vmax;
+ idx++, paddr += PDP_VADDR_SIZE, vaddr += PDP_VADDR_SIZE)
+ {
+ KASSERT(!IS_PRESENT(pml4->phys[idx]));
+
+ uintptr_t pdp = (uintptr_t)page_alloc_bounded((void *)max_paddr);
+ if (!pdp)
+ {
+ return 1;
+ }
+ pdp -= PHYS_OFFSET;
+
+ memset((void *)pdp, 0, PAGE_SIZE);
+ pml4->phys[idx] = pdp | PT_PRESENT | PT_WRITE;
+ if (_fill_pdp((pdp_t *)pdp, paddr, vaddr, vmax, max_paddr))
+ {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+long pt_map(pml4_t *pml4, uintptr_t paddr, uintptr_t vaddr, uint32_t pdflags,
+ uint32_t ptflags)
+{
+ return pt_map_range(pml4, paddr, vaddr, vaddr + PAGE_SIZE, pdflags,
+ ptflags);
+}
+
+long pt_map_range(pml4_t *pml4, uintptr_t paddr, uintptr_t vaddr,
+ uintptr_t vmax, uint32_t pdflags, uint32_t ptflags)
+{
+ dbg(DBG_PGTBL, "[0x%p, 0x%p) mapped to 0x%p; pml4: 0x%p\n", (void *)vaddr,
+ (void *)vmax, (void *)paddr, pml4);
+ KASSERT(PAGE_ALIGNED(paddr) && PAGE_ALIGNED(vaddr) && PAGE_ALIGNED(vmax));
+ KASSERT(vmax > vaddr && (ptflags & PAGE_MASK) == 0 &&
+ (pdflags & PAGE_MASK) == 0);
+ KASSERT((pdflags & PT_USER) == (ptflags & PT_USER));
+ KASSERT(!(pdflags & PT_SIZE) && !(ptflags & PT_SIZE));
+
+ while (vaddr < vmax)
+ {
+ uint64_t size = vmax - vaddr;
+
+ uint64_t idx = PML4E(vaddr);
+ pml4_t *table = pml4;
+
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ uintptr_t page = (uintptr_t)page_alloc();
+ if (!page)
+ {
+ return -ENOMEM;
+ }
+ memset((void *)page, 0, PAGE_SIZE);
+ KASSERT(pt_virt_to_phys(page) == page - PHYS_OFFSET);
+ KASSERT(*(uintptr_t *)page == 0);
+ table->phys[idx] = (page - PHYS_OFFSET) | pdflags;
+ }
+ else
+ {
+ // can't split up if control flags don't match, so liberally include
+ // all of them
+ table->phys[idx] |= pdflags;
+ }
+ table = (pdp_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PDP (1GB pages)
+ idx = PDPE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+#if USE_1GB_PAGES
+ if (PAGE_ALIGNED_1GB(vaddr) && size > PAGE_SIZE_1GB)
+ {
+ table->phys[idx] = (uintptr_t)paddr | ptflags | PT_SIZE;
+ paddr += PAGE_SIZE_1GB;
+ vaddr += PAGE_SIZE_1GB;
+ continue;
+ }
+#endif
+ uintptr_t page = (uintptr_t)page_alloc();
+ if (!page)
+ {
+ return -ENOMEM;
+ }
+ memset((void *)page, 0, PAGE_SIZE);
+ table->phys[idx] = (page - PHYS_OFFSET) | pdflags;
+ }
+ else if (IS_1GB_PAGE(table->phys[idx]))
+ {
+ if (PAGE_SAME_1GB(table->phys[idx], paddr) &&
+ PAGE_OFFSET_1GB(paddr) == PAGE_OFFSET_1GB(vaddr) &&
+ PAGE_CONTROL_FLAGS(table->phys[idx]) - PT_SIZE == pdflags)
+ {
+ vaddr = PAGE_ALIGN_UP_1GB(vaddr + 1);
+ continue;
+ }
+ pd_t *pd = page_alloc();
+ if (!pd)
+ {
+ return -ENOMEM;
+ }
+ for (unsigned i = 0; i < PT_ENTRY_COUNT; i++)
+ {
+ pd->phys[i] =
+ table->phys[idx] +
+ i * PAGE_SIZE_2MB; // keeps all flags, including PT_SIZE
+ }
+ table->phys[idx] =
+ ((uintptr_t)pd - PHYS_OFFSET) |
+ pdflags; // overwrite flags as well for particular entry
+ }
+ else
+ {
+ table->phys[idx] |= pdflags;
+ }
+ table = (pd_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PD (2MB pages)
+ idx = PDE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+#if USE_2MB_PAGES
+ if (PAGE_ALIGNED_2MB(vaddr) && size > PAGE_SIZE_2MB)
+ {
+ table->phys[idx] = (uintptr_t)paddr | ptflags | PT_SIZE;
+ paddr += PAGE_SIZE_2MB;
+ vaddr += PAGE_SIZE_2MB;
+ continue;
+ }
+#endif
+ uintptr_t page = (uintptr_t)page_alloc();
+ if (!page)
+ {
+ return -ENOMEM;
+ }
+ memset((void *)page, 0, PAGE_SIZE);
+ table->phys[idx] = (page - PHYS_OFFSET) | pdflags;
+ }
+ else if (IS_2MB_PAGE(table->phys[idx]))
+ {
+ if (PAGE_SAME_2MB(table->phys[idx], paddr) &&
+ PAGE_OFFSET_2MB(paddr) == PAGE_OFFSET_2MB(vaddr) &&
+ PAGE_CONTROL_FLAGS(table->phys[idx]) - PT_SIZE == ptflags)
+ {
+ vaddr = PAGE_ALIGN_UP_2MB(vaddr + 1);
+ continue;
+ }
+ pt_t *pt = page_alloc();
+ if (!pt)
+ {
+ return -ENOMEM;
+ }
+ for (unsigned i = 0; i < PT_ENTRY_COUNT; i++)
+ {
+ pt->phys[i] = table->phys[idx] + i * PAGE_SIZE -
+ PT_SIZE; // remove PT_SIZE flag
+ }
+ table->phys[idx] =
+ ((uintptr_t)pt - PHYS_OFFSET) | pdflags; // overwrite flags
+ }
+ else
+ {
+ table->phys[idx] |= pdflags;
+ }
+ table = (pt_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PT (4KB pages)
+
+ idx = PTE(vaddr);
+ table->phys[idx] = (uintptr_t)paddr | ptflags;
+
+ KASSERT(IS_PRESENT(table->phys[idx]));
+
+ paddr += PAGE_SIZE;
+ vaddr += PAGE_SIZE;
+ }
+
+ return 0;
+}
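As a usage sketch of the routine above, the fragment below maps a single freshly allocated page into a cloned address space. It is illustrative only (error handling is trimmed, the user virtual address is made up) and relies solely on functions and flags that appear elsewhere in this change (pt_create, page_alloc, pt_virt_to_phys, pt_map, PT_PRESENT/PT_WRITE/PT_USER).

/* Hypothetical helper: map one writable user page into a new pml4. */
static long map_one_user_page(void)
{
    pml4_t *pml4 = pt_create(); /* clone of the current page table       */
    void *page = page_alloc();  /* physmap (virtual) address of the page */
    if (!pml4 || !page)
    {
        return -ENOMEM;
    }
    uintptr_t paddr = pt_virt_to_phys((uintptr_t)page);
    uintptr_t uvaddr = 0x10000000UL; /* hypothetical, page-aligned user address */
    return pt_map(pml4, paddr, uvaddr,
                  PT_PRESENT | PT_WRITE | PT_USER,  /* pdflags */
                  PT_PRESENT | PT_WRITE | PT_USER); /* ptflags */
}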
+
+static long _pt_fault_handler(regs_t *regs)
+{
+ uintptr_t vaddr;
+ /* Get the address where the fault occurred */
+ __asm__ volatile("movq %%cr2, %0"
+ : "=r"(vaddr));
+ uintptr_t cause = regs->r_err;
+
+ /* Check if pagefault was in user space (otherwise, BAD!) */
+ if (cause & FAULT_USER)
+ {
+ handle_pagefault(vaddr, cause);
+ }
+ else
+ {
+ dump_registers(regs);
+ panic("\nKernel page fault at vaddr 0x%p\n", (void *)vaddr);
+ }
+ return 0;
+}
+
+void pt_init()
+{
+ static long inited = 0;
+ if (!inited)
+ {
+ inited = 1;
+ // allocate a page to set up the new page table structure
+ // important caveat: we have not mapped in the physmap region, which
+ // is where the addresses from page_alloc come from, so we use the actual
+ // physical address of the page, which we request to be in the
+ // first 4MB of RAM, as that region is identity-mapped by the boot-time
+ // page tables
+ uintptr_t max_paddr = (1UL << 22);
+ pml4_t *pml4 = page_alloc_bounded((void *)max_paddr);
+ if (!pml4)
+ panic("ran out of memory in pt_init");
+ pml4 = (pml4_t *)((uintptr_t)pml4 - PHYS_OFFSET);
+ KASSERT((uintptr_t)pml4 < max_paddr);
+ memset(pml4, 0, PAGE_SIZE);
+
+ // map the kernel in to its expected virtual memory address
+ if (_fill_pml4(pml4, KERNEL_PHYS_BASE, KERNEL_VMA + KERNEL_PHYS_BASE,
+ KERNEL_VMA + KERNEL_PHYS_END, max_paddr))
+ panic("ran out of memory in pt_init");
+
+ // map in physmap
+ if (_fill_pml4(pml4, 0, (uintptr_t)physmap_start(),
+ (uintptr_t)physmap_end(), max_paddr))
+ panic("ran out of memory in pt_init");
+
+ page_init_finish();
+
+ // use the kernel memory address synonym for pml4 instead of its physical
+ // (identity-mapped) address, and make the MMU use the new pml4
+ pt_set((pml4_t *)((uintptr_t)pml4 + PHYS_OFFSET));
+ global_kernel_only_pml4 = (pml4_t *)((uintptr_t)pml4 + PHYS_OFFSET);
+ // pt_unmap_range(global_kernel_only_pml4, USER_MEM_LOW, USER_MEM_HIGH);
+ intr_register(INTR_PAGE_FAULT, _pt_fault_handler);
+ }
+ pt_set(global_kernel_only_pml4);
+}
+
+pt_t *clone_pt(pt_t *pt)
+{
+ pt_t *clone = page_alloc();
+ dbg(DBG_PRINT, "cloning pt at 0x%p to 0x%p\n", pt, clone);
+ if (clone)
+ {
+ memcpy(clone, pt, PAGE_SIZE);
+ }
+ return clone;
+}
+
+pd_t *clone_pd(pd_t *pd)
+{
+ pd_t *clone = page_alloc();
+ dbg(DBG_PRINT, "cloning pd at 0x%p to 0x%p\n", pd, clone);
+ if (!clone)
+ {
+ return NULL;
+ }
+ memset(clone, 0, PAGE_SIZE); // in case the clone fails, need to know what
+ // we have allocated
+ for (unsigned i = 0; i < PT_ENTRY_COUNT; i++)
+ {
+ // dbg(DBG_PRINT, "checking pd i = %u\n", i);
+ if (pd->phys[i])
+ {
+ if (IS_2MB_PAGE(pd->phys[i]))
+ {
+ clone->phys[i] = pd->phys[i];
+ continue;
+ }
+ pt_t *cloned_pt =
+ clone_pt((pt_t *)((pd->phys[i] & PAGE_MASK) + PHYS_OFFSET));
+ if (!cloned_pt)
+ {
+ return NULL;
+ }
+ clone->phys[i] = (((uintptr_t)cloned_pt) - PHYS_OFFSET) |
+ PAGE_FLAGS(pd->phys[i]);
+ }
+ else
+ {
+ clone->phys[i] = 0;
+ }
+ }
+ return clone;
+}
+
+pdp_t *clone_pdp(pdp_t *pdp)
+{
+ pdp_t *clone = page_alloc();
+ dbg(DBG_PRINT, "cloning pdp at 0x%p to 0x%p\n", pdp, clone);
+ if (!clone)
+ {
+ return NULL;
+ }
+ memset(clone, 0, PAGE_SIZE); // in case the clone fails, need to know what
+ // we have allocated
+ for (unsigned i = 0; i < PT_ENTRY_COUNT; i++)
+ {
+ // dbg(DBG_PRINT, "checking pdp i = %u\n", i);
+ if (pdp->phys[i])
+ {
+ if (IS_1GB_PAGE(pdp->phys[i]))
+ {
+ clone->phys[i] = pdp->phys[i];
+ continue;
+ }
+ pd_t *cloned_pd =
+ clone_pd((pd_t *)((pdp->phys[i] & PAGE_MASK) + PHYS_OFFSET));
+ if (!cloned_pd)
+ {
+ return NULL;
+ }
+ clone->phys[i] = (((uintptr_t)cloned_pd) - PHYS_OFFSET) |
+ PAGE_FLAGS(pdp->phys[i]);
+ }
+ else
+ {
+ clone->phys[i] = 0;
+ }
+ }
+ return clone;
+}
+
+pml4_t *clone_pml4(pml4_t *pml4, long include_user_mappings)
+{
+ pml4_t *clone = page_alloc();
+ dbg(DBG_PRINT, "cloning pml4 at 0x%p to 0x%p\n", pml4, clone);
+ if (!clone)
+ {
+ return NULL;
+ }
+ memset(clone, 0, PAGE_SIZE); // in case the clone fails, need to know what
+ // we have allocated
+ for (uintptr_t i = include_user_mappings ? 0 : PT_ENTRY_COUNT / 2;
+ i < PT_ENTRY_COUNT; i++)
+ {
+ // dbg(DBG_PRINT, "checking pml4 i = %u\n", i);
+ if (pml4->phys[i])
+ {
+ pdp_t *cloned_pdp =
+ clone_pdp((pdp_t *)((pml4->phys[i] & PAGE_MASK) + PHYS_OFFSET));
+ if (!cloned_pdp)
+ {
+ pt_destroy(clone);
+ return NULL;
+ }
+ clone->phys[i] = (((uintptr_t)cloned_pdp) - PHYS_OFFSET) |
+ PAGE_FLAGS(pml4->phys[i]);
+ }
+ else
+ {
+ clone->phys[i] = 0;
+ }
+ }
+ return clone;
+}
+
+pml4_t *pt_create() { return clone_pml4(pt_get(), 0); }
+
+void pt_destroy_helper(pt_t *pt, long depth)
+{
+ // 4 = pml4, 3 = pdp, 2 = pd, 1 = pt
+ if (depth != 1)
+ {
+ for (uintptr_t i = 0; i < PT_ENTRY_COUNT; i++)
+ {
+ if (!pt->phys[i] || (PT_SIZE & pt->phys[i]))
+ {
+ continue;
+ }
+ KASSERT(IS_PRESENT(pt->phys[i]) && (pt->phys[i] & PAGE_MASK));
+ pt_destroy_helper((pt_t *)((pt->phys[i] & PAGE_MASK) + PHYS_OFFSET),
+ depth - 1);
+ pt->phys[i] = 0;
+ }
+ }
+ page_free(pt);
+}
+
+void pt_destroy(pml4_t *pml4) { pt_destroy_helper(pml4, 4); }
+
+void pt_unmap(pml4_t *pml4, uintptr_t vaddr)
+{
+ pt_unmap_range(pml4, vaddr, vaddr + PAGE_SIZE);
+}
+
+void pt_unmap_range(pml4_t *pml4, uintptr_t vaddr, uintptr_t vmax)
+{
+ // TODO reclaim pages on-the-fly?
+
+ dbg(DBG_PGTBL, "virt[0x%p, 0x%p); pml4: 0x%p\n", (void *)vaddr,
+ (void *)vmax, pml4);
+ KASSERT(PAGE_ALIGNED(vaddr) && PAGE_ALIGNED(vmax) && vmax > vaddr);
+
+ uintptr_t vaddr_start = vaddr;
+
+ while (vaddr < vmax)
+ {
+ uint64_t size = vmax - vaddr;
+
+ uint64_t idx = PML4E(vaddr);
+ pml4_t *table = pml4;
+
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ vaddr = PAGE_ALIGN_UP_512GB(vaddr + 1);
+ continue;
+ }
+ table = (pdp_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PDP (1GB pages)
+ idx = PDPE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ vaddr = PAGE_ALIGN_UP_1GB(vaddr + 1);
+ continue;
+ }
+ if (IS_1GB_PAGE(table->phys[idx]))
+ {
+ if (PAGE_ALIGNED_1GB(vaddr) && size >= PAGE_SIZE_1GB)
+ {
+ table->phys[idx] = 0;
+ vaddr += PAGE_SIZE_1GB;
+ }
+ else
+ {
+ pd_t *pd = page_alloc();
+ if (!pd)
+ {
+ panic(
+ "Ran out of memory during pt_unmap_range; recovery "
+ "from this situation has not yet been implemented!");
+ }
+ uint64_t unmap_start = PDE(vaddr);
+ uint64_t unmap_end =
+ PAGE_SAME_1GB(vaddr, vmax) ? PDE(vmax) : 512;
+ for (unsigned i = 0; i < unmap_start; i++)
+ {
+ pd->phys[i] = table->phys[idx] +
+ i * PAGE_SIZE_2MB; // keeps all flags,
+ // including PT_SIZE
+ }
+ memset(&pd->phys[unmap_start], 0,
+ sizeof(uint64_t) * (unmap_end - unmap_start));
+ vaddr += (unmap_end - unmap_start) * PAGE_SIZE_2MB;
+ for (uintptr_t i = unmap_end; i < PT_ENTRY_COUNT; i++)
+ {
+ pd->phys[i] = table->phys[idx] +
+ i * PAGE_SIZE_2MB; // keeps all flags,
+ // including PT_SIZE
+ }
+ table->phys[idx] = ((uintptr_t)pd - PHYS_OFFSET) |
+ PAGE_CONTROL_FLAGS(table->phys[idx]);
+ }
+ continue;
+ }
+ table = (pd_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PD (2MB pages)
+ idx = PDE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ vaddr = PAGE_ALIGN_UP_2MB(vaddr + 1);
+ continue;
+ }
+ if (IS_2MB_PAGE(table->phys[idx]))
+ {
+ if (PAGE_ALIGNED_2MB(vaddr) && size >= PAGE_SIZE_2MB)
+ {
+ table->phys[idx] = 0;
+ vaddr += PAGE_SIZE_2MB;
+ }
+ else
+ {
+ pt_t *pt = page_alloc();
+ if (!pt)
+ {
+ panic(
+ "Ran out of memory during pt_unmap_range; recovery "
+ "from this situation has not yet been implemented!");
+ }
+ uint64_t unmap_start = PTE(vaddr);
+ uint64_t unmap_end =
+ PAGE_SAME_2MB(vaddr, vmax) ? PTE(vmax) : 512;
+ for (unsigned i = 0; i < unmap_start; i++)
+ {
+ pt->phys[i] = table->phys[idx] + i * PAGE_SIZE -
+ PT_SIZE; // remove PT_SIZE flag
+ }
+ memset(&pt->phys[unmap_start], 0,
+ sizeof(uint64_t) * (unmap_end - unmap_start));
+ vaddr += (unmap_end - unmap_start) * PAGE_SIZE;
+ for (uintptr_t i = unmap_end; i < PT_ENTRY_COUNT; i++)
+ {
+ pt->phys[i] = table->phys[idx] + i * PAGE_SIZE -
+ PT_SIZE; // remove PT_SIZE flag
+ }
+ table->phys[idx] =
+ ((uintptr_t)pt - PHYS_OFFSET) |
+ (PAGE_CONTROL_FLAGS(table->phys[idx]) - PT_SIZE);
+ }
+ continue;
+ }
+ table = (pt_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PT (4KB pages)
+ idx = PTE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ vaddr += PAGE_SIZE;
+ continue;
+ }
+ table->phys[idx] = 0;
+
+ vaddr += PAGE_SIZE;
+ }
+ KASSERT(_vaddr_status(pml4, vaddr_start) == UNMAPPED);
+}
+
+static char *entry_strings[] = {
+ "4KB",
+ "2MB",
+ "1GB",
+ "512GB",
+};
+
+inline long _vaddr_status_detailed(pml4_t *pml4, uintptr_t vaddr)
+{
+ uintptr_t idx;
+ pml4_t *table = pml4;
+
+ idx = PML4E(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ return -4;
+ }
+ table = (pdp_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PDP (1GB pages)
+ idx = PDPE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ return -3;
+ }
+ if (IS_1GB_PAGE(table->phys[idx]))
+ {
+ return 3;
+ }
+ table = (pd_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PD (2MB pages)
+ idx = PDE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ return -2;
+ }
+ if (IS_2MB_PAGE(table->phys[idx]))
+ {
+ return 2;
+ }
+ table = (pt_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET);
+
+ // PT (4KB pages)
+ idx = PTE(vaddr);
+ if (!IS_PRESENT(table->phys[idx]))
+ {
+ return -1;
+ }
+ return 1;
+}
+
+void check_invalid_mappings(pml4_t *pml4, vmmap_t *vmmap, char *prompt)
+{
+ // checks that anything that is mapped in pml4 actually should be according
+ // to vmmap
+
+ uintptr_t vaddr = USER_MEM_LOW;
+ while (vaddr < USER_MEM_HIGH)
+ {
+ long state = _vaddr_status_detailed(pml4, vaddr);
+ if (state > 0)
+ {
+ uintptr_t paddr = pt_virt_to_phys_helper(pml4, vaddr);
+
+ vmarea_t *vma = vmmap_lookup(vmmap, ADDR_TO_PN(vaddr));
+ if (!vma)
+ {
+ dbg(DBG_PGTBL,
+ "[+] %s: pml4 0x%p, 0x%p (paddr: 0x%p) cannot be found in "
+ "vmmap!\n",
+ prompt, pml4, (void *)vaddr, (void *)paddr);
+ pt_unmap(pml4, vaddr);
+ }
+ else
+ {
+ pframe_t *pf = NULL;
+ uintptr_t pagenum =
+ vma->vma_off + (ADDR_TO_PN(vaddr) - vma->vma_start);
+
+ mobj_lock(vma->vma_obj);
+ long ret = mobj_get_pframe(vma->vma_obj, pagenum, 0, &pf);
+ mobj_unlock(vma->vma_obj);
+ if (ret)
+ {
+ dbg(DBG_PGTBL,
+ "[+] %s: pml4 0x%p, the page frame for virtual address "
+ "0x%p (mapping to 0x%p) could not be found!\n",
+ prompt, pml4, (void *)vaddr, (void *)paddr);
+ pt_unmap(pml4, vaddr);
+ }
+ else
+ {
+ uintptr_t pf_paddr =
+ pt_virt_to_phys_helper(pml4, (uintptr_t)pf->pf_addr);
+ if (pf_paddr != paddr)
+ {
+ dbg(DBG_PGTBL,
+ "[+] %s: pml4 0x%p, 0x%p (paddr: 0x%p) supposed to "
+ "be 0x%p (obj: 0x%p, %lu)\n",
+ prompt, pml4, (void *)vaddr, (void *)paddr,
+ (void *)pf_paddr, vma->vma_obj, pf->pf_pagenum);
+ pt_unmap(pml4, vaddr);
+ }
+ }
+ }
+ }
+ switch (state)
+ {
+ case 1:
+ case -1:
+ vaddr = (uintptr_t)PAGE_ALIGN_UP(vaddr + 1);
+ break;
+ case -2:
+ vaddr = (uintptr_t)PAGE_ALIGN_UP_2MB(vaddr + 1);
+ break;
+ case -3:
+ vaddr = (uintptr_t)PAGE_ALIGN_UP_1GB(vaddr + 1);
+ break;
+ case -4:
+ vaddr = (uintptr_t)PAGE_ALIGN_UP_512GB(vaddr + 1);
+ break;
+ case 2:
+ case 3:
+ default:
+ panic("should not get here!");
+ }
+ }
+}
diff --git a/kernel/mm/pagetable.gdb b/kernel/mm/pagetable.gdb
new file mode 100644
index 0000000..b145804
--- /dev/null
+++ b/kernel/mm/pagetable.gdb
@@ -0,0 +1,25 @@
+define pagetable
+ if $argc > 0
+ set $proc = proc_lookup($arg0)
+ if $proc != NULL
+ printf "Process %i (%s):\n", $proc->p_pid, $proc->p_name
+ set $pagedir = $proc->p_pml4
+ else
+ printf "No process with PID %i exists\n", $arg0
+ set $pagedir = NULL
+ end
+ else
+ printf "Current mappings:\n"
+ set $pagedir = current_pagedir
+ end
+
+ if $pagedir != NULL
+        kinfo pt_mapping_info $pagedir
+ end
+end
+document pagetable
+Without arguments displays current page table mappings in the form
+"[vstart, vend) => [pstart, pend)". Takes an optional integer argument
+to specify the PID of a process whose page table mappings should be
+printed instead.
+end
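+
+# Example usage (illustrative):
+#   (gdb) pagetable       # mappings for the currently loaded page table
+#   (gdb) pagetable 2     # mappings for the process with PID 2, if it exists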
diff --git a/kernel/mm/pframe.c b/kernel/mm/pframe.c
new file mode 100644
index 0000000..6eff123
--- /dev/null
+++ b/kernel/mm/pframe.c
@@ -0,0 +1,59 @@
+#include "globals.h"
+
+#include "mm/pframe.h"
+#include "mm/slab.h"
+
+#include "util/debug.h"
+#include "util/string.h"
+
+static slab_allocator_t *pframe_allocator;
+
+void pframe_init()
+{
+ pframe_allocator = slab_allocator_create("pframe", sizeof(pframe_t));
+ KASSERT(pframe_allocator);
+}
+
+/*
+ * Create a pframe and initialize its members appropriately.
+ */
+pframe_t *pframe_create()
+{
+ pframe_t *pf = slab_obj_alloc(pframe_allocator);
+ if (!pf)
+ {
+ return NULL;
+ }
+ memset(pf, 0, sizeof(pframe_t));
+ kmutex_init(&pf->pf_mutex);
+ list_link_init(&pf->pf_link);
+ return pf;
+}
+
+/*
+ * Free the pframe (don't forget to unlock the mutex) and set *pfp = NULL
+ *
+ * The pframe must be locked, its contents not in memory (pf->pf_addr == NULL),
+ * not be dirty, and not be linked into a memory object's list.
+ */
+void pframe_free(pframe_t **pfp)
+{
+ KASSERT(kmutex_owns_mutex(&(*pfp)->pf_mutex));
+ KASSERT(!(*pfp)->pf_addr);
+ KASSERT(!(*pfp)->pf_dirty);
+ KASSERT(!list_link_is_linked(&(*pfp)->pf_link));
+ kmutex_unlock(&(*pfp)->pf_mutex);
+ slab_obj_free(pframe_allocator, *pfp);
+ *pfp = NULL;
+}
+
+/*
+ * Unlock the pframe and set *pfp = NULL
+ */
+void pframe_release(pframe_t **pfp)
+{
+ pframe_t *pf = *pfp;
+ KASSERT(kmutex_owns_mutex(&pf->pf_mutex));
+ *pfp = NULL;
+ kmutex_unlock(&pf->pf_mutex);
+}
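+
+/*
+ * Illustrative lifecycle sketch (not part of the original source; assumes the
+ * caller locks pf_mutex through the kernel's kmutex API, and that the memory
+ * object code fills in pf_addr/pf_pagenum):
+ *
+ *     pframe_t *pf = pframe_create();   // zeroed, mutex initialized, unlinked
+ *     kmutex_lock(&pf->pf_mutex);
+ *     ...                               // populate the frame's contents
+ *     pframe_release(&pf);              // unlock and clear the caller's pointer
+ *
+ * pframe_free() is instead used once the contents have been discarded
+ * (pf->pf_addr == NULL) and the frame is no longer on any mobj list.
+ */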
diff --git a/kernel/mm/slab.c b/kernel/mm/slab.c
new file mode 100644
index 0000000..bec70d1
--- /dev/null
+++ b/kernel/mm/slab.c
@@ -0,0 +1,550 @@
+// SMP.1 + SMP.3
+// spinlocks + mask interrupts
+/*
+ * slab_alloc.c - Kernel memory allocator
+ * Jason Lango <jal@cs.brown.edu>
+ *
+ * This implementation is based on the description of slab allocation
+ * (used in Solaris and Linux) from UNIX Internals: The New Frontiers,
+ * by Uresh Vahalia.
+ *
+ * Note that there is no need for locking in allocation and deallocation because
+ * neither ever blocks nor is used by an interrupt handler. Hurray for
+ * non-preemptible kernels!
+ *
+ * darmanio: ^ lol, look at me now :D
+ */
+
+#include "types.h"
+
+#include "mm/mm.h"
+#include "mm/page.h"
+#include "mm/slab.h"
+
+#include "proc/spinlock.h"
+
+#include "util/debug.h"
+#include "util/gdb.h"
+#include "util/string.h"
+
+#ifdef SLAB_REDZONE
+#define front_rz(obj) (*(uintptr_t *)(obj))
+#define rear_rz(cache, obj) \
+ (*(uintptr_t *)(((uintptr_t)(obj)) + (cache)->sa_objsize - \
+ sizeof(uintptr_t)))
+
+#define VERIFY_REDZONES(cache, obj) \
+ do \
+ { \
+ if (front_rz(obj) != SLAB_REDZONE) \
+ panic("alloc: red-zone check failed: *(0x%p)=0x%.8lx\n", \
+ (void *)&front_rz(obj), front_rz(obj)); \
+ if (rear_rz(cache, obj) != SLAB_REDZONE) \
+ panic("alloc: red-zone check failed: *(0x%p)=0x%.8lx\n", \
+ (void *)&rear_rz(cache, obj), rear_rz(cache, obj)); \
+    } while (0)
+
+#endif
+
+struct slab
+{
+ struct slab *s_next; /* link on list of slabs */
+ size_t s_inuse; /* number of allocated objs */
+ void *s_free; /* head of obj free list */
+ void *s_addr; /* start address */
+};
+
+typedef struct slab_allocator
+{
+ const char *sa_name; /* user-provided name */
+ size_t sa_objsize; /* object size */
+ struct slab *sa_slabs; /* head of slab list */
+ size_t sa_order; /* npages = (1 << order) */
+ size_t sa_slab_nobjs; /* number of objs per slab */
+ struct slab_allocator *sa_next; /* link on global list of allocators */
+} slab_allocator_t;
+
+/* Stored at the end of every object: tracks the containing slab while the
+   object is allocated, or points to the next free object while it is free. */
+typedef struct slab_bufctl
+{
+ union {
+ void *sb_next; /* next free object */
+ struct slab *sb_slab; /* containing slab */
+ } u;
+#ifdef SLAB_CHECK_FREE
+    uint8_t sb_free; /* true if the object is free */
+#endif
+} slab_bufctl_t;
+#define sb_next u.sb_next
+#define sb_slab u.sb_slab
+
+/* Returns a pointer to the start of the bufctl struct */
+#define obj_bufctl(allocator, obj) \
+ ((slab_bufctl_t *)(((uintptr_t)(obj)) + (allocator)->sa_objsize))
+/* Given a pointer to the bufctl, returns a pointer to the start of the object */
+#define bufctl_obj(allocator, buf) \
+ ((void *)(((uintptr_t)(buf)) - (allocator)->sa_objsize))
+/* Given a pointer to the object, returns a pointer to the next object (after bufctl) */
+#define next_obj(allocator, obj) \
+ ((void *)(((uintptr_t)(obj)) + (allocator)->sa_objsize + \
+ sizeof(slab_bufctl_t)))
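+
+/*
+ * Layout sketch (illustrative) of one slab, a page block of
+ * (PAGE_SIZE << sa_order) bytes, with N = sa_slab_nobjs:
+ *
+ *   s_addr -> | obj 0 | bufctl 0 | obj 1 | bufctl 1 | ... | obj N-1 | bufctl N-1 | struct slab |
+ *
+ * obj_bufctl() steps forward past one object, bufctl_obj() steps back, and
+ * next_obj() skips an object plus its bufctl to reach the next object.
+ */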
+
+GDB_DEFINE_HOOK(slab_obj_alloc, void *addr, slab_allocator_t *allocator)
+
+GDB_DEFINE_HOOK(slab_obj_free, void *addr, slab_allocator_t *allocator)
+
+/* Head of global list of slab allocators. This is used in the python gdb script */
+static slab_allocator_t *slab_allocators = NULL;
+
+/* Special case - allocator for allocation of slab_allocator objects. */
+static slab_allocator_t slab_allocator_allocator;
+
+/*
+ * This constant defines how many orders of magnitude (in page block
+ * sizes) we'll search for an optimal slab size (past the smallest
+ * possible slab size).
+ */
+#define SLAB_MAX_ORDER 5
+
+/**
+ * Given the object size and the number of objects, calculates
+ * the size of the slab. Each object includes a slab_bufctl_t,
+ * and each slab includes a slab struct.
+*/
+static size_t _slab_size(size_t objsize, size_t nobjs)
+{
+ return (nobjs * (objsize + sizeof(slab_bufctl_t)) + sizeof(struct slab));
+}
+
+/**
+ * Given the object size and the order, calculate how many objects
+ * can fit in a certain number of pages (excluding the slab struct).
+ *
+ * PAGE_SIZE << order effectively is just PAGE_SIZE * 2^order.
+*/
+static size_t _slab_nobjs(size_t objsize, size_t order)
+{
+ return (((PAGE_SIZE << order) - sizeof(struct slab)) /
+ (objsize + sizeof(slab_bufctl_t)));
+}
+
+static size_t _slab_waste(size_t objsize, size_t order)
+{
+ /* Waste is defined as the amount of unused space in the page
+ * block, that is the number of bytes in the page block minus
+ * the optimal slab size for that particular block size.
+ */
+ return ((PAGE_SIZE << order) -
+ _slab_size(objsize, _slab_nobjs(objsize, order)));
+}
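+
+/*
+ * Worked example (illustrative figures, assuming PAGE_SIZE == 4096, an 8-byte
+ * slab_bufctl_t without SLAB_CHECK_FREE, and a 32-byte struct slab): for
+ * objsize = 256 at order 0, _slab_nobjs gives (4096 - 32) / 264 = 15 objects,
+ * _slab_size gives 15 * 264 + 32 = 3992 bytes, and _slab_waste is therefore
+ * 4096 - 3992 = 104 bytes left unused in the page block.
+ */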
+
+static void _calc_slab_size(slab_allocator_t *allocator)
+{
+ size_t best_order;
+ size_t best_waste;
+ size_t order;
+ size_t minorder;
+ size_t minsize;
+ size_t waste;
+
+ /* Find the minimum page block size that this slab requires. */
+ minsize = _slab_size(allocator->sa_objsize, 1);
+ for (minorder = 0; minorder < PAGE_NSIZES; minorder++)
+ {
+ if ((PAGE_SIZE << minorder) >= minsize)
+ {
+ break;
+ }
+ }
+ if (minorder == PAGE_NSIZES)
+ panic("unable to find minorder\n");
+
+ /* Start the search with the minimum block size for this slab. */
+ best_order = minorder;
+ best_waste = _slab_waste(allocator->sa_objsize, minorder);
+
+ dbg(DBG_MM, "calc_slab_size: minorder %lu, waste %lu\n", minorder,
+ best_waste);
+
+ /* Find the optimal number of objects per slab and slab size,
+ * up to a predefined (somewhat arbitrary) limit on the number
+ * of pages per slab.
+ */
+ for (order = minorder + 1; order < SLAB_MAX_ORDER; order++)
+ {
+ if ((waste = _slab_waste(allocator->sa_objsize, order)) < best_waste)
+ {
+ best_waste = waste;
+ best_order = order;
+ dbg(DBG_MM, "calc_slab_size: replacing with order %lu, waste %lu\n",
+ best_order, best_waste);
+ }
+ }
+
+ /* Finally, the best page block size wins.
+ */
+ allocator->sa_order = best_order;
+ allocator->sa_slab_nobjs = _slab_nobjs(allocator->sa_objsize, best_order);
+ KASSERT(allocator->sa_slab_nobjs);
+}
+
+/*
+ * Initializes a given allocator using the name and size passed in.
+*/
+static void _allocator_init(slab_allocator_t *allocator, const char *name,
+ size_t size)
+{
+#ifdef SLAB_REDZONE
+ /*
+ * Add space for the front and rear red-zones.
+ */
+ size += 2 * sizeof(uintptr_t);
+#endif
+
+ if (!name)
+ {
+ name = "<unnamed>";
+ }
+
+ allocator->sa_name = name;
+ allocator->sa_objsize = size;
+ allocator->sa_slabs = NULL;
+    // sets sa_order and sa_slab_nobjs (objects per slab) for this allocator
+ _calc_slab_size(allocator);
+
+ /* Add cache to global cache list. */
+ allocator->sa_next = slab_allocators;
+ slab_allocators = allocator;
+
+ dbg(DBG_MM, "Initialized new slab allocator:\n");
+ dbgq(DBG_MM, " Name: \"%s\" (0x%p)\n", allocator->sa_name,
+ allocator);
+ dbgq(DBG_MM, " Object Size: %lu\n", allocator->sa_objsize);
+ dbgq(DBG_MM, " Order: %lu\n", allocator->sa_order);
+ dbgq(DBG_MM, " Slab Capacity: %lu\n", allocator->sa_slab_nobjs);
+}
+
+/*
+ * Given a name and an object size, creates a slab_allocator to manage
+ * slabs that store objects of size `size`, along with some metadata.
+*/
+slab_allocator_t *slab_allocator_create(const char *name, size_t size)
+{
+ slab_allocator_t *allocator;
+
+ allocator = (slab_allocator_t *)slab_obj_alloc(&slab_allocator_allocator);
+ if (!allocator)
+ {
+ return NULL;
+ }
+
+ _allocator_init(allocator, name, size);
+ return allocator;
+}
+
+/*
+ * Free a given allocator.
+*/
+void slab_allocator_destroy(slab_allocator_t *allocator)
+{
+ slab_obj_free(&slab_allocator_allocator, allocator);
+}
+
+/*
+ * In the event that a slab with free objects is not found,
+ * this routine will be called.
+*/
+static long _slab_allocator_grow(slab_allocator_t *allocator)
+{
+ void *addr;
+ void *obj;
+ struct slab *slab;
+
+ addr = page_alloc_n(1UL << allocator->sa_order);
+ if (!addr)
+ {
+ return 0;
+ }
+
+ /* Initialize each bufctl to be free and point to the next object. */
+ obj = addr;
+ for (size_t i = 0; i < (allocator->sa_slab_nobjs - 1); i++)
+ {
+#ifdef SLAB_CHECK_FREE
+ obj_bufctl(allocator, obj)->sb_free = 1;
+#endif
+ obj = obj_bufctl(allocator, obj)->sb_next = next_obj(allocator, obj);
+ }
+
+ /* The last bufctl is the tail of the list. */
+#ifdef SLAB_CHECK_FREE
+ obj_bufctl(allocator, obj)->sb_free = 1;
+#endif
+ obj_bufctl(allocator, obj)->sb_next = NULL;
+
+ /* After the last object comes the slab structure itself. */
+ slab = (struct slab *)next_obj(allocator, obj);
+
+ /*
+ * The first object in the slab will be the head of the free
+ * list and the start address of the slab.
+ */
+ slab->s_free = addr;
+ slab->s_addr = addr;
+ slab->s_inuse = 0;
+
+ /* Initialize objects. */
+ obj = addr;
+ for (size_t i = 0; i < allocator->sa_slab_nobjs; i++)
+ {
+#ifdef SLAB_REDZONE
+ front_rz(obj) = SLAB_REDZONE;
+ rear_rz(allocator, obj) = SLAB_REDZONE;
+#endif
+ obj = next_obj(allocator, obj);
+ }
+
+ dbg(DBG_MM, "Growing cache \"%s\" (0x%p), new slab 0x%p (%lu pages)\n",
+ allocator->sa_name, allocator, slab, 1UL << allocator->sa_order);
+
+ /* Place this slab into the cache. */
+ slab->s_next = allocator->sa_slabs;
+ allocator->sa_slabs = slab;
+
+ return 1;
+}
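+
+/*
+ * Sketch (illustrative): after _slab_allocator_grow() succeeds, the new
+ * slab's free list threads through the bufctls of every object in the
+ * page block:
+ *
+ *   slab->s_free -> obj 0 -> obj 1 -> ... -> obj N-1 -> NULL
+ */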
+
+/*
+ * Allocates an object from the given allocator, growing the cache if needed.
+*/
+void *slab_obj_alloc(slab_allocator_t *allocator)
+{
+ struct slab *slab;
+ void *obj;
+
+ /* Find a slab with a free object. */
+ for (;;)
+ {
+ slab = allocator->sa_slabs;
+ while (slab && (slab->s_inuse == allocator->sa_slab_nobjs))
+ slab = slab->s_next;
+ if (slab && (slab->s_inuse < allocator->sa_slab_nobjs))
+ {
+ break;
+ }
+ if (!_slab_allocator_grow(allocator))
+ {
+ return NULL;
+ }
+ }
+
+ /*
+ * Remove an object from the slab's free list. We'll use the
+ * free list pointer to store a pointer back to the containing
+ * slab.
+ */
+ obj = slab->s_free;
+ slab->s_free = obj_bufctl(allocator, obj)->sb_next;
+ obj_bufctl(allocator, obj)->sb_slab = slab;
+#ifdef SLAB_CHECK_FREE
+ obj_bufctl(allocator, obj)->sb_free = 0;
+#endif
+
+ slab->s_inuse++;
+
+ dbg(DBG_MM,
+ "Allocated object 0x%p from \"%s\" (0x%p), "
+ "slab 0x%p, inuse %lu\n",
+        obj, allocator->sa_name, allocator, slab, slab->s_inuse);
+
+#ifdef SLAB_REDZONE
+ VERIFY_REDZONES(allocator, obj);
+
+ /*
+ * Make object pointer point past the first red-zone.
+ */
+ obj = (void *)((uintptr_t)obj + sizeof(uintptr_t));
+#endif
+
+ GDB_CALL_HOOK(slab_obj_alloc, obj, allocator);
+ return obj;
+}
+
+void slab_obj_free(slab_allocator_t *allocator, void *obj)
+{
+ struct slab *slab;
+ GDB_CALL_HOOK(slab_obj_free, obj, allocator);
+
+#ifdef SLAB_REDZONE
+ /* Move pointer back to verify that the REDZONE is unchanged. */
+ obj = (void *)((uintptr_t)obj - sizeof(uintptr_t));
+
+ VERIFY_REDZONES(allocator, obj);
+#endif
+
+#ifdef SLAB_CHECK_FREE
+ KASSERT(!obj_bufctl(allocator, obj)->sb_free && "INVALID FREE!");
+ obj_bufctl(allocator, obj)->sb_free = 1;
+#endif
+
+ slab = obj_bufctl(allocator, obj)->sb_slab;
+
+ /* Place this object back on the slab's free list. */
+ obj_bufctl(allocator, obj)->sb_next = slab->s_free;
+ slab->s_free = obj;
+
+ slab->s_inuse--;
+
+ dbg(DBG_MM, "Freed object 0x%p from \"%s\" (0x%p), slab 0x%p, inuse %lu\n",
+ obj, allocator->sa_name, allocator, slab, slab->s_inuse);
+}
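+
+/*
+ * Typical usage sketch (illustrative; `struct foo` and foo_allocator are
+ * hypothetical, but the pattern mirrors pframe.c):
+ *
+ *     static slab_allocator_t *foo_allocator;
+ *
+ *     foo_allocator = slab_allocator_create("foo", sizeof(struct foo));
+ *     struct foo *f = slab_obj_alloc(foo_allocator);
+ *     ...
+ *     slab_obj_free(foo_allocator, f);
+ */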
+
+/*
+ * Reclaims as much memory (up to a target) from
+ * unused slabs as possible
+ * @param target - target number of pages to reclaim. If negative,
+ * try to reclaim as many pages as possible
+ * @return number of pages freed
+ */
+long slab_allocators_reclaim(long target)
+{
+ panic("slab_allocators_reclaim NYI for SMP");
+ // spinlock_lock(&allocator->sa_lock);
+ // int npages_freed = 0, npages;
+
+ // slab_allocator_t *a;
+ // struct slab *s, **prev;
+
+ // /* Go through all caches */
+ // for (a = slab_allocators; NULL != a; a = a->sa_next) {
+ // prev = &(a->sa_slabs);
+ // s = a->sa_slabs;
+ // while (NULL != s) {
+ // struct slab *next = s->s_next;
+ // if (0 == s->s_inuse) {
+ // /* Free Slab */
+ // (*prev) = next;
+ // npages = 1 << a->sa_order;
+
+ // page_free_n(s->s_addr, npages);
+ // npages_freed += npages;
+ // } else {
+ // prev = &(s->s_next);
+ // }
+ // /* Check if target was met */
+ // if ((target > 0) && (npages_freed >= target)) {
+ // return npages_freed;
+ // }
+ // s = next;
+ // }
+ // }
+ // spinlock_unlock(&allocator->sa_lock);
+ // return npages_freed;
+}
+
+#define KMALLOC_SIZE_MIN_ORDER (6)
+#define KMALLOC_SIZE_MAX_ORDER (18)
+
+static slab_allocator_t
+ *kmalloc_allocators[KMALLOC_SIZE_MAX_ORDER - KMALLOC_SIZE_MIN_ORDER + 1];
+
+/* Note that kmalloc_allocator_names should be modified to remain consistent
+ * with KMALLOC_SIZE_MIN_ORDER ... KMALLOC_SIZE_MAX_ORDER.
+ */
+static const char *kmalloc_allocator_names[] = {
+ "size-64", "size-128", "size-256", "size-512", "size-1024",
+ "size-2048", "size-4096", "size-8192", "size-16384", "size-32768",
+ "size-65536", "size-131072", "size-262144"};
+
+void *kmalloc(size_t size)
+{
+ size += sizeof(slab_allocator_t *);
+
+ /*
+ * Find the first power of two bucket bigger than the
+ * requested size, and allocate from it.
+ */
+ slab_allocator_t **cs = kmalloc_allocators;
+ for (size_t order = KMALLOC_SIZE_MIN_ORDER; order <= KMALLOC_SIZE_MAX_ORDER;
+ order++, cs++)
+ {
+ if ((1UL << order) >= size)
+ {
+ void *addr = slab_obj_alloc(*cs);
+ if (!addr)
+ {
+ dbg(DBG_MM, "WARNING: kmalloc out of memory\n");
+ return NULL;
+ }
+#ifdef MM_POISON
+ memset(addr, MM_POISON_ALLOC, size);
+#endif /* MM_POISON */
+ *((slab_allocator_t **)addr) = *cs;
+ return (void *)(((slab_allocator_t **)addr) + 1);
+ }
+ }
+
+    panic("kmalloc: requested size %lu exceeds the largest bucket\n", size);
+}
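+
+/*
+ * Worked example (illustrative, assuming an 8-byte pointer): kmalloc(100)
+ * first grows the request to 108 bytes to make room for the stashed
+ * slab_allocator_t pointer; the smallest bucket with 1 << order >= 108 is
+ * order 7, so the object comes from the "size-128" allocator and the caller
+ * receives the address just past the stored pointer. kfree() steps back over
+ * that pointer to recover the owning allocator.
+ */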
+
+__attribute__((used)) static void *malloc(size_t size)
+{
+ /* This function is used by gdb to allocate memory
+ * within the kernel, no code in the kernel should
+ * call it. */
+ return kmalloc(size);
+}
+
+void kfree(void *addr)
+{
+ addr = (void *)(((slab_allocator_t **)addr) - 1);
+ slab_allocator_t *sa = *(slab_allocator_t **)addr;
+
+#ifdef MM_POISON
+ /* If poisoning is enabled, wipe the memory given in
+ * this object, as specified by the cache object size
+ * (minus red-zone overhead, if any).
+ */
+ size_t objsize = sa->sa_objsize;
+#ifdef SLAB_REDZONE
+ objsize -= sizeof(uintptr_t) * 2;
+#endif /* SLAB_REDZONE */
+ memset(addr, MM_POISON_FREE, objsize);
+#endif /* MM_POISON */
+
+ slab_obj_free(sa, addr);
+}
+
+__attribute__((used)) static void free(void *addr)
+{
+ /* This function is used by gdb to free memory allocated
+ * by malloc, no code in the kernel should call it. */
+ kfree(addr);
+}
+
+void slab_init()
+{
+ /* Special case initialization of the allocator for `slab_allocator_t`s */
+ /* In other words, initializes a slab allocator for other slab allocators. */
+ _allocator_init(&slab_allocator_allocator, "slab_allocators",
+ sizeof(slab_allocator_t));
+
+ /*
+ * Allocate the power of two buckets for generic
+ * kmalloc/kfree.
+ */
+ slab_allocator_t **cs = kmalloc_allocators;
+ for (size_t order = KMALLOC_SIZE_MIN_ORDER; order <= KMALLOC_SIZE_MAX_ORDER;
+ order++, cs++)
+ {
+ if (NULL ==
+ (*cs = slab_allocator_create(
+ kmalloc_allocator_names[order - KMALLOC_SIZE_MIN_ORDER],
+ (1UL << order))))
+ {
+ panic("Couldn't create kmalloc allocators!\n");
+ }
+ }
+}
diff --git a/kernel/mm/slab.py b/kernel/mm/slab.py
new file mode 100644
index 0000000..1b0c8fb
--- /dev/null
+++ b/kernel/mm/slab.py
@@ -0,0 +1,55 @@
+import gdb
+
+import weenix
+import weenix.kmem
+
+
+class SlabCommand(weenix.Command):
+ def __init__(self):
+ weenix.Command.__init__(self, "slab", gdb.COMMAND_DATA)
+
+ def _allocators(self):
+ l = list()
+ for alloc in weenix.kmem.allocators():
+ l.append(alloc)
+ return l
+
+ def invoke(self, args, tty):
+ names = list()
+ slabs = list()
+ sizes = list()
+ counts = list()
+
+ names.append("")
+ slabs.append("slabs")
+ sizes.append("objsize")
+ counts.append("allocated")
+
+ for alloc in weenix.kmem.allocators():
+ names.append(alloc.name())
+ slabs.append(str(len(list(alloc.slabs()))))
+ sizes.append(str(alloc.size()))
+ counts.append(str(len(list(alloc.objs()))))
+
+ namewidth = max(map(lambda x: len(x), names))
+ slabwidth = max(map(lambda x: len(x), slabs))
+ sizewidth = max(map(lambda x: len(x), sizes))
+ countwidth = max(map(lambda x: len(x), counts))
+
+ for name, slab, size, count in zip(names, slabs, sizes, counts):
+ print(
+ "{1:<{0}} {3:>{2}} {5:>{4}} {7:>{6}}".format(
+ namewidth, name, slabwidth, slab, sizewidth, size, countwidth, count
+ )
+ )
+
+ def complete(self, line, word):
+        # Use lists throughout so this also works under Python 3, where
+        # map/filter return iterators without a sort() method.
+        l = [alloc.name() for alloc in self._allocators()]
+        l = [name for name in l if name.startswith(word)]
+        for used in line.split():
+            l = [name for name in l if name != used]
+        return sorted(l)
+
+
+SlabCommand()
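+
+# Example (illustrative): running `slab` at the gdb prompt prints one row per
+# slab allocator with the columns built in invoke() above: name, number of
+# slabs, object size, and number of allocated objects.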