1// SPDX-License-Identifier: GPL-3.0-or-later
2
3#include "mos/mm/mm.hpp"
4
5#include "mos/filesystem/sysfs/sysfs.hpp"
6#include "mos/interrupt/ipi.hpp"
7#include "mos/misc/setup.hpp"
8#include "mos/mm/paging/paging.hpp"
9#include "mos/mm/paging/pmlx/pml5.hpp"
10#include "mos/mm/paging/table_ops.hpp"
11#include "mos/mm/physical/pmm.hpp"
12#include "mos/platform/platform.hpp"
13#include "mos/platform/platform_defs.hpp"
14#include "mos/tasks/signal.hpp"
15
16#include <mos/lib/structures/list.hpp>
17#include <mos/lib/sync/spinlock.hpp>
18#include <mos/mos_global.h>
19#include <mos_stdlib.hpp>
20#include <mos_string.hpp>
21
22#if MOS_CONFIG(MOS_MM_DETAILED_MMAPS_UNHANDLED_FAULT)
23#include "mos/tasks/process.hpp"
24#endif
25
26phyframe_t *mm_get_free_page_raw(void)
27{
28 phyframe_t *frame = pmm_allocate_frames(n_frames: 1, flags: PMM_ALLOC_NORMAL);
29 if (!frame)
30 {
31 mEmerg << "failed to allocate a page";
32 return NULL;
33 }
34
35 return frame;
36}
37
38phyframe_t *mm_get_free_page(void)
39{
40 phyframe_t *frame = mm_get_free_page_raw();
41 if (!frame)
42 return NULL;
43 memzero(s: (void *) phyframe_va(frame), MOS_PAGE_SIZE);
44 return frame;
45}
46
47phyframe_t *mm_get_free_pages(size_t npages)
48{
49 phyframe_t *frame = pmm_allocate_frames(n_frames: npages, flags: PMM_ALLOC_NORMAL);
50 if (!frame)
51 {
52 mEmerg << "failed to allocate " << npages << " pages";
53 return NULL;
54 }
55
56 return frame;
57}
58
59MMContext *mm_create_context(void)
60{
61 MMContext *mmctx = mos::create<MMContext>();
62 linked_list_init(head_node: &mmctx->mmaps);
63
64 pml4_t pml4 = pml_create_table(pml4);
65
66 // map the upper half of the address space to the kernel
67 for (int i = pml4_index(MOS_KERNEL_START_VADDR); i < PML4_ENTRIES; i++)
68 pml4.table[i] = platform_info->kernel_mm->pgd.max.next.table[i];
69
70 mmctx->pgd = pgd_create(pml4);
71
72 return mmctx;
73}
74
75void mm_destroy_context(MMContext *mmctx)
76{
77 MOS_ASSERT(mmctx != platform_info->kernel_mm); // you can't destroy the kernel mmctx
78 MOS_ASSERT(list_is_empty(&mmctx->mmaps));
79
80 ptr_t zero = 0;
81 size_t userspace_npages = (MOS_USER_END_VADDR + 1) / MOS_PAGE_SIZE;
82 const bool freed = pml5_destroy_range(pml5: mmctx->pgd.max, vaddr: &zero, n_pages: &userspace_npages);
83 MOS_ASSERT_X(freed, "failed to free the entire userspace");
84 delete mmctx;
85}
86
87void mm_lock_context_pair(MMContext *ctx1_, MMContext *ctx2_)
88{
89 MMContext *ctx1 = ctx1_;
90 MMContext *ctx2 = ctx2_;
91
92 if (ctx1 > ctx2)
93 std::swap(a&: ctx1, b&: ctx2);
94
95 // ctx1 <= ctx2
96 if (ctx1 == NULL || ctx1 == ctx2)
97 spinlock_acquire(&ctx2->mm_lock);
98 else
99 {
100 spinlock_acquire(&ctx1->mm_lock);
101 spinlock_acquire(&ctx2->mm_lock);
102 }
103}
104
105void mm_unlock_context_pair(MMContext *ctx1_, MMContext *ctx2_)
106{
107 MMContext *ctx1 = ctx1_;
108 MMContext *ctx2 = ctx2_;
109
110 if (ctx1 > ctx2)
111 std::swap(a&: ctx1, b&: ctx2);
112
113 // ctx1 <= ctx2
114 if (ctx1 == NULL || ctx1 == ctx2)
115 spinlock_release(&ctx2->mm_lock);
116 else
117 {
118 // note that we release in reverse order
119 spinlock_release(&ctx2->mm_lock);
120 spinlock_release(&ctx1->mm_lock);
121 }
122}
123
124MMContext *mm_switch_context(MMContext *new_ctx)
125{
126 MMContext *old_ctx = current_cpu->mm_context;
127 if (old_ctx == new_ctx)
128 return old_ctx;
129
130 platform_switch_mm(new_mm: new_ctx);
131 current_cpu->mm_context = new_ctx;
132 return old_ctx;
133}
134
135static void do_attach_vmap(MMContext *mmctx, vmap_t *vmap)
136{
137 MOS_ASSERT(spinlock_is_locked(&mmctx->mm_lock));
138 MOS_ASSERT_X(list_is_empty(list_node(vmap)), "vmap is already attached to something");
139 MOS_ASSERT(vmap->mmctx == NULL || vmap->mmctx == mmctx);
140
141 vmap->mmctx = mmctx;
142
143 // add to the list, sorted by address
144 list_foreach(vmap_t, m, mmctx->mmaps)
145 {
146 if (m->vaddr > vmap->vaddr)
147 {
148 list_insert_before(m, vmap);
149 return;
150 }
151 }
152
153 list_node_append(head: &mmctx->mmaps, list_node(vmap)); // append at the end
154}
155
156vmap_t *vmap_create(MMContext *mmctx, ptr_t vaddr, size_t npages)
157{
158 MOS_ASSERT_X(mmctx != platform_info->kernel_mm, "you can't create vmaps in the kernel mmctx");
159 vmap_t *map = mos::create<vmap_t>();
160 linked_list_init(list_node(map));
161 spinlock_acquire(&map->lock);
162 map->vaddr = vaddr;
163 map->npages = npages;
164 do_attach_vmap(mmctx, vmap: map);
165 return map;
166}
167
168void vmap_destroy(vmap_t *vmap)
169{
170 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
171 MMContext *const mm = vmap->mmctx;
172 MOS_ASSERT(spinlock_is_locked(&mm->mm_lock));
173 if (vmap->io)
174 {
175 bool unmapped = false;
176 if (!vmap->io->unmap(vmap, unmapped: &unmapped))
177 mWarn << "munmap: could not unmap the file: io_munmap() failed";
178
179 if (unmapped)
180 goto unmapped;
181 }
182 mm_do_unmap(top: mm->pgd, vaddr: vmap->vaddr, n_pages: vmap->npages, do_unref: true);
183
184unmapped:
185 list_remove(vmap);
186 delete vmap;
187}
188
189vmap_t *vmap_obtain(MMContext *mmctx, ptr_t vaddr, size_t *out_offset)
190{
191 MOS_ASSERT(spinlock_is_locked(&mmctx->mm_lock));
192
193 list_foreach(vmap_t, m, mmctx->mmaps)
194 {
195 if (m->vaddr <= vaddr && vaddr < m->vaddr + m->npages * MOS_PAGE_SIZE)
196 {
197 spinlock_acquire(&m->lock);
198 if (out_offset)
199 *out_offset = vaddr - m->vaddr;
200 return m;
201 }
202 }
203
204 if (out_offset)
205 *out_offset = 0;
206 return NULL;
207}
208
209vmap_t *vmap_split(vmap_t *first, size_t split)
210{
211 MOS_ASSERT(spinlock_is_locked(&first->lock));
212 MOS_ASSERT(split && split < first->npages);
213
214 vmap_t *second = mos::create<vmap_t>();
215 *second = *first; // copy the whole structure
216 linked_list_init(list_node(second)); // except for the list node
217
218 first->npages = split; // shrink the first vmap
219 second->npages -= split;
220 second->vaddr += split * MOS_PAGE_SIZE;
221 if (first->io)
222 {
223 second->io = first->io->ref(); // ref the io again
224 second->io_offset += split * MOS_PAGE_SIZE;
225 }
226
227 do_attach_vmap(mmctx: first->mmctx, vmap: second);
228 return second;
229}
230
231vmap_t *vmap_split_for_range(vmap_t *vmap, size_t rstart_pgoff, size_t rend_pgoff)
232{
233 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
234
235 /// |-------|-------|-------|
236 /// |begin |rstart |rend |end
237 /// |-------|-------|-------|
238
239 if (rstart_pgoff == 0 && rend_pgoff == vmap->npages)
240 return vmap;
241
242 if (rstart_pgoff == 0)
243 return vmap_split(first: vmap, split: rend_pgoff);
244
245 if (rend_pgoff == vmap->npages)
246 return vmap_split(first: vmap, split: rstart_pgoff);
247
248 vmap_t *second = vmap_split(first: vmap, split: rstart_pgoff);
249 vmap_t *third = vmap_split(first: second, split: rend_pgoff - rstart_pgoff);
250 spinlock_release(&third->lock);
251 return second;
252}
253
254void vmap_finalise_init(vmap_t *vmap, vmap_content_t content, vmap_type_t type)
255{
256 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
257 MOS_ASSERT_X(content != VMAP_UNKNOWN, "vmap content cannot be unknown");
258 MOS_ASSERT_X(vmap->content == VMAP_UNKNOWN || vmap->content == content, "vmap is already setup");
259
260 vmap->content = content;
261 vmap->type = type;
262 spinlock_release(&vmap->lock);
263}
264
265void mm_copy_page(const phyframe_t *src, const phyframe_t *dst)
266{
267 memcpy(dest: (void *) phyframe_va(dst), src: (void *) phyframe_va(src), MOS_PAGE_SIZE);
268}
269
270vmfault_result_t mm_resolve_cow_fault(vmap_t *vmap, ptr_t fault_addr, pagefault_t *info)
271{
272 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
273 MOS_ASSERT(info->is_write && info->is_present);
274
275 // fast path to handle CoW
276 phyframe_t *page = mm_get_free_page();
277 mm_copy_page(src: info->faulting_page, dst: page);
278 mm_replace_page_locked(mmctx: vmap->mmctx, vaddr: fault_addr, phyframe_pfn(page), flags: vmap->vmflags);
279
280 return VMFAULT_COMPLETE;
281}
282
283static void invalid_page_fault(ptr_t fault_addr, vmap_t *faulting_vmap, vmap_t *ip_vmap, pagefault_t *info, const char *unhandled_reason)
284{
285 mEmerg << "unhandled page fault: " << unhandled_reason;
286#if MOS_CONFIG(MOS_MM_DETAILED_UNHANDLED_FAULT)
287 mEmerg << " invalid " //
288 << (info->is_user ? "user" : "kernel") << " mode " //
289 << (info->is_write ? "write to" : (info->is_exec ? "execute in" : "read from")) //
290 << " " << (info->is_present ? "present" : "non-present") << " page [" << (void *) fault_addr << "]";
291
292 mEmerg << " instruction: " << (void *) info->ip;
293 if (ip_vmap)
294 {
295 mEmerg << " vmap: " << ip_vmap;
296 mEmerg << " offset: 0x" << (info->ip - ip_vmap->vaddr + (ip_vmap->io ? ip_vmap->io_offset : 0));
297 }
298
299 mEmerg << " thread: " << current_thread;
300 mEmerg << " process: " << (current_thread ? current_process : nullptr);
301
302 if (fault_addr < 1 KB)
303 {
304 if (info->is_write)
305 mEmerg << " possible write to NULL pointer";
306 else if (info->is_exec && fault_addr == 0)
307 mEmerg << " attempted to execute NULL pointer";
308 else
309 mEmerg << " possible NULL pointer dereference";
310 }
311
312 if (info->is_user && fault_addr > MOS_KERNEL_START_VADDR)
313 mEmerg << " kernel address dereference";
314
315 if (info->ip > MOS_KERNEL_START_VADDR)
316 mEmerg << " in kernel function " << (void *) info->ip;
317
318 if (faulting_vmap)
319 {
320 mEmerg << " in vmap: " << faulting_vmap;
321 mEmerg << " offset: 0x" << (fault_addr - faulting_vmap->vaddr + (faulting_vmap->io ? faulting_vmap->io_offset : 0));
322 }
323#endif
324
325 if (faulting_vmap)
326 spinlock_release(&faulting_vmap->lock);
327
328 if (ip_vmap)
329 spinlock_release(&ip_vmap->lock);
330
331 if (current_thread)
332 spinlock_release(&current_thread->owner->mm->mm_lock);
333
334#if MOS_CONFIG(MOS_MM_DETAILED_UNHANDLED_FAULT)
335#if MOS_CONFIG(MOS_MM_DETAILED_MMAPS_UNHANDLED_FAULT)
336 if (current_thread)
337 process_dump_mmaps(current_process);
338#endif
339
340 mInfo << "stack trace before fault (may be unreliable):";
341 platform_dump_stack(regs: info->regs);
342
343 mInfo << "register states before fault:";
344 platform_dump_regs(regs: info->regs);
345 mCont << "\n";
346#else
347 MOS_UNUSED(faulting_vmap);
348 MOS_UNUSED(ip_vmap);
349 MOS_UNUSED(fault_addr);
350 MOS_UNUSED(info);
351#endif
352
353 if (current_thread)
354 {
355 signal_send_to_thread(current_thread, SIGSEGV);
356 }
357 else
358 {
359 MOS_ASSERT(!"unhandled kernel page fault");
360 }
361}
362
363void mm_handle_fault(ptr_t fault_addr, pagefault_t *info)
364{
365 const char *unhandled_reason = NULL;
366
367 dEmph<pagefault> << (info->is_user ? "user" : "kernel") << " #PF: " //
368 << (current_thread ? current_thread : NULL) << ", " //
369 << (current_thread ? current_thread->owner : NULL) //
370 << ", IP=" << info->ip //
371 << ", ADDR=" << fault_addr;
372
373 if (info->is_write && info->is_exec)
374 mos_panic("Cannot write and execute at the same time");
375
376 size_t offset = 0;
377 vmap_t *fault_vmap = NULL;
378 vmap_t *ip_vmap = NULL;
379
380 const auto DoUnhandledPageFault = [&]()
381 {
382 // if we get here, the fault was not handled
383 MOS_ASSERT_X(unhandled_reason, "unhandled fault with no reason");
384 invalid_page_fault(fault_addr, faulting_vmap: fault_vmap, ip_vmap, info, unhandled_reason);
385 };
386
387 if (!current_mm)
388 {
389 unhandled_reason = "no mm context";
390 return DoUnhandledPageFault();
391 }
392
393 MMContext *const mm = current_mm;
394 mm_lock_context_pair(ctx1_: mm);
395
396 fault_vmap = vmap_obtain(mmctx: mm, vaddr: fault_addr, out_offset: &offset);
397 if (!fault_vmap)
398 {
399 ip_vmap = vmap_obtain(mmctx: mm, vaddr: info->ip);
400 unhandled_reason = "page fault in unmapped area";
401 mm_unlock_context_pair(ctx1_: mm);
402 return DoUnhandledPageFault();
403 }
404 ip_vmap = MOS_IN_RANGE(info->ip, fault_vmap->vaddr, fault_vmap->vaddr + fault_vmap->npages * MOS_PAGE_SIZE) ? fault_vmap : vmap_obtain(mmctx: mm, vaddr: info->ip);
405
406 MOS_ASSERT_X(fault_vmap->on_fault, "vmap %pvm has no fault handler", (void *) fault_vmap);
407 const VMFlags page_flags = mm_do_get_flags(max: fault_vmap->mmctx->pgd, vaddr: fault_addr);
408
409 if (info->is_exec && !(fault_vmap->vmflags & VM_EXEC))
410 {
411 unhandled_reason = "page fault in non-executable vmap";
412 mm_unlock_context_pair(ctx1_: mm);
413 return DoUnhandledPageFault();
414 }
415 else if (info->is_present && info->is_exec && fault_vmap->vmflags & VM_EXEC && !(page_flags & VM_EXEC))
416 {
417 // vmprotect has been called on this vmap to enable execution
418 // we need to make sure that the page is executable
419 mm_do_flag(top: fault_vmap->mmctx->pgd, vaddr: fault_addr, n_pages: 1, flags: page_flags | VM_EXEC);
420 mm_unlock_context_pair(ctx1_: mm, NULL);
421 spinlock_release(&fault_vmap->lock);
422 if (ip_vmap != fault_vmap && ip_vmap)
423 spinlock_release(&ip_vmap->lock);
424 return;
425 }
426
427 if (info->is_write && !fault_vmap->vmflags.test(b: VM_WRITE))
428 {
429 unhandled_reason = "page fault in read-only vmap";
430 mm_unlock_context_pair(ctx1_: mm, NULL);
431 return DoUnhandledPageFault();
432 }
433
434 if (info->is_present)
435 info->faulting_page = pfn_phyframe(mm_do_get_pfn(fault_vmap->mmctx->pgd, fault_addr));
436
437 const auto get_fault_result = [](vmfault_result_t result)
438 {
439 switch (result)
440 {
441 case VMFAULT_COMPLETE: return "COMPLETE";
442 case VMFAULT_MAP_BACKING_PAGE_RO: return "MAP_BACKING_PAGE_RO";
443 case VMFAULT_MAP_BACKING_PAGE: return "MAP_BACKING_PAGE";
444 case VMFAULT_COPY_BACKING_PAGE: return "COPY_BACKING_PAGE";
445 case VMFAULT_CANNOT_HANDLE: return "CANNOT_HANDLE";
446 default: return "UNKNOWN";
447 };
448 };
449
450 dCont<pagefault> << ", handler " << (void *) (ptr_t) fault_vmap->on_fault;
451 vmfault_result_t fault_result = fault_vmap->on_fault(fault_vmap, fault_addr, info);
452 dCont<pagefault> << " -> " << get_fault_result(fault_result);
453
454 VMFlags map_flags = fault_vmap->vmflags;
455 switch (fault_result)
456 {
457 case VMFAULT_COMPLETE: break;
458 case VMFAULT_CANNOT_HANDLE:
459 {
460 unhandled_reason = "vmap fault handler returned VMFAULT_CANNOT_HANDLE";
461 return DoUnhandledPageFault();
462 }
463 case VMFAULT_COPY_BACKING_PAGE:
464 {
465 MOS_ASSERT(info->backing_page);
466 const phyframe_t *page = mm_get_free_page(); // will be ref'd by mm_replace_page_locked()
467 mm_copy_page(src: info->backing_page, dst: page);
468 info->backing_page = page;
469 goto map_backing_page;
470 }
471 case VMFAULT_MAP_BACKING_PAGE_RO:
472 {
473 map_flags.erase(b: VM_WRITE);
474 goto map_backing_page;
475 }
476 case VMFAULT_MAP_BACKING_PAGE:
477 {
478 map_backing_page:
479 if (!info->backing_page)
480 {
481 unhandled_reason = "out of memory";
482 mm_unlock_context_pair(ctx1_: mm, NULL);
483 return DoUnhandledPageFault();
484 }
485
486 dCont<pagefault> << " (backing page: " << phyframe_pfn(info->backing_page) << ")";
487 mm_replace_page_locked(mmctx: fault_vmap->mmctx, vaddr: fault_addr, phyframe_pfn(info->backing_page), flags: map_flags);
488 fault_result = VMFAULT_COMPLETE;
489 break;
490 }
491 }
492
493 MOS_ASSERT_X(fault_result == VMFAULT_COMPLETE || fault_result == VMFAULT_CANNOT_HANDLE, "invalid fault result %d", fault_result);
494 if (ip_vmap)
495 spinlock_release(&ip_vmap->lock);
496 if (fault_vmap != ip_vmap)
497 spinlock_release(&fault_vmap->lock);
498 mm_unlock_context_pair(ctx1_: mm, NULL);
499 ipi_send_all(type: IPI_TYPE_INVALIDATE_TLB);
500 if (fault_result == VMFAULT_COMPLETE)
501 return;
502
503 DoUnhandledPageFault();
504}
505
506// ! sysfs support
507
508static bool sys_mem_mmap(sysfs_file_t *f, vmap_t *vmap, off_t offset)
509{
510 MOS_UNUSED(f);
511 // mInfo << "mem: mapping " << vmap->vaddr << " to " << offset << "\n";
512 mm_do_map(top: vmap->mmctx->pgd, vaddr: vmap->vaddr, pfn: offset / MOS_PAGE_SIZE, n_pages: vmap->npages, flags: vmap->vmflags, do_refcount: false);
513 return true;
514}
515
516static bool sys_mem_munmap(sysfs_file_t *f, vmap_t *vmap, bool *unmapped)
517{
518 MOS_UNUSED(f);
519 mm_do_unmap(top: vmap->mmctx->pgd, vaddr: vmap->vaddr, n_pages: vmap->npages, do_unref: false);
520 *unmapped = true;
521 return true;
522}
523
524static sysfs_item_t sys_mem_item = SYSFS_MEM_ITEM("mem", sys_mem_mmap, sys_mem_munmap);
525
526static void mm_sysfs_init()
527{
528 sys_mem_item.mem.size = platform_info->max_pfn * MOS_PAGE_SIZE;
529 sysfs_register_root_file(item: &sys_mem_item);
530}
531
532MOS_INIT(SYSFS, mm_sysfs_init);
533