1// SPDX-License-Identifier: GPL-3.0-or-later
2
3#include "mos/mm/mm.hpp"
4
5#include "mos/filesystem/sysfs/sysfs.hpp"
6#include "mos/interrupt/ipi.hpp"
7#include "mos/misc/setup.hpp"
8#include "mos/mm/paging/paging.hpp"
9#include "mos/mm/paging/pmlx/pml5.hpp"
10#include "mos/mm/paging/table_ops.hpp"
11#include "mos/mm/physical/pmm.hpp"
12#include "mos/platform/platform.hpp"
13#include "mos/platform/platform_defs.hpp"
14#include "mos/syslog/printk.hpp"
15#include "mos/tasks/signal.hpp"
16
17#include <mos/lib/structures/list.hpp>
18#include <mos/lib/sync/spinlock.hpp>
19#include <mos/mos_global.h>
20#include <mos_stdlib.hpp>
21#include <mos_string.hpp>
22
23#if MOS_CONFIG(MOS_MM_DETAILED_MMAPS_UNHANDLED_FAULT)
24#include "mos/tasks/process.hpp"
25#endif
26
27phyframe_t *mm_get_free_page_raw(void)
28{
29 phyframe_t *frame = pmm_allocate_frames(n_frames: 1, flags: PMM_ALLOC_NORMAL);
30 if (!frame)
31 {
32 pr_emerg("failed to allocate a page");
33 return NULL;
34 }
35
36 return frame;
37}
38
39phyframe_t *mm_get_free_page(void)
40{
41 phyframe_t *frame = mm_get_free_page_raw();
42 if (!frame)
43 return NULL;
44 memzero(s: (void *) phyframe_va(frame), MOS_PAGE_SIZE);
45 return frame;
46}
47
48phyframe_t *mm_get_free_pages(size_t npages)
49{
50 phyframe_t *frame = pmm_allocate_frames(n_frames: npages, flags: PMM_ALLOC_NORMAL);
51 if (!frame)
52 {
53 pr_emerg("failed to allocate %zd pages", npages);
54 return NULL;
55 }
56
57 return frame;
58}
59
60MMContext *mm_create_context(void)
61{
62 MMContext *mmctx = mos::create<MMContext>();
63 linked_list_init(head_node: &mmctx->mmaps);
64
65 pml4_t pml4 = pml_create_table(pml4);
66
67 // map the upper half of the address space to the kernel
68 for (int i = pml4_index(MOS_KERNEL_START_VADDR); i < PML4_ENTRIES; i++)
69 pml4.table[i] = platform_info->kernel_mm->pgd.max.next.table[i];
70
71 mmctx->pgd = pgd_create(pml4);
72
73 return mmctx;
74}
75
76void mm_destroy_context(MMContext *mmctx)
77{
78 MOS_ASSERT(mmctx != platform_info->kernel_mm); // you can't destroy the kernel mmctx
79 MOS_ASSERT(list_is_empty(&mmctx->mmaps));
80
81 ptr_t zero = 0;
82 size_t userspace_npages = (MOS_USER_END_VADDR + 1) / MOS_PAGE_SIZE;
83 const bool freed = pml5_destroy_range(pml5: mmctx->pgd.max, vaddr: &zero, n_pages: &userspace_npages);
84 MOS_ASSERT_X(freed, "failed to free the entire userspace");
85 delete mmctx;
86}
87
88void mm_lock_ctx_pair(MMContext *ctx1, MMContext *ctx2)
89{
90 if (ctx1 == ctx2 || ctx2 == NULL)
91 spinlock_acquire(&ctx1->mm_lock);
92 else if (ctx1 < ctx2)
93 {
94 spinlock_acquire(&ctx1->mm_lock);
95 spinlock_acquire(&ctx2->mm_lock);
96 }
97 else
98 {
99 spinlock_acquire(&ctx2->mm_lock);
100 spinlock_acquire(&ctx1->mm_lock);
101 }
102}
103
104void mm_unlock_ctx_pair(MMContext *ctx1, MMContext *ctx2)
105{
106 if (ctx1 == ctx2 || ctx2 == NULL)
107 spinlock_release(&ctx1->mm_lock);
108 else if (ctx1 < ctx2)
109 {
110 spinlock_release(&ctx2->mm_lock);
111 spinlock_release(&ctx1->mm_lock);
112 }
113 else
114 {
115 spinlock_release(&ctx1->mm_lock);
116 spinlock_release(&ctx2->mm_lock);
117 }
118}
119
120MMContext *mm_switch_context(MMContext *new_ctx)
121{
122 MMContext *old_ctx = current_cpu->mm_context;
123 if (old_ctx == new_ctx)
124 return old_ctx;
125
126 platform_switch_mm(new_mm: new_ctx);
127 current_cpu->mm_context = new_ctx;
128 return old_ctx;
129}
130
131static void do_attach_vmap(MMContext *mmctx, vmap_t *vmap)
132{
133 MOS_ASSERT(spinlock_is_locked(&mmctx->mm_lock));
134 MOS_ASSERT_X(list_is_empty(list_node(vmap)), "vmap is already attached to something");
135 MOS_ASSERT(vmap->mmctx == NULL || vmap->mmctx == mmctx);
136
137 vmap->mmctx = mmctx;
138
139 // add to the list, sorted by address
140 list_foreach(vmap_t, m, mmctx->mmaps)
141 {
142 if (m->vaddr > vmap->vaddr)
143 {
144 list_insert_before(m, vmap);
145 return;
146 }
147 }
148
149 list_node_append(head: &mmctx->mmaps, list_node(vmap)); // append at the end
150}
151
152vmap_t *vmap_create(MMContext *mmctx, ptr_t vaddr, size_t npages)
153{
154 MOS_ASSERT_X(mmctx != platform_info->kernel_mm, "you can't create vmaps in the kernel mmctx");
155 vmap_t *map = mos::create<vmap_t>();
156 linked_list_init(list_node(map));
157 spinlock_acquire(&map->lock);
158 map->vaddr = vaddr;
159 map->npages = npages;
160 do_attach_vmap(mmctx, vmap: map);
161 return map;
162}
163
164void vmap_destroy(vmap_t *vmap)
165{
166 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
167 MMContext *const mm = vmap->mmctx;
168 MOS_ASSERT(spinlock_is_locked(&mm->mm_lock));
169 if (vmap->io)
170 {
171 bool unmapped = false;
172 if (!io_munmap(io: vmap->io, vmap, unmapped: &unmapped))
173 pr_warn("munmap: could not unmap the file: io_munmap() failed");
174
175 if (unmapped)
176 goto unmapped;
177 }
178 mm_do_unmap(top: mm->pgd, vaddr: vmap->vaddr, n_pages: vmap->npages, do_unref: true);
179
180unmapped:
181 list_remove(vmap);
182 delete vmap;
183}
184
185vmap_t *vmap_obtain(MMContext *mmctx, ptr_t vaddr, size_t *out_offset)
186{
187 MOS_ASSERT(spinlock_is_locked(&mmctx->mm_lock));
188
189 list_foreach(vmap_t, m, mmctx->mmaps)
190 {
191 if (m->vaddr <= vaddr && vaddr < m->vaddr + m->npages * MOS_PAGE_SIZE)
192 {
193 spinlock_acquire(&m->lock);
194 if (out_offset)
195 *out_offset = vaddr - m->vaddr;
196 return m;
197 }
198 }
199
200 if (out_offset)
201 *out_offset = 0;
202 return NULL;
203}
204
205vmap_t *vmap_split(vmap_t *first, size_t split)
206{
207 MOS_ASSERT(spinlock_is_locked(&first->lock));
208 MOS_ASSERT(split && split < first->npages);
209
210 vmap_t *second = mos::create<vmap_t>();
211 *second = *first; // copy the whole structure
212 linked_list_init(list_node(second)); // except for the list node
213
214 first->npages = split; // shrink the first vmap
215 second->npages -= split;
216 second->vaddr += split * MOS_PAGE_SIZE;
217 if (first->io)
218 {
219 second->io = io_ref(io: first->io); // ref the io again
220 second->io_offset += split * MOS_PAGE_SIZE;
221 }
222
223 do_attach_vmap(mmctx: first->mmctx, vmap: second);
224 return second;
225}
226
227vmap_t *vmap_split_for_range(vmap_t *vmap, size_t rstart_pgoff, size_t rend_pgoff)
228{
229 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
230
231 /// |-------|-------|-------|
232 /// |begin |rstart |rend |end
233 /// |-------|-------|-------|
234
235 if (rstart_pgoff == 0 && rend_pgoff == vmap->npages)
236 return vmap;
237
238 if (rstart_pgoff == 0)
239 return vmap_split(first: vmap, split: rend_pgoff);
240
241 if (rend_pgoff == vmap->npages)
242 return vmap_split(first: vmap, split: rstart_pgoff);
243
244 vmap_t *second = vmap_split(first: vmap, split: rstart_pgoff);
245 vmap_t *third = vmap_split(first: second, split: rend_pgoff - rstart_pgoff);
246 spinlock_release(&third->lock);
247 return second;
248}
249
250void vmap_finalise_init(vmap_t *vmap, vmap_content_t content, vmap_type_t type)
251{
252 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
253 MOS_ASSERT_X(content != VMAP_UNKNOWN, "vmap content cannot be unknown");
254 MOS_ASSERT_X(vmap->content == VMAP_UNKNOWN || vmap->content == content, "vmap is already setup");
255
256 vmap->content = content;
257 vmap->type = type;
258 spinlock_release(&vmap->lock);
259}
260
261void mm_copy_page(const phyframe_t *src, const phyframe_t *dst)
262{
263 memcpy(dest: (void *) phyframe_va(dst), src: (void *) phyframe_va(src), MOS_PAGE_SIZE);
264}
265
266vmfault_result_t mm_resolve_cow_fault(vmap_t *vmap, ptr_t fault_addr, pagefault_t *info)
267{
268 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
269 MOS_ASSERT(info->is_write && info->is_present);
270
271 // fast path to handle CoW
272 phyframe_t *page = mm_get_free_page();
273 mm_copy_page(src: info->faulting_page, dst: page);
274 mm_replace_page_locked(mmctx: vmap->mmctx, vaddr: fault_addr, phyframe_pfn(page), flags: vmap->vmflags);
275
276 return VMFAULT_COMPLETE;
277}
278
279static void invalid_page_fault(ptr_t fault_addr, vmap_t *faulting_vmap, vmap_t *ip_vmap, pagefault_t *info, const char *unhandled_reason)
280{
281 pr_emerg("unhandled page fault: %s", unhandled_reason);
282#if MOS_CONFIG(MOS_MM_DETAILED_UNHANDLED_FAULT)
283 pr_emerg(" invalid %s mode %s %s page [" PTR_FMT "]", //
284 info->is_user ? "user" : "kernel", //
285 info->is_write ? "write to" : (info->is_exec ? "execute in" : "read from"), //
286 info->is_present ? "present" : "non-present", //
287 fault_addr //
288 );
289
290 pr_emerg(" instruction: " PTR_FMT, info->ip);
291 if (ip_vmap)
292 {
293 pr_emerg(" vmap: %pvm", (void *) ip_vmap);
294 pr_emerg(" offset: 0x%zx", info->ip - ip_vmap->vaddr + (ip_vmap->io ? ip_vmap->io_offset : 0));
295 }
296
297 pr_emerg(" thread: %pt", current_thread);
298 pr_emerg(" process: %pp", current_thread ? current_process : nullptr);
299
300 if (fault_addr < 1 KB)
301 {
302 if (info->is_write)
303 pr_emerg(" possible write to NULL pointer");
304 else if (info->is_exec && fault_addr == 0)
305 pr_emerg(" attempted to execute NULL pointer");
306 else
307 pr_emerg(" possible NULL pointer dereference");
308 }
309
310 if (info->is_user && fault_addr > MOS_KERNEL_START_VADDR)
311 pr_emerg(" kernel address dereference");
312
313 if (info->ip > MOS_KERNEL_START_VADDR)
314 pr_emerg(" in kernel function %ps", (void *) info->ip);
315
316 if (faulting_vmap)
317 {
318 pr_emerg(" in vmap: %pvm", (void *) faulting_vmap);
319 pr_emerg(" offset: 0x%zx", fault_addr - faulting_vmap->vaddr + (faulting_vmap->io ? faulting_vmap->io_offset : 0));
320 }
321
322 if (faulting_vmap)
323 spinlock_release(&faulting_vmap->lock);
324
325 if (ip_vmap)
326 spinlock_release(&ip_vmap->lock);
327
328 if (current_thread)
329 spinlock_release(&current_thread->owner->mm->mm_lock);
330
331#if MOS_CONFIG(MOS_MM_DETAILED_MMAPS_UNHANDLED_FAULT)
332 if (current_thread)
333 process_dump_mmaps(current_process);
334#endif
335
336 pr_info("stack trace before fault (may be unreliable):");
337 platform_dump_stack(regs: info->regs);
338
339 pr_info("register states before fault:");
340 platform_dump_regs(regs: info->regs);
341 pr_cont("\n");
342#else
343 MOS_UNUSED(fault_addr);
344 MOS_UNUSED(info);
345#endif
346
347 if (current_thread)
348 {
349 signal_send_to_thread(current_thread, SIGSEGV);
350 signal_exit_to_user_prepare(regs: info->regs);
351 }
352 else
353 {
354 MOS_ASSERT(!"unhandled kernel page fault");
355 }
356}
357
358void mm_handle_fault(ptr_t fault_addr, pagefault_t *info)
359{
360 const char *unhandled_reason = NULL;
361
362 pr_demph(pagefault, "%s #PF: %pt, %pp, IP=" PTR_VLFMT ", ADDR=" PTR_VLFMT, //
363 info->is_user ? "user" : "kernel", //
364 current_thread ? current_thread : NULL, //
365 current_thread ? current_thread->owner : NULL, //
366 info->ip, //
367 fault_addr //
368 );
369
370 if (info->is_write && info->is_exec)
371 mos_panic("Cannot write and execute at the same time");
372
373 size_t offset = 0;
374 vmap_t *fault_vmap = NULL;
375 vmap_t *ip_vmap = NULL;
376
377 const auto DoUnhandledPageFault = [&]()
378 {
379 // if we get here, the fault was not handled
380 MOS_ASSERT_X(unhandled_reason, "unhandled fault with no reason");
381 invalid_page_fault(fault_addr, faulting_vmap: fault_vmap, ip_vmap, info, unhandled_reason);
382 };
383
384 if (!current_mm)
385 {
386 unhandled_reason = "no mm context";
387 DoUnhandledPageFault();
388 return;
389 }
390
391 MMContext *const mm = current_mm;
392 mm_lock_ctx_pair(ctx1: mm, NULL);
393
394 fault_vmap = vmap_obtain(mmctx: mm, vaddr: fault_addr, out_offset: &offset);
395 if (!fault_vmap)
396 {
397 ip_vmap = vmap_obtain(mmctx: mm, vaddr: info->ip, NULL);
398 unhandled_reason = "page fault in unmapped area";
399 mm_unlock_ctx_pair(ctx1: mm, NULL);
400 DoUnhandledPageFault();
401 return;
402 }
403 ip_vmap = MOS_IN_RANGE(info->ip, fault_vmap->vaddr, fault_vmap->vaddr + fault_vmap->npages * MOS_PAGE_SIZE) ? fault_vmap : vmap_obtain(mmctx: mm, vaddr: info->ip, NULL);
404
405 MOS_ASSERT_X(fault_vmap->on_fault, "vmap %pvm has no fault handler", (void *) fault_vmap);
406 const vm_flags page_flags = mm_do_get_flags(max: fault_vmap->mmctx->pgd, vaddr: fault_addr);
407
408 if (info->is_exec && !(fault_vmap->vmflags & VM_EXEC))
409 {
410 unhandled_reason = "page fault in non-executable vmap";
411 mm_unlock_ctx_pair(ctx1: mm, NULL);
412 DoUnhandledPageFault();
413 return;
414 }
415 else if (info->is_present && info->is_exec && fault_vmap->vmflags & VM_EXEC && !(page_flags & VM_EXEC))
416 {
417 // vmprotect has been called on this vmap to enable execution
418 // we need to make sure that the page is executable
419 mm_do_flag(top: fault_vmap->mmctx->pgd, vaddr: fault_addr, n_pages: 1, flags: page_flags | VM_EXEC);
420 mm_unlock_ctx_pair(ctx1: mm, NULL);
421 spinlock_release(&fault_vmap->lock);
422 if (ip_vmap)
423 spinlock_release(&ip_vmap->lock);
424 return;
425 }
426
427 if (info->is_write && !(fault_vmap->vmflags & VM_WRITE))
428 {
429 unhandled_reason = "page fault in read-only vmap";
430 mm_unlock_ctx_pair(ctx1: mm, NULL);
431 DoUnhandledPageFault();
432 return;
433 }
434
435 if (info->is_present)
436 info->faulting_page = pfn_phyframe(mm_do_get_pfn(fault_vmap->mmctx->pgd, fault_addr));
437
438 const auto get_fault_result = [](vmfault_result_t result)
439 {
440 switch (result)
441 {
442 case VMFAULT_COMPLETE: return "COMPLETE";
443 case VMFAULT_MAP_BACKING_PAGE_RO: return "MAP_BACKING_PAGE_RO";
444 case VMFAULT_MAP_BACKING_PAGE: return "MAP_BACKING_PAGE";
445 case VMFAULT_COPY_BACKING_PAGE: return "COPY_BACKING_PAGE";
446 case VMFAULT_CANNOT_HANDLE: return "CANNOT_HANDLE";
447 default: return "UNKNOWN";
448 };
449 };
450
451 pr_dcont(pagefault, ", handler %ps", (void *) (ptr_t) fault_vmap->on_fault);
452 vmfault_result_t fault_result = fault_vmap->on_fault(fault_vmap, fault_addr, info);
453 pr_dcont(pagefault, " -> %s", get_fault_result(fault_result));
454
455 vm_flags map_flags = fault_vmap->vmflags;
456 switch (fault_result)
457 {
458 case VMFAULT_COMPLETE: break;
459 case VMFAULT_CANNOT_HANDLE:
460 {
461 unhandled_reason = "vmap fault handler returned VMFAULT_CANNOT_HANDLE";
462 DoUnhandledPageFault();
463 return;
464 }
465 case VMFAULT_COPY_BACKING_PAGE:
466 {
467 MOS_ASSERT(info->backing_page);
468 const phyframe_t *page = mm_get_free_page(); // will be ref'd by mm_replace_page_locked()
469 mm_copy_page(src: info->backing_page, dst: page);
470 info->backing_page = page;
471 goto map_backing_page;
472 }
473 case VMFAULT_MAP_BACKING_PAGE_RO:
474 {
475 map_flags &= ~VM_WRITE;
476 goto map_backing_page;
477 }
478 case VMFAULT_MAP_BACKING_PAGE:
479 {
480 map_backing_page:
481 if (!info->backing_page)
482 {
483 unhandled_reason = "out of memory";
484 mm_unlock_ctx_pair(ctx1: mm, NULL);
485 DoUnhandledPageFault();
486 return;
487 }
488
489 pr_dcont(pagefault, " (backing page: " PFN_FMT ")", phyframe_pfn(info->backing_page));
490 mm_replace_page_locked(mmctx: fault_vmap->mmctx, vaddr: fault_addr, phyframe_pfn(info->backing_page), flags: map_flags);
491 fault_result = VMFAULT_COMPLETE;
492 }
493 }
494
495 MOS_ASSERT_X(fault_result == VMFAULT_COMPLETE || fault_result == VMFAULT_CANNOT_HANDLE, "invalid fault result %d", fault_result);
496 if (ip_vmap)
497 spinlock_release(&ip_vmap->lock);
498 spinlock_release(&fault_vmap->lock);
499 mm_unlock_ctx_pair(ctx1: mm, NULL);
500 ipi_send_all(type: IPI_TYPE_INVALIDATE_TLB);
501 if (fault_result == VMFAULT_COMPLETE)
502 return;
503
504 DoUnhandledPageFault();
505}
506
507// ! sysfs support
508
509static bool sys_mem_mmap(sysfs_file_t *f, vmap_t *vmap, off_t offset)
510{
511 MOS_UNUSED(f);
512 // pr_info("mem: mapping " PTR_VLFMT " to " PTR_VLFMT "\n", vmap->vaddr, offset);
513 mm_do_map(top: vmap->mmctx->pgd, vaddr: vmap->vaddr, pfn: offset / MOS_PAGE_SIZE, n_pages: vmap->npages, flags: vmap->vmflags, do_refcount: false);
514 return true;
515}
516
517static bool sys_mem_munmap(sysfs_file_t *f, vmap_t *vmap, bool *unmapped)
518{
519 MOS_UNUSED(f);
520 mm_do_unmap(top: vmap->mmctx->pgd, vaddr: vmap->vaddr, n_pages: vmap->npages, do_unref: false);
521 *unmapped = true;
522 return true;
523}
524
525static sysfs_item_t sys_mem_item = SYSFS_MEM_ITEM("mem", sys_mem_mmap, sys_mem_munmap);
526
527static void mm_sysfs_init()
528{
529 sys_mem_item.mem.size = platform_info->max_pfn * MOS_PAGE_SIZE;
530 sysfs_register_root_file(item: &sys_mem_item);
531}
532
533MOS_INIT(SYSFS, mm_sysfs_init);
534