1// SPDX-License-Identifier: GPL-3.0-or-later
2
3#include "mos/mm/mm.h"
4
5#include "mos/filesystem/sysfs/sysfs.h"
6#include "mos/interrupt/ipi.h"
7#include "mos/mm/paging/paging.h"
8#include "mos/mm/paging/pmlx/pml5.h"
9#include "mos/mm/paging/table_ops.h"
10#include "mos/mm/physical/pmm.h"
11#include "mos/mm/slab_autoinit.h"
12#include "mos/platform/platform.h"
13#include "mos/platform/platform_defs.h"
14#include "mos/syslog/printk.h"
15#include "mos/tasks/signal.h"
16#include "mos/tasks/task_types.h"
17
18#include <mos/lib/structures/list.h>
19#include <mos/lib/sync/spinlock.h>
20#include <mos/mos_global.h>
21#include <mos_stdlib.h>
22#include <mos_string.h>
23
24#if MOS_CONFIG(MOS_MM_DETAILED_MMAPS_UNHANDLED_FAULT)
25#include "mos/tasks/process.h"
26#endif
27
// Slab cache that every vmap_t in this file is allocated from (see the
// kmalloc(vmap_cache) calls); registered through SLAB_AUTOINIT.
static slab_t *vmap_cache = NULL;
SLAB_AUTOINIT("vmap", vmap_cache, vmap_t);

// Slab cache that mm_context_t objects are allocated from (see
// mm_create_context()); registered through SLAB_AUTOINIT.
static slab_t *mm_context_cache = NULL;
SLAB_AUTOINIT("mm_context", mm_context_cache, mm_context_t);
33
34phyframe_t *mm_get_free_page_raw(void)
35{
36 phyframe_t *frame = pmm_allocate_frames(n_frames: 1, flags: PMM_ALLOC_NORMAL);
37 if (!frame)
38 {
39 pr_emerg("failed to allocate a page");
40 return NULL;
41 }
42
43 return frame;
44}
45
46phyframe_t *mm_get_free_page(void)
47{
48 phyframe_t *frame = mm_get_free_page_raw();
49 if (!frame)
50 return NULL;
51 memzero(s: (void *) phyframe_va(frame), MOS_PAGE_SIZE);
52 return frame;
53}
54
55phyframe_t *mm_get_free_pages(size_t npages)
56{
57 phyframe_t *frame = pmm_allocate_frames(n_frames: npages, flags: PMM_ALLOC_NORMAL);
58 if (!frame)
59 {
60 pr_emerg("failed to allocate %zd pages", npages);
61 return NULL;
62 }
63
64 return frame;
65}
66
67mm_context_t *mm_create_context(void)
68{
69 mm_context_t *mmctx = kmalloc(mm_context_cache);
70 linked_list_init(head_node: &mmctx->mmaps);
71
72 pml4_t pml4 = pml_create_table(pml4);
73
74 // map the upper half of the address space to the kernel
75 for (int i = pml4_index(MOS_KERNEL_START_VADDR); i < PML4_ENTRIES; i++)
76 pml4.table[i] = platform_info->kernel_mm->pgd.max.next.table[i];
77
78 mmctx->pgd = pgd_create(pml4);
79
80 return mmctx;
81}
82
83void mm_destroy_context(mm_context_t *mmctx)
84{
85 MOS_ASSERT(mmctx != platform_info->kernel_mm); // you can't destroy the kernel mmctx
86 MOS_ASSERT(list_is_empty(&mmctx->mmaps));
87
88 ptr_t zero = 0;
89 size_t userspace_npages = (MOS_USER_END_VADDR + 1) / MOS_PAGE_SIZE;
90 const bool freed = pml5_destroy_range(pml5: mmctx->pgd.max, vaddr: &zero, n_pages: &userspace_npages);
91 MOS_ASSERT_X(freed, "failed to free the entire userspace");
92 kfree(ptr: mmctx);
93}
94
95void mm_lock_ctx_pair(mm_context_t *ctx1, mm_context_t *ctx2)
96{
97 if (ctx1 == ctx2 || ctx2 == NULL)
98 spinlock_acquire(&ctx1->mm_lock);
99 else if (ctx1 < ctx2)
100 {
101 spinlock_acquire(&ctx1->mm_lock);
102 spinlock_acquire(&ctx2->mm_lock);
103 }
104 else
105 {
106 spinlock_acquire(&ctx2->mm_lock);
107 spinlock_acquire(&ctx1->mm_lock);
108 }
109}
110
111void mm_unlock_ctx_pair(mm_context_t *ctx1, mm_context_t *ctx2)
112{
113 if (ctx1 == ctx2 || ctx2 == NULL)
114 spinlock_release(&ctx1->mm_lock);
115 else if (ctx1 < ctx2)
116 {
117 spinlock_release(&ctx2->mm_lock);
118 spinlock_release(&ctx1->mm_lock);
119 }
120 else
121 {
122 spinlock_release(&ctx1->mm_lock);
123 spinlock_release(&ctx2->mm_lock);
124 }
125}
126
127mm_context_t *mm_switch_context(mm_context_t *new_ctx)
128{
129 mm_context_t *old_ctx = current_cpu->mm_context;
130 if (old_ctx == new_ctx)
131 return old_ctx;
132
133 platform_switch_mm(new_mm: new_ctx);
134 current_cpu->mm_context = new_ctx;
135 return old_ctx;
136}
137
138static void do_attach_vmap(mm_context_t *mmctx, vmap_t *vmap)
139{
140 MOS_ASSERT(spinlock_is_locked(&mmctx->mm_lock));
141 MOS_ASSERT_X(list_is_empty(list_node(vmap)), "vmap is already attached to something");
142 MOS_ASSERT(vmap->mmctx == NULL || vmap->mmctx == mmctx);
143
144 vmap->mmctx = mmctx;
145
146 // add to the list, sorted by address
147 list_foreach(vmap_t, m, mmctx->mmaps)
148 {
149 if (m->vaddr > vmap->vaddr)
150 {
151 list_insert_before(m, vmap);
152 return;
153 }
154 }
155
156 list_node_append(head: &mmctx->mmaps, list_node(vmap)); // append at the end
157}
158
159vmap_t *vmap_create(mm_context_t *mmctx, ptr_t vaddr, size_t npages)
160{
161 MOS_ASSERT_X(mmctx != platform_info->kernel_mm, "you can't create vmaps in the kernel mmctx");
162 vmap_t *map = kmalloc(vmap_cache);
163 linked_list_init(list_node(map));
164 spinlock_acquire(&map->lock);
165 map->vaddr = vaddr;
166 map->npages = npages;
167 do_attach_vmap(mmctx, vmap: map);
168 return map;
169}
170
171void vmap_destroy(vmap_t *vmap)
172{
173 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
174 mm_context_t *const mm = vmap->mmctx;
175 MOS_ASSERT(spinlock_is_locked(&mm->mm_lock));
176 if (vmap->io)
177 {
178 bool unmapped = false;
179 if (!io_munmap(io: vmap->io, vmap, unmapped: &unmapped))
180 pr_warn("munmap: could not unmap the file: io_munmap() failed");
181
182 if (unmapped)
183 goto unmapped;
184 }
185 mm_do_unmap(top: mm->pgd, vaddr: vmap->vaddr, n_pages: vmap->npages, do_unref: true);
186
187unmapped:
188 list_remove(vmap);
189 kfree(ptr: vmap);
190}
191
192vmap_t *vmap_obtain(mm_context_t *mmctx, ptr_t vaddr, size_t *out_offset)
193{
194 MOS_ASSERT(spinlock_is_locked(&mmctx->mm_lock));
195
196 list_foreach(vmap_t, m, mmctx->mmaps)
197 {
198 if (m->vaddr <= vaddr && vaddr < m->vaddr + m->npages * MOS_PAGE_SIZE)
199 {
200 spinlock_acquire(&m->lock);
201 if (out_offset)
202 *out_offset = vaddr - m->vaddr;
203 return m;
204 }
205 }
206
207 if (out_offset)
208 *out_offset = 0;
209 return NULL;
210}
211
212vmap_t *vmap_split(vmap_t *first, size_t split)
213{
214 MOS_ASSERT(spinlock_is_locked(&first->lock));
215 MOS_ASSERT(split && split < first->npages);
216
217 vmap_t *second = kmalloc(vmap_cache);
218 *second = *first; // copy the whole structure
219 linked_list_init(list_node(second)); // except for the list node
220
221 first->npages = split; // shrink the first vmap
222 second->npages -= split;
223 second->vaddr += split * MOS_PAGE_SIZE;
224 if (first->io)
225 {
226 second->io = io_ref(io: first->io); // ref the io again
227 second->io_offset += split * MOS_PAGE_SIZE;
228 }
229
230 do_attach_vmap(mmctx: first->mmctx, vmap: second);
231 return second;
232}
233
234vmap_t *vmap_split_for_range(vmap_t *vmap, size_t rstart_pgoff, size_t rend_pgoff)
235{
236 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
237
238 /// |-------|-------|-------|
239 /// |begin |rstart |rend |end
240 /// |-------|-------|-------|
241
242 if (rstart_pgoff == 0 && rend_pgoff == vmap->npages)
243 return vmap;
244
245 if (rstart_pgoff == 0)
246 return vmap_split(first: vmap, split: rend_pgoff);
247
248 if (rend_pgoff == vmap->npages)
249 return vmap_split(first: vmap, split: rstart_pgoff);
250
251 vmap_t *second = vmap_split(first: vmap, split: rstart_pgoff);
252 vmap_t *third = vmap_split(first: second, split: rend_pgoff - rstart_pgoff);
253 spinlock_release(&third->lock);
254 return second;
255}
256
257void vmap_finalise_init(vmap_t *vmap, vmap_content_t content, vmap_type_t type)
258{
259 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
260 MOS_ASSERT_X(content != VMAP_UNKNOWN, "vmap content cannot be unknown");
261 MOS_ASSERT_X(vmap->content == VMAP_UNKNOWN || vmap->content == content, "vmap is already setup");
262
263 vmap->content = content;
264 vmap->type = type;
265 spinlock_release(&vmap->lock);
266}
267
268void mm_copy_page(const phyframe_t *src, const phyframe_t *dst)
269{
270 memcpy(dest: (void *) phyframe_va(dst), src: (void *) phyframe_va(src), MOS_PAGE_SIZE);
271}
272
273vmfault_result_t mm_resolve_cow_fault(vmap_t *vmap, ptr_t fault_addr, pagefault_t *info)
274{
275 MOS_ASSERT(spinlock_is_locked(&vmap->lock));
276 MOS_ASSERT(info->is_write && info->is_present);
277
278 // fast path to handle CoW
279 phyframe_t *page = mm_get_free_page();
280 mm_copy_page(src: info->faulting_page, dst: page);
281 mm_replace_page_locked(mmctx: vmap->mmctx, vaddr: fault_addr, phyframe_pfn(page), flags: vmap->vmflags);
282
283 return VMFAULT_COMPLETE;
284}
285
286static void invalid_page_fault(ptr_t fault_addr, vmap_t *faulting_vmap, vmap_t *ip_vmap, pagefault_t *info, const char *unhandled_reason)
287{
288 pr_emerg("unhandled page fault: %s", unhandled_reason);
289#if MOS_CONFIG(MOS_MM_DETAILED_UNHANDLED_FAULT)
290 pr_emerg(" invalid %s mode %s %s page [" PTR_FMT "]", //
291 info->is_user ? "user" : "kernel", //
292 info->is_write ? "write to" : (info->is_exec ? "execute in" : "read from"), //
293 info->is_present ? "present" : "non-present", //
294 fault_addr //
295 );
296
297 pr_emerg(" instruction: " PTR_FMT, info->ip);
298 if (ip_vmap)
299 {
300 pr_emerg(" vmap: %pvm", (void *) ip_vmap);
301 pr_emerg(" offset: 0x%zx", info->ip - ip_vmap->vaddr + (ip_vmap->io ? ip_vmap->io_offset : 0));
302 }
303
304 pr_emerg(" thread: %pt", (void *) current_thread);
305 pr_emerg(" process: %pp", current_thread ? (void *) current_process : NULL);
306
307 if (fault_addr < 1 KB)
308 {
309 if (info->is_write)
310 pr_emerg(" possible write to NULL pointer");
311 else if (info->is_exec && fault_addr == 0)
312 pr_emerg(" attempted to execute NULL pointer");
313 else
314 pr_emerg(" possible NULL pointer dereference");
315 }
316
317 if (info->is_user && fault_addr > MOS_KERNEL_START_VADDR)
318 pr_emerg(" kernel address dereference");
319
320 if (info->ip > MOS_KERNEL_START_VADDR)
321 pr_emerg(" in kernel function %ps", (void *) info->ip);
322
323 if (faulting_vmap)
324 {
325 pr_emerg(" in vmap: %pvm", (void *) faulting_vmap);
326 pr_emerg(" offset: 0x%zx", fault_addr - faulting_vmap->vaddr + (faulting_vmap->io ? faulting_vmap->io_offset : 0));
327 }
328
329 if (faulting_vmap)
330 spinlock_release(&faulting_vmap->lock);
331
332 if (ip_vmap)
333 spinlock_release(&ip_vmap->lock);
334
335 if (current_thread)
336 spinlock_release(&current_thread->owner->mm->mm_lock);
337
338#if MOS_CONFIG(MOS_MM_DETAILED_MMAPS_UNHANDLED_FAULT)
339 if (current_thread)
340 process_dump_mmaps(current_process);
341#endif
342
343 pr_info("stack trace before fault (may be unreliable):");
344 platform_dump_stack(regs: info->regs);
345
346 pr_info("register states before fault:");
347 platform_dump_regs(regs: info->regs);
348 pr_cont("\n");
349#else
350 MOS_UNUSED(fault_addr);
351 MOS_UNUSED(info);
352#endif
353
354 if (current_thread)
355 {
356 signal_send_to_thread(current_thread, SIGSEGV);
357 signal_exit_to_user_prepare(regs: info->regs);
358 }
359 else
360 {
361 MOS_ASSERT(!"unhandled kernel page fault");
362 }
363}
364
365void mm_handle_fault(ptr_t fault_addr, pagefault_t *info)
366{
367 thread_t *current = current_thread;
368 const char *unhandled_reason = NULL;
369
370 pr_demph(pagefault, "%s #PF: %pt, %pp, IP=" PTR_VLFMT ", ADDR=" PTR_VLFMT, //
371 info->is_user ? "user" : "kernel", //
372 current ? (void *) current : NULL, //
373 current ? (void *) current->owner : NULL, //
374 info->ip, //
375 fault_addr //
376 );
377
378 if (info->is_write && info->is_exec)
379 mos_panic("Cannot write and execute at the same time");
380
381 size_t offset = 0;
382 vmap_t *fault_vmap = NULL;
383 vmap_t *ip_vmap = NULL;
384
385 if (!current_mm)
386 {
387 unhandled_reason = "no mm context";
388 goto unhandled_fault;
389 }
390
391 mm_context_t *const mm = current_mm;
392 mm_lock_ctx_pair(ctx1: mm, NULL);
393
394 fault_vmap = vmap_obtain(mmctx: mm, vaddr: fault_addr, out_offset: &offset);
395 if (!fault_vmap)
396 {
397 ip_vmap = vmap_obtain(mmctx: mm, vaddr: info->ip, NULL);
398 unhandled_reason = "page fault in unmapped area";
399 mm_unlock_ctx_pair(ctx1: mm, NULL);
400 goto unhandled_fault;
401 }
402 ip_vmap = MOS_IN_RANGE(info->ip, fault_vmap->vaddr, fault_vmap->vaddr + fault_vmap->npages * MOS_PAGE_SIZE) ? fault_vmap : vmap_obtain(mmctx: mm, vaddr: info->ip, NULL);
403
404 MOS_ASSERT_X(fault_vmap->on_fault, "vmap %pvm has no fault handler", (void *) fault_vmap);
405 const vm_flags page_flags = mm_do_get_flags(max: fault_vmap->mmctx->pgd, vaddr: fault_addr);
406
407 if (info->is_exec && !(fault_vmap->vmflags & VM_EXEC))
408 {
409 unhandled_reason = "page fault in non-executable vmap";
410 mm_unlock_ctx_pair(ctx1: mm, NULL);
411 goto unhandled_fault;
412 }
413 else if (info->is_present && info->is_exec && fault_vmap->vmflags & VM_EXEC && !(page_flags & VM_EXEC))
414 {
415 // vmprotect has been called on this vmap to enable execution
416 // we need to make sure that the page is executable
417 mm_do_flag(top: fault_vmap->mmctx->pgd, vaddr: fault_addr, n_pages: 1, flags: page_flags | VM_EXEC);
418 mm_unlock_ctx_pair(ctx1: mm, NULL);
419 spinlock_release(&fault_vmap->lock);
420 if (ip_vmap)
421 spinlock_release(&ip_vmap->lock);
422 return;
423 }
424
425 if (info->is_write && !(fault_vmap->vmflags & VM_WRITE))
426 {
427 unhandled_reason = "page fault in read-only vmap";
428 mm_unlock_ctx_pair(ctx1: mm, NULL);
429 goto unhandled_fault;
430 }
431
432 if (info->is_present)
433 info->faulting_page = pfn_phyframe(mm_do_get_pfn(fault_vmap->mmctx->pgd, fault_addr));
434
435 static const char *const fault_result_names[] = {
436 [VMFAULT_COMPLETE] = "COMPLETE",
437 [VMFAULT_COPY_BACKING_PAGE] = "COPY_BACKING_PAGE",
438 [VMFAULT_MAP_BACKING_PAGE] = "MAP_BACKING_PAGE",
439 [VMFAULT_MAP_BACKING_PAGE_RO] = "MAP_BACKING_PAGE_RO",
440 [VMFAULT_CANNOT_HANDLE] = "CANNOT_HANDLE",
441 };
442
443 pr_dcont(pagefault, ", handler %ps", (void *) (ptr_t) fault_vmap->on_fault);
444 vmfault_result_t fault_result = fault_vmap->on_fault(fault_vmap, fault_addr, info);
445 pr_dcont(pagefault, " -> %s", fault_result_names[fault_result]);
446
447 vm_flags map_flags = fault_vmap->vmflags;
448 switch (fault_result)
449 {
450 case VMFAULT_COMPLETE: break;
451 case VMFAULT_CANNOT_HANDLE:
452 {
453 unhandled_reason = "vmap fault handler returned VMFAULT_CANNOT_HANDLE";
454 goto unhandled_fault;
455 }
456 case VMFAULT_COPY_BACKING_PAGE:
457 {
458 MOS_ASSERT(info->backing_page && !IS_ERR(info->backing_page));
459 const phyframe_t *page = mm_get_free_page(); // will be ref'd by mm_replace_page_locked()
460 mm_copy_page(src: info->backing_page, dst: page);
461 info->backing_page = page;
462 goto map_backing_page;
463 }
464 case VMFAULT_MAP_BACKING_PAGE_RO:
465 {
466 map_flags &= ~VM_WRITE;
467 goto map_backing_page;
468 }
469 case VMFAULT_MAP_BACKING_PAGE:
470 {
471 map_backing_page:
472 if (!info->backing_page)
473 {
474 unhandled_reason = "out of memory";
475 mm_unlock_ctx_pair(ctx1: mm, NULL);
476 goto unhandled_fault;
477 }
478
479 pr_dcont(pagefault, " (backing page: " PFN_FMT ")", phyframe_pfn(info->backing_page));
480 mm_replace_page_locked(mmctx: fault_vmap->mmctx, vaddr: fault_addr, phyframe_pfn(info->backing_page), flags: map_flags);
481 fault_result = VMFAULT_COMPLETE;
482 }
483 }
484
485 MOS_ASSERT_X(fault_result == VMFAULT_COMPLETE || fault_result == VMFAULT_CANNOT_HANDLE, "invalid fault result %d", fault_result);
486 if (ip_vmap)
487 spinlock_release(&ip_vmap->lock);
488 spinlock_release(&fault_vmap->lock);
489 mm_unlock_ctx_pair(ctx1: mm, NULL);
490 ipi_send_all(type: IPI_TYPE_INVALIDATE_TLB);
491 if (fault_result == VMFAULT_COMPLETE)
492 return;
493
494// if we get here, the fault was not handled
495unhandled_fault:
496 MOS_ASSERT_X(unhandled_reason, "unhandled fault with no reason");
497 invalid_page_fault(fault_addr, faulting_vmap: fault_vmap, ip_vmap, info, unhandled_reason);
498}
499
500// ! sysfs support
501
502static bool sys_mem_mmap(sysfs_file_t *f, vmap_t *vmap, off_t offset)
503{
504 MOS_UNUSED(f);
505 // pr_info("mem: mapping " PTR_VLFMT " to " PTR_VLFMT "\n", vmap->vaddr, offset);
506 mm_do_map(top: vmap->mmctx->pgd, vaddr: vmap->vaddr, pfn: offset / MOS_PAGE_SIZE, n_pages: vmap->npages, flags: vmap->vmflags, do_refcount: false);
507 return true;
508}
509
510static bool sys_mem_munmap(sysfs_file_t *f, vmap_t *vmap, bool *unmapped)
511{
512 MOS_UNUSED(f);
513 mm_do_unmap(top: vmap->mmctx->pgd, vaddr: vmap->vaddr, n_pages: vmap->npages, do_unref: false);
514 *unmapped = true;
515 return true;
516}
517
518static sysfs_item_t sys_mem_item = SYSFS_MEM_ITEM("mem", sys_mem_mmap, sys_mem_munmap);
519
520static void mm_sysfs_init()
521{
522 sys_mem_item.mem.size = platform_info->max_pfn * MOS_PAGE_SIZE;
523 sysfs_register_root_file(item: &sys_mem_item);
524}
525
526MOS_INIT(SYSFS, mm_sysfs_init);
527