From ccb4e9ca1b3cbe3a6ce6d470dcb64b105594bbcc Mon Sep 17 00:00:00 2001
From: Atheria
Date: Sun, 2 Nov 2025 06:57:04 +0700
Subject: [PATCH 1/2] WC memops

---
 CMakeLists.txt                  |   23 +-
 arch/x86_64/features/x64.c      |   11 +-
 cmake/configuration.cmake       |    3 +-
 cmake/features.cmake            |    4 +
 drivers/ethernet/Network.c      |   15 +-
 drivers/ethernet/interface/Ip.c |    8 +-
 kernel/core/Kernel.c            |    2 +-
 mm/MemOps.c                     |   10 +
 mm/asm/memcpy.asm               | 1343 ++++++++++++++++++++++++++++++-
 9 files changed, 1386 insertions(+), 33 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3070483..1f95304 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -174,23 +174,26 @@ add_custom_target(run
         -vga vmware
         -enable-kvm
         -cdrom ${CMAKE_CURRENT_BINARY_DIR}/VoidFrame.iso
-        -debugcon file:bootstrap.log
-        -serial stdio
         -no-reboot
         -no-shutdown
         -m 4G
-        -drive file=VoidFrameDisk.img,if=ide
-        -drive file=SataDisk.img,if=none,id=sata0
-        -device ahci,id=ahci
-        -device ide-hd,drive=sata0,bus=ahci.0
         -boot d
-        -device rtl8139
-        -device e1000
+        # Debug console and serial output
+        -debugcon file:bootstrap.log
+        -serial stdio
+        -parallel file:printer.out
+        # Network configuration
+        -netdev user,id=net0 -device rtl8139,netdev=net0
+        # USB controller and tablet
         -device nec-usb-xhci,id=xhci
-        -device ich9-intel-hda
         -usb -device usb-tablet
+        # Audio device
         -audio pa,id=myaudio
+        # Disks configuration
+        -drive file=VoidFrameDisk.img,if=ide
+        -drive file=SataDisk.img,if=none,id=sata0
+        -device ahci,id=ahci
+        -device ide-hd,drive=sata0,bus=ahci.0
         -device sb16,iobase=0x220,irq=5,dma=1,dma16=5
-        -parallel file:printer.out
         -drive file=VirtioDisk.img,format=raw,id=virtio_disk,if=none
         -device virtio-blk-pci,drive=virtio_disk,disable-legacy=on
         -drive file=NVMeDisk.img,format=raw,id=nvme_disk,if=none
diff --git a/arch/x86_64/features/x64.c b/arch/x86_64/features/x64.c
index 15759e2..eccbce1 100644
--- a/arch/x86_64/features/x64.c
+++ b/arch/x86_64/features/x64.c
@@ -62,8 +62,9 @@ void CpuInit(void) {
     // Most importantly, check if the CPU supports OSXSAVE (bit 27 of ECX).
     // If this is not set, the OS is not allowed to set XCR0 to enable AVX.
     cpu_features.osxsave = (ecx >> 27) & 1;
+#ifndef VF_CONFIG_VM_HOST
     if (!cpu_features.osxsave) {
-        PrintKernelWarning("System: CPU: OSXSAVE not supported. AVX/2/512F will be disabled.\n");
+        PrintKernel("System: CPU: OSXSAVE not supported. AVX/2/512F will be disabled.\n");
        cpu_features.avx = false;
        cpu_features.avx2 = false;
        cpu_features.avx512f = false;
@@ -71,6 +72,14 @@ void CpuInit(void) {
         return;
     }
     PrintKernelSuccess("System: CPU: OSXSAVE supported.\n");
+#else
+    PrintKernel("System: CPU: VM host profile: AVX/2/512F disabled.\n");
+    cpu_features.avx = false;
+    cpu_features.avx2 = false;
+    cpu_features.avx512f = false;
+    CPUFeatureValidation();
+    return;
+#endif
 
     // --- Step 3: Enable AVX by setting the XCR0 Control Register ---
     // The OS must set bits 1 (SSE state) and 2 (AVX state) in XCR0. 
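The #ifndef VF_CONFIG_VM_HOST gate above decides whether CpuInit() may proceed to Step 3 and turn on the AVX state in XCR0. For reference, a minimal C sketch of that XCR0 sequence — assuming CR4.OSXSAVE has already been set; the helper names are illustrative, not VoidFrame APIs:

#include <stdint.h>

/* Illustrative sketch (not VoidFrame code): read XCR0 via XGETBV, set the
 * SSE and AVX state bits, and write it back via XSETBV. Both instructions
 * take the XCR index in ECX and the value in EDX:EAX. */
static inline uint64_t XGetBV(uint32_t xcr) {
    uint32_t lo, hi;
    __asm__ volatile("xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr));
    return ((uint64_t)hi << 32) | lo;
}

static inline void XSetBV(uint32_t xcr, uint64_t value) {
    __asm__ volatile("xsetbv" : : "c"(xcr),
                     "a"((uint32_t)value), "d"((uint32_t)(value >> 32)));
}

static void EnableAvxState(void) {
    uint64_t xcr0 = XGetBV(0);
    xcr0 |= (1ULL << 1) | (1ULL << 2); /* bit 1: SSE state, bit 2: AVX state */
    XSetBV(0, xcr0);
}

XSETBV raises #UD when CR4.OSXSAVE is clear, which is exactly why the OSXSAVE check above has to pass before this sequence runs.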
diff --git a/cmake/configuration.cmake b/cmake/configuration.cmake
index b8b211e..b301f52 100644
--- a/cmake/configuration.cmake
+++ b/cmake/configuration.cmake
@@ -35,4 +35,5 @@ option(VF_CONFIG_USE_CERBERUS "Use Cerberus" ON)
 option(VF_CONFIG_CERBERUS_STACK_PROTECTION "Enable Cerberus stack protection" ON)
 option(VF_CONFIG_INTEL "Enable Intel-specific optimizations" ON)
 option(VF_CONFIG_ENABLE_OPIC "Enable OPIC support" ON)
-option(VF_CONFIG_VESA_FB "Enable VESA framebuffer support" ON)
\ No newline at end of file
+option(VF_CONFIG_VESA_FB "Enable VESA framebuffer support" ON)
+option(VF_CONFIG_MEMCPY_NT "Enable non-temporal memcpy optimizations" OFF)
\ No newline at end of file
diff --git a/cmake/features.cmake b/cmake/features.cmake
index c19b07b..d166346 100644
--- a/cmake/features.cmake
+++ b/cmake/features.cmake
@@ -97,6 +97,10 @@ if(AUTOMATIC_POST)
     add_compile_definitions(VF_CONFIG_AUTOMATIC_POST)
 endif()
 
+if(VF_CONFIG_MEMCPY_NT)
+    add_compile_definitions(VF_CONFIG_MEMCPY_NT)
+endif()
+
 if(VF_SCHEDULER STREQUAL "MLFQ")
     add_compile_definitions(VF_CONFIG_SCHED_MLFQ)
 elseif(VF_SCHEDULER STREQUAL "EEVDF")
diff --git a/drivers/ethernet/Network.c b/drivers/ethernet/Network.c
index 37a651b..cdc265e 100644
--- a/drivers/ethernet/Network.c
+++ b/drivers/ethernet/Network.c
@@ -1,5 +1,6 @@
 #include "Network.h"
 #include "Console.h"
+#include "StringOps.h"
 #include "intel/E1000.h"
 #include "interface/Arp.h"
 #include "realtek/RTL8139.h"
@@ -12,7 +13,6 @@ void Net_Initialize(void) {
     g_device_count = 0;
     ArpInit();
 
-    // Try to initialize E1000
     if (E1000_Init() == 0) {
         Net_RegisterDevice("E1000", (send_packet_t)E1000_SendPacket, (get_mac_t)E1000_GetDevice, E1000_HandleReceive);
@@ -29,14 +29,7 @@ void Net_RegisterDevice(const char* name, send_packet_t sender, get_mac_t mac_ge
 void Net_RegisterDevice(const char* name, send_packet_t sender, get_mac_t mac_getter, poll_receive_t poller) {
     if (g_device_count < MAX_NETWORK_DEVICES) {
         NetworkDevice* dev = &g_network_devices[g_device_count++];
-        // Simple string copy
-        int i = 0;
-        while(name[i] != '\0' && i < 31) {
-            dev->name[i] = name[i];
-            i++;
-        }
-        dev->name[i] = '\0';
-
+        FastStrCopy(dev->name, name, sizeof(dev->name));
         dev->send_packet = sender;
         dev->get_mac_address = mac_getter;
         dev->poll_receive = poller;
@@ -48,6 +41,10 @@ void Net_RegisterDevice(const char* name, send_packet_t sender, get_mac_t mac_ge
     }
 }
 
+void Net_UnregisterDevice(void) {
+    // TODO: stub - device removal is not implemented yet
+}
+
 NetworkDevice* Net_GetDevice(int index) {
     if (index < g_device_count) {
         return &g_network_devices[index];
diff --git a/drivers/ethernet/interface/Ip.c b/drivers/ethernet/interface/Ip.c
index f4f0787..c02f747 100644
--- a/drivers/ethernet/interface/Ip.c
+++ b/drivers/ethernet/interface/Ip.c
@@ -24,10 +24,14 @@ static uint16_t IpChecksum(const void* data, size_t length) {
 }
 
 void IpSend(uint8_t dest_ip[4], uint8_t protocol, const void* data, uint16_t len) {
-    NetworkDevice* net_dev = Net_GetDevice(0);
-    if (!net_dev) {
-        PrintKernel("IP: No network device found.\n");
-        return;
+    NetworkDevice* net_dev = NULL;
+    for (int i = 0; i < MAX_NETWORK_DEVICES; i++) {
+        net_dev = Net_GetDevice(i);
+        if (net_dev) break;
+    }
+    if (!net_dev) {
+        PrintKernel("IP: No network device found.\n");
+        return;
     }
 
     uint8_t dest_mac[6];
diff --git a/kernel/core/Kernel.c b/kernel/core/Kernel.c
index 56ddb81..74ab119 100644
--- a/kernel/core/Kernel.c
+++ b/kernel/core/Kernel.c
@@ -760,7 +760,7 @@ static InitResultT PXS2(void) {
     // Load multiboot modules
     PrintKernel("Info: Loading multiboot modules...\n");
     InitRDLoad();
-    PrintKernelSuccess("System: Multiboot modules loaded\n");
+    PrintKernelSuccess("System: Multiboot modules wed\n");
 #endif
 
     PrintKernel("Info: Initializing CRC32...\n");
diff --git a/mm/MemOps.c b/mm/MemOps.c
index d57edfb..e076422 100644
--- a/mm/MemOps.c
+++ b/mm/MemOps.c
@@ -7,6 +7,10 @@ extern void* memcpy_internal_sse2(void* restrict dest, const void* restrict src,
 extern void* memcpy_internal_avx2(void* restrict dest, const void* restrict src, uint64_t size);
 extern void* memcpy_internal_avx512(void* restrict dest, const void* restrict src, uint64_t size);
 
+extern void* memcpy_internal_sse2_wc(void* restrict dest, const void* restrict src, uint64_t size);
+extern void* memcpy_internal_avx2_wc(void* restrict dest, const void* restrict src, uint64_t size);
+extern void* memcpy_internal_avx512_wc(void* restrict dest, const void* restrict src, uint64_t size);
+
 extern void* memset_internal_sse2(void* restrict dest, int value, uint64_t size);
 extern void* memset_internal_avx2(void* restrict dest, int value, uint64_t size);
 extern void* memset_internal_avx512(void* restrict dest, int value, uint64_t size);
@@ -60,9 +64,15 @@ void* FastMemcpy(void* restrict dest, const void* restrict src, uint64_t size) {
 
     const CpuFeatures * features = GetCpuFeatures();
 
+#ifdef VF_CONFIG_MEMCPY_NT
     if (features->avx512f) return memcpy_internal_avx512(d, s, size);
     if (features->avx2) return memcpy_internal_avx2(d, s, size);
     if (features->sse2) return memcpy_internal_sse2(d, s, size);
+#else
+    if (features->avx512f) return memcpy_internal_avx512_wc(d, s, size);
+    if (features->avx2) return memcpy_internal_avx2_wc(d, s, size);
+    if (features->sse2) return memcpy_internal_sse2_wc(d, s, size);
+#endif
 
     while (size--) *d++ = *s++;
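The dispatch above is the heart of the new option: with VF_CONFIG_MEMCPY_NT enabled, large copies take the non-temporal variants, which stream past the cache; with it off, the _wc variants use ordinary cacheable stores. A minimal sketch of the same trade-off with SSE2 intrinsics — illustrative only, not the kernel's asm; assumes 16-byte-aligned buffers and a length that is a multiple of 16:

#include <emmintrin.h>
#include <stdint.h>

/* Streaming copy: movntdq writes around the cache hierarchy; an sfence is
 * required afterwards so later stores are ordered after the NT stores. */
static void copy_stream(void* dst, const void* src, uint64_t n) {
    __m128i* d = (__m128i*)dst;
    const __m128i* s = (const __m128i*)src;
    for (uint64_t i = 0; i < n / 16; i++)
        _mm_stream_si128(&d[i], _mm_load_si128(&s[i]));
    _mm_sfence();
}

/* Cached copy: movdqa keeps the destination lines in cache, which wins
 * when the copied data is read again soon after. */
static void copy_cached(void* dst, const void* src, uint64_t n) {
    __m128i* d = (__m128i*)dst;
    const __m128i* s = (const __m128i*)src;
    for (uint64_t i = 0; i < n / 16; i++)
        _mm_store_si128(&d[i], _mm_load_si128(&s[i]));
}

Non-temporal stores tend to win for copies much larger than the last-level cache, or for destinations that will not be re-read soon (e.g. a framebuffer); cacheable stores win when the destination stays hot.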
diff --git a/mm/asm/memcpy.asm b/mm/asm/memcpy.asm
index 70253d2..03fd73e 100644
--- a/mm/asm/memcpy.asm
+++ b/mm/asm/memcpy.asm
@@ -4,6 +4,9 @@ global memcpy_internal_sse2
 global memcpy_internal_avx2
 global memcpy_internal_avx512
+global memcpy_internal_sse2_wc
+global memcpy_internal_avx2_wc
+global memcpy_internal_avx512_wc
 
 ; Function: memcpy_internal_sse2 (SSE2 optimized)
 ; Inputs:
 ;   rdi - pointer to memory destination
 ;   rsi - pointer to memory source
 ;   rdx - number of bytes to copy
 ; Outputs:
 ;   rax - pointer to memory destination
@@ -28,6 +31,7 @@ global memcpy_internal_avx512
 ;   rdx - number of bytes to copy
 ; Outputs:
 ;   rax - pointer to memory destination
+; snip... the same register contract applies to the _wc variants below
+; (note: despite the suffix, the _wc variants use plain cacheable stores,
+; while the variants above use non-temporal/write-combining stores)
 
 section .text
 
@@ -82,7 +86,7 @@ memcpy_internal_sse2:
     movntdq [rdi + 80], xmm5
     movntdq [rdi + 96], xmm6
     movntdq [rdi + 112], xmm7
-    
+
     add rsi, 128
     add rdi, 128
 
@@ -107,7 +111,7 @@ memcpy_internal_sse2:
     movntdq [rdi + 16], xmm1
     movntdq [rdi + 32], xmm2
     movntdq [rdi + 48], xmm3
-    
+
     add rsi, 64
     add rdi, 64
 
@@ -126,7 +130,7 @@ memcpy_internal_sse2:
     ; Use non-temporal stores
     movntdq [rdi], xmm0
     movntdq [rdi + 16], xmm1
-    
+
     add rsi, 32
     add rdi, 32
 
@@ -143,7 +147,7 @@ memcpy_internal_sse2:
     ; Use non-temporal stores
     movntdq [rdi], xmm0
-    
+
     add rsi, 16
     add rdi, 16
 
@@ -316,6 +320,288 @@ memcpy_internal_sse2:
     mov rax, rdi
     ret
 
+; Standard memcpy using SSE2 (_wc variant: plain cacheable stores)
+memcpy_internal_sse2_wc:
+    ; Save registers we will use
+    push rbx
+    push rcx
+    push rsi
+
+    ; rdi = dest
+    ; rsi = src
+    ; rdx = count
+
+    ; Check for zero count
+    test rdx, rdx
+    jz .wc_done
+
+    ; Check for small counts
+    cmp rdx, 128
+    jb .wc_copy_small
+
+    ; Check memory alignment for optimal performance
+    test rdi, 15
+    jnz .wc_copy_unaligned
+
+    test rsi, 15
+    jnz .wc_copy_unaligned
+
+    ; Aligned copy using SSE2
+    mov rcx, rdx
+    shr rcx, 7              ; count / 128
+    jz .wc_copy_64_bytes
+
+.wc_copy_128_bytes_loop:
+    ; Load 128 bytes using SSE2
+    movdqa xmm0, [rsi]
+    movdqa xmm1, [rsi + 16]
+    movdqa xmm2, [rsi + 32]
+    movdqa xmm3, [rsi + 48]
+    movdqa xmm4, [rsi + 64]
+    movdqa xmm5, [rsi + 80]
+    movdqa xmm6, [rsi + 96]
+    movdqa xmm7, [rsi + 112]
+
+    ; Use standard stores
+    movdqa [rdi], xmm0
+    movdqa [rdi + 16], xmm1
+    movdqa [rdi + 32], xmm2
+    movdqa [rdi + 48], xmm3
+    movdqa [rdi + 64], xmm4
+    movdqa [rdi + 80], xmm5
+    movdqa [rdi + 96], xmm6
+    movdqa [rdi + 112], xmm7
+
+    add rsi, 128
+    add rdi, 128
+
+    dec rcx
+    jnz .wc_copy_128_bytes_loop
+
+    ; Process remaining 64 bytes
+    mov rcx, rdx
+    shr rcx, 6              ; count / 64
+    and rcx, 1              ; Check if we have 64 bytes remaining
+    jz .wc_copy_32_bytes
+
+.wc_copy_64_bytes:
+    ; Load 64 bytes using SSE2
+    movdqa xmm0, [rsi]
+    movdqa xmm1, [rsi + 16]
+    movdqa xmm2, [rsi + 32]
+    movdqa xmm3, [rsi + 48]
+
+    ; Use standard stores
+    movdqa [rdi], xmm0
+    movdqa [rdi + 16], xmm1
+    movdqa [rdi + 32], xmm2
+    movdqa [rdi + 48], xmm3
+
+    add rsi, 64
+    add rdi, 64
+
+.wc_copy_32_bytes:
+    ; Process remaining 32 bytes
+    mov rcx, rdx
+    shr rcx, 5              ; count / 32
+    and rcx, 1              ; Check if we have 32 bytes remaining
+    jz .wc_copy_16_bytes
+
+.wc_copy_32_bytes_loop:
+    ; Load 32 bytes using SSE2
+    movdqa xmm0, [rsi]
+    movdqa xmm1, [rsi + 16]
+
+    ; Use standard stores
+    movdqa [rdi], xmm0
+    movdqa [rdi + 16], xmm1
+
+    add rsi, 32
+    add rdi, 32
+
+.wc_copy_16_bytes:
+    ; Process remaining 16 bytes
+    mov rcx, rdx
+    shr rcx, 4              ; count / 16
+    and rcx, 1              ; Check if we have 16 bytes remaining
+    jz .wc_copy_8_bytes
+
+.wc_copy_16_bytes_loop:
+    ; Load 16 bytes using SSE2
+    movdqa xmm0, [rsi]
+
+    ; Use standard stores
+    movdqa [rdi], xmm0
+
+    add rsi, 16
+    add rdi, 16
+
+.wc_copy_8_bytes:
+    ; Process remaining 8 bytes
+    mov rcx, rdx
+    shr rcx, 3              ; count / 8
+    and rcx, 1              ; Check if we have 8 bytes remaining
+    jz .wc_copy_4_bytes
+
+.wc_copy_8_bytes_loop:
+    ; Load 8 bytes
+    mov rax, [rsi]
+
+    ; Store 8 bytes
+    mov [rdi], rax
+
+    add rsi, 8
+    add rdi, 8
+
+.wc_copy_4_bytes:
+    ; Process remaining 4 bytes
+    mov rcx, rdx
+    shr rcx, 2              ; count / 4
+    and rcx, 1              ; Check if we have 4 bytes remaining
+    jz .wc_copy_2_bytes
+
+.wc_copy_4_bytes_loop:
+    ; Load 4 bytes
+    mov eax, [rsi]
+
+    ; Store 4 bytes
+    mov [rdi], eax
+
+    add rsi, 4
+    add rdi, 4
+
+.wc_copy_2_bytes:
+    ; Process remaining 2 bytes
+    mov rcx, rdx
+    shr rcx, 1              ; count / 2
+    and rcx, 1              ; Check if we have 2 bytes remaining
+    jz .wc_copy_1_byte
+
+.wc_copy_2_bytes_loop:
+    ; Load 2 bytes
+    mov ax, [rsi]
+
+    ; Store 2 bytes
+    mov [rdi], ax
+
+    add rsi, 2
+    add rdi, 2
+
+.wc_copy_1_byte:
+    ; Process remaining byte
+    and rdx, 1              ; count % 2
+    jz .wc_copy_done
+
+    ; Load 1 byte
+    mov al, [rsi]
+
+    ; Store 1 byte
+    mov [rdi], al
+
+.wc_copy_done:
+    jmp .wc_done
+
+.wc_copy_unaligned:
+    ; Unaligned copy using SSE2
+    mov rcx, rdx
+    shr rcx, 4              ; count / 16
+    jz .wc_copy_unaligned_8_bytes
+
+.wc_copy_unaligned_loop:
+    ; Load 16 bytes using unaligned SSE2
+    movdqu xmm0, [rsi]
+
+    ; Store 16 bytes using unaligned SSE2
+    movdqu [rdi], xmm0
+
+    add rsi, 16
+    add rdi, 16
+
+    dec rcx
+    jnz .wc_copy_unaligned_loop
+
+.wc_copy_unaligned_8_bytes:
+    ; Process remaining 8 bytes
+    mov rcx, rdx
+    shr rcx, 3              ; count / 8
+    and rcx, 1              ; Check if we have 8 bytes remaining
+    jz .wc_copy_unaligned_4_bytes
+
+.wc_copy_unaligned_8_bytes_loop:
+    ; Load 8 bytes
+    mov rax, [rsi]
+
+    ; Store 8 bytes
+    mov [rdi], rax
+
+    add rsi, 8
+    add rdi, 8
+
+.wc_copy_unaligned_4_bytes:
+    ; Process remaining 4 bytes
+    mov rcx, rdx
+    shr rcx, 2              ; count / 4
+    and rcx, 1              ; Check if we have 4 bytes remaining
+    jz .wc_copy_unaligned_2_bytes
+
+.wc_copy_unaligned_4_bytes_loop:
+    ; Load 4 bytes
+    mov eax, [rsi]
+
+    ; Store 4 bytes
+    mov [rdi], eax
+
+    add rsi, 4
+    add rdi, 4
+
+.wc_copy_unaligned_2_bytes:
+    ; Process remaining 2 bytes
+    mov rcx, rdx
+    shr rcx, 1              ; count / 2
+    and rcx, 1              ; Check if we have 2 bytes remaining
+    jz .wc_copy_unaligned_1_byte
+
+.wc_copy_unaligned_2_bytes_loop:
+    ; Load 2 bytes
+    mov ax, [rsi]
+
+    ; Store 2 bytes
+    mov [rdi], ax
+
+    add rsi, 2
+    add rdi, 2
+
+.wc_copy_unaligned_1_byte:
+    ; Process remaining byte
+    and rdx, 1              ; count % 2
+    jz .wc_copy_done
+
+    ; Load 1 byte
+    mov al, [rsi]
+
+    ; Store 1 byte
+    mov [rdi], al
+
+    jmp .wc_copy_done
+
+.wc_copy_small:
+    ; Copy small blocks using optimized byte-by-byte copy
+    mov rcx, rdx
+    cld
+    rep movsb
+
+    jmp .wc_done
+
+.wc_done:
+    ; Restore registers
+    pop rsi
+    pop rcx
+    pop rbx
+
+    ; Return destination pointer
+    mov rax, rdi
+    ret
+
 ; Advanced memcpy using AVX2
 memcpy_internal_avx2:
     ; Save registers we will use
@@ -642,10 +928,1053 @@ memcpy_internal_avx2:
 
     ; Return destination pointer
     mov rax, rdi
-    ret
+ret
 
-; Ultra-fast memcpy using AVX-512
-memcpy_internal_avx512:
+; Advanced memcpy using AVX2 (_wc variant: plain cacheable stores)
+memcpy_internal_avx2_wc:
+    ; Save registers we will use
+    push rbx
+    push rcx
+    push rsi
+
+    ; rdi = dest
+    ; rsi = src
+    ; rdx = count
+
+    ; Check for zero count
+    test rdx, rdx
+    jz .avx2_wc_done
+
+    ; Check for AVX2 support (save the feature bits before restoring rbx,
+    ; and rebalance the stack before tail-jumping to the fallback)
+    push rbx
+    mov eax, 7
+    xor ecx, ecx
+    cpuid
+    mov eax, ebx            ; CPUID clobbers rbx; keep the bits in eax
+    pop rbx
+    test eax, 1 << 5        ; Check AVX2 bit
+    jnz .avx2_wc_have_avx2
+    pop rsi                 ; Fall back to SSE2 WC implementation
+    pop rcx
+    pop rbx
+    jmp memcpy_internal_sse2_wc
+
+.avx2_wc_have_avx2:
+    ; Check for small counts
+    cmp rdx, 256
+    jb .avx2_wc_copy_small
+
+    ; Check memory alignment for optimal performance
+    test rdi, 31
+    jnz .avx2_wc_copy_unaligned
+
+    test rsi, 31
+    jnz .avx2_wc_copy_unaligned
+
+    ; Aligned copy using AVX2
+    mov rcx, rdx
+    shr rcx, 8              ; count / 256
+    jz .avx2_wc_copy_128_bytes
+
+.avx2_wc_copy_256_bytes_loop:
+    ; Load 256 bytes using AVX2
+    vmovdqa ymm0, [rsi]
+    vmovdqa ymm1, [rsi + 32]
+    vmovdqa ymm2, [rsi + 64]
+    vmovdqa ymm3, [rsi + 96]
+    vmovdqa ymm4, [rsi + 128]
+    vmovdqa ymm5, [rsi + 160]
+    vmovdqa ymm6, [rsi + 192]
+    vmovdqa ymm7, [rsi + 224]
+
+    ; Use standard stores
+    vmovdqa [rdi], ymm0
+    vmovdqa [rdi + 32], ymm1
+    vmovdqa [rdi + 64], ymm2
+    vmovdqa [rdi + 96], ymm3
+    vmovdqa [rdi + 128], ymm4
+    vmovdqa [rdi + 160], ymm5
+    vmovdqa [rdi + 192], ymm6
+    vmovdqa [rdi + 224], ymm7
+
+    add rsi, 256
+    add rdi, 256
+
+    dec rcx
+    jnz .avx2_wc_copy_256_bytes_loop
+
+    ; Process remaining 128 bytes
+    mov rcx, rdx
+    shr rcx, 7              ; count / 128
+    and rcx, 1              ; Check if we have 128 bytes remaining
+    jz .avx2_wc_copy_64_bytes
+
+.avx2_wc_copy_128_bytes:
+    ; Load 128 bytes using AVX2
+    vmovdqa ymm0, [rsi]
+    vmovdqa ymm1, [rsi + 32]
+    vmovdqa ymm2, [rsi + 64]
+    vmovdqa ymm3, [rsi + 96]
+
+    ; Use standard stores
+    vmovdqa [rdi], ymm0
+    vmovdqa [rdi + 32], ymm1
+    vmovdqa [rdi + 64], ymm2
+    vmovdqa [rdi + 96], ymm3
+
+    add rsi, 128
+    add rdi, 128
+
+.avx2_wc_copy_64_bytes:
+    ; Process remaining 64 bytes
+    mov rcx, rdx
+    shr rcx, 6              ; count / 64
+    and rcx, 1              ; Check if we have 64 bytes remaining
+    jz .avx2_wc_copy_32_bytes
+
+.avx2_wc_copy_64_bytes_loop:
+    ; Load 64 bytes using AVX2
+    vmovdqa ymm0, [rsi]
+    vmovdqa ymm1, [rsi + 32]
+
+    ; Use standard stores
+    vmovdqa [rdi], ymm0
+    vmovdqa [rdi + 32], ymm1
+
+    add rsi, 64
+    add rdi, 64
+
+.avx2_wc_copy_32_bytes:
+    ; Process remaining 32 bytes
+    mov rcx, rdx
+    shr rcx, 5              ; count / 32
+    and rcx, 1              ; Check if we have 32 bytes remaining
+    jz .avx2_wc_copy_16_bytes
+
+.avx2_wc_copy_32_bytes_loop:
+    ; Load 32 bytes using AVX2
+    vmovdqa ymm0, [rsi]
+
+    ; Use standard stores
+    vmovdqa [rdi], ymm0
+
+    add rsi, 32
+    add rdi, 32
+
+.avx2_wc_copy_16_bytes:
+    ; Process remaining 16 bytes
+    mov rcx, rdx
+    shr rcx, 4              ; count / 16
+    and rcx, 1              ; Check if we have 16 bytes remaining
+    jz .avx2_wc_copy_8_bytes
+
+.avx2_wc_copy_16_bytes_loop:
+    ; Load 16 bytes using SSE
+    movdqa xmm0, [rsi]
+
+    ; Use standard stores
+    movdqa [rdi], xmm0
+
+    add rsi, 16
+    add rdi, 16
+
+.avx2_wc_copy_8_bytes:
+    ; Process remaining 8 bytes
+    mov rcx, rdx
+    shr rcx, 3              ; count / 8
+    and rcx, 1              ; Check if we have 8 bytes remaining
+    jz .avx2_wc_copy_4_bytes
+
+.avx2_wc_copy_8_bytes_loop:
+    ; Load 8 bytes
+    mov rax, [rsi]
+
+    ; Store 8 bytes
+    mov [rdi], rax
+
+    add rsi, 8
+    add rdi, 8
+
+.avx2_wc_copy_4_bytes:
+    ; Process remaining 4 bytes
+    mov rcx, rdx
+    shr rcx, 2              ; count / 4
+    and rcx, 1              ; Check if we have 4 bytes remaining
+    jz .avx2_wc_copy_2_bytes
+
+.avx2_wc_copy_4_bytes_loop:
+    ; Load 4 bytes
+    mov eax, [rsi]
+
+    ; Store 4 bytes
+    mov [rdi], eax
+
+    add rsi, 4
+    add rdi, 4
+
+.avx2_wc_copy_2_bytes:
+    ; Process remaining 2 bytes
+    mov rcx, rdx
+    shr rcx, 1              ; count / 2
+    and rcx, 1              ; Check if we have 2 bytes remaining
+    jz .avx2_wc_copy_1_byte
+
+.avx2_wc_copy_2_bytes_loop:
+    ; Load 2 bytes
+    mov ax, [rsi]
+
+    ; Store 2 bytes
+    mov [rdi], ax
+
+    add rsi, 2
+    add rdi, 2
+
+.avx2_wc_copy_1_byte:
+    ; Process remaining byte
+    and rdx, 1              ; count % 2
+    jz .avx2_wc_copy_done
+
+    ; Load 1 byte
+    mov al, [rsi]
+
+    ; Store 1 byte
+    mov [rdi], al
+
+.avx2_wc_copy_done:
+    jmp .avx2_wc_done
+
+.avx2_wc_copy_unaligned:
+    ; Unaligned copy using AVX2
+    mov rcx, rdx
+    shr rcx, 5              ; count / 32
+    jz .avx2_wc_copy_unaligned_16_bytes
+
+.avx2_wc_copy_unaligned_loop:
+    ; Load 32 bytes using unaligned AVX2
+    vmovdqu ymm0, [rsi]
+
+    ; Store 32 bytes using unaligned AVX2
+    vmovdqu [rdi], ymm0
+
+    add rsi, 32
+    add rdi, 32
+
+    dec rcx
+    jnz .avx2_wc_copy_unaligned_loop
+
+.avx2_wc_copy_unaligned_16_bytes:
+    ; Process remaining 16 bytes
+    mov rcx, rdx
+    shr rcx, 4              ; count / 16
+    and rcx, 1              ; Check if we have 16 bytes remaining
+    jz .avx2_wc_copy_unaligned_8_bytes
+
+.avx2_wc_copy_unaligned_16_bytes_loop:
+    ; Load 16 bytes using unaligned SSE
+    movdqu xmm0, [rsi]
+
+    ; Store 16 bytes using unaligned SSE
+    movdqu [rdi], xmm0
+
+    add rsi, 16
+    add rdi, 16
+
+.avx2_wc_copy_unaligned_8_bytes:
+    ; Process remaining 8 bytes
+    mov rcx, rdx
+    shr rcx, 3              ; count / 8
+    and rcx, 1              ; Check if we have 8 bytes remaining
+    jz .avx2_wc_copy_unaligned_4_bytes
+
+.avx2_wc_copy_unaligned_8_bytes_loop:
+    ; Load 8 bytes
+    mov rax, [rsi]
+
+    ; Store 8 bytes
+    mov [rdi], rax
+
+    add rsi, 8
+    add rdi, 8
+
+.avx2_wc_copy_unaligned_4_bytes:
+    ; Process remaining 4 bytes
+    mov rcx, rdx
+    shr rcx, 2              ; count / 4
+    and rcx, 1              ; Check if we have 4 bytes remaining
+    jz .avx2_wc_copy_unaligned_2_bytes
+
+.avx2_wc_copy_unaligned_4_bytes_loop:
+    ; Load 4 bytes
+    mov eax, [rsi]
+
+    ; Store 4 bytes
+    mov [rdi], eax
+
+    add rsi, 4
+    add rdi, 4
+
+.avx2_wc_copy_unaligned_2_bytes:
+    ; Process remaining 2 bytes
+    mov rcx, rdx
+    shr rcx, 1              ; count / 2
+    and rcx, 1              ; Check if we have 2 bytes remaining
+    jz .avx2_wc_copy_unaligned_1_byte
+
+.avx2_wc_copy_unaligned_2_bytes_loop:
+    ; Load 2 bytes
+    mov ax, [rsi]
+
+    ; Store 2 bytes
+    mov [rdi], ax
+
+    add rsi, 2
+    add rdi, 2
+
+.avx2_wc_copy_unaligned_1_byte:
+    ; Process remaining byte
+    and rdx, 1              ; count % 2
+    jz .avx2_wc_copy_done
+
+    ; Load 1 byte
+    mov al, [rsi]
+
+    ; Store 1 byte
+    mov [rdi], al
+
+    jmp .avx2_wc_copy_done
+
+.avx2_wc_copy_small:
+    ; Copy small blocks using optimized byte-by-byte copy
+    mov rcx, rdx
+    cld
+    rep movsb
+
+    jmp .avx2_wc_done
+
+.avx2_wc_done:
+    ; Restore registers
+    pop rsi
+    pop rcx
+    pop rbx
+
+    ; Return destination pointer
+    mov rax, rdi
+    ret
+
+; Ultra-fast memcpy using AVX-512
+memcpy_internal_avx512:
+    ; Save registers we will use
+    push rbx
+    push rcx
+    push rsi
+
+    ; rdi = dest
+    ; rsi = src
+    ; rdx = count
+
+    ; Check for zero count
+    test rdx, rdx
+    jz .avx512_done
+
+    ; Check for AVX-512 support (save the feature bits before restoring
+    ; rbx, and rebalance the stack before tail-jumping to the fallback)
+    push rbx
+    mov eax, 7
+    xor ecx, ecx
+    cpuid
+    mov eax, ebx            ; CPUID clobbers rbx; keep the bits in eax
+    pop rbx
+    test eax, 1 << 16       ; Check AVX512F bit
+    jnz .avx512_have_avx512f
+    pop rsi                 ; Fall back to AVX2 implementation
+    pop rcx
+    pop rbx
+    jmp memcpy_internal_avx2
+
+.avx512_have_avx512f:
+    ; Check for small counts
+    cmp rdx, 512
+    jb .avx512_copy_small
+
+    ; Check memory alignment for optimal performance
+    test rdi, 63
+    jnz .avx512_copy_unaligned
+
+    test rsi, 63
+    jnz .avx512_copy_unaligned
+
+    ; Aligned copy using AVX-512 with non-temporal stores for large copies
+    mov rcx, rdx
+    shr rcx, 9              ; count / 512
+    jz .avx512_copy_256_bytes
+
+.avx512_copy_512_bytes_loop:
+    ; Load 512 bytes using AVX-512
+    vmovdqa64 zmm0, [rsi]
+    vmovdqa64 zmm1, [rsi + 64]
+    vmovdqa64 zmm2, [rsi + 128]
+    vmovdqa64 zmm3, [rsi + 192]
+    vmovdqa64 zmm4, [rsi + 256]
+    vmovdqa64 zmm5, [rsi + 320]
+    vmovdqa64 zmm6, [rsi + 384]
+    vmovdqa64 zmm7, [rsi + 448]
+
+    ; Use non-temporal stores to avoid polluting the cache
+    vmovntdq [rdi], zmm0
+    vmovntdq [rdi + 64], zmm1
+    vmovntdq [rdi + 128], zmm2
+    vmovntdq [rdi + 192], zmm3
+    vmovntdq [rdi + 256], zmm4
+    vmovntdq [rdi + 320], zmm5
+    vmovntdq [rdi + 384], zmm6
+    vmovntdq [rdi + 448], zmm7
+
+    add rsi, 512
+    add rdi, 512
+
+    dec rcx
+    jnz .avx512_copy_512_bytes_loop
+
+    ; Process remaining 256 bytes
+    mov rcx, rdx
+    shr rcx, 8              ; count / 256
+    and rcx, 1              ; Check if we have 256 bytes remaining
+    jz .avx512_copy_128_bytes
+
+.avx512_copy_256_bytes:
+    ; Load 256 bytes using AVX-512
+    vmovdqa64 zmm0, [rsi]
+    vmovdqa64 zmm1, [rsi + 64]
+    vmovdqa64 zmm2, [rsi + 128]
+    vmovdqa64 zmm3, [rsi + 192]
+
+    ; Use non-temporal stores
+    vmovntdq [rdi], zmm0
+    vmovntdq [rdi + 64], zmm1
+    vmovntdq [rdi + 128], zmm2
+    vmovntdq [rdi + 192], zmm3
+
+    add rsi, 256
+    add rdi, 256
+
+.avx512_copy_128_bytes:
+    ; Process remaining 128 bytes
+    mov rcx, rdx
+    shr rcx, 7              ; count / 128
+    and rcx, 1              ; Check if we have 128 bytes remaining
+    jz .avx512_copy_64_bytes
+
+.avx512_copy_128_bytes_loop:
+    ; Load 128 bytes using AVX-512
+    vmovdqa64 zmm0, [rsi]
+    vmovdqa64 zmm1, [rsi + 64]
+
+    ; Use non-temporal stores
+    vmovntdq [rdi], zmm0
+    vmovntdq [rdi + 64], zmm1
+
+    add rsi, 128
+    add rdi, 128
+
+.avx512_copy_64_bytes:
+    ; Process remaining 64 bytes
+    mov rcx, rdx
+    shr rcx, 6              ; count / 64
+    and rcx, 1              ; Check if we have 64 bytes remaining
+    jz .avx512_copy_32_bytes
+
+.avx512_copy_64_bytes_loop:
+    ; Load 64 bytes using AVX-512
+    vmovdqa64 zmm0, [rsi]
+
+    ; Use non-temporal stores
+    vmovntdq [rdi], zmm0
+
+    add rsi, 64
+    add rdi, 64
+
+.avx512_copy_32_bytes:
+    ; Process remaining 32 bytes
+    mov rcx, rdx
+    shr rcx, 5              ; count / 32
+    and rcx, 1              ; Check if we have 32 bytes remaining
+    jz .avx512_copy_16_bytes
+
+.avx512_copy_32_bytes_loop:
+    ; Load 32 bytes using AVX2
+    vmovdqa ymm0, [rsi]
+
+    ; Use non-temporal stores
+    vmovntdq [rdi], ymm0
+
+    add rsi, 32
+    add rdi, 32
+
+.avx512_copy_16_bytes:
+    ; Process remaining 16 bytes
+    mov rcx, rdx
+    shr rcx, 4              ; count / 16
+    and rcx, 1              ; Check if we have 16 bytes remaining
+    jz .avx512_copy_8_bytes
+
+.avx512_copy_16_bytes_loop:
+    ; Load 16 bytes using SSE
+    movdqa xmm0, [rsi]
+
+    ; Use non-temporal stores
+    movntdq [rdi], xmm0
+
+    add rsi, 16
+    add rdi, 16
+
+.avx512_copy_8_bytes:
+    ; Process remaining 8 bytes
+    mov rcx, rdx
+    shr rcx, 3              ; count / 8
+    and rcx, 1              ; Check if we have 8 bytes remaining
+    jz .avx512_copy_4_bytes
+
+.avx512_copy_8_bytes_loop:
+    ; Load 8 bytes
+    mov rax, [rsi]
+
+    ; Store 8 bytes
+    mov [rdi], rax
+
+    add rsi, 8
+    add rdi, 8
+
+.avx512_copy_4_bytes:
+    ; Process remaining 4 bytes
+    mov rcx, rdx
+    shr rcx, 2              ; count / 4
+    and rcx, 1              ; Check if we have 4 bytes remaining
+    jz .avx512_copy_2_bytes
+
+.avx512_copy_4_bytes_loop:
+    ; Load 4 bytes
+    mov eax, [rsi]
+
+    ; Store 4 bytes
+    mov [rdi], eax
+
+    add rsi, 4
+    add rdi, 4
+
+.avx512_copy_2_bytes:
+    ; Process remaining 2 bytes
+    mov rcx, rdx
+    shr rcx, 1              ; count / 2
+    and rcx, 1              ; Check if we have 2 bytes remaining
+    jz .avx512_copy_1_byte
+
+.avx512_copy_2_bytes_loop:
+    ; Load 2 bytes
+    mov ax, [rsi]
+
+    ; Store 2 bytes
+    mov [rdi], ax
+
+    add rsi, 2
+    add rdi, 2
+
+.avx512_copy_1_byte:
+    ; Process remaining byte
+    and rdx, 1              ; count % 2
+    jz .avx512_copy_done
+
+    ; Load 1 byte
+    mov al, [rsi]
+
+    ; Store 1 byte
+    mov [rdi], al
+
+.avx512_copy_done:
+    ; SFENCE to ensure non-temporal stores are visible
+    sfence
+
+    jmp .avx512_done
+
+.avx512_copy_unaligned:
+    ; Unaligned copy using AVX-512
+    mov rcx, rdx
+    shr rcx, 6              ; count / 64
+    jz .avx512_copy_unaligned_32_bytes
+
+.avx512_copy_unaligned_loop:
+    ; Load 64 bytes using unaligned AVX-512
+    vmovdqu64 zmm0, [rsi]
+
+    ; Store 64 bytes using unaligned AVX-512
+    vmovdqu64 [rdi], zmm0
+
+    add rsi, 64
+    add rdi, 64
+
+    dec rcx
+    jnz .avx512_copy_unaligned_loop
+
+.avx512_copy_unaligned_32_bytes:
+    ; Process remaining 32 bytes
+    mov rcx, rdx
+    shr rcx, 5              ; count / 32
+    and rcx, 1              ; Check if we have 32 bytes remaining
+    jz .avx512_copy_unaligned_16_bytes
+
+.avx512_copy_unaligned_32_bytes_loop:
+    ; Load 32 bytes using unaligned AVX2
+    vmovdqu ymm0, [rsi]
+
+    ; Store 32 bytes using unaligned AVX2
+    vmovdqu [rdi], ymm0
+
+    add rsi, 32
+    add rdi, 32
+
+.avx512_copy_unaligned_16_bytes:
+    ; Process remaining 16 bytes
+    mov rcx, rdx
+    shr rcx, 4              ; count / 16
+    and rcx, 1              ; Check if we have 16 bytes remaining
+    jz .avx512_copy_unaligned_8_bytes
+
+.avx512_copy_unaligned_16_bytes_loop:
+    ; Load 16 bytes using unaligned SSE
+    movdqu xmm0, [rsi]
+
+    ; Store 16 bytes using unaligned SSE
+    movdqu [rdi], xmm0
+
+    add rsi, 16
+    add rdi, 16
+
+.avx512_copy_unaligned_8_bytes:
+    ; Process remaining 8 bytes
+    mov rcx, rdx
+    shr rcx, 3              ; count / 8
+    and rcx, 1              ; Check if we have 8 bytes remaining
+    jz .avx512_copy_unaligned_4_bytes
+
+.avx512_copy_unaligned_8_bytes_loop:
+    ; Load 8 bytes
+    mov rax, [rsi]
+
+    ; Store 8 bytes
+    mov [rdi], rax
+
+    add rsi, 8
+    add rdi, 8
+
+.avx512_copy_unaligned_4_bytes:
+    ; Process remaining 4 bytes
+    mov rcx, rdx
+    shr rcx, 2              ; count / 4
+    and rcx, 1              ; Check if we have 4 bytes remaining
+    jz .avx512_copy_unaligned_2_bytes
+
+.avx512_copy_unaligned_4_bytes_loop:
+    ; Load 4 bytes
+    mov eax, [rsi]
+
+    ; Store 4 bytes
+    mov [rdi], eax
+
+    add rsi, 4
+    add rdi, 4
+
+.avx512_copy_unaligned_2_bytes:
+    ; Process remaining 2 bytes
+    mov rcx, rdx
+    shr rcx, 1              ; count / 2
+    and rcx, 1              ; Check if we have 2 bytes remaining
+    jz .avx512_copy_unaligned_1_byte
+
+.avx512_copy_unaligned_2_bytes_loop:
+    ; Load 2 bytes
+    mov ax, [rsi]
+
+    ; Store 2 bytes
+    mov [rdi], ax
+
+    add rsi, 2
+    add rdi, 2
+
+.avx512_copy_unaligned_1_byte:
+    ; Process remaining byte
+    and rdx, 1              ; count % 2
+    jz .avx512_copy_done
+
+    ; Load 1 byte
+    mov al, [rsi]
+
+    ; Store 1 byte
+    mov [rdi], al
+
+    jmp .avx512_copy_done
+
+.avx512_copy_small:
+    ; Copy small blocks using optimized byte-by-byte copy
+    mov rcx, rdx
+    cld
+    rep movsb
+
+    jmp .avx512_done
+
+.avx512_done:
+    ; Restore registers
+    pop rsi
+    pop rcx
+    pop rbx
+
+    ; Return destination pointer
+    mov rax, rdi
+    ret
+
+; Ultra-fast memcpy using AVX-512 (_wc variant: plain cacheable stores)
+memcpy_internal_avx512_wc:
+    ; Save registers we will use
+    push rbx
+    push rcx
+    push rsi
+
+    ; rdi = dest
+    ; rsi = src
+    ; rdx = count
+
+    ; Check for zero count
+    test rdx, rdx
+    jz .avx512_wc_done
+
+    ; Check for AVX-512 support (save the feature bits before restoring
+    ; rbx, and rebalance the stack before tail-jumping to the fallback)
+    push rbx
+    mov eax, 7
+    xor ecx, ecx
+    cpuid
+    mov eax, ebx            ; CPUID clobbers rbx; keep the bits in eax
+    pop rbx
+    test eax, 1 << 16       ; Check AVX512F bit
+    jnz .avx512_wc_have_avx512f
+    pop rsi                 ; Fall back to AVX2 WC implementation
+    pop rcx
+    pop rbx
+    jmp memcpy_internal_avx2_wc
+
+.avx512_wc_have_avx512f:
+    ; Check for small counts
+    cmp rdx, 512
+    jb .avx512_wc_copy_small
+
+    ; Check memory alignment for optimal performance
+    test rdi, 63
+    jnz .avx512_wc_copy_unaligned
+
+    test rsi, 63
+    jnz .avx512_wc_copy_unaligned
+
+    ; Aligned copy using AVX-512
+    mov rcx, rdx
+    shr rcx, 9              ; count / 512
+    jz .avx512_wc_copy_256_bytes
+
+.avx512_wc_copy_512_bytes_loop:
+    ; Load 512 bytes using AVX-512
+    vmovdqa64 zmm0, [rsi]
+    vmovdqa64 zmm1, [rsi + 64]
+    vmovdqa64 zmm2, [rsi + 128]
+    vmovdqa64 zmm3, [rsi + 192]
+    vmovdqa64 zmm4, [rsi + 256]
+    vmovdqa64 zmm5, [rsi + 320]
+    vmovdqa64 zmm6, [rsi + 384]
+    vmovdqa64 zmm7, [rsi + 448]
+
+    ; Use standard stores
+    vmovdqa64 [rdi], zmm0
+    vmovdqa64 [rdi + 64], zmm1
+    vmovdqa64 [rdi + 128], zmm2
+    vmovdqa64 [rdi + 192], zmm3
+    vmovdqa64 [rdi + 256], zmm4
+    vmovdqa64 [rdi + 320], zmm5
+    vmovdqa64 [rdi + 384], zmm6
+    vmovdqa64 [rdi + 448], zmm7
+
+    add rsi, 512
+    add rdi, 512
+
+    dec rcx
+    jnz .avx512_wc_copy_512_bytes_loop
+
+    ; Process remaining 256 bytes
+    mov rcx, rdx
+    shr rcx, 8              ; count / 256
+    and rcx, 1              ; Check if we have 256 bytes remaining
+    jz .avx512_wc_copy_128_bytes
+
+.avx512_wc_copy_256_bytes:
+    ; Load 256 bytes using AVX-512
+    vmovdqa64 zmm0, [rsi]
+    vmovdqa64 zmm1, [rsi + 64]
+    vmovdqa64 zmm2, [rsi + 128]
+    vmovdqa64 zmm3, [rsi + 192]
+
+    ; Use standard stores
+    vmovdqa64 [rdi], zmm0
+    vmovdqa64 [rdi + 64], zmm1
+    vmovdqa64 [rdi + 128], zmm2
+    vmovdqa64 [rdi + 192], zmm3
+
+    add rsi, 256
+    add rdi, 256
+
+.avx512_wc_copy_128_bytes:
+    ; Process remaining 128 bytes
+    mov rcx, rdx
+    shr rcx, 7              ; count / 128
+    and rcx, 1              ; Check if we have 128 bytes remaining
+    jz .avx512_wc_copy_64_bytes
+
+.avx512_wc_copy_128_bytes_loop:
+    ; Load 128 bytes using AVX-512
+    vmovdqa64 zmm0, [rsi]
+    vmovdqa64 zmm1, [rsi + 64]
+
+    ; Use standard stores
+    vmovdqa64 [rdi], zmm0
+    vmovdqa64 [rdi + 64], zmm1
+
+    add rsi, 128
+    add rdi, 128
+
+.avx512_wc_copy_64_bytes:
+    ; Process remaining 64 bytes
+    mov rcx, rdx
+    shr rcx, 6              ; count / 64
+    and rcx, 1              ; Check if we have 64 bytes remaining
+    jz .avx512_wc_copy_32_bytes
+
+.avx512_wc_copy_64_bytes_loop:
+    ; Load 64 bytes using AVX-512
+    vmovdqa64 zmm0, [rsi]
+
+    ; Use standard stores
+    vmovdqa64 [rdi], zmm0
+
+    add rsi, 64
+    add rdi, 64
+
+.avx512_wc_copy_32_bytes:
+    ; Process remaining 32 bytes
+    mov rcx, rdx
+    shr rcx, 5              ; count / 32
+    and rcx, 1              ; Check if we have 32 bytes remaining
+    jz .avx512_wc_copy_16_bytes
+
+.avx512_wc_copy_32_bytes_loop:
+    ; Load 32 bytes using AVX2
+    vmovdqa ymm0, [rsi]
+
+    ; Use standard stores
+    vmovdqa [rdi], ymm0
+
+    add rsi, 32
+    add rdi, 32
+
+.avx512_wc_copy_16_bytes:
+    ; Process remaining 16 bytes
+    mov rcx, rdx
+    shr rcx, 4              ; count / 16
+    and rcx, 1              ; Check if we have 16 bytes remaining
+    jz .avx512_wc_copy_8_bytes
+
+.avx512_wc_copy_16_bytes_loop:
+    ; Load 16 bytes using SSE
+    movdqa xmm0, [rsi]
+
+    ; Use standard stores
+    movdqa [rdi], xmm0
+
+    add rsi, 16
+    add rdi, 16
+
+.avx512_wc_copy_8_bytes:
+    ; Process remaining 8 bytes
+    mov rcx, rdx
+    shr rcx, 3              ; count / 8
+    and rcx, 1              ; Check if we have 8 bytes remaining
+    jz .avx512_wc_copy_4_bytes
+
+.avx512_wc_copy_8_bytes_loop:
+    ; Load 8 bytes
+    mov rax, [rsi]
+
+    ; Store 8 bytes
+    mov [rdi], rax
+
+    add rsi, 8
+    add rdi, 8
+
+.avx512_wc_copy_4_bytes:
+    ; Process remaining 4 bytes
+    mov rcx, rdx
+    shr rcx, 2              ; count / 4
+    and rcx, 1              ; Check if we have 4 bytes remaining
+    jz .avx512_wc_copy_2_bytes
+
+.avx512_wc_copy_4_bytes_loop:
+    ; Load 4 bytes
+    mov eax, [rsi]
+
+    ; Store 4 bytes
+    mov [rdi], eax
+
+    add rsi, 4
+    add rdi, 4
+
+.avx512_wc_copy_2_bytes:
+    ; Process remaining 2 bytes
+    mov rcx, rdx
+    shr rcx, 1              ; count / 2
+    and rcx, 1              ; Check if we have 2 bytes remaining
+    jz .avx512_wc_copy_1_byte
+
+.avx512_wc_copy_2_bytes_loop:
+    ; Load 2 bytes
+    mov ax, [rsi]
+
+    ; Store 2 bytes
+    mov [rdi], ax
+
+    add rsi, 2
+    add rdi, 2
+
+.avx512_wc_copy_1_byte:
+    ; Process remaining byte
+    and rdx, 1              ; count % 2
+    jz .avx512_wc_copy_done
+
+    ; Load 1 byte
+    mov al, [rsi]
+
+    ; Store 1 byte
+    mov [rdi], al
+
+.avx512_wc_copy_done:
+    jmp .avx512_wc_done
+
+.avx512_wc_copy_unaligned:
+    ; Unaligned copy using AVX-512
+    mov rcx, rdx
+    shr rcx, 6              ; count / 64
+    jz .avx512_wc_copy_unaligned_32_bytes
+
+.avx512_wc_copy_unaligned_loop:
+    ; Load 64 bytes using unaligned AVX-512
+    vmovdqu64 zmm0, [rsi]
+
+    ; Store 64 bytes using unaligned AVX-512
+    vmovdqu64 [rdi], zmm0
+
+    add rsi, 64
+    add rdi, 64
+
+    dec rcx
+    jnz .avx512_wc_copy_unaligned_loop
+
+.avx512_wc_copy_unaligned_32_bytes:
+    ; Process remaining 32 bytes
+    mov rcx, rdx
+    shr rcx, 5              ; count / 32
+    and rcx, 1              ; Check if we have 32 bytes remaining
+    jz .avx512_wc_copy_unaligned_16_bytes
+
+.avx512_wc_copy_unaligned_32_bytes_loop:
+    ; Load 32 bytes using unaligned AVX2
+    vmovdqu ymm0, [rsi]
+
+    ; Store 32 bytes using unaligned AVX2
+    vmovdqu [rdi], ymm0
+
+    add rsi, 32
+    add rdi, 32
+
+.avx512_wc_copy_unaligned_16_bytes:
+    ; Process remaining 16 bytes
+    mov rcx, rdx
+    shr rcx, 4              ; count / 16
+    and rcx, 1              ; Check if we have 16 bytes remaining
+    jz .avx512_wc_copy_unaligned_8_bytes
+
+.avx512_wc_copy_unaligned_16_bytes_loop:
+    ; Load 16 bytes using unaligned SSE
+    movdqu xmm0, [rsi]
+
+    ; Store 16 bytes using unaligned SSE
+    movdqu [rdi], xmm0
+
+    add rsi, 16
+    add rdi, 16
+
+.avx512_wc_copy_unaligned_8_bytes:
+    ; Process remaining 8 bytes
+    mov rcx, rdx
+    shr rcx, 3              ; count / 8
+    and rcx, 1              ; Check if we have 8 bytes remaining
+    jz .avx512_wc_copy_unaligned_4_bytes
+
+.avx512_wc_copy_unaligned_8_bytes_loop:
+    ; Load 8 bytes
+    mov rax, [rsi]
+
+    ; Store 8 bytes
+    mov [rdi], rax
+
+    add rsi, 8
+    add rdi, 8
+
+.avx512_wc_copy_unaligned_4_bytes:
+    ; Process remaining 4 bytes
+    mov rcx, rdx
+    shr rcx, 2              ; count / 4
+    and rcx, 1              ; Check if we have 4 bytes remaining
+    jz .avx512_wc_copy_unaligned_2_bytes
+
+.avx512_wc_copy_unaligned_4_bytes_loop:
+    ; Load 4 bytes
+    mov eax, [rsi]
+
+    ; Store 4 bytes
+    mov [rdi], eax
+
+    add rsi, 4
+    add rdi, 4
+
+.avx512_wc_copy_unaligned_2_bytes:
+    ; Process remaining 2 bytes
+    mov rcx, rdx
+    shr rcx, 1              ; count / 2
+    and rcx, 1              ; Check if we have 2 bytes remaining
+    jz .avx512_wc_copy_unaligned_1_byte
+
+.avx512_wc_copy_unaligned_2_bytes_loop:
+    ; Load 2 bytes
+    mov ax, [rsi]
+
+    ; Store 2 bytes
+    mov [rdi], ax
+
+    add rsi, 2
+    add rdi, 2
+
+.avx512_wc_copy_unaligned_1_byte:
+    ; Process remaining byte
+    and rdx, 1              ; count % 2
+    jz .avx512_wc_copy_done
+
+    ; Load 1 byte
+    mov al, [rsi]
+
+    ; Store 1 byte
+    mov [rdi], al
+
+    jmp .avx512_wc_copy_done
+
+.avx512_wc_copy_small:
+    ; Copy small blocks using optimized byte-by-byte copy
+    mov rcx, rdx
+    cld
+    rep movsb
+
+    jmp .avx512_wc_done
+
+.avx512_wc_done:
+    ; Restore registers
+    pop rsi
+    pop rcx
+    pop rbx
+
+    ; Return destination pointer
+    mov rax, rdi
+    ret
+
     ; Save registers we will use
     push rbx
     push rcx
     push rsi

From c9ccaf593278bc18ad9cefc28a4125c9668b6989 Mon Sep 17 00:00:00 2001
From: Atheria
Date: Sun, 2 Nov 2025 06:59:00 +0700
Subject: [PATCH 2/2] WC memops

---
 README.md            | 3 +--
 kernel/core/Kernel.c | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index ea183fb..60111ea 100644
--- a/README.md
+++ b/README.md
@@ -47,8 +47,7 @@ mkdir build
 cd build
 cmake .. -DCMAKE_BUILD_TYPE=Release \
       -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchain/linux-x64.cmake \
-      -G Ninja \
-      -DVF_SCHEDULER=EEVDF
+      -G Ninja
 ccmake . # Optional, tune as needed
 ninja -j$(nproc)
 ninja run
diff --git a/kernel/core/Kernel.c b/kernel/core/Kernel.c
index 74ab119..7a9f920 100644
--- a/kernel/core/Kernel.c
+++ b/kernel/core/Kernel.c
@@ -760,7 +760,7 @@ static InitResultT PXS2(void) {
     // Load multiboot modules
     PrintKernel("Info: Loading multiboot modules...\n");
     InitRDLoad();
-    PrintKernelSuccess("System: Multiboot modules wed\n");
+    PrintKernelSuccess("System: Multiboot modules loaded\n");
 #endif
 
     PrintKernel("Info: Initializing CRC32...\n");
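Because the series swaps FastMemcpy's store strategy behind a build flag, a correctness smoke test is cheap insurance when toggling VF_CONFIG_MEMCPY_NT. A hypothetical in-kernel check — FastMemcpy and PrintKernel are the kernel's own (per mm/MemOps.c and the console calls above), while the test function, sizes, and buffers are illustrative:

#include <stdint.h>

/* Declarations as in the kernel (MemOps.h / Console.h are assumed). */
extern void* FastMemcpy(void* restrict dest, const void* restrict src, uint64_t size);
extern void PrintKernel(const char* str);

/* Hypothetical smoke test: verify FastMemcpy byte-for-byte across sizes
 * that hit every tail path (1..16 bytes) and the wide-register loops. */
void TestFastMemcpy(void) {
    static uint8_t src[1024], dst[1024];
    static const uint64_t sizes[] = {1, 2, 3, 4, 7, 8, 15, 16, 31, 32,
                                     63, 64, 127, 128, 255, 256, 513, 1024};
    for (uint64_t t = 0; t < sizeof(sizes) / sizeof(sizes[0]); t++) {
        const uint64_t n = sizes[t];
        for (uint64_t i = 0; i < n; i++) src[i] = (uint8_t)(i * 31 + t);
        for (uint64_t i = 0; i < n; i++) dst[i] = 0;
        FastMemcpy(dst, src, n);
        for (uint64_t i = 0; i < n; i++) {
            if (dst[i] != src[i]) {
                PrintKernel("MemOps: FastMemcpy self-test FAILED\n");
                return;
            }
        }
    }
    PrintKernel("MemOps: FastMemcpy self-test passed\n");
}

Running it once with -DVF_CONFIG_MEMCPY_NT=ON and once with it OFF exercises both store paths, including every tail size the asm handles.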