:: commit 57ae8f017aa8c0647facba1e52db9beaeab484cf

lihanrui2913 <lihanrui20091103@126.com> — 2026-02-15 02:29

parents: 7f8f59f876

loongarch64: Add smp support

diff --git a/common/lib/acpi.h b/common/lib/acpi.h
index d0e68031..40c93643 100644
--- a/common/lib/acpi.h
+++ b/common/lib/acpi.h
@@ -171,6 +171,17 @@ struct madt_riscv_intc {
 #define MADT_RISCV_INTC_ENABLED        ((uint32_t)1 << 0)
 #define MADT_RISCV_INTC_ONLINE_CAPABLE ((uint32_t)1 << 1)
 
+struct madt_core_pic { // MADT entry describing one LoongArch core; smp.c selects it by entry type 17
+    struct madt_header header; // common MADT entry header (layout defined elsewhere in this file)
+    uint8_t  version;
+    uint32_t acpi_processor_uid; // copied into limine_mp_info.processor_id by the ACPI SMP path
+    uint32_t core_id; // compared with the BSP's CPUID CSR value and used as the IPI target
+    uint32_t flags; // tested against the MADT_CORE_PIC_* bits below
+} __attribute__((packed)); // packed: no padding between the 1-byte version and the 32-bit fields
+
+#define MADT_CORE_PIC_ENABLED        ((uint32_t)1 << 0) // flags bit 0
+#define MADT_CORE_PIC_ONLINE_CAPABLE ((uint32_t)1 << 1) // flags bit 1; NOTE(review): confirm the Core PIC flags define this bit
+
 uint8_t acpi_checksum(void *ptr, size_t size);
 void   *acpi_get_rsdp(void);
 
diff --git a/common/mm/vmm.h b/common/mm/vmm.h
index 01cf7e6b..f6402f25 100644
--- a/common/mm/vmm.h
+++ b/common/mm/vmm.h
@@ -123,11 +123,20 @@ void map_page(pagemap_t pagemap, uint64_t virt_addr, uint64_t phys_addr, uint64_
 
 #elif defined (__loongarch64)
 
-#define paging_mode_va_bits(mode) 48
+static inline uint32_t read_cpucfg(uint32_t reg) {
+    uint32_t word = 0; // replaced by the instruction's output
+    asm volatile (
+        "cpucfg %0, %1\n\t"
+        : "=r" (word)
+        : "r" (reg));
+    return word; // configuration word selected by reg
+}
+
+#define paging_mode_va_bits(mode) (((read_cpucfg(0x1) >> 12) & 0xFF) + 1) /* 8-bit field at bits 19:12 of CPUCFG word 1, plus one; `mode` is ignored. Presumably the VALEN field -- confirm against the ISA manual */
 
 static inline uint64_t paging_mode_higher_half(int paging_mode) {
     (void)paging_mode;
-    return 0xffff800000000000;
+    return 0UL - (1UL << (paging_mode_va_bits(paging_mode) - 1)); // two's-complement negation of 2^(va_bits-1); 48 VA bits give the previous constant 0xffff800000000000 (assumes 64-bit unsigned long)
 }
 
 // We use fake flags here because these don't properly map onto the
diff --git a/common/protos/limine.c b/common/protos/limine.c
index 959261fb..140fe5b7 100644
--- a/common/protos/limine.c
+++ b/common/protos/limine.c
@@ -1505,8 +1505,8 @@ FEAT_START
 #elif defined (__riscv)
     mp_info = init_smp(&cpu_count, pagemap, direct_map_offset);
 #elif defined (__loongarch64)
-    cpu_count = 0;
-    mp_info = NULL; // TODO: LoongArch MP
+    uint32_t bsp_phys_id;
+    mp_info = init_smp(&cpu_count, &bsp_phys_id, pagemap, direct_map_offset);
 #else
 #error Unknown architecture
 #endif
@@ -1529,6 +1529,9 @@ FEAT_START
             continue;
         }
 #elif defined (__loongarch64)
+        if (mp_info[i].phys_id == bsp_phys_id) {
+            continue;
+        }
 #else
 #error Unknown architecture
 #endif
@@ -1549,6 +1552,7 @@ FEAT_START
 #elif defined (__riscv)
     mp_response->bsp_hartid = bsp_hartid;
 #elif defined (__loongarch64)
+    mp_response->bsp_phys_id = bsp_phys_id;
 #else
 #error Unknown architecture
 #endif
diff --git a/common/sys/cpu.h b/common/sys/cpu.h
index 5ab2757e..f9420c71 100644
--- a/common/sys/cpu.h
+++ b/common/sys/cpu.h
@@ -374,6 +374,92 @@ void init_riscv(const char *config);
 
 #elif defined (__loongarch64)
 
+#define csr_read64(reg) ({ \
+    /* reg is an "i" operand, so it must be a compile-time constant. */ \
+    uint64_t csr_read64__val; \
+    asm volatile ( \
+        "csrrd %0, %1" \
+        : "=r"(csr_read64__val) \
+        : "i"(reg)); \
+    csr_read64__val; \
+})
+
+#define csr_write64(val, reg) do { /* write val to the CSR numbered by the constant reg */ \
+    uint64_t csr_write64__val = (uint64_t)(val); /* private copy: csrwr overwrites its register */ \
+    asm volatile ( \
+        "csrwr %0, %1" \
+        : "+r"(csr_write64__val) /* in: new value; out: previous CSR contents (discarded) */ \
+        : "i"(reg) \
+        : "memory" \
+    ); \
+} while (0)
+
+#define csr_read32(reg) ((uint32_t)csr_read64(reg)) /* full csr_read64 access, truncated to the low 32 bits */
+
+#define csr_write32(val, reg) do { /* 32-bit convenience wrapper around csr_write64 */ \
+    csr_write64((uint64_t)(val), reg); /* val is converted to 64 bits before the write */ \
+} while (0)
+
+#define csr_xchg64(val, mask, reg) ({ \
+    /* In: replacement bits. Out: whatever csrxchg leaves in the register. */ \
+    uint64_t csr_xchg64__io = (uint64_t)(val); \
+    uint64_t csr_xchg64__sel = (uint64_t)(mask); \
+    /* NOTE(review): confirm "r" can never allocate $zero/$ra for the mask operand. */ \
+    asm volatile ("csrxchg %0, %1, %2" \
+        : "+r"(csr_xchg64__io) \
+        : "r"(csr_xchg64__sel), "i"(reg) \
+        : "memory"); \
+    csr_xchg64__io; \
+})
+
+#define locked_read(var) ({ /* 64-bit load of *var followed by a "dbar 0" barrier; yields the loaded value */ \
+    typeof(*(var)) locked_read__ret; /* argument parenthesized so any pointer expression is accepted */ \
+    _Static_assert(sizeof(locked_read__ret) == 8, "locked_read uses ld.d and needs a 64-bit object"); \
+    asm volatile ( \
+        "ld.d %0, %1\n\t" "dbar 0" \
+        : "=r"(locked_read__ret) \
+        : "m"(*(var)) \
+        : "memory" /* also stops the compiler from caching memory across the read */ \
+    ); \
+    locked_read__ret; \
+})
+
+static inline uint32_t iocsr_read32(uint64_t reg) {
+    // 32-bit IOCSR load; the register address is passed in a GPR, not as an immediate.
+    uint32_t word;
+
+    asm volatile ("iocsrrd.w %0, %1"
+                  : "=r" (word)
+                  : "r" (reg));
+    return word;
+}
+
+static inline void iocsr_write32(uint32_t val, uint64_t reg) {
+    asm volatile (
+        "iocsrwr.w %0, %1" // 32-bit store of val to the IOCSR address held in reg
+        : // no outputs
+        : "r"(val), "r"(reg)
+        : "memory"); // compiler barrier: earlier stores may not be moved past this side-effecting write
+}
+
+static inline uint64_t iocsr_read64(uint64_t reg) {
+    // 64-bit IOCSR load; the register address is passed in a GPR, not as an immediate.
+    uint64_t dword;
+
+    asm volatile ("iocsrrd.d %0, %1"
+                  : "=r" (dword)
+                  : "r" (reg));
+    return dword;
+}
+
+static inline void iocsr_write64(uint64_t val, uint64_t reg) {
+    asm volatile (
+        "iocsrwr.d %0, %1" // 64-bit store of val to the IOCSR address held in reg
+        : // no outputs
+        : "r"(val), "r"(reg)
+        : "memory"); // compiler barrier: earlier stores may not be moved past this side-effecting write
+}
+
 static inline uint64_t rdtsc(void) {
     uint64_t v;
     asm volatile ("rdtime.d %0, $zero" : "=r" (v));
diff --git a/common/sys/smp.c b/common/sys/smp.c
index e3519be2..1778bdde 100644
--- a/common/sys/smp.c
+++ b/common/sys/smp.c
@@ -17,7 +17,7 @@
 #if defined (__riscv)
 #include <sys/sbi.h>
 #endif
-#if defined (__aarch64__)
+#if defined (__aarch64__) || defined (__loongarch64) /* same macro as the other LoongArch guards in this file */
 #include <libfdt.h>
 #endif
 
@@ -853,6 +853,339 @@ struct limine_mp_info *init_smp(size_t *cpu_count, pagemap_t pagemap, uint64_t h
 }
 
 #elif defined (__loongarch64)
+
+enum { // register numbers and bit positions used by the LoongArch AP bring-up code below
+    LOONGARCH_CSR_CPUID = 0x20, // CSR read by loongarch_phys_id()
+
+    LOONGARCH_IOCSR_IPI_SEND = 0x1040, // IOCSR written by smp_send_ipi()
+    LOONGARCH_IOCSR_MBUF_SEND = 0x1048, // IOCSR written by csr_mail_send()
+
+    IOCSR_IPI_SEND_BLOCKING_BIT = 31, // always set by smp_send_ipi(); presumably "wait for delivery" -- confirm
+    IOCSR_IPI_SEND_CPU_SHIFT    = 16, // position of the target core id in the send word
+    IOCSR_IPI_SEND_IP_SHIFT     = 0, // position of the action value in the send word
+
+    IOCSR_MBUF_SEND_BLOCKING_BIT = 31, // always set by csr_mail_send()
+    IOCSR_MBUF_SEND_CPU_SHIFT    = 16, // position of the target core id in the send word
+    IOCSR_MBUF_SEND_BOX_SHIFT    = 2, // position of the 32-bit box index in the send word
+
+    SMP_BOOT_CPU = 0x1, // action passed to smp_send_ipi(); NOTE(review): confirm whether the IPI_SEND field expects a vector index rather than a bit mask
+
+    MADT_ENTRY_CORE_PIC = 17 // MADT entry type byte that selects struct madt_core_pic
+};
+
+struct trampoline_passed_info { // hand-off block read by the AP trampoline; field order must match its tpl_* offsets
+    uint64_t smp_tpl_booted_flag; // offset 0: cleared by smp_start_ap(), set to 1 by the AP
+    uint64_t smp_tpl_info_struct; // offset 8: address of the AP's limine_mp_info, before the HHDM offset is added
+    uint64_t smp_tpl_pgd_low; // offset 16: pagemap.pgd[0]
+    uint64_t smp_tpl_pgd_high; // offset 24: pagemap.pgd[1]
+    uint64_t smp_tpl_hhdm_offset; // offset 32: added to smp_tpl_info_struct by the trampoline
+    uint64_t smp_tpl_temp_stack; // offset 40: end address (base + 8192) of the temporary stack
+};
+
+struct trampoline_passed_info loongarch_smp_passed_info; // not static: the trampoline references this symbol by name
+
+static inline uint32_t loongarch_phys_id(void) {
+    return (uint32_t)csr_read64(LOONGARCH_CSR_CPUID); // low 32 bits of the CPUID CSR of the executing core
+}
+
+static inline bool core_pic_startable(uint32_t flags) {
+    // Startable when at least one of the two flag bits is set.
+    return (flags & (MADT_CORE_PIC_ENABLED | MADT_CORE_PIC_ONLINE_CAPABLE)) != 0;
+}
+
+static void csr_mail_send(uint64_t data, int cpu, int mailbox) {
+    // Post 64-bit `data` to mailbox `mailbox` of core `cpu` as two 32-bit boxes; fields are widened before shifting.
+    uint64_t target = (uint64_t)(uint32_t)cpu << IOCSR_MBUF_SEND_CPU_SHIFT;
+    uint64_t box_lo = (uint64_t)(uint32_t)mailbox << 1;
+
+    // High 32 bits go to the odd box; they already sit in bits 63:32 of the send word.
+    uint64_t val = ((uint64_t)1 << IOCSR_MBUF_SEND_BLOCKING_BIT) | target;
+    val |= ((box_lo + 1) << IOCSR_MBUF_SEND_BOX_SHIFT);
+    val |= (data & 0xFFFFFFFF00000000);
+    iocsr_write64(val, LOONGARCH_IOCSR_MBUF_SEND);
+
+    // Low 32 bits go to the even box, moved up into the same payload field.
+    val = ((uint64_t)1 << IOCSR_MBUF_SEND_BLOCKING_BIT) | target;
+    val |= (box_lo << IOCSR_MBUF_SEND_BOX_SHIFT);
+    val |= (data << 32);
+    iocsr_write64(val, LOONGARCH_IOCSR_MBUF_SEND);
+}
+
+static void smp_send_ipi(uint32_t phys_id, uint32_t action) {
+    // Compose the send word field by field: blocking bit, target core, action.
+    uint32_t word = (uint32_t)1 << IOCSR_IPI_SEND_BLOCKING_BIT;
+    word |= phys_id << IOCSR_IPI_SEND_CPU_SHIFT;
+    word |= action << IOCSR_IPI_SEND_IP_SHIFT;
+    iocsr_write32(word, LOONGARCH_IOCSR_IPI_SEND);
+}
+
+static bool smp_start_ap(uint32_t phys_id, struct limine_mp_info *info_struct,
+                         uint64_t pgd_low, uint64_t pgd_high,
+                         uint64_t hhdm_offset) {
+    // One scratch stack, allocated on first use and handed to every AP in turn.
+    static void *temp_stack = NULL;
+    if (!temp_stack)
+        temp_stack = ext_mem_alloc(8192);
+
+    // Fill in the hand-off block that the trampoline reads.
+    struct trampoline_passed_info *tpl = &loongarch_smp_passed_info;
+    tpl->smp_tpl_booted_flag = 0;
+    tpl->smp_tpl_info_struct = (uint64_t)(uintptr_t)info_struct;
+    tpl->smp_tpl_pgd_low     = pgd_low;
+    tpl->smp_tpl_pgd_high    = pgd_high;
+    tpl->smp_tpl_hhdm_offset = hhdm_offset;
+    tpl->smp_tpl_temp_stack  = (uint64_t)(uintptr_t)temp_stack + 8192;
+
+    // Barrier between the stores above and waking the AP.
+    asm volatile ("dbar 0" ::: "memory");
+
+    // Post the entry point in mailbox 0 of the target core, then send the IPI.
+    csr_mail_send((uint64_t)(uintptr_t)smp_trampoline_start, phys_id, 0);
+    smp_send_ipi(phys_id, SMP_BOOT_CPU);
+    // Poll for the AP's acknowledgement; returns false once the tries run out.
+    for (int tries = 0; tries < 1000000; tries++) {
+        if (locked_read(&tpl->smp_tpl_booted_flag) == 1)
+            return true;
+        delay(100000);
+    }
+    return false;
+}
+
+static struct limine_mp_info *try_acpi_smp(size_t *cpu_count, uint32_t *bsp_phys_id,
+                                           pagemap_t pagemap, uint64_t hhdm_offset) { // ACPI path: list Core PIC entries from the MADT and start every AP found
+    struct madt *madt = acpi_get_table("APIC", 0);
+    if (madt == NULL)
+        return NULL; // no MADT: *cpu_count and *bsp_phys_id are left untouched
+
+    *bsp_phys_id = loongarch_phys_id();
+    *cpu_count = 0;
+
+    size_t max_cpus = 0;
+
+    for (uint8_t *madt_ptr = (uint8_t *)madt->madt_entries_begin; // pass 1: count startable entries to size the array
+         (uintptr_t)madt_ptr + 1 < (uintptr_t)madt + madt->header.length; // the entry's length byte must lie inside the table
+         madt_ptr += *(madt_ptr + 1)) { // byte 1 of each entry is its length
+        if (*(madt_ptr + 1) == 0)
+            break; // zero-length entry would never advance
+
+        if (*madt_ptr != MADT_ENTRY_CORE_PIC) // byte 0 of each entry is its type
+            continue;
+
+        if (*(madt_ptr + 1) < sizeof(struct madt_core_pic))
+            continue; // too short to hold a Core PIC entry
+
+        struct madt_core_pic *core_pic = (void *)madt_ptr;
+
+        if (core_pic_startable(core_pic->flags))
+            max_cpus++;
+    }
+
+    if (max_cpus == 0)
+        return NULL;
+
+    struct limine_mp_info *ret = ext_mem_alloc(max_cpus * sizeof(struct limine_mp_info));
+
+    for (uint8_t *madt_ptr = (uint8_t *)madt->madt_entries_begin; // pass 2: same walk and filters, now filling ret[]
+         (uintptr_t)madt_ptr + 1 < (uintptr_t)madt + madt->header.length;
+         madt_ptr += *(madt_ptr + 1)) {
+        if (*(madt_ptr + 1) == 0)
+            break;
+
+        if (*madt_ptr != MADT_ENTRY_CORE_PIC)
+            continue;
+
+        if (*(madt_ptr + 1) < sizeof(struct madt_core_pic))
+            continue;
+
+        struct madt_core_pic *core_pic = (void *)madt_ptr;
+
+        if (!core_pic_startable(core_pic->flags))
+            continue;
+
+        struct limine_mp_info *info_struct = &ret[*cpu_count]; // next free slot; reused if this core fails to start
+        info_struct->processor_id = core_pic->acpi_processor_uid;
+        info_struct->phys_id = core_pic->core_id;
+
+        // Do not try to restart the BSP.
+        if (core_pic->core_id == *bsp_phys_id) {
+            (*cpu_count)++; // the BSP still gets an entry in the array
+            continue;
+        }
+
+        printv("smp: Found candidate AP for bring-up. Core ID: %u\n", core_pic->core_id);
+
+        if (!smp_start_ap(core_pic->core_id, info_struct,
+                          (uint64_t)(uintptr_t)pagemap.pgd[0],
+                          (uint64_t)(uintptr_t)pagemap.pgd[1],
+                          hhdm_offset)) {
+            print("smp: FAILED to bring-up AP\n");
+            continue; // slot not committed: *cpu_count is unchanged
+        }
+
+        printv("smp: Successfully brought up AP\n");
+        (*cpu_count)++;
+    }
+
+    if (*cpu_count == 0) {
+        pmm_free(ret, max_cpus * sizeof(struct limine_mp_info));
+        return NULL;
+    }
+
+    return ret; // array holding *cpu_count filled entries
+}
+
+static struct limine_mp_info *try_dtb_smp(void *dtb, size_t *cpu_count,
+                                          uint32_t *bsp_phys_id,
+                                          pagemap_t pagemap,
+                                          uint64_t hhdm_offset) { // device-tree path: list cpu nodes under /cpus and start every AP found
+    int cpus = fdt_path_offset(dtb, "/cpus");
+    if (cpus < 0) {
+        printv("smp: failed to find /cpus node: %s\n", fdt_strerror(cpus));
+        return NULL; // *cpu_count and *bsp_phys_id are left untouched
+    }
+
+    int address_cells = fdt_address_cells(dtb, cpus);
+    if (address_cells < 1) {
+        printv("smp: fdt_address_cells failed: %s\n", fdt_strerror(address_cells));
+        return NULL;
+    }
+    if (address_cells > 2) { // only 1-cell and 2-cell "reg" values are decoded below
+        printv("smp: illegal #address-cells value: %d\n", address_cells);
+        return NULL;
+    }
+
+    *bsp_phys_id = loongarch_phys_id();
+    *cpu_count = 0;
+
+    size_t max_cpus = 0;
+    int node;
+    fdt_for_each_subnode(node, dtb, cpus) { // pass 1: count usable cpu nodes to size the array
+        const void *prop;
+        int prop_len;
+
+        if (!(prop = fdt_getprop(dtb, node, "device_type", NULL)) || strcmp(prop, "cpu"))
+            continue; // not a cpu node
+
+        if (!(prop = fdt_getprop(dtb, node, "reg", &prop_len)) || prop_len < address_cells * 4)
+            continue; // "reg" missing or shorter than address_cells 32-bit cells
+
+        uint64_t phys_id = 0;
+        const uint8_t *bytes = prop;
+
+        if (address_cells == 1) { // one cell: 4 bytes, most significant first
+            phys_id = ((uint64_t)bytes[0] << 24)
+                    | ((uint64_t)bytes[1] << 16)
+                    | ((uint64_t)bytes[2] << 8)
+                    | ((uint64_t)bytes[3] << 0);
+        } else { // two cells: 8 bytes, most significant first
+            phys_id = ((uint64_t)bytes[0] << 56)
+                    | ((uint64_t)bytes[1] << 48)
+                    | ((uint64_t)bytes[2] << 40)
+                    | ((uint64_t)bytes[3] << 32)
+                    | ((uint64_t)bytes[4] << 24)
+                    | ((uint64_t)bytes[5] << 16)
+                    | ((uint64_t)bytes[6] << 8)
+                    | ((uint64_t)bytes[7] << 0);
+        }
+
+        if (phys_id > UINT32_MAX) { // the start routine and the BSP id are 32-bit
+            printv("smp: core id %U does not fit in 32 bits, skipping\n", phys_id);
+            continue;
+        }
+
+        max_cpus++;
+    }
+
+    if (max_cpus == 0)
+        return NULL;
+
+    struct limine_mp_info *ret = ext_mem_alloc(max_cpus * sizeof(struct limine_mp_info));
+
+    fdt_for_each_subnode(node, dtb, cpus) { // pass 2: same filters and decoding, now filling ret[]
+        const void *prop;
+        int prop_len;
+
+        if (!(prop = fdt_getprop(dtb, node, "device_type", NULL)) || strcmp(prop, "cpu"))
+            continue;
+
+        if (!(prop = fdt_getprop(dtb, node, "reg", &prop_len)) || prop_len < address_cells * 4)
+            continue;
+
+        uint64_t phys_id = 0;
+        const uint8_t *bytes = prop;
+
+        if (address_cells == 1) {
+            phys_id = ((uint64_t)bytes[0] << 24)
+                    | ((uint64_t)bytes[1] << 16)
+                    | ((uint64_t)bytes[2] << 8)
+                    | ((uint64_t)bytes[3] << 0);
+        } else {
+            phys_id = ((uint64_t)bytes[0] << 56)
+                    | ((uint64_t)bytes[1] << 48)
+                    | ((uint64_t)bytes[2] << 40)
+                    | ((uint64_t)bytes[3] << 32)
+                    | ((uint64_t)bytes[4] << 24)
+                    | ((uint64_t)bytes[5] << 16)
+                    | ((uint64_t)bytes[6] << 8)
+                    | ((uint64_t)bytes[7] << 0);
+        }
+
+        if (phys_id > UINT32_MAX) {
+            printv("smp: core id %U does not fit in 32 bits, skipping\n", phys_id);
+            continue;
+        }
+
+        struct limine_mp_info *info_struct = &ret[*cpu_count]; // next free slot; reused if this core fails to start
+        info_struct->processor_id = 0; // every device-tree entry gets processor_id 0
+        info_struct->phys_id = phys_id;
+
+        // Do not try to restart the BSP.
+        if (phys_id == *bsp_phys_id) {
+            (*cpu_count)++; // the BSP still gets an entry in the array
+            continue;
+        }
+
+        printv("smp: Found candidate AP for bring-up. Core ID: %U\n", phys_id);
+
+        if (!smp_start_ap((uint32_t)phys_id, info_struct,
+                          (uint64_t)(uintptr_t)pagemap.pgd[0],
+                          (uint64_t)(uintptr_t)pagemap.pgd[1],
+                          hhdm_offset)) {
+            print("smp: FAILED to bring-up AP\n");
+            continue; // slot not committed: *cpu_count is unchanged
+        }
+
+        printv("smp: Successfully brought up AP\n");
+        (*cpu_count)++;
+    }
+
+    if (*cpu_count == 0) {
+        pmm_free(ret, max_cpus * sizeof(struct limine_mp_info));
+        return NULL;
+    }
+
+    return ret; // array holding *cpu_count filled entries
+}
+
+struct limine_mp_info *init_smp(size_t *cpu_count, uint32_t *bsp_phys_id,
+                                pagemap_t pagemap, uint64_t hhdm_offset) {
+    // Define both outputs up front: the discovery paths leave them unset on some failures.
+    *cpu_count = 0;
+    *bsp_phys_id = loongarch_phys_id();
+    struct limine_mp_info *info = NULL; // ACPI (MADT) is tried first, the device tree second
+    if (acpi_get_rsdp() && (info = try_acpi_smp(cpu_count, bsp_phys_id, pagemap, hhdm_offset)))
+        return info;
+
+    void *dtb = get_device_tree_blob(NULL, 0);
+    if (dtb) {
+        info = try_dtb_smp(dtb, cpu_count, bsp_phys_id, pagemap, hhdm_offset);
+        pmm_free(dtb, fdt_totalsize(dtb));
+        return info; // may be NULL, in which case *cpu_count is 0
+    }
+    printv("Failed to figure out how to start APs.\n");
+    return NULL; // neither source available; outputs hold the defaults set above
+}
+
 #else
 #error Unknown architecture
 #endif
diff --git a/common/sys/smp.h b/common/sys/smp.h
index de45f08c..743ce009 100644
--- a/common/sys/smp.h
+++ b/common/sys/smp.h
@@ -37,6 +37,10 @@ struct limine_mp_info *init_smp(size_t   *cpu_count,
                                  uint64_t  hhdm_offset);
 
 #elif defined (__loongarch64)
+
+struct limine_mp_info *init_smp(size_t *cpu_count, uint32_t *bsp_phys_id,
+                                pagemap_t pagemap, uint64_t hhdm_offset); // LoongArch64: returns the MP info array built from ACPI or the device tree, or NULL
+
 #else
 #error Unknown architecture
 #endif
diff --git a/common/sys/smp_trampoline.asm_loongarch64 b/common/sys/smp_trampoline.asm_loongarch64
new file mode 100644
index 00000000..a825a1ed
--- /dev/null
+++ b/common/sys/smp_trampoline.asm_loongarch64
@@ -0,0 +1,76 @@
+.section .text
+
+.set tpl_booted_flag,   0       // byte offsets into loongarch_smp_passed_info, one 8-byte slot per field
+.set tpl_info_struct,   8
+.set tpl_pgd_low,       16
+.set tpl_pgd_high,      24
+.set tpl_hhdm_offset,   32
+.set tpl_temp_stack,    40
+
+.global smp_trampoline_start
+.extern loongarch_spinup
+.extern loongarch_smp_passed_info
+
+smp_trampoline_start:
+        // AP entry point: load the hand-off values and branch to loongarch_spinup.
+        la.local $t0, loongarch_smp_passed_info
+        la.local $a0, 1f                // $a0 = address of label 1 below
+        ld.d    $a1, $t0, tpl_temp_stack
+        ld.d    $a2, $t0, tpl_pgd_low
+        ld.d    $a3, $t0, tpl_pgd_high
+        b       loongarch_spinup        // defined elsewhere; presumably installs the stack and page tables, then jumps to $a0 -- confirm
+
+1:
+        // Add the HHDM offset to the MP info pointer; the result is kept in $t1 and $a0.
+        la.local $t0, loongarch_smp_passed_info
+        ld.d    $t1, $t0, tpl_info_struct
+        ld.d    $t2, $t0, tpl_hhdm_offset
+        add.d   $t1, $t1, $t2
+        move    $a0, $t1                // $a0 is not cleared below, so it is still set at the final jump
+
+        // Set the booted flag to 1; the BSP polls it in smp_start_ap().
+        li.d    $t3, 1
+        st.d    $t3, $t0, tpl_booted_flag
+        dbar    0
+
+2:
+        // Spin until the 8-byte field at offset 24 of the info struct is non-zero (presumably goto_address -- struct not visible here).
+        ld.d    $t8, $t1, 24
+        dbar    0                       // barrier between this load and the stack load below
+        beqz    $t8, 2b
+
+        // Load $sp from offset 16 of the info struct (presumably the reserved field).
+        ld.d    $sp, $t1, 16
+
+        // Clear every register below; $a0, $sp and $t8 keep their values.
+        move    $ra, $zero
+        move    $tp, $zero
+        move    $a1, $zero
+        move    $a2, $zero
+        move    $a3, $zero
+        move    $a4, $zero
+        move    $a5, $zero
+        move    $a6, $zero
+        move    $a7, $zero
+        move    $t0, $zero
+        move    $t1, $zero
+        move    $t2, $zero
+        move    $t3, $zero
+        move    $t4, $zero
+        move    $t5, $zero
+        move    $t6, $zero
+        move    $t7, $zero
+        move    $fp, $zero
+        move    $s0, $zero
+        move    $s1, $zero
+        move    $s2, $zero
+        move    $s3, $zero
+        move    $s4, $zero
+        move    $s5, $zero
+        move    $s6, $zero
+        move    $s7, $zero
+        move    $s8, $zero
+
+        jirl    $zero, $t8, 0           // jump to the address loaded at label 2, without linking
+
+.section .note.GNU-stack,"",%progbits
tab: 248 wrap: offon