plist 居然比 alist 慢

但其实 plist 和 alist 都需要访问 2N 个 cons,而且上面 perf 结果也显示二者所需的执行指令数差异不大,可能和 list 长度关系不大。主要困惑的点是二者指令的执行效率差异太大了。

的确不是对齐的问题:试用了下 aider,简单用 C 实现了 consalist-getplist-get,仍然有显著的速度差异,即使调转 car 和 cdr 的顺序让 cdr 放在前面进行 16byte 对齐。

代码

GitHub - gudzpoz/alist-plist-benchmark: Benchmarking alist/plist performance with Linux perf.

#define CONS_FIELD(name, alignment)             \
    union {                                     \
        struct cons *pointer;                   \
        uint64_t value;                         \
    } name __attribute__((aligned(alignment)))

// Aligned cons cell structure matching Emacs' memory layout
struct cons {
#ifdef CDR_FIRST
    CONS_FIELD(cdr, 16);
    CONS_FIELD(car, 8);
#else
    CONS_FIELD(car, 16);
    CONS_FIELD(cdr, 8);
#endif
};

// Alist lookup (association list)
uint64_t __attribute__((noinline)) alist_get(struct cons *list, uint64_t key) {
    while (list != NULL) {
        struct cons *pair = list->car.pointer;
        if (pair->car.value == key) {
            return pair->cdr.value;
        }
        list = list->cdr.pointer;
    }
    return -1;
}

// Plist lookup (property list)
uint64_t __attribute__((noinline)) plist_get(struct cons *list, uint64_t key) {
    while (list != NULL) {
        if (list->car.value == key) {
            return list->cdr.pointer->car.value;
        }
        list = list->cdr.pointer->cdr.pointer;
    }
    return -1;
}

把 car 和 cdr 互换后,二者的汇编可以说是指令级的一致:

alist 循环 plist 循环 一致?
mov (%rdi), %rax mov (%rdi), %rax :green_square:
cmp %rsi, (%rax) cmp %rsi, %x8(%rdi) :multiply:
je :break_loop je :break_loop :green_square:
mov 0x8(%rdi), %rdi mov (%rax), %rdi :multiply:
test %rdi, %rdi test %rdi, %rdi :green_square:
jne :loop_start jne :loop_start :green_square:

我是想不明白了,或许和指令间的数据依赖导致无法流水线并行有关?

1 个赞