ICS - Arch Lab | 处理器太慢怎么办？

Last updated on December 17, 2025 am

在 Arch Lab 中，我们将熟悉流水线 Y86-64 处理器的设计与实现，并对该处理器及一个基准测试程序进行优化，以实现性能最大化。实验分为三个部分：

Part A：编写一些简单的 Y86-64 程序，并熟悉 Y86-64 相关工具的使用。
Part B：为 SEQ 模拟器扩展新的指令并完善 CPU 流水线的实现过程。
Part C：对 Y86-64 基准测试程序及其架构设计进行优化。

预备知识

Y86-64 指令集架构

Y86-64 是基于 x86-64 简化而来的指令集，定义了指令格式、寄存器组、寻址方式及程序状态码，是 Arch Lab 中处理器设计的指令规范。

HCL 硬件描述语言

HCL 是用于描述硬件逻辑的简化语言，重点支持组合逻辑与时序逻辑表达，是实现处理器模拟器控制逻辑的核心工具。

SEQ 架构

SEQ 是基础的单周期处理器架构，每条指令的取指、译码、执行等阶段串行完成，是理解流水线架构的基础模型。

PIPE 架构

PIPE 是将指令执行拆解为多个流水线阶段并行处理的架构，可提升指令吞吐量，需解决数据冒险、控制冒险等关键问题。

前置准备

环境配置

参考指导文档进行环境配置。

1
2
3

curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
rustup install 1.90
rustup default 1.90

在每次更改 Rust 源码后，应重新构建项目。

1 2	`cd archlab-project cargo build`

工具集

构建项目后，将产生一个 archlab-project/target 文件夹来存储输出的二进制文件和其他中间文件。输出的可执行文件是：

Y86-64 汇编器：target/debug/yas
Y86-64 调试器：target/debug/ydb
Y86-64 ISA 模拟器：target/debug/yis
Y86-64 流水线模拟器：target/debug/ysim
本地评分器：target/debug/grader

可通过以下命令启动本地评分器：

cd archlab-project
cargo build
./target/debug/grader part-a
./target/debug/grader part-b
./target/debug/grader part-c
./target/debug/grader autolab

其余工具详情可参看 archlab-project/README.md。

git init

为了便于管理代码版本，笔者建议在 archlab-project 目录下初始化一个 Git 仓库：

1 2	`cd archlab-project git init`

用于追踪代码变更。

Part A

Part A 中，我们将在 archlab-project/misc 目录下开展工作。我们的任务是编写三个 Y86-64 程序实现三个函数。

sum.ys

参照给出的示例代码：

/* linked list element */
typedef struct ELE {
    long val;
    struct ELE *next;
} *list_ptr;

/* sum_list - Sum the elements of a linked list */
long sum_list(list_ptr ls)
{
    long val = 0;
    while (ls) {
    val += ls->val;
    ls = ls->next;
    }
    return val;
}

这是一段链表的求和函数，我们可以在 sum.ys 中编写如下 Y86-64 程序：

    .pos 0                  ; 设置当前汇编地址为0（程序起始地址）
    irmovq stack, %rsp      ; 初始化栈指针%rsp，指向stack标签处（栈起始地址）
    call main               ; 调用main函数
    halt                    ; 程序执行结束，停机

    .align 8                ; 后续数据按8字节对齐
ele1:                       ; 链表第一个节点
    .quad 0x00d
    .quad ele2
ele2:                       ; 链表第二个节点
    .quad 0x0e0
    .quad ele3
ele3:                       ; 链表第三个节点
    .quad 0xf00
    .quad 0

main:                       ; main函数入口
    irmovq ele1, %rdi       ; 将第一个链表节点ele1的地址存入%rdi
    call sum_list           ; 调用sum_list函数
    ret

sum_list:                   ; sum_list函数入口
    xorq %rax, %rax         ; 将%rax清零
Loop:                       ; 遍历链表
    andq %rdi, %rdi         ; 检查%rdi是否为0
    je Done                 ; 若%rdi为0（链表结束），跳转到Done
    mrmovq (%rdi), %rcx     ; 从%rdi指向的地址（当前节点的val字段）读取值到%rcx
    addq %rcx, %rax         ; 将%rcx（当前节点val）加到%rax
    mrmovq 8(%rdi), %rdi    ; 从%rdi+8地址（当前节点的next字段）读取下一个节点地址到%rdi
    jmp Loop                ; 跳回Loop，继续处理下一个节点
Done:                       ; 遍历结束
    ret

    .pos 0x200              ; 设置栈的起始地址为0x200（确保栈有足够空间）
stack:                      ; 栈起始位置标记

rsum.ys

参照给出的示例代码：

/* linked list element */
typedef struct ELE {
    long val;
    struct ELE *next;
} *list_ptr;

/* rsum_list - Recursive version of sum_list */
long rsum_list(list_ptr ls)
{
    if (!ls)
    return 0;
    else {
    long val = ls->val;
    long rest = rsum_list(ls->next);
    return val + rest;
    }
}

这是一段链表的递归求和函数，我们可以在 rsum.ys 中编写如下 Y86-64 程序：

    .pos 0                  ; 设置当前汇编地址为0（程序起始地址）
    irmovq stack, %rsp      ; 初始化栈指针%rsp，指向stack标签处（栈起始地址）
    call main               ; 调用main函数
    halt                    ; 程序执行结束，停机

    .align 8                ; 后续数据按8字节对齐
ele1:                       ; 链表第一个节点
    .quad 0x00d
    .quad ele2
ele2:                       ; 链表第二个节点
    .quad 0x0e0
    .quad ele3
ele3:                       ; 链表第三个节点
    .quad 0xf00
    .quad 0

main:                       ; main函数入口
    irmovq ele1, %rdi       ; 将第一个链表节点ele1的地址存入%rdi
    call rsum_list          ; 调用递归求和函数rsum_list
    ret

rsum_list:                  ; rsum_list函数入口
    andq %rdi, %rdi         ; 检查%rdi（当前链表节点地址）是否为0（判断是否为空节点）
    je Else                 ; 若为空节点（链表结束），跳转到Else（递归终止条件）
    pushq %rbx              ; 保存%rbx（被调用者保存寄存器，递归调用中需保留其值，避免被后续调用覆盖）
    mrmovq (%rdi), %rbx     ; 从当前节点（%rdi指向）的val字段读取值到%rbx（暂存当前节点的值）
    mrmovq 8(%rdi), %rdi    ; 从当前节点的next字段（%rdi+8地址）读取下一个节点地址到%rdi（更新参数为下一个节点）
    call rsum_list          ; 递归调用rsum_list，计算剩余链表的和（结果将存于%rax）
    addq %rbx, %rax         ; 将当前节点的值（%rbx）加到剩余链表的和（%rax）中，得到当前链表的总和
    popq %rbx               ; 恢复%rbx的值（从栈中弹出之前保存的值）
    ret                     ; 返回当前计算的总和（结果在%rax中）
Else:                       ; 递归终止条件标签（空链表处理）
    xorq %rax, %rax         ; 将%rax清零（空链表的和为0，作为递归的基线返回值）
    ret                     ; 返回0（结果在%rax中）

    .pos 0x200              ; 设置栈的起始地址为0x200（确保栈有足够空间）
stack:                      ; 栈起始位置标记

bubble.ys

参照给出的示例代码：

/* bubble_sort - Sort long numbers at data in ascending order */
void bubble_sort(long *data, long count)
{
     long *i, *last;
     for(last = data + count - 1; last > data; last--) {
         for(i = data; i < last; i++) {
             if(*(i + 1) < *i) {
                 long t = *(i + 1);
                 *(i + 1) = *i;
                 *i = t;
             }
         }
    }
}

这是一段冒泡排序函数，我们可以在 bubble.ys 中编写如下 Y86-64 程序：

    .pos 0                  ; 设置当前汇编地址为0（程序起始地址）
    irmovq  stack, %rsp     ; 初始化栈指针%rsp，指向stack标签处（栈起始地址）
    call main               ; 调用main函数（跳转到main执行，返回地址压栈）
    halt                    ; 程序执行结束，停机

    .align 8                ; 后续数据按8字节对齐
array:                      ; 待排序的数组（存储6个long类型元素）
    .quad 0xbca             ; 数组元素1：0xbca
    .quad 0xcba             ; 数组元素2：0xcba
    .quad 0xacb             ; 数组元素3：0xacb
    .quad 0xcab             ; 数组元素4：0xcab
    .quad 0xabc             ; 数组元素5：0xabc
    .quad 0xbac             ; 数组元素6：0xbac

; 寄存器用途说明：
; %rdi：存放数组起始地址data
; %rsi：存放数组元素个数count
; %rbx：存放当前内层循环变量i（地址）
; %rcx：存放外层循环变量last（地址）
; %rdx：临时变量（用于计算、比较等）
; %r9：常量1（用于减法运算）
; %r10：常量8（每个long元素占8字节，用于地址偏移计算）
; %r11、%r12：临时存放数组元素（用于比较和交换）

bubble_sort:                ; 冒泡排序函数入口
    irmovq $1, %r9          ; 初始化%r9为1（用于count-1等减法）
    irmovq $8, %r10         ; 初始化%r10为8（每个元素8字节，地址偏移量）
    rrmovq %rdi, %rcx       ; 将数组起始地址data复制到%rcx（用于计算last初始地址）
    rrmovq %rsi, %rdx       ; 将元素个数count复制到%rdx（临时存储count）
    subq %r9, %rdx          ; %rdx = count - 1（计算last的索引偏移量）

Loop_1:                     ; 计算last初始地址的循环（last = data + (count-1)*8）
    andq %rdx, %rdx         ; 检查%rdx（剩余偏移次数）是否为0
    je Loop_2               ; 若偏移次数为0，说明last初始地址计算完成，跳至外层循环入口
    addq %r10, %rcx         ; %rcx += 8（每次偏移一个元素地址，累计(count-1)次）
    subq %r9, %rdx          ; %rdx -= 1（剩余偏移次数减1）
    jmp Loop_1              ; 继续循环计算last地址

Loop_2:                     ; 外层循环入口
    rrmovq %rcx, %rdx       ; 将当前last地址复制到%rdx
    subq %rdi, %rdx         ; %rdx = last - data（计算last与数组起始地址的偏移）
    jle Done_2              ; 若偏移<=0（last <= data），外层循环结束，跳至Done_2

    rrmovq %rdi, %rbx       ; 初始化内层循环变量i为data（%rbx = data）
Loop_3:                     ; 内层循环入口
    rrmovq %rcx, %rdx       ; 将当前last地址复制到%rdx
    subq %rbx, %rdx         ; %rdx = last - i（计算i与last的偏移）
    jle Done_3              ; 若偏移<=0（i >= last），内层循环结束，跳至Done_3

    mrmovq (%rbx), %r11     ; 读取i指向的元素：%r11 = *i
    mrmovq 8(%rbx), %r12    ; 读取i+1指向的元素：%r12 = *(i+1)

    subq %r12, %r11         ; 计算*i - *(i+1)（用于比较大小）
    jle Else                ; 若*i <= *(i+1)，无需交换，跳至Else

    mrmovq (%rbx), %r11     ; 重新读取*i
    mrmovq 8(%rbx), %r12    ; 重新读取*(i+1)
    rmmovq %r11, 8(%rbx)    ; 将*i存入i+1的位置：*(i+1) = *i
    rmmovq %r12, (%rbx)     ; 将*(i+1)存入i的位置：*i = *(i+1)

Else:
    addq %r10, %rbx         ; i += 1（地址加8，移动到下一个元素）
    jmp Loop_3              ; 跳回内层循环，继续下一次迭代

Done_3:                     ; 内层循环结束
    subq %r10, %rcx         ; last -= 1（地址减8，移动到前一个元素）
    jmp Loop_2              ; 跳回外层循环，继续下一次迭代

Done_2:                     ; 外层循环结束
    ret                     ; 从bubble_sort函数返回

main:                       ; main函数入口
    irmovq array, %rdi      ; 将数组起始地址array存入%rdi
    irmovq $6, %rsi         ; 将元素个数6存入%rsi
    call bubble_sort        ; 调用冒泡排序函数bubble_sort
    ret                     ; 从main函数返回（返回到call main的下一条指令，即halt）

    .pos 0x200              ; 设置栈的起始地址为0x200（确保栈有足够空间）
stack:                      ; 栈起始位置标记

Part B

Part B 中，我们将在 archlab-project/sim/src/architectures/extra 目录下开展工作。我们的任务是逐步完善一系列处理器架构。它们均使用 HCL 语言描述，我们只需修改相应的占位符为正确的表达式。占位符一共有三种：

BOOL_PLACEHOLDER
U8_PLACEHOLDER
U64_PLACEHOLDER

seq_full.rs

在 seq_full.rs 中，我们需要扩展 SEQ 处理器以支持以下指令：

iopq V, rB

Fetch 阶段，需要保证 IOPQ 是一条有效指令，需要读取寄存器和立即数。

bool instr_valid = icode in // CMOVX is the same as RRMOVQ
    { NOP, HALT, CMOVX, IRMOVQ, RMMOVQ, MRMOVQ,
    OPQ, IOPQ, JX, CALL, RET, PUSHQ, POPQ };

// Does fetched instruction require a regid byte?
bool need_regids =
    icode in { CMOVX, OPQ, IOPQ, PUSHQ, POPQ, IRMOVQ, RMMOVQ, MRMOVQ };

// Does fetched instruction require a constant word?
bool need_valC = icode in { IRMOVQ, RMMOVQ, MRMOVQ, JX, CALL, IOPQ };

Decode 阶段，与 OPQ 指令类似，需要将 rB 设置为 srcB 从寄存器文件中取值，并将写入地址 dstE 设置为 rB。

// What register should be used as the B source?
u8 srcB = [
    icode in { OPQ, RMMOVQ, MRMOVQ, IOPQ } : ialign.rB;
    icode in { PUSHQ, POPQ, CALL, RET } : RSP;
    true : RNONE; // Don't need register
];
// What register should be used as the E destination?
u8 dstE = [
    icode in { CMOVX } && cnd : ialign.rB;
    icode in { IRMOVQ, OPQ, IOPQ } : ialign.rB;
    icode in { PUSHQ, POPQ, CALL, RET } : RSP;
    true : RNONE; // Don't write any register
];

Execute 阶段，需要指定 ALU 的输入，并更新条件码。

// Select input A to ALU
u64 aluA = [
    icode in { CMOVX, OPQ } : reg_read.valA;
    icode in { IRMOVQ, RMMOVQ, MRMOVQ, IOPQ } : ialign.valC;
    icode in { CALL, PUSHQ } : NEG_8;
    icode in { RET, POPQ } : 8;
    // Other instructions don't need ALU
];

// Select input B to ALU
u64 aluB = [
    icode in { RMMOVQ, MRMOVQ, OPQ, IOPQ, CALL,
              PUSHQ, RET, POPQ } : reg_read.valB;
    icode in { CMOVX, IRMOVQ } : 0;
    // Other instructions don't need ALU
];

// Set the ALU function
u8 alufun = [
    icode == OPQ : ifun;
    icode == IOPQ : ifun;
    true : ADD;
];

// Should the condition codes be updated?
bool set_cc = icode in { OPQ, IOPQ };

其余阶段无需调整。

pipe_s3a.rs

在 pipe_s3a.rs 中，我们需要在 pipe_s2.rs 的基础上进一步优化。我们发现 pipe_s2 的关键路径长度为 7，而计算 f_pc 的依赖路径过长。为了减少 f_pc 的依赖路径长度，我们进一步拆分了流水线阶段，将解码阶段与随后执行、存储和写回阶段分开。然而，这样做引入了新的问题：

reg_read 和 reg_write 现在位于不同的流水线阶段，这可能导致数据冒险。
用于计算 f_pc 的操作数也被拆分到不同的流水线阶段，这实际上是不正确的，因为在特定时间点位于不同流水线阶段的值属于不同的指令。

为了解决上述问题，我们在必要时对流水线寄存器实施阻塞或冒泡控制，确保等待一个额外的周期以保证计算结果的正确性。

pipe_s3a.rs 的占位符修改如下：

:==============================: Fetch Stage :================================:
u64 f_pc = [
    // Call.  Use instruction constant
    // If the previous instruction is CALL, the constant value should be the next PC
    // valC is from Fetch Stage, thus the last cycle
    D.icode == CALL : D.valC;
    // Taken branch.  Use instruction constant
    // U8_PLACEHOLDER == JX && e_cnd : U64_PLACEHOLDER;
    E.icode == JX && e_cnd : E.valC;
    // Completion of RET instruction.  Use value from stack
    // valM is from DEMW stage, thus the current cycle
    // U8_PLACEHOLDER == RET : e_valM;
    E.icode == RET : e_valM;
    // Default: Use incremented PC
    1 : F.valP;
];
:=======================: "Execute Stage (Execute)" :==========================:
u8 e_dstE = [
    // U8_PLACEHOLDER == CMOVX && !BOOL_PLACEHOLDER : RNONE;
    E.icode == CMOVX && !e_cnd : RNONE;
    1 : E.dstE;
];
:========================: Pipeline Register Control :=========================:
// If a data hazard occurs, we need to wait for the data to be written to the
// registers before proceeding.
bool data_harzard = d_srcA != RNONE && d_srcA in { e_dstE, e_dstM }
    // || U8_PLACEHOLDER != RNONE && BOOL_PLACEHOLDER;
    || d_srcB != RNONE && d_srcB in { e_dstE, e_dstM };

// bool f_stall = D.icode in { JX, RET } || BOOL_PLACEHOLDER;
bool f_stall = D.icode in { JX, RET } || data_harzard;

// bool d_stall = BOOL_PLACEHOLDER;
bool d_stall = data_harzard;

pipe_s3b.rs

在 pipe_s3b.rs 中，我们需要在 pipe_s3a.rs 的基础上进一步优化。pipe_s3a 的一个问题是停顿和气泡的频率太高，这实际上降低了性能。除了使用停顿和气泡来处理数据冒险之外，也可以通过转发来解决。

pipe_s3b.rs 的占位符修改如下：

:==============================: Fetch Stage :================================:
u64 f_pc = [
    // Call.  Use instruction constant
    // If the previous instruction is CALL, the constant value should be the next PC
    // valC is from Fetch Stage, thus the last cycle
    D.icode == CALL : D.valC;
    // Taken branch.  Use instruction constant
    // U8_PLACEHOLDER == JX && e_cnd : U64_PLACEHOLDER;
    E.icode == JX && e_cnd : E.valC;
    // Completion of RET instruction.  Use value from stack
    // valM is from DEMW stage, thus the current cycle
    // U8_PLACEHOLDER == RET : e_valM;
    E.icode == RET : e_valM;
    // Default: Use incremented PC
    1 : F.valP;
];
:==============================: Decode Stage :================================:
u64 d_valA = [
    // d_srcA == U8_PLACEHOLDER : e_valE;
    // d_srcA == U8_PLACEHOLDER : e_valM;
    d_srcA == e_dstE : e_valE;
    d_srcA == e_dstM : e_valM;
    1: reg_read.valA;
];
u64 d_valB = [
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    d_srcB == e_dstE : e_valE;
    d_srcB == e_dstM : e_valM;
    1: reg_read.valB;
];
:=======================: "Execute Stage (Execute)" :==========================:
u8 e_dstE = [
    // U8_PLACEHOLDER == CMOVX && !BOOL_PLACEHOLDER : RNONE;
    E.icode == CMOVX && !e_cnd : RNONE;
    1 : E.dstE;
];
:========================: Pipeline Register Control :=========================:
// bool f_stall = D.icode in { U8_PLACEHOLDER, U8_PLACEHOLDER };
bool f_stall = D.icode in { JX, RET };

// bool d_bubble = D.icode in { U8_PLACEHOLDER, U8_PLACEHOLDER };
bool d_bubble = D.icode in { JX, RET };

pipe_s3c.rs

在 pipe_s3c.rs 中，我们需要在 pipe_s3b.rs 的基础上进一步优化。pipe_s3b 的性能提升不是很显著，因为当遇到 JX 指令时，我们总是会停顿/产生气泡。实际上，我们在解码阶段就已知两种分支选项，只是必须等到执行阶段才能确定选择哪条分支，我们可以基于此实现分支预测。

pipe_s3c.rs 的占位符修改如下：

:==============================: Fetch Stage :================================:
u64 f_pc = [
    // Call.  Use instruction constant
    // If the previous instruction is CALL, the constant value should be the next PC
    // valC is from Fetch Stage, thus the last cycle
    D.icode == CALL : D.valC;
    // Branch misprediction.  Use incremental PC
    // U8_PLACEHOLDER == JX && !e_cnd : U64_PLACEHOLDER;
    E.icode == JX && !e_cnd : E.valP;
    // Completion of RET instruction.  Use value from stack
    // valM is from DEMW stage, thus the current cycle
    // U8_PLACEHOLDER == RET : e_valM;
    E.icode == RET : e_valM;
    // Default: Use predicted PC
    1 : F.pred_pc;
];
:==============================: Decode Stage :================================:
u64 d_valA = [
    // d_srcA == U8_PLACEHOLDER : e_valE;
    // d_srcA == U8_PLACEHOLDER : e_valM;
    d_srcA == e_dstE : e_valE;
    d_srcA == e_dstM : e_valM;
    1: reg_read.valA;
];
u64 d_valB = [
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    d_srcB == e_dstE : e_valE;
    d_srcB == e_dstM : e_valM;
    1: reg_read.valB;
];
:=======================: "Execute Stage (Execute)" :==========================:
u8 e_dstE = [
    // U8_PLACEHOLDER == CMOVX && !BOOL_PLACEHOLDER : RNONE;
    E.icode == CMOVX && !e_cnd : RNONE;
    1 : E.dstE;
];
:========================: Pipeline Register Control :=========================:
// If a branch misprediction is detected during the Execute stage, it means that
// the instruction currently in the Decode stage is invalid. Therefore, the next
// cycle’s Execute stage needs to insert a bubble.
// bool branch_mispred = BOOL_PLACEHOLDER;
bool branch_mispred = E.icode == JX && !e_cnd;

// If both a branch misprediction and a RET hazard occur at the same time, since
// the jump instruction is executed before the RET, the RET should not have been
// executed, so a stall is not needed.
// bool f_stall = BOOL_PLACEHOLDER && !BOOL_PLACEHOLDER;
bool f_stall = ret_harzard && !branch_mispred;

// If both a branch misprediction and a ret hazard occur at the same time,
// since the jump instruction is executed before the RET, the RET should not
// have been executed. Therefore, a bubble is not needed.
// bool d_bubble = BOOL_PLACEHOLDER && !BOOL_PLACEHOLDER;
bool d_bubble = ret_harzard && !branch_mispred;

pipe_s3d.rs

在 pipe_s3d.rs 中，我们需要在 pipe_s3c.rs 的基础上进一步优化。从 pipe_s3c 到 pipe_s3d 的主要变化在于硬件设计。我们将寄存器读取和写入合并到一个名为 reg_file 的设备中，操作顺序为先写后读。此更改的目的是为了避免结构风险。

pipe_s3d.rs 的占位符修改如下：

:==============================: Fetch Stage :================================:
u64 f_pc = [
    // Call.  Use instruction constant
    // If the previous instruction is CALL, the constant value should be the next PC
    // valC is from Fetch Stage, thus the last cycle
    D.icode == CALL : D.valC;
    // Branch misprediction.  Use incremental PC
    // U8_PLACEHOLDER == JX && !e_cnd : U64_PLACEHOLDER;
    E.icode == JX && !e_cnd : E.valP;
    // Completion of RET instruction.  Use value from stack
    // valM is from DEMW stage, thus the current cycle
    // U8_PLACEHOLDER == RET : e_valM;
    E.icode == RET : e_valM;
    // Default: Use predicted PC
    1 : F.pred_pc;
];
:==============================: Decode Stage :================================:
u64 d_valA = [
    // d_srcA == U8_PLACEHOLDER : e_valE;
    // d_srcA == U8_PLACEHOLDER : e_valM;
    d_srcA == e_dstE : e_valE;
    d_srcA == e_dstM : e_valM;
    1: reg_file.valA;
];
u64 d_valB = [
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    d_srcB == e_dstE : e_valE;
    d_srcB == e_dstM : e_valM;
    1: reg_file.valB;
];
:=======================: "Execute Stage (Execute)" :==========================:
u8 e_dstE = [
    // U8_PLACEHOLDER == CMOVX && !BOOL_PLACEHOLDER : RNONE;
    E.icode == CMOVX && !e_cnd : RNONE;
    1 : E.dstE;
];
:========================: Pipeline Register Control :=========================:
// If a branch misprediction is detected during the Execute stage, it means that
// the instruction currently in the Decode stage is invalid. Therefore, the next
// cycle’s Execute stage needs to insert a bubble.
// bool branch_mispred = BOOL_PLACEHOLDER;
bool branch_mispred = E.icode == JX && !e_cnd;

// If both a branch misprediction and a RET hazard occur at the same time, since
// the jump instruction is executed before the RET, the RET should not have been
// executed, so a stall is not needed.
// bool f_stall = BOOL_PLACEHOLDER && !BOOL_PLACEHOLDER;
bool f_stall = ret_harzard && !branch_mispred;

// If both a branch misprediction and a ret hazard occur at the same time,
// since the jump instruction is executed before the RET, the RET should not
// have been executed. Therefore, a bubble is not needed.
// bool d_bubble = BOOL_PLACEHOLDER && !BOOL_PLACEHOLDER;
bool d_bubble = ret_harzard && !branch_mispred;

pipe_s4a.rs

在 pipe_s4a.rs 中，我们需要在 pipe_s3d.rs 的基础上进一步优化。为了进一步减少计算依赖层级，我们已经将执行阶段与其他阶段分离。实际上，通过在 pipe_s3d 中强制对寄存器执行先写后读的顺序，我们可以避免可能在 pipe_s4 中发生的结构性冒险。

pipe_s4a.rs 的占位符修改如下：

:==============================: Fetch Stage :================================:
u64 f_pc = [
    // Call.  Use instruction constant
    // If the previous instruction is CALL, the constant value should be the next PC
    // valC is from Fetch Stage, thus the last cycle
    D.icode == CALL : D.valC;
    // Branch misprediction.  Use incremental PC
    // U8_PLACEHOLDER == JX && !e_cnd : U64_PLACEHOLDER;
    E.icode == JX && !e_cnd : E.valP;
    // Completion of RET instruction.  Use value from stack
    // valM is from DEMW stage, thus the current cycle
    // U8_PLACEHOLDER == RET : U64_PLACEHOLDER;
    M.icode == RET : m_valM;
    // Default: Use predicted PC
    1 : F.pred_pc;
];
:==============================: Decode Stage :================================:
u64 d_valA = [
    // d_srcA == U8_PLACEHOLDER : e_valE;
    // d_srcA == U8_PLACEHOLDER : U64_PLACEHOLDER;
    d_srcA == e_dstE : e_valE;
    d_srcA == m_dstM : m_valM;
    1: reg_file.valA;
];
u64 d_valB = [
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    d_srcB == e_dstE : e_valE;
    d_srcB == m_dstM : m_valM;
    1: reg_file.valB;
];
:=============================: Execute Stage :================================:
u8 e_dstE = [
    // U8_PLACEHOLDER == CMOVX && !BOOL_PLACEHOLDER : RNONE;
    E.icode == CMOVX && !e_cnd : RNONE;
    1 : E.dstE;
];
:========================: Pipeline Register Control :=========================:
// If a branch misprediction is detected during the Execute stage, it means that
// the instruction currently in the Decode stage is invalid. Therefore, the next
// cycle’s Execute stage needs to insert a bubble.
// bool branch_mispred = BOOL_PLACEHOLDER;
bool branch_mispred = E.icode == JX && !e_cnd;

// If a RET instruction is detected in either the Decode or Execute stage, then
// the instruction in the current Fetch stage is invalid. Therefore, a bubble
// needs to be inserted in the Fetch stage for the next cycle.
//
// In fact, when E.icode == RET, the instruction in the current Decode stage is
// also invalid, but because the D.icode in the previous cycle was RET, at this
// point D.icode == NOP, so there's no need to add a condition for e_bubble.
// bool ret_harzard = RET in { U8_PLACEHOLDER, U8_PLACEHOLDER };
bool ret_harzard = RET in { D.icode, E.icode };

// If both a branch misprediction and a RET hazard occur at the same time,
// since the jump instruction is executed before the RET, the RET should not be
// executed, so no stall is required.
// bool f_stall = BOOL_PLACEHOLDER && !BOOL_PLACEHOLDER || BOOL_PLACEHOLDER;
bool f_stall = ret_harzard && !branch_mispred || load_use_harzard;

// bool d_stall = BOOL_PLACEHOLDER;
bool d_stall = load_use_harzard;

// If both a branch misprediction and a RET hazard occur at the same time,
// since the jump instruction is executed before the RET, the RET should not
// have been executed, so a bubble is not needed.
//
// Actually, ret_harzard and d_stall cannot be true at the same time.
// bool d_bubble = BOOL_PLACEHOLDER && !BOOL_PLACEHOLDER && !d_stall;
bool d_bubble = ret_harzard && !branch_mispred && !d_stall;

// bool e_bubble = branch_mispred || BOOL_PLACEHOLDER;
bool e_bubble = branch_mispred || load_use_harzard;

pipe_s4b.rs

在 pipe_s4b.rs 中，我们需要在 pipe_s4a.rs 的基础上进一步优化。我们注意到用于计算 f_pc 的 e_cnd 的依赖路径太长。因此，我们可以将 e_cnd 存储在存储阶段寄存器中，并用 M.cnd 替代它。

pipe_s4b.rs 的占位符修改如下：

:==============================: Fetch Stage :================================:
u64 f_pc = [
    // Call.  Use instruction constant
    // If the previous instruction is CALL, the constant value should be the next PC
    // valC is from Fetch Stage, thus the last cycle
    D.icode == CALL : D.valC;
    // Branch misprediction.  Use incremental PC
    // U8_PLACEHOLDER == JX && !M.cnd : U64_PLACEHOLDER;
    M.icode == JX && !M.cnd : M.valP;
    // Completion of RET instruction.  Use value from stack
    // valM is from DEMW stage, thus the current cycle
    // U8_PLACEHOLDER == RET : U64_PLACEHOLDER;
    M.icode == RET : m_valM;
    // Default: Use predicted PC
    1 : F.pred_pc;
];
:==============================: Decode Stage :================================:
u64 d_valA = [
    // d_srcA == U8_PLACEHOLDER : e_valE;
    // d_srcA == U8_PLACEHOLDER : U64_PLACEHOLDER;
    d_srcA == e_dstE : e_valE;
    d_srcA == m_dstM : m_valM;
    1: reg_file.valA;
];
u64 d_valB = [
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    d_srcB == e_dstE : e_valE;
    d_srcB == m_dstM : m_valM;
    1: reg_file.valB;
];
:=============================: Execute Stage :================================:
u8 e_dstE = [
    // U8_PLACEHOLDER == CMOVX && !BOOL_PLACEHOLDER : RNONE;
    E.icode == CMOVX && !e_cnd : RNONE;
    1 : E.dstE;
];
:========================: Pipeline Register Control :=========================:
// If a branch misprediction is detected during the Execute stage, it means that
// the instruction currently in the Decode stage is invalid. Therefore, the next
// cycle’s Execute stage needs to insert a bubble.
// bool branch_mispred = BOOL_PLACEHOLDER;
bool branch_mispred = E.icode == JX && !e_cnd;

// If a RET instruction is detected in either the Decode or Execute stage, then
// the instruction in the current Fetch stage is invalid. Therefore, a bubble
// needs to be inserted in the Fetch stage for the next cycle.
//
// In fact, when E.icode == RET, the instruction in the current Decode stage is
// also invalid, but because the D.icode in the previous cycle was RET, at this
// point D.icode == NOP, so there's no need to add a condition for e_bubble.
// bool ret_harzard = RET in { U8_PLACEHOLDER, U8_PLACEHOLDER };
bool ret_harzard = RET in { D.icode, E.icode };

// Unlike in `pipe_s4a`, here we do not need to consider the case of branch
// misprediction (since in the next cycle, Fetch will always get the `f_pc`
// from `M.valP`).
// bool f_stall = BOOL_PLACEHOLDER || BOOL_PLACEHOLDER;
bool f_stall = ret_harzard || load_use_harzard;

// bool d_stall = BOOL_PLACEHOLDER;
bool d_stall = load_use_harzard;

// Unlike in `pipe_s4a`, when a branch misprediction occurs, we directly insert
// a bubble, because the instruction in the Fetch stage at that point is invalid.
// Actually, ret_harzard and d_stall cannot be true at the same time.
// bool d_bubble = BOOL_PLACEHOLDER || BOOL_PLACEHOLDER && !d_stall;
bool d_bubble = branch_mispred || ret_harzard && !d_stall;

// bool e_bubble = branch_mispred || BOOL_PLACEHOLDER;
bool e_bubble = branch_mispred || load_use_harzard;

pipe_s4c.rs

在 pipe_s4c.rs 中，我们需要在 pipe_s4b.rs 的基础上进一步优化。类似于 pipe_std.rs，我们在解码阶段将 D.valP 输入到 d_valA 中，以消除执行和存储阶段流水线寄存器中的 valP。

pipe_s4c.rs 的占位符修改如下：

:==============================: Fetch Stage :================================:
u64 f_pc = [
    // Call.  Use instruction constant
    // If the previous instruction is CALL, the constant value should be the next PC
    // valC is from Fetch Stage, thus the last cycle
    D.icode == CALL : D.valC;
    // Branch misprediction.  Use incremental PC
    // U8_PLACEHOLDER == JX && !M.cnd : U64_PLACEHOLDER;
    M.icode == JX && !M.cnd : M.valA;
    // Completion of RET instruction.  Use value from stack
    // valM is from DEMW stage, thus the current cycle
    // U8_PLACEHOLDER == RET : U64_PLACEHOLDER;
    M.icode == RET : m_valM;
    // Default: Use predicted PC
    1 : F.pred_pc;
];
:==============================: Decode Stage :================================:
u64 d_valA = [
    D.icode in { CALL, JX } : D.valP;
    // d_srcA == U8_PLACEHOLDER : e_valE;
    // d_srcA == U8_PLACEHOLDER : U64_PLACEHOLDER;
    d_srcA == e_dstE : e_valE;
    d_srcA == m_dstM : m_valM;
    1: reg_file.valA;
];
u64 d_valB = [
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    // BOOL_PLACEHOLDER : U64_PLACEHOLDER;
    d_srcB == e_dstE : e_valE;
    d_srcB == m_dstM : m_valM;
    1: reg_file.valB;
];
:=============================: Execute Stage :================================:
u8 e_dstE = [
    // U8_PLACEHOLDER == CMOVX && !BOOL_PLACEHOLDER : RNONE;
    E.icode == CMOVX && !e_cnd : RNONE;
    1 : E.dstE;
];
:==============================: Memory Stage :================================:
// We've feed D.valP into d_valA. Thus M.valP is not needed.
// u64 mem_data = U64_PLACEHOLDER;
u64 mem_data = M.valA;
:========================: Pipeline Register Control :=========================:
// If a branch misprediction is detected during the Execute stage, it means that
// the instruction currently in the Decode stage is invalid. Therefore, the next
// cycle’s Execute stage needs to insert a bubble.
// bool branch_mispred = BOOL_PLACEHOLDER;
bool branch_mispred = E.icode == JX && !e_cnd;

// If a RET instruction is detected in either the Decode or Execute stage, then
// the instruction in the current Fetch stage is invalid. Therefore, a bubble
// needs to be inserted in the Fetch stage for the next cycle.
//
// In fact, when E.icode == RET, the instruction in the current Decode stage is
// also invalid, but because the D.icode in the previous cycle was RET, at this
// point D.icode == NOP, so there's no need to add a condition for e_bubble.
// bool ret_harzard = RET in { U8_PLACEHOLDER, U8_PLACEHOLDER };
bool ret_harzard = RET in { D.icode, E.icode };

// Unlike in `pipe_s4a`, here we do not need to consider the case of branch
// misprediction (since in the next cycle, Fetch will always get the `f_pc`
// from `M.valP`).
// bool f_stall = BOOL_PLACEHOLDER || BOOL_PLACEHOLDER;
bool f_stall = ret_harzard || load_use_harzard;

// bool d_stall = BOOL_PLACEHOLDER;
bool d_stall = load_use_harzard;

// Unlike in `pipe_s4a`, when a branch misprediction occurs, we directly insert
// a bubble, because the instruction in the Fetch stage at that point is invalid.
// Actually, ret_harzard and d_stall cannot be true at the same time.
// bool d_bubble = BOOL_PLACEHOLDER || BOOL_PLACEHOLDER && !d_stall;
bool d_bubble = branch_mispred || ret_harzard && !d_stall;

// bool e_bubble = branch_mispred || BOOL_PLACEHOLDER;
bool e_bubble = branch_mispred || load_use_harzard;

Part C

Part C 中，我们将在 archlab-project/misc 和 archlab-project/sim/src/architectures/extra 目录下开展工作。我们的任务是修改 archlab-project/misc/ncopy.ys 和 archlab-project/sim/src/architectures/extra/ncopy.rs，让 ncopy 运行得尽可能快。最终性能将根据 cpe 和 ac 来评估。

cpe (cycles per element)：如果模拟代码需要 C 个周期来复制一个包含 N 个元素的块，那么 CPE 就是 C/N。
ac (architecture cost)：ncopy 架构的关键路径长度。形式上，CPU 架构的关键路径是时钟元件之间组合逻辑的最长路径。关键路径的长度可以用来衡量 CPU 的时钟频率，从而用于估算架构性能。在 Part C 中，关键路径的长度简化为：1 加上在架构路径中排列的硬件设备（单元）的最大数量。

ncopy 的 C 语言描述在 misc/ncopy.c 中。ncopy 函数将一个长度为 len 的整数数组 src 复制到一个不重叠的 dst 中，并返回 src 中正整数的数量。

#include <stdio.h>

typedef unsigned long long word_t;

word_t src[8], dst[8];

/* $begin ncopy */
/*
 * ncopy - copy src to dst, returning number of positive ints
 * contained in src array.
 */
word_t ncopy(word_t *src, word_t *dst, word_t len) {
  word_t count = 0;
  word_t val;

  while (len > 0) {
    val = *src++;
    *dst++ = val;
    if (val > 0)
      count++;
    len--;
  }
  return count;
}
/* $end ncopy */

int main() {
  word_t i, count;

  for (i = 0; i < 8; i++)
    src[i] = i + 1;
  count = ncopy(src, dst, 8);
  printf("count=%d\n", count);
  exit(0);
}

应用 PIPE 架构

由于 ncopy.rs 初始是 SEQ 架构，我们用 archlab-project/sim/src/architectures/builtin/pipe_std.rs 的内容完整替换 ncopy.rs 文件的内容。初步优化处理器架构，减少 ac。

添加 IOPQ 指令

查看 ncopy.ys 中的初始汇编代码。

################################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
################################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:
################################################################################
# You can modify this portion
    # Loop header
    xorq %rax,%rax      # count = 0;
    andq %rdx,%rdx      # len <= 0?
    jle Done            # if so, goto Done:

Loop:
    mrmovq (%rdi), %r10 # read val from src...
    rmmovq %r10, (%rsi) # ...and store it to dst
    andq %r10, %r10     # val <= 0?
    jle Npos            # if so, goto Npos:
    irmovq $1, %r10
    addq %r10, %rax     # count++

Npos:
    irmovq $1, %r10
    subq %r10, %rdx     # len--
    irmovq $8, %r10
    addq %r10, %rdi     # src++
    addq %r10, %rsi     # dst++
    andq %rdx,%rdx      # len > 0?
    jg Loop             # if so, goto Loop:
Done:
    ret

# In grader, we will add a "trap: jmp trap" here, which traps your program in an
# infinite loop. Thus your function should always return instead of falling
# through till the end of the source code :)

发现每次进行 count++ 时，都需要先 irmovq 将立即数 1 加载到寄存器中，然后再进行加法操作。我们可以使用 IOPQ 指令来直接将立即数与寄存器相加，从而减少指令数量和执行时间。为此，我们需要在 ncopy.rs 中添加 IOPQ 指令。具体步骤可参考 Part B 部分针对 seq_full.rs 的修改。

循环展开

至此，我们已经将架构从 SEQ 升级到了 PIPE，并通过 IOPQ 指令减少了指令数量。进一步观察发现，每一次循环最后都需要经历一次条件跳转，这意味着每次循环都要经历控制冒险，但却只处理了一个元素。我们可以通过循环展开，在一次循环中处理多个元素来进一步提升 cpe。

循环展开参考代码如下：

Loop:
    mrmovq (%rdi), %rcx
    mrmovq 8(%rdi), %rbx
    mrmovq 16(%rdi), %rbp
    mrmovq 24(%rdi), %r8
    mrmovq 32(%rdi), %r9
    mrmovq 40(%rdi), %r10
    mrmovq 48(%rdi), %r11
    mrmovq 56(%rdi), %r12
    mrmovq 64(%rdi), %r13
    mrmovq 72(%rdi), %r14
L1:
    rmmovq %rcx, (%rsi)
    andq %rcx, %rcx
    jle L2
    iaddq $1, %rax
L2:
    rmmovq %rbx, 8(%rsi)
    andq %rbx, %rbx
    jle L3
    iaddq $1, %rax
L3:
    rmmovq %rbp, 16(%rsi)
    andq %rbp, %rbp
    jle L4
    iaddq $1, %rax
L4:
    rmmovq %r8, 24(%rsi)
    andq %r8, %r8
    jle L5
    iaddq $1, %rax
L5:
    rmmovq %r9, 32(%rsi)
    andq %r9, %r9
    jle L6
    iaddq $1, %rax
L6:
    rmmovq %r10, 40(%rsi)
    andq %r10, %r10
    jle L7
    iaddq $1, %rax
L7:
    rmmovq %r11, 48(%rsi)
    andq %r11, %r11
    jle L8
    iaddq $1, %rax
L8:
    rmmovq %r12, 56(%rsi)
    andq %r12, %r12
    jle L9
    iaddq $1, %rax
L9:
    rmmovq %r13, 64(%rsi)
    andq %r13, %r13
    jle L10
    iaddq $1, %rax
L10:
    rmmovq %r14, 72(%rsi)
    andq %r14, %r14
    jle N
    iaddq $1, %rax
N:
    iaddq $80, %rdi     # src++
    iaddq $80, %rsi     # dst++
    isubq $10, %rdx     # len--
    jge Loop            # if so, goto Loop

余数二分处理

循环展开带来的问题是当 len 不是展开倍数时，剩余的元素无法通过展开后的循环处理。为了解决这个问题，我们需要针对不同的余数情况编写不同的处理代码，形成类似跳转表的结构。为了避免代码重复和过多的跳转，我们可以针对每个剩余的元素编写处理代码，Rk 对应对第 k 个剩余元素的处理，然后将这些代码块按照从大到小串联起来。对于不同的余数情况，我们只需要跳转到对应的 Rk 代码块即可，程序会顺序执行后续的对第 k-1、k-2、…、1 个剩余元素的处理代码。

余数处理参考代码如下：

R9:
    mrmovq 64(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 64(%rsi)
    jle R8
    iaddq $1, %rax
R8:
    mrmovq 56(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 56(%rsi)
    jle R7
    iaddq $1, %rax
R7:
    mrmovq 48(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 48(%rsi)
    jle R6
    iaddq $1, %rax
R6:
    mrmovq 40(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 40(%rsi)
    jle R5
    iaddq $1, %rax
R5:
    mrmovq 32(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 32(%rsi)
    jle R4
    iaddq $1, %rax
R4:
    mrmovq 24(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 24(%rsi)
    jle R3
    iaddq $1, %rax
R3:
    mrmovq 16(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 16(%rsi)
    jle R2
    iaddq $1, %rax
R2:
    mrmovq 8(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 8(%rsi)
    jle R1
    iaddq $1, %rax
R1:
    mrmovq (%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, (%rsi)
    jle Done
    iaddq $1, %rax

注意这里将 andq 指令插入到 mrmovq 和 rmmovq 之间，从而避免了流水线暂停。

至于跳转部分，我们可以对余数反复减一并判断是否为零来跳转到对应的代码块。也可以选择使用二分思想来减少跳转次数。

完整 ncopy.ys 参考代码如下：

################################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
################################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:
################################################################################
# You can modify this portion
    # Loop header
    xorq %rax,%rax      # count = 0;
    isubq $10, %rdx
    jl R

Loop:
    mrmovq (%rdi), %rcx
    mrmovq 8(%rdi), %rbx
    mrmovq 16(%rdi), %rbp
    mrmovq 24(%rdi), %r8
    mrmovq 32(%rdi), %r9
    mrmovq 40(%rdi), %r10
    mrmovq 48(%rdi), %r11
    mrmovq 56(%rdi), %r12
    mrmovq 64(%rdi), %r13
    mrmovq 72(%rdi), %r14
L1:
    rmmovq %rcx, (%rsi)
    andq %rcx, %rcx
    jle L2
    iaddq $1, %rax
L2:
    rmmovq %rbx, 8(%rsi)
    andq %rbx, %rbx
    jle L3
    iaddq $1, %rax
L3:
    rmmovq %rbp, 16(%rsi)
    andq %rbp, %rbp
    jle L4
    iaddq $1, %rax
L4:
    rmmovq %r8, 24(%rsi)
    andq %r8, %r8
    jle L5
    iaddq $1, %rax
L5:
    rmmovq %r9, 32(%rsi)
    andq %r9, %r9
    jle L6
    iaddq $1, %rax
L6:
    rmmovq %r10, 40(%rsi)
    andq %r10, %r10
    jle L7
    iaddq $1, %rax
L7:
    rmmovq %r11, 48(%rsi)
    andq %r11, %r11
    jle L8
    iaddq $1, %rax
L8:
    rmmovq %r12, 56(%rsi)
    andq %r12, %r12
    jle L9
    iaddq $1, %rax
L9:
    rmmovq %r13, 64(%rsi)
    andq %r13, %r13
    jle L10
    iaddq $1, %rax
L10:
    rmmovq %r14, 72(%rsi)
    andq %r14, %r14
    jle N
    iaddq $1, %rax
N:
    iaddq $80, %rdi     # src++
    iaddq $80, %rsi     # dst++
    isubq $10, %rdx     # len--
    jge Loop            # if so, goto Loop

R:
    iaddq $10, %rdx
    jle Done
    isubq $5, %rdx
    je R5
    jg R6_9
R1_4:
    iaddq $2, %rdx
    jl R1_2
R3_4:
    je R3
    jmp R4
R1_2:
    iaddq $1, %rdx
    je R2
    jmp R1
R6_9:
    isubq $2, %rdx
    jg R8_9
R6_7:
    je R7
    jmp R6
R8_9:
    isubq $1, %rdx
    je R8

R9:
    mrmovq 64(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 64(%rsi)
    jle R8
    iaddq $1, %rax
R8:
    mrmovq 56(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 56(%rsi)
    jle R7
    iaddq $1, %rax
R7:
    mrmovq 48(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 48(%rsi)
    jle R6
    iaddq $1, %rax
R6:
    mrmovq 40(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 40(%rsi)
    jle R5
    iaddq $1, %rax
R5:
    mrmovq 32(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 32(%rsi)
    jle R4
    iaddq $1, %rax
R4:
    mrmovq 24(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 24(%rsi)
    jle R3
    iaddq $1, %rax
R3:
    mrmovq 16(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 16(%rsi)
    jle R2
    iaddq $1, %rax
R2:
    mrmovq 8(%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, 8(%rsi)
    jle R1
    iaddq $1, %rax
R1:
    mrmovq (%rdi), %rcx
    andq %rcx, %rcx
    rmmovq %rcx, (%rsi)
    jle Done
    iaddq $1, %rax

Done:
    ret

# In grader, we will add a "trap: jmp trap" here, which traps your program in an
# infinite loop. Thus your function should always return instead of falling
# through till the end of the source code :)

减少关键路径长度

目前我们的 ncopy.rs 采用的是 PIPE 架构，关键路径长度为 4，正是执行阶段 e_cnd 的依赖路径。

回忆 dstE 需要 e_cnd 更新的情况，为条件传送 CMOVX 指令。而 CMOVX 指令并不更新条件码。由此可知，更新条件码和需要计算 e_cnd 更新 dstE 的情况不会在执行阶段同时发生。因此，我们可以在执行阶段寄存器中新增 pre_cc 记录上一条指令的条件码，用于通过 cond 单元计算 e_cnd，从而降低关键路径长度为 3。

ncopy.rs 参考修改如下：

crate::define_stages! {
    // ...
    ExecuteStage e {
        stat: Stat = Bub, icode: u8 = NOP, ifun: u8 = 0,
        valC: u64 = 0,
        valA: u64 = 0, valB: u64 = 0,
        dstE: u8 = RNONE, dstM: u8 = RNONE,
        srcA: u8 = RNONE, srcB: u8 = RNONE,
        pre_cc: ConditionCode = CC_INIT
    }
    // ...
}

sim_macro::hcl! {
// ...
:==============================: Fetch Stage :================================:
// ...
:=======================: Decode and Write Back Stage :=======================:
// ...
@set_stage(e, {
    icode: d_icode,
    ifun: d_ifun,
    stat: d_stat,
    valC: d_valC,
    srcA: d_srcA,
    srcB: d_srcB,
    valA: d_valA,
    valB: d_valB,
    dstE: d_dstE,
    dstM: d_dstM,
    pre_cc: cc,
});
:==============================: Execute Stage :===============================:
// ...
ConditionCode cc = reg_cc.cc;
u8 e_ifun = E.ifun;
ConditionCode pre_cc = E.pre_cc;

@set_input(cond, {
    cc: pre_cc,
    condfun: e_ifun,
});
// ...
:===============================: Memory Stage :===============================:
// ...
:=============================: Write Back Stage :=============================:
// ...
:========================: Pipeline Register Control :=========================:
// ...
}