Vortex Exercise 2

本次练习中，需要为GPU硬件添加两个新的机器性能监控（MPM）计数器，用于计算内核执行后的线程束效率。这两个计数器分别是 total_issued_warps（线程束发射总数）和 total_active_threads（活跃线程总数）。线程束效率等于活跃线程总数除以“线程束发射总数 × 每个线程束的线程数”，即 warp_efficiency = total_active_threads / (total_issued_warps × threads_per_warp)。

以下补丁直接应用到提交 8b10348e (minor update) 上即可

Exercise2_8b103_7625c.patch

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294


diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv
index 7a48eb63..0266ae55 100644
--- a/hw/rtl/VX_gpu_pkg.sv
+++ b/hw/rtl/VX_gpu_pkg.sv
@@ -708,6 +708,9 @@ package VX_gpu_pkg;
     typedef struct packed {
         logic [PERF_CTR_BITS-1:0] idles;
         logic [PERF_CTR_BITS-1:0] stalls;
+        // exercise 2
+        logic [PERF_CTR_BITS-1:0] total_issued_warps;
+        logic [PERF_CTR_BITS-1:0] total_active_threads;
     } sched_perf_t;
 
     typedef struct packed {
diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh
index 7ff3326b..6f515760 100644
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@@ -215,4 +215,12 @@
 `define VX_CSR_NUM_CORES                12'hFC2
 `define VX_CSR_LOCAL_MEM_BASE           12'hFC3
 
+// exercise 1, 2
+`define VX_CSR_MPM_TOTAL_ISSUED_WARPS     12'hB03
+`define VX_CSR_MPM_TOTAL_ISSUED_WARPS_H   12'hB83
+`define VX_CSR_MPM_TOTAL_ACTIVE_THREADS   12'hB04
+`define VX_CSR_MPM_TOTAL_ACTIVE_THREADS_H 12'hB84
+
+`define VX_DCR_MPM_CLASS_3              3
+
 `endif // VX_TYPES_VH
diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv
index 96c29736..5a48d6cc 100644
--- a/hw/rtl/core/VX_csr_data.sv
+++ b/hw/rtl/core/VX_csr_data.sv
@@ -277,6 +277,15 @@ import VX_fpu_pkg::*;
                         default:;
                         endcase
                     end
+                    // exercise 2
+                    `VX_DCR_MPM_CLASS_3: begin
+                        case (read_addr)
+                        // Add your custom counters here for Class 3:
+                        `CSR_READ_64(`VX_CSR_MPM_TOTAL_ISSUED_WARPS, read_data_ro_w, pipeline_perf.sched.total_issued_warps);
+                        `CSR_READ_64(`VX_CSR_MPM_TOTAL_ACTIVE_THREADS, read_data_ro_w, pipeline_perf.sched.total_active_threads);
+                        default:;
+                        endcase
+                    end
                     default:;
                     endcase
                 `endif
diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv
index a89a0b4d..60a10b94 100644
--- a/hw/rtl/core/VX_schedule.sv
+++ b/hw/rtl/core/VX_schedule.sv
@@ -417,6 +417,9 @@ module VX_schedule import VX_gpu_pkg::*; #(
 `ifdef PERF_ENABLE
     reg [PERF_CTR_BITS-1:0] perf_sched_idles;
     reg [PERF_CTR_BITS-1:0] perf_sched_stalls;
+    // exercise 2
+    reg [PERF_CTR_BITS-1:0] perf_total_issued_warps;
+    reg [PERF_CTR_BITS-1:0] perf_total_active_threads;
 
     wire schedule_idle = ~schedule_valid;
     wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
@@ -425,14 +428,25 @@ module VX_schedule import VX_gpu_pkg::*; #(
         if (reset) begin
             perf_sched_idles  <= '0;
             perf_sched_stalls <= '0;
+            // exercise 2
+            perf_total_issued_warps <= 0;
+            perf_total_active_threads <= 0;
         end else begin
             perf_sched_idles  <= perf_sched_idles + PERF_CTR_BITS'(schedule_idle);
             perf_sched_stalls <= perf_sched_stalls + PERF_CTR_BITS'(schedule_stall);
+            // exercise 2
+            if (schedule_if_fire) begin
+            perf_total_issued_warps <= perf_total_issued_warps + 1;
+            perf_total_active_threads <= perf_total_active_threads + $countones(schedule_if.data.tmask);  // $countones 计算活跃线程的个数
+            end
         end
     end
 
     assign sched_perf.idles = perf_sched_idles;
     assign sched_perf.stalls = perf_sched_stalls;
+    // exercise 2
+    assign sched_perf.total_issued_warps = perf_total_issued_warps;
+    assign sched_perf.total_active_threads = perf_total_active_threads;
 `endif
 
 `ifdef DBG_TRACE_PIPELINE
diff --git a/runtime/common/callbacks.inc b/runtime/common/callbacks.inc
index 84a77718..5230d986 100644
--- a/runtime/common/callbacks.inc
+++ b/runtime/common/callbacks.inc
@@ -20,7 +20,7 @@ struct vx_buffer {
 extern int vx_dev_init(callbacks_t* callbacks) {
   if (nullptr == callbacks)
     return -1;
-
+  // lambda function
   callbacks->dev_open = [](vx_device_h* hdevice)->int {
     if (nullptr == hdevice)
       return  -1;
diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h
index 6e3bda07..736225b2 100644
--- a/runtime/include/vortex.h
+++ b/runtime/include/vortex.h
@@ -26,7 +26,7 @@ extern "C" {
 typedef void* vx_device_h;
 typedef void* vx_buffer_h;
 
-// device caps ids
+// device caps ids (Device Capabilities Identifiers)
 #define VX_CAPS_VERSION             0x0
 #define VX_CAPS_NUM_THREADS         0x1
 #define VX_CAPS_NUM_WARPS           0x2
diff --git a/runtime/rtlsim/vortex.cpp b/runtime/rtlsim/vortex.cpp
index ccf61e16..85e2671a 100644
--- a/runtime/rtlsim/vortex.cpp
+++ b/runtime/rtlsim/vortex.cpp
@@ -184,6 +184,7 @@ public:
     return 0;
   }
 
+  // 要确保每次只跑一个仿真
   int start(uint64_t krnl_addr, uint64_t args_addr) {
     // ensure prior run completed
     if (future_.valid()) {
diff --git a/runtime/stub/utils.cpp b/runtime/stub/utils.cpp
index dde0a8bf..1ccab2fb 100644
--- a/runtime/stub/utils.cpp
+++ b/runtime/stub/utils.cpp
@@ -216,6 +216,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
   uint64_t mem_bank_stalls = 0;
 
   uint64_t num_cores;
+
+  // PERF: CLASS_3 exercise 1, 2
+  uint64_t total_issued_warps = 0;
+  uint64_t total_active_threads = 0;
+
   CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
     return err;
   });
@@ -588,6 +593,39 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
         });
       }
     } break;
+
+    // exercise 1, 2
+    case VX_DCR_MPM_CLASS_3:
+    {
+      uint64_t threads_per_warp;
+      CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
+        return err;
+      });
+      // Retrieve total_issued_warps and total_active_threads for each core
+
+      // Query total_issued_warps for the core
+      uint64_t total_issued_warps_per_core;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_TOTAL_ISSUED_WARPS, core_id, &total_issued_warps_per_core), {
+        return err;
+      });
+
+      // Query total_active_threads for the core
+      uint64_t total_active_threads_per_core;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_TOTAL_ACTIVE_THREADS, core_id, &total_active_threads_per_core), {
+        return err;
+      });
+
+      // Print total_issued_warps and total_active_threads
+      if (num_cores > 1) {
+        // Calculate and print warp efficiency
+        int warp_efficiency = calcAvgPercent(total_active_threads_per_core, total_issued_warps_per_core * threads_per_warp);
+        fprintf(stream, "PERF: core%d: Warp Efficiency=%d%%\n", core_id, warp_efficiency);
+      }
+
+      // Accumulate totals for all cores
+      total_issued_warps += total_issued_warps_per_core;
+      total_active_threads += total_active_threads_per_core;
+    } break;
     default:
       break;
     }
@@ -679,6 +717,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
       fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization);
     }
   } break;
+  // exercise 1, 2
+  case VX_DCR_MPM_CLASS_3: {
+    uint64_t threads_per_warp;
+    CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
+      return err;
+    });
+    // Calculate and print warp efficiency
+    int warp_efficiency = calcAvgPercent(total_active_threads, total_issued_warps * threads_per_warp);
+    fprintf(stream, "PERF: Warp Efficiency=%d%%\n", warp_efficiency);
+    // exercise 1, 2
+    fprintf(stream, "PERF: total_active_threads: %ld, total_issued_warps: %ld, threads_per_warp: %ld\n", total_active_threads, total_issued_warps, threads_per_warp);
+
+  }
+
   default:
     break;
   }
diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp
index e95d304c..d1074349 100644
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@@ -229,6 +229,10 @@ void Core::schedule() {
   // advance to fetch stage
   fetch_latch_.push(trace);
   pending_instrs_.push_back(trace);
+
+  // track active threads   exercise 1
+  perf_stats_.total_issued_warps += 1;
+  perf_stats_.total_active_threads += trace->tmask.count();
 }
 
 void Core::fetch() {
diff --git a/sim/simx/core.h b/sim/simx/core.h
index a8b674d0..fc9a3963 100644
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@@ -69,6 +69,9 @@ public:
     uint64_t stores;
     uint64_t ifetch_latency;
     uint64_t load_latency;
+    // exercise 1
+    uint64_t total_issued_warps;
+    uint64_t total_active_threads;
 
     PerfStats()
       : cycles(0)
@@ -96,6 +99,9 @@ public:
       , stores(0)
       , ifetch_latency(0)
       , load_latency(0)
+      // exercise 1
+      , total_issued_warps(0)
+      , total_active_threads(0)
     {}
   };
 
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index 3eb62f9c..3d6c4024 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -553,6 +553,14 @@ Word Emulator::get_csr(uint32_t addr, uint32_t wid, uint32_t tid) {
         CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
         }
       } break;
+      // exercise 1
+      case VX_DCR_MPM_CLASS_3: {
+        switch (addr) {
+        // Add your custom counters here for Class 3:
+          CSR_READ_64(VX_CSR_MPM_TOTAL_ISSUED_WARPS, core_perf.total_issued_warps);
+          CSR_READ_64(VX_CSR_MPM_TOTAL_ACTIVE_THREADS, core_perf.total_active_threads);
+        }
+      } break;
       default:
         std::cerr << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
         std::abort();
diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp
index f7afbac4..cb6dda8b 100644
--- a/tests/regression/demo/main.cpp
+++ b/tests/regression/demo/main.cpp
@@ -136,6 +136,11 @@ int main(int argc, char *argv[]) {
   uint32_t num_points = count * total_threads;
   uint32_t buf_size = num_points * sizeof(TYPE);
 
+  std::cout << "count: " << count << std::endl;
+  std::cout << "num_cores: " << num_cores << std::endl;
+  std::cout << "num_warps: " << num_warps << std::endl;
+  std::cout << "num_threads : " << num_threads << std::endl;
+
   std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
   std::cout << "number of points: " << num_points << std::endl;
   std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
@@ -178,11 +183,11 @@ int main(int argc, char *argv[]) {
 
   // Upload kernel binary
   std::cout << "Upload kernel binary" << std::endl;
-  RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));  // 把 kernel 的有效二进制( 前两个8 字节虚拟地址信息除外 )拷贝到 krnl_buffer
 
   // upload kernel argument
   std::cout << "upload kernel argument" << std::endl;
-  RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
+  RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));  // 把 kernel_arg 拷贝到 args_buffer
 
   // start device
   std::cout << "start device" << std::endl;

解析

rtlsim 的运行流程

main.cpp => RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
vx_start (runtime/stub/vortex.cpp ) => callbacks->start ( runtime/common/callbacks.inc )
start (vortex/runtime/rtlsim/vortex.cpp) => processor_.run()

Processor::run() => Processor::Impl::run() (vortex/sim/rtlsim/processor.cpp)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


    // Drives one complete run of the simulated device: reset it, release
    // reset, tick until `busy` rises and then until it falls again, and
    // finally put the device back into reset.
    void run() {
    #ifndef NDEBUG
        // Debug builds only: log the current `timestamp` when run() is entered.
        std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
    #endif

        // reset device
        this->reset();

        // start: deassert reset and set mem_req_ready to 1 on every memory bank
        device_->reset = 0;
        for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
        device_->mem_req_ready[b] = 1;
        }

        // wait on device to go busy
        // (each tick() advances the simulation; loop exits once busy is non-zero)
        while (!device_->busy) {
        this->tick();
        }

        // wait on device to go idle
        // (keep ticking until busy drops back to zero)
        while (device_->busy) {
        this->tick();
        }

        // stop: re-assert reset on the device
        device_->reset = 1;

        // NOTE(review): presumably flushes console output buffered during the
        // run — confirm against cout_flush()'s definition.
        this->cout_flush();
    }

这就是使用 verilator 仿真的验证代码, 代码很简单: 复位后不断调用 tick 函数推进仿真, 先等待设备进入 busy 状态, 再等待其回到空闲状态, 最后重新拉高 reset 即可

解题步骤

添加统计线程束效率的寄存器

按照 Exercise2_8b103_7625c.patch 的 modified: hw/rtl/VX_gpu_pkg.sv 修改

添加获取 total_issued_warps 和 total_active_threads 的代码

按照 Exercise2_8b103_7625c.patch 的modified: hw/rtl/core/VX_csr_data.sv 修改

根据 6 级流水线的结构, 我们在 schedule 阶段更新这两个用于统计线程束效率的计数器

按照 Exercise2_8b103_7625c.patch 的modified: hw/rtl/core/VX_schedule.sv 修改

调试技巧

如果 runtime/rtlsim 目录下的代码有改动, 需要先进入 runtime/rtlsim 执行 make clean 再 make 重新生成运行时的库
看日志文件可以使用 lnav, 看起来效率高一些

运行测试时由命令行传入count

1
2
3
4
5


mkdir build
cd build
../configure --xlen=32 --tooldir=$HOME/tools   # 构建 build 目录， 生成 makefile 等文件，把必要的文件拷贝到build目录
make -j32 -s   # 这个过程会比较久， 需要耐心等待
./ci/blackbox.sh --cores=4 --app=demo --driver=rtlsim --perf=3 --args="-n64"

生成波形的命令

1
2


cd build
./ci/blackbox.sh --cores=4 --app=demo --driver=rtlsim --perf=3 --args="-n64" --debug=1

这会在 build 目录生成一个 trace.vcd 的波形文件, 可以用 gtkwave 打开

vortex rtlsim 默认会排除 libs 文件夹下的库模块以减少波形文件大小

要启用完整追踪，可以使用：

1

CONFIGS="-DTRACING_ALL" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1