1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
|
diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv
index 7a48eb63..0266ae55 100644
--- a/hw/rtl/VX_gpu_pkg.sv
+++ b/hw/rtl/VX_gpu_pkg.sv
@@ -708,6 +708,9 @@ package VX_gpu_pkg;
typedef struct packed {
logic [PERF_CTR_BITS-1:0] idles;
logic [PERF_CTR_BITS-1:0] stalls;
+ // exercise 2
+ logic [PERF_CTR_BITS-1:0] total_issued_warps;
+ logic [PERF_CTR_BITS-1:0] total_active_threads;
} sched_perf_t;
typedef struct packed {
diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh
index 7ff3326b..6f515760 100644
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@@ -215,4 +215,12 @@
`define VX_CSR_NUM_CORES 12'hFC2
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
+// exercise 1, 2
+`define VX_CSR_MPM_TOTAL_ISSUED_WARPS 12'hB03
+`define VX_CSR_MPM_TOTAL_ISSUED_WARPS_H 12'hB83
+`define VX_CSR_MPM_TOTAL_ACTIVE_THREADS 12'hB04
+`define VX_CSR_MPM_TOTAL_ACTIVE_THREADS_H 12'hB84
+
+`define VX_DCR_MPM_CLASS_3 3
+
`endif // VX_TYPES_VH
diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv
index 96c29736..5a48d6cc 100644
--- a/hw/rtl/core/VX_csr_data.sv
+++ b/hw/rtl/core/VX_csr_data.sv
@@ -277,6 +277,15 @@ import VX_fpu_pkg::*;
default:;
endcase
end
+ // exercise 2
+ `VX_DCR_MPM_CLASS_3: begin
+ case (read_addr)
+ // Add your custom counters here for Class 3:
+ `CSR_READ_64(`VX_CSR_MPM_TOTAL_ISSUED_WARPS, read_data_ro_w, pipeline_perf.sched.total_issued_warps);
+ `CSR_READ_64(`VX_CSR_MPM_TOTAL_ACTIVE_THREADS, read_data_ro_w, pipeline_perf.sched.total_active_threads);
+ default:;
+ endcase
+ end
default:;
endcase
`endif
diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv
index a89a0b4d..60a10b94 100644
--- a/hw/rtl/core/VX_schedule.sv
+++ b/hw/rtl/core/VX_schedule.sv
@@ -417,6 +417,9 @@ module VX_schedule import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
reg [PERF_CTR_BITS-1:0] perf_sched_idles;
reg [PERF_CTR_BITS-1:0] perf_sched_stalls;
+ // exercise 2
+ reg [PERF_CTR_BITS-1:0] perf_total_issued_warps;
+ reg [PERF_CTR_BITS-1:0] perf_total_active_threads;
wire schedule_idle = ~schedule_valid;
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
@@ -425,14 +428,25 @@ module VX_schedule import VX_gpu_pkg::*; #(
if (reset) begin
perf_sched_idles <= '0;
perf_sched_stalls <= '0;
+ // exercise 2
+ perf_total_issued_warps <= 0;
+ perf_total_active_threads <= 0;
end else begin
perf_sched_idles <= perf_sched_idles + PERF_CTR_BITS'(schedule_idle);
perf_sched_stalls <= perf_sched_stalls + PERF_CTR_BITS'(schedule_stall);
+ // exercise 2
+ if (schedule_if_fire) begin
+ perf_total_issued_warps <= perf_total_issued_warps + 1;
+ perf_total_active_threads <= perf_total_active_threads + $countones(schedule_if.data.tmask); // $countones counts the set bits of tmask, i.e. the number of active threads
+ end
end
end
assign sched_perf.idles = perf_sched_idles;
assign sched_perf.stalls = perf_sched_stalls;
+ // exercise 2
+ assign sched_perf.total_issued_warps = perf_total_issued_warps;
+ assign sched_perf.total_active_threads = perf_total_active_threads;
`endif
`ifdef DBG_TRACE_PIPELINE
diff --git a/runtime/common/callbacks.inc b/runtime/common/callbacks.inc
index 84a77718..5230d986 100644
--- a/runtime/common/callbacks.inc
+++ b/runtime/common/callbacks.inc
@@ -20,7 +20,7 @@ struct vx_buffer {
extern int vx_dev_init(callbacks_t* callbacks) {
if (nullptr == callbacks)
return -1;
-
+ // lambda function
callbacks->dev_open = [](vx_device_h* hdevice)->int {
if (nullptr == hdevice)
return -1;
diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h
index 6e3bda07..736225b2 100644
--- a/runtime/include/vortex.h
+++ b/runtime/include/vortex.h
@@ -26,7 +26,7 @@ extern "C" {
typedef void* vx_device_h;
typedef void* vx_buffer_h;
-// device caps ids
+// device caps ids (Device Capabilities Identifiers)
#define VX_CAPS_VERSION 0x0
#define VX_CAPS_NUM_THREADS 0x1
#define VX_CAPS_NUM_WARPS 0x2
diff --git a/runtime/rtlsim/vortex.cpp b/runtime/rtlsim/vortex.cpp
index ccf61e16..85e2671a 100644
--- a/runtime/rtlsim/vortex.cpp
+++ b/runtime/rtlsim/vortex.cpp
@@ -184,6 +184,7 @@ public:
return 0;
}
+ // Make sure only one simulation runs at a time
int start(uint64_t krnl_addr, uint64_t args_addr) {
// ensure prior run completed
if (future_.valid()) {
diff --git a/runtime/stub/utils.cpp b/runtime/stub/utils.cpp
index dde0a8bf..1ccab2fb 100644
--- a/runtime/stub/utils.cpp
+++ b/runtime/stub/utils.cpp
@@ -216,6 +216,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t mem_bank_stalls = 0;
uint64_t num_cores;
+
+ // PERF: CLASS_3 exercise 1, 2
+ uint64_t total_issued_warps = 0;
+ uint64_t total_active_threads = 0;
+
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
return err;
});
@@ -588,6 +593,39 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
});
}
} break;
+
+ // exercise 1, 2
+ case VX_DCR_MPM_CLASS_3:
+ {
+ uint64_t threads_per_warp;
+ CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
+ return err;
+ });
+ // Retrieve total_issued_warps and total_active_threads for each core
+
+ // Query total_issued_warps for the core
+ uint64_t total_issued_warps_per_core;
+ CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_TOTAL_ISSUED_WARPS, core_id, &total_issued_warps_per_core), {
+ return err;
+ });
+
+ // Query total_active_threads for the core
+ uint64_t total_active_threads_per_core;
+ CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_TOTAL_ACTIVE_THREADS, core_id, &total_active_threads_per_core), {
+ return err;
+ });
+
+ // Print total_issued_warps and total_active_threads
+ if (num_cores > 1) {
+ // Calculate and print warp efficiency
+ int warp_efficiency = calcAvgPercent(total_active_threads_per_core, total_issued_warps_per_core * threads_per_warp);
+ fprintf(stream, "PERF: core%d: Warp Efficiency=%d%%\n", core_id, warp_efficiency);
+ }
+
+ // Accumulate totals for all cores
+ total_issued_warps += total_issued_warps_per_core;
+ total_active_threads += total_active_threads_per_core;
+ } break;
default:
break;
}
@@ -679,6 +717,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization);
}
} break;
+ // exercise 1, 2
+ case VX_DCR_MPM_CLASS_3: {
+ uint64_t threads_per_warp;
+ CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
+ return err;
+ });
+ // Calculate and print warp efficiency
+ int warp_efficiency = calcAvgPercent(total_active_threads, total_issued_warps * threads_per_warp);
+ fprintf(stream, "PERF: Warp Efficiency=%d%%\n", warp_efficiency);
+ // exercise 1, 2
+ fprintf(stream, "PERF: total_active_threads: %ld, total_issued_warps: %ld, threads_per_warp: %ld\n", total_active_threads, total_issued_warps, threads_per_warp);
+
+ }
+
default:
break;
}
diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp
index e95d304c..d1074349 100644
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@@ -229,6 +229,10 @@ void Core::schedule() {
// advance to fetch stage
fetch_latch_.push(trace);
pending_instrs_.push_back(trace);
+
+ // track active threads exercise 1
+ perf_stats_.total_issued_warps += 1;
+ perf_stats_.total_active_threads += trace->tmask.count();
}
void Core::fetch() {
diff --git a/sim/simx/core.h b/sim/simx/core.h
index a8b674d0..fc9a3963 100644
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@@ -69,6 +69,9 @@ public:
uint64_t stores;
uint64_t ifetch_latency;
uint64_t load_latency;
+ // exercise 1
+ uint64_t total_issued_warps;
+ uint64_t total_active_threads;
PerfStats()
: cycles(0)
@@ -96,6 +99,9 @@ public:
, stores(0)
, ifetch_latency(0)
, load_latency(0)
+ // exercise 1
+ , total_issued_warps(0)
+ , total_active_threads(0)
{}
};
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index 3eb62f9c..3d6c4024 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -553,6 +553,14 @@ Word Emulator::get_csr(uint32_t addr, uint32_t wid, uint32_t tid) {
CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
}
} break;
+ // exercise 1
+ case VX_DCR_MPM_CLASS_3: {
+ switch (addr) {
+ // Add your custom counters here for Class 3:
+ CSR_READ_64(VX_CSR_MPM_TOTAL_ISSUED_WARPS, core_perf.total_issued_warps);
+ CSR_READ_64(VX_CSR_MPM_TOTAL_ACTIVE_THREADS, core_perf.total_active_threads);
+ }
+ } break;
default:
std::cerr << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
std::abort();
diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp
index f7afbac4..cb6dda8b 100644
--- a/tests/regression/demo/main.cpp
+++ b/tests/regression/demo/main.cpp
@@ -136,6 +136,11 @@ int main(int argc, char *argv[]) {
uint32_t num_points = count * total_threads;
uint32_t buf_size = num_points * sizeof(TYPE);
+ std::cout << "count: " << count << std::endl;
+ std::cout << "num_cores: " << num_cores << std::endl;
+ std::cout << "num_warps: " << num_warps << std::endl;
+ std::cout << "num_threads : " << num_threads << std::endl;
+
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
@@ -178,11 +183,11 @@ int main(int argc, char *argv[]) {
// Upload kernel binary
std::cout << "Upload kernel binary" << std::endl;
- RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
+ RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); // copy the kernel's effective binary (excluding the first two 8-byte virtual-address fields) into krnl_buffer
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
- RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
+ RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer)); // copy kernel_arg into args_buffer
// start device
std::cout << "start device" << std::endl;
|