Vortex Exercise 1

Exercise 1

在本练习中,将为 SimX 周期级模拟器添加两个新的机器性能监控(MPM)计数器,用于计算内核执行后的 GPU 线程束效率。这两个计数器分别是 total_issued_warps(已发射线程束总数)和 total_active_threads(活跃线程总数)。线程束效率 = total_active_threads / (total_issued_warps × 每个线程束的线程数),即活跃线程数占 GPU 流水线中已发射线程束所能容纳的线程总数的比例。

以下补丁直接打到提交 8b10348e (minor update) 上即可

Exercise1_8b103_cf1a4.patch

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh
index 7ff3326b..e5fae429 100644
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@@ -215,4 +215,12 @@
 `define VX_CSR_NUM_CORES                12'hFC2
 `define VX_CSR_LOCAL_MEM_BASE           12'hFC3
 
+// exercise 1
+`define VX_CSR_MPM_TOTAL_ISSUED_WARPS     12'hB03
+`define VX_CSR_MPM_TOTAL_ISSUED_WARPS_H   12'hB83
+`define VX_CSR_MPM_TOTAL_ACTIVE_THREADS   12'hB04
+`define VX_CSR_MPM_TOTAL_ACTIVE_THREADS_H 12'hB84
+
+`define VX_DCR_MPM_CLASS_3              3
+
 `endif // VX_TYPES_VH
diff --git a/runtime/stub/utils.cpp b/runtime/stub/utils.cpp
index dde0a8bf..e7d81104 100644
--- a/runtime/stub/utils.cpp
+++ b/runtime/stub/utils.cpp
@@ -216,6 +216,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
   uint64_t mem_bank_stalls = 0;
 
   uint64_t num_cores;
+
+  // PERF: CLASS_3 exercise 1
+  uint64_t total_issued_warps = 0;
+  uint64_t total_active_threads = 0;
+
   CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
     return err;
   });
@@ -588,6 +593,39 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
         });
       }
     } break;
+
+    // exercise 1
+    case VX_DCR_MPM_CLASS_3:
+    {
+      uint64_t threads_per_warp;
+      CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
+        return err;
+      });
+      // Retrieve total_issued_warps and total_active_threads for each core
+
+      // Query total_issued_warps for the core
+      uint64_t total_issued_warps_per_core;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_TOTAL_ISSUED_WARPS, core_id, &total_issued_warps_per_core), {
+        return err;
+      });
+
+      // Query total_active_threads for the core
+      uint64_t total_active_threads_per_core;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_TOTAL_ACTIVE_THREADS, core_id, &total_active_threads_per_core), {
+        return err;
+      });
+
+      // Print total_issued_warps and total_active_threads
+      if (num_cores > 1) {
+        // Calculate and print warp efficiency
+        int warp_efficiency = calcAvgPercent(total_active_threads_per_core, total_issued_warps_per_core * threads_per_warp);
+        fprintf(stream, "PERF: core%d: Warp Efficiency=%d%%\n", core_id, warp_efficiency);
+      }
+
+      // Accumulate totals for all cores
+      total_issued_warps += total_issued_warps_per_core;
+      total_active_threads += total_active_threads_per_core;
+    } break;
     default:
       break;
     }
@@ -679,6 +717,17 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
       fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization);
     }
   } break;
+  // exercise 1
+  case VX_DCR_MPM_CLASS_3: {
+    uint64_t threads_per_warp;
+    CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
+      return err;
+    });
+    // Calculate and print warp efficiency
+    int warp_efficiency = calcAvgPercent(total_active_threads, total_issued_warps * threads_per_warp);
+    fprintf(stream, "PERF: Warp Efficiency=%d%%\n", warp_efficiency);
+  }
+
   default:
     break;
   }
@@ -720,4 +769,4 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_l
   }
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp
index e95d304c..d1074349 100644
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@@ -229,6 +229,10 @@ void Core::schedule() {
   // advance to fetch stage
   fetch_latch_.push(trace);
   pending_instrs_.push_back(trace);
+
+  // track active threads   exercise 1
+  perf_stats_.total_issued_warps += 1;
+  perf_stats_.total_active_threads += trace->tmask.count();
 }
 
 void Core::fetch() {
@@ -479,4 +483,4 @@ void Core::set_satp(uint64_t satp) {
   emulator_.set_satp(satp); //JAEWON wit, tid???
   // emulator_.set_csr(VX_CSR_SATP,satp,0,0); //JAEWON wit, tid???
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/sim/simx/core.h b/sim/simx/core.h
index a8b674d0..fc9a3963 100644
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@@ -69,6 +69,9 @@ public:
     uint64_t stores;
     uint64_t ifetch_latency;
     uint64_t load_latency;
+    // exercise 1
+    uint64_t total_issued_warps;
+    uint64_t total_active_threads;
 
     PerfStats()
       : cycles(0)
@@ -96,6 +99,9 @@ public:
       , stores(0)
       , ifetch_latency(0)
       , load_latency(0)
+      // exercise 1
+      , total_issued_warps(0)
+      , total_active_threads(0)
     {}
   };
 
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index 3eb62f9c..3d6c4024 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -553,6 +553,14 @@ Word Emulator::get_csr(uint32_t addr, uint32_t wid, uint32_t tid) {
         CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
         }
       } break;
+      // exercise 1
+      case VX_DCR_MPM_CLASS_3: {
+        // Add your custom counters here for Class 3:
+        switch (addr) {
+          CSR_READ_64(VX_CSR_MPM_TOTAL_ISSUED_WARPS, core_perf.total_issued_warps);
+          CSR_READ_64(VX_CSR_MPM_TOTAL_ACTIVE_THREADS, core_perf.total_active_threads);
+        }
+      } break;
       default:
         std::cerr << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
         std::abort();

解析

simx 的运行流程大概是这样的:

  1. main.cpp => RT_CHECK(vx_start(device, krnl_buffer, args_buffer));

  2. vx_start (runtime/stub/vortex.cpp ) => callbacks->start ( runtime/common/callbacks.inc )

  3. start (vortex/runtime/simx/vortex.cpp) => processor_.run()

  4. Processor::run() => ProcessorImpl::run() (vortex/sim/simx/processor.cpp)

     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    
        // Runs the simulation until no cluster reports running() and returns the
        // combined exit code. The SimPlatform and this processor are reset first.
        int ProcessorImpl::run() {
        SimPlatform::instance().reset();
        this->reset();
    
        bool done;
        int exitcode = 0;
        do {
            // advance the simulation platform by one tick
            SimPlatform::instance().tick();
            done = true;
            for (auto cluster : clusters_) {
            if (cluster->running()) {
                // at least one cluster is still busy: another tick is needed
                done = false;
                continue;
            }
            // cluster is not running: OR its exit code into the overall result
            exitcode |= cluster->get_exitcode();
            }
            // once per tick, add the current pending-read count to the latency total
            perf_mem_latency_ += perf_mem_pending_reads_;
        } while (!done);
    
        return exitcode;
        }
    
  5. SimPlatform 里的方法的具体实现

    • processor (vortex/sim/simx/processor.cpp) 中实例化 clusters_
    1
    2
    3
    4
    
        // create clusters
        for (uint32_t i = 0; i < arch.num_clusters(); ++i) {
            clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_);
        }
    
    • 在 clusters (vortex/sim/simx/cluster.cpp) 中实例化 sockets_

    • 在 sockets (vortex/sim/simx/socket.cpp) 中实例化 cores_

    • 在core ( vortex/sim/simx/core.cpp ) 中实例化 SimObject实现 SimPlatform 的底层仿真实现
      例如:

       1
       2
       3
       4
       5
       6
       7
       8
       9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      
              // Advances the core by one cycle. The stage functions are called from
              // commit back to schedule, so schedule() runs last in every tick.
              void Core::tick() {
              this->commit();
              this->execute();
              this->issue();
              this->decode();
              this->fetch();
              this->schedule();
      
              // count this tick as one elapsed cycle
              ++perf_stats_.cycles;
              DPN(2, std::flush);
              }
      
              // Schedule stage: takes the next trace from the emulator, hands it to the
              // fetch stage, and updates the exercise-1 warp-efficiency counters.
              void Core::schedule() {
              auto trace = emulator_.step();
              if (trace == nullptr) {
                  // step() produced no trace: count an idle scheduler cycle and stop
                  ++perf_stats_.sched_idle;
                  return;
              }
      
              // suspend warp until decode
              emulator_.suspend(trace->wid);
      
              DT(3, "pipeline-schedule: " << *trace);
      
              // advance to fetch stage
              fetch_latch_.push(trace);
              pending_instrs_.push_back(trace);
      
              // track active threads   exercise 1
              // each scheduled trace is counted as one issued warp
              perf_stats_.total_issued_warps += 1;
              // presumably tmask.count() is the number of set bits in the thread mask,
              // i.e. the active threads of this warp -- TODO confirm against tmask's type
              perf_stats_.total_active_threads += trace->tmask.count();
              }
      

解题步骤

  • 找到实现计算性能统计的地方

    每次调用 vx_dev_close (vortex/runtime/stub/vortex.cpp) 都会执行 vx_dump_perf。这个函数已经实现了统计 VX_DCR_MPM_CLASS_CORE 和 VX_DCR_MPM_CLASS_MEM 的功能。
    分析 vx_dump_perf 可知, 只需要在仿真时传入命令行参数 --perf=xxx, xxx 就会赋值给 perf_class, 从而统计对应的 MPM_CLASS。
    我们只需要添加一个 VX_DCR_MPM_CLASS_3 分支 来实现用于计算内核执行后的GPU线程束效率即可

  • 添加定义获取性能统计变量的CSR 地址

    按照 Exercise1_8b103_cf1a4.patch 的 modified: hw/rtl/VX_types.vh 修改

    1
    2
    3
    4
    5
    6
    7
    
    // exercise 1
    `define VX_CSR_MPM_TOTAL_ISSUED_WARPS     12'hB03
    `define VX_CSR_MPM_TOTAL_ISSUED_WARPS_H   12'hB83
    `define VX_CSR_MPM_TOTAL_ACTIVE_THREADS   12'hB04
    `define VX_CSR_MPM_TOTAL_ACTIVE_THREADS_H 12'hB84
    
    `define VX_DCR_MPM_CLASS_3              3
    

    已经有了

    1
    2
    3
    
    `define VX_DCR_MPM_CLASS_NONE           0
    `define VX_DCR_MPM_CLASS_CORE           1
    `define VX_DCR_MPM_CLASS_MEM            2
    

    所以我们添加了一个

    1
    
    `define VX_DCR_MPM_CLASS_3              3
    

    由 vortex/sim/simx/emulator.cpp 的 get_csr 函数中可知

    1
    2
    
        if ((addr >= VX_CSR_MPM_BASE && addr < (VX_CSR_MPM_BASE + 32))
         || (addr >= VX_CSR_MPM_BASE_H && addr < (VX_CSR_MPM_BASE_H + 32))) {
    

    新定义的寄存器地址区间应该在 VX_CSR_MPM_BASE ~ (VX_CSR_MPM_BASE + 32) 和 VX_CSR_MPM_BASE_H ~ (VX_CSR_MPM_BASE_H + 32) 之间

  • 找到统计性能的相关结构体并添加统计变量

    搜索代码可以知道统计性能相关的结构体都在 PerfStats 里
    按照 Exercise1_8b103_cf1a4.patch 的 modified: sim/simx/core.h 修改

  • 添加统计性能的代码

    由上面的分析可知, 每一个仿真步骤的最后都会运行 schedule 函数, 所以我们直接在 schedule 函数里面添加统计代码
    按照 Exercise1_8b103_cf1a4.patch 的 modified: sim/simx/core.cpp 修改

  • 完善 vx_dump_perf 函数

    仿照 VX_DCR_MPM_CLASS_CORE 和 VX_DCR_MPM_CLASS_MEM 分支, 添加 case VX_DCR_MPM_CLASS_3 分支, 用于获取每个核的
    total_issued_warps_per_core 和 total_active_threads_per_core 并计算每个核的warp_efficiency
    把每个核的数据汇总成 total_active_threads 和 total_issued_warps 再计算总的效率
    参考 Exercise1_8b103_cf1a4.patch 的 modified: runtime/stub/utils.cpp

  • 完善get_csr 函数, 使得可以通过 csrr 命令访问定义的变量

    按照 Exercise1_8b103_cf1a4.patch 的 modified: sim/simx/emulator.cpp 修改
    在Exercise1 中, 内核运行结束后会调用 vx_perf_dump 通过 csr_read 即 csrr 命令把所有的 CSRS 写入到内存中, 在main 函数的 close_device 时就可以
    调用 vx_mpm_query 通过 download 方法直接从内存读取对应的变量值了, 所以必须实现 get_csr 函数中获取 total_issued_warps_per_core 和 total_active_threads_per_core 的功能

调试技巧

如果 runtime/simx 目录下的代码有改动, 需要先进入 runtime/simx 执行 make clean 再 make 重新生成运行时的库
看日志文件可以使用 lnav, 看起来效率高一些

运行测试

1
2
3
4
5
mkdir build
cd build
../configure --xlen=32 --tooldir=$HOME/tools   # 构建 build 目录, 生成 makefile 等文件,把必要的文件拷贝到build目录
make -j32 -s   # 这个过程会比较久, 需要耐心等待
./ci/blackbox.sh --cores=4 --app=demo --driver=simx --perf=3 --args="-n128"
Licensed under CC BY-NC-SA 4.0