Performance Counters
To measure runtime performance, multiple performance monitoring counters (PMCs) are available.
There are on-core PMCs, which are configured via the DPU kernel program. These PMCs allow for the monitoring of clock cycles spent and instruction counts during execution. Additionally, there are off-core PMCs available for the host program, which can monitor bank interface activities related to MRAM. The bank interface is the layer that interconnects the DRAM with the DPU.
On DPU v1A hardware, there is a single counter on the DPU and no bank interface counters. On DPU v1B hardware, there are two counters on the DPU and two bank interface counters.
The following sections describe how to use these PMCs.
DPU PMC
The runtime environment provides functions to program the hardware performance counter defined by DPUs:
perfcounter_config: configures the performance counter to measure:
COUNT_CYCLES: count the elapsing clock cycles, to get an accurate execution time
COUNT_INSTRUCTIONS: count the elapsed instructions, to get an accurate workload estimation
COUNT_SAME: apply the same counter as the last one used
COUNT_ENABLE_BOTH: count both clock cycles and number of instructions executed (v1B only)
COUNT_DISABLE_BOTH: disable both counters (v1B only)
perfcounter_get: returns the current counter value
perfcounter_get_both: returns the values of both counters (v1B only)
The main difference between counting cycles and instructions is that cycles include the execution time of instructions AND the memory transfers.
Please note that when using the UPMEM simulator, the performance counter only provides a reliable number of instructions. One should not rely on the provided number of cycles.
perfcounter_config may reset the counter (if the second parameter is true) or keep the current counter value, in
which case this initial value is returned by the function.
In other words, one may reset the counter and count cycles or instructions when reaching another point of time:
(void) perfcounter_config(COUNT_CYCLES, true);
...
perfcounter_t run_time = perfcounter_get();
Or choose to checkpoint two parts of the code and compute the delta between the two:
perfcounter_t initial_time = perfcounter_config(COUNT_CYCLES, false);
...
perfcounter_t duration = perfcounter_get() - initial_time;
However, whatever your choice is, you have to carefully manage counter overflows, since the hardware counter is 36-bit wide. Also, remember that the counter precision is 16 cycles (or instructions).
The DPU cycle count can be converted to time (seconds) using the variable CLOCKS_PER_SEC.
The variable is available on the DPU side, and can also be retrieved on the host side through a copy.
Below is a simple code example where the host measures the execution time of a dummy DPU program running a loop.
#include <perfcounter.h>
#include <stdio.h>
__host uint32_t nb_cycles;
/*
 * Dummy DPU program: counts the clock cycles spent in a busy loop and stores
 * the result in the __host variable nb_cycles, which the host program copies
 * back after the launch.
 */
int main() {
    /* Reset the counter and start counting clock cycles. */
    (void)perfcounter_config(COUNT_CYCLES, true);

    /* volatile so that the compiler cannot optimize the busy loop away,
     * which would leave nothing to measure. */
    volatile int loop = 1e7;
    while (loop)
        loop--;

    /* nb_cycles is 32 bits wide: only the low 32 bits of the counter value
     * are kept. */
    nb_cycles = (uint32_t)perfcounter_get();
    return 0;
}
#include <dpu.h>
#include <stdio.h>
#include <time.h>
#ifndef DPU_BINARY
#define DPU_BINARY "./frequency_example"
#endif
/* Returns the current CLOCK_MONOTONIC_RAW reading expressed in seconds. */
static inline double my_clock(void) {
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC_RAW, &now);

    /* Nanosecond part scaled to seconds, then added to the whole seconds. */
    const double fraction = 1.0e-9 * now.tv_nsec;
    return (fraction + now.tv_sec);
}
int main() {
struct dpu_set_t set, dpu;
DPU_ASSERT(dpu_alloc(1, NULL, &set));
printf("DPU allocated\n");
DPU_ASSERT(dpu_load(set, DPU_BINARY, NULL));
double start = my_clock();
DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
double end = my_clock();
// retrieve number of cycles on DPU
uint32_t nb_cycles;
DPU_FOREACH(set, dpu) {
DPU_ASSERT(
dpu_copy_from(dpu, "nb_cycles", 0, &nb_cycles, sizeof(uint32_t)));
}
// retrieve DPU frequency
uint32_t clocks_per_sec;
DPU_FOREACH(set, dpu) {
DPU_ASSERT(dpu_copy_from(dpu, "CLOCKS_PER_SEC", 0, &clocks_per_sec,
sizeof(uint32_t)));
}
printf("DPU cycles: %u\n", nb_cycles);
printf("DPU time: %.2e secs.\n", (double)nb_cycles / clocks_per_sec);
printf("Host elapsed time: %.2e secs.\n", end - start);
DPU_ASSERT(dpu_free(set));
return 0;
}
#include <chrono>
#include <dpu>
#include <iostream>
#ifndef DPU_BINARY
#define DPU_BINARY "./frequency_example"
#endif
using namespace dpu;
/// Runs DPU_BINARY on one DPU and prints the DPU cycle count, the matching
/// DPU time in seconds and the host-side elapsed time of the execution.
/// A DpuError is reported on std::cerr; the program returns 0 either way.
int main() {
  try {
    auto dpuSet = DpuSet::allocate(1);
    std::cout << "DPU allocated" << std::endl;
    dpuSet.load(DPU_BINARY);

    // Only the execution itself is timed on the host side.
    const auto begin = std::chrono::steady_clock::now();
    dpuSet.exec();
    const auto end = std::chrono::steady_clock::now();

    // One single-element buffer for the cycle count.
    std::vector<std::vector<uint32_t>> cycleBuffers(
        1, std::vector<uint32_t>(1));
    dpuSet.copy(cycleBuffers, "nb_cycles");

    // One single-element buffer for the DPU frequency.
    std::vector<std::vector<uint32_t>> frequencyBuffers(
        1, std::vector<uint32_t>(1));
    dpuSet.copy(frequencyBuffers, "CLOCKS_PER_SEC");

    const uint32_t cycles = cycleBuffers[0][0];
    const uint32_t clocksPerSec = frequencyBuffers[0][0];
    const double dpuSeconds = static_cast<double>(cycles) / clocksPerSec;

    const auto hostNanos =
        std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin)
            .count();
    const double hostSeconds = hostNanos / 1.0e9;

    std::cout.precision(2);
    std::cout << std::scientific << "DPU cycles: " << cycles << std::endl
              << "DPU time: " << dpuSeconds << " secs." << std::endl;
    std::cout << "Host elapsed time: " << hostSeconds << " secs."
              << std::endl;
  } catch (const DpuError &e) {
    std::cerr << e.what() << std::endl;
  }
  return 0;
}
import com.upmem.dpu.Dpu;
import com.upmem.dpu.DpuException;
import com.upmem.dpu.DpuSystem;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
/**
 * Host program that runs the {@code frequency_example} binary on one DPU and
 * prints the DPU cycle count, the matching DPU time in seconds and the
 * host-side elapsed time of the execution.
 */
public class FrequencyExampleHost {
  /**
   * Decodes the first four bytes of {@code raw} as a little-endian unsigned
   * 32-bit value.
   *
   * <p>The value is widened to {@code long} so that counts of 2^31 and above
   * are not reported as negative numbers, which is what a plain
   * {@code getInt} would do.
   *
   * @param raw buffer holding at least four bytes copied from the DPU
   * @return the decoded value, in the range 0 to 2^32 - 1
   */
  private static long readUint32(byte[] raw) {
    return Integer.toUnsignedLong(
        ByteBuffer.wrap(raw).order(ByteOrder.LITTLE_ENDIAN).getInt(0));
  }

  /**
   * Entry point.
   *
   * @param args unused
   * @throws DpuException if a DPU operation fails
   */
  public static void main(String[] args) throws DpuException {
    try (DpuSystem dpu = DpuSystem.allocate(1, "")) {
      dpu.load("frequency_example");
      System.out.println("DPU allocated");

      // Only the execution itself is timed on the host side.
      long begin = System.nanoTime();
      dpu.exec();
      long end = System.nanoTime();

      byte[][] nbCyclesRaw = new byte[1][4];
      dpu.copy(nbCyclesRaw, "nb_cycles");
      long nbCycles = readUint32(nbCyclesRaw[0]);

      byte[][] clocksPerSecRaw = new byte[1][4];
      dpu.copy(clocksPerSecRaw, "CLOCKS_PER_SEC");
      long clocksPerSec = readUint32(clocksPerSecRaw[0]);

      System.out.println("DPU cycles: " + nbCycles);
      System.out.println(
          "DPU time: " +
          String.format("%.2e", (double) nbCycles / clocksPerSec) +
          " secs.");
      System.out.println("Host elapsed time: " +
                         String.format("%.2e", ((end - begin) / 1.0e9)) +
                         " secs.");
    }
  }
}
#!/usr/bin/env python3
import time
from dpu import DpuSet
from dpu import ALLOCATE_ALL

# Run the frequency_example binary on one DPU, then print the DPU cycle count,
# the matching DPU time in seconds and the host-side elapsed time.
with DpuSet(1, binary="frequency_example") as dpus:
    print("DPU allocated")

    # Only the execution itself is timed on the host side.
    start = time.monotonic()
    dpus.exec()
    end = time.monotonic()

    # One 4-byte buffer per DPU in the set for each symbol read back.
    nb_cycles_raw = [bytearray(4) for _ in dpus]
    dpus.copy(nb_cycles_raw, 'nb_cycles')
    clocks_per_sec_raw = [bytearray(4) for _ in dpus]
    dpus.copy(clocks_per_sec_raw, 'CLOCKS_PER_SEC')

    # Both values are decoded as little-endian unsigned integers.
    nb_cycles = int.from_bytes(nb_cycles_raw[0], 'little')
    clocks_per_sec = int.from_bytes(clocks_per_sec_raw[0], 'little')

    print("DPU cycles:", nb_cycles)
    print("DPU time:", "{:.2e}".format(float(nb_cycles) / clocks_per_sec), "secs.")
    print("Host elapsed time:", "{:.2e}".format(end - start), "secs.")
On DPU v1B hardware, it is possible to count both clock cycles and the number of executed instructions at the same time.
#include <perfcounter.h>
#include <stdio.h>
/*
 * Runs the same busy loop five times with both counters enabled and, after
 * each run, prints the cycle and instruction counts together with the
 * resulting CPI and IPC ratios.
 */
int main() {
    perfcounter_pair_t counters;
    /* volatile keeps the compiler from optimizing the busy loop out. */
    volatile int remaining;

    perfcounter_config(COUNT_ENABLE_BOTH, /* reset_counter = */ true);

    for (int round = 0; round < 5; round++) {
        remaining = 1e7;
        while (remaining) {
            remaining--;
        }

        /* Read both counters, asking for a reset before the next round. */
        counters = perfcounter_get_both(/* reset_counter = */ true);

        const double cpi = (double)counters.cycles / counters.instr;
        const double ipc = (double)counters.instr / counters.cycles;
        printf("cycles = %lu, instr = %lu => CPI=%f or IPC=%f\n",
               counters.cycles, counters.instr, cpi, ipc);
    }
    return 0;
}
BANK INTERFACE PMC (v1B only)
The host library provides several functions for manipulating bank interface performance monitoring counters, briefly outlined below. A code sample demonstrating their usage follows.
All functions take the specific DPU, to which the function will apply, as the first parameter.
To enable and configure these counters, the function dpu_bank_interface_pmc_enable is employed. The second parameter specifies the desired configuration.
The counters can be configured to either count two 32-bit values or one 64-bit value. Calling this function resets the counters.
The possible values for configuration are as follows:
BANK_INTERFACE_PMC_LDMA_INSTRUCTION: counts the bank ACTIVATE row commands issued by LDMA instructions executed by the DPU
BANK_INTERFACE_PMC_SDMA_INSTRUCTION: counts the bank ACTIVATE row commands issued by SDMA instructions executed by the DPU
BANK_INTERFACE_PMC_READ_64BIT_INSTRUCTION: counts the number of 64-bit values read by the DPU
BANK_INTERFACE_PMC_WRITE_64BIT_INSTRUCTION: counts the number of 64-bit values written by the DPU
BANK_INTERFACE_PMC_HOST_ACTIVATE_COMMAND: counts the number of ACTIVATE commands issued by the host
BANK_INTERFACE_PMC_HOST_REFRESH_COMMAND: counts the number of REFRESH commands issued by the host
BANK_INTERFACE_PMC_ROW_HAMMER_REFRESH_COMMAND: counts the RowHammer refresh protection commands issued by the bank interface
BANK_INTERFACE_PMC_CYCLE: counts clock cycles
To stop the counters, use the function dpu_bank_interface_pmc_stop_counters.
Use the function dpu_bank_interface_pmc_read_counters to retrieve the counter values. The second parameter is a pointer to where the result will be stored. The result is stored as a union of structs for easy access.
To disable and deactivate the bank interface PMC module, use the function dpu_bank_interface_pmc_disable. This helps avoid unnecessary energy consumption.
Code example (host side)
Below is a simple code example showing how to configure and enable bank interface PMCs.
#include <stdio.h>
#include <stdlib.h>
#include "dpu.h"
/*
 * Host-side example for the bank interface PMCs.
 *
 * Usage: <program> <path to dpu_bin> <pmc mode: 32 or 64>
 *
 * Allocates one DPU, enables the bank interface counters in the requested
 * mode, loads and runs the given DPU binary, then stops, reads, prints and
 * disables the counters.
 *
 * Exits with EXIT_FAILURE when the argument count is wrong or the mode is
 * neither 32 nor 64; returns 0 otherwise.
 */
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "USAGE: %s <string:path to dpu_bin> <int:pmc mode, either 32 or 64>\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* A non-numeric argument makes atoi() return 0, which falls into the
     * default case below and is rejected. */
    int pmc_mode = atoi(argv[2]);

    struct dpu_set_t set;
    struct dpu_set_t dpu;

    bank_interface_pmc_config_t pmc_configuration;
    switch (pmc_mode) {
        case 32: {
            /* Two 32-bit counters: one for LDMA, one for SDMA. */
            pmc_configuration.mode = BANK_INTERFACE_PMC_32BIT_MODE;
            pmc_configuration.counter_1 = BANK_INTERFACE_PMC_LDMA_INSTRUCTION;
            pmc_configuration.counter_2 = BANK_INTERFACE_PMC_SDMA_INSTRUCTION;
            break;
        }
        case 64: {
            /* One 64-bit counter: both fields select the same event. */
            pmc_configuration.mode = BANK_INTERFACE_PMC_64BIT_MODE;
            pmc_configuration.counter_1 = BANK_INTERFACE_PMC_LDMA_INSTRUCTION;
            pmc_configuration.counter_2 = BANK_INTERFACE_PMC_LDMA_INSTRUCTION;
            break;
        }
        default: {
            fprintf(stderr, "Invalid mode for bank interface PMC.\n");
            exit(EXIT_FAILURE);
        }
    }

    /* these are other possible configurations */
    /* NOTE(review): the prose listing of configuration values names the cycle
     * event BANK_INTERFACE_PMC_CYCLE, while the lines below use
     * BANK_INTERFACE_PMC_CYCLES; confirm the exact enumerator in dpu.h. */

    /* pmc_configuration.mode = BANK_INTERFACE_PMC_32BIT_MODE; */
    /* pmc_configuration.counter_1 = BANK_INTERFACE_PMC_READ_64BIT_INSTRUCTION; */
    /* pmc_configuration.counter_2 = BANK_INTERFACE_PMC_WRITE_64BIT_INSTRUCTION; */

    /* pmc_configuration.mode = BANK_INTERFACE_PMC_32BIT_MODE; */
    /* pmc_configuration.counter_1 = BANK_INTERFACE_PMC_HOST_ACTIVATE_COMMAND; */
    /* pmc_configuration.counter_2 = BANK_INTERFACE_PMC_HOST_REFRESH_COMMAND; */

    /* pmc_configuration.mode = BANK_INTERFACE_PMC_32BIT_MODE; */
    /* pmc_configuration.counter_1 = BANK_INTERFACE_PMC_CYCLES; */
    /* pmc_configuration.counter_2 = BANK_INTERFACE_PMC_NONE; */

    /* pmc_configuration.mode = BANK_INTERFACE_PMC_64BIT_MODE; */
    /* pmc_configuration.counter_1 = BANK_INTERFACE_PMC_CYCLES; */
    /* pmc_configuration.counter_2 = BANK_INTERFACE_PMC_CYCLES; */

    DPU_ASSERT(dpu_alloc(1, NULL, &set));

    /* Enable the counters before the program is loaded and launched. */
    /* printf("host: enable counter\n"); */
    DPU_FOREACH(set, dpu) {
        DPU_ASSERT(dpu_bank_interface_pmc_enable(dpu, pmc_configuration));
    }

    DPU_FOREACH(set, dpu) {
        /* printf("host: load\n"); */
        DPU_ASSERT(dpu_load(dpu, argv[1], NULL));
        /* printf("host: launch\n"); */
        DPU_ASSERT(dpu_launch(dpu, DPU_SYNCHRONOUS));
    }

    /* printf("host: Stop counters \n"); */
    DPU_FOREACH(set, dpu) {
        DPU_ASSERT(dpu_bank_interface_pmc_stop_counters(dpu));
    }

    /* Read the result and print it using the view that matches the mode. */
    /* printf("host: PMC results:\n"); */
    DPU_FOREACH(set, dpu) {
        bank_interface_pmc_result_t result_pmc;
        DPU_ASSERT(dpu_bank_interface_pmc_read_counters(dpu, &result_pmc));
        if (pmc_configuration.mode == BANK_INTERFACE_PMC_32BIT_MODE) {
            printf("counter_1 = 0x%x => %u\n", pmc_configuration.counter_1, result_pmc.two_32bits.counter_1);
            printf("counter_2 = 0x%x => %u\n", pmc_configuration.counter_2, result_pmc.two_32bits.counter_2);
        } else {
            printf("counter_1 = 0x%x => %lu\n", pmc_configuration.counter_1, result_pmc.one_64bits.counter_1);
        }
    }

    /* printf("host: disable counter Module \n"); */
    DPU_FOREACH(set, dpu) {
        DPU_ASSERT(dpu_bank_interface_pmc_disable(dpu));
    }

    DPU_ASSERT(dpu_free(set));
    return 0;
}
Code example (dpu side)
The next code fragment is a simple DPU kernel that performs read and write accesses from the DPU to its MRAM. The decompiled binary exhibits two LDMA instructions and three SDMA instructions.
#include <stdint.h>
#include <mram.h>
/* Two 64-bit variables declared with __mram; volatile so that every access
 * written in main() is kept by the compiler. */
volatile __mram uint64_t a;
volatile __mram uint64_t b;
/* Minimal kernel: writes a and b, then reads both back to update b. The
 * statements are kept exactly as they are because the disassembly shown
 * after this fragment is the compiled form of this code. */
int main() {
/* write to a */
a = 2;
/* write to b */
b = 1;
/* read a and b, then write the sum back to b */
b += a;
return 0;
}
80000058 <main>:
80000058: 00 00 00 00 63 60 00 00 move r0, 0
80000060: 00 00 20 00 eb 51 00 00 move.s d2, 2
80000068: 00 20 8e 00 ff 00 00 00 add r1, id8, 1256
80000070: 00 00 05 46 04 7c 00 00 sd r1, 0, d2
80000078: 02 00 00 00 04 70 00 00 sdma r1, r0, 0
80000080: 00 00 80 00 63 61 00 00 move r2, 8
80000088: 00 00 10 00 eb 52 00 00 move.s d4, 1
80000090: 00 00 09 46 04 7c 00 00 sd r1, 0, d4
80000098: 02 00 04 00 04 70 00 00 sdma r1, r2, 0
800000a0: 00 00 00 00 04 70 00 00 ldma r1, r0, 0
800000a8: 00 00 00 46 07 72 00 00 ld d4, r1, 0
800000b0: 00 00 04 00 04 70 00 00 ldma r1, r2, 0
800000b8: 00 00 00 46 07 73 00 00 ld d6, r1, 0
800000c0: 00 00 ca 00 9c 0e 00 00 add r5, r7, r5
800000c8: 00 00 c8 20 18 0e 00 00 addc r4, r6, r4
800000d0: 00 00 09 46 04 7c 00 00 sd r1, 0, d4
800000d8: 00 00 00 00 63 60 00 00 move r0, 0
800000e0: 02 00 04 00 04 70 00 00 sdma r1, r2, 0
800000e8: 00 00 00 00 5f 8c 00 00 jump r23
When running this kernel with the above host program in 32-bit mode, the expected output is:
counter_1 = 0x1 => 2
counter_2 = 0x2 => 3