Communication with host applications
Memory Interface
The C host API provides functions to transfer data between the host memory and any of the DPU memories (IRAM, WRAM or MRAM):
dpu_copy_from(struct dpu_set_t set, const char *symbol_name, uint32_t symbol_offset, void *dst, size_t length) to copy a buffer from a single DPU
dpu_broadcast_to(struct dpu_set_t set, const char *symbol_name, uint32_t symbol_offset, const void *src, size_t length, dpu_xfer_flags_t flags) to broadcast a buffer to a set of DPUs
dpu_push_xfer(struct dpu_set_t set, dpu_xfer_t xfer, const char *symbol_name, uint32_t symbol_offset, size_t length, dpu_xfer_flags_t flags) to push different buffers to a set of DPUs in one transfer.
- There are some alignment limitations when using these functions, depending on the target DPU memory:
IRAM address and length must be aligned on 8 bytes
WRAM address and length must be aligned on 4 bytes
MRAM address and length must be aligned on 8 bytes
The functions will return an error if these constraints are not respected.
The symbol_name argument is the name of a variable in the DPU code.
It can be either an MRAM variable (with the __mram or __mram_noinit attribute) or a WRAM variable (with the __host attribute).
Other variables are not visible to the host application. (Note: Before you use WRAM transfers, read the Data sharing section.)
Note:
The special MRAM variable DPU_MRAM_HEAP_POINTER (cf The DPU MRAM Heap Pointer) can be accessed by specifying DPU_MRAM_HEAP_POINTER_NAME (defined in dpu_types.h) as the symbol_name.
When the DPU set contains multiple DPUs:
dpu_broadcast_to will copy the same buffer to all DPUs in the set
dpu_copy_from will return DPU_ERR_INVALID_DPU_SET
dpu_push_xfer: see Section Rank Transfer Interface
As an illustration, let’s implement a trivial checksum function in the DPU. The host application fills in the MRAM with a buffer of arbitrary size:
The first 4 bytes in MRAM represent the buffer size N
The subsequent N bytes in MRAM contain the data for which the application requests a checksum computation
On the DPU side, the program uses a single tasklet to fetch N and compute the checksum of the supplied buffer. When
done, the result is posted back into the first four bytes of the MRAM.
Next is a very simple way of implementing the code on the DPU side, using a mix of MRAM variables and low level MRAM/WRAM access functions (in trivial_checksum_example.c):
#include <mram.h>
#include <stdbool.h>
#include <stdint.h>
#define CACHE_SIZE 256
#define BUFFER_SIZE (1 << 16)
__mram_noinit uint8_t buffer[BUFFER_SIZE];
__host uint32_t checksum;
/*
 * DPU entry point: sums every byte of the MRAM variable `buffer` into the
 * host-visible variable `checksum`, reading the data one CACHE_SIZE chunk
 * at a time through a local staging array.
 */
int main() {
    /* Staging array that each mram_read call below fills. */
    __dma_aligned uint8_t local_cache[CACHE_SIZE];
    unsigned int bytes_read = 0;

    checksum = 0;

    while (bytes_read < BUFFER_SIZE) {
        /* Fetch the next chunk of `buffer`, starting at the current offset. */
        mram_read(&buffer[bytes_read], local_cache, CACHE_SIZE);

        for (unsigned int byte_index = 0; byte_index < CACHE_SIZE; byte_index++) {
            /* Stop accumulating as soon as the whole buffer has been consumed. */
            if (bytes_read >= BUFFER_SIZE) {
                break;
            }
            checksum += (uint32_t)local_cache[byte_index];
            bytes_read++;
        }
    }

    /* Kept on its own line: the debugger script sets a breakpoint on this text. */
    return checksum;
}
The code is built to be executed by a single tasklet:
dpu-upmem-dpurte-clang trivial_checksum_example.c -o trivial_checksum_example
Such code can be tested with dpu-lldb, by loading a pre-defined MRAM image…
Such an image is a binary file forged by the developers. For example, to load an MRAM image called sample.bin and run the checksum computation on it:
file trivial_checksum_example
breakpoint set --source-pattern-regexp "return checksum;"
process launch --stop-at-entry
memory write -i sample.bin '&buffer[0]'
process continue
frame variable/x checksum
exit
As usual, printing the checksum variable lets you verify that the returned value is correct:
(uint32_t) checksum = 0x007f8000
A host application can trigger the checksum computation by filling the MRAM with the data, as illustrated here-after:
/* Communication with a DPU via the MRAM. */
/* Populate the MRAM with a collection of bytes and request the DPU to */
/* compute the checksum. */
#include <dpu.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#ifndef DPU_BINARY
#define DPU_BINARY "trivial_checksum_example"
#endif
/* Size of the buffer for which we compute the checksum: 64KBytes. */
#define BUFFER_SIZE (1 << 16)
/*
 * Build a BUFFER_SIZE-byte pattern of counting bytes (0, 1, ..., 255, 0, ...)
 * on the host and broadcast it to the DPU symbol "buffer" of the given set.
 */
void populate_mram(struct dpu_set_t set) {
    uint8_t host_data[BUFFER_SIZE];
    int position = 0;

    /* The cast keeps only the low 8 bits, so the pattern wraps every 256 bytes. */
    while (position < BUFFER_SIZE) {
        host_data[position] = (uint8_t)position;
        position++;
    }

    DPU_ASSERT(dpu_broadcast_to(set, "buffer", 0, host_data, BUFFER_SIZE, DPU_XFER_DEFAULT));
}
/*
 * Host entry point for the single-DPU example: allocate one DPU, load the
 * checksum program, fill its MRAM buffer, run it, then read back and print
 * the content of the DPU variable "checksum".
 */
int main() {
    struct dpu_set_t set, dpu;
    uint32_t result;

    DPU_ASSERT(dpu_alloc(1, NULL, &set));
    DPU_ASSERT(dpu_load(set, DPU_BINARY, NULL));

    populate_mram(set);

    DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));

    /* dpu_copy_from works on a single DPU, hence the iteration over the set. */
    DPU_FOREACH(set, dpu) {
        uint8_t *destination = (uint8_t *)&result;
        DPU_ASSERT(dpu_copy_from(dpu, "checksum", 0, destination, sizeof(result)));
        printf("Computed checksum = 0x%08x\n", result);
    }

    DPU_ASSERT(dpu_free(set));
    return 0;
}
#include <dpu>
#include <iostream>
#include <iomanip>
using namespace dpu;
/* Size of the buffer for which we compute the checksum: 64KBytes. */
static constexpr int32_t BUFFER_SIZE = 1 << 16;
/// Fills a BUFFER_SIZE-byte vector with counting bytes (0, 1, ..., 255, 0, ...)
/// and copies it to the DPU symbol "buffer".
void populate_mram(DpuSetOps & dpu) {
    std::vector<uint8_t> buffer(BUFFER_SIZE, 0);

    // The narrowing cast keeps the low 8 bits, so the pattern wraps every 256 bytes.
    for (unsigned position = 0; position < buffer.size(); ++position) {
        buffer[position] = static_cast<uint8_t>(position);
    }

    dpu.copy("buffer", buffer, static_cast<unsigned>(BUFFER_SIZE));
}
int main(int argc, char **argv) {
try {
auto system = DpuSet::allocate(1);
auto dpu = system.dpus()[0];
dpu->load("trivial_checksum_example");
populate_mram(*dpu);
dpu->exec();
std::vector<std::vector<uint32_t>> checksum(1);
checksum.front().resize(1);
dpu->copy(checksum, "checksum");
std::cout << "Computed checksum = 0x" << std::hex << std::setfill('0') << std::setw(8)
<< checksum.front().front() << std::endl;
} catch (const DpuError & e) {
std::cerr << e.what() << std::endl;
}
}
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import com.upmem.dpu.Dpu;
import com.upmem.dpu.DpuException;
import com.upmem.dpu.DpuSystem;
/**
 * Single-DPU host program: fills the DPU symbol "buffer" with counting bytes,
 * runs the checksum binary and prints the DPU variable "checksum".
 */
public class TrivialChecksumExample {
    /** Size of the buffer for which the checksum is computed: 64 KBytes. */
    private static final int BUFFER_SIZE = 1 << 16;

    /**
     * Copies BUFFER_SIZE counting bytes (0, 1, ..., 255, 0, ...) to the DPU
     * symbol "buffer".
     */
    private static void populate_mram(Dpu dpu) throws DpuException {
        final byte[] payload = new byte[BUFFER_SIZE];
        int index = 0;
        while (index < payload.length) {
            // The narrowing cast keeps the low 8 bits of the index.
            payload[index] = (byte) index;
            index++;
        }
        dpu.copy("buffer", payload);
    }

    public static void main(String[] args) throws DpuException {
        try (DpuSystem system = DpuSystem.allocate(1, "")) {
            final Dpu dpu = system.dpus().get(0);

            dpu.load("trivial_checksum_example");
            populate_mram(dpu);
            dpu.exec();

            final byte[] raw = new byte[4];
            dpu.copy(raw, "checksum");

            // The four raw bytes are decoded as a little-endian 32-bit integer.
            final int value = ByteBuffer.wrap(raw).order(ByteOrder.LITTLE_ENDIAN).getInt();
            System.out.println(String.format("Computed checksum = 0x%08x", value));
        }
    }
}
#!/usr/bin/env python3
from dpu import DpuSet
from dpu import ALLOCATE_ALL
# Size of the buffer for which the checksum is computed: 64 KBytes.
BUFFER_SIZE = 1 << 16


def populate_mram(dpus):
    """Copy BUFFER_SIZE counting bytes to the DPU symbol 'buffer'.

    The byte at index ``i`` holds the low 8 bits of ``i``, so the pattern is
    0, 1, ..., 255, 0, 1, ... The data is passed as a single ``bytearray``
    to ``dpus.copy``.

    :param dpus: object exposing ``copy(symbol_name, data)``.
    """
    # Masking with 0xFF gives the least-significant byte directly, without
    # building a temporary 4-byte object for every index.
    buffer = bytearray(i & 0xFF for i in range(BUFFER_SIZE))
    dpus.copy('buffer', buffer)
# Allocate one DPU with the checksum binary, fill its buffer, run it, then
# read back and print the DPU variable 'checksum'.
with DpuSet(1, binary = "trivial_checksum_example") as dpus:
    populate_mram(dpus)
    dpus.exec()

    # One 4-byte destination per DPU in the set.
    checksum = []
    for _ in dpus:
        checksum.append(bytearray(4))
    dpus.copy(checksum, 'checksum')

    # The four bytes are decoded as a little-endian integer.
    value = int.from_bytes(checksum[0], 'little')
    print("Computed checksum = 0x%08x" % value)
Note: In C++, Java and Python, a copy method is used for the data transfers between the host and the DPU, instead of the dpu_copy_from and dpu_broadcast_to functions used in C.
Compile the program, for example:
gcc -O3 --std=c99 -o trivial_checksum_example_host trivial_checksum_example_host.c -g `dpu-pkg-config --cflags --libs dpu`
g++ --std=c++11 trivial_checksum_example_host.cpp -o trivial_checksum_example_host_cpp `dpu-pkg-config --cflags --libs dpu` -g
javac -cp $(dpu-pkg-config --variable=java dpu) TrivialChecksumExample.java
N/A
The result printed by this program should be the checksum of 64 KBytes of counting bytes:
Computed checksum = 0x007f8000
Rank Transfer Interface
The previous functions do not provide the needed precision when trying to transfer different data from/to the DPUs while keeping the performance of transferring to a whole rank. To do so, one can use the following C functions:
dpu_prepare_xfer assigns a buffer to a set of DPUs, which will be used as input or output when dpu_push_xfer is called
dpu_push_xfer executes the current transfer with the given direction, DPU symbol name, and DPU symbol length, using the buffers defined with dpu_prepare_xfer. No transfer is done for a DPU with no defined buffer.
Here is an example doing the same computation as before, but using multiple DPUs:
/* Communication with a DPU via the MRAM. */
/* Populate the MRAM with a collection of bytes and request the DPUs to */
/* compute the checksums. */
#include <dpu.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef DPU_BINARY
#define DPU_BINARY "trivial_checksum_example"
#endif
/* Size of the buffer for which we compute the checksum: 64KBytes. */
#define BUFFER_SIZE (1 << 16)
/*
 * Prepare one BUFFER_SIZE-byte buffer per DPU and push them all to the DPU
 * symbol "buffer" in a single transfer.
 *
 * Each buffer holds counting bytes (0, 1, ..., 255, 0, ...); its first byte
 * is then incremented by the DPU index so that each DPU computes a different
 * checksum.
 *
 * set:     the DPUs to populate.
 * nr_dpus: number of DPUs in `set`; sizes the host-side staging area.
 *
 * Exits the process with EXIT_FAILURE if the staging area cannot be
 * allocated.
 */
void populate_mram(struct dpu_set_t set, uint32_t nr_dpus) {
    struct dpu_set_t dpu;
    uint32_t each_dpu;
    /* Computed in size_t so the product cannot wrap in 32-bit arithmetic. */
    const size_t total_size = (size_t)BUFFER_SIZE * nr_dpus;
    uint8_t *buffer = malloc(total_size);

    /* malloc(0) may legitimately return NULL, so only a non-empty request counts as a failure. */
    if (buffer == NULL && total_size != 0) {
        fprintf(stderr, "populate_mram: cannot allocate %zu bytes\n", total_size);
        exit(EXIT_FAILURE);
    }

    DPU_FOREACH(set, dpu, each_dpu) {
        uint8_t *dpu_buffer = &buffer[(size_t)each_dpu * BUFFER_SIZE];

        for (int byte_index = 0; byte_index < BUFFER_SIZE; byte_index++) {
            dpu_buffer[byte_index] = (uint8_t)byte_index;
        }
        dpu_buffer[0] += each_dpu; // each dpu will compute a different checksum

        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_buffer));
    }

    DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "buffer", 0, BUFFER_SIZE, DPU_XFER_DEFAULT));

    free(buffer);
}
/*
 * Retrieve the DPU variable "checksum" from every DPU of the set in one
 * transfer and print one line per DPU.
 *
 * set:     the DPUs to read from.
 * nr_dpus: number of DPUs in `set`; sizes the array of results.
 */
void print_checksums(struct dpu_set_t set, uint32_t nr_dpus) {
    struct dpu_set_t dpu;
    uint32_t each_dpu;
    uint32_t checksums[nr_dpus];

    /* Give each DPU its own destination slot for the upcoming transfer. */
    DPU_FOREACH(set, dpu, each_dpu) {
        uint32_t *slot = checksums + each_dpu;
        DPU_ASSERT(dpu_prepare_xfer(dpu, slot));
    }

    DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_FROM_DPU, "checksum", 0, sizeof(checksums[0]), DPU_XFER_DEFAULT));

    DPU_FOREACH(set, dpu, each_dpu) {
        const uint32_t value = checksums[each_dpu];
        printf("[%u] computed checksum = 0x%08x\n", each_dpu, value);
    }
}
/*
 * Host entry point for the multi-DPU example: allocate DPUs with
 * DPU_ALLOCATE_ALL, load the checksum program on all of them, give each DPU
 * its own input buffer, run the set and print every resulting checksum.
 */
int main() {
    /* Only the set itself is needed here; iteration happens in the helpers. */
    struct dpu_set_t set;
    uint32_t nr_dpus;

    DPU_ASSERT(dpu_alloc(DPU_ALLOCATE_ALL, NULL, &set));
    DPU_ASSERT(dpu_load(set, DPU_BINARY, NULL));
    DPU_ASSERT(dpu_get_nr_dpus(set, &nr_dpus));

    populate_mram(set, nr_dpus);
    DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
    print_checksums(set, nr_dpus);

    DPU_ASSERT(dpu_free(set));
    return 0;
}
#include <dpu>
#include <iostream>
#include <iomanip>
using namespace dpu;
/* Size of the buffer for which we compute the checksum: 64KBytes. */
static constexpr int32_t BUFFER_SIZE = 1 << 16;
/// Builds one BUFFER_SIZE-byte buffer per DPU, filled with counting bytes
/// (0, 1, ..., 255, 0, ...) and with the first byte incremented by the DPU
/// index, then copies the whole two-dimensional vector to the symbol "buffer".
void populate_mram(DpuSet & system) {
    std::vector<std::vector<uint8_t>> buffer(system.dpus().size(),
                                             std::vector<uint8_t>(BUFFER_SIZE));

    unsigned dpu_index = 0;
    for (auto & row : buffer) {
        int position = 0;
        for (auto & byte : row) {
            byte = static_cast<uint8_t>(position);
            ++position;
        }
        // Offsetting the first byte makes each DPU's data differ.
        row[0] += dpu_index;
        ++dpu_index;
    }

    system.copy("buffer", buffer);
}
/// Copies the DPU variable "checksum" from every DPU into a two-dimensional
/// vector (one single-word row per DPU) and prints one line per DPU.
void print_checksums(DpuSet & system) {
    std::vector<std::vector<uint32_t>> checksum(system.dpus().size(),
                                                std::vector<uint32_t>(1));
    system.copy(checksum, "checksum");

    for (unsigned i = 0; i < checksum.size(); ++i) {
        // The index is printed in decimal, the checksum as 8 zero-padded hex digits.
        std::cout << std::dec << std::setw(0) << '[' << i << "] computed checksum = 0x";
        std::cout << std::hex << std::setfill('0') << std::setw(8)
                  << checksum[i].front() << std::endl;
    }
}
int main(int argc, char **argv) {
try {
auto system = DpuSet::allocate(ALLOCATE_ALL);
system.load("trivial_checksum_example");
populate_mram(system);
system.exec();
print_checksums(system);
} catch (const DpuError & e) {
std::cerr << e.what() << std::endl;
}
return 0;
}
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import com.upmem.dpu.Dpu;
import com.upmem.dpu.DpuException;
import com.upmem.dpu.DpuSystem;
/**
 * Multi-DPU host program: gives every DPU its own input buffer, runs the
 * checksum binary on all of them and prints one checksum per DPU.
 */
public class TrivialChecksumExampleMultiRank {
    /** Size of the buffer for which the checksum is computed: 64 KBytes. */
    private static final int BUFFER_SIZE = 1 << 16;

    /**
     * Builds one BUFFER_SIZE-byte buffer per DPU, filled with counting bytes
     * and with the first byte incremented by the DPU index, then copies the
     * two-dimensional array to the DPU symbol "buffer".
     */
    private static void populate_mram(DpuSystem system) throws DpuException {
        final int nrDpus = system.dpus().size();
        final byte[][] buffers = new byte[nrDpus][BUFFER_SIZE];

        for (int dpu = 0; dpu < nrDpus; ++dpu) {
            final byte[] row = buffers[dpu];
            for (int offset = 0; offset < row.length; ++offset) {
                // The narrowing cast keeps the low 8 bits of the offset.
                row[offset] = (byte) offset;
            }
            // Offsetting the first byte makes each DPU's data differ.
            row[0] += dpu;
        }

        system.copy("buffer", buffers);
    }

    /**
     * Copies the DPU variable "checksum" from every DPU (4 bytes each) and
     * prints one line per DPU.
     */
    private static void print_checksums(DpuSystem system) throws DpuException {
        final byte[][] checksum = new byte[system.dpus().size()][4];
        system.copy(checksum, "checksum");

        int dpu = 0;
        for (byte[] raw : checksum) {
            // The four raw bytes are decoded as a little-endian 32-bit integer.
            final int value = ByteBuffer.wrap(raw).order(ByteOrder.LITTLE_ENDIAN).getInt();
            System.out.println(String.format("[%d] computed checksum = 0x%08x",
                    dpu, value));
            dpu++;
        }
    }

    public static void main(String[] args) throws DpuException {
        try (DpuSystem system = DpuSystem.allocate(DpuSystem.ALLOCATE_ALL, "")) {
            system.load("trivial_checksum_example");
            populate_mram(system);
            system.exec();
            print_checksums(system);
        }
    }
}
#!/usr/bin/env python3
from dpu import DpuSet
from dpu import ALLOCATE_ALL
# Size of the buffer for which the checksum is computed: 64 KBytes.
BUFFER_SIZE = 1 << 16


def populate_mram(dpus):
    """Copy one BUFFER_SIZE-byte buffer per DPU to the DPU symbol 'buffer'.

    Buffer ``k`` starts with the low 8 bits of ``k``; the byte at every
    index ``i >= 1`` holds the low 8 bits of ``i``. The buffers are passed
    to ``dpus.copy`` as a list of ``bytearray`` objects, one per DPU.

    :param dpus: object supporting ``len()`` and ``copy(symbol_name, data)``.
    """
    # Everything after the first byte is identical for all DPUs, so it is
    # built once instead of once per DPU.
    tail = bytes(i & 0xFF for i in range(1, BUFFER_SIZE))
    # bytearray + bytes yields a new bytearray, so each DPU gets its own object.
    buffers = [bytearray([k & 0xFF]) + tail for k in range(len(dpus))]
    dpus.copy('buffer', buffers)
# Allocate DPUs with ALLOCATE_ALL and the checksum binary, give each its own
# input buffer, run them, then read back and print every 'checksum' variable.
with DpuSet(ALLOCATE_ALL, binary = "trivial_checksum_example") as dpus:
    populate_mram(dpus)
    dpus.exec()

    # One 4-byte destination per DPU in the set.
    checksum = []
    for _ in dpus:
        checksum.append(bytearray(4))
    dpus.copy(checksum, 'checksum')

    for i in range(len(dpus)):
        # Each result is decoded as a little-endian integer.
        value = int.from_bytes(checksum[i], 'little')
        print("[%d] computed checksum = 0x%08x" % (i, value))
Note: In C++, Java and Python, the same copy method is used for the data transfers between the host and a rank of DPUs. However this method is an overload of the copy method used in the single DPU example, as it takes a two dimensional vector as input. The first dimension of the vector corresponds to each DPU in the rank.