Communication with host applications

Memory Interface

The C host API provides functions to transfer data between the host memory and any of the DPU memories (IRAM, WRAM or MRAM):

dpu_copy_from(struct dpu_set_t set, const char *symbol_name, uint32_t symbol_offset, void *dst, size_t length) to copy a buffer from a single DPU

dpu_broadcast_to(struct dpu_set_t set, const char *symbol_name, uint32_t symbol_offset, const void *src, size_t length, dpu_xfer_flags_t flags) to broadcast a buffer to a set of DPUs

dpu_push_xfer(struct dpu_set_t set, dpu_xfer_t xfer, const char *symbol_name, uint32_t symbol_offset, size_t length, dpu_xfer_flags_t flags) to push different buffers to a set of DPUs in one transfer.

There are some alignment limitations when using these functions, depending on the target DPU memory:

IRAM address and length must be aligned on 8 bytes
WRAM address and length must be aligned on 4 bytes
MRAM address and length must be aligned on 8 bytes

The functions will return an error if these constraints are not respected.

The symbol_name argument is the name of a variable in the DPU code. It can be either an MRAM variable (with the __mram or __mram_noinit attribute) or a WRAM variable (with the __host attribute). Other variables are not visible to the host application. (Note: Before you use WRAM transfers, read the Data sharing section.)

Note: The special MRAM variable DPU_MRAM_HEAP_POINTER (cf The DPU MRAM Heap Pointer) can be accessed by specifying DPU_MRAM_HEAP_POINTER_NAME (defined in dpu_types.h) as the symbol_name.

When the DPU set contains multiple DPUs:

dpu_broadcast_to will copy the same buffer to all DPUs in the set

dpu_copy_from will return DPU_ERR_INVALID_DPU_SET

dpu_push_xfer: see Section Rank Transfer Interface

As an illustration, let’s implement a trivial checksum function in the DPU. The host application fills an MRAM buffer of fixed size (BUFFER_SIZE bytes, 64KBytes in this example):

The MRAM variable buffer contains the BUFFER_SIZE bytes of data for which the application requests a checksum computation

The WRAM variable checksum (declared with the __host attribute) receives the computed checksum

On the DPU side, the program uses a single tasklet to read the supplied buffer and compute its checksum. When done, the result is available in the checksum variable, which the host application reads back.

Next is a very simple way of implementing the code on the DPU side, using a mix of MRAM variables and low level MRAM/WRAM access functions (in trivial_checksum_example.c):

#include <mram.h>
#include <stdbool.h>
#include <stdint.h>

#define CACHE_SIZE 256
#define BUFFER_SIZE (1 << 16)

__mram_noinit uint8_t buffer[BUFFER_SIZE];
__host uint32_t checksum;

/*
 * DPU entry point, executed by a single tasklet.
 *
 * Streams the MRAM array `buffer` through `local_cache` in chunks of
 * CACHE_SIZE bytes and accumulates the sum of its BUFFER_SIZE bytes into the
 * host-visible variable `checksum`, which is also used as the return value.
 */
int main() {
  __dma_aligned uint8_t local_cache[CACHE_SIZE];
  unsigned int offset = 0;

  checksum = 0;

  while (offset < BUFFER_SIZE) {
    /* Bring the next CACHE_SIZE bytes of the MRAM buffer into the cache. */
    mram_read(&buffer[offset], local_cache, CACHE_SIZE);

    /* Never account for more bytes than the buffer still holds. */
    unsigned int left = BUFFER_SIZE - offset;
    unsigned int chunk = (left < CACHE_SIZE) ? left : CACHE_SIZE;

    for (unsigned int i = 0; i < chunk; i++) {
      checksum += (uint32_t)local_cache[i];
    }
    offset += chunk;
  }

  return checksum;
}

The code is built to be executed by a single tasklet:

dpu-upmem-dpurte-clang trivial_checksum_example.c -o trivial_checksum_example

Such code can be tested with dpu-lldb, by loading a pre-defined MRAM image…

Such an image is a binary file forged by the developers. For example, to load an MRAM image called sample.bin and run the checksum computation on it:

file trivial_checksum_example
breakpoint set --source-pattern-regexp "return checksum;"
process launch --stop-at-entry
memory write -i sample.bin '&buffer[0]'
process continue
frame variable/x checksum
exit

As usual, printing the checksum variable makes it possible to verify that the returned value is correct:

(uint32_t) checksum = 0x007f8000

A host application can trigger the checksum computation by filling the MRAM with the data, as illustrated below:

/* Communication with a DPU via the MRAM. */
/* Populate the MRAM with a collection of bytes and request the DPU to */
/* compute the checksum. */

#include <dpu.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#ifndef DPU_BINARY
#define DPU_BINARY "trivial_checksum_example"
#endif

/* Size of the buffer for which we compute the checksum: 64KBytes. */
#define BUFFER_SIZE (1 << 16)

/*
 * Fill a host-side buffer with counting bytes (each byte holds the low 8 bits
 * of its index) and broadcast it to the "buffer" symbol of the given DPU set.
 */
void populate_mram(struct dpu_set_t set) {
  uint8_t host_bytes[BUFFER_SIZE];
  int position = 0;

  while (position < BUFFER_SIZE) {
    host_bytes[position] = (uint8_t)position;
    position++;
  }

  DPU_ASSERT(dpu_broadcast_to(set, "buffer", 0, host_bytes, BUFFER_SIZE, DPU_XFER_DEFAULT));
}

/*
 * Host entry point: allocate one DPU, load the checksum program, fill its
 * MRAM buffer, run the program synchronously, then read back and print the
 * "checksum" variable.
 */
int main() {
  struct dpu_set_t set, dpu;
  uint32_t result;

  DPU_ASSERT(dpu_alloc(1, NULL, &set));
  DPU_ASSERT(dpu_load(set, DPU_BINARY, NULL));

  populate_mram(set);
  DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));

  /* dpu_copy_from works on a single DPU, so walk the set one DPU at a time. */
  DPU_FOREACH(set, dpu) {
    DPU_ASSERT(dpu_copy_from(dpu, "checksum", 0, &result, sizeof(result)));
    printf("Computed checksum = 0x%08x\n", result);
  }

  DPU_ASSERT(dpu_free(set));
  return 0;
}

#include <dpu>
#include <iostream>
#include <iomanip>

using namespace dpu;

/* Size of the buffer for which we compute the checksum: 64KBytes. */
static constexpr int32_t BUFFER_SIZE = 1 << 16;

/* Fill a host vector with counting bytes (low 8 bits of each index) and */
/* copy it to the "buffer" symbol of the given DPU. */
void populate_mram(DpuSetOps & dpu) {

  std::vector<uint8_t> buffer(BUFFER_SIZE);
  for (unsigned index = 0; index < buffer.size(); ++index) {
    buffer[index] = static_cast<uint8_t>(index);
  }

  dpu.copy("buffer", buffer, static_cast<unsigned>(BUFFER_SIZE));
}

int main(int argc, char **argv) {

  try {
    auto system = DpuSet::allocate(1);
    auto dpu = system.dpus()[0];
    dpu->load("trivial_checksum_example");
    populate_mram(*dpu);
    dpu->exec();
    
    std::vector<std::vector<uint32_t>> checksum(1);
    checksum.front().resize(1);
    dpu->copy(checksum, "checksum");
    std::cout << "Computed checksum = 0x" << std::hex << std::setfill('0') << std::setw(8) 
      << checksum.front().front() << std::endl;
  } catch (const DpuError & e) {
    std::cerr << e.what() << std::endl;
  }
}

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import com.upmem.dpu.Dpu;
import com.upmem.dpu.DpuException;
import com.upmem.dpu.DpuSystem;

/**
 * Host program for the single-DPU checksum example: fills the DPU symbol
 * "buffer" with counting bytes, runs the DPU program and prints the value
 * read back from the DPU symbol "checksum".
 */
public class TrivialChecksumExample {

  /** Size of the buffer for which the checksum is computed: 64KBytes. */
  private static final int BUFFER_SIZE = 1 << 16;

  /**
   * Copies BUFFER_SIZE counting bytes (low 8 bits of each index) to the
   * "buffer" symbol of the given DPU.
   */
  private static void populate_mram(Dpu dpu) throws DpuException {

    byte[] data = new byte[BUFFER_SIZE];
    int index = 0;
    while (index < data.length) {
      data[index] = (byte) index;
      index++;
    }
    dpu.copy("buffer", data);
  }

  public static void main(String[] args) throws DpuException {
    try (DpuSystem system = DpuSystem.allocate(1, "")) {
      Dpu dpu = system.dpus().get(0);

      dpu.load("trivial_checksum_example");
      populate_mram(dpu);
      dpu.exec();

      // The checksum comes back as 4 raw bytes, decoded here as little-endian.
      byte[] raw = new byte[4];
      dpu.copy(raw, "checksum");
      int value = ByteBuffer.wrap(raw).order(ByteOrder.LITTLE_ENDIAN).getInt();

      System.out.println(String.format("Computed checksum = 0x%08x", value));
    }
  }
}

#!/usr/bin/env python3

from dpu import DpuSet
from dpu import ALLOCATE_ALL

BUFFER_SIZE = 1 << 16

def populate_mram(dpus):
    """Copy BUFFER_SIZE counting bytes (index modulo 256) to the DPU symbol 'buffer'."""
    counting_bytes = bytearray(index & 0xFF for index in range(BUFFER_SIZE))
    dpus.copy('buffer', counting_bytes)


# Run the checksum binary on one DPU and print the value it computed.
with DpuSet(1, binary="trivial_checksum_example") as dpus:
    populate_mram(dpus)
    dpus.exec()

    # One 4-byte receive buffer per DPU in the set.
    results = []
    for _ in dpus:
        results.append(bytearray(4))
    dpus.copy(results, 'checksum')

    value = int.from_bytes(results[0], 'little')
    print("Computed checksum = 0x%08x" % value)

Note: In C++, Java and Python, a copy method is used for the data transfers between the host and the DPU, instead of the dpu_copy_from and dpu_broadcast_to functions used in C.

Compile the program, for example:

gcc -O3 --std=c99 -o trivial_checksum_example_host trivial_checksum_example_host.c -g `dpu-pkg-config --cflags --libs dpu`

g++ --std=c++11 trivial_checksum_example_host.cpp -o trivial_checksum_example_host_cpp `dpu-pkg-config --cflags --libs dpu` -g

javac -cp $(dpu-pkg-config --variable=java dpu) TrivialChecksumExample.java

The result printed by this program should be the checksum of 64 KB of counting bytes:

Computed checksum = 0x007f8000

Rank Transfer Interface

The previous functions do not provide the needed precision when trying to transfer different data from/to the DPUs while keeping the performance of transferring to a whole rank. To do so, one can use the following C functions:

dpu_prepare_xfer assigns a buffer to a set of DPUs, which will be used as input or output when dpu_push_xfer is called

dpu_push_xfer executes the current transfer with the given direction, DPU symbol name, and DPU symbol length, using the buffers defined with dpu_prepare_xfer. No transfer is done for a DPU with no defined buffer.

Here is an example doing the same computation as before, but using multiple DPUs:

/* Communication with a DPU via the MRAM. */
/* Populate the MRAM with a collection of bytes and request the DPUs to */
/* compute the checksums. */

#include <dpu.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef DPU_BINARY
#define DPU_BINARY "trivial_checksum_example"
#endif

/* Size of the buffer for which we compute the checksum: 64KBytes. */
#define BUFFER_SIZE (1 << 16)

/*
 * Give every DPU of the set its own BUFFER_SIZE-byte slice of counting bytes
 * and push all slices to the "buffer" symbol in a single transfer.
 *
 * The first byte of each slice is offset by the DPU index so that the DPUs
 * compute different checksums. The program exits with an error message if
 * the host buffer cannot be allocated.
 */
void populate_mram(struct dpu_set_t set, uint32_t nr_dpus) {
  struct dpu_set_t dpu;
  uint32_t each_dpu;
  /* Size arithmetic is done in size_t rather than in 32-bit unsigned. */
  uint8_t *buffer = malloc((size_t)BUFFER_SIZE * nr_dpus);

  /* malloc(0) may legitimately return NULL, so only fail for a real request. */
  if (buffer == NULL && nr_dpus != 0) {
    fprintf(stderr, "populate_mram: cannot allocate %u buffers of %d bytes\n", nr_dpus, BUFFER_SIZE);
    exit(EXIT_FAILURE);
  }

  DPU_FOREACH(set, dpu, each_dpu) {
    uint8_t *slice = &buffer[(size_t)each_dpu * BUFFER_SIZE];
    for (int byte_index = 0; byte_index < BUFFER_SIZE; byte_index++) {
      slice[byte_index] = (uint8_t)byte_index;
    }
    slice[0] += each_dpu; // each dpu will compute a different checksum
    DPU_ASSERT(dpu_prepare_xfer(dpu, slice));
  }
  DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "buffer", 0, BUFFER_SIZE, DPU_XFER_DEFAULT));
  free(buffer);
}

/*
 * Fetch the "checksum" variable of every DPU of the set in one transfer and
 * print one line per DPU.
 */
void print_checksums(struct dpu_set_t set, uint32_t nr_dpus) {
  struct dpu_set_t dpu;
  uint32_t dpu_index;
  uint32_t results[nr_dpus];

  /* Each DPU gets its own slot of the results array as destination. */
  DPU_FOREACH(set, dpu, dpu_index) {
    DPU_ASSERT(dpu_prepare_xfer(dpu, results + dpu_index));
  }
  DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_FROM_DPU, "checksum", 0, sizeof(results[0]), DPU_XFER_DEFAULT));

  DPU_FOREACH(set, dpu, dpu_index) {
    printf("[%u] computed checksum = 0x%08x\n", dpu_index, results[dpu_index]);
  }
}

/*
 * Host entry point: allocate a DPU set with DPU_ALLOCATE_ALL, load the
 * checksum program, send each DPU its own buffer, run the set synchronously
 * and print one checksum per DPU.
 */
int main() {
  struct dpu_set_t set;
  uint32_t nr_dpus;

  DPU_ASSERT(dpu_alloc(DPU_ALLOCATE_ALL, NULL, &set));
  DPU_ASSERT(dpu_load(set, DPU_BINARY, NULL));

  /* The number of allocated DPUs sizes the host-side buffers. */
  DPU_ASSERT(dpu_get_nr_dpus(set, &nr_dpus));
  populate_mram(set, nr_dpus);

  DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
  print_checksums(set, nr_dpus);

  DPU_ASSERT(dpu_free(set));
  return 0;
}

#include <dpu>
#include <iostream>
#include <iomanip>

using namespace dpu;

/* Size of the buffer for which we compute the checksum: 64KBytes. */
static constexpr int32_t BUFFER_SIZE = 1 << 16;

/* Build one BUFFER_SIZE-byte row of counting bytes per DPU, add the DPU */
/* index to the first byte of each row, and copy the rows to "buffer". */
void populate_mram(DpuSet & system) {

  const auto nr_dpus = system.dpus().size();

  std::vector<uint8_t> pattern(BUFFER_SIZE);
  for (int j = 0; j < BUFFER_SIZE; ++j) {
    pattern[j] = static_cast<uint8_t>(j);
  }

  std::vector<std::vector<uint8_t>> buffer(nr_dpus, pattern);
  for (unsigned i = 0; i < nr_dpus; ++i) {
    buffer[i][0] += i;
  }

  system.copy("buffer", buffer);
}

/* Read the "checksum" variable of every DPU into a one-word row per DPU */
/* and print each value as 8 hexadecimal digits. */
void print_checksums(DpuSet & system) {

  const auto nr_dpus = system.dpus().size();
  std::vector<std::vector<uint32_t>> checksum(nr_dpus, std::vector<uint32_t>(1));
  system.copy(checksum, "checksum");

  for (unsigned i = 0; i < checksum.size(); ++i) {
    std::cout << std::dec << std::setw(0)
      << '[' << i << "] computed checksum = 0x"
      << std::hex << std::setfill('0') << std::setw(8)
      << checksum[i].front() << std::endl;
  }
}

int main(int argc, char **argv) {

  try {
    auto system = DpuSet::allocate(ALLOCATE_ALL);
    system.load("trivial_checksum_example");
    populate_mram(system);
    system.exec();
    print_checksums(system);
  } catch (const DpuError & e) {
    std::cerr << e.what() << std::endl;
  }
  return 0;
}

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import com.upmem.dpu.Dpu;
import com.upmem.dpu.DpuException;
import com.upmem.dpu.DpuSystem;

/**
 * Host program for the multi-DPU checksum example: sends a distinct buffer
 * to every DPU of the system, runs the DPU program and prints one checksum
 * per DPU.
 */
public class TrivialChecksumExampleMultiRank {

  /** Size of the buffer for which each checksum is computed: 64KBytes. */
  private static final int BUFFER_SIZE = 1 << 16;

  /**
   * Builds one row of counting bytes per DPU, adds the DPU index to the first
   * byte of each row, and copies the rows to the "buffer" symbol.
   */
  private static void populate_mram(DpuSystem system) throws DpuException {

    final int nrDpus = system.dpus().size();
    byte[][] buffers = new byte[nrDpus][BUFFER_SIZE];

    for (int dpu = 0; dpu < nrDpus; ++dpu) {
      byte[] row = buffers[dpu];
      for (int j = 0; j < row.length; ++j) {
        row[j] = (byte) j;
      }
      row[0] += dpu;
    }
    system.copy("buffer", buffers);
  }

  /**
   * Reads the 4-byte "checksum" symbol of every DPU and prints each value,
   * decoded as a little-endian integer.
   */
  private static void print_checksums(DpuSystem system) throws DpuException {

    final int nrDpus = system.dpus().size();
    byte[][] checksum = new byte[nrDpus][4];
    system.copy(checksum, "checksum");

    int dpu = 0;
    for (byte[] raw : checksum) {
      int value = ByteBuffer.wrap(raw).order(ByteOrder.LITTLE_ENDIAN).getInt();
      System.out.println(String.format("[%d] computed checksum = 0x%08x",
            dpu, value));
      dpu++;
    }
  }

  public static void main(String[] args) throws DpuException {
    try (DpuSystem system = DpuSystem.allocate(DpuSystem.ALLOCATE_ALL, "")) {

      system.load("trivial_checksum_example");
      populate_mram(system);
      system.exec();
      print_checksums(system);
    }
  }
}

#!/usr/bin/env python3

from dpu import DpuSet
from dpu import ALLOCATE_ALL

BUFFER_SIZE = 1 << 16

def populate_mram(dpus):
    """Copy one buffer of counting bytes per DPU to the symbol 'buffer'.

    The first byte of the k-th buffer is k modulo 256, the remaining bytes
    hold their index modulo 256.
    """
    pattern = bytearray(index & 0xFF for index in range(BUFFER_SIZE))
    buffers = []
    for k in range(len(dpus)):
        per_dpu = bytearray(pattern)
        per_dpu[0] = k & 0xFF
        buffers.append(per_dpu)
    dpus.copy('buffer', buffers)


# Run the checksum binary on a set allocated with ALLOCATE_ALL and print
# the value computed by each DPU.
with DpuSet(ALLOCATE_ALL, binary="trivial_checksum_example") as dpus:
    populate_mram(dpus)
    dpus.exec()

    # One 4-byte receive buffer per DPU in the set.
    results = []
    for _ in dpus:
        results.append(bytearray(4))
    dpus.copy(results, 'checksum')

    for index in range(len(dpus)):
        value = int.from_bytes(results[index], 'little')
        print("[%d] computed checksum = 0x%08x" % (index, value))

Note: In C++, Java and Python, the same copy method is used for the data transfers between the host and a rank of DPUs. However, this method is an overload of the copy method used in the single DPU example, as it takes a two-dimensional vector as input. The first dimension of the vector corresponds to each DPU in the rank.