Communication with host applications

Memory Interface

The C host API provides functions to transfer data between the host memory and any of the DPU memories (IRAM, WRAM or MRAM):

  • dpu_copy_from(struct dpu_set_t set, const char *symbol_name, uint32_t symbol_offset, void *dst, size_t length) to copy a buffer from a single DPU

  • dpu_broadcast_to(struct dpu_set_t set, const char *symbol_name, uint32_t symbol_offset, const void *src, size_t length, dpu_xfer_flags_t flags) to broadcast a buffer to a set of DPUs

  • dpu_push_xfer(struct dpu_set_t set, dpu_xfer_t xfer, const char *symbol_name, uint32_t symbol_offset, size_t length, dpu_xfer_flags_t flags) to push different buffers to a set of DPUs in one transfer.

There are some alignment limitations when using these functions, depending on the target DPU memory:
  • IRAM address and length must be aligned on 8 bytes

  • WRAM address and length must be aligned on 4 bytes

  • MRAM address and length must be aligned on 8 bytes

The functions will return an error if these constraints are not respected.

The symbol_name argument is the name of a variable in the DPU code. It can be either an MRAM variable (with the __mram or __mram_noinit attribute) or a WRAM variable (with the __host attribute). Other variables are not visible to the host application. (Note: Before you use WRAM transfers, read the Data sharing section.)

Note: The special MRAM variable DPU_MRAM_HEAP_POINTER (cf. The DPU MRAM Heap Pointer) can be accessed by specifying DPU_MRAM_HEAP_POINTER_NAME (defined in dpu_types.h) as the symbol_name.

When the DPU set contains multiple DPUs:

  • dpu_broadcast_to will copy the same buffer to all DPUs in the set

  • dpu_copy_from will return DPU_ERR_INVALID_DPU_SET

  • dpu_push_xfer: see Section Rank Transfer Interface

As an illustration, let’s implement a trivial checksum function in the DPU. The host application fills in the MRAM with a buffer of arbitrary size:

  • The first 4 bytes in MRAM represent the buffer size N

  • The subsequent N bytes in MRAM contain the data for which the application requests a checksum computation

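On the DPU side, the program uses a single tasklet to fetch N and compute the checksum of the supplied buffer. When done, the result is posted back into the first four bytes of the MRAM. (Note: the listing below simplifies this scheme: the buffer has a fixed size of BUFFER_SIZE bytes instead of a size header N, and the result is stored in the checksum variable, a WRAM variable declared with the __host attribute, rather than being written back to MRAM.)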

Next is a very simple way of implementing the code on the DPU side, using a mix of MRAM variables and low level MRAM/WRAM access functions (in trivial_checksum_example.c):

#include <mram.h>
#include <stdbool.h>
#include <stdint.h>

/* Size in bytes of the local cache filled by each mram_read call. */
#define CACHE_SIZE 256
/* Size in bytes of the input buffer to checksum: 64KBytes. */
#define BUFFER_SIZE (1 << 16)

/* Input data, declared as an MRAM variable so that the host can write it by symbol name. */
__mram_noinit uint8_t buffer[BUFFER_SIZE];
/* Checksum result; the __host attribute makes this WRAM variable visible to the host application. */
__host uint32_t checksum;

/*
 * DPU entry point, meant to be run by a single tasklet.
 *
 * Walks the MRAM variable `buffer` in chunks of CACHE_SIZE bytes: each chunk
 * is copied into a local cache with mram_read, then its bytes are added to
 * the `checksum` host variable. The final checksum is also returned.
 */
int main() {
  __dma_aligned uint8_t local_cache[CACHE_SIZE];
  unsigned int bytes_read = 0;

  checksum = 0;

  while (bytes_read < BUFFER_SIZE) {
    /* Fetch the next chunk of the MRAM buffer into the local cache. */
    mram_read(&buffer[bytes_read], local_cache, CACHE_SIZE);

    /* Accumulate the chunk, never going past the end of the buffer. */
    unsigned int byte_index = 0;
    while ((byte_index < CACHE_SIZE) && (bytes_read < BUFFER_SIZE)) {
      checksum += (uint32_t)local_cache[byte_index];
      byte_index++;
      bytes_read++;
    }
  }

  return checksum;
}

The code is built to be executed by a single tasklet:

dpu-upmem-dpurte-clang trivial_checksum_example.c -o trivial_checksum_example

Such code can be tested with dpu-lldb by loading a predefined MRAM image.

Such an image is a binary file crafted by the developer. For example, to load an MRAM image called sample.bin and run the checksum computation on it:

file trivial_checksum_example
breakpoint set --source-pattern-regexp "return checksum;"
process launch --stop-at-entry
memory write -i sample.bin '&buffer[0]'
process continue
frame variable/x checksum
exit

As usual, printing the checksum variable makes it possible to verify that the returned value is correct:

(uint32_t) checksum = 0x007f8000

A host application can trigger the checksum computation by filling the MRAM with the data, as illustrated below:

/* Communication with a DPU via the MRAM. */
/* Populate the MRAM with a collection of bytes and request the DPU to */
/* compute the checksum. */

#include <dpu.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#ifndef DPU_BINARY
#define DPU_BINARY "trivial_checksum_example"
#endif

/* Size of the buffer for which we compute the checksum: 64KBytes. */
#define BUFFER_SIZE (1 << 16)

/*
 * Fill a BUFFER_SIZE-byte host buffer with counting bytes (each byte holds
 * the low 8 bits of its own index) and broadcast it to the "buffer" symbol
 * of every DPU in `set`.
 */
void populate_mram(struct dpu_set_t set) {
  uint8_t buffer[BUFFER_SIZE];
  int byte_index = 0;

  while (byte_index < BUFFER_SIZE) {
    buffer[byte_index] = (uint8_t)byte_index;
    byte_index++;
  }

  DPU_ASSERT(dpu_broadcast_to(set, "buffer", 0, buffer, BUFFER_SIZE, DPU_XFER_DEFAULT));
}

/*
 * Host entry point: allocate one DPU, load the checksum program, send the
 * input data, launch the program synchronously, then read back and print
 * the checksum.
 */
int main() {
  struct dpu_set_t set;
  struct dpu_set_t dpu;
  uint32_t checksum;

  /* Set up one DPU with the program. */
  DPU_ASSERT(dpu_alloc(1, NULL, &set));
  DPU_ASSERT(dpu_load(set, DPU_BINARY, NULL));

  /* Provide the input data, then run the program. */
  populate_mram(set);
  DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));

  /* dpu_copy_from reads from a single DPU, so go through the set one DPU at a time. */
  DPU_FOREACH(set, dpu) {
    DPU_ASSERT(dpu_copy_from(dpu, "checksum", 0, &checksum, sizeof checksum));
    printf("Computed checksum = 0x%08x\n", checksum);
  }

  DPU_ASSERT(dpu_free(set));
  return 0;
}

Note: In C++, Java and Python, a copy method is used for the data transfers between the host and the DPU, instead of the dpu_copy_from and dpu_broadcast_to functions used in C.

Compile the program, for example:

gcc -O3 --std=c99 -o trivial_checksum_example_host trivial_checksum_example_host.c -g `dpu-pkg-config --cflags --libs dpu`

The result printed by this program should be the checksum of 64 KB of counting bytes:

Computed checksum = 0x007f8000

Rank Transfer Interface

The previous functions do not provide the granularity needed to transfer different data from/to each DPU while keeping the performance of a whole-rank transfer. To do so, one can use the following C functions:

  • dpu_prepare_xfer assigns a buffer to a set of DPUs, which will be used as input or output when dpu_push_xfer is called

  • dpu_push_xfer executes the current transfer with the given direction, DPU symbol name, and DPU symbol length, using the buffers defined with dpu_prepare_xfer. No transfer is done for a DPU with no defined buffer.

Here is an example doing the same computation as before, but using multiple DPUs:

/* Communication with a DPU via the MRAM. */
/* Populate the MRAM with a collection of bytes and request the DPUs to */
/* compute the checksums. */

#include <dpu.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef DPU_BINARY
#define DPU_BINARY "trivial_checksum_example"
#endif

/* Size of the buffer for which we compute the checksum: 64KBytes. */
#define BUFFER_SIZE (1 << 16)

/*
 * Prepare one BUFFER_SIZE-byte input buffer per DPU and push all of them to
 * the "buffer" symbol of the DPUs in a single transfer.
 *
 * Each per-DPU buffer holds counting bytes (the low 8 bits of the byte
 * index); its first byte is then offset by the DPU index so that every DPU
 * computes a different checksum.
 *
 * set:     the DPUs to populate.
 * nr_dpus: number of DPUs in `set`; sizes the host-side staging area.
 *
 * Exits the program with EXIT_FAILURE if the staging area cannot be
 * allocated.
 */
void populate_mram(struct dpu_set_t set, uint32_t nr_dpus) {
  struct dpu_set_t dpu;
  uint32_t each_dpu;
  /* Compute sizes and offsets as size_t so they cannot wrap in 32-bit arithmetic. */
  size_t total_size = (size_t)BUFFER_SIZE * nr_dpus;
  uint8_t *buffer = malloc(total_size);

  /* malloc(0) may legitimately return NULL, so only a non-empty request can fail. */
  if (buffer == NULL && total_size != 0) {
    fprintf(stderr, "populate_mram: failed to allocate %zu bytes\n", total_size);
    exit(EXIT_FAILURE);
  }

  DPU_FOREACH(set, dpu, each_dpu) {
    uint8_t *dpu_buffer = &buffer[(size_t)each_dpu * BUFFER_SIZE];

    for (int byte_index = 0; byte_index < BUFFER_SIZE; byte_index++) {
      dpu_buffer[byte_index] = (uint8_t)byte_index;
    }
    dpu_buffer[0] += each_dpu; // each dpu will compute a different checksum
    DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_buffer));
  }
  DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "buffer", 0, BUFFER_SIZE, DPU_XFER_DEFAULT));

  /* The staging area is released once the transfer call has returned. */
  free(buffer);
}

/*
 * Read the "checksum" variable of every DPU in `set` with one transfer and
 * print one line per DPU.
 *
 * set:     the DPUs to read from.
 * nr_dpus: number of DPUs in `set`; sizes the array receiving the results.
 */
void print_checksums(struct dpu_set_t set, uint32_t nr_dpus) {
  uint32_t results[nr_dpus];
  struct dpu_set_t dpu;
  uint32_t dpu_index;

  /* Give each DPU its own slot of the results array as destination. */
  DPU_FOREACH(set, dpu, dpu_index) {
    uint32_t *slot = results + dpu_index;
    DPU_ASSERT(dpu_prepare_xfer(dpu, slot));
  }
  DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_FROM_DPU, "checksum", 0, sizeof results[0], DPU_XFER_DEFAULT));

  DPU_FOREACH(set, dpu, dpu_index) {
    printf("[%u] computed checksum = 0x%08x\n", dpu_index, results[dpu_index]);
  }
}

/*
 * Host entry point: allocate all available DPUs, load the checksum program
 * on them, send a distinct input buffer to each DPU, launch the program
 * synchronously, then print the checksum computed by each DPU.
 */
int main() {
  struct dpu_set_t set;
  uint32_t nr_dpus;

  DPU_ASSERT(dpu_alloc(DPU_ALLOCATE_ALL, NULL, &set));
  DPU_ASSERT(dpu_load(set, DPU_BINARY, NULL));
  /* The DPU count sizes the host-side buffers used by the helpers below. */
  DPU_ASSERT(dpu_get_nr_dpus(set, &nr_dpus));
  populate_mram(set, nr_dpus);

  DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
  print_checksums(set, nr_dpus);

  DPU_ASSERT(dpu_free(set));
  return 0;
}

Note: In C++, Java and Python, the same copy method is used for the data transfers between the host and a rank of DPUs. However, this method is an overload of the copy method used in the single DPU example, as it takes a two-dimensional vector as input. The first dimension of the vector corresponds to each DPU in the rank.