Arena allocator for cluster and L2

L2 allocator

unsafe impl Allocator for L2Allocator {
    fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
        let ptr = if layout.align() > L2_ALIGN {
            unsafe { pi_l2_malloc_align(
                layout.size().try_into().map_err(|_| AllocError)?, 
                layout.align()) 
            } as *mut u8
        } else {
            unsafe { pi_l2_malloc(
                layout.size().try_into().map_err(|_| AllocError)?) 
            } as *mut u8
        };

        NonNull::new(ptr)
            .map(|ptr| NonNull::slice_from_raw_parts(ptr, layout.size()))
            .ok_or(AllocError)
    }

    unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
        pi_l2_free(ptr.as_ptr() as *mut cty::c_void, layout.size() as i32);
    }
}
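
With the Allocator trait implemented, standard containers can live in L2 directly. A minimal usage sketch, assuming the nightly allocator_api feature and that L2Allocator is a unit struct (as its by-value use here implies):

let mut samples = Vec::with_capacity_in(1024, L2Allocator);
samples.resize(1024, 0u8);
// Dropping the Vec hands the memory back through deallocate(), i.e. pi_l2_free().
drop(samples);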

Cluster L1 allocator

unsafe impl<'a> Allocator for ClusterAllocator<'a> {
    fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
        let ptr = if layout.align() > CLUSTER_L1_ALIGN { 
            unsafe { pi_cl_l1_malloc_align(
                self.cluster, 
                layout.size().try_into().map_err(|_| AllocError)?, 
                layout.align()) 
            } as *mut u8
        } else {
            unsafe { pi_cl_l1_malloc(
                self.cluster, 
                layout.size().try_into().map_err(|_| AllocError)?) 
            } as *mut u8
        };

        NonNull::new(ptr)
            .map(|ptr| NonNull::slice_from_raw_parts(ptr, layout.size()))
            .ok_or(AllocError)
    }

    unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
        pi_cl_l1_free(
            self.cluster,
            ptr.as_ptr() as *mut cty::c_void,
            layout.size() as i32,
        );
    }
}

Safe allocator with lifetime constraints

// Allocated buffer
struct BufAlloc<'a, const BUF_LEN: usize> {
    buf: *mut u8,
    allocator: ClusterAllocator<'a>,
}

// Source of data transfer
struct SourcePtr<'a> {
    ptr: *mut u8,
    len: usize,
    _lifetime: PhantomData<&'a u8>,
}

impl<'alloc, const BUF_LEN: usize> BufAlloc<'alloc, BUF_LEN> {
    pub fn new<const CORES: usize>(cluster: &'alloc Cluster<CORES>) -> Self {
        let allocator = cluster.l1_allocator();
        // SAFETY: every bit pattern is a valid u8, and the buffer is overwritten by DMA before it is read
        let buf = unsafe { Box::leak(Box::new_uninit_slice_in(BUF_LEN * 3, allocator).assume_init()) };

        Self {
            buf: buf.as_mut_ptr(),
            allocator,
        }
    }
}
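
The matching clean-up is not shown in the excerpt above; one way the leaked slice could be handed back to the L1 allocator is a Drop implementation like this (a sketch only, recomputing the layout from BUF_LEN * 3):

impl<'alloc, const BUF_LEN: usize> Drop for BufAlloc<'alloc, BUF_LEN> {
    fn drop(&mut self) {
        // SAFETY: `buf` was allocated from `allocator` with exactly this layout in new().
        let layout = Layout::array::<u8>(BUF_LEN * 3).unwrap();
        unsafe {
            self.allocator
                .deallocate(NonNull::new_unchecked(self.buf), layout);
        }
    }
}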

DMA Transfer

The DMA unit transfers data between the L2 and cluster L1 memory areas. Eight channels can be programmed; transfers can be 1D or 2D on the L2 side but only 1D on the cluster L1 side.
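
For orientation, a single 1D copy from L2 into cluster L1 looks roughly like this at the PMSIS level (a sketch against the raw bindings; the exact Rust names for pi_cl_dma_cmd, pi_cl_dma_cmd_wait and the direction constant are assumptions, and the crate wraps this in DmaTransfer):

// l2_src, l1_dst and len are placeholders for the transfer endpoints.
let mut cmd = core::mem::MaybeUninit::<pi_cl_dma_cmd_t>::uninit();
unsafe {
    // Program the channel for an external-to-local (L2 -> cluster L1) copy.
    pi_cl_dma_cmd(
        l2_src as u32,
        l1_dst as u32,
        len as u32,
        PI_CL_DMA_DIR_EXT2LOC,
        cmd.as_mut_ptr(),
    );
    // The call only enqueues the copy; block until the channel reports completion.
    pi_cl_dma_cmd_wait(cmd.as_mut_ptr());
}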

Triple-buffered DMA

The allocated buffer rotates through three roles (a sketch of the index bookkeeping follows the list):

  • Work buffer (counter = 0): Currently being processed by cores
  • Pre-fetch buffer (counter = 1): Being loaded with next data chunk
  • Commit buffer (counter = 2): Being written back to external memory (and low-level cache)
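
The rotation itself needs no copying: each role simply moves to the next BUF_LEN-sized slot inside the single BUF_LEN * 3 allocation. A minimal sketch of that bookkeeping (an assumption about how counters is used, not the crate's exact code):

// counters[slot] holds the role of that slot: 0 = work, 1 = pre-fetch, 2 = commit.
fn rotate_roles(counters: &mut [usize; 3]) {
    for c in counters.iter_mut() {
        // pre-fetch -> work, work -> commit, commit -> pre-fetch
        *c = (*c + 2) % 3;
    }
}

// Byte offset of the slot currently holding `role` inside BufAlloc::buf.
fn slot_offset(counters: &[usize; 3], role: usize, buf_len: usize) -> usize {
    counters.iter().position(|&c| c == role).unwrap() * buf_len
}
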
flowchart LR
    A[DmaTransfer] -->|"new_l2()"| B
    A -->|"new_ram()"| C
    D -->|"pi_cl_dma_cmd()"| F["PMSIS Cluster"]
    E -->|"pi_cl_read/write()"| G["PMSIS RAM"]

    subgraph B["L2 Backend"]
        direction TB
        D[piClDmaCmd<br>Cluster DMA]
    end

    subgraph C["RAM Backend"]
        direction TB
        E[PiClRamReq<br>uDMA]
    end

// The buffer sits on L2
struct DmaBuf<'alloc, 'buf, 'source, const CORES: usize, const BUF_LEN: usize> {
    // data source in external memory
    source: SourcePtr<'source>,
    // buffer in L1 cache
    l1_alloc: &'buf BufAlloc<'alloc, BUF_LEN>,
    // how many rounds have been completed so far
    rounds: usize,
    pre_fetch_dma: DmaTransfer,
    commit_dma: DmaTransfer,
    counters: [usize; 3],
    last_transfer: usize,
    work_buf_len: usize,
}
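
From the application's side, a processing loop drives this structure with the calls named in the sequence diagram below (method signatures are assumptions; process_chunk stands in for the per-chunk work):

let mut dma_buf = DmaBuf::new_from_l2(source, &l1_alloc);
for _ in 0..num_chunks {
    // Slot currently marked as the work buffer, already filled by the pre-fetch DMA.
    let work: &mut [u8] = dma_buf.get_work_buf();
    process_chunk(work);
    // Rotate roles: wait on the in-flight transfers, then start the next ones.
    dma_buf.advance();
}
// Wait for the final commit transfer to land in external memory.
dma_buf.flush();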

DMA Sequence Diagram

sequenceDiagram
    autonumber
    Application->>+DmaBuf: new_from_l2()
    DmaBuf->>+Core 0: Initial transfer
    participant Core 0
    participant Cores 1-N
    Core 0->>+uDMA Engine: transfer_in()

    External Memory->>uDMA Engine: Load first 2 buffers
    uDMA Engine-->>Core 0: Complete
    Core 0->>Cores 1-N: pi_cl_team_barrier()
    loop Triple-buffering
        Application ->> DmaBuf: get_work_buf()
        DmaBuf ->> Cores 1-N: Returns work buffer

        Application ->> DmaBuf: advance()
        DmaBuf ->> Core 0: Advance counter
        alt If rounds are left
            Core 0 ->> uDMA Engine: wait() on commit_dma
            Core 0 ->> uDMA Engine: wait() on prefetch_dma
        end

        Core 0 ->> Cores 1-N: pi_cl_team_barrier()

        Core 0 ->> uDMA Engine: transfer_out(commit_buffer)
        uDMA Engine ->> External Memory: Write back

        Core 0 ->> uDMA Engine: transfer_in(prefetch_buffer)
        External Memory ->> uDMA Engine: Load the next chunk

        Core 0 ->> Cores 1-N: pi_cl_team_barrier()
    end

    Application ->> DmaBuf: flush()
    Core 0 ->> uDMA Engine: wait() on final commit
    Core 0 ->> Cores 1-N: pi_cl_team_barrier()
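
The advance() step of the loop above, restated as code (a rough sketch; the helper names has_rounds_left, commit_buf and pre_fetch_buf are assumptions, and the PMSIS calls are shown without their unsafe wrappers):

pub fn advance(&mut self) {
    if pi_core_id() == 0 {
        self.rounds += 1;
        if self.has_rounds_left() {
            // Make sure last round's write-back and pre-fetch have finished.
            self.commit_dma.wait();
            self.pre_fetch_dma.wait();
        }
    }
    // Every core must agree on the rotated roles before touching the buffers.
    pi_cl_team_barrier();
    if pi_core_id() == 0 && self.has_rounds_left() {
        // Write the processed chunk back and start loading the next one.
        self.commit_dma.transfer_out(self.commit_buf());
        self.pre_fetch_dma.transfer_in(self.pre_fetch_buf());
    }
    pi_cl_team_barrier();
}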