mirror of
https://github.com/goodtft/LCD-show.git
synced 2024-12-11 02:24:19 +01:00
761 lines
29 KiB
C++
761 lines
29 KiB
C++
#ifndef KERNEL_MODULE
|
|
#include <stdio.h> // fprintf, stderr
|
|
#include <stdlib.h> // exit
|
|
#include <memory.h> // memset, memcpy
|
|
#include <inttypes.h> // uint32_t
|
|
#include <syslog.h> // syslog
|
|
#include <sys/mman.h> // mmap, munmap, PROT_READ, PROT_WRITE
|
|
#endif
|
|
|
|
#include "config.h"
|
|
#include "dma.h"
|
|
#include "spi.h"
|
|
#include "gpu.h"
|
|
#include "util.h"
|
|
#include "mailbox.h"
|
|
|
|
#ifdef USE_DMA_TRANSFERS
|
|
|
|
#define BCM2835_PERI_BASE 0x3F000000
|
|
|
|
SharedMemory *dmaSourceMemory = 0;
|
|
volatile DMAChannelRegisterFile *dma0 = 0;
|
|
|
|
volatile DMAChannelRegisterFile *dmaTx = 0;
|
|
volatile DMAChannelRegisterFile *dmaRx = 0;
|
|
int dmaTxChannel = -1;
|
|
int dmaTxIrq = 0;
|
|
int dmaRxChannel = -1;
|
|
int dmaRxIrq = 0;
|
|
|
|
#define PAGE_SIZE 4096
|
|
|
|
struct GpuMemory
|
|
{
|
|
uint32_t allocationHandle;
|
|
void *virtualAddr;
|
|
uintptr_t busAddress;
|
|
uint32_t sizeBytes;
|
|
};
|
|
|
|
#define NUM_DMA_CBS 1024
|
|
GpuMemory dmaCb, dmaSourceBuffer, dmaConstantData;
|
|
|
|
volatile DMAControlBlock *dmaSendTail = 0;
|
|
volatile DMAControlBlock *dmaRecvTail = 0;
|
|
volatile DMAControlBlock *firstFreeCB = 0;
|
|
volatile uint8_t *dmaSourceEnd = 0;
|
|
|
|
volatile DMAControlBlock *GrabFreeCBs(int num)
|
|
{
|
|
volatile DMAControlBlock *firstCB = (volatile DMAControlBlock *)dmaCb.virtualAddr;
|
|
volatile DMAControlBlock *endCB = firstCB + NUM_DMA_CBS;
|
|
if ((uintptr_t)(firstFreeCB + num) >= (uintptr_t)dmaCb.virtualAddr + dmaCb.sizeBytes)
|
|
{
|
|
WaitForDMAFinished();
|
|
firstFreeCB = firstCB;
|
|
}
|
|
|
|
volatile DMAControlBlock *ret = firstFreeCB;
|
|
firstFreeCB += num;
|
|
return ret;
|
|
}
|
|
|
|
volatile uint8_t *GrabFreeDMASourceBytes(int bytes)
|
|
{
|
|
if ((uintptr_t)dmaSourceEnd + bytes >= (uintptr_t)dmaSourceBuffer.virtualAddr + dmaSourceBuffer.sizeBytes)
|
|
{
|
|
WaitForDMAFinished();
|
|
dmaSourceEnd = (volatile uint8_t *)dmaSourceBuffer.virtualAddr;
|
|
}
|
|
|
|
volatile uint8_t *ret = dmaSourceEnd;
|
|
dmaSourceEnd += bytes;
|
|
return ret;
|
|
}
|
|
|
|
static int AllocateDMAChannel(int *dmaChannel, int *irq)
|
|
{
|
|
// Snooping DMA, channels 3, 5 and 6 seen active.
|
|
// TODO: Actually reserve the DMA channel to the system using bcm_dma_chan_alloc() and bcm_dma_chan_free()?...
|
|
// Right now, use channels 1 and 4 which seem to be free.
|
|
// Note: The send channel could be a lite channel, but receive channel cannot, since receiving uses the IGNORE flag
|
|
// that lite DMA engines don't have.
|
|
#ifdef FREEPLAYTECH_WAVESHARE32B
|
|
// On FreePlayTech Zero, DMA channel 4 seen to be taken by SD HOST (peripheral mapping 13).
|
|
int freeChannels[] = { 5, 1 };
|
|
#else
|
|
int freeChannels[] = { 7, 1 };
|
|
#endif
|
|
#if defined(DMA_TX_CHANNEL)
|
|
freeChannels[0] = DMA_TX_CHANNEL;
|
|
#endif
|
|
#if defined(DMA_RX_CHANNEL)
|
|
freeChannels[1] = DMA_RX_CHANNEL;
|
|
#endif
|
|
if (freeChannels[0] == freeChannels[1]) FATAL_ERROR("DMA TX and RX channels cannot be the same channel!");
|
|
|
|
static int nextFreeChannel = 0;
|
|
if (nextFreeChannel >= sizeof(freeChannels) / sizeof(freeChannels[0])) FATAL_ERROR("No free DMA channels");
|
|
|
|
*dmaChannel = freeChannels[nextFreeChannel++];
|
|
LOG("Allocated DMA channel %d", *dmaChannel);
|
|
*irq = 0;
|
|
return 0;
|
|
}
|
|
|
|
void FreeDMAChannel(int channel)
|
|
{
|
|
volatile DMAChannelRegisterFile *dma = GetDMAChannel(channel);
|
|
dma->cb.ti = 0; // Clear the SPI TX & RX permaps for this DMA channel so that we don't think some other program is using these for SPI
|
|
}
|
|
|
|
// Message IDs for different mailbox GPU memory allocation messages
|
|
#define MEM_ALLOC_MESSAGE 0x3000c // This message is 3 u32s: numBytes, alignment and flags
|
|
#define MEM_FREE_MESSAGE 0x3000f // This message is 1 u32: handle
|
|
#define MEM_LOCK_MESSAGE 0x3000d // 1 u32: handle
|
|
#define MEM_UNLOCK_MESSAGE 0x3000e // 1 u32: handle
|
|
|
|
// Memory allocation flags
|
|
#define MEM_ALLOC_FLAG_DIRECT (1 << 2) // Allocate uncached memory that bypasses L1 and L2 cache on loads and stores
|
|
#define MEM_ALLOC_FLAG_COHERENT (1 << 3) // Non-allocating in L2 but coherent
|
|
|
|
#define BUS_TO_PHYS(x) ((x) & ~0xC0000000)
|
|
|
|
#define PHYS_TO_BUS(x) ((x) | 0xC0000000)
|
|
|
|
#define VIRT_TO_BUS(block, x) ((uintptr_t)(x) - (uintptr_t)((block).virtualAddr) + (block).busAddress)
|
|
|
|
uint64_t totalGpuMemoryUsed = 0;
|
|
|
|
// Allocates the given number of bytes in GPU side memory, and returns the virtual address and physical bus address of the allocated memory block.
|
|
// The virtual address holds an uncached view to the allocated memory, so writes and reads to that memory address bypass the L1 and L2 caches. Use
|
|
// this kind of memory to pass data blocks over to the DMA controller to process.
|
|
GpuMemory AllocateUncachedGpuMemory(uint32_t numBytes, const char *reason)
|
|
{
|
|
GpuMemory mem;
|
|
mem.sizeBytes = ALIGN_UP(numBytes, PAGE_SIZE);
|
|
uint32_t allocationFlags = MEM_ALLOC_FLAG_DIRECT | MEM_ALLOC_FLAG_COHERENT;
|
|
mem.allocationHandle = Mailbox(MEM_ALLOC_MESSAGE, /*size=*/mem.sizeBytes, /*alignment=*/PAGE_SIZE, /*flags=*/allocationFlags);
|
|
if (!mem.allocationHandle) FATAL_ERROR("Failed to allocate GPU memory! Try increasing gpu_mem allocation in /boot/config.txt. See https://www.raspberrypi.org/documentation/configuration/config-txt/memory.md");
|
|
mem.busAddress = Mailbox(MEM_LOCK_MESSAGE, mem.allocationHandle);
|
|
if (!mem.busAddress) FATAL_ERROR("Failed to lock GPU memory!");
|
|
mem.virtualAddr = mmap(0, mem.sizeBytes, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, BUS_TO_PHYS(mem.busAddress));
|
|
if (mem.virtualAddr == MAP_FAILED) FATAL_ERROR("Failed to mmap GPU memory!");
|
|
totalGpuMemoryUsed += mem.sizeBytes;
|
|
// printf("Allocated %u bytes of GPU memory for %s (bus address=%p). Total GPU memory used: %llu bytes\n", mem.sizeBytes, reason, (void*)mem.busAddress, totalGpuMemoryUsed);
|
|
return mem;
|
|
}
|
|
|
|
void FreeUncachedGpuMemory(GpuMemory mem)
|
|
{
|
|
totalGpuMemoryUsed -= mem.sizeBytes;
|
|
munmap(mem.virtualAddr, mem.sizeBytes);
|
|
Mailbox(MEM_UNLOCK_MESSAGE, mem.allocationHandle);
|
|
Mailbox(MEM_FREE_MESSAGE, mem.allocationHandle);
|
|
}
|
|
|
|
volatile DMAChannelRegisterFile *GetDMAChannel(int channelNumber)
|
|
{
|
|
if (channelNumber < 0 || channelNumber >= BCM2835_NUM_DMA_CHANNELS)
|
|
{
|
|
printf("Invalid DMA channel %d specified!\n", channelNumber);
|
|
FATAL_ERROR("Invalid DMA channel specified!");
|
|
}
|
|
return dma0 + channelNumber;
|
|
}
|
|
|
|
void DumpDMAPeripheralMap()
|
|
{
|
|
for(int i = 0; i < BCM2835_NUM_DMA_CHANNELS; ++i)
|
|
{
|
|
volatile DMAChannelRegisterFile *channel = GetDMAChannel(i);
|
|
printf("DMA channel %d has peripheral map %d (is lite channel: %d, currently active: %d, current control block: %p)\n", i, (channel->cb.ti & BCM2835_DMA_TI_PERMAP_MASK) >> BCM2835_DMA_TI_PERMAP_SHIFT, (channel->cb.debug & BCM2835_DMA_DEBUG_LITE) ? 1 : 0, (channel->cs & BCM2835_DMA_CS_ACTIVE) ? 1 : 0, channel->cbAddr);
|
|
}
|
|
}
|
|
|
|
// Verifies that no other program has stomped on the DMA channel that we are using.
|
|
void CheckDMAChannelNotStolen(int channelNumber, int expectedPeripheralMap)
|
|
{
|
|
volatile DMAChannelRegisterFile *channel = GetDMAChannel(channelNumber);
|
|
uint32_t peripheralMap = ((channel->cb.ti & BCM2835_DMA_TI_PERMAP_MASK) >> BCM2835_DMA_TI_PERMAP_SHIFT);
|
|
if (peripheralMap != expectedPeripheralMap && peripheralMap != 0)
|
|
{
|
|
DumpDMAPeripheralMap();
|
|
printf("DMA channel collision! DMA channel %d was expected to be assigned to our peripheral %d, but something else has assigned it to peripheral %d!\n", channelNumber, expectedPeripheralMap, peripheralMap);
|
|
FATAL_ERROR("System is likely unstable now, rebooting is advised.");
|
|
}
|
|
uint32_t cbAddr = channel->cbAddr;
|
|
if (cbAddr && (cbAddr < dmaCb.busAddress || cbAddr >= dmaCb.busAddress + dmaCb.sizeBytes))
|
|
{
|
|
DumpDMAPeripheralMap();
|
|
printf("DMA channel collision! Some other program has submitted a DMA task to our DMA channel %d! (DMA task at unknown control block address %p)\n", channelNumber, cbAddr);
|
|
FATAL_ERROR("System is likely unstable now, rebooting is advised.");
|
|
}
|
|
}
|
|
|
|
void CheckSPIDMAChannelsNotStolen()
|
|
{
|
|
CheckDMAChannelNotStolen(dmaTxChannel, BCM2835_DMA_TI_PERMAP_SPI_TX);
|
|
CheckDMAChannelNotStolen(dmaRxChannel, BCM2835_DMA_TI_PERMAP_SPI_RX);
|
|
}
|
|
|
|
void ResetDMAChannels()
|
|
{
|
|
dmaTx->cs = BCM2835_DMA_CS_RESET;
|
|
dmaTx->cb.debug = BCM2835_DMA_DEBUG_DMA_READ_ERROR | BCM2835_DMA_DEBUG_DMA_FIFO_ERROR | BCM2835_DMA_DEBUG_READ_LAST_NOT_SET_ERROR;
|
|
dmaRx->cs = BCM2835_DMA_CS_RESET;
|
|
dmaRx->cb.debug = BCM2835_DMA_DEBUG_DMA_READ_ERROR | BCM2835_DMA_DEBUG_DMA_FIFO_ERROR | BCM2835_DMA_DEBUG_READ_LAST_NOT_SET_ERROR;
|
|
}
|
|
|
|
int InitDMA()
|
|
{
|
|
#if defined(KERNEL_MODULE)
|
|
dma0 = (volatile DMAChannelRegisterFile*)ioremap(BCM2835_PERI_BASE+BCM2835_DMA0_OFFSET, BCM2835_NUM_DMA_CHANNELS*0x100);
|
|
#else
|
|
dma0 = (volatile DMAChannelRegisterFile*)((uintptr_t)bcm2835 + BCM2835_DMA0_OFFSET);
|
|
#endif
|
|
|
|
#ifdef KERNEL_MODULE_CLIENT
|
|
dmaTxChannel = spiTaskMemory->dmaTxChannel;
|
|
dmaRxChannel = spiTaskMemory->dmaRxChannel;
|
|
#else
|
|
int ret = AllocateDMAChannel(&dmaTxChannel, &dmaTxIrq);
|
|
if (ret != 0) FATAL_ERROR("Unable to allocate TX DMA channel!");
|
|
ret = AllocateDMAChannel(&dmaRxChannel, &dmaRxIrq);
|
|
if (ret != 0) FATAL_ERROR("Unable to allocate RX DMA channel!");
|
|
|
|
printf("Enabling DMA channels Tx:%d and Rx:%d\n", dmaTxChannel, dmaRxChannel);
|
|
volatile uint32_t *dmaEnableRegister = (volatile uint32_t *)((uintptr_t)dma0 + BCM2835_DMAENABLE_REGISTER_OFFSET);
|
|
|
|
// Enable the allocated DMA channels
|
|
*dmaEnableRegister |= (1 << dmaTxChannel);
|
|
*dmaEnableRegister |= (1 << dmaRxChannel);
|
|
#endif
|
|
|
|
#if !defined(KERNEL_MODULE)
|
|
dmaCb = AllocateUncachedGpuMemory(sizeof(DMAControlBlock) * NUM_DMA_CBS, "DMA control blocks");
|
|
memset(dmaCb.virtualAddr, 0, dmaCb.sizeBytes); // Some fields of the CBs (debug, reserved) are initialized to zero and assumed to stay so throughout app lifetime.
|
|
firstFreeCB = (volatile DMAControlBlock *)dmaCb.virtualAddr;
|
|
|
|
dmaSourceBuffer = AllocateUncachedGpuMemory(SHARED_MEMORY_SIZE*2, "DMA source data");
|
|
dmaSourceEnd = (volatile uint8_t *)dmaSourceBuffer.virtualAddr;
|
|
|
|
dmaConstantData = AllocateUncachedGpuMemory(2*sizeof(uint32_t), "DMA constant data");
|
|
uint32_t *constantData = (uint32_t *)dmaConstantData.virtualAddr;
|
|
constantData[0] = BCM2835_SPI0_CS_DMAEN; // constantData[0] is for disableTransferActive task
|
|
constantData[1] = BCM2835_DMA_CS_ACTIVE | BCM2835_DMA_CS_END; // constantData[1] is for startDMATxChannel task
|
|
#endif
|
|
|
|
LOG("DMA hardware register file is at ptr: %p, using DMA TX channel: %d and DMA RX channel: %d", dma0, dmaTxChannel, dmaRxChannel);
|
|
if (!dma0) FATAL_ERROR("Failed to map DMA!");
|
|
|
|
dmaTx = GetDMAChannel(dmaTxChannel);
|
|
dmaRx = GetDMAChannel(dmaRxChannel);
|
|
LOG("DMA hardware TX channel register file is at ptr: %p, DMA RX channel register file is at ptr: %p", dmaTx, dmaRx);
|
|
int dmaTxPeripheralMap = (dmaTx->cb.ti & BCM2835_DMA_TI_PERMAP_MASK) >> BCM2835_DMA_TI_PERMAP_SHIFT;
|
|
if (dmaTxPeripheralMap != 0 && dmaTxPeripheralMap != BCM2835_DMA_TI_PERMAP_SPI_TX)
|
|
{
|
|
DumpDMAPeripheralMap();
|
|
LOG("DMA TX channel %d was assigned another peripheral map %d!", dmaTxChannel, dmaTxPeripheralMap);
|
|
FATAL_ERROR("DMA TX channel was assigned another peripheral map!");
|
|
}
|
|
if (dmaTx->cbAddr != 0 && (dmaTx->cs & BCM2835_DMA_CS_ACTIVE))
|
|
FATAL_ERROR("DMA TX channel was in use!");
|
|
|
|
int dmaRxPeripheralMap = (dmaRx->cb.ti & BCM2835_DMA_TI_PERMAP_MASK) >> BCM2835_DMA_TI_PERMAP_SHIFT;
|
|
if (dmaRxPeripheralMap != 0 && dmaRxPeripheralMap != BCM2835_DMA_TI_PERMAP_SPI_RX)
|
|
{
|
|
LOG("DMA RX channel %d was assigned another peripheral map %d!", dmaRxChannel, dmaRxPeripheralMap);
|
|
DumpDMAPeripheralMap();
|
|
FATAL_ERROR("DMA RX channel was assigned another peripheral map!");
|
|
}
|
|
if (dmaRx->cbAddr != 0 && (dmaRx->cs & BCM2835_DMA_CS_ACTIVE))
|
|
FATAL_ERROR("DMA RX channel was in use!");
|
|
|
|
if ((dmaRx->cb.debug & BCM2835_DMA_DEBUG_LITE) != 0)
|
|
FATAL_ERROR("DMA RX channel cannot be a lite channel, because to get best performance we want to use BCM2835_DMA_TI_DEST_IGNORE DMA operation mode that lite DMA channels do not have. (Try using DMA RX channel value < 7)");
|
|
|
|
LOG("Resetting DMA channels for use");
|
|
ResetDMAChannels();
|
|
|
|
// TODO: Set up IRQ
|
|
LOG("DMA all set up");
|
|
return 0;
|
|
}
|
|
|
|
// Debugging functions to introspect SPI and DMA hardware registers:
|
|
|
|
void DumpCS(uint32_t reg)
|
|
{
|
|
PRINT_FLAG(BCM2835_DMA_CS_RESET);
|
|
PRINT_FLAG(BCM2835_DMA_CS_ABORT);
|
|
PRINT_FLAG(BCM2835_DMA_CS_DISDEBUG);
|
|
PRINT_FLAG(BCM2835_DMA_CS_WAIT_FOR_OUTSTANDING_WRITES);
|
|
PRINT_FLAG(BCM2835_DMA_CS_PANIC_PRIORITY);
|
|
PRINT_FLAG(BCM2835_DMA_CS_PRIORITY);
|
|
PRINT_FLAG(BCM2835_DMA_CS_ERROR);
|
|
PRINT_FLAG(BCM2835_DMA_CS_WAITING_FOR_OUTSTANDING_WRITES);
|
|
PRINT_FLAG(BCM2835_DMA_CS_DREQ_STOPS_DMA);
|
|
PRINT_FLAG(BCM2835_DMA_CS_PAUSED);
|
|
PRINT_FLAG(BCM2835_DMA_CS_DREQ);
|
|
PRINT_FLAG(BCM2835_DMA_CS_INT);
|
|
PRINT_FLAG(BCM2835_DMA_CS_END);
|
|
PRINT_FLAG(BCM2835_DMA_CS_ACTIVE);
|
|
}
|
|
|
|
void DumpDebug(uint32_t reg)
|
|
{
|
|
PRINT_FLAG(BCM2835_DMA_DEBUG_LITE);
|
|
PRINT_FLAG(BCM2835_DMA_DEBUG_VERSION);
|
|
PRINT_FLAG(BCM2835_DMA_DEBUG_DMA_STATE);
|
|
PRINT_FLAG(BCM2835_DMA_DEBUG_DMA_ID);
|
|
PRINT_FLAG(BCM2835_DMA_DEBUG_DMA_OUTSTANDING_WRITES);
|
|
PRINT_FLAG(BCM2835_DMA_DEBUG_DMA_READ_ERROR);
|
|
PRINT_FLAG(BCM2835_DMA_DEBUG_DMA_FIFO_ERROR);
|
|
PRINT_FLAG(BCM2835_DMA_DEBUG_READ_LAST_NOT_SET_ERROR);
|
|
}
|
|
|
|
void DumpTI(uint32_t reg)
|
|
{
|
|
PRINT_FLAG(BCM2835_DMA_TI_NO_WIDE_BURSTS);
|
|
PRINT_FLAG(BCM2835_DMA_TI_WAITS);
|
|
#define BCM2835_DMA_TI_PERMAP_MASK_SHIFT 16
|
|
PRINT_FLAG(BCM2835_DMA_TI_PERMAP_MASK);
|
|
// PRINT_FLAG(BCM2835_DMA_TI_BURST_LENGTH);
|
|
PRINT_FLAG(BCM2835_DMA_TI_SRC_IGNORE);
|
|
PRINT_FLAG(BCM2835_DMA_TI_SRC_DREQ);
|
|
PRINT_FLAG(BCM2835_DMA_TI_SRC_WIDTH);
|
|
PRINT_FLAG(BCM2835_DMA_TI_SRC_INC);
|
|
PRINT_FLAG(BCM2835_DMA_TI_DEST_IGNORE);
|
|
PRINT_FLAG(BCM2835_DMA_TI_DEST_DREQ);
|
|
PRINT_FLAG(BCM2835_DMA_TI_DEST_WIDTH);
|
|
PRINT_FLAG(BCM2835_DMA_TI_DEST_INC);
|
|
PRINT_FLAG(BCM2835_DMA_TI_WAIT_RESP);
|
|
PRINT_FLAG(BCM2835_DMA_TI_TDMODE);
|
|
PRINT_FLAG(BCM2835_DMA_TI_INTEN);
|
|
}
|
|
|
|
#define DMA_DMA0_CB_PHYS_ADDRESS 0x7E007000
|
|
|
|
#define DMA_SPI_CS_PHYS_ADDRESS 0x7E204000
|
|
#define DMA_SPI_FIFO_PHYS_ADDRESS 0x7E204004
|
|
#define DMA_SPI_DLEN_PHYS_ADDRESS 0x7E20400C
|
|
#define DMA_GPIO_SET_PHYS_ADDRESS 0x7E20001C
|
|
#define DMA_GPIO_CLEAR_PHYS_ADDRESS 0x7E200028
|
|
|
|
void DumpDMAState()
|
|
{
|
|
printf("---SPI:---\n");
|
|
DumpSPICS(spi->cs);
|
|
printf("---DMATX CS:---\n");
|
|
DumpCS(dmaTx->cs);
|
|
printf("---DMATX TI:---\n");
|
|
DumpTI(dmaTx->cb.ti);
|
|
printf("---DMATX DEBUG:---\n");
|
|
DumpDebug(dmaTx->cb.debug);
|
|
printf("****** DMATX cbAddr: %p\n", dmaTx->cbAddr);
|
|
|
|
printf("---DMARX CS:---\n");
|
|
DumpCS(dmaRx->cs);
|
|
printf("---DMARX TI:---\n");
|
|
DumpTI(dmaRx->cb.ti);
|
|
printf("---DMARX DEBUG:---\n");
|
|
DumpDebug(dmaRx->cb.debug);
|
|
printf("****** DMARX cbAddr: %p\n", dmaRx->cbAddr);
|
|
}
|
|
|
|
extern volatile bool programRunning;
|
|
|
|
void WaitForDMAFinished()
|
|
{
|
|
int spins = 0;
|
|
uint64_t t0 = tick();
|
|
while((dmaTx->cs & BCM2835_DMA_CS_ACTIVE) && programRunning)
|
|
{
|
|
usleep(100);
|
|
if (tick() - t0 > 2000000)
|
|
{
|
|
printf("TX stalled\n");
|
|
DumpDMAState();
|
|
exit(1);
|
|
}
|
|
}
|
|
spins = 0;
|
|
t0 = tick();
|
|
while((dmaRx->cs & BCM2835_DMA_CS_ACTIVE) && programRunning)
|
|
{
|
|
usleep(100);
|
|
if (tick() - t0 > 2000000)
|
|
{
|
|
printf("RX stalled\n");
|
|
DumpDMAState();
|
|
exit(1);
|
|
}
|
|
}
|
|
dmaSendTail = 0;
|
|
dmaRecvTail = 0;
|
|
}
|
|
|
|
#ifdef ALL_TASKS_SHOULD_DMA
|
|
|
|
// This function does a memcpy from one source buffer to two destination buffers simultaneously.
|
|
// It saves a lot of time on ARMv6 by avoiding to have to do two separate memory copies, because the ARMv6 L1 cache is so tiny (4K) that it cannot fit a whole framebuffer
|
|
// in memory at a time. Streaming through it only once instead of twice helps memory bandwidth immensely, this is profiled to be ~4x faster than a pair of memcpys or a simple CPU loop.
|
|
// In addition, this does a little endian->big endian conversion when copying data out to dstDma.
|
|
static void memcpy_to_dma_and_prev_framebuffer(uint16_t *dstDma, uint16_t **dstPrevFramebuffer, uint16_t **srcFramebuffer, int numBytes, int *taskStartX, int width, int stride)
|
|
{
|
|
int strideEnd = stride - width*2;
|
|
int xLeft = width-*taskStartX;
|
|
|
|
uint16_t *Src = *srcFramebuffer;
|
|
uint16_t *Dst1 = *dstPrevFramebuffer;
|
|
|
|
// TODO: Do the loops in aligned order with unaligned head and tail separate, and ensure that dstDma, dstPrevFramebuffer and srcFramebuffer are in same alignment phase.
|
|
asm volatile(
|
|
"start_%=:\n"
|
|
"ldrd r0, r1, [%[srcFramebuffer]], #8\n"
|
|
"pld [%[srcFramebuffer], #248]\n"
|
|
"strd r0, r1, [%[dstPrevFramebuffer]], #8\n"
|
|
"rev16 r0, r0\n"
|
|
"rev16 r1, r1\n"
|
|
"strd r0, r1, [%[dstDma]], #8\n"
|
|
|
|
"ldrd r0, r1, [%[srcFramebuffer]], #8\n"
|
|
"strd r0, r1, [%[dstPrevFramebuffer]], #8\n"
|
|
"rev16 r0, r0\n"
|
|
"rev16 r1, r1\n"
|
|
"strd r0, r1, [%[dstDma]], #8\n"
|
|
|
|
"ldrd r0, r1, [%[srcFramebuffer]], #8\n"
|
|
"strd r0, r1, [%[dstPrevFramebuffer]], #8\n"
|
|
"rev16 r0, r0\n"
|
|
"rev16 r1, r1\n"
|
|
"strd r0, r1, [%[dstDma]], #8\n"
|
|
|
|
"ldrd r0, r1, [%[srcFramebuffer]], #8\n"
|
|
"strd r0, r1, [%[dstPrevFramebuffer]], #8\n"
|
|
"rev16 r0, r0\n"
|
|
"rev16 r1, r1\n"
|
|
"strd r0, r1, [%[dstDma]], #8\n"
|
|
|
|
"subs %[xLeft], %[xLeft], #16\n"
|
|
"addls %[xLeft], %[xLeft], %[width]\n"
|
|
"addls %[dstPrevFramebuffer], %[dstPrevFramebuffer], %[strideEnd]\n"
|
|
"addls %[srcFramebuffer], %[srcFramebuffer], %[strideEnd]\n"
|
|
|
|
"subs %[numBytes], %[numBytes], #32\n"
|
|
"bhi start_%=\n"
|
|
|
|
: [dstDma]"+r"(dstDma), [dstPrevFramebuffer]"+r"(Dst1), [srcFramebuffer]"+r"(Src), [xLeft]"+r"(xLeft), [numBytes]"+r"(numBytes)
|
|
: [strideEnd]"r"(strideEnd), [width]"r"(width)
|
|
: "r0", "r1", "memory", "cc"
|
|
);
|
|
*taskStartX = width - xLeft;
|
|
*srcFramebuffer = Src;
|
|
*dstPrevFramebuffer = Dst1;
|
|
}
|
|
|
|
static void memcpy_to_dma_and_prev_framebuffer_in_c(uint16_t *dstDma, uint16_t **dstPrevFramebuffer, uint16_t **srcFramebuffer, int numBytes, int *taskStartX, int width, int stride)
|
|
{
|
|
static bool performanceWarningPrinted = false;
|
|
if (!performanceWarningPrinted)
|
|
{
|
|
printf("Performance warning: using slow memcpy_to_dma_and_prev_framebuffer_in_c() function. Check conditions in display.h that enable OFFLOAD_PIXEL_COPY_TO_DMA_CPP and configure to use that instead.\n");
|
|
performanceWarningPrinted = true;
|
|
}
|
|
int numPixels = numBytes>>1;
|
|
int endStridePixels = (stride>>1) - width;
|
|
uint16_t *prevData = *dstPrevFramebuffer;
|
|
uint16_t *data = *srcFramebuffer;
|
|
for(int i = 0; i < numPixels; ++i)
|
|
{
|
|
*prevData++ = *data;
|
|
dstDma[i] = __builtin_bswap16(*data++);
|
|
if (++*taskStartX >= width)
|
|
{
|
|
*taskStartX = 0;
|
|
data += endStridePixels;
|
|
prevData += endStridePixels;
|
|
}
|
|
}
|
|
*srcFramebuffer = data;
|
|
*dstPrevFramebuffer = prevData;
|
|
}
|
|
|
|
#if defined(ALL_TASKS_SHOULD_DMA) && defined(SPI_3WIRE_PROTOCOL)
|
|
// Bug: there is something about the chained DMA transfer mechanism that makes write window coordinate set commands not go through properly
|
|
// on 3-wire displays, but do not yet know what. (Remove this #error statement to debug)
|
|
#error ALL_TASKS_SHOULD_DMA and SPI_3WIRE_PROTOCOL are currently not mutually compatible!
|
|
#endif
|
|
|
|
#if defined(OFFLOAD_PIXEL_COPY_TO_DMA_CPP) && defined(SPI_3WIRE_PROTOCOL)
|
|
// We would have to convert 8-bit tasks to 9-bit tasks immediately after offloaded memcpy has been done below to implement this.
|
|
#error OFFLOAD_PIXEL_COPY_TO_DMA_CPP and SPI_3WIRE_PROTOCOL are not mutually compatible!
|
|
#endif
|
|
|
|
void SPIDMATransfer(SPITask *task)
|
|
{
|
|
// There is a limit to how many bytes can be sent in one DMA-based SPI task, so if the task
|
|
// is larger than this, we'll split the send into multiple individual DMA SPI transfers
|
|
// and chain them together. This should be a multiple of 32 bytes to keep tasks cache aligned on ARMv6.
|
|
#define MAX_DMA_SPI_TASK_SIZE 65504
|
|
|
|
const int numDMASendTasks = (task->PayloadSize() + MAX_DMA_SPI_TASK_SIZE - 1) / MAX_DMA_SPI_TASK_SIZE;
|
|
|
|
volatile uint32_t *dmaData = (volatile uint32_t *)GrabFreeDMASourceBytes(4*(numDMASendTasks-1)+4*numDMASendTasks+task->PayloadSize());
|
|
volatile uint32_t *setDMATxAddressData = dmaData;
|
|
volatile uint32_t *txData = dmaData+numDMASendTasks-1;
|
|
|
|
volatile DMAControlBlock *cb = GrabFreeCBs(numDMASendTasks*5-3);
|
|
|
|
volatile DMAControlBlock *rxTail = 0;
|
|
volatile DMAControlBlock *tx0 = &cb[0];
|
|
volatile DMAControlBlock *rx0 = &cb[1];
|
|
|
|
#ifdef OFFLOAD_PIXEL_COPY_TO_DMA_CPP
|
|
uint8_t *data = task->fb;
|
|
uint8_t *prevData = task->prevFb;
|
|
const bool taskAndFramebufferSizesCompatibleWithTightMemcpy = (task->PayloadSize() % 32 == 0) && (task->width % 16 == 0);
|
|
#else
|
|
uint8_t *data = task->PayloadStart();
|
|
#endif
|
|
|
|
int bytesLeft = task->PayloadSize();
|
|
int taskStartX = 0;
|
|
|
|
while(bytesLeft > 0)
|
|
{
|
|
int sendSize = MIN(bytesLeft, MAX_DMA_SPI_TASK_SIZE);
|
|
bytesLeft -= sendSize;
|
|
|
|
volatile DMAControlBlock *tx = cb++;
|
|
txData[0] = BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS | (sendSize << 16); // The first four bytes written to the SPI data register control the DLEN and CS,CPOL,CPHA settings.
|
|
// This is really sad: we must do a memcpy to prepare for DMA controller to be able to do a memcpy. The reason for this is that the DMA source memory area must be in cache bypassing
|
|
// region of memory, which the SPI source ring buffer is not. It could be allocated to be so however, but bypassing the caches on the SPI ring buffer would cause a massive -51.5%
|
|
// profiled overall performance drop (tested on Pi3B+ and Tontec 3.5" 480x320 display on gpu test pattern, see branch non_intermediate_memcpy_for_dma). Therefore just keep doing
|
|
// this memcpy() to prepare for DMA to do its memcpy(), as it is faster overall. (If there was a way to map same physical memory to virtual address space twice, once cached, and
|
|
// another time uncached, and have writes bypass the cache and only write combine, but have reads follow the cache, then it might work without a perf hit, but not at all sure if
|
|
// that would be technically possible)
|
|
uint16_t *txPtr = (uint16_t*)(txData+1);
|
|
|
|
// If task->prevFb is present, the DMA backend is responsible for streaming pixel data from current framebuffer to old framebuffer, and the DMA task buffer.
|
|
// If not present, then that preparation has been already done by the caller.
|
|
#ifdef OFFLOAD_PIXEL_COPY_TO_DMA_CPP
|
|
if (prevData)
|
|
{
|
|
// For 2D pixel data, do a "everything in one pass"
|
|
if (taskAndFramebufferSizesCompatibleWithTightMemcpy)
|
|
memcpy_to_dma_and_prev_framebuffer((uint16_t*)txPtr, (uint16_t**)&prevData, (uint16_t**)&data, sendSize, &taskStartX, task->width, gpuFramebufferScanlineStrideBytes);
|
|
else
|
|
memcpy_to_dma_and_prev_framebuffer_in_c((uint16_t*)txPtr, (uint16_t**)&prevData, (uint16_t**)&data, sendSize, &taskStartX, task->width, gpuFramebufferScanlineStrideBytes);
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
memcpy(txPtr, data, sendSize);
|
|
data += sendSize;
|
|
}
|
|
|
|
tx->ti = BCM2835_DMA_TI_PERMAP(BCM2835_DMA_TI_PERMAP_SPI_TX) | BCM2835_DMA_TI_DEST_DREQ | BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_WAIT_RESP;
|
|
tx->src = VIRT_TO_BUS(dmaSourceBuffer, txData);
|
|
tx->dst = DMA_SPI_FIFO_PHYS_ADDRESS; // Write out to the SPI peripheral
|
|
tx->len = 4+sendSize;
|
|
tx->next = 0;
|
|
txData += 1+sendSize/4;
|
|
|
|
volatile DMAControlBlock *rx = cb++;
|
|
rx->ti = BCM2835_DMA_TI_PERMAP(BCM2835_DMA_TI_PERMAP_SPI_RX) | BCM2835_DMA_TI_SRC_DREQ | BCM2835_DMA_TI_DEST_IGNORE;
|
|
rx->src = DMA_SPI_FIFO_PHYS_ADDRESS;
|
|
rx->dst = 0;
|
|
rx->len = sendSize;
|
|
rx->next = 0;
|
|
|
|
if (rxTail)
|
|
{
|
|
volatile DMAControlBlock *setDMATxAddress = cb++;
|
|
volatile DMAControlBlock *disableTransferActive = cb++;
|
|
volatile DMAControlBlock *startDMATxChannel = cb++;
|
|
|
|
rxTail->next = VIRT_TO_BUS(dmaCb, setDMATxAddress);
|
|
|
|
setDMATxAddressData[0] = VIRT_TO_BUS(dmaCb, tx);
|
|
setDMATxAddress->ti = BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_DEST_INC | BCM2835_DMA_TI_WAIT_RESP;
|
|
setDMATxAddress->src = VIRT_TO_BUS(dmaSourceBuffer, setDMATxAddressData);
|
|
setDMATxAddress->dst = DMA_DMA0_CB_PHYS_ADDRESS + dmaTxChannel*0x100 + 4;
|
|
setDMATxAddress->len = 4;
|
|
setDMATxAddress->next = VIRT_TO_BUS(dmaCb, disableTransferActive);
|
|
++setDMATxAddressData;
|
|
|
|
disableTransferActive->ti = BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_DEST_INC | BCM2835_DMA_TI_WAIT_RESP;
|
|
disableTransferActive->src = dmaConstantData.busAddress;
|
|
disableTransferActive->dst = DMA_SPI_CS_PHYS_ADDRESS;
|
|
disableTransferActive->len = 4;
|
|
disableTransferActive->next = VIRT_TO_BUS(dmaCb, startDMATxChannel);
|
|
|
|
startDMATxChannel->ti = BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_DEST_INC | BCM2835_DMA_TI_WAIT_RESP;
|
|
startDMATxChannel->src = dmaConstantData.busAddress+4;
|
|
startDMATxChannel->dst = DMA_DMA0_CB_PHYS_ADDRESS + dmaTxChannel*0x100;
|
|
startDMATxChannel->len = 4;
|
|
startDMATxChannel->next = VIRT_TO_BUS(dmaCb, rx);
|
|
|
|
}
|
|
rxTail = rx;
|
|
}
|
|
|
|
static uint64_t taskStartTime = 0;
|
|
static int pendingTaskBytes = 1;
|
|
double pendingTaskUSecs = pendingTaskBytes * spiUsecsPerByte;
|
|
pendingTaskUSecs -= tick() - taskStartTime;
|
|
if (pendingTaskUSecs > 70)
|
|
usleep(pendingTaskUSecs-70);
|
|
|
|
uint64_t dmaTaskStart = tick();
|
|
|
|
CheckSPIDMAChannelsNotStolen();
|
|
while((dmaTx->cs & BCM2835_DMA_CS_ACTIVE) && programRunning)
|
|
{
|
|
usleep(250);
|
|
CheckSPIDMAChannelsNotStolen();
|
|
if (tick() - dmaTaskStart > 5000000)
|
|
{
|
|
DumpDMAState();
|
|
FATAL_ERROR("DMA TX channel has stalled!");
|
|
}
|
|
}
|
|
while((dmaRx->cs & BCM2835_DMA_CS_ACTIVE) && programRunning)
|
|
{
|
|
usleep(250);
|
|
CheckSPIDMAChannelsNotStolen();
|
|
if (tick() - dmaTaskStart > 5000000)
|
|
{
|
|
DumpDMAState();
|
|
FATAL_ERROR("DMA RX channel has stalled!");
|
|
}
|
|
}
|
|
if (!programRunning) return;
|
|
|
|
pendingTaskBytes = task->PayloadSize();
|
|
|
|
// First send the SPI command byte in Polled SPI mode
|
|
spi->cs = BCM2835_SPI0_CS_TA | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
|
|
|
|
#ifndef SPI_3WIRE_PROTOCOL
|
|
CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);
|
|
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
|
|
spi->fifo = 0;
|
|
spi->fifo = task->cmd;
|
|
while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
|
|
// spi->fifo; // Currently no need to flush these, the clear below clears the rx queue.
|
|
// spi->fifo;
|
|
#else
|
|
spi->fifo = task->cmd;
|
|
while(!(spi->cs & (BCM2835_SPI0_CS_RXD|BCM2835_SPI0_CS_DONE))) /*nop*/;
|
|
// spi->fifo; // Currently no need to flush this, the clear below clears the rx queue.
|
|
#endif
|
|
|
|
SET_GPIO(GPIO_TFT_DATA_CONTROL);
|
|
#endif
|
|
|
|
spi->cs = BCM2835_SPI0_CS_DMAEN | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
|
|
|
|
dmaTx->cbAddr = VIRT_TO_BUS(dmaCb, tx0);
|
|
dmaRx->cbAddr = VIRT_TO_BUS(dmaCb, rx0);
|
|
__sync_synchronize();
|
|
dmaTx->cs = BCM2835_DMA_CS_ACTIVE | BCM2835_DMA_CS_END;
|
|
dmaRx->cs = BCM2835_DMA_CS_ACTIVE | BCM2835_DMA_CS_END;
|
|
taskStartTime = tick();
|
|
}
|
|
|
|
#else
|
|
|
|
void SPIDMATransfer(SPITask *task)
|
|
{
|
|
// Transition the SPI peripheral to enable the use of DMA
|
|
spi->cs = BCM2835_SPI0_CS_DMAEN | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
|
|
uint32_t *headerAddr = task->DmaSpiHeaderAddress();
|
|
*headerAddr = BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS | (task->PayloadSize() << 16); // The first four bytes written to the SPI data register control the DLEN and CS,CPOL,CPHA settings.
|
|
|
|
// TODO: Ideally we would be able to directly perform the DMA from the SPI ring buffer from 'task' pointer. However
|
|
// that pointer is shared to userland, and it is proving troublesome to make it both userland-writable as well as cache-bypassing DMA coherent.
|
|
// Therefore these two memory areas are separate for now, and we memcpy() from SPI ring buffer to an intermediate 'dmaSourceMemory' memory area to perform
|
|
// the DMA transfer. Is there a way to avoid this intermediate buffer? That would improve performance a bit.
|
|
memcpy(dmaSourceBuffer.virtualAddr, headerAddr, task->PayloadSize() + 4);
|
|
|
|
volatile DMAControlBlock *cb = (volatile DMAControlBlock *)dmaCb.virtualAddr;
|
|
volatile DMAControlBlock *txcb = &cb[0];
|
|
txcb->ti = BCM2835_DMA_TI_PERMAP(BCM2835_DMA_TI_PERMAP_SPI_TX) | BCM2835_DMA_TI_DEST_DREQ | BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_WAIT_RESP;
|
|
txcb->src = dmaSourceBuffer.busAddress;
|
|
txcb->dst = DMA_SPI_FIFO_PHYS_ADDRESS; // Write out to the SPI peripheral
|
|
txcb->len = task->PayloadSize() + 4;
|
|
txcb->stride = 0;
|
|
txcb->next = 0;
|
|
txcb->debug = 0;
|
|
txcb->reserved = 0;
|
|
dmaTx->cbAddr = dmaCb.busAddress;
|
|
|
|
volatile DMAControlBlock *rxcb = &cb[1];
|
|
rxcb->ti = BCM2835_DMA_TI_PERMAP(BCM2835_DMA_TI_PERMAP_SPI_RX) | BCM2835_DMA_TI_SRC_DREQ | BCM2835_DMA_TI_DEST_IGNORE;
|
|
rxcb->src = DMA_SPI_FIFO_PHYS_ADDRESS;
|
|
rxcb->dst = 0;
|
|
rxcb->len = task->PayloadSize();
|
|
rxcb->stride = 0;
|
|
rxcb->next = 0;
|
|
rxcb->debug = 0;
|
|
rxcb->reserved = 0;
|
|
dmaRx->cbAddr = dmaCb.busAddress + sizeof(DMAControlBlock);
|
|
|
|
__sync_synchronize();
|
|
dmaTx->cs = BCM2835_DMA_CS_ACTIVE;
|
|
dmaRx->cs = BCM2835_DMA_CS_ACTIVE;
|
|
__sync_synchronize();
|
|
|
|
double pendingTaskUSecs = task->PayloadSize() * spiUsecsPerByte;
|
|
if (pendingTaskUSecs > 70)
|
|
usleep(pendingTaskUSecs-70);
|
|
|
|
uint64_t dmaTaskStart = tick();
|
|
|
|
CheckSPIDMAChannelsNotStolen();
|
|
while((dmaTx->cs & BCM2835_DMA_CS_ACTIVE))
|
|
{
|
|
CheckSPIDMAChannelsNotStolen();
|
|
if (tick() - dmaTaskStart > 5000000)
|
|
FATAL_ERROR("DMA TX channel has stalled!");
|
|
}
|
|
while((dmaRx->cs & BCM2835_DMA_CS_ACTIVE))
|
|
{
|
|
CheckSPIDMAChannelsNotStolen();
|
|
if (tick() - dmaTaskStart > 5000000)
|
|
FATAL_ERROR("DMA RX channel has stalled!");
|
|
}
|
|
|
|
__sync_synchronize();
|
|
spi->cs = BCM2835_SPI0_CS_TA | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
|
|
__sync_synchronize();
|
|
}
|
|
|
|
#endif
|
|
|
|
void DeinitDMA(void)
|
|
{
|
|
WaitForDMAFinished();
|
|
ResetDMAChannels();
|
|
FreeUncachedGpuMemory(dmaSourceBuffer);
|
|
FreeUncachedGpuMemory(dmaCb);
|
|
FreeUncachedGpuMemory(dmaConstantData);
|
|
if (dmaTxChannel != -1)
|
|
{
|
|
FreeDMAChannel(dmaTxChannel);
|
|
dmaTxChannel = -1;
|
|
}
|
|
if (dmaRxChannel != -1)
|
|
{
|
|
FreeDMAChannel(dmaRxChannel);
|
|
dmaRxChannel = -1;
|
|
}
|
|
}
|
|
|
|
|
|
#endif // ~USE_DMA_TRANSFERS
|