#ifndef KERNEL_MODULE
#include <stdio.h> // fprintf, stderr
#include <stdlib.h> // exit
#include <string.h> // memset, memcpy
#include <inttypes.h> // uint32_t
#include <syslog.h> // syslog
#include <sys/mman.h> // mmap, munmap, PROT_READ, PROT_WRITE
#include <unistd.h> // usleep
#endif

#include "config.h"
#include "dma.h"
#include "spi.h"
#include "gpu.h"
#include "util.h"
#include "mailbox.h"

#ifdef USE_DMA_TRANSFERS

#define BCM2835_PERI_BASE               0x3F000000

SharedMemory *dmaSourceMemory = 0;
volatile DMAChannelRegisterFile *dma0 = 0;

volatile DMAChannelRegisterFile *dmaTx = 0;
volatile DMAChannelRegisterFile *dmaRx = 0;
int dmaTxChannel = -1;
int dmaTxIrq = 0;
int dmaRxChannel = -1;
int dmaRxIrq = 0;

#define PAGE_SIZE 4096

struct GpuMemory
{
  uint32_t allocationHandle;
  void *virtualAddr;
  uintptr_t busAddress;
  uint32_t sizeBytes;
};

#define NUM_DMA_CBS 1024
GpuMemory dmaCb, dmaSourceBuffer, dmaConstantData;

volatile DMAControlBlock *dmaSendTail = 0;
volatile DMAControlBlock *dmaRecvTail = 0;
volatile DMAControlBlock *firstFreeCB = 0;
volatile uint8_t *dmaSourceEnd = 0;
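
// The dmaCb control block area and the dmaSourceBuffer data area are both managed as simple
// bump-allocator rings: GrabFreeCBs() and GrabFreeDMASourceBytes() hand out consecutive slots, and
// when a request would run past the end of the buffer, they wait for all in-flight DMA to drain
// and then wrap back to the start, at which point no control block or source byte is referenced
// by the hardware anymore.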

volatile DMAControlBlock *GrabFreeCBs(int num)
{
  volatile DMAControlBlock *firstCB = (volatile DMAControlBlock *)dmaCb.virtualAddr;
  volatile DMAControlBlock *endCB = firstCB + NUM_DMA_CBS;
  if (firstFreeCB + num >= endCB)
  {
    WaitForDMAFinished();
    firstFreeCB = firstCB;
  }

  volatile DMAControlBlock *ret = firstFreeCB;
  firstFreeCB += num;
  return ret;
}

volatile uint8_t *GrabFreeDMASourceBytes(int bytes)
{
  if ((uintptr_t)dmaSourceEnd + bytes >= (uintptr_t)dmaSourceBuffer.virtualAddr + dmaSourceBuffer.sizeBytes)
  {
    WaitForDMAFinished();
    dmaSourceEnd = (volatile uint8_t *)dmaSourceBuffer.virtualAddr;
  }

  volatile uint8_t *ret = dmaSourceEnd;
  dmaSourceEnd += bytes;
  return ret;
}

static int AllocateDMAChannel(int *dmaChannel, int *irq)
{
  // Snooping the DMA controller, channels 3, 5 and 6 were seen to be active.
  // TODO: Actually reserve the DMA channels from the system using bcm_dma_chan_alloc() and bcm_dma_chan_free()?...
  // Right now, default to channels 7 and 1 (5 and 1 on FreePlayTech devices), which seem to be free.
  // Note: The send channel could be a lite channel, but receive channel cannot, since receiving uses the IGNORE flag
  // that lite DMA engines don't have.
#ifdef FREEPLAYTECH_WAVESHARE32B
  // On FreePlayTech Zero, DMA channel 4 seen to be taken by SD HOST (peripheral mapping 13).
  int freeChannels[] = { 5, 1 };
#else
  int freeChannels[] = { 7, 1 };
#endif
#if defined(DMA_TX_CHANNEL)
  freeChannels[0] = DMA_TX_CHANNEL;
#endif
#if defined(DMA_RX_CHANNEL)
  freeChannels[1] = DMA_RX_CHANNEL;
#endif
  if (freeChannels[0] == freeChannels[1]) FATAL_ERROR("DMA TX and RX channels cannot be the same channel!");

  static int nextFreeChannel = 0;
  if (nextFreeChannel >= sizeof(freeChannels) / sizeof(freeChannels[0])) FATAL_ERROR("No free DMA channels");

  *dmaChannel = freeChannels[nextFreeChannel++];
  LOG("Allocated DMA channel %d", *dmaChannel);
  *irq = 0;
  return 0;
}

void FreeDMAChannel(int channel)
{
  volatile DMAChannelRegisterFile *dma = GetDMAChannel(channel);
  dma->cb.ti = 0; // Clear the SPI TX & RX peripheral map of this DMA channel, so that a later run does not mistake a stale mapping for another program using the channel for SPI
}

// Message IDs for different mailbox GPU memory allocation messages
#define MEM_ALLOC_MESSAGE 0x3000c // This message is 3 u32s: numBytes, alignment and flags
#define MEM_FREE_MESSAGE 0x3000f // This message is 1 u32: handle
#define MEM_LOCK_MESSAGE 0x3000d // 1 u32: handle
#define MEM_UNLOCK_MESSAGE 0x3000e // 1 u32: handle

// Memory allocation flags
#define MEM_ALLOC_FLAG_DIRECT (1 << 2) // Allocate uncached memory that bypasses L1 and L2 cache on loads and stores
#define MEM_ALLOC_FLAG_COHERENT (1 << 3) // Non-allocating in L2 but coherent

#define BUS_TO_PHYS(x) ((x) & ~0xC0000000)

#define PHYS_TO_BUS(x) ((x) |  0xC0000000)

#define VIRT_TO_BUS(block, x) ((uintptr_t)(x) - (uintptr_t)((block).virtualAddr) + (block).busAddress)
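
// Worked example of the translations above (hypothetical addresses): if the mailbox returns
// busAddress = 0xC0A4B000 (the uncached 0xC0000000-based alias), BUS_TO_PHYS yields the ARM
// physical address 0x00A4B000 to mmap() through /dev/mem, and VIRT_TO_BUS maps a CPU pointer
// inside the block back to the alias that the DMA engine must be programmed with, e.g. for
// virtualAddr = 0xB6F00000:
//   VIRT_TO_BUS(block, 0xB6F00010) == 0xB6F00010 - 0xB6F00000 + 0xC0A4B000 == 0xC0A4B010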

uint64_t totalGpuMemoryUsed = 0;

// Allocates the given number of bytes in GPU side memory, and returns the virtual address and bus address of the allocated memory block.
// The virtual address holds an uncached view to the allocated memory, so writes and reads to that memory address bypass the L1 and L2 caches. Use
// this kind of memory to pass data blocks over to the DMA controller to process.
GpuMemory AllocateUncachedGpuMemory(uint32_t numBytes, const char *reason)
{
  GpuMemory mem;
  mem.sizeBytes = ALIGN_UP(numBytes, PAGE_SIZE);
  uint32_t allocationFlags = MEM_ALLOC_FLAG_DIRECT | MEM_ALLOC_FLAG_COHERENT;
  mem.allocationHandle = Mailbox(MEM_ALLOC_MESSAGE, /*size=*/mem.sizeBytes, /*alignment=*/PAGE_SIZE, /*flags=*/allocationFlags);
  if (!mem.allocationHandle) FATAL_ERROR("Failed to allocate GPU memory! Try increasing gpu_mem allocation in /boot/config.txt. See https://www.raspberrypi.org/documentation/configuration/config-txt/memory.md");
  mem.busAddress = Mailbox(MEM_LOCK_MESSAGE, mem.allocationHandle);
  if (!mem.busAddress) FATAL_ERROR("Failed to lock GPU memory!");
  mem.virtualAddr = mmap(0, mem.sizeBytes, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, BUS_TO_PHYS(mem.busAddress));
  if (mem.virtualAddr == MAP_FAILED) FATAL_ERROR("Failed to mmap GPU memory!");
  totalGpuMemoryUsed += mem.sizeBytes;
//  printf("Allocated %u bytes of GPU memory for %s (bus address=%p). Total GPU memory used: %llu bytes\n", mem.sizeBytes, reason, (void*)mem.busAddress, totalGpuMemoryUsed);
  return mem;
}

void FreeUncachedGpuMemory(GpuMemory mem)
{
  totalGpuMemoryUsed -= mem.sizeBytes;
  munmap(mem.virtualAddr, mem.sizeBytes);
  Mailbox(MEM_UNLOCK_MESSAGE, mem.allocationHandle);
  Mailbox(MEM_FREE_MESSAGE, mem.allocationHandle);
}
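
// Illustrative usage sketch of the allocate/free pair above (hypothetical, not called anywhere
// in this file):
//
//   GpuMemory scratch = AllocateUncachedGpuMemory(8192, "scratch");
//   volatile uint32_t *p = (volatile uint32_t *)scratch.virtualAddr;
//   p[0] = 0x12345678;                  // uncached write, immediately visible to the DMA engine
//   uintptr_t src = scratch.busAddress; // this is the address to program into a DMA control block
//   FreeUncachedGpuMemory(scratch);     // munmap + unlock + free, in reverse order of allocation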

volatile DMAChannelRegisterFile *GetDMAChannel(int channelNumber)
{
  if (channelNumber < 0 || channelNumber >= BCM2835_NUM_DMA_CHANNELS)
  {
    printf("Invalid DMA channel %d specified!\n", channelNumber);
    FATAL_ERROR("Invalid DMA channel specified!");
  }
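  // DMA channel register files are spaced 0x100 bytes apart in the BCM2835 address map, so this
  // pointer arithmetic is valid only because DMAChannelRegisterFile is (assumed to be) padded to
  // exactly 0x100 bytes in dma.h.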
  return dma0 + channelNumber;
}

void DumpDMAPeripheralMap()
{
  for(int i = 0; i < BCM2835_NUM_DMA_CHANNELS; ++i)
  {
    volatile DMAChannelRegisterFile *channel = GetDMAChannel(i);
    printf("DMA channel %d has peripheral map %d (is lite channel: %d, currently active: %d, current control block: %p)\n", i, (channel->cb.ti & BCM2835_DMA_TI_PERMAP_MASK) >> BCM2835_DMA_TI_PERMAP_SHIFT, (channel->cb.debug & BCM2835_DMA_DEBUG_LITE) ? 1 : 0, (channel->cs & BCM2835_DMA_CS_ACTIVE) ? 1 : 0, channel->cbAddr);
  }
}

// Verifies that no other program has stomped on the DMA channel that we are using.
void CheckDMAChannelNotStolen(int channelNumber, int expectedPeripheralMap)
{
  volatile DMAChannelRegisterFile *channel = GetDMAChannel(channelNumber);
  uint32_t peripheralMap = ((channel->cb.ti & BCM2835_DMA_TI_PERMAP_MASK) >> BCM2835_DMA_TI_PERMAP_SHIFT);
  if (peripheralMap != expectedPeripheralMap && peripheralMap != 0)
  {
    DumpDMAPeripheralMap();
    printf("DMA channel collision! DMA channel %d was expected to be assigned to our peripheral %d, but something else has assigned it to peripheral %d!\n", channelNumber, expectedPeripheralMap, peripheralMap);
    FATAL_ERROR("System is likely unstable now, rebooting is advised.");
  }
  uint32_t cbAddr = channel->cbAddr;
  if (cbAddr && (cbAddr < dmaCb.busAddress || cbAddr >= dmaCb.busAddress + dmaCb.sizeBytes))
  {
    DumpDMAPeripheralMap();
    printf("DMA channel collision! Some other program has submitted a DMA task to our DMA channel %d! (DMA task at unknown control block address %p)\n", channelNumber, cbAddr);
    FATAL_ERROR("System is likely unstable now, rebooting is advised.");
  }
}

void CheckSPIDMAChannelsNotStolen()
{
  CheckDMAChannelNotStolen(dmaTxChannel, BCM2835_DMA_TI_PERMAP_SPI_TX);
  CheckDMAChannelNotStolen(dmaRxChannel, BCM2835_DMA_TI_PERMAP_SPI_RX);
}

void ResetDMAChannels()
{
  dmaTx->cs = BCM2835_DMA_CS_RESET;
  dmaTx->cb.debug = BCM2835_DMA_DEBUG_DMA_READ_ERROR | BCM2835_DMA_DEBUG_DMA_FIFO_ERROR | BCM2835_DMA_DEBUG_READ_LAST_NOT_SET_ERROR;
  dmaRx->cs = BCM2835_DMA_CS_RESET;
  dmaRx->cb.debug = BCM2835_DMA_DEBUG_DMA_READ_ERROR | BCM2835_DMA_DEBUG_DMA_FIFO_ERROR | BCM2835_DMA_DEBUG_READ_LAST_NOT_SET_ERROR;
}

int InitDMA()
{
#if defined(KERNEL_MODULE)
  dma0 = (volatile DMAChannelRegisterFile*)ioremap(BCM2835_PERI_BASE+BCM2835_DMA0_OFFSET, BCM2835_NUM_DMA_CHANNELS*0x100);
#else
  dma0 = (volatile DMAChannelRegisterFile*)((uintptr_t)bcm2835 + BCM2835_DMA0_OFFSET);
#endif

#ifdef KERNEL_MODULE_CLIENT
  dmaTxChannel = spiTaskMemory->dmaTxChannel;
  dmaRxChannel = spiTaskMemory->dmaRxChannel;
#else
  int ret = AllocateDMAChannel(&dmaTxChannel, &dmaTxIrq);
  if (ret != 0) FATAL_ERROR("Unable to allocate TX DMA channel!");
  ret = AllocateDMAChannel(&dmaRxChannel, &dmaRxIrq);
  if (ret != 0) FATAL_ERROR("Unable to allocate RX DMA channel!");

  printf("Enabling DMA channels Tx:%d and Rx:%d\n", dmaTxChannel, dmaRxChannel);
  volatile uint32_t *dmaEnableRegister = (volatile uint32_t *)((uintptr_t)dma0 + BCM2835_DMAENABLE_REGISTER_OFFSET);

  // Enable the allocated DMA channels
  *dmaEnableRegister |= (1 << dmaTxChannel);
  *dmaEnableRegister |= (1 << dmaRxChannel);
#endif

#if !defined(KERNEL_MODULE)
  dmaCb = AllocateUncachedGpuMemory(sizeof(DMAControlBlock) * NUM_DMA_CBS, "DMA control blocks");
  memset(dmaCb.virtualAddr, 0, dmaCb.sizeBytes); // Some fields of the CBs (debug, reserved) are initialized to zero and assumed to stay so throughout app lifetime.
  firstFreeCB = (volatile DMAControlBlock *)dmaCb.virtualAddr;

  dmaSourceBuffer = AllocateUncachedGpuMemory(SHARED_MEMORY_SIZE*2, "DMA source data");
  dmaSourceEnd = (volatile uint8_t *)dmaSourceBuffer.virtualAddr;

  dmaConstantData = AllocateUncachedGpuMemory(2*sizeof(uint32_t), "DMA constant data");
  uint32_t *constantData = (uint32_t *)dmaConstantData.virtualAddr;
  constantData[0] = BCM2835_SPI0_CS_DMAEN; // constantData[0] is for disableTransferActive task
  constantData[1] = BCM2835_DMA_CS_ACTIVE | BCM2835_DMA_CS_END; // constantData[1] is for startDMATxChannel task
#endif

  LOG("DMA hardware register file is at ptr: %p, using DMA TX channel: %d and DMA RX channel: %d", dma0, dmaTxChannel, dmaRxChannel);
  if (!dma0) FATAL_ERROR("Failed to map DMA!");

  dmaTx = GetDMAChannel(dmaTxChannel);
  dmaRx = GetDMAChannel(dmaRxChannel);
  LOG("DMA hardware TX channel register file is at ptr: %p, DMA RX channel register file is at ptr: %p", dmaTx, dmaRx);
  int dmaTxPeripheralMap = (dmaTx->cb.ti & BCM2835_DMA_TI_PERMAP_MASK) >> BCM2835_DMA_TI_PERMAP_SHIFT;
  if (dmaTxPeripheralMap != 0 && dmaTxPeripheralMap != BCM2835_DMA_TI_PERMAP_SPI_TX)
  {
    DumpDMAPeripheralMap();
    LOG("DMA TX channel %d was assigned another peripheral map %d!", dmaTxChannel, dmaTxPeripheralMap);
    FATAL_ERROR("DMA TX channel was assigned another peripheral map!");
  }
  if (dmaTx->cbAddr != 0 && (dmaTx->cs & BCM2835_DMA_CS_ACTIVE))
    FATAL_ERROR("DMA TX channel was in use!");

  int dmaRxPeripheralMap = (dmaRx->cb.ti & BCM2835_DMA_TI_PERMAP_MASK) >> BCM2835_DMA_TI_PERMAP_SHIFT;
  if (dmaRxPeripheralMap != 0 && dmaRxPeripheralMap != BCM2835_DMA_TI_PERMAP_SPI_RX)
  {
    LOG("DMA RX channel %d was assigned another peripheral map %d!", dmaRxChannel, dmaRxPeripheralMap);
    DumpDMAPeripheralMap();
    FATAL_ERROR("DMA RX channel was assigned another peripheral map!");
  }
  if (dmaRx->cbAddr != 0 && (dmaRx->cs & BCM2835_DMA_CS_ACTIVE))
    FATAL_ERROR("DMA RX channel was in use!");

  if ((dmaRx->cb.debug & BCM2835_DMA_DEBUG_LITE) != 0)
    FATAL_ERROR("DMA RX channel cannot be a lite channel, because to get best performance we want to use BCM2835_DMA_TI_DEST_IGNORE DMA operation mode that lite DMA channels do not have. (Try using DMA RX channel value < 7)");

  LOG("Resetting DMA channels for use");
  ResetDMAChannels();

  // TODO: Set up IRQ
  LOG("DMA all set up");
  return 0;
}

// Debugging functions to introspect SPI and DMA hardware registers:

void DumpCS(uint32_t reg)
{
  PRINT_FLAG(BCM2835_DMA_CS_RESET);
  PRINT_FLAG(BCM2835_DMA_CS_ABORT);
  PRINT_FLAG(BCM2835_DMA_CS_DISDEBUG);
  PRINT_FLAG(BCM2835_DMA_CS_WAIT_FOR_OUTSTANDING_WRITES);
  PRINT_FLAG(BCM2835_DMA_CS_PANIC_PRIORITY);
  PRINT_FLAG(BCM2835_DMA_CS_PRIORITY);
  PRINT_FLAG(BCM2835_DMA_CS_ERROR);
  PRINT_FLAG(BCM2835_DMA_CS_WAITING_FOR_OUTSTANDING_WRITES);
  PRINT_FLAG(BCM2835_DMA_CS_DREQ_STOPS_DMA);
  PRINT_FLAG(BCM2835_DMA_CS_PAUSED);
  PRINT_FLAG(BCM2835_DMA_CS_DREQ);
  PRINT_FLAG(BCM2835_DMA_CS_INT);
  PRINT_FLAG(BCM2835_DMA_CS_END);
  PRINT_FLAG(BCM2835_DMA_CS_ACTIVE);
}

void DumpDebug(uint32_t reg)
{
  PRINT_FLAG(BCM2835_DMA_DEBUG_LITE);
  PRINT_FLAG(BCM2835_DMA_DEBUG_VERSION);
  PRINT_FLAG(BCM2835_DMA_DEBUG_DMA_STATE);
  PRINT_FLAG(BCM2835_DMA_DEBUG_DMA_ID);
  PRINT_FLAG(BCM2835_DMA_DEBUG_DMA_OUTSTANDING_WRITES);
  PRINT_FLAG(BCM2835_DMA_DEBUG_DMA_READ_ERROR);
  PRINT_FLAG(BCM2835_DMA_DEBUG_DMA_FIFO_ERROR);
  PRINT_FLAG(BCM2835_DMA_DEBUG_READ_LAST_NOT_SET_ERROR);
}

void DumpTI(uint32_t reg)
{
  PRINT_FLAG(BCM2835_DMA_TI_NO_WIDE_BURSTS);
  PRINT_FLAG(BCM2835_DMA_TI_WAITS);
#define BCM2835_DMA_TI_PERMAP_MASK_SHIFT                    16
  PRINT_FLAG(BCM2835_DMA_TI_PERMAP_MASK);
//  PRINT_FLAG(BCM2835_DMA_TI_BURST_LENGTH);
  PRINT_FLAG(BCM2835_DMA_TI_SRC_IGNORE);
  PRINT_FLAG(BCM2835_DMA_TI_SRC_DREQ);
  PRINT_FLAG(BCM2835_DMA_TI_SRC_WIDTH);
  PRINT_FLAG(BCM2835_DMA_TI_SRC_INC);
  PRINT_FLAG(BCM2835_DMA_TI_DEST_IGNORE);
  PRINT_FLAG(BCM2835_DMA_TI_DEST_DREQ);
  PRINT_FLAG(BCM2835_DMA_TI_DEST_WIDTH);
  PRINT_FLAG(BCM2835_DMA_TI_DEST_INC);
  PRINT_FLAG(BCM2835_DMA_TI_WAIT_RESP);
  PRINT_FLAG(BCM2835_DMA_TI_TDMODE);
  PRINT_FLAG(BCM2835_DMA_TI_INTEN);
}
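
// The addresses below are VideoCore bus addresses (0x7Ennnnnn), which is how the DMA engine and
// the SPI peripheral see each other; the ARM CPU reaches the same registers through the physical
// window at BCM2835_PERI_BASE instead.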

#define DMA_DMA0_CB_PHYS_ADDRESS 0x7E007000

#define DMA_SPI_CS_PHYS_ADDRESS 0x7E204000
#define DMA_SPI_FIFO_PHYS_ADDRESS 0x7E204004
#define DMA_SPI_DLEN_PHYS_ADDRESS 0x7E20400C
#define DMA_GPIO_SET_PHYS_ADDRESS 0x7E20001C
#define DMA_GPIO_CLEAR_PHYS_ADDRESS 0x7E200028

void DumpDMAState()
{
  printf("---SPI:---\n");
  DumpSPICS(spi->cs);
  printf("---DMATX CS:---\n");
  DumpCS(dmaTx->cs);
  printf("---DMATX TI:---\n");
  DumpTI(dmaTx->cb.ti);
  printf("---DMATX DEBUG:---\n");
  DumpDebug(dmaTx->cb.debug);
  printf("****** DMATX cbAddr: %p\n", dmaTx->cbAddr);

  printf("---DMARX CS:---\n");
  DumpCS(dmaRx->cs);
  printf("---DMARX TI:---\n");
  DumpTI(dmaRx->cb.ti);
  printf("---DMARX DEBUG:---\n");
  DumpDebug(dmaRx->cb.debug);
  printf("****** DMARX cbAddr: %p\n", dmaRx->cbAddr);
}

extern volatile bool programRunning;

void WaitForDMAFinished()
{
  uint64_t t0 = tick();
  while((dmaTx->cs & BCM2835_DMA_CS_ACTIVE) && programRunning)
  {
    usleep(100);
    if (tick() - t0 > 2000000)
    {
      printf("TX stalled\n");
      DumpDMAState();
      exit(1);
    }
  }
  t0 = tick();
  while((dmaRx->cs & BCM2835_DMA_CS_ACTIVE) && programRunning)
  {
    usleep(100);
    if (tick() - t0 > 2000000)
    {
      printf("RX stalled\n");
      DumpDMAState();
      exit(1);
    }
  }
  dmaSendTail = 0;
  dmaRecvTail = 0;
}

#ifdef ALL_TASKS_SHOULD_DMA

// This function does a memcpy from one source buffer to two destination buffers simultaneously.
// It saves a lot of time on ARMv6 by avoiding two separate memory copies: the ARMv6 L1 cache is so tiny (4KB) that it cannot hold a whole framebuffer
// at a time, so streaming through the source only once instead of twice helps memory bandwidth immensely. This is profiled to be ~4x faster than a pair of memcpys or a simple CPU loop.
// In addition, this does a little endian->big endian conversion when copying data out to dstDma.
static void memcpy_to_dma_and_prev_framebuffer(uint16_t *dstDma, uint16_t **dstPrevFramebuffer, uint16_t **srcFramebuffer, int numBytes, int *taskStartX, int width, int stride)
{
  int strideEnd = stride - width*2;
  int xLeft = width-*taskStartX;

  uint16_t *Src = *srcFramebuffer;
  uint16_t *Dst1 = *dstPrevFramebuffer;

  // TODO: Do the loops in aligned order with unaligned head and tail separate, and ensure that dstDma, dstPrevFramebuffer and srcFramebuffer are in same alignment phase.
  asm volatile(
  "start_%=:\n"
    "ldrd r0, r1, [%[srcFramebuffer]], #8\n"
    "pld [%[srcFramebuffer], #248]\n"
    "strd r0, r1, [%[dstPrevFramebuffer]], #8\n"
    "rev16 r0, r0\n"
    "rev16 r1, r1\n"
    "strd r0, r1, [%[dstDma]], #8\n"

    "ldrd r0, r1, [%[srcFramebuffer]], #8\n"
    "strd r0, r1, [%[dstPrevFramebuffer]], #8\n"
    "rev16 r0, r0\n"
    "rev16 r1, r1\n"
    "strd r0, r1, [%[dstDma]], #8\n"

    "ldrd r0, r1, [%[srcFramebuffer]], #8\n"
    "strd r0, r1, [%[dstPrevFramebuffer]], #8\n"
    "rev16 r0, r0\n"
    "rev16 r1, r1\n"
    "strd r0, r1, [%[dstDma]], #8\n"

    "ldrd r0, r1, [%[srcFramebuffer]], #8\n"
    "strd r0, r1, [%[dstPrevFramebuffer]], #8\n"
    "rev16 r0, r0\n"
    "rev16 r1, r1\n"
    "strd r0, r1, [%[dstDma]], #8\n"

    "subs %[xLeft], %[xLeft], #16\n"
    "addls %[xLeft], %[xLeft], %[width]\n"
    "addls %[dstPrevFramebuffer], %[dstPrevFramebuffer], %[strideEnd]\n"
    "addls %[srcFramebuffer], %[srcFramebuffer], %[strideEnd]\n"

    "subs %[numBytes], %[numBytes], #32\n"
    "bhi start_%=\n"

    : [dstDma]"+r"(dstDma), [dstPrevFramebuffer]"+r"(Dst1), [srcFramebuffer]"+r"(Src), [xLeft]"+r"(xLeft), [numBytes]"+r"(numBytes)
    : [strideEnd]"r"(strideEnd), [width]"r"(width)
    : "r0", "r1", "memory", "cc"
  );
  *taskStartX = width - xLeft;
  *srcFramebuffer = Src;
  *dstPrevFramebuffer = Dst1;
}

static void memcpy_to_dma_and_prev_framebuffer_in_c(uint16_t *dstDma, uint16_t **dstPrevFramebuffer, uint16_t **srcFramebuffer, int numBytes, int *taskStartX, int width, int stride)
{
  static bool performanceWarningPrinted = false;
  if (!performanceWarningPrinted)
  {
    printf("Performance warning: using slow memcpy_to_dma_and_prev_framebuffer_in_c() function. Check conditions in display.h that enable OFFLOAD_PIXEL_COPY_TO_DMA_CPP and configure to use that instead.\n");
    performanceWarningPrinted = true;
  }
  int numPixels = numBytes>>1;
  int endStridePixels = (stride>>1) - width;
  uint16_t *prevData = *dstPrevFramebuffer;
  uint16_t *data = *srcFramebuffer;
  for(int i = 0; i < numPixels; ++i)
  {
    *prevData++ = *data;
    dstDma[i] = __builtin_bswap16(*data++);
    if (++*taskStartX >= width)
    {
      *taskStartX = 0;
      data += endStridePixels;
      prevData += endStridePixels;
    }
  }
  *srcFramebuffer = data;
  *dstPrevFramebuffer = prevData;
}

#if defined(ALL_TASKS_SHOULD_DMA) && defined(SPI_3WIRE_PROTOCOL)
// Bug: there is something about the chained DMA transfer mechanism that makes write window coordinate set commands not go through properly
// on 3-wire displays, though it is not yet known why. (Remove this #error statement to debug)
#error ALL_TASKS_SHOULD_DMA and SPI_3WIRE_PROTOCOL are currently not mutually compatible!
#endif

#if defined(OFFLOAD_PIXEL_COPY_TO_DMA_CPP) && defined(SPI_3WIRE_PROTOCOL)
// We would have to convert 8-bit tasks to 9-bit tasks immediately after offloaded memcpy has been done below to implement this.
#error OFFLOAD_PIXEL_COPY_TO_DMA_CPP and SPI_3WIRE_PROTOCOL are not mutually compatible!
#endif

void SPIDMATransfer(SPITask *task)
{
// There is a limit to how many bytes can be sent in one DMA-based SPI task, so if the task
// is larger than this, we'll split the send into multiple individual DMA SPI transfers
// and chain them together. This should be a multiple of 32 bytes to keep tasks cache aligned on ARMv6.
#define MAX_DMA_SPI_TASK_SIZE 65504

  const int numDMASendTasks = (task->PayloadSize() + MAX_DMA_SPI_TASK_SIZE - 1) / MAX_DMA_SPI_TASK_SIZE;

  volatile uint32_t *dmaData = (volatile uint32_t *)GrabFreeDMASourceBytes(4*(numDMASendTasks-1)+4*numDMASendTasks+task->PayloadSize());
  volatile uint32_t *setDMATxAddressData = dmaData;
  volatile uint32_t *txData = dmaData+numDMASendTasks-1;

  volatile DMAControlBlock *cb = GrabFreeCBs(numDMASendTasks*5-3);
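  // Worked example of the bookkeeping above, for a hypothetical 480x320 16bpp full-screen task:
  // PayloadSize() = 480*320*2 = 307200 bytes, so numDMASendTasks = ceil(307200/65504) = 5. The
  // source area then holds 4 chain-link words (4*(5-1) bytes), five 4-byte SPI headers
  // (4*5 bytes) and the payload itself, and the CB ring provides 5*5-3 = 22 control blocks:
  // a TX+RX pair per sub-task, plus 3 chaining CBs between each pair of consecutive sub-tasks.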

  volatile DMAControlBlock *rxTail = 0;
  volatile DMAControlBlock *tx0 = &cb[0];
  volatile DMAControlBlock *rx0 = &cb[1];

#ifdef OFFLOAD_PIXEL_COPY_TO_DMA_CPP
  uint8_t *data = task->fb;
  uint8_t *prevData = task->prevFb;
  const bool taskAndFramebufferSizesCompatibleWithTightMemcpy = (task->PayloadSize() % 32 == 0) && (task->width % 16 == 0);
#else
  uint8_t *data = task->PayloadStart();
#endif

  int bytesLeft = task->PayloadSize();
  int taskStartX = 0;

  while(bytesLeft > 0)
  {
    int sendSize = MIN(bytesLeft, MAX_DMA_SPI_TASK_SIZE);
    bytesLeft -= sendSize;

    volatile DMAControlBlock *tx = cb++;
    txData[0] = BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS | (sendSize << 16); // The first four bytes written to the SPI data register control the DLEN and CS,CPOL,CPHA settings.
    // This is really sad: we must do a memcpy to prepare for the DMA controller to be able to do a memcpy. The reason is that the DMA source memory area must live in a
    // cache-bypassing region of memory, which the SPI source ring buffer is not. It could be allocated that way, but bypassing the caches on the SPI ring buffer causes a massive
    // -51.5% profiled overall performance drop (tested on Pi 3B+ and a Tontec 3.5" 480x320 display on the gpu test pattern, see branch non_intermediate_memcpy_for_dma). Therefore
    // just keep doing this memcpy() to prepare for DMA to do its memcpy(), as it is faster overall. (If there were a way to map the same physical memory into the virtual address
    // space twice, once cached and once uncached, with writes bypassing the cache and only write-combining, but reads going through the cache, this might work without a perf hit,
    // but it is not at all clear that that is technically possible.)
    uint16_t *txPtr = (uint16_t*)(txData+1);

    // If task->prevFb is present, the DMA backend is responsible for streaming pixel data from current framebuffer to old framebuffer, and the DMA task buffer.
    // If not present, then that preparation has been already done by the caller.
#ifdef OFFLOAD_PIXEL_COPY_TO_DMA_CPP
    if (prevData)
    {
      // For 2D pixel data, do an "everything in one pass" copy
      if (taskAndFramebufferSizesCompatibleWithTightMemcpy)
        memcpy_to_dma_and_prev_framebuffer((uint16_t*)txPtr, (uint16_t**)&prevData, (uint16_t**)&data, sendSize, &taskStartX, task->width, gpuFramebufferScanlineStrideBytes);
      else
        memcpy_to_dma_and_prev_framebuffer_in_c((uint16_t*)txPtr, (uint16_t**)&prevData, (uint16_t**)&data, sendSize, &taskStartX, task->width, gpuFramebufferScanlineStrideBytes);
    }
    else
#endif
    {
      memcpy(txPtr, data, sendSize);
      data += sendSize;
    }

    tx->ti = BCM2835_DMA_TI_PERMAP(BCM2835_DMA_TI_PERMAP_SPI_TX) | BCM2835_DMA_TI_DEST_DREQ | BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_WAIT_RESP;
    tx->src = VIRT_TO_BUS(dmaSourceBuffer, txData);
    tx->dst = DMA_SPI_FIFO_PHYS_ADDRESS; // Write out to the SPI peripheral
    tx->len = 4+sendSize;
    tx->next = 0;
    txData += 1+sendSize/4;

    volatile DMAControlBlock *rx = cb++;
    rx->ti = BCM2835_DMA_TI_PERMAP(BCM2835_DMA_TI_PERMAP_SPI_RX) | BCM2835_DMA_TI_SRC_DREQ | BCM2835_DMA_TI_DEST_IGNORE;
    rx->src = DMA_SPI_FIFO_PHYS_ADDRESS;
    rx->dst = 0;
    rx->len = sendSize;
    rx->next = 0;
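
    // Chain consecutive sub-transfers together without CPU involvement: when the previous RX CB
    // completes, three helper CBs run on the RX channel. The first writes the bus address of the
    // next TX CB into the TX channel's CONBLK_AD register (offset 4 in the channel register file),
    // the second rewrites the SPI CS register back to plain DMAEN to close the previous DLEN
    // window, and the third pokes ACTIVE|END into the TX channel's CS register to kick off the
    // next sub-transfer.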

    if (rxTail)
    {
      volatile DMAControlBlock *setDMATxAddress = cb++;
      volatile DMAControlBlock *disableTransferActive = cb++;
      volatile DMAControlBlock *startDMATxChannel = cb++;

      rxTail->next = VIRT_TO_BUS(dmaCb, setDMATxAddress);

      setDMATxAddressData[0] = VIRT_TO_BUS(dmaCb, tx);
      setDMATxAddress->ti = BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_DEST_INC | BCM2835_DMA_TI_WAIT_RESP;
      setDMATxAddress->src = VIRT_TO_BUS(dmaSourceBuffer, setDMATxAddressData);
      setDMATxAddress->dst = DMA_DMA0_CB_PHYS_ADDRESS + dmaTxChannel*0x100 + 4;
      setDMATxAddress->len = 4;
      setDMATxAddress->next = VIRT_TO_BUS(dmaCb, disableTransferActive);
      ++setDMATxAddressData;

      disableTransferActive->ti = BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_DEST_INC | BCM2835_DMA_TI_WAIT_RESP;
      disableTransferActive->src = dmaConstantData.busAddress;
      disableTransferActive->dst = DMA_SPI_CS_PHYS_ADDRESS;
      disableTransferActive->len = 4;
      disableTransferActive->next = VIRT_TO_BUS(dmaCb, startDMATxChannel);

      startDMATxChannel->ti = BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_DEST_INC | BCM2835_DMA_TI_WAIT_RESP;
      startDMATxChannel->src = dmaConstantData.busAddress+4;
      startDMATxChannel->dst = DMA_DMA0_CB_PHYS_ADDRESS + dmaTxChannel*0x100;
      startDMATxChannel->len = 4;
      startDMATxChannel->next = VIRT_TO_BUS(dmaCb, rx);

    }
    rxTail = rx;
  }
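
  // Throttle: instead of polling the DMA channels while the previous task is still streaming out,
  // estimate how long its queued bytes take on the SPI bus (spiUsecsPerByte) and sleep through
  // most of that window, leaving ~70 usecs of slack before polling the channels to completion below.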

  static uint64_t taskStartTime = 0;
  static int pendingTaskBytes = 1;
  double pendingTaskUSecs = pendingTaskBytes * spiUsecsPerByte;
  pendingTaskUSecs -= tick() - taskStartTime;
  if (pendingTaskUSecs > 70)
    usleep(pendingTaskUSecs-70);

  uint64_t dmaTaskStart = tick();

  CheckSPIDMAChannelsNotStolen();
  while((dmaTx->cs & BCM2835_DMA_CS_ACTIVE) && programRunning)
  {
    usleep(250);
    CheckSPIDMAChannelsNotStolen();
    if (tick() - dmaTaskStart > 5000000)
    {
      DumpDMAState();
      FATAL_ERROR("DMA TX channel has stalled!");
    }
  }
  while((dmaRx->cs & BCM2835_DMA_CS_ACTIVE) && programRunning)
  {
    usleep(250);
    CheckSPIDMAChannelsNotStolen();
    if (tick() - dmaTaskStart > 5000000)
    {
      DumpDMAState();
      FATAL_ERROR("DMA RX channel has stalled!");
    }
  }
  if (!programRunning) return;

  pendingTaskBytes = task->PayloadSize();

  // First send the SPI command byte in Polled SPI mode
  spi->cs = BCM2835_SPI0_CS_TA | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;

#ifndef SPI_3WIRE_PROTOCOL
  CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
  spi->fifo = 0;
  spi->fifo = task->cmd;
  while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
  // spi->fifo; // Currently no need to flush these, the clear below clears the rx queue.
  // spi->fifo;
#else
  spi->fifo = task->cmd;
  while(!(spi->cs & (BCM2835_SPI0_CS_RXD|BCM2835_SPI0_CS_DONE))) /*nop*/;
  // spi->fifo; // Currently no need to flush this, the clear below clears the rx queue.
#endif

  SET_GPIO(GPIO_TFT_DATA_CONTROL);
#endif

  spi->cs = BCM2835_SPI0_CS_DMAEN | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;

  dmaTx->cbAddr = VIRT_TO_BUS(dmaCb, tx0);
  dmaRx->cbAddr = VIRT_TO_BUS(dmaCb, rx0);
  __sync_synchronize();
  dmaTx->cs = BCM2835_DMA_CS_ACTIVE | BCM2835_DMA_CS_END;
  dmaRx->cs = BCM2835_DMA_CS_ACTIVE | BCM2835_DMA_CS_END;
  taskStartTime = tick();
}

#else

void SPIDMATransfer(SPITask *task)
{
  // Transition the SPI peripheral to enable the use of DMA
  spi->cs = BCM2835_SPI0_CS_DMAEN | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
  uint32_t *headerAddr = task->DmaSpiHeaderAddress();
  *headerAddr = BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS | (task->PayloadSize() << 16); // The first four bytes written to the SPI data register control the DLEN and CS,CPOL,CPHA settings.

  // TODO: Ideally we would be able to perform the DMA directly from the SPI ring buffer at the 'task' pointer. However
  // that pointer is shared to userland, and it is proving troublesome to make it both userland-writable and cache-bypassing DMA coherent.
  // Therefore these two memory areas are kept separate for now, and we memcpy() from the SPI ring buffer to the intermediate 'dmaSourceBuffer' area to perform
  // the DMA transfer. Is there a way to avoid this intermediate buffer? That would improve performance a bit.
  memcpy(dmaSourceBuffer.virtualAddr, headerAddr, task->PayloadSize() + 4);

  volatile DMAControlBlock *cb = (volatile DMAControlBlock *)dmaCb.virtualAddr;
  volatile DMAControlBlock *txcb = &cb[0];
  txcb->ti = BCM2835_DMA_TI_PERMAP(BCM2835_DMA_TI_PERMAP_SPI_TX) | BCM2835_DMA_TI_DEST_DREQ | BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_WAIT_RESP;
  txcb->src = dmaSourceBuffer.busAddress;
  txcb->dst = DMA_SPI_FIFO_PHYS_ADDRESS; // Write out to the SPI peripheral
  txcb->len = task->PayloadSize() + 4;
  txcb->stride = 0;
  txcb->next = 0;
  txcb->debug = 0;
  txcb->reserved = 0;
  dmaTx->cbAddr = dmaCb.busAddress;

  volatile DMAControlBlock *rxcb = &cb[1];
  rxcb->ti = BCM2835_DMA_TI_PERMAP(BCM2835_DMA_TI_PERMAP_SPI_RX) | BCM2835_DMA_TI_SRC_DREQ | BCM2835_DMA_TI_DEST_IGNORE;
  rxcb->src = DMA_SPI_FIFO_PHYS_ADDRESS;
  rxcb->dst = 0;
  rxcb->len = task->PayloadSize();
  rxcb->stride = 0;
  rxcb->next = 0;
  rxcb->debug = 0;
  rxcb->reserved = 0;
  dmaRx->cbAddr = dmaCb.busAddress + sizeof(DMAControlBlock);

  __sync_synchronize();
  dmaTx->cs = BCM2835_DMA_CS_ACTIVE;
  dmaRx->cs = BCM2835_DMA_CS_ACTIVE;
  __sync_synchronize();

  double pendingTaskUSecs = task->PayloadSize() * spiUsecsPerByte;
  if (pendingTaskUSecs > 70)
    usleep(pendingTaskUSecs-70);

  uint64_t dmaTaskStart = tick();

  CheckSPIDMAChannelsNotStolen();
  while((dmaTx->cs & BCM2835_DMA_CS_ACTIVE))
  {
    CheckSPIDMAChannelsNotStolen();
    if (tick() - dmaTaskStart > 5000000)
      FATAL_ERROR("DMA TX channel has stalled!");
  }
  while((dmaRx->cs & BCM2835_DMA_CS_ACTIVE))
  {
    CheckSPIDMAChannelsNotStolen();
    if (tick() - dmaTaskStart > 5000000)
      FATAL_ERROR("DMA RX channel has stalled!");
  }

  __sync_synchronize();
  spi->cs = BCM2835_SPI0_CS_TA | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
  __sync_synchronize();
}

#endif

void DeinitDMA(void)
{
  WaitForDMAFinished();
  ResetDMAChannels();
  FreeUncachedGpuMemory(dmaSourceBuffer);
  FreeUncachedGpuMemory(dmaCb);
  FreeUncachedGpuMemory(dmaConstantData);
  if (dmaTxChannel != -1)
  {
    FreeDMAChannel(dmaTxChannel);
    dmaTxChannel = -1;
  }
  if (dmaRxChannel != -1)
  {
    FreeDMAChannel(dmaRxChannel);
    dmaRxChannel = -1;
  }
}


#endif // ~USE_DMA_TRANSFERS