// Mirror of https://github.com/goodtft/LCD-show.git (synced 2025-01-19 18:52:16 +01:00).
// File size: 670 lines, 26 KiB. Language: C++.
#ifndef KERNEL_MODULE
|
|
#include <stdio.h> // printf, stderr
|
|
#include <syslog.h> // syslog
|
|
#include <fcntl.h> // open, O_RDWR, O_SYNC
|
|
#include <sys/mman.h> // mmap, munmap
|
|
#include <pthread.h> // pthread_create
|
|
#include <bcm_host.h> // bcm_host_get_peripheral_address, bcm_host_get_peripheral_size, bcm_host_get_sdram_address
|
|
#endif
|
|
|
|
#include "config.h"
|
|
#include "spi.h"
|
|
#include "util.h"
|
|
#include "dma.h"
|
|
#include "mailbox.h"
|
|
#include "mem_alloc.h"
|
|
|
|
// Uncomment this to print out all bytes sent to the SPI bus
|
|
// #define DEBUG_SPI_BUS_WRITES
|
|
|
|
#ifdef DEBUG_SPI_BUS_WRITES
|
|
#define DEBUG_PRINT_WRITTEN_BYTE(byte) do { \
|
|
printf("%02X", byte); \
|
|
if ((writeCounter & 3) == 0) printf("\n"); \
|
|
} while(0)
|
|
#else
|
|
#define DEBUG_PRINT_WRITTEN_BYTE(byte) ((void)0)
|
|
#endif
|
|
|
|
#ifdef CHIP_SELECT_LINE_NEEDS_REFRESHING_EACH_32BITS_WRITTEN
|
|
void ChipSelectHigh();
|
|
#define TOGGLE_CHIP_SELECT_LINE() if ((++writeCounter & 3) == 0) { ChipSelectHigh(); }
|
|
#else
|
|
#define TOGGLE_CHIP_SELECT_LINE() ((void)0)
|
|
#endif
|
|
|
|
static uint32_t writeCounter = 0;
|
|
|
|
#define WRITE_FIFO(word) do { \
|
|
uint8_t w = (word); \
|
|
spi->fifo = w; \
|
|
TOGGLE_CHIP_SELECT_LINE(); \
|
|
DEBUG_PRINT_WRITTEN_BYTE(w); \
|
|
} while(0)
|
|
|
|
int mem_fd = -1;
|
|
volatile void *bcm2835 = 0;
|
|
volatile GPIORegisterFile *gpio = 0;
|
|
volatile SPIRegisterFile *spi = 0;
|
|
|
|
// Points to the system timer register. N.B. spec sheet says this is two low and high parts, in an 32-bit aligned (but not 64-bit aligned) address. Profiling shows
|
|
// that Pi 3 Model B does allow reading this as a u64 load, and even when unaligned, it is around 30% faster to do so compared to loading in parts "lo | (hi << 32)".
|
|
volatile uint64_t *systemTimerRegister = 0;
|
|
|
|
void DumpSPICS(uint32_t reg)
|
|
{
|
|
PRINT_FLAG(BCM2835_SPI0_CS_CS);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_CPHA);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_CPOL);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_CLEAR_TX);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_CLEAR_RX);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_TA);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_DMAEN);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_INTD);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_INTR);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_ADCS);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_DONE);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_RXD);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_TXD);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_RXR);
|
|
PRINT_FLAG(BCM2835_SPI0_CS_RXF);
|
|
printf("SPI0 DLEN: %u\n", spi->dlen);
|
|
printf("SPI0 CE0 register: %d\n", GET_GPIO(GPIO_SPI0_CE0) ? 1 : 0);
|
|
}
|
|
|
|
#ifdef RUN_WITH_REALTIME_THREAD_PRIORITY
|
|
|
|
#include <pthread.h>
|
|
#include <sched.h>
|
|
|
|
void SetRealtimeThreadPriority()
|
|
{
|
|
sched_param params;
|
|
params.sched_priority = sched_get_priority_max(SCHED_FIFO);
|
|
|
|
int failed = pthread_setschedparam(pthread_self(), SCHED_FIFO, ¶ms);
|
|
if (failed) FATAL_ERROR("pthread_setschedparam() failed!");
|
|
|
|
int policy = 0;
|
|
failed = pthread_getschedparam(pthread_self(), &policy, ¶ms);
|
|
if (failed) FATAL_ERROR("pthread_getschedparam() failed!");
|
|
|
|
if (policy != SCHED_FIFO) FATAL_ERROR("Failed to set realtime thread policy!");
|
|
printf("Set fbcp-ili9341 thread scheduling priority to maximum (%d)\n", sched_get_priority_max(SCHED_FIFO));
|
|
}
|
|
|
|
#endif
|
|
|
|
// Errata to BCM2835 behavior: the documentation states that the SPI0 DLEN register is only used for DMA. However,
// even when DMA is not being utilized, setting DLEN to a value other than 0 or 1 gets rid of an excess idle clock
// cycle that is otherwise present when transmitting each byte. (By default in Polled SPI Mode each 8 bits transfer
// in 9 clocks.) With DLEN=2 each byte is clocked to the bus in 8 cycles, which was observed to improve max
// throughput from 56.8mbps to 63.3mbps (+11.4%, quite close to the theoretical +12.5%).
// https://www.raspberrypi.org/forums/viewtopic.php?f=44&t=181154
#define UNLOCK_FAST_8_CLOCKS_SPI() (spi->dlen = 2)

#ifdef ALL_TASKS_SHOULD_DMA
// True if the most recently run task was sent with polled SPI writes, false if it was sent with DMA.
// RunSPITask() uses this to decide which kind of transfer it must wait for before starting the next one.
bool previousTaskWasSPI = true;
#endif
|
|
|
|
#ifdef SPI_3WIRE_PROTOCOL
|
|
|
|
// Returns the number of bytes needed to hold the 32-bit expanded form of a task whose 8-bit payload is
// byteSizeFor8BitTask bytes long: every 16 bits of payload expand to 32 bits (i.e. the payload size doubles),
// and one extra 32-bit word is needed for the command.
uint32_t NumBytesNeededFor32BitSPITask(uint32_t byteSizeFor8BitTask)
{
  const uint32_t commandWordBytes = 4;
  return byteSizeFor8BitTask * 2 + commandWordBytes;
}
|
|
|
|
// Returns the number of bytes needed to hold the 9-bit interleaved form of a task whose 8-bit payload is
// byteSizeFor8BitTask bytes long (the command byte is accounted for here, it is not part of the input size).
uint32_t NumBytesNeededFor9BitSPITask(uint32_t byteSizeFor8BitTask)
{
  // The command byte plus each payload byte is sent out as 9 bits: 1 data/command bit plus 8 payload bits.
  // The number of bits sent out must therefore be a multiple of 9 bits, but it must also be a multiple of 8 bits,
  // because the BCM2835 SPI peripheral only deals with sending out full bytes.
  // Therefore the bits written out must be a multiple of lcm(9, 8) = 72 bits, i.e. whole groups of
  // eight 9-bit units that occupy 9 bytes each.
  //
  // Number of groups = ceil((byteSizeFor8BitTask + 1) / 8) = byteSizeFor8BitTask / 8 + 1 (integer division).
  // Counting in groups instead of bits gives the same result as rounding the bit count up to a multiple of 72,
  // but without the intermediate "(size + 1) * 9" product that wraps around in 32 bits for very large sizes.
  const uint32_t numGroups = byteSizeFor8BitTask / 8 + 1;
  return numGroups * 9;
}
|
|
|
|
// N.B. BCM2835 hardware always clocks bytes out most significant bit (MSB) first, so when interleaving, the command bit needs to start out in the
|
|
// highest byte of the outgoing buffer.
|
|
void Interleave8BitSPITaskTo9Bit(SPITask *task)
|
|
{
|
|
const uint32_t size8BitTask = task->size - task->sizeExpandedTaskWithPadding;
|
|
|
|
// 9-bit SPI task lives right at the end of the 8-bit task
|
|
uint8_t *dst = task->data + size8BitTask;
|
|
|
|
// Pre-clear the 9*8=72 bit tail end of the memory to all zeroes to avoid having to pad source data to multiples of 9. (plus padding bytes, just to be safe)
|
|
memset(dst + task->sizeExpandedTaskWithPadding - 9 - SPI_9BIT_TASK_PADDING_BYTES, 0, 9 + SPI_9BIT_TASK_PADDING_BYTES);
|
|
|
|
// Fill first command byte xxxxxxxx -> 0xxxxxxx x: (low 0 bit to indicate a command byte)
|
|
dst[0] = task->cmd >> 1;
|
|
dst[1] = task->cmd << 7;
|
|
int dstByte = 1;
|
|
int dstBitsUsed = 1;
|
|
|
|
int src = 0;
|
|
|
|
// Command bit above produced one byte. If there are at least 7 bytes in the data set, we can complete a set of 8 transferred bytes. Fast track
|
|
// that:
|
|
if (size8BitTask >= 7)
|
|
{
|
|
dst[1] |= 0x40 | (task->data[0] >> 2);
|
|
dst[2] = 0x20 | (task->data[0] << 6) | (task->data[1] >> 3);
|
|
dst[3] = 0x10 | (task->data[1] << 5) | (task->data[2] >> 4);
|
|
dst[4] = 0x08 | (task->data[2] << 4) | (task->data[3] >> 5);
|
|
dst[5] = 0x04 | (task->data[3] << 3) | (task->data[4] >> 6);
|
|
dst[6] = 0x02 | (task->data[4] << 2) | (task->data[5] >> 7);
|
|
dst[7] = 0x01 | (task->data[5] << 1);
|
|
dst[8] = (task->data[6] );
|
|
dstByte = 9;
|
|
dstBitsUsed = 0;
|
|
src = 7;
|
|
|
|
// More fast tracking: As long as we have multiples of 8 bytes left, fast fill them in
|
|
while(src <= size8BitTask - 8)
|
|
{
|
|
uint8_t *d = dst + dstByte;
|
|
dstByte += 9;
|
|
const uint8_t *s = task->data + src;
|
|
src += 8;
|
|
|
|
d[0] = 0x80 | (s[0] >> 1);
|
|
d[1] = 0x40 | (s[0] << 7) | (s[1] >> 2);
|
|
d[2] = 0x20 | (s[1] << 6) | (s[2] >> 3);
|
|
d[3] = 0x10 | (s[2] << 5) | (s[3] >> 4);
|
|
d[4] = 0x08 | (s[3] << 4) | (s[4] >> 5);
|
|
d[5] = 0x04 | (s[4] << 3) | (s[5] >> 6);
|
|
d[6] = 0x02 | (s[5] << 2) | (s[6] >> 7);
|
|
d[7] = 0x01 | (s[6] << 1);
|
|
d[8] = (s[7] );
|
|
}
|
|
|
|
// Pre-clear the next byte to be written - the slow loop below assumes it is continuing a middle of byte sequence
|
|
// N.B. This write could happen to memory that is not part of the task, so memory allocation of the 9-bit task needs to allocate one byte of padding
|
|
dst[dstByte] = 0;
|
|
}
|
|
|
|
// Fill tail data bytes, slow path
|
|
while(src < size8BitTask)
|
|
{
|
|
uint8_t data = task->data[src++];
|
|
|
|
// High 1 bit to indicate a data byte
|
|
dst[dstByte] |= 1 << (7 - dstBitsUsed);
|
|
++dstBitsUsed;
|
|
if (dstBitsUsed == 8) // Written data bit completes a full byte?
|
|
{
|
|
++dstByte; // Advance to next byte
|
|
dstBitsUsed = 0;
|
|
// Now we are aligned, so can write the data byte directly
|
|
dst[dstByte++] = data;
|
|
dst[dstByte] = 0; // Clear old contents of the next byte to write
|
|
}
|
|
else
|
|
{
|
|
// 8 data bits
|
|
dst[dstByte++] |= data >> dstBitsUsed;
|
|
// This is the first write to the next byte, that should occur without ORring to clear old data in memory
|
|
// N.B. This write could happen to memory that is not part of the task, so memory allocation of the 9-bit task needs to allocate one byte of padding
|
|
dst[dstByte] = data << (8 - dstBitsUsed);
|
|
}
|
|
}
|
|
|
|
#if 0 // Enable to debug correctness:
|
|
|
|
#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c"
|
|
#define BYTE_TO_BINARY(byte) \
|
|
(byte & 0x80 ? '1' : '0'), \
|
|
(byte & 0x40 ? '1' : '0'), \
|
|
(byte & 0x20 ? '1' : '0'), \
|
|
(byte & 0x10 ? '1' : '0'), \
|
|
(byte & 0x08 ? '1' : '0'), \
|
|
(byte & 0x04 ? '1' : '0'), \
|
|
(byte & 0x02 ? '1' : '0'), \
|
|
(byte & 0x01 ? '1' : '0')
|
|
|
|
printf("Interleaving result: 8-bit task of size %d bytes became %d bytes:\n", task->size - task->sizeExpandedTaskWithPadding, task->sizeExpandedTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES);
|
|
printf("8-bit c" BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(task->cmd));
|
|
for(int i = 0; i < task->size - task->sizeExpandedTaskWithPadding; ++i)
|
|
printf("d" BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(task->data[i]));
|
|
printf("\n9-bit ");
|
|
for(int i = 0; i < task->sizeExpandedTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES; ++i)
|
|
printf(BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(dst[i]));
|
|
printf("\n\n");
|
|
#endif
|
|
|
|
}
|
|
|
|
// Builds the 32-bit expanded form of the given task. The source is the first
// (task->size - task->sizeExpandedTaskWithPadding) bytes of task->data, read as 16-bit words. The output is written
// directly after them: first one 32-bit word holding the command, then one 32-bit word per 16-bit source word.
void Interleave16BitSPITaskTo32Bit(SPITask *task)
{
  const uint32_t size8BitTask = task->size - task->sizeExpandedTaskWithPadding;

  // 32-bit SPI task lives right at the end of the 16-bit task
  // NOTE(review): this assumes task->data + size8BitTask is suitably aligned for 32-bit stores - confirm against the allocator.
  uint32_t *dst = (uint32_t *)(task->data + size8BitTask);
  *dst++ = task->cmd;

  const uint32_t taskSizeU16 = size8BitTask >> 1;
  uint16_t *src = (uint16_t*)task->data;
  for(uint32_t i = 0; i < taskSizeU16; ++i)
  {
    // Widen to 32 bits before shifting: a uint16_t operand is promoted to signed int, and shifting a value
    // >= 0x8000 left by 16 would shift into the sign bit.
    dst[i] = 0x1500 | ((uint32_t)src[i] << 16);
  }
}
|
|
|
|
#endif // ~SPI_3WIRE_PROTOCOL
|
|
|
|
void WaitForPolledSPITransferToFinish()
|
|
{
|
|
uint32_t cs;
|
|
while (!(((cs = spi->cs) ^ BCM2835_SPI0_CS_TA) & (BCM2835_SPI0_CS_DONE | BCM2835_SPI0_CS_TA))) // While TA=1 and DONE=0
|
|
if ((cs & (BCM2835_SPI0_CS_RXR | BCM2835_SPI0_CS_RXF)))
|
|
spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
|
|
|
|
if ((cs & BCM2835_SPI0_CS_RXD)) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
|
|
}
|
|
|
|
#ifdef ALL_TASKS_SHOULD_DMA
|
|
|
|
#ifndef USE_DMA_TRANSFERS
|
|
#error When building with #define ALL_TASKS_SHOULD_DMA enabled, -DUSE_DMA_TRANSFERS=ON should be set in CMake command line!
|
|
#endif
|
|
|
|
// Synchonously performs a single SPI command byte + N data bytes transfer on the calling thread. Call in between a BEGIN_SPI_COMMUNICATION() and END_SPI_COMMUNICATION() pair.
|
|
void RunSPITask(SPITask *task)
|
|
{
|
|
uint32_t cs;
|
|
uint8_t *tStart = task->PayloadStart();
|
|
uint8_t *tEnd = task->PayloadEnd();
|
|
const uint32_t payloadSize = tEnd - tStart;
|
|
uint8_t *tPrefillEnd = tStart + MIN(15, payloadSize);
|
|
|
|
#define TASK_SIZE_TO_USE_DMA 4
|
|
// Do a DMA transfer if this task is suitable in size for DMA to handle
|
|
if (payloadSize >= TASK_SIZE_TO_USE_DMA && (task->cmd == DISPLAY_WRITE_PIXELS || task->cmd == DISPLAY_SET_CURSOR_X || task->cmd == DISPLAY_SET_CURSOR_Y))
|
|
{
|
|
if (previousTaskWasSPI)
|
|
WaitForPolledSPITransferToFinish();
|
|
// printf("DMA cmd=0x%x, data=%d bytes\n", task->cmd, task->PayloadSize());
|
|
SPIDMATransfer(task);
|
|
previousTaskWasSPI = false;
|
|
}
|
|
else
|
|
{
|
|
if (!previousTaskWasSPI)
|
|
{
|
|
WaitForDMAFinished();
|
|
spi->cs = BCM2835_SPI0_CS_TA | BCM2835_SPI0_CS_CLEAR_TX | DISPLAY_SPI_DRIVE_SETTINGS;
|
|
// After having done a DMA transfer, the SPI0 DLEN register has reset to zero, so restore it to fast mode.
|
|
UNLOCK_FAST_8_CLOCKS_SPI();
|
|
}
|
|
else
|
|
WaitForPolledSPITransferToFinish();
|
|
|
|
// printf("SPI cmd=0x%x, data=%d bytes\n", task->cmd, task->PayloadSize());
|
|
|
|
// Send the command word if display is 4-wire (3-wire displays can omit this, commands are interleaved in the data payload stream above)
|
|
#ifndef SPI_3WIRE_PROTOCOL
|
|
CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);
|
|
|
|
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
|
|
// On e.g. the ILI9486, all commands are 16-bit, so need to be clocked in in two bytes. The MSB byte is always zero though in all the defined commands.
|
|
WRITE_FIFO(0x00);
|
|
#endif
|
|
WRITE_FIFO(task->cmd);
|
|
|
|
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
|
|
while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
|
|
spi->fifo;
|
|
spi->fifo;
|
|
#else
|
|
while(!(spi->cs & (BCM2835_SPI0_CS_RXD|BCM2835_SPI0_CS_DONE))) /*nop*/;
|
|
#endif
|
|
|
|
SET_GPIO(GPIO_TFT_DATA_CONTROL);
|
|
#endif
|
|
|
|
// Send the data payload:
|
|
while(tStart < tPrefillEnd) WRITE_FIFO(*tStart++);
|
|
while(tStart < tEnd)
|
|
{
|
|
cs = spi->cs;
|
|
if ((cs & BCM2835_SPI0_CS_TXD)) WRITE_FIFO(*tStart++);
|
|
// TODO: else asm volatile("yield");
|
|
if ((cs & (BCM2835_SPI0_CS_RXR|BCM2835_SPI0_CS_RXF))) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
|
|
}
|
|
|
|
previousTaskWasSPI = true;
|
|
}
|
|
}
|
|
#else
|
|
|
|
// Synchronously performs a single SPI command byte + N data bytes transfer on the calling thread.
// The command byte (4-wire displays only) is always written with polled SPI. The payload is handed to
// SPIDMATransfer() when DMA support is compiled in and the payload is longer than DMA_IS_FASTER_THAN_POLLED_SPI
// bytes; otherwise it is pushed out with polled FIFO writes.
void RunSPITask(SPITask *task)
{
  WaitForPolledSPITransferToFinish();

  // The Adafruit 1.65" 240x240 ST7789 based display is unique compared to others that it does want to see the Chip Select line go
  // low and high to start a new command. For that display we let hardware SPI toggle the CS line, and actually run TA<-0 and TA<-1
  // transitions to let the CS line live. For most other displays, we just set CS line always enabled for the display throughout fbcp-ili9341 lifetime,
  // which is a tiny bit faster.
#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
  BEGIN_SPI_COMMUNICATION();
#endif

  uint8_t *payloadPos = task->PayloadStart();
  uint8_t *payloadEnd = task->PayloadEnd();
  const uint32_t payloadSize = payloadEnd - payloadPos;
  // The first (up to) 15 payload bytes are written without checking the TXD flag.
  uint8_t *prefillEnd = payloadPos + MIN(15, payloadSize);

  // Send the command word if display is 4-wire (3-wire displays can omit this, commands are interleaved in the data payload stream above)
#ifndef SPI_3WIRE_PROTOCOL
  // An SPI transfer to the display always starts with one control (command) byte, followed by N data bytes.
  CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);

#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
  // On e.g. the ILI9486, all commands are 16-bit, so need to be clocked in in two bytes. The MSB byte is always zero though in all the defined commands.
  WRITE_FIFO(0x00);
#endif
  WRITE_FIFO(task->cmd);

#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
  while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
  // Two reads of the FIFO register, one for each of the two command bytes written above.
  spi->fifo;
  spi->fifo;
#else
  while(!(spi->cs & (BCM2835_SPI0_CS_RXD|BCM2835_SPI0_CS_DONE))) /*nop*/;
#endif

  SET_GPIO(GPIO_TFT_DATA_CONTROL);
#endif // ~!SPI_3WIRE_PROTOCOL

  // For small transfers, using DMA is not worth it, but pushing through with polled SPI gives better bandwidth.
  // For larger transfers though that are more than this amount of bytes, using DMA is faster.
  // This cutoff number was experimentally tested to find where Polled SPI and DMA are as fast.
#define DMA_IS_FASTER_THAN_POLLED_SPI 140
  // Do a DMA transfer if this task is suitable in size for DMA to handle
#ifdef USE_DMA_TRANSFERS
  if (payloadEnd - payloadPos > DMA_IS_FASTER_THAN_POLLED_SPI)
  {
    SPIDMATransfer(task);

    // After having done a DMA transfer, the SPI0 DLEN register has reset to zero, so restore it to fast mode.
    UNLOCK_FAST_8_CLOCKS_SPI();
  }
  else
#endif
  {
    // Polled payload write: first the unchecked prefill part...
    while(payloadPos < prefillEnd)
      WRITE_FIFO(*payloadPos++);

    // ...then the rest, one byte each time the TXD flag is set, clearing the RX FIFO whenever RXR or RXF is raised.
    while(payloadPos < payloadEnd)
    {
      const uint32_t status = spi->cs;
      if ((status & BCM2835_SPI0_CS_TXD))
        WRITE_FIFO(*payloadPos++);
      // TODO:      else asm volatile("yield");
      if ((status & (BCM2835_SPI0_CS_RXR|BCM2835_SPI0_CS_RXF)))
        spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
    }
  }

#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
  END_SPI_COMMUNICATION();
#endif
}
|
|
#endif
|
|
|
|
// The SPI task ring buffer together with its queue head/tail bookkeeping. Set up in InitSPI().
SharedMemory *spiTaskMemory = 0;

// Statistics that the SPI worker thread maintains when STATISTICS is defined:
// total time spent waiting for new tasks (accumulated from tick() differences),
volatile uint64_t spiThreadIdleUsecs = 0;
// the tick() value taken right before the thread last started waiting,
volatile uint64_t spiThreadSleepStartTime = 0;
// and a flag that is 1 while the thread is waiting and 0 otherwise.
volatile int spiThreadSleeping = 0;

// Estimate of how many microseconds it takes to transfer one byte over the SPI bus. Computed in InitSPI().
double spiUsecsPerByte;
|
|
|
|
// Returns the first task in the queue, or 0 if there is none. Called in worker thread.
// A task whose cmd field is zero marks the point where the queue wraps around: the queue head is then moved
// back to the start of the buffer and the task located there (if any) is returned instead.
SPITask *GetTask()
{
  const uint32_t head = spiTaskMemory->queueHead;
  const uint32_t tail = spiTaskMemory->queueTail;
  if (head == tail)
    return 0; // Queue is empty

  SPITask *candidate = (SPITask*)(spiTaskMemory->buffer + head);
  if (candidate->cmd != 0)
    return candidate;

  // Wrapped around: continue from the beginning of the buffer.
  spiTaskMemory->queueHead = 0;
  __sync_synchronize();

  // Decided with the tail value that was sampled above: tail == 0 means nothing is queued after the wrap point.
  return (tail == 0) ? 0 : (SPITask*)spiTaskMemory->buffer;
}
|
|
|
|
// Frees the first SPI task from the queue. Called in worker thread.
// Subtracts the task's payload size plus one from the queued byte counter, and moves the queue head to the
// first byte after the task (task header plus task->size bytes).
void DoneTask(SPITask *task)
{
  __atomic_fetch_sub(&spiTaskMemory->spiBytesQueued, task->PayloadSize()+1, __ATOMIC_RELAXED);

  // Offset of this task from the beginning of the ring buffer
  const uint32_t taskOffset = (uint32_t)((uint8_t*)task - spiTaskMemory->buffer);
  spiTaskMemory->queueHead = taskOffset + sizeof(SPITask) + task->size;

  __sync_synchronize();
}
|
|
|
|
extern volatile bool programRunning;
|
|
|
|
void ExecuteSPITasks()
|
|
{
|
|
#ifndef USE_DMA_TRANSFERS
|
|
BEGIN_SPI_COMMUNICATION();
|
|
#endif
|
|
{
|
|
while(programRunning && spiTaskMemory->queueTail != spiTaskMemory->queueHead)
|
|
{
|
|
SPITask *task = GetTask();
|
|
if (task)
|
|
{
|
|
RunSPITask(task);
|
|
DoneTask(task);
|
|
}
|
|
}
|
|
}
|
|
#ifndef USE_DMA_TRANSFERS
|
|
END_SPI_COMMUNICATION();
|
|
#endif
|
|
}
|
|
|
|
#if !defined(KERNEL_MODULE) && defined(USE_SPI_THREAD)
|
|
pthread_t spiThread;
|
|
|
|
// A worker thread that keeps the SPI bus filled at all times
|
|
void *spi_thread(void *unused)
|
|
{
|
|
#ifdef RUN_WITH_REALTIME_THREAD_PRIORITY
|
|
SetRealtimeThreadPriority();
|
|
#endif
|
|
while(programRunning)
|
|
{
|
|
if (spiTaskMemory->queueTail != spiTaskMemory->queueHead)
|
|
{
|
|
ExecuteSPITasks();
|
|
}
|
|
else
|
|
{
|
|
#ifdef STATISTICS
|
|
uint64_t t0 = tick();
|
|
spiThreadSleepStartTime = t0;
|
|
__atomic_store_n(&spiThreadSleeping, 1, __ATOMIC_RELAXED);
|
|
#endif
|
|
if (programRunning) syscall(SYS_futex, &spiTaskMemory->queueTail, FUTEX_WAIT, spiTaskMemory->queueHead, 0, 0, 0); // Start sleeping until we get new tasks
|
|
#ifdef STATISTICS
|
|
__atomic_store_n(&spiThreadSleeping, 0, __ATOMIC_RELAXED);
|
|
uint64_t t1 = tick();
|
|
__sync_fetch_and_add(&spiThreadIdleUsecs, t1-t0);
|
|
#endif
|
|
}
|
|
}
|
|
pthread_exit(0);
|
|
}
|
|
#endif
|
|
|
|
int InitSPI()
|
|
{
|
|
#ifdef KERNEL_MODULE
|
|
|
|
#define BCM2835_PERI_BASE 0x3F000000
|
|
#define BCM2835_GPIO_BASE 0x200000
|
|
#define BCM2835_SPI0_BASE 0x204000
|
|
printk("ioremapping %p\n", (void*)(BCM2835_PERI_BASE+BCM2835_GPIO_BASE));
|
|
void *bcm2835 = ioremap(BCM2835_PERI_BASE+BCM2835_GPIO_BASE, 32768);
|
|
printk("Got bcm address %p\n", bcm2835);
|
|
if (!bcm2835) FATAL_ERROR("Failed to map BCM2835 address!");
|
|
spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE - BCM2835_GPIO_BASE);
|
|
gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835);
|
|
|
|
#else // Userland version
|
|
// Memory map GPIO and SPI peripherals for direct access
|
|
mem_fd = open("/dev/mem", O_RDWR|O_SYNC);
|
|
if (mem_fd < 0) FATAL_ERROR("can't open /dev/mem (run as sudo)");
|
|
printf("bcm_host_get_peripheral_address: %p, bcm_host_get_peripheral_size: %u, bcm_host_get_sdram_address: %p\n", bcm_host_get_peripheral_address(), bcm_host_get_peripheral_size(), bcm_host_get_sdram_address());
|
|
bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address());
|
|
if (bcm2835 == MAP_FAILED) FATAL_ERROR("mapping /dev/mem failed");
|
|
spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE);
|
|
gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835 + BCM2835_GPIO_BASE);
|
|
systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
|
|
// TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
|
|
#endif
|
|
|
|
uint32_t currentBcmCoreSpeed = MailboxRet2(0x00030002/*Get Clock Rate*/, 0x4/*CORE*/);
|
|
uint32_t maxBcmCoreTurboSpeed = MailboxRet2(0x00030004/*Get Max Clock Rate*/, 0x4/*CORE*/);
|
|
|
|
// Estimate how many microseconds transferring a single byte over the SPI bus takes?
|
|
spiUsecsPerByte = 1000000.0 * 8.0/*bits/byte*/ * SPI_BUS_CLOCK_DIVISOR / maxBcmCoreTurboSpeed;
|
|
|
|
printf("BCM core speed: current: %uhz, max turbo: %uhz. SPI CDIV: %d, SPI max frequency: %.0fhz\n", currentBcmCoreSpeed, maxBcmCoreTurboSpeed, SPI_BUS_CLOCK_DIVISOR, (double)maxBcmCoreTurboSpeed / SPI_BUS_CLOCK_DIVISOR);
|
|
|
|
#if !defined(KERNEL_MODULE_CLIENT) || defined(KERNEL_MODULE_CLIENT_DRIVES)
|
|
// By default all GPIO pins are in input mode (0x00), initialize them for SPI and GPIO writes
|
|
#ifdef GPIO_TFT_DATA_CONTROL
|
|
SET_GPIO_MODE(GPIO_TFT_DATA_CONTROL, 0x01); // Data/Control pin to output (0x01)
|
|
#endif
|
|
SET_GPIO_MODE(GPIO_SPI0_MISO, 0x04);
|
|
SET_GPIO_MODE(GPIO_SPI0_MOSI, 0x04);
|
|
SET_GPIO_MODE(GPIO_SPI0_CLK, 0x04);
|
|
|
|
#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
|
|
// The Adafruit 1.65" 240x240 ST7789 based display is unique compared to others that it does want to see the Chip Select line go
|
|
// low and high to start a new command. For that display we let hardware SPI toggle the CS line, and actually run TA<-0 and TA<-1
|
|
// transitions to let the CS line live. For most other displays, we just set CS line always enabled for the display throughout
|
|
// fbcp-ili9341 lifetime, which is a tiny bit faster.
|
|
SET_GPIO_MODE(GPIO_SPI0_CE0, 0x04);
|
|
#ifdef DISPLAY_USES_CS1
|
|
SET_GPIO_MODE(GPIO_SPI0_CE1, 0x04);
|
|
#endif
|
|
#else
|
|
// Set the SPI 0 pin explicitly to output, and enable chip select on the line by setting it to low.
|
|
// fbcp-ili9341 assumes exclusive access to the SPI0 bus, and exclusive presence of only one device on the bus,
|
|
// which is (permanently) activated here.
|
|
SET_GPIO_MODE(GPIO_SPI0_CE0, 0x01);
|
|
CLEAR_GPIO(GPIO_SPI0_CE0);
|
|
#ifdef DISPLAY_USES_CS1
|
|
SET_GPIO_MODE(GPIO_SPI0_CE1, 0x01);
|
|
#endif
|
|
#endif
|
|
|
|
spi->cs = BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS; // Initialize the Control and Status register to defaults: CS=0 (Chip Select), CPHA=0 (Clock Phase), CPOL=0 (Clock Polarity), CSPOL=0 (Chip Select Polarity), TA=0 (Transfer not active), and reset TX and RX queues.
|
|
spi->clk = SPI_BUS_CLOCK_DIVISOR; // Clock Divider determines SPI bus speed, resulting speed=256MHz/clk
|
|
#endif
|
|
|
|
// Initialize SPI thread task buffer memory
|
|
#ifdef KERNEL_MODULE_CLIENT
|
|
int driverfd = open("/proc/bcm2835_spi_display_bus", O_RDWR|O_SYNC);
|
|
if (driverfd < 0) FATAL_ERROR("Could not open SPI ring buffer - kernel driver module not running?");
|
|
spiTaskMemory = (SharedMemory*)mmap(NULL, SHARED_MEMORY_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED/* | MAP_NORESERVE | MAP_POPULATE | MAP_LOCKED*/, driverfd, 0);
|
|
close(driverfd);
|
|
if (spiTaskMemory == MAP_FAILED) FATAL_ERROR("Could not mmap SPI ring buffer!");
|
|
printf("Got shared memory block %p, ring buffer head %p, ring buffer tail %p, shared memory block phys address: %p\n", (const char *)spiTaskMemory, spiTaskMemory->queueHead, spiTaskMemory->queueTail, spiTaskMemory->sharedMemoryBaseInPhysMemory);
|
|
|
|
#ifdef USE_DMA_TRANSFERS
|
|
printf("DMA TX channel: %d, DMA RX channel: %d\n", spiTaskMemory->dmaTxChannel, spiTaskMemory->dmaRxChannel);
|
|
#endif
|
|
|
|
#else
|
|
|
|
#ifdef KERNEL_MODULE
|
|
spiTaskMemory = (SharedMemory*)kmalloc(SHARED_MEMORY_SIZE, GFP_KERNEL | GFP_DMA);
|
|
// TODO: Ideally we would be able to directly perform the DMA from the SPI ring buffer in 'spiTaskMemory'. However
|
|
// that pointer is shared to userland, and it is proving troublesome to make it both userland-writable as well as cache-bypassing DMA coherent.
|
|
// Therefore these two memory areas are separate for now, and we memcpy() from SPI ring buffer to the following intermediate 'dmaSourceMemory'
|
|
// memory area to perform the DMA transfer. Is there a way to avoid this intermediate buffer? That would improve performance a bit.
|
|
dmaSourceMemory = (SharedMemory*)dma_alloc_writecombine(0, SHARED_MEMORY_SIZE, &spiTaskMemoryPhysical, GFP_KERNEL);
|
|
LOG("Allocated DMA memory: mem: %p, phys: %p", spiTaskMemory, (void*)spiTaskMemoryPhysical);
|
|
memset((void*)spiTaskMemory, 0, SHARED_MEMORY_SIZE);
|
|
#else
|
|
spiTaskMemory = (SharedMemory*)Malloc(SHARED_MEMORY_SIZE, "spi.cpp shared task memory");
|
|
#endif
|
|
|
|
spiTaskMemory->queueHead = spiTaskMemory->queueTail = spiTaskMemory->spiBytesQueued = 0;
|
|
#endif
|
|
|
|
#ifdef USE_DMA_TRANSFERS
|
|
InitDMA();
|
|
#endif
|
|
|
|
// Enable fast 8 clocks per byte transfer mode, instead of slower 9 clocks per byte.
|
|
UNLOCK_FAST_8_CLOCKS_SPI();
|
|
|
|
#if !defined(KERNEL_MODULE) && (!defined(KERNEL_MODULE_CLIENT) || defined(KERNEL_MODULE_CLIENT_DRIVES))
|
|
printf("Initializing display\n");
|
|
InitSPIDisplay();
|
|
|
|
#ifdef USE_SPI_THREAD
|
|
// Create a dedicated thread to feed the SPI bus. While this is fast, it consumes a lot of CPU. It would be best to replace
|
|
// this thread with a kernel module that processes the created SPI task queue using interrupts. (while juggling the GPIO D/C line as well)
|
|
printf("Creating SPI task thread\n");
|
|
int rc = pthread_create(&spiThread, NULL, spi_thread, NULL); // After creating the thread, it is assumed to have ownership of the SPI bus, so no SPI chat on the main thread after this.
|
|
if (rc != 0) FATAL_ERROR("Failed to create SPI thread!");
|
|
#else
|
|
// We will be running SPI tasks continuously from the main thread, so keep SPI Transfer Active throughout the lifetime of the driver.
|
|
BEGIN_SPI_COMMUNICATION();
|
|
#endif
|
|
|
|
#endif
|
|
|
|
LOG("InitSPI done");
|
|
return 0;
|
|
}
|
|
|
|
void DeinitSPI()
|
|
{
|
|
#ifdef USE_SPI_THREAD
|
|
pthread_join(spiThread, NULL);
|
|
spiThread = (pthread_t)0;
|
|
#endif
|
|
DeinitSPIDisplay();
|
|
#ifdef USE_DMA_TRANSFERS
|
|
DeinitDMA();
|
|
#endif
|
|
|
|
spi->cs = BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
|
|
|
|
#ifndef KERNEL_MODULE_CLIENT
|
|
#ifdef GPIO_TFT_DATA_CONTROL
|
|
SET_GPIO_MODE(GPIO_TFT_DATA_CONTROL, 0);
|
|
#endif
|
|
SET_GPIO_MODE(GPIO_SPI0_CE1, 0);
|
|
SET_GPIO_MODE(GPIO_SPI0_CE0, 0);
|
|
SET_GPIO_MODE(GPIO_SPI0_MISO, 0);
|
|
SET_GPIO_MODE(GPIO_SPI0_MOSI, 0);
|
|
SET_GPIO_MODE(GPIO_SPI0_CLK, 0);
|
|
#endif
|
|
|
|
if (bcm2835)
|
|
{
|
|
munmap((void*)bcm2835, bcm_host_get_peripheral_size());
|
|
bcm2835 = 0;
|
|
}
|
|
|
|
if (mem_fd >= 0)
|
|
{
|
|
close(mem_fd);
|
|
mem_fd = -1;
|
|
}
|
|
|
|
#ifndef KERNEL_MODULE_CLIENT
|
|
|
|
#ifdef KERNEL_MODULE
|
|
kfree(spiTaskMemory);
|
|
dma_free_writecombine(0, SHARED_MEMORY_SIZE, dmaSourceMemory, spiTaskMemoryPhysical);
|
|
spiTaskMemoryPhysical = 0;
|
|
#else
|
|
free(spiTaskMemory);
|
|
#endif
|
|
#endif
|
|
spiTaskMemory = 0;
|
|
}
|