2021-01-27 08:18:51 +00:00

670 lines
26 KiB
C++

#ifndef KERNEL_MODULE
#include <stdio.h> // printf, stderr
#include <syslog.h> // syslog
#include <fcntl.h> // open, O_RDWR, O_SYNC
#include <sys/mman.h> // mmap, munmap
#include <pthread.h> // pthread_create
#include <bcm_host.h> // bcm_host_get_peripheral_address, bcm_host_get_peripheral_size, bcm_host_get_sdram_address
#endif
#include "config.h"
#include "spi.h"
#include "util.h"
#include "dma.h"
#include "mailbox.h"
#include "mem_alloc.h"
// Uncomment this to print out all bytes sent to the SPI bus
// #define DEBUG_SPI_BUS_WRITES

// DEBUG_PRINT_WRITTEN_BYTE(byte): with DEBUG_SPI_BUS_WRITES defined, prints the byte as two hex digits and emits a
// newline whenever the low two bits of writeCounter are zero. Without it, expands to a no-op expression.
#ifdef DEBUG_SPI_BUS_WRITES
#define DEBUG_PRINT_WRITTEN_BYTE(byte) do { \
    printf("%02X", (byte)); \
    if ((writeCounter & 3) == 0) printf("\n"); \
  } while(0)
#else
#define DEBUG_PRINT_WRITTEN_BYTE(byte) ((void)0)
#endif

// TOGGLE_CHIP_SELECT_LINE(): with CHIP_SELECT_LINE_NEEDS_REFRESHING_EACH_32BITS_WRITTEN defined, increments writeCounter
// and calls ChipSelectHigh() on every fourth write. Wrapped in do { } while(0) so that the macro behaves as a single
// statement and cannot capture a following 'else'. Without the define, expands to a no-op expression.
#ifdef CHIP_SELECT_LINE_NEEDS_REFRESHING_EACH_32BITS_WRITTEN
void ChipSelectHigh();
#define TOGGLE_CHIP_SELECT_LINE() do { if ((++writeCounter & 3) == 0) { ChipSelectHigh(); } } while(0)
#else
#define TOGGLE_CHIP_SELECT_LINE() ((void)0)
#endif

// Number of bytes pushed through WRITE_FIFO() so far; only advanced by TOGGLE_CHIP_SELECT_LINE() when that macro is active.
static uint32_t writeCounter = 0;

// WRITE_FIFO(word): truncates 'word' to 8 bits (evaluating it exactly once), stores it to the SPI FIFO register,
// then runs the chip select refresh and debug print hooks above.
#define WRITE_FIFO(word) do { \
    uint8_t w = (word); \
    spi->fifo = w; \
    TOGGLE_CHIP_SELECT_LINE(); \
    DEBUG_PRINT_WRITTEN_BYTE(w); \
  } while(0)
// File descriptor of /dev/mem. Opened by InitSPI() in the userland build and closed again by DeinitSPI(); -1 while not open.
int mem_fd = -1;
// Base address of the memory-mapped BCM2835 peripheral region (result of mmap() in the userland path of InitSPI()).
volatile void *bcm2835 = 0;
// GPIO register file; InitSPI() points this inside the mapped peripheral region.
volatile GPIORegisterFile *gpio = 0;
// SPI0 register file; InitSPI() points this inside the mapped peripheral region.
volatile SPIRegisterFile *spi = 0;
// Points to the system timer register. N.B. the spec sheet says this consists of separate low and high parts, at a 32-bit aligned (but not 64-bit aligned) address. Profiling shows
// that Pi 3 Model B does allow reading this as a u64 load, and even when unaligned, it is around 30% faster to do so compared to loading in parts "lo | (hi << 32)".
volatile uint64_t *systemTimerRegister = 0;
// Debug helper: reports each SPI0 Control and Status (CS) flag via PRINT_FLAG(), then prints the current SPI0 DLEN
// register value and the level of the SPI0 CE0 GPIO pin.
// reg: a CS register value to decode. NOTE(review): 'reg' is not referenced directly in this body; PRINT_FLAG is defined
// elsewhere and presumably tests the given flag against 'reg' - confirm against its definition.
void DumpSPICS(uint32_t reg)
{
PRINT_FLAG(BCM2835_SPI0_CS_CS);
PRINT_FLAG(BCM2835_SPI0_CS_CPHA);
PRINT_FLAG(BCM2835_SPI0_CS_CPOL);
PRINT_FLAG(BCM2835_SPI0_CS_CLEAR_TX);
PRINT_FLAG(BCM2835_SPI0_CS_CLEAR_RX);
PRINT_FLAG(BCM2835_SPI0_CS_TA);
PRINT_FLAG(BCM2835_SPI0_CS_DMAEN);
PRINT_FLAG(BCM2835_SPI0_CS_INTD);
PRINT_FLAG(BCM2835_SPI0_CS_INTR);
PRINT_FLAG(BCM2835_SPI0_CS_ADCS);
PRINT_FLAG(BCM2835_SPI0_CS_DONE);
PRINT_FLAG(BCM2835_SPI0_CS_RXD);
PRINT_FLAG(BCM2835_SPI0_CS_TXD);
PRINT_FLAG(BCM2835_SPI0_CS_RXR);
PRINT_FLAG(BCM2835_SPI0_CS_RXF);
// DLEN is read live from the peripheral, not taken from 'reg'
printf("SPI0 DLEN: %u\n", spi->dlen);
// Normalize the GPIO read to 0/1 for printing
printf("SPI0 CE0 register: %d\n", GET_GPIO(GPIO_SPI0_CE0) ? 1 : 0);
}
#ifdef RUN_WITH_REALTIME_THREAD_PRIORITY
#include <pthread.h>
#include <sched.h>
// Moves the calling thread to the SCHED_FIFO scheduling policy at the maximum priority that policy offers, then reads
// the scheduling parameters back to verify that the policy really changed. Every failure is reported via FATAL_ERROR().
void SetRealtimeThreadPriority()
{
  sched_param schedParams;
  schedParams.sched_priority = sched_get_priority_max(SCHED_FIFO);

  int rc = pthread_setschedparam(pthread_self(), SCHED_FIFO, &schedParams);
  if (rc != 0)
  {
    FATAL_ERROR("pthread_setschedparam() failed!");
  }

  // Read the policy back: a successful return code alone is not trusted here.
  int activePolicy = 0;
  rc = pthread_getschedparam(pthread_self(), &activePolicy, &schedParams);
  if (rc != 0)
  {
    FATAL_ERROR("pthread_getschedparam() failed!");
  }
  if (activePolicy != SCHED_FIFO)
  {
    FATAL_ERROR("Failed to set realtime thread policy!");
  }

  printf("Set fbcp-ili9341 thread scheduling priority to maximum (%d)\n", sched_get_priority_max(SCHED_FIFO));
}
#endif
// Errata to BCM2835 behavior: documentation states that the SPI0 DLEN register is only used for DMA. However, even when DMA is not being utilized, setting it to
// a value other than 0 or 1 gets rid of an excess idle clock cycle that is present when transmitting each byte. (by default in Polled SPI Mode each 8 bits transfer in 9 clocks)
// With DLEN=2 each byte is clocked to the bus in 8 cycles, observed to improve max throughput from 56.8mbps to 63.3mbps (+11.4%, quite close to the theoretical +12.5%)
// https://www.raspberrypi.org/forums/viewtopic.php?f=44&t=181154
#define UNLOCK_FAST_8_CLOCKS_SPI() (spi->dlen = 2)
#ifdef ALL_TASKS_SHOULD_DMA
// Remembers whether the task most recently run by RunSPITask() was sent with polled SPI (true) or with DMA (false), so that
// the next task knows which kind of transfer it has to wait for before starting its own.
bool previousTaskWasSPI = true;
#endif
#ifdef SPI_3WIRE_PROTOCOL
// Returns how many bytes the 32-bit expanded form of a task occupies, given the byte size of its 8-bit form:
// the payload doubles in size (16-bit units become 32-bit units) and one 4-byte command word is added.
uint32_t NumBytesNeededFor32BitSPITask(uint32_t byteSizeFor8BitTask)
{
  const uint32_t commandWordBytes = 4;
  const uint32_t expandedPayloadBytes = byteSizeFor8BitTask + byteSizeFor8BitTask;
  return expandedPayloadBytes + commandWordBytes;
}
// Returns how many bytes the 9-bit interleaved form of a task occupies, given the number of data bytes in its 8-bit form.
// One extra 9-bit unit is counted on top of the data bytes (the command byte), and the total is rounded up as described below.
uint32_t NumBytesNeededFor9BitSPITask(uint32_t byteSizeFor8BitTask)
{
  // Each transmitted unit is 1 data/command bit plus 8 payload bits, so the stream length is a multiple of 9 bits.
  // The BCM2835 SPI peripheral only sends whole bytes, so the length must also be a multiple of 8 bits.
  // Both hold when the bit count is a multiple of lcm(9, 8) = 72 bits.
  enum { BITS_PER_UNIT = 9, BITS_PER_BYTE = 8, BITS_PER_GROUP = 72 };

  const uint32_t unitCount = byteSizeFor8BitTask + 1;
  const uint32_t rawBitCount = unitCount * BITS_PER_UNIT;
  const uint32_t groupCount = (rawBitCount + (BITS_PER_GROUP - 1)) / BITS_PER_GROUP;
  const uint32_t paddedBitCount = groupCount * BITS_PER_GROUP;
  return paddedBitCount / BITS_PER_BYTE;
}
// Repacks an 8-bit SPI task (one command byte followed by N data bytes) into a 9-bits-per-unit bit stream, where each
// unit is a data/command bit (0 = command, 1 = data) followed by the 8 payload bits. The 9-bit stream is written into
// task->data directly after the 8-bit payload; the 8-bit payload itself is left untouched.
// task: the task to expand. task->size must already include task->sizeExpandedTaskWithPadding bytes for the 9-bit form.
// N.B. BCM2835 hardware always clocks bytes out most significant bit (MSB) first, so when interleaving, the command bit needs to start out in the
// highest byte of the outgoing buffer.
void Interleave8BitSPITaskTo9Bit(SPITask *task)
{
  const uint32_t size8BitTask = task->size - task->sizeExpandedTaskWithPadding;

  // 9-bit SPI task lives right at the end of the 8-bit task
  uint8_t *dst = task->data + size8BitTask;

  // Pre-clear the 9*8=72 bit tail end of the memory to all zeroes to avoid having to pad source data to multiples of 9. (plus padding bytes, just to be safe)
  memset(dst + task->sizeExpandedTaskWithPadding - 9 - SPI_9BIT_TASK_PADDING_BYTES, 0, 9 + SPI_9BIT_TASK_PADDING_BYTES);

  // Fill first command byte xxxxxxxx -> 0xxxxxxx x: (low 0 bit to indicate a command byte)
  dst[0] = task->cmd >> 1;
  dst[1] = task->cmd << 7;

  int dstByte = 1;     // Index of the output byte currently being filled
  int dstBitsUsed = 1; // Number of bits already occupied in dst[dstByte]
  uint32_t src = 0;    // Index of the next 8-bit data byte to consume

  // Command bit above produced one byte. If there are at least 7 bytes in the data set, we can complete a set of 8 transferred bytes. Fast track
  // that:
  if (size8BitTask >= 7)
  {
    dst[1] |= 0x40 | (task->data[0] >> 2);
    dst[2] = 0x20 | (task->data[0] << 6) | (task->data[1] >> 3);
    dst[3] = 0x10 | (task->data[1] << 5) | (task->data[2] >> 4);
    dst[4] = 0x08 | (task->data[2] << 4) | (task->data[3] >> 5);
    dst[5] = 0x04 | (task->data[3] << 3) | (task->data[4] >> 6);
    dst[6] = 0x02 | (task->data[4] << 2) | (task->data[5] >> 7);
    dst[7] = 0x01 | (task->data[5] << 1);
    dst[8] = (task->data[6] );
    dstByte = 9;
    dstBitsUsed = 0;
    src = 7;

    // More fast tracking: As long as we have multiples of 8 bytes left, fast fill them in.
    // The bound is written as "src + 8 <= size" rather than "src <= size - 8": the latter underflows in unsigned
    // arithmetic when size8BitTask == 7, which would make the loop run far past the end of the task.
    while(src + 8 <= size8BitTask)
    {
      uint8_t *d = dst + dstByte;
      dstByte += 9;
      const uint8_t *s = task->data + src;
      src += 8;
      d[0] = 0x80 | (s[0] >> 1);
      d[1] = 0x40 | (s[0] << 7) | (s[1] >> 2);
      d[2] = 0x20 | (s[1] << 6) | (s[2] >> 3);
      d[3] = 0x10 | (s[2] << 5) | (s[3] >> 4);
      d[4] = 0x08 | (s[3] << 4) | (s[4] >> 5);
      d[5] = 0x04 | (s[4] << 3) | (s[5] >> 6);
      d[6] = 0x02 | (s[5] << 2) | (s[6] >> 7);
      d[7] = 0x01 | (s[6] << 1);
      d[8] = (s[7] );
    }

    // Pre-clear the next byte to be written - the slow loop below assumes it is continuing a middle of byte sequence
    // N.B. This write could happen to memory that is not part of the task, so memory allocation of the 9-bit task needs to allocate one byte of padding
    dst[dstByte] = 0;
  }

  // Fill tail data bytes, slow path
  while(src < size8BitTask)
  {
    uint8_t data = task->data[src++];

    // High 1 bit to indicate a data byte
    dst[dstByte] |= 1 << (7 - dstBitsUsed);
    ++dstBitsUsed;
    if (dstBitsUsed == 8) // Written data bit completes a full byte?
    {
      ++dstByte; // Advance to next byte
      dstBitsUsed = 0;
      // Now we are aligned, so can write the data byte directly
      dst[dstByte++] = data;
      dst[dstByte] = 0; // Clear old contents of the next byte to write
    }
    else
    {
      // 8 data bits
      dst[dstByte++] |= data >> dstBitsUsed;
      // This is the first write to the next byte, that should occur without ORring to clear old data in memory
      // N.B. This write could happen to memory that is not part of the task, so memory allocation of the 9-bit task needs to allocate one byte of padding
      dst[dstByte] = data << (8 - dstBitsUsed);
    }
  }

#if 0 // Enable to debug correctness:

#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c"
#define BYTE_TO_BINARY(byte)  \
  (byte & 0x80 ? '1' : '0'), \
  (byte & 0x40 ? '1' : '0'), \
  (byte & 0x20 ? '1' : '0'), \
  (byte & 0x10 ? '1' : '0'), \
  (byte & 0x08 ? '1' : '0'), \
  (byte & 0x04 ? '1' : '0'), \
  (byte & 0x02 ? '1' : '0'), \
  (byte & 0x01 ? '1' : '0')

  printf("Interleaving result: 8-bit task of size %d bytes became %d bytes:\n", task->size - task->sizeExpandedTaskWithPadding, task->sizeExpandedTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES);
  printf("8-bit c" BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(task->cmd));
  for(int i = 0; i < task->size - task->sizeExpandedTaskWithPadding; ++i)
    printf("d" BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(task->data[i]));
  printf("\n9-bit ");
  for(int i = 0; i < task->sizeExpandedTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES; ++i)
    printf(BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(dst[i]));
  printf("\n\n");
#endif
}
// Expands a task whose payload consists of 16-bit units into 32-bit words. The 32-bit form is written into task->data
// directly after the original payload: first one word holding task->cmd, then one word per 16-bit source unit.
// task: the task to expand. task->size must already include task->sizeExpandedTaskWithPadding bytes for the 32-bit form.
void Interleave16BitSPITaskTo32Bit(SPITask *task)
{
  const uint32_t size8BitTask = task->size - task->sizeExpandedTaskWithPadding;

  // 32-bit SPI task lives right at the end of the 16-bit task
  uint32_t *dst = (uint32_t *)(task->data + size8BitTask);
  *dst++ = task->cmd;

  const uint32_t taskSizeU16 = size8BitTask >> 1;
  const uint16_t *src = (const uint16_t*)task->data;
  for(uint32_t i = 0; i < taskSizeU16; ++i)
  {
    // Widen to uint32_t before shifting: a uint16_t promotes to signed int, and shifting a value >= 0x8000 left by 16
    // would overflow that signed type.
    // NOTE(review): 0x1500 in the low half-word is presumably the display's data prefix - confirm against the display datasheet.
    dst[i] = 0x1500u | ((uint32_t)src[i] << 16);
  }
}
#endif // ~SPI_3WIRE_PROTOCOL
// Busy-waits until the polled SPI transfer in flight has completed, i.e. until the CS register no longer reads
// TA=1 together with DONE=0. While waiting, the RX FIFO is cleared whenever it reports RXR or RXF, and once more
// after the wait if the last CS value read reports RXD.
void WaitForPolledSPITransferToFinish()
{
  uint32_t status;
  for(;;)
  {
    status = spi->cs;
    // Flipping the TA bit makes "TA set and DONE clear" show up as all-zero under the DONE|TA mask
    if (((status ^ BCM2835_SPI0_CS_TA) & (BCM2835_SPI0_CS_DONE | BCM2835_SPI0_CS_TA)) != 0)
    {
      break;
    }
    if ((status & (BCM2835_SPI0_CS_RXR | BCM2835_SPI0_CS_RXF)) != 0)
    {
      spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
    }
  }

  if ((status & BCM2835_SPI0_CS_RXD) != 0)
  {
    spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
  }
}
#ifdef ALL_TASKS_SHOULD_DMA
#ifndef USE_DMA_TRANSFERS
#error When building with #define ALL_TASKS_SHOULD_DMA enabled, -DUSE_DMA_TRANSFERS=ON should be set in CMake command line!
#endif
// Synchronously performs a single SPI command byte + N data bytes transfer on the calling thread. Call in between a BEGIN_SPI_COMMUNICATION() and END_SPI_COMMUNICATION() pair.
// This is the ALL_TASKS_SHOULD_DMA variant: tasks with a payload of at least TASK_SIZE_TO_USE_DMA bytes whose command is
// DISPLAY_WRITE_PIXELS, DISPLAY_SET_CURSOR_X or DISPLAY_SET_CURSOR_Y are handed to SPIDMATransfer(); all other tasks are
// written to the SPI FIFO with polled SPI. previousTaskWasSPI tracks which of the two paths ran last, so that the
// matching wait (polled transfer vs. DMA) is performed before the next transfer starts.
void RunSPITask(SPITask *task)
{
uint32_t cs;
uint8_t *tStart = task->PayloadStart();
uint8_t *tEnd = task->PayloadEnd();
const uint32_t payloadSize = tEnd - tStart;
// Up to the first 15 payload bytes are pushed to the FIFO without polling TXD (see the prefill loop below).
// NOTE(review): 15 is presumably chosen to fit the hardware TX FIFO depth - confirm against the BCM2835 datasheet.
uint8_t *tPrefillEnd = tStart + MIN(15, payloadSize);
#define TASK_SIZE_TO_USE_DMA 4
// Do a DMA transfer if this task is suitable in size for DMA to handle
if (payloadSize >= TASK_SIZE_TO_USE_DMA && (task->cmd == DISPLAY_WRITE_PIXELS || task->cmd == DISPLAY_SET_CURSOR_X || task->cmd == DISPLAY_SET_CURSOR_Y))
{
// A polled transfer may still be in flight from the previous task; let it drain before starting DMA
if (previousTaskWasSPI)
WaitForPolledSPITransferToFinish();
// printf("DMA cmd=0x%x, data=%d bytes\n", task->cmd, task->PayloadSize());
SPIDMATransfer(task);
previousTaskWasSPI = false;
}
else
{
if (!previousTaskWasSPI)
{
// Previous task went out via DMA: wait for it, then put the peripheral back into polled transfer mode
WaitForDMAFinished();
spi->cs = BCM2835_SPI0_CS_TA | BCM2835_SPI0_CS_CLEAR_TX | DISPLAY_SPI_DRIVE_SETTINGS;
// After having done a DMA transfer, the SPI0 DLEN register has reset to zero, so restore it to fast mode.
UNLOCK_FAST_8_CLOCKS_SPI();
}
else
WaitForPolledSPITransferToFinish();
// printf("SPI cmd=0x%x, data=%d bytes\n", task->cmd, task->PayloadSize());
// Send the command word if display is 4-wire (3-wire displays can omit this, commands are interleaved in the data payload stream above)
#ifndef SPI_3WIRE_PROTOCOL
// Data/Control line is driven low while the command byte is clocked out, and raised again before the payload
CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
// On e.g. the ILI9486, all commands are 16-bit, so need to be clocked in in two bytes. The MSB byte is always zero though in all the defined commands.
WRITE_FIFO(0x00);
#endif
WRITE_FIFO(task->cmd);
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
// Wait for DONE, then read the FIFO register twice (one read per byte written above); the values are discarded
while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
spi->fifo;
spi->fifo;
#else
// Spin until the command byte has gone out (RXD or DONE reported)
while(!(spi->cs & (BCM2835_SPI0_CS_RXD|BCM2835_SPI0_CS_DONE))) /*nop*/;
#endif
SET_GPIO(GPIO_TFT_DATA_CONTROL);
#endif
// Send the data payload:
// First the unconditional prefill, then one byte at a time whenever TXD reports room in the TX FIFO
while(tStart < tPrefillEnd) WRITE_FIFO(*tStart++);
while(tStart < tEnd)
{
cs = spi->cs;
if ((cs & BCM2835_SPI0_CS_TXD)) WRITE_FIFO(*tStart++);
// TODO: else asm volatile("yield");
// Received bytes are not used; clear the RX FIFO when it reports RXR or RXF
if ((cs & (BCM2835_SPI0_CS_RXR|BCM2835_SPI0_CS_RXF))) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
}
previousTaskWasSPI = true;
}
}
#else
// Synchronously performs a single SPI command byte + N data bytes transfer on the calling thread.
// This is the default variant (ALL_TASKS_SHOULD_DMA not defined): the command byte is always sent with polled SPI.
// The payload is sent with SPIDMATransfer() when USE_DMA_TRANSFERS is defined and the payload is larger than
// DMA_IS_FASTER_THAN_POLLED_SPI bytes, and with polled SPI otherwise.
void RunSPITask(SPITask *task)
{
// Make sure the previous polled transfer has fully drained before touching the bus
WaitForPolledSPITransferToFinish();
// The Adafruit 1.65" 240x240 ST7789 based display is unique compared to others that it does want to see the Chip Select line go
// low and high to start a new command. For that display we let hardware SPI toggle the CS line, and actually run TA<-0 and TA<-1
// transitions to let the CS line live. For most other displays, we just set CS line always enabled for the display throughout fbcp-ili9341 lifetime,
// which is a tiny bit faster.
#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
BEGIN_SPI_COMMUNICATION();
#endif
uint8_t *tStart = task->PayloadStart();
uint8_t *tEnd = task->PayloadEnd();
const uint32_t payloadSize = tEnd - tStart;
// Up to the first 15 payload bytes are pushed to the FIFO without polling TXD (see the prefill loop below).
// NOTE(review): 15 is presumably chosen to fit the hardware TX FIFO depth - confirm against the BCM2835 datasheet.
uint8_t *tPrefillEnd = tStart + MIN(15, payloadSize);
// Send the command word if display is 4-wire (3-wire displays can omit this, commands are interleaved in the data payload stream above)
#ifndef SPI_3WIRE_PROTOCOL
// An SPI transfer to the display always starts with one control (command) byte, followed by N data bytes.
CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
// On e.g. the ILI9486, all commands are 16-bit, so need to be clocked in in two bytes. The MSB byte is always zero though in all the defined commands.
WRITE_FIFO(0x00);
#endif
WRITE_FIFO(task->cmd);
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
// Wait for DONE, then read the FIFO register twice (one read per byte written above); the values are discarded
while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
spi->fifo;
spi->fifo;
#else
// Spin until the command byte has gone out (RXD or DONE reported)
while(!(spi->cs & (BCM2835_SPI0_CS_RXD|BCM2835_SPI0_CS_DONE))) /*nop*/;
#endif
SET_GPIO(GPIO_TFT_DATA_CONTROL);
#endif // ~!SPI_3WIRE_PROTOCOL
// For small transfers, using DMA is not worth it, but pushing through with polled SPI gives better bandwidth.
// For larger transfers though that are more than this amount of bytes, using DMA is faster.
// This cutoff number was experimentally tested to find where Polled SPI and DMA are as fast.
#define DMA_IS_FASTER_THAN_POLLED_SPI 140
// Do a DMA transfer if this task is suitable in size for DMA to handle
#ifdef USE_DMA_TRANSFERS
if (tEnd - tStart > DMA_IS_FASTER_THAN_POLLED_SPI)
{
SPIDMATransfer(task);
// After having done a DMA transfer, the SPI0 DLEN register has reset to zero, so restore it to fast mode.
UNLOCK_FAST_8_CLOCKS_SPI();
}
else
#endif
{
// Polled path: unconditional prefill first, then one byte at a time whenever TXD reports room in the TX FIFO
while(tStart < tPrefillEnd) WRITE_FIFO(*tStart++);
while(tStart < tEnd)
{
uint32_t cs = spi->cs;
if ((cs & BCM2835_SPI0_CS_TXD)) WRITE_FIFO(*tStart++);
// TODO: else asm volatile("yield");
// Received bytes are not used; clear the RX FIFO when it reports RXR or RXF
if ((cs & (BCM2835_SPI0_CS_RXR|BCM2835_SPI0_CS_RXF))) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
}
}
#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
END_SPI_COMMUNICATION();
#endif
}
#endif
// Shared block that holds the SPI task ring buffer together with its queueHead/queueTail/spiBytesQueued bookkeeping.
// Allocated (or mmap()ed from the kernel driver) in InitSPI() and released in DeinitSPI().
SharedMemory *spiTaskMemory = 0;
// Total time the SPI worker thread has spent waiting for new tasks; accumulated in spi_thread() when STATISTICS is defined.
volatile uint64_t spiThreadIdleUsecs = 0;
// tick() timestamp taken when the SPI worker thread last started waiting (STATISTICS builds only).
volatile uint64_t spiThreadSleepStartTime = 0;
// Set to 1 while the SPI worker thread is waiting for new tasks, 0 otherwise (STATISTICS builds only).
volatile int spiThreadSleeping = 0;
// Estimated number of microseconds it takes to transfer one byte over the SPI bus; computed in InitSPI().
double spiUsecsPerByte;
// Returns the first task in the queue, or 0 if there is none. Called in worker thread.
// A task slot whose cmd field is 0 marks the point where the ring buffer wraps: in that case queueHead is reset to
// the start of the buffer, and the task at the start is returned (unless the tail is also at the start, i.e. empty).
SPITask *GetTask()
{
  const uint32_t head = spiTaskMemory->queueHead;
  const uint32_t tail = spiTaskMemory->queueTail;
  if (head == tail)
  {
    return 0; // Queue is empty
  }

  SPITask *front = (SPITask*)(spiTaskMemory->buffer + head);
  if (front->cmd != 0)
  {
    return front;
  }

  // Wrapped around: continue reading from the beginning of the buffer
  spiTaskMemory->queueHead = 0;
  __sync_synchronize();
  if (tail == 0)
  {
    return 0;
  }
  return (SPITask*)spiTaskMemory->buffer;
}
// Frees the first SPI task from the queue. Called in worker thread.
// Subtracts the task's payload size plus one from the queued byte counter, then advances queueHead past the task
// (its header plus task->size bytes) and issues a full memory barrier.
void DoneTask(SPITask *task)
{
  __atomic_fetch_sub(&spiTaskMemory->spiBytesQueued, task->PayloadSize()+1, __ATOMIC_RELAXED);

  // Byte offset of this task from the start of the ring buffer
  const uint32_t taskOffset = (uint32_t)((uint8_t*)task - spiTaskMemory->buffer);
  spiTaskMemory->queueHead = taskOffset + sizeof(SPITask) + task->size;
  __sync_synchronize();
}
extern volatile bool programRunning;
void ExecuteSPITasks()
{
#ifndef USE_DMA_TRANSFERS
BEGIN_SPI_COMMUNICATION();
#endif
{
while(programRunning && spiTaskMemory->queueTail != spiTaskMemory->queueHead)
{
SPITask *task = GetTask();
if (task)
{
RunSPITask(task);
DoneTask(task);
}
}
}
#ifndef USE_DMA_TRANSFERS
END_SPI_COMMUNICATION();
#endif
}
#if !defined(KERNEL_MODULE) && defined(USE_SPI_THREAD)
pthread_t spiThread;
// A worker thread that keeps the SPI bus filled at all times.
// Loops for as long as programRunning is set: executes queued tasks whenever the queue is non-empty, and otherwise
// blocks in a futex wait on queueTail until it changes. With STATISTICS defined, the time spent waiting is added
// to spiThreadIdleUsecs. The 'unused' argument is ignored; the thread terminates through pthread_exit(0).
void *spi_thread(void *unused)
{
#ifdef RUN_WITH_REALTIME_THREAD_PRIORITY
  SetRealtimeThreadPriority();
#endif

  while(programRunning)
  {
    if (spiTaskMemory->queueTail != spiTaskMemory->queueHead)
    {
      ExecuteSPITasks();
      continue;
    }

    // Nothing queued: go idle
#ifdef STATISTICS
    uint64_t sleepBegin = tick();
    spiThreadSleepStartTime = sleepBegin;
    __atomic_store_n(&spiThreadSleeping, 1, __ATOMIC_RELAXED);
#endif
    if (programRunning) syscall(SYS_futex, &spiTaskMemory->queueTail, FUTEX_WAIT, spiTaskMemory->queueHead, 0, 0, 0); // Start sleeping until we get new tasks
#ifdef STATISTICS
    __atomic_store_n(&spiThreadSleeping, 0, __ATOMIC_RELAXED);
    uint64_t sleepEnd = tick();
    __sync_fetch_and_add(&spiThreadIdleUsecs, sleepEnd-sleepBegin);
#endif
  }

  pthread_exit(0);
}
#endif
// Brings up the SPI display bus. In order: maps the BCM2835 GPIO/SPI registers (ioremap in the kernel module build,
// mmap of /dev/mem in userland), queries the core clock through the mailbox to estimate SPI byte timing, configures
// the GPIO pin modes and the SPI0 CS/CLK registers, sets up the SPI task ring buffer, initializes DMA if enabled,
// enables the fast 8-clocks-per-byte mode, and (userland only) initializes the display and either starts the SPI
// worker thread or begins SPI communication on the calling thread.
// Returns 0. Failures are reported through FATAL_ERROR().
int InitSPI()
{
#ifdef KERNEL_MODULE
#define BCM2835_PERI_BASE 0x3F000000
#define BCM2835_GPIO_BASE 0x200000
#define BCM2835_SPI0_BASE 0x204000
printk("ioremapping %p\n", (void*)(BCM2835_PERI_BASE+BCM2835_GPIO_BASE));
// NOTE(review): this local 'bcm2835' shadows the file-scope 'bcm2835', so the global stays 0 in the kernel module build.
void *bcm2835 = ioremap(BCM2835_PERI_BASE+BCM2835_GPIO_BASE, 32768);
printk("Got bcm address %p\n", bcm2835);
if (!bcm2835) FATAL_ERROR("Failed to map BCM2835 address!");
// The mapping starts at the GPIO base, so SPI0 is addressed relative to it
spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE - BCM2835_GPIO_BASE);
gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835);
#else // Userland version
// Memory map GPIO and SPI peripherals for direct access
mem_fd = open("/dev/mem", O_RDWR|O_SYNC);
if (mem_fd < 0) FATAL_ERROR("can't open /dev/mem (run as sudo)");
// NOTE(review): %p is used for the return values of bcm_host_get_peripheral_address() and bcm_host_get_sdram_address() - confirm these are pointer-typed.
printf("bcm_host_get_peripheral_address: %p, bcm_host_get_peripheral_size: %u, bcm_host_get_sdram_address: %p\n", bcm_host_get_peripheral_address(), bcm_host_get_peripheral_size(), bcm_host_get_sdram_address());
bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address());
if (bcm2835 == MAP_FAILED) FATAL_ERROR("mapping /dev/mem failed");
spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE);
gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835 + BCM2835_GPIO_BASE);
systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
// TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
#endif
uint32_t currentBcmCoreSpeed = MailboxRet2(0x00030002/*Get Clock Rate*/, 0x4/*CORE*/);
uint32_t maxBcmCoreTurboSpeed = MailboxRet2(0x00030004/*Get Max Clock Rate*/, 0x4/*CORE*/);
// Estimate how many microseconds transferring a single byte over the SPI bus takes?
spiUsecsPerByte = 1000000.0 * 8.0/*bits/byte*/ * SPI_BUS_CLOCK_DIVISOR / maxBcmCoreTurboSpeed;
printf("BCM core speed: current: %uhz, max turbo: %uhz. SPI CDIV: %d, SPI max frequency: %.0fhz\n", currentBcmCoreSpeed, maxBcmCoreTurboSpeed, SPI_BUS_CLOCK_DIVISOR, (double)maxBcmCoreTurboSpeed / SPI_BUS_CLOCK_DIVISOR);
#if !defined(KERNEL_MODULE_CLIENT) || defined(KERNEL_MODULE_CLIENT_DRIVES)
// By default all GPIO pins are in input mode (0x00), initialize them for SPI and GPIO writes
#ifdef GPIO_TFT_DATA_CONTROL
SET_GPIO_MODE(GPIO_TFT_DATA_CONTROL, 0x01); // Data/Control pin to output (0x01)
#endif
// NOTE(review): mode 0x04 is presumably the pins' SPI0 alternate function - confirm against the BCM2835 GPIO function select table.
SET_GPIO_MODE(GPIO_SPI0_MISO, 0x04);
SET_GPIO_MODE(GPIO_SPI0_MOSI, 0x04);
SET_GPIO_MODE(GPIO_SPI0_CLK, 0x04);
#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
// The Adafruit 1.65" 240x240 ST7789 based display is unique compared to others that it does want to see the Chip Select line go
// low and high to start a new command. For that display we let hardware SPI toggle the CS line, and actually run TA<-0 and TA<-1
// transitions to let the CS line live. For most other displays, we just set CS line always enabled for the display throughout
// fbcp-ili9341 lifetime, which is a tiny bit faster.
SET_GPIO_MODE(GPIO_SPI0_CE0, 0x04);
#ifdef DISPLAY_USES_CS1
SET_GPIO_MODE(GPIO_SPI0_CE1, 0x04);
#endif
#else
// Set the SPI 0 pin explicitly to output, and enable chip select on the line by setting it to low.
// fbcp-ili9341 assumes exclusive access to the SPI0 bus, and exclusive presence of only one device on the bus,
// which is (permanently) activated here.
SET_GPIO_MODE(GPIO_SPI0_CE0, 0x01);
CLEAR_GPIO(GPIO_SPI0_CE0);
#ifdef DISPLAY_USES_CS1
SET_GPIO_MODE(GPIO_SPI0_CE1, 0x01);
#endif
#endif
spi->cs = BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS; // Initialize the Control and Status register to defaults: CS=0 (Chip Select), CPHA=0 (Clock Phase), CPOL=0 (Clock Polarity), CSPOL=0 (Chip Select Polarity), TA=0 (Transfer not active), and reset TX and RX queues.
spi->clk = SPI_BUS_CLOCK_DIVISOR; // Clock Divider determines SPI bus speed, resulting speed=256MHz/clk
#endif
// Initialize SPI thread task buffer memory
#ifdef KERNEL_MODULE_CLIENT
// Client of the kernel driver: the ring buffer is owned by the driver and mapped in through its /proc entry
int driverfd = open("/proc/bcm2835_spi_display_bus", O_RDWR|O_SYNC);
if (driverfd < 0) FATAL_ERROR("Could not open SPI ring buffer - kernel driver module not running?");
spiTaskMemory = (SharedMemory*)mmap(NULL, SHARED_MEMORY_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED/* | MAP_NORESERVE | MAP_POPULATE | MAP_LOCKED*/, driverfd, 0);
close(driverfd);
if (spiTaskMemory == MAP_FAILED) FATAL_ERROR("Could not mmap SPI ring buffer!");
// NOTE(review): queueHead/queueTail are printed with %p here, but are read into uint32_t variables elsewhere in this file - confirm the format specifiers.
printf("Got shared memory block %p, ring buffer head %p, ring buffer tail %p, shared memory block phys address: %p\n", (const char *)spiTaskMemory, spiTaskMemory->queueHead, spiTaskMemory->queueTail, spiTaskMemory->sharedMemoryBaseInPhysMemory);
#ifdef USE_DMA_TRANSFERS
printf("DMA TX channel: %d, DMA RX channel: %d\n", spiTaskMemory->dmaTxChannel, spiTaskMemory->dmaRxChannel);
#endif
#else
#ifdef KERNEL_MODULE
// NOTE(review): the kmalloc() result is not checked before the memset() below.
spiTaskMemory = (SharedMemory*)kmalloc(SHARED_MEMORY_SIZE, GFP_KERNEL | GFP_DMA);
// TODO: Ideally we would be able to directly perform the DMA from the SPI ring buffer in 'spiTaskMemory'. However
// that pointer is shared to userland, and it is proving troublesome to make it both userland-writable as well as cache-bypassing DMA coherent.
// Therefore these two memory areas are separate for now, and we memcpy() from SPI ring buffer to the following intermediate 'dmaSourceMemory'
// memory area to perform the DMA transfer. Is there a way to avoid this intermediate buffer? That would improve performance a bit.
dmaSourceMemory = (SharedMemory*)dma_alloc_writecombine(0, SHARED_MEMORY_SIZE, &spiTaskMemoryPhysical, GFP_KERNEL);
LOG("Allocated DMA memory: mem: %p, phys: %p", spiTaskMemory, (void*)spiTaskMemoryPhysical);
memset((void*)spiTaskMemory, 0, SHARED_MEMORY_SIZE);
#else
spiTaskMemory = (SharedMemory*)Malloc(SHARED_MEMORY_SIZE, "spi.cpp shared task memory");
#endif
// Start with an empty queue
spiTaskMemory->queueHead = spiTaskMemory->queueTail = spiTaskMemory->spiBytesQueued = 0;
#endif
#ifdef USE_DMA_TRANSFERS
InitDMA();
#endif
// Enable fast 8 clocks per byte transfer mode, instead of slower 9 clocks per byte.
UNLOCK_FAST_8_CLOCKS_SPI();
#if !defined(KERNEL_MODULE) && (!defined(KERNEL_MODULE_CLIENT) || defined(KERNEL_MODULE_CLIENT_DRIVES))
printf("Initializing display\n");
InitSPIDisplay();
#ifdef USE_SPI_THREAD
// Create a dedicated thread to feed the SPI bus. While this is fast, it consumes a lot of CPU. It would be best to replace
// this thread with a kernel module that processes the created SPI task queue using interrupts. (while juggling the GPIO D/C line as well)
printf("Creating SPI task thread\n");
int rc = pthread_create(&spiThread, NULL, spi_thread, NULL); // After creating the thread, it is assumed to have ownership of the SPI bus, so no SPI chat on the main thread after this.
if (rc != 0) FATAL_ERROR("Failed to create SPI thread!");
#else
// We will be running SPI tasks continuously from the main thread, so keep SPI Transfer Active throughout the lifetime of the driver.
BEGIN_SPI_COMMUNICATION();
#endif
#endif
LOG("InitSPI done");
return 0;
}
// Tears down what InitSPI() set up: joins the SPI worker thread, deinitializes the display and DMA, resets the SPI0
// CS register, returns the GPIO pins to mode 0, unmaps the peripheral registers, closes /dev/mem and releases the
// SPI task ring buffer.
void DeinitSPI()
{
#ifdef USE_SPI_THREAD
// NOTE(review): the thread is only created in InitSPI() when KERNEL_MODULE is not defined and either KERNEL_MODULE_CLIENT
// is not defined or KERNEL_MODULE_CLIENT_DRIVES is; this join is guarded by USE_SPI_THREAD alone - confirm the guards agree.
pthread_join(spiThread, NULL);
spiThread = (pthread_t)0;
#endif
DeinitSPIDisplay();
#ifdef USE_DMA_TRANSFERS
DeinitDMA();
#endif
// Same CS value that InitSPI() writes during setup
spi->cs = BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
#ifndef KERNEL_MODULE_CLIENT
// Return all pins that InitSPI() may have configured to mode 0 (InitSPI() notes that 0x00 is input mode)
#ifdef GPIO_TFT_DATA_CONTROL
SET_GPIO_MODE(GPIO_TFT_DATA_CONTROL, 0);
#endif
SET_GPIO_MODE(GPIO_SPI0_CE1, 0);
SET_GPIO_MODE(GPIO_SPI0_CE0, 0);
SET_GPIO_MODE(GPIO_SPI0_MISO, 0);
SET_GPIO_MODE(GPIO_SPI0_MOSI, 0);
SET_GPIO_MODE(GPIO_SPI0_CLK, 0);
#endif
if (bcm2835)
{
// NOTE(review): 'spi', 'gpio' and 'systemTimerRegister' point into this mapping and are not reset here, so they
// dangle after the munmap() - confirm nothing dereferences them after DeinitSPI().
munmap((void*)bcm2835, bcm_host_get_peripheral_size());
bcm2835 = 0;
}
if (mem_fd >= 0)
{
close(mem_fd);
mem_fd = -1;
}
#ifndef KERNEL_MODULE_CLIENT
#ifdef KERNEL_MODULE
kfree(spiTaskMemory);
dma_free_writecombine(0, SHARED_MEMORY_SIZE, dmaSourceMemory, spiTaskMemoryPhysical);
spiTaskMemoryPhysical = 0;
#else
free(spiTaskMemory);
#endif
#endif
spiTaskMemory = 0;
}