#ifndef KERNEL_MODULE #include // printf, stderr #include // syslog #include // open, O_RDWR, O_SYNC #include // mmap, munmap #include // pthread_create #include // bcm_host_get_peripheral_address, bcm_host_get_peripheral_size, bcm_host_get_sdram_address #endif #include "config.h" #include "spi.h" #include "util.h" #include "dma.h" #include "mailbox.h" #include "mem_alloc.h" // Uncomment this to print out all bytes sent to the SPI bus // #define DEBUG_SPI_BUS_WRITES #ifdef DEBUG_SPI_BUS_WRITES #define DEBUG_PRINT_WRITTEN_BYTE(byte) do { \ printf("%02X", byte); \ if ((writeCounter & 3) == 0) printf("\n"); \ } while(0) #else #define DEBUG_PRINT_WRITTEN_BYTE(byte) ((void)0) #endif #ifdef CHIP_SELECT_LINE_NEEDS_REFRESHING_EACH_32BITS_WRITTEN void ChipSelectHigh(); #define TOGGLE_CHIP_SELECT_LINE() if ((++writeCounter & 3) == 0) { ChipSelectHigh(); } #else #define TOGGLE_CHIP_SELECT_LINE() ((void)0) #endif static uint32_t writeCounter = 0; #define WRITE_FIFO(word) do { \ uint8_t w = (word); \ spi->fifo = w; \ TOGGLE_CHIP_SELECT_LINE(); \ DEBUG_PRINT_WRITTEN_BYTE(w); \ } while(0) int mem_fd = -1; volatile void *bcm2835 = 0; volatile GPIORegisterFile *gpio = 0; volatile SPIRegisterFile *spi = 0; // Points to the system timer register. N.B. spec sheet says this is two low and high parts, in an 32-bit aligned (but not 64-bit aligned) address. Profiling shows // that Pi 3 Model B does allow reading this as a u64 load, and even when unaligned, it is around 30% faster to do so compared to loading in parts "lo | (hi << 32)". 
volatile uint64_t *systemTimerRegister = 0; void DumpSPICS(uint32_t reg) { PRINT_FLAG(BCM2835_SPI0_CS_CS); PRINT_FLAG(BCM2835_SPI0_CS_CPHA); PRINT_FLAG(BCM2835_SPI0_CS_CPOL); PRINT_FLAG(BCM2835_SPI0_CS_CLEAR_TX); PRINT_FLAG(BCM2835_SPI0_CS_CLEAR_RX); PRINT_FLAG(BCM2835_SPI0_CS_TA); PRINT_FLAG(BCM2835_SPI0_CS_DMAEN); PRINT_FLAG(BCM2835_SPI0_CS_INTD); PRINT_FLAG(BCM2835_SPI0_CS_INTR); PRINT_FLAG(BCM2835_SPI0_CS_ADCS); PRINT_FLAG(BCM2835_SPI0_CS_DONE); PRINT_FLAG(BCM2835_SPI0_CS_RXD); PRINT_FLAG(BCM2835_SPI0_CS_TXD); PRINT_FLAG(BCM2835_SPI0_CS_RXR); PRINT_FLAG(BCM2835_SPI0_CS_RXF); printf("SPI0 DLEN: %u\n", spi->dlen); printf("SPI0 CE0 register: %d\n", GET_GPIO(GPIO_SPI0_CE0) ? 1 : 0); } #ifdef RUN_WITH_REALTIME_THREAD_PRIORITY #include #include void SetRealtimeThreadPriority() { sched_param params; params.sched_priority = sched_get_priority_max(SCHED_FIFO); int failed = pthread_setschedparam(pthread_self(), SCHED_FIFO, ¶ms); if (failed) FATAL_ERROR("pthread_setschedparam() failed!"); int policy = 0; failed = pthread_getschedparam(pthread_self(), &policy, ¶ms); if (failed) FATAL_ERROR("pthread_getschedparam() failed!"); if (policy != SCHED_FIFO) FATAL_ERROR("Failed to set realtime thread policy!"); printf("Set fbcp-ili9341 thread scheduling priority to maximum (%d)\n", sched_get_priority_max(SCHED_FIFO)); } #endif // Errata to BCM2835 behavior: documentation states that the SPI0 DLEN register is only used for DMA. However, even when DMA is not being utilized, setting it from // a value != 0 or 1 gets rid of an excess idle clock cycle that is present when transmitting each byte. 
(by default in Polled SPI Mode each 8 bits transfer in 9 clocks) // With DLEN=2 each byte is clocked to the bus in 8 cycles, observed to improve max throughput from 56.8mbps to 63.3mbps (+11.4%, quite close to the theoretical +12.5%) // https://www.raspberrypi.org/forums/viewtopic.php?f=44&t=181154 #define UNLOCK_FAST_8_CLOCKS_SPI() (spi->dlen = 2) #ifdef ALL_TASKS_SHOULD_DMA bool previousTaskWasSPI = true; #endif #ifdef SPI_3WIRE_PROTOCOL uint32_t NumBytesNeededFor32BitSPITask(uint32_t byteSizeFor8BitTask) { return byteSizeFor8BitTask * 2 + 4; // 16bit -> 32bit expansion, plus 4 bytes for command word } uint32_t NumBytesNeededFor9BitSPITask(uint32_t byteSizeFor8BitTask) { uint32_t numOutBits = (byteSizeFor8BitTask + 1) * 9; // The number of bits we send out in a command must be a multiple of 9 bits, because each byte is 1 data/command bit plus 8 payload bits // But the number of bits sent out in a command must also be a multiple of 8 bits, because BCM2835 SPI peripheral only deals with sending out full bytes. // Therefore the bits written out must be a multiple of lcm(9*8)=72bits. numOutBits = ((numOutBits + 71) / 72) * 72; uint32_t numOutBytes = numOutBits >> 3; return numOutBytes; } // N.B. BCM2835 hardware always clocks bytes out most significant bit (MSB) first, so when interleaving, the command bit needs to start out in the // highest byte of the outgoing buffer. void Interleave8BitSPITaskTo9Bit(SPITask *task) { const uint32_t size8BitTask = task->size - task->sizeExpandedTaskWithPadding; // 9-bit SPI task lives right at the end of the 8-bit task uint8_t *dst = task->data + size8BitTask; // Pre-clear the 9*8=72 bit tail end of the memory to all zeroes to avoid having to pad source data to multiples of 9. 
(plus padding bytes, just to be safe) memset(dst + task->sizeExpandedTaskWithPadding - 9 - SPI_9BIT_TASK_PADDING_BYTES, 0, 9 + SPI_9BIT_TASK_PADDING_BYTES); // Fill first command byte xxxxxxxx -> 0xxxxxxx x: (low 0 bit to indicate a command byte) dst[0] = task->cmd >> 1; dst[1] = task->cmd << 7; int dstByte = 1; int dstBitsUsed = 1; int src = 0; // Command bit above produced one byte. If there are at least 7 bytes in the data set, we can complete a set of 8 transferred bytes. Fast track // that: if (size8BitTask >= 7) { dst[1] |= 0x40 | (task->data[0] >> 2); dst[2] = 0x20 | (task->data[0] << 6) | (task->data[1] >> 3); dst[3] = 0x10 | (task->data[1] << 5) | (task->data[2] >> 4); dst[4] = 0x08 | (task->data[2] << 4) | (task->data[3] >> 5); dst[5] = 0x04 | (task->data[3] << 3) | (task->data[4] >> 6); dst[6] = 0x02 | (task->data[4] << 2) | (task->data[5] >> 7); dst[7] = 0x01 | (task->data[5] << 1); dst[8] = (task->data[6] ); dstByte = 9; dstBitsUsed = 0; src = 7; // More fast tracking: As long as we have multiples of 8 bytes left, fast fill them in while(src <= size8BitTask - 8) { uint8_t *d = dst + dstByte; dstByte += 9; const uint8_t *s = task->data + src; src += 8; d[0] = 0x80 | (s[0] >> 1); d[1] = 0x40 | (s[0] << 7) | (s[1] >> 2); d[2] = 0x20 | (s[1] << 6) | (s[2] >> 3); d[3] = 0x10 | (s[2] << 5) | (s[3] >> 4); d[4] = 0x08 | (s[3] << 4) | (s[4] >> 5); d[5] = 0x04 | (s[4] << 3) | (s[5] >> 6); d[6] = 0x02 | (s[5] << 2) | (s[6] >> 7); d[7] = 0x01 | (s[6] << 1); d[8] = (s[7] ); } // Pre-clear the next byte to be written - the slow loop below assumes it is continuing a middle of byte sequence // N.B. 
This write could happen to memory that is not part of the task, so memory allocation of the 9-bit task needs to allocate one byte of padding dst[dstByte] = 0; } // Fill tail data bytes, slow path while(src < size8BitTask) { uint8_t data = task->data[src++]; // High 1 bit to indicate a data byte dst[dstByte] |= 1 << (7 - dstBitsUsed); ++dstBitsUsed; if (dstBitsUsed == 8) // Written data bit completes a full byte? { ++dstByte; // Advance to next byte dstBitsUsed = 0; // Now we are aligned, so can write the data byte directly dst[dstByte++] = data; dst[dstByte] = 0; // Clear old contents of the next byte to write } else { // 8 data bits dst[dstByte++] |= data >> dstBitsUsed; // This is the first write to the next byte, that should occur without ORring to clear old data in memory // N.B. This write could happen to memory that is not part of the task, so memory allocation of the 9-bit task needs to allocate one byte of padding dst[dstByte] = data << (8 - dstBitsUsed); } } #if 0 // Enable to debug correctness: #define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c" #define BYTE_TO_BINARY(byte) \ (byte & 0x80 ? '1' : '0'), \ (byte & 0x40 ? '1' : '0'), \ (byte & 0x20 ? '1' : '0'), \ (byte & 0x10 ? '1' : '0'), \ (byte & 0x08 ? '1' : '0'), \ (byte & 0x04 ? '1' : '0'), \ (byte & 0x02 ? '1' : '0'), \ (byte & 0x01 ? 
'1' : '0') printf("Interleaving result: 8-bit task of size %d bytes became %d bytes:\n", task->size - task->sizeExpandedTaskWithPadding, task->sizeExpandedTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES); printf("8-bit c" BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(task->cmd)); for(int i = 0; i < task->size - task->sizeExpandedTaskWithPadding; ++i) printf("d" BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(task->data[i])); printf("\n9-bit "); for(int i = 0; i < task->sizeExpandedTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES; ++i) printf(BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(dst[i])); printf("\n\n"); #endif } void Interleave16BitSPITaskTo32Bit(SPITask *task) { const uint32_t size8BitTask = task->size - task->sizeExpandedTaskWithPadding; // 32-bit SPI task lives right at the end of the 16-bit task uint32_t *dst = (uint32_t *)(task->data + size8BitTask); *dst++ = task->cmd; const uint32_t taskSizeU16 = size8BitTask >> 1; uint16_t *src = (uint16_t*)task->data; for(uint32_t i = 0; i < taskSizeU16; ++i) dst[i] = 0x1500 | (src[i] << 16); } #endif // ~SPI_3WIRE_PROTOCOL void WaitForPolledSPITransferToFinish() { uint32_t cs; while (!(((cs = spi->cs) ^ BCM2835_SPI0_CS_TA) & (BCM2835_SPI0_CS_DONE | BCM2835_SPI0_CS_TA))) // While TA=1 and DONE=0 if ((cs & (BCM2835_SPI0_CS_RXR | BCM2835_SPI0_CS_RXF))) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS; if ((cs & BCM2835_SPI0_CS_RXD)) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS; } #ifdef ALL_TASKS_SHOULD_DMA #ifndef USE_DMA_TRANSFERS #error When building with #define ALL_TASKS_SHOULD_DMA enabled, -DUSE_DMA_TRANSFERS=ON should be set in CMake command line! #endif // Synchonously performs a single SPI command byte + N data bytes transfer on the calling thread. Call in between a BEGIN_SPI_COMMUNICATION() and END_SPI_COMMUNICATION() pair. 
// ALL_TASKS_SHOULD_DMA variant: runs one SPI task, choosing between DMA and polled FIFO writes per task.
// A task goes through SPIDMATransfer() when its payload is at least TASK_SIZE_TO_USE_DMA bytes and its command is
// DISPLAY_WRITE_PIXELS, DISPLAY_SET_CURSOR_X or DISPLAY_SET_CURSOR_Y; every other task is written to the FIFO by polling.
// previousTaskWasSPI remembers which path the previous task took, so that the matching wait is done before switching.
void RunSPITask(SPITask *task)
{
  uint32_t cs;
  uint8_t *tStart = task->PayloadStart();
  uint8_t *tEnd = task->PayloadEnd();
  const uint32_t payloadSize = tEnd - tStart;
  // Up to this many bytes are written below without checking the TXD flag first.
  // (presumably sized to fit the hardware TX FIFO - TODO confirm against the BCM2835 datasheet)
  uint8_t *tPrefillEnd = tStart + MIN(15, payloadSize);

#define TASK_SIZE_TO_USE_DMA 4
  // Do a DMA transfer if this task is suitable in size for DMA to handle
  if (payloadSize >= TASK_SIZE_TO_USE_DMA && (task->cmd == DISPLAY_WRITE_PIXELS || task->cmd == DISPLAY_SET_CURSOR_X || task->cmd == DISPLAY_SET_CURSOR_Y))
  {
    if (previousTaskWasSPI)
      WaitForPolledSPITransferToFinish();
//    printf("DMA cmd=0x%x, data=%d bytes\n", task->cmd, task->PayloadSize());
    SPIDMATransfer(task);
    previousTaskWasSPI = false;
  }
  else
  {
    if (!previousTaskWasSPI)
    {
      WaitForDMAFinished();
      spi->cs = BCM2835_SPI0_CS_TA | BCM2835_SPI0_CS_CLEAR_TX | DISPLAY_SPI_DRIVE_SETTINGS;
      // After having done a DMA transfer, the SPI0 DLEN register has reset to zero, so restore it to fast mode.
      UNLOCK_FAST_8_CLOCKS_SPI();
    }
    else
      WaitForPolledSPITransferToFinish();

//    printf("SPI cmd=0x%x, data=%d bytes\n", task->cmd, task->PayloadSize());

    // Send the command word if display is 4-wire (3-wire displays can omit this, commands are interleaved in the data payload stream above)
#ifndef SPI_3WIRE_PROTOCOL
    // Data/Control line low while the command byte is clocked out
    CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);

#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
    // On e.g. the ILI9486, all commands are 16-bit, so need to be clocked in in two bytes. The MSB byte is always zero though in all the defined commands.
    WRITE_FIFO(0x00);
#endif
    WRITE_FIFO(task->cmd);

#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
    while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
    // Two reads of the FIFO register, results discarded (one per command byte written above)
    spi->fifo;
    spi->fifo;
#else
    while(!(spi->cs & (BCM2835_SPI0_CS_RXD|BCM2835_SPI0_CS_DONE))) /*nop*/;
#endif

    // Data/Control line back high for the payload bytes
    SET_GPIO(GPIO_TFT_DATA_CONTROL);
#endif

    // Send the data payload:
    while(tStart < tPrefillEnd) WRITE_FIFO(*tStart++);
    while(tStart < tEnd)
    {
      cs = spi->cs;
      // Write the next byte only when the TXD flag is set
      if ((cs & BCM2835_SPI0_CS_TXD)) WRITE_FIFO(*tStart++);
// TODO:      else asm volatile("yield");
      // Issue a CLEAR_RX whenever the RXR or RXF flag is set
      if ((cs & (BCM2835_SPI0_CS_RXR|BCM2835_SPI0_CS_RXF))) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
    }

    previousTaskWasSPI = true;
  }
}

#else

// Default variant: runs one SPI task. The command byte is always sent with polled FIFO writes; the payload is sent with
// SPIDMATransfer() when USE_DMA_TRANSFERS is defined and the payload is larger than DMA_IS_FASTER_THAN_POLLED_SPI bytes,
// and with polled FIFO writes otherwise.
void RunSPITask(SPITask *task)
{
  WaitForPolledSPITransferToFinish();

  // The Adafruit 1.65" 240x240 ST7789 based display is unique compared to others that it does want to see the Chip Select line go
  // low and high to start a new command. For that display we let hardware SPI toggle the CS line, and actually run TA<-0 and TA<-1
  // transitions to let the CS line live. For most other displays, we just set CS line always enabled for the display throughout fbcp-ili9341 lifetime,
  // which is a tiny bit faster.
#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
  BEGIN_SPI_COMMUNICATION();
#endif

  uint8_t *tStart = task->PayloadStart();
  uint8_t *tEnd = task->PayloadEnd();
  const uint32_t payloadSize = tEnd - tStart;
  // Up to this many bytes are written below without checking the TXD flag first.
  // (presumably sized to fit the hardware TX FIFO - TODO confirm against the BCM2835 datasheet)
  uint8_t *tPrefillEnd = tStart + MIN(15, payloadSize);

  // Send the command word if display is 4-wire (3-wire displays can omit this, commands are interleaved in the data payload stream above)
#ifndef SPI_3WIRE_PROTOCOL
  // An SPI transfer to the display always starts with one control (command) byte, followed by N data bytes.
  CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);

#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
  // On e.g. the ILI9486, all commands are 16-bit, so need to be clocked in in two bytes. The MSB byte is always zero though in all the defined commands.
  WRITE_FIFO(0x00);
#endif
  WRITE_FIFO(task->cmd);

#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
  while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
  // Two reads of the FIFO register, results discarded (one per command byte written above)
  spi->fifo;
  spi->fifo;
#else
  while(!(spi->cs & (BCM2835_SPI0_CS_RXD|BCM2835_SPI0_CS_DONE))) /*nop*/;
#endif

  SET_GPIO(GPIO_TFT_DATA_CONTROL);
#endif // ~!SPI_3WIRE_PROTOCOL

  // For small transfers, using DMA is not worth it, but pushing through with polled SPI gives better bandwidth.
  // For larger transfers though that are more than this amount of bytes, using DMA is faster.
  // This cutoff number was experimentally tested to find where Polled SPI and DMA are as fast.
#define DMA_IS_FASTER_THAN_POLLED_SPI 140

  // Do a DMA transfer if this task is suitable in size for DMA to handle
#ifdef USE_DMA_TRANSFERS
  if (tEnd - tStart > DMA_IS_FASTER_THAN_POLLED_SPI)
  {
    SPIDMATransfer(task);

    // After having done a DMA transfer, the SPI0 DLEN register has reset to zero, so restore it to fast mode.
    UNLOCK_FAST_8_CLOCKS_SPI();
  }
  else
#endif
  {
    while(tStart < tPrefillEnd) WRITE_FIFO(*tStart++);
    while(tStart < tEnd)
    {
      uint32_t cs = spi->cs;
      // Write the next byte only when the TXD flag is set
      if ((cs & BCM2835_SPI0_CS_TXD)) WRITE_FIFO(*tStart++);
// TODO:      else asm volatile("yield");
      // Issue a CLEAR_RX whenever the RXR or RXF flag is set
      if ((cs & (BCM2835_SPI0_CS_RXR|BCM2835_SPI0_CS_RXF))) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
    }
  }

#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
  END_SPI_COMMUNICATION();
#endif
}

#endif

// Ring buffer of queued SPI tasks (queueHead/queueTail are byte offsets into spiTaskMemory->buffer)
SharedMemory *spiTaskMemory = 0;
// Statistics: accumulated time the SPI thread has spent waiting for new tasks (only updated when STATISTICS is defined)
volatile uint64_t spiThreadIdleUsecs = 0;
// Statistics: tick() value taken when the SPI thread last started waiting
volatile uint64_t spiThreadSleepStartTime = 0;
// Statistics: 1 while the SPI thread is waiting for new tasks, 0 otherwise
volatile int spiThreadSleeping = 0;
// Estimated number of microseconds it takes to transfer one byte over the SPI bus, computed in InitSPI()
double spiUsecsPerByte;

// Returns a pointer to the task at the head of the queue, or 0 if the queue is empty.
// A task whose cmd field is 0 marks the point where the queue wraps: queueHead is then reset to the start of the buffer.
SPITask *GetTask() // Returns the first task in the queue, called in worker thread
{
  uint32_t head = spiTaskMemory->queueHead;
  uint32_t tail = spiTaskMemory->queueTail;
  if (head == tail) return 0;
  SPITask *task = (SPITask*)(spiTaskMemory->buffer + head);
  if (task->cmd == 0) // Wrapped around?
  {
    spiTaskMemory->queueHead = 0;
    __sync_synchronize();
    // After wrapping, the queue is empty if the tail snapshot also sits at the start of the buffer
    if (tail == 0) return 0;
    task = (SPITask*)spiTaskMemory->buffer;
  }
  return task;
}

// Removes 'task' from the queue: subtracts its payload size plus one from spiBytesQueued, and advances queueHead
// to the byte directly after the task (task header plus task->size bytes).
void DoneTask(SPITask *task) // Frees the first SPI task from the queue, called in worker thread
{
  __atomic_fetch_sub(&spiTaskMemory->spiBytesQueued, task->PayloadSize()+1, __ATOMIC_RELAXED);
  spiTaskMemory->queueHead = (uint32_t)((uint8_t*)task - spiTaskMemory->buffer) + sizeof(SPITask) + task->size;
  __sync_synchronize();
}

extern volatile bool programRunning;

// Runs queued SPI tasks one at a time until the queue is empty or programRunning becomes false.
// Without USE_DMA_TRANSFERS, the whole batch is wrapped in one BEGIN_SPI_COMMUNICATION()/END_SPI_COMMUNICATION() pair.
void ExecuteSPITasks()
{
#ifndef USE_DMA_TRANSFERS
  BEGIN_SPI_COMMUNICATION();
#endif
  {
    while(programRunning && spiTaskMemory->queueTail != spiTaskMemory->queueHead)
    {
      SPITask *task = GetTask();
      if (task)
      {
        RunSPITask(task);
        DoneTask(task);
      }
    }
  }
#ifndef USE_DMA_TRANSFERS
  END_SPI_COMMUNICATION();
#endif
}

#if !defined(KERNEL_MODULE) && defined(USE_SPI_THREAD)
pthread_t spiThread;

// A worker thread that keeps the SPI bus filled at all times
// Loops while programRunning is set: executes queued tasks when the queue is non-empty, and otherwise does a futex wait
// on queueTail. 'unused' is not read. Does not return normally; the thread ends in pthread_exit(0).
void *spi_thread(void *unused)
{
#ifdef RUN_WITH_REALTIME_THREAD_PRIORITY
  SetRealtimeThreadPriority();
#endif
  while(programRunning)
  {
    if (spiTaskMemory->queueTail != spiTaskMemory->queueHead)
    {
      ExecuteSPITasks();
    }
    else
    {
#ifdef STATISTICS
      uint64_t t0 = tick();
      spiThreadSleepStartTime = t0;
      __atomic_store_n(&spiThreadSleeping, 1, __ATOMIC_RELAXED);
#endif
      // FUTEX_WAIT only blocks if queueTail still holds the expected value passed here (the current queueHead, i.e. "queue empty")
      if (programRunning) syscall(SYS_futex, &spiTaskMemory->queueTail, FUTEX_WAIT, spiTaskMemory->queueHead, 0, 0, 0); // Start sleeping until we get new tasks
#ifdef STATISTICS
      __atomic_store_n(&spiThreadSleeping, 0, __ATOMIC_RELAXED);
      uint64_t t1 = tick();
      __sync_fetch_and_add(&spiThreadIdleUsecs, t1-t0);
#endif
    }
  }
  pthread_exit(0);
}
#endif

// Maps the BCM2835 peripheral registers, configures the SPI0 GPIO pins and registers, sets up the SPI task ring buffer,
// and, depending on build configuration, initializes DMA, the display, and the SPI worker thread.
// Returns 0. Failures are reported through FATAL_ERROR().
int InitSPI()
{
#ifdef KERNEL_MODULE

#define BCM2835_PERI_BASE               0x3F000000
#define BCM2835_GPIO_BASE               0x200000
#define BCM2835_SPI0_BASE               0x204000
  printk("ioremapping %p\n", (void*)(BCM2835_PERI_BASE+BCM2835_GPIO_BASE));
  // NOTE(review): this local declaration shadows the file-scope 'bcm2835', so the global stays 0 in the kernel module build - confirm this is intended.
  void *bcm2835 = ioremap(BCM2835_PERI_BASE+BCM2835_GPIO_BASE, 32768);
  printk("Got bcm address %p\n", bcm2835);
  if (!bcm2835) FATAL_ERROR("Failed to map BCM2835 address!");
  // The mapping starts at the GPIO base, so SPI0 is located relative to that
  spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE - BCM2835_GPIO_BASE);
  gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835);

#else // Userland version
  // Memory map GPIO and SPI peripherals for direct access
  mem_fd = open("/dev/mem", O_RDWR|O_SYNC);
  if (mem_fd < 0) FATAL_ERROR("can't open /dev/mem (run as sudo)");
  printf("bcm_host_get_peripheral_address: %p, bcm_host_get_peripheral_size: %u, bcm_host_get_sdram_address: %p\n", bcm_host_get_peripheral_address(), bcm_host_get_peripheral_size(), bcm_host_get_sdram_address());
  bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address());
  if (bcm2835 == MAP_FAILED) FATAL_ERROR("mapping /dev/mem failed");
  spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE);
  gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835 + BCM2835_GPIO_BASE);
  systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
  // TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
#endif

  uint32_t currentBcmCoreSpeed = MailboxRet2(0x00030002/*Get Clock Rate*/, 0x4/*CORE*/);
  uint32_t maxBcmCoreTurboSpeed = MailboxRet2(0x00030004/*Get Max Clock Rate*/, 0x4/*CORE*/);

  // Estimate how many microseconds transferring a single byte over the SPI bus takes?
  spiUsecsPerByte = 1000000.0 * 8.0/*bits/byte*/ * SPI_BUS_CLOCK_DIVISOR / maxBcmCoreTurboSpeed;

  printf("BCM core speed: current: %uhz, max turbo: %uhz. SPI CDIV: %d, SPI max frequency: %.0fhz\n", currentBcmCoreSpeed, maxBcmCoreTurboSpeed, SPI_BUS_CLOCK_DIVISOR, (double)maxBcmCoreTurboSpeed / SPI_BUS_CLOCK_DIVISOR);

#if !defined(KERNEL_MODULE_CLIENT) || defined(KERNEL_MODULE_CLIENT_DRIVES)
  // By default all GPIO pins are in input mode (0x00), initialize them for SPI and GPIO writes
#ifdef GPIO_TFT_DATA_CONTROL
  SET_GPIO_MODE(GPIO_TFT_DATA_CONTROL, 0x01); // Data/Control pin to output (0x01)
#endif
  SET_GPIO_MODE(GPIO_SPI0_MISO, 0x04);
  SET_GPIO_MODE(GPIO_SPI0_MOSI, 0x04);
  SET_GPIO_MODE(GPIO_SPI0_CLK, 0x04);

#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
  // The Adafruit 1.65" 240x240 ST7789 based display is unique compared to others that it does want to see the Chip Select line go
  // low and high to start a new command. For that display we let hardware SPI toggle the CS line, and actually run TA<-0 and TA<-1
  // transitions to let the CS line live. For most other displays, we just set CS line always enabled for the display throughout
  // fbcp-ili9341 lifetime, which is a tiny bit faster.
  SET_GPIO_MODE(GPIO_SPI0_CE0, 0x04);
#ifdef DISPLAY_USES_CS1
  SET_GPIO_MODE(GPIO_SPI0_CE1, 0x04);
#endif
#else
  // Set the SPI 0 pin explicitly to output, and enable chip select on the line by setting it to low.
  // fbcp-ili9341 assumes exclusive access to the SPI0 bus, and exclusive presence of only one device on the bus,
  // which is (permanently) activated here.
  SET_GPIO_MODE(GPIO_SPI0_CE0, 0x01);
  CLEAR_GPIO(GPIO_SPI0_CE0);
#ifdef DISPLAY_USES_CS1
  SET_GPIO_MODE(GPIO_SPI0_CE1, 0x01);
#endif
#endif

  spi->cs = BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS; // Initialize the Control and Status register to defaults: CS=0 (Chip Select), CPHA=0 (Clock Phase), CPOL=0 (Clock Polarity), CSPOL=0 (Chip Select Polarity), TA=0 (Transfer not active), and reset TX and RX queues.
  spi->clk = SPI_BUS_CLOCK_DIVISOR; // Clock Divider determines SPI bus speed, resulting speed=256MHz/clk
#endif

  // Initialize SPI thread task buffer memory
#ifdef KERNEL_MODULE_CLIENT
  int driverfd = open("/proc/bcm2835_spi_display_bus", O_RDWR|O_SYNC);
  if (driverfd < 0) FATAL_ERROR("Could not open SPI ring buffer - kernel driver module not running?");
  spiTaskMemory = (SharedMemory*)mmap(NULL, SHARED_MEMORY_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED/* | MAP_NORESERVE | MAP_POPULATE | MAP_LOCKED*/, driverfd, 0);
  close(driverfd);
  if (spiTaskMemory == MAP_FAILED) FATAL_ERROR("Could not mmap SPI ring buffer!");
  // NOTE(review): queueHead and queueTail are read as uint32_t elsewhere in this file but are printed with %p here - confirm the field types against the format string.
  printf("Got shared memory block %p, ring buffer head %p, ring buffer tail %p, shared memory block phys address: %p\n", (const char *)spiTaskMemory, spiTaskMemory->queueHead, spiTaskMemory->queueTail, spiTaskMemory->sharedMemoryBaseInPhysMemory);

#ifdef USE_DMA_TRANSFERS
  printf("DMA TX channel: %d, DMA RX channel: %d\n", spiTaskMemory->dmaTxChannel, spiTaskMemory->dmaRxChannel);
#endif

#else

#ifdef KERNEL_MODULE
  spiTaskMemory = (SharedMemory*)kmalloc(SHARED_MEMORY_SIZE, GFP_KERNEL | GFP_DMA);
  // TODO: Ideally we would be able to directly perform the DMA from the SPI ring buffer in 'spiTaskMemory'. However
  // that pointer is shared to userland, and it is proving troublesome to make it both userland-writable as well as cache-bypassing DMA coherent.
  // Therefore these two memory areas are separate for now, and we memcpy() from SPI ring buffer to the following intermediate 'dmaSourceMemory'
  // memory area to perform the DMA transfer. Is there a way to avoid this intermediate buffer? That would improve performance a bit.
  dmaSourceMemory = (SharedMemory*)dma_alloc_writecombine(0, SHARED_MEMORY_SIZE, &spiTaskMemoryPhysical, GFP_KERNEL);
  // NOTE(review): the "mem" value logged here is spiTaskMemory, not the dmaSourceMemory block that was just allocated - confirm which one is meant.
  LOG("Allocated DMA memory: mem: %p, phys: %p", spiTaskMemory, (void*)spiTaskMemoryPhysical);
  memset((void*)spiTaskMemory, 0, SHARED_MEMORY_SIZE);
#else
  spiTaskMemory = (SharedMemory*)Malloc(SHARED_MEMORY_SIZE, "spi.cpp shared task memory");
#endif

  // Start with an empty queue
  spiTaskMemory->queueHead = spiTaskMemory->queueTail = spiTaskMemory->spiBytesQueued = 0;
#endif

#ifdef USE_DMA_TRANSFERS
  InitDMA();
#endif

  // Enable fast 8 clocks per byte transfer mode, instead of slower 9 clocks per byte.
  UNLOCK_FAST_8_CLOCKS_SPI();

#if !defined(KERNEL_MODULE) && (!defined(KERNEL_MODULE_CLIENT) || defined(KERNEL_MODULE_CLIENT_DRIVES))
  printf("Initializing display\n");
  InitSPIDisplay();

#ifdef USE_SPI_THREAD
  // Create a dedicated thread to feed the SPI bus. While this is fast, it consumes a lot of CPU. It would be best to replace
  // this thread with a kernel module that processes the created SPI task queue using interrupts. (while juggling the GPIO D/C line as well)
  printf("Creating SPI task thread\n");
  int rc = pthread_create(&spiThread, NULL, spi_thread, NULL); // After creating the thread, it is assumed to have ownership of the SPI bus, so no SPI chat on the main thread after this.
  if (rc != 0) FATAL_ERROR("Failed to create SPI thread!");
#else
  // We will be running SPI tasks continuously from the main thread, so keep SPI Transfer Active throughout the lifetime of the driver.
  BEGIN_SPI_COMMUNICATION();
#endif

#endif

  LOG("InitSPI done");
  return 0;
}

// Tears down what InitSPI() set up: joins the SPI worker thread (if built with one), deinitializes the display and DMA,
// resets the SPI0 CS register, sets the SPI GPIO pins back to mode 0, unmaps the peripheral registers, closes /dev/mem
// and frees the SPI task ring buffer.
void DeinitSPI()
{
#ifdef USE_SPI_THREAD
  // Blocks until spi_thread() has exited
  pthread_join(spiThread, NULL);
  spiThread = (pthread_t)0;
#endif
  DeinitSPIDisplay();
#ifdef USE_DMA_TRANSFERS
  DeinitDMA();
#endif

  spi->cs = BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;

#ifndef KERNEL_MODULE_CLIENT
  // Return the pins to mode 0 (InitSPI() describes 0x00 as the default input mode)
#ifdef GPIO_TFT_DATA_CONTROL
  SET_GPIO_MODE(GPIO_TFT_DATA_CONTROL, 0);
#endif
  SET_GPIO_MODE(GPIO_SPI0_CE1, 0);
  SET_GPIO_MODE(GPIO_SPI0_CE0, 0);
  SET_GPIO_MODE(GPIO_SPI0_MISO, 0);
  SET_GPIO_MODE(GPIO_SPI0_MOSI, 0);
  SET_GPIO_MODE(GPIO_SPI0_CLK, 0);
#endif

  // The register pointers 'spi' and 'gpio' point into this mapping, so all register writes must happen before this point
  if (bcm2835)
  {
    munmap((void*)bcm2835, bcm_host_get_peripheral_size());
    bcm2835 = 0;
  }

  if (mem_fd >= 0)
  {
    close(mem_fd);
    mem_fd = -1;
  }

#ifndef KERNEL_MODULE_CLIENT

#ifdef KERNEL_MODULE
  kfree(spiTaskMemory);
  dma_free_writecombine(0, SHARED_MEMORY_SIZE, dmaSourceMemory, spiTaskMemoryPhysical);
  spiTaskMemoryPhysical = 0;
#else
  // NOTE(review): this block was obtained from the project's Malloc() wrapper in InitSPI() - confirm that plain free() is its matching release call.
  free(spiTaskMemory);
#endif
#endif
  spiTaskMemory = 0;
}