// Mirror of https://github.com/goodtft/LCD-show.git (synced 2025-02-01 03:52:10 +01:00).
// C++ source, 491 lines, 16 KiB.
#include "config.h"
|
|
#include "diff.h"
|
|
#include "util.h"
|
|
#include "display.h"
|
|
#include "gpu.h"
|
|
#include "spi.h"
|
|
|
|
// Shared span storage used by the diff routines in this file: each routine writes its
// results starting at spans[0] and chains them through Span::next.
// Starts out null and is not allocated anywhere in this file - presumably it is allocated
// elsewhere before the first diff call; confirm against the caller.
Span *spans = 0;
|
|
|
|
#ifdef UPDATE_FRAMES_WITHOUT_DIFFING
|
|
// Naive non-diffing functionality: just submit the whole display contents
|
|
void NoDiffChangedRectangle(Span *&head)
|
|
{
|
|
head = spans;
|
|
head->x = 0;
|
|
head->endX = head->lastScanEndX = gpuFrameWidth;
|
|
head->y = 0;
|
|
head->endY = gpuFrameHeight;
|
|
head->size = gpuFrameWidth*gpuFrameHeight;
|
|
head->next = 0;
|
|
}
|
|
#endif
|
|
|
|
#ifdef UPDATE_FRAMES_IN_SINGLE_RECTANGULAR_DIFF
|
|
// Coarse diffing of two framebuffers with tight stride, 16 pixels at a time
|
|
// Finds the first changed pixel, coarse result aligned down to 8 pixels boundary
|
|
static int coarse_linear_diff(uint16_t *framebuffer, uint16_t *prevFramebuffer, uint16_t *framebufferEnd)
|
|
{
|
|
uint16_t *endPtr;
|
|
asm volatile(
|
|
"mov r0, %[framebufferEnd]\n" // r0 <- pointer to end of current framebuffer
|
|
"mov r1, %[framebuffer]\n" // r1 <- current framebuffer
|
|
"mov r2, %[prevFramebuffer]\n" // r2 <- framebuffer of previous frame
|
|
|
|
"start_%=:\n"
|
|
"pld [r1, #128]\n" // preload data caches for both current and previous framebuffers 128 bytes ahead of time
|
|
"pld [r2, #128]\n"
|
|
|
|
"ldmia r1!, {r3,r4,r5,r6}\n" // load 4x32-bit elements (8 pixels) of current framebuffer
|
|
"ldmia r2!, {r7,r8,r9,r10}\n" // load corresponding 4x32-bit elements (8 pixels) of previous framebuffer
|
|
"cmp r3, r7\n" // compare all 8 pixels if they are different
|
|
"cmpeq r4, r8\n"
|
|
"cmpeq r5, r9\n"
|
|
"cmpeq r6, r10\n"
|
|
"bne end_%=\n" // if we found a difference, we are done
|
|
|
|
// Unroll once for another set of 4x32-bit elements. On Raspberry Pi Zero, data cache line is 32 bytes in size, so one iteration
|
|
// of the loop computes a single data cache line, with preloads in place at the top.
|
|
"ldmia r1!, {r3,r4,r5,r6}\n"
|
|
"ldmia r2!, {r7,r8,r9,r10}\n"
|
|
"cmp r3, r7\n"
|
|
"cmpeq r4, r8\n"
|
|
"cmpeq r5, r9\n"
|
|
"cmpeq r6, r10\n"
|
|
"bne end_%=\n" // if we found a difference, we are done
|
|
|
|
"cmp r0, r1\n" // framebuffer == framebufferEnd? did we finish through the array?
|
|
"bne start_%=\n"
|
|
"b done_%=\n"
|
|
|
|
"end_%=:\n"
|
|
"sub r1, r1, #16\n" // ldmia r1! increments r1 after load, so subtract back the last increment in order to not shoot past the first changed pixels
|
|
|
|
"done_%=:\n"
|
|
"mov %[endPtr], r1\n" // output endPtr back to C code
|
|
: [endPtr]"=r"(endPtr)
|
|
: [framebuffer]"r"(framebuffer), [prevFramebuffer]"r"(prevFramebuffer), [framebufferEnd]"r"(framebufferEnd)
|
|
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
|
|
);
|
|
return endPtr - framebuffer;
|
|
}
|
|
|
|
// Same as coarse_linear_diff, but finds the last changed pixel in linear order instead of first, i.e.
|
|
// Finds the last changed pixel, coarse result aligned up to 8 pixels boundary
|
|
static int coarse_backwards_linear_diff(uint16_t *framebuffer, uint16_t *prevFramebuffer, uint16_t *framebufferEnd)
|
|
{
|
|
uint16_t *endPtr;
|
|
asm volatile(
|
|
"mov r0, %[framebufferBegin]\n" // r0 <- pointer to beginning of current framebuffer
|
|
"mov r1, %[framebuffer]\n" // r1 <- current framebuffer (starting from end of framebuffer)
|
|
"mov r2, %[prevFramebuffer]\n" // r2 <- framebuffer of previous frame (starting from end of framebuffer)
|
|
|
|
"start_%=:\n"
|
|
"pld [r1, #-128]\n" // preload data caches for both current and previous framebuffers 128 bytes ahead of time
|
|
"pld [r2, #-128]\n"
|
|
|
|
"ldmdb r1!, {r3,r4,r5,r6}\n" // load 4x32-bit elements (8 pixels) of current framebuffer
|
|
"ldmdb r2!, {r7,r8,r9,r10}\n" // load corresponding 4x32-bit elements (8 pixels) of previous framebuffer
|
|
"cmp r3, r7\n" // compare all 8 pixels if they are different
|
|
"cmpeq r4, r8\n"
|
|
"cmpeq r5, r9\n"
|
|
"cmpeq r6, r10\n"
|
|
"bne end_%=\n" // if we found a difference, we are done
|
|
|
|
// Unroll once for another set of 4x32-bit elements. On Raspberry Pi Zero, data cache line is 32 bytes in size, so one iteration
|
|
// of the loop computes a single data cache line, with preloads in place at the top.
|
|
"ldmdb r1!, {r3,r4,r5,r6}\n"
|
|
"ldmdb r2!, {r7,r8,r9,r10}\n"
|
|
"cmp r3, r7\n"
|
|
"cmpeq r4, r8\n"
|
|
"cmpeq r5, r9\n"
|
|
"cmpeq r6, r10\n"
|
|
"bne end_%=\n" // if we found a difference, we are done
|
|
|
|
"cmp r0, r1\n" // framebuffer == framebufferEnd? did we finish through the array?
|
|
"bne start_%=\n"
|
|
"b done_%=\n"
|
|
|
|
"end_%=:\n"
|
|
"add r1, r1, #16\n" // ldmdb r1! decrements r1 before load, so add back the last decrement in order to not shoot past the first changed pixels
|
|
|
|
"done_%=:\n"
|
|
"mov %[endPtr], r1\n" // output endPtr back to C code
|
|
: [endPtr]"=r"(endPtr)
|
|
: [framebuffer]"r"(framebufferEnd), [prevFramebuffer]"r"(prevFramebuffer+(framebufferEnd-framebuffer)), [framebufferBegin]"r"(framebuffer)
|
|
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
|
|
);
|
|
return endPtr - framebuffer;
|
|
}
|
|
|
|
void DiffFramebuffersToSingleChangedRectangle(uint16_t *framebuffer, uint16_t *prevFramebuffer, Span *&head)
|
|
{
|
|
int minY = 0;
|
|
int minX = -1;
|
|
|
|
const int stride = gpuFramebufferScanlineStrideBytes>>1; // Stride as uint16 elements.
|
|
const int WidthAligned4 = (uint32_t)gpuFrameWidth & ~3u;
|
|
|
|
uint16_t *scanline = framebuffer;
|
|
uint16_t *prevScanline = prevFramebuffer;
|
|
|
|
static const bool framebufferSizeCompatibleWithCoarseDiff = gpuFramebufferScanlineStrideBytes == gpuFrameWidth*2 && gpuFramebufferScanlineStrideBytes*gpuFrameHeight % 32 == 0;
|
|
if (framebufferSizeCompatibleWithCoarseDiff)
|
|
{
|
|
int numPixels = gpuFrameWidth*gpuFrameHeight;
|
|
int firstDiff = coarse_linear_diff(framebuffer, prevFramebuffer, framebuffer + numPixels);
|
|
if (firstDiff == numPixels)
|
|
return; // No pixels changed, nothing to do.
|
|
// Coarse diff computes a diff at 8 adjacent pixels at a time, and returns the point to the 8-pixel aligned coordinate where the pixels began to differ.
|
|
// Compute the precise diff position here.
|
|
while(framebuffer[firstDiff] == prevFramebuffer[firstDiff]) ++firstDiff;
|
|
minX = firstDiff % gpuFrameWidth;
|
|
minY = firstDiff / gpuFrameWidth;
|
|
}
|
|
else
|
|
{
|
|
while(minY < gpuFrameHeight)
|
|
{
|
|
int x = 0;
|
|
// diff 4 pixels at a time
|
|
for(; x < WidthAligned4; x += 4)
|
|
{
|
|
uint64_t diff = *(uint64_t*)(scanline+x) ^ *(uint64_t*)(prevScanline+x);
|
|
if (diff)
|
|
{
|
|
minX = x + (__builtin_ctzll(diff) >> 4);
|
|
goto found_top;
|
|
}
|
|
}
|
|
// tail unaligned 0-3 pixels one by one
|
|
for(; x < gpuFrameWidth; ++x)
|
|
{
|
|
uint16_t diff = *(scanline+x) ^ *(prevScanline+x);
|
|
if (diff)
|
|
{
|
|
minX = x;
|
|
goto found_top;
|
|
}
|
|
}
|
|
scanline += stride;
|
|
prevScanline += stride;
|
|
++minY;
|
|
}
|
|
return; // No pixels changed, nothing to do.
|
|
}
|
|
found_top:
|
|
|
|
int maxX = -1;
|
|
int maxY = gpuFrameHeight-1;
|
|
|
|
if (framebufferSizeCompatibleWithCoarseDiff)
|
|
{
|
|
int numPixels = gpuFrameWidth*gpuFrameHeight;
|
|
int firstDiff = coarse_backwards_linear_diff(framebuffer, prevFramebuffer, framebuffer + numPixels);
|
|
// Coarse diff computes a diff at 8 adjacent pixels at a time, and returns the point to the 8-pixel aligned coordinate where the pixels began to differ.
|
|
// Compute the precise diff position here.
|
|
while(firstDiff > 0 && framebuffer[firstDiff] == prevFramebuffer[firstDiff]) --firstDiff;
|
|
maxX = firstDiff % gpuFrameWidth;
|
|
maxY = firstDiff / gpuFrameWidth;
|
|
}
|
|
else
|
|
{
|
|
scanline = framebuffer + (gpuFrameHeight - 1)*stride;
|
|
prevScanline = prevFramebuffer + (gpuFrameHeight - 1)*stride; // (same scanline from previous frame, not preceding scanline)
|
|
|
|
while(maxY >= minY)
|
|
{
|
|
int x = gpuFrameWidth-1;
|
|
// tail unaligned 0-3 pixels one by one
|
|
for(; x >= WidthAligned4; --x)
|
|
{
|
|
if (scanline[x] != prevScanline[x])
|
|
{
|
|
maxX = x;
|
|
goto found_bottom;
|
|
}
|
|
}
|
|
// diff 4 pixels at a time
|
|
x = x & ~3u;
|
|
for(; x >= 0; x -= 4)
|
|
{
|
|
uint64_t diff = *(uint64_t*)(scanline+x) ^ *(uint64_t*)(prevScanline+x);
|
|
if (diff)
|
|
{
|
|
maxX = x + 3 - (__builtin_clzll(diff) >> 4);
|
|
goto found_bottom;
|
|
}
|
|
}
|
|
scanline -= stride;
|
|
prevScanline -= stride;
|
|
--maxY;
|
|
}
|
|
}
|
|
found_bottom:
|
|
|
|
scanline = framebuffer + minY*stride;
|
|
prevScanline = prevFramebuffer + minY*stride;
|
|
int lastScanEndX = maxX;
|
|
if (minX > maxX) SWAPU32(minX, maxX);
|
|
int leftX = 0;
|
|
while(leftX < minX)
|
|
{
|
|
uint16_t *s = scanline + leftX;
|
|
uint16_t *prevS = prevScanline + leftX;
|
|
for(int y = minY; y <= maxY; ++y)
|
|
{
|
|
if (*s != *prevS)
|
|
goto found_left;
|
|
s += stride;
|
|
prevS += stride;
|
|
}
|
|
++leftX;
|
|
}
|
|
found_left:
|
|
|
|
int rightX = gpuFrameWidth-1;
|
|
while(rightX > maxX)
|
|
{
|
|
uint16_t *s = scanline + rightX;
|
|
uint16_t *prevS = prevScanline + rightX;
|
|
for(int y = minY; y <= maxY; ++y)
|
|
{
|
|
if (*s != *prevS)
|
|
goto found_right;
|
|
s += stride;
|
|
prevS += stride;
|
|
}
|
|
--rightX;
|
|
}
|
|
found_right:
|
|
|
|
head = spans;
|
|
head->x = leftX;
|
|
head->endX = rightX+1;
|
|
head->lastScanEndX = lastScanEndX+1;
|
|
head->y = minY;
|
|
head->endY = maxY+1;
|
|
|
|
#if defined(ALIGN_DIFF_TASKS_FOR_32B_CACHE_LINES) && defined(ALL_TASKS_SHOULD_DMA)
|
|
// Make sure the task is a multiple of 32 bytes wide so we can use a fast DMA copy
|
|
// algorithm later on. Currently this is only exploited in dma.cpp if ALL_TASKS_SHOULD_DMA
|
|
// option is enabled, so only enable it there.
|
|
head->x = MAX(0, ALIGN_DOWN(head->x, 16));
|
|
head->endX = MIN(gpuFrameWidth, ALIGN_UP(head->endX, 16));
|
|
head->lastScanEndX = ALIGN_UP(head->lastScanEndX, 16);
|
|
#endif
|
|
|
|
head->size = (head->endX-head->x)*(head->endY-head->y-1) + (head->lastScanEndX - head->x);
|
|
head->next = 0;
|
|
}
|
|
#endif
|
|
|
|
void DiffFramebuffersToScanlineSpansFastAndCoarse4Wide(uint16_t *framebuffer, uint16_t *prevFramebuffer, bool interlacedDiff, int interlacedFieldParity, Span *&head)
|
|
{
|
|
int numSpans = 0;
|
|
int y = interlacedDiff ? interlacedFieldParity : 0;
|
|
int yInc = interlacedDiff ? 2 : 1;
|
|
// If doing an interlaced update, skip over every second scanline.
|
|
int scanlineInc = interlacedDiff ? (gpuFramebufferScanlineStrideBytes>>2) : (gpuFramebufferScanlineStrideBytes>>3);
|
|
uint64_t *scanline = (uint64_t *)(framebuffer + y*(gpuFramebufferScanlineStrideBytes>>1));
|
|
uint64_t *prevScanline = (uint64_t *)(prevFramebuffer + y*(gpuFramebufferScanlineStrideBytes>>1)); // (same scanline from previous frame, not preceding scanline)
|
|
|
|
const int W = gpuFrameWidth>>2;
|
|
|
|
Span *span = spans;
|
|
while(y < gpuFrameHeight)
|
|
{
|
|
uint16_t *scanlineStart = (uint16_t *)scanline;
|
|
|
|
for(int x = 0; x < W;)
|
|
{
|
|
if (scanline[x] != prevScanline[x])
|
|
{
|
|
uint16_t *spanStart = (uint16_t *)(scanline + x) + (__builtin_ctzll(scanline[x] ^ prevScanline[x]) >> 4);
|
|
++x;
|
|
|
|
// We've found a start of a span of different pixels on this scanline, now find where this span ends
|
|
uint16_t *spanEnd;
|
|
for(;;)
|
|
{
|
|
if (x < W)
|
|
{
|
|
if (scanline[x] != prevScanline[x])
|
|
{
|
|
++x;
|
|
continue;
|
|
}
|
|
else
|
|
{
|
|
spanEnd = (uint16_t *)(scanline + x) + 1 - (__builtin_clzll(scanline[x-1] ^ prevScanline[x-1]) >> 4);
|
|
++x;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
spanEnd = scanlineStart + gpuFrameWidth;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Submit the span update task
|
|
span->x = spanStart - scanlineStart;
|
|
span->endX = span->lastScanEndX = spanEnd - scanlineStart;
|
|
span->y = y;
|
|
span->endY = y+1;
|
|
span->size = spanEnd - spanStart;
|
|
span->next = span+1;
|
|
++span;
|
|
++numSpans;
|
|
}
|
|
else
|
|
{
|
|
++x;
|
|
}
|
|
}
|
|
y += yInc;
|
|
scanline += scanlineInc;
|
|
prevScanline += scanlineInc;
|
|
}
|
|
|
|
if (numSpans > 0)
|
|
{
|
|
head = &spans[0];
|
|
spans[numSpans-1].next = 0;
|
|
}
|
|
else
|
|
head = 0;
|
|
}
|
|
|
|
void DiffFramebuffersToScanlineSpansExact(uint16_t *framebuffer, uint16_t *prevFramebuffer, bool interlacedDiff, int interlacedFieldParity, Span *&head)
|
|
{
|
|
int numSpans = 0;
|
|
int y = interlacedDiff ? interlacedFieldParity : 0;
|
|
int yInc = interlacedDiff ? 2 : 1;
|
|
// If doing an interlaced update, skip over every second scanline.
|
|
int scanlineInc = interlacedDiff ? gpuFramebufferScanlineStrideBytes : (gpuFramebufferScanlineStrideBytes>>1);
|
|
int scanlineEndInc = scanlineInc - gpuFrameWidth;
|
|
uint16_t *scanline = framebuffer + y*(gpuFramebufferScanlineStrideBytes>>1);
|
|
uint16_t *prevScanline = prevFramebuffer + y*(gpuFramebufferScanlineStrideBytes>>1); // (same scanline from previous frame, not preceding scanline)
|
|
|
|
while(y < gpuFrameHeight)
|
|
{
|
|
uint16_t *scanlineStart = scanline;
|
|
uint16_t *scanlineEnd = scanline + gpuFrameWidth;
|
|
while(scanline < scanlineEnd)
|
|
{
|
|
uint16_t *spanStart;
|
|
uint16_t *spanEnd;
|
|
int numConsecutiveUnchangedPixels = 0;
|
|
|
|
if (scanline + 1 < scanlineEnd)
|
|
{
|
|
uint32_t diff = (*(uint32_t *)scanline) ^ (*(uint32_t *)prevScanline);
|
|
scanline += 2;
|
|
prevScanline += 2;
|
|
|
|
if (diff == 0) // Both 1st and 2nd pixels are the same
|
|
continue;
|
|
|
|
if (diff & 0xFFFF == 0) // 1st pixels are the same, 2nd pixels are not
|
|
{
|
|
spanStart = scanline - 1;
|
|
spanEnd = scanline;
|
|
}
|
|
else // 1st pixels are different
|
|
{
|
|
spanStart = scanline - 2;
|
|
if ((diff & 0xFFFF0000u) != 0) // 2nd pixels are different?
|
|
{
|
|
spanEnd = scanline;
|
|
}
|
|
else
|
|
{
|
|
spanEnd = scanline - 1;
|
|
numConsecutiveUnchangedPixels = 1;
|
|
}
|
|
}
|
|
|
|
// We've found a start of a span of different pixels on this scanline, now find where this span ends
|
|
while(scanline < scanlineEnd)
|
|
{
|
|
if (*scanline++ != *prevScanline++)
|
|
{
|
|
spanEnd = scanline;
|
|
numConsecutiveUnchangedPixels = 0;
|
|
}
|
|
else
|
|
{
|
|
if (++numConsecutiveUnchangedPixels > SPAN_MERGE_THRESHOLD)
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else // handle the single last pixel on the row
|
|
{
|
|
if (*scanline++ == *prevScanline++)
|
|
break;
|
|
|
|
spanStart = scanline - 1;
|
|
spanEnd = scanline;
|
|
}
|
|
|
|
// Submit the span update task
|
|
Span *span = spans + numSpans;
|
|
span->x = spanStart - scanlineStart;
|
|
span->endX = span->lastScanEndX = spanEnd - scanlineStart;
|
|
span->y = y;
|
|
span->endY = y+1;
|
|
span->size = spanEnd - spanStart;
|
|
if (numSpans > 0) span[-1].next = span;
|
|
else head = span;
|
|
span->next = 0;
|
|
++numSpans;
|
|
}
|
|
y += yInc;
|
|
scanline += scanlineEndInc;
|
|
prevScanline += scanlineEndInc;
|
|
}
|
|
}
|
|
|
|
// Greedily merges spans of a singly linked span list in place. For every span (the anchor)
// the following spans are examined in order; a candidate is absorbed into the anchor when
// the merged span wastes at most SPAN_MERGE_THRESHOLD pixels compared to sending the two
// separately (and, if MAX_SPI_TASK_SIZE is defined, still fits into one SPI task).
// Absorbed spans are unlinked from the list.
//   listHead - first span of the list; may be null, in which case nothing happens.
void MergeScanlineSpanList(Span *listHead)
{
  for(Span *anchor = listHead; anchor; anchor = anchor->next)
  {
    Span *lastKept = anchor; // most recent node that is still linked into the list
    Span *cand = anchor->next;
    while(cand)
    {
      // The list is nondecreasing in Span::y, so once a candidate starts below the anchor's
      // end row every later candidate does as well: stop extending this anchor.
      if (cand->y > anchor->endY)
        break;

      // Bounding extents of anchor and candidate combined.
      int mergedX = MIN(anchor->x, cand->x);
      int mergedY = MIN(anchor->y, cand->y);
      int mergedEndX = MAX(anchor->endX, cand->endX);
      int mergedEndY = MAX(anchor->endY, cand->endY);

      // The last row's end comes from whichever span reaches further down; if both end on
      // the same row, from the one reaching further right.
      int mergedLastScanEndX;
      if (mergedEndY > anchor->endY)
        mergedLastScanEndX = cand->lastScanEndX;
      else if (mergedEndY > cand->endY)
        mergedLastScanEndX = anchor->lastScanEndX;
      else
        mergedLastScanEndX = MAX(anchor->lastScanEndX, cand->lastScanEndX);

      // Full rows for all rows but the last, plus the partial last row.
      int mergedSize = (mergedEndX-mergedX)*(mergedEndY-mergedY-1) + (mergedLastScanEndX - mergedX);
      int wasted = mergedSize - anchor->size - cand->size;

      bool shouldMerge = wasted <= SPAN_MERGE_THRESHOLD;
#ifdef MAX_SPI_TASK_SIZE
      shouldMerge = shouldMerge && mergedSize*SPI_BYTESPERPIXEL <= MAX_SPI_TASK_SIZE;
#endif

      if (shouldMerge)
      {
        // Grow the anchor to the merged extents and unlink the candidate.
        anchor->x = mergedX;
        anchor->y = mergedY;
        anchor->endX = mergedEndX;
        anchor->endY = mergedEndY;
        anchor->lastScanEndX = mergedLastScanEndX;
        anchor->size = mergedSize;
        lastKept->next = cand->next;
      }
      else
      {
        // Candidate stays in the list and becomes the predecessor for later unlinking.
        lastKept = cand;
      }

      cand = cand->next;
    }
  }
}
|