#include "config.h"
#include "diff.h"
#include "util.h"
#include "display.h"
#include "gpu.h"
#include "spi.h"
Span *spans = 0;
#ifdef UPDATE_FRAMES_WITHOUT_DIFFING
// Naive non-diffing functionality: just submit the whole display contents
void NoDiffChangedRectangle(Span *&head)
{
head = spans;
head->x = 0;
head->endX = head->lastScanEndX = gpuFrameWidth;
head->y = 0;
head->endY = gpuFrameHeight;
head->size = gpuFrameWidth*gpuFrameHeight;
head->next = 0;
}
#endif
#ifdef UPDATE_FRAMES_IN_SINGLE_RECTANGULAR_DIFF
// Coarse diffing of two framebuffers with tight stride, 16 pixels at a time.
// Finds the first changed pixel, with the coarse result aligned down to an 8-pixel boundary.
static int coarse_linear_diff(uint16_t *framebuffer, uint16_t *prevFramebuffer, uint16_t *framebufferEnd)
{
uint16_t *endPtr;
asm volatile(
"mov r0, %[framebufferEnd]\n" // r0 <- pointer to end of current framebuffer
"mov r1, %[framebuffer]\n" // r1 <- current framebuffer
"mov r2, %[prevFramebuffer]\n" // r2 <- framebuffer of previous frame
"start_%=:\n"
"pld [r1, #128]\n" // preload data caches for both current and previous framebuffers 128 bytes ahead of time
"pld [r2, #128]\n"
"ldmia r1!, {r3,r4,r5,r6}\n" // load 4x32-bit elements (8 pixels) of current framebuffer
"ldmia r2!, {r7,r8,r9,r10}\n" // load corresponding 4x32-bit elements (8 pixels) of previous framebuffer
"cmp r3, r7\n" // compare all 8 pixels if they are different
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
// Unroll once for another set of 4x32-bit elements. On Raspberry Pi Zero, a data cache line is 32 bytes in size, so one iteration
// of the loop processes one full data cache line, with the preloads at the top fetching ahead of time.
"ldmia r1!, {r3,r4,r5,r6}\n"
"ldmia r2!, {r7,r8,r9,r10}\n"
"cmp r3, r7\n"
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
"cmp r0, r1\n" // framebuffer == framebufferEnd? did we finish through the array?
"bne start_%=\n"
"b done_%=\n"
"end_%=:\n"
"sub r1, r1, #16\n" // ldmia r1! increments r1 after load, so subtract back the last increment in order to not shoot past the first changed pixels
"done_%=:\n"
"mov %[endPtr], r1\n" // output endPtr back to C code
: [endPtr]"=r"(endPtr)
: [framebuffer]"r"(framebuffer), [prevFramebuffer]"r"(prevFramebuffer), [framebufferEnd]"r"(framebufferEnd)
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
);
return endPtr - framebuffer;
}
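// For reference, a portable C sketch that is functionally equivalent to the assembly above: it
// compares 8 pixels (16 bytes) per step and returns the element index of the first differing
// 8-pixel block, or the full buffer length if nothing changed. This is not part of the upstream
// build; the COARSE_DIFF_C_REFERENCE guard is hypothetical and only keeps the sketch out of
// compilation.
#ifdef COARSE_DIFF_C_REFERENCE
static int coarse_linear_diff_c_reference(uint16_t *framebuffer, uint16_t *prevFramebuffer, uint16_t *framebufferEnd)
{
  const uint32_t *cur = (const uint32_t *)framebuffer;
  const uint32_t *prev = (const uint32_t *)prevFramebuffer;
  const uint32_t *end = (const uint32_t *)framebufferEnd;
  while(cur < end)
  {
    // Compare 8 pixels (4x32-bit words) at a time, mirroring the ldmia/cmp sequence above.
    if (cur[0] != prev[0] || cur[1] != prev[1] || cur[2] != prev[2] || cur[3] != prev[3])
      break;
    cur += 4;
    prev += 4;
  }
  return (int)((const uint16_t *)cur - framebuffer);
}
#endif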
// Same as coarse_linear_diff, but scans backwards from the end of the framebuffer:
// finds the last changed pixel, with the coarse result aligned up to an 8-pixel boundary.
static int coarse_backwards_linear_diff(uint16_t *framebuffer, uint16_t *prevFramebuffer, uint16_t *framebufferEnd)
{
uint16_t *endPtr;
asm volatile(
"mov r0, %[framebufferBegin]\n" // r0 <- pointer to beginning of current framebuffer
"mov r1, %[framebuffer]\n" // r1 <- current framebuffer (starting from end of framebuffer)
"mov r2, %[prevFramebuffer]\n" // r2 <- framebuffer of previous frame (starting from end of framebuffer)
"start_%=:\n"
"pld [r1, #-128]\n" // preload data caches for both current and previous framebuffers 128 bytes ahead of time
"pld [r2, #-128]\n"
"ldmdb r1!, {r3,r4,r5,r6}\n" // load 4x32-bit elements (8 pixels) of current framebuffer
"ldmdb r2!, {r7,r8,r9,r10}\n" // load corresponding 4x32-bit elements (8 pixels) of previous framebuffer
"cmp r3, r7\n" // compare all 8 pixels if they are different
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
// Unroll once for another set of 4x32-bit elements. On Raspberry Pi Zero, a data cache line is 32 bytes in size, so one iteration
// of the loop processes one full data cache line, with the preloads at the top fetching ahead of time.
"ldmdb r1!, {r3,r4,r5,r6}\n"
"ldmdb r2!, {r7,r8,r9,r10}\n"
"cmp r3, r7\n"
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
"cmp r0, r1\n" // framebuffer == framebufferEnd? did we finish through the array?
"bne start_%=\n"
"b done_%=\n"
"end_%=:\n"
"add r1, r1, #16\n" // ldmdb r1! decrements r1 before load, so add back the last decrement in order to not shoot past the first changed pixels
"done_%=:\n"
"mov %[endPtr], r1\n" // output endPtr back to C code
: [endPtr]"=r"(endPtr)
: [framebuffer]"r"(framebufferEnd), [prevFramebuffer]"r"(prevFramebuffer+(framebufferEnd-framebuffer)), [framebufferBegin]"r"(framebuffer)
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
);
return endPtr - framebuffer;
}
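// Computes a single bounding rectangle covering every pixel that changed between prevFramebuffer
// and framebuffer, and emits it as one Span through 'head'. The search runs in four phases: scan
// forward to find the topmost changed scanline, scan backward to find the bottommost changed
// scanline, then narrow in from the left and the right between those two scanlines to find the
// horizontal extents of the changed area.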
void DiffFramebuffersToSingleChangedRectangle(uint16_t *framebuffer, uint16_t *prevFramebuffer, Span *&head)
{
int minY = 0;
int minX = -1;
const int stride = gpuFramebufferScanlineStrideBytes>>1; // Stride as uint16 elements.
const int WidthAligned4 = (uint32_t)gpuFrameWidth & ~3u;
uint16_t *scanline = framebuffer;
uint16_t *prevScanline = prevFramebuffer;
static const bool framebufferSizeCompatibleWithCoarseDiff = gpuFramebufferScanlineStrideBytes == gpuFrameWidth*2 && gpuFramebufferScanlineStrideBytes*gpuFrameHeight % 32 == 0;
if (framebufferSizeCompatibleWithCoarseDiff)
{
int numPixels = gpuFrameWidth*gpuFrameHeight;
int firstDiff = coarse_linear_diff(framebuffer, prevFramebuffer, framebuffer + numPixels);
if (firstDiff == numPixels)
return; // No pixels changed, nothing to do.
// Coarse diff compares 8 adjacent pixels at a time and returns an index aligned down to an 8-pixel boundary, at or before the first changed pixel.
// Refine it to the exact first changed pixel here.
while(framebuffer[firstDiff] == prevFramebuffer[firstDiff]) ++firstDiff;
minX = firstDiff % gpuFrameWidth;
minY = firstDiff / gpuFrameWidth;
}
else
{
while(minY < gpuFrameHeight)
{
int x = 0;
// diff 4 pixels at a time
for(; x < WidthAligned4; x += 4)
{
uint64_t diff = *(uint64_t*)(scanline+x) ^ *(uint64_t*)(prevScanline+x);
if (diff)
{
minX = x + (__builtin_ctzll(diff) >> 4);
goto found_top;
}
}
// tail unaligned 0-3 pixels one by one
for(; x < gpuFrameWidth; ++x)
{
uint16_t diff = *(scanline+x) ^ *(prevScanline+x);
if (diff)
{
minX = x;
goto found_top;
}
}
scanline += stride;
prevScanline += stride;
++minY;
}
return; // No pixels changed, nothing to do.
}
found_top:
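// At this point minY is the topmost changed scanline and minX is the first changed pixel on it.
// minX is not necessarily the left edge of the final rectangle; the left/right edge searches
// below (together with the SWAPU32 fixup) establish the true horizontal extents.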
int maxX = -1;
int maxY = gpuFrameHeight-1;
if (framebufferSizeCompatibleWithCoarseDiff)
{
int numPixels = gpuFrameWidth*gpuFrameHeight;
int firstDiff = coarse_backwards_linear_diff(framebuffer, prevFramebuffer, framebuffer + numPixels);
// Coarse diff compares 8 adjacent pixels at a time and returns an index rounded up to an 8-pixel granularity, at or after the last changed pixel.
// Refine it backwards to the exact last changed pixel here.
while(firstDiff > 0 && framebuffer[firstDiff] == prevFramebuffer[firstDiff]) --firstDiff;
maxX = firstDiff % gpuFrameWidth;
maxY = firstDiff / gpuFrameWidth;
}
else
{
scanline = framebuffer + (gpuFrameHeight - 1)*stride;
prevScanline = prevFramebuffer + (gpuFrameHeight - 1)*stride; // (same scanline from previous frame, not preceding scanline)
while(maxY >= minY)
{
int x = gpuFrameWidth-1;
// tail unaligned 0-3 pixels one by one
for(; x >= WidthAligned4; --x)
{
if (scanline[x] != prevScanline[x])
{
maxX = x;
goto found_bottom;
}
}
// diff 4 pixels at a time
x = x & ~3u;
for(; x >= 0; x -= 4)
{
uint64_t diff = *(uint64_t*)(scanline+x) ^ *(uint64_t*)(prevScanline+x);
if (diff)
{
maxX = x + 3 - (__builtin_clzll(diff) >> 4);
goto found_bottom;
}
}
scanline -= stride;
prevScanline -= stride;
--maxY;
}
}
found_bottom:
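// At this point maxY is the bottommost changed scanline and maxX is the last changed pixel on it.
// lastScanEndX remembers where the update ends on that final scanline, so the produced task does
// not need to cover the full rectangle width on its last row.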
scanline = framebuffer + minY*stride;
prevScanline = prevFramebuffer + minY*stride;
int lastScanEndX = maxX;
if (minX > maxX) SWAPU32(minX, maxX);
int leftX = 0;
while(leftX < minX)
{
uint16_t *s = scanline + leftX;
uint16_t *prevS = prevScanline + leftX;
for(int y = minY; y <= maxY; ++y)
{
if (*s != *prevS)
goto found_left;
s += stride;
prevS += stride;
}
++leftX;
}
found_left:
int rightX = gpuFrameWidth-1;
while(rightX > maxX)
{
uint16_t *s = scanline + rightX;
uint16_t *prevS = prevScanline + rightX;
for(int y = minY; y <= maxY; ++y)
{
if (*s != *prevS)
goto found_right;
s += stride;
prevS += stride;
}
--rightX;
}
found_right:
head = spans;
head->x = leftX;
head->endX = rightX+1;
head->lastScanEndX = lastScanEndX+1;
head->y = minY;
head->endY = maxY+1;
#if defined(ALIGN_DIFF_TASKS_FOR_32B_CACHE_LINES) && defined(ALL_TASKS_SHOULD_DMA)
// Make sure the task is a multiple of 32 bytes wide so a fast DMA copy
// algorithm can be used later on. Currently this is only exploited in dma.cpp when the ALL_TASKS_SHOULD_DMA
// option is enabled, so the alignment is only applied in that configuration.
head->x = MAX(0, ALIGN_DOWN(head->x, 16));
head->endX = MIN(gpuFrameWidth, ALIGN_UP(head->endX, 16));
head->lastScanEndX = ALIGN_UP(head->lastScanEndX, 16);
#endif
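// The size counts full rectangle rows for all but the last scanline, plus the partial last
// scanline that only extends up to lastScanEndX.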
head->size = (head->endX-head->x)*(head->endY-head->y-1) + (head->lastScanEndX - head->x);
head->next = 0;
}
#endif
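// Diffs the two framebuffers scanline by scanline at a coarse 4-pixel (64-bit) granularity,
// producing one single-scanline Span per contiguous run of changed pixels. Span start and end
// positions are refined to exact pixel coordinates with ctz/clz on the differing 64-bit words,
// but runs separated by fewer than four identical pixels may get fused into one span. If
// interlacedDiff is set, only every second scanline (selected by interlacedFieldParity) is examined.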
void DiffFramebuffersToScanlineSpansFastAndCoarse4Wide(uint16_t *framebuffer, uint16_t *prevFramebuffer, bool interlacedDiff, int interlacedFieldParity, Span *&head)
{
int numSpans = 0;
int y = interlacedDiff ? interlacedFieldParity : 0;
int yInc = interlacedDiff ? 2 : 1;
// If doing an interlaced update, skip over every second scanline.
int scanlineInc = interlacedDiff ? (gpuFramebufferScanlineStrideBytes>>2) : (gpuFramebufferScanlineStrideBytes>>3);
uint64_t *scanline = (uint64_t *)(framebuffer + y*(gpuFramebufferScanlineStrideBytes>>1));
uint64_t *prevScanline = (uint64_t *)(prevFramebuffer + y*(gpuFramebufferScanlineStrideBytes>>1)); // (same scanline from previous frame, not preceding scanline)
const int W = gpuFrameWidth>>2;
Span *span = spans;
while(y < gpuFrameHeight)
{
uint16_t *scanlineStart = (uint16_t *)scanline;
for(int x = 0; x < W;)
{
if (scanline[x] != prevScanline[x])
{
uint16_t *spanStart = (uint16_t *)(scanline + x) + (__builtin_ctzll(scanline[x] ^ prevScanline[x]) >> 4);
++x;
// We've found a start of a span of different pixels on this scanline, now find where this span ends
uint16_t *spanEnd;
for(;;)
{
if (x < W)
{
if (scanline[x] != prevScanline[x])
{
++x;
continue;
}
else
{
spanEnd = (uint16_t *)(scanline + x) + 1 - (__builtin_clzll(scanline[x-1] ^ prevScanline[x-1]) >> 4);
++x;
break;
}
}
else
{
spanEnd = scanlineStart + gpuFrameWidth;
break;
}
}
// Submit the span update task
span->x = spanStart - scanlineStart;
span->endX = span->lastScanEndX = spanEnd - scanlineStart;
span->y = y;
span->endY = y+1;
span->size = spanEnd - spanStart;
span->next = span+1;
++span;
++numSpans;
}
else
{
++x;
}
}
y += yInc;
scanline += scanlineInc;
prevScanline += scanlineInc;
}
if (numSpans > 0)
{
head = &spans[0];
spans[numSpans-1].next = 0;
}
else
head = 0;
}
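// Exact per-pixel variant of the scanline diff above: spans begin and end precisely at changed
// pixels, and a span is only terminated once more than SPAN_MERGE_THRESHOLD consecutive unchanged
// pixels have been seen, which merges nearby changes on the same scanline into one span up front.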
void DiffFramebuffersToScanlineSpansExact(uint16_t *framebuffer, uint16_t *prevFramebuffer, bool interlacedDiff, int interlacedFieldParity, Span *&head)
{
int numSpans = 0;
int y = interlacedDiff ? interlacedFieldParity : 0;
int yInc = interlacedDiff ? 2 : 1;
// If doing an interlaced update, skip over every second scanline.
int scanlineInc = interlacedDiff ? gpuFramebufferScanlineStrideBytes : (gpuFramebufferScanlineStrideBytes>>1);
int scanlineEndInc = scanlineInc - gpuFrameWidth;
uint16_t *scanline = framebuffer + y*(gpuFramebufferScanlineStrideBytes>>1);
uint16_t *prevScanline = prevFramebuffer + y*(gpuFramebufferScanlineStrideBytes>>1); // (same scanline from previous frame, not preceding scanline)
while(y < gpuFrameHeight)
{
uint16_t *scanlineStart = scanline;
uint16_t *scanlineEnd = scanline + gpuFrameWidth;
while(scanline < scanlineEnd)
{
uint16_t *spanStart;
uint16_t *spanEnd;
int numConsecutiveUnchangedPixels = 0;
if (scanline + 1 < scanlineEnd)
{
uint32_t diff = (*(uint32_t *)scanline) ^ (*(uint32_t *)prevScanline);
scanline += 2;
prevScanline += 2;
if (diff == 0) // Both 1st and 2nd pixels are the same
continue;
if ((diff & 0xFFFF) == 0) // 1st pixels are the same, 2nd pixels are not
{
spanStart = scanline - 1;
spanEnd = scanline;
}
else // 1st pixels are different
{
spanStart = scanline - 2;
if ((diff & 0xFFFF0000u) != 0) // 2nd pixels are different?
{
spanEnd = scanline;
}
else
{
spanEnd = scanline - 1;
numConsecutiveUnchangedPixels = 1;
}
}
// We've found a start of a span of different pixels on this scanline, now find where this span ends
while(scanline < scanlineEnd)
{
if (*scanline++ != *prevScanline++)
{
spanEnd = scanline;
numConsecutiveUnchangedPixels = 0;
}
else
{
if (++numConsecutiveUnchangedPixels > SPAN_MERGE_THRESHOLD)
break;
}
}
}
else // handle the single last pixel on the row
{
if (*scanline++ == *prevScanline++)
break;
spanStart = scanline - 1;
spanEnd = scanline;
}
// Submit the span update task
Span *span = spans + numSpans;
span->x = spanStart - scanlineStart;
span->endX = span->lastScanEndX = spanEnd - scanlineStart;
span->y = y;
span->endY = y+1;
span->size = spanEnd - spanStart;
if (numSpans > 0) span[-1].next = span;
else head = span;
span->next = 0;
++numSpans;
}
y += yInc;
scanline += scanlineEndInc;
prevScanline += scanlineEndInc;
}
}
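// Walks the span list (which is ordered by nondecreasing Span::y) and merges pairs of spans
// whenever combining them into their bounding rectangle would waste at most SPAN_MERGE_THRESHOLD
// pixels that did not actually change (and, if MAX_SPI_TASK_SIZE is defined, the merged task
// still fits under that limit). Fewer, larger tasks reduce per-task overhead at the cost of
// re-sending a few unchanged pixels.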
void MergeScanlineSpanList(Span *listHead)
{
for(Span *i = listHead; i; i = i->next)
{
Span *prev = i;
for(Span *j = i->next; j; j = j->next)
{
// If the spans i and j are vertically apart, don't attempt to merge span i any further, since all spans >= j will also be farther vertically apart.
// (the list is nondecreasing with respect to Span::y)
if (j->y > i->endY) break;
// Compute the bounds that merging spans i and j would produce, and how many unchanged pixels the merged span would waste
int x = MIN(i->x, j->x);
int y = MIN(i->y, j->y);
int endX = MAX(i->endX, j->endX);
int endY = MAX(i->endY, j->endY);
int lastScanEndX = (endY > i->endY) ? j->lastScanEndX : ((endY > j->endY) ? i->lastScanEndX : MAX(i->lastScanEndX, j->lastScanEndX));
int newSize = (endX-x)*(endY-y-1) + (lastScanEndX - x);
int wastedPixels = newSize - i->size - j->size;
if (wastedPixels <= SPAN_MERGE_THRESHOLD
#ifdef MAX_SPI_TASK_SIZE
&& newSize*SPI_BYTESPERPIXEL <= MAX_SPI_TASK_SIZE
#endif
)
{
i->x = x;
i->y = y;
i->endX = endX;
i->endY = endY;
i->lastScanEndX = lastScanEndX;
i->size = newSize;
prev->next = j->next;
j = prev;
}
else // Not merging - travel to next node remembering where we came from
prev = j;
}
}
}
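// For illustration, a minimal sketch of how a caller might drive these diff functions and walk the
// resulting span list. The DIFF_USAGE_SKETCH guard and the SubmitSpanForTransfer() helper are
// hypothetical and not part of this codebase; the real consumer of the span list lives elsewhere
// in the program.
#ifdef DIFF_USAGE_SKETCH
static void SubmitSpanForTransfer(uint16_t *framebuffer, Span *s)
{
  // A real implementation would queue the rectangle (s->x, s->y)-(s->endX, s->endY) for transfer
  // to the display; here we only illustrate how the produced list is traversed.
  (void)framebuffer; (void)s;
}

static void DiffAndSubmit(uint16_t *framebuffer, uint16_t *prevFramebuffer)
{
  Span *head = 0;
  DiffFramebuffersToScanlineSpansExact(framebuffer, prevFramebuffer, /*interlacedDiff=*/false, /*interlacedFieldParity=*/0, head);
  MergeScanlineSpanList(head);
  for(Span *s = head; s; s = s->next)
    SubmitSpanForTransfer(framebuffer, s);
}
#endif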