#include "config.h"
#include "diff.h"
#include "util.h"
#include "display.h"
#include "gpu.h"
#include "spi.h"
Span *spans = 0;
#ifdef UPDATE_FRAMES_WITHOUT_DIFFING
// Naive non-diffing functionality: just submit the whole display contents
void NoDiffChangedRectangle(Span *&head)
{
head = spans;
head->x = 0;
head->endX = head->lastScanEndX = gpuFrameWidth;
head->y = 0;
head->endY = gpuFrameHeight;
head->size = gpuFrameWidth*gpuFrameHeight;
head->next = 0;
}
#endif
#ifdef UPDATE_FRAMES_IN_SINGLE_RECTANGULAR_DIFF
// Coarse diffing of two framebuffers with tight stride, 16 pixels at a time.
// Finds the first changed pixel, with the coarse result aligned down to an 8-pixel boundary.
static int coarse_linear_diff(uint16_t *framebuffer, uint16_t *prevFramebuffer, uint16_t *framebufferEnd)
{
uint16_t *endPtr;
asm volatile(
"mov r0, %[framebufferEnd]\n" // r0 <- pointer to end of current framebuffer
"mov r1, %[framebuffer]\n" // r1 <- current framebuffer
"mov r2, %[prevFramebuffer]\n" // r2 <- framebuffer of previous frame
"start_%=:\n"
"pld [r1, #128]\n" // preload data caches for both current and previous framebuffers 128 bytes ahead of time
"pld [r2, #128]\n"
"ldmia r1!, {r3,r4,r5,r6}\n" // load 4x32-bit elements (8 pixels) of current framebuffer
"ldmia r2!, {r7,r8,r9,r10}\n" // load corresponding 4x32-bit elements (8 pixels) of previous framebuffer
"cmp r3, r7\n" // compare all 8 pixels if they are different
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
// Unroll once for another set of 4x32-bit elements. On Raspberry Pi Zero, a data cache line is 32 bytes in size, so one iteration
// of the loop processes one full data cache line, with the preloads at the top fetching ahead of time.
"ldmia r1!, {r3,r4,r5,r6}\n"
"ldmia r2!, {r7,r8,r9,r10}\n"
"cmp r3, r7\n"
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
"cmp r0, r1\n" // framebuffer == framebufferEnd? did we finish through the array?
"bne start_%=\n"
"b done_%=\n"
"end_%=:\n"
"sub r1, r1, #16\n" // ldmia r1! increments r1 after load, so subtract back the last increment in order to not shoot past the first changed pixels
"done_%=:\n"
"mov %[endPtr], r1\n" // output endPtr back to C code
: [endPtr]"=r"(endPtr)
: [framebuffer]"r"(framebuffer), [prevFramebuffer]"r"(prevFramebuffer), [framebufferEnd]"r"(framebufferEnd)
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
);
return endPtr - framebuffer;
}
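// For reference, a portable C sketch that is functionally equivalent to the assembly above: it
// compares 8 pixels (16 bytes) per step and returns the element index of the first differing
// 8-pixel block, or the full buffer length if nothing changed. This is not part of the upstream
// build; the COARSE_DIFF_C_REFERENCE guard is hypothetical and only keeps the sketch out of
// compilation.
#ifdef COARSE_DIFF_C_REFERENCE
static int coarse_linear_diff_c_reference(uint16_t *framebuffer, uint16_t *prevFramebuffer, uint16_t *framebufferEnd)
{
  const uint32_t *cur = (const uint32_t *)framebuffer;
  const uint32_t *prev = (const uint32_t *)prevFramebuffer;
  const uint32_t *end = (const uint32_t *)framebufferEnd;
  while(cur < end)
  {
    // Compare 8 pixels (4x32-bit words) at a time, mirroring the ldmia/cmp sequence above.
    if (cur[0] != prev[0] || cur[1] != prev[1] || cur[2] != prev[2] || cur[3] != prev[3])
      break;
    cur += 4;
    prev += 4;
  }
  return (int)((const uint16_t *)cur - framebuffer);
}
#endif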
// Same as coarse_linear_diff, but scans backwards from the end of the framebuffer:
// finds the last changed pixel, with the coarse result aligned up to an 8-pixel boundary.
static int coarse_backwards_linear_diff(uint16_t *framebuffer, uint16_t *prevFramebuffer, uint16_t *framebufferEnd)
{
uint16_t *endPtr;
asm volatile(
"mov r0, %[framebufferBegin]\n" // r0 <- pointer to beginning of current framebuffer
"mov r1, %[framebuffer]\n" // r1 <- current framebuffer (starting from end of framebuffer)
"mov r2, %[prevFramebuffer]\n" // r2 <- framebuffer of previous frame (starting from end of framebuffer)
"start_%=:\n"
"pld [r1, #-128]\n" // preload data caches for both current and previous framebuffers 128 bytes ahead of time
"pld [r2, #-128]\n"
"ldmdb r1!, {r3,r4,r5,r6}\n" // load 4x32-bit elements (8 pixels) of current framebuffer
"ldmdb r2!, {r7,r8,r9,r10}\n" // load corresponding 4x32-bit elements (8 pixels) of previous framebuffer
"cmp r3, r7\n" // compare all 8 pixels if they are different
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
// Unroll once for another set of 4x32-bit elements. On Raspberry Pi Zero, a data cache line is 32 bytes in size, so one iteration
// of the loop processes one full data cache line, with the preloads at the top fetching ahead of time.
"ldmdb r1!, {r3,r4,r5,r6}\n"
"ldmdb r2!, {r7,r8,r9,r10}\n"
"cmp r3, r7\n"
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
"cmp r0, r1\n" // framebuffer == framebufferEnd? did we finish through the array?
"bne start_%=\n"
"b done_%=\n"
"end_%=:\n"
"add r1, r1, #16\n" // ldmdb r1! decrements r1 before load, so add back the last decrement in order to not shoot past the first changed pixels
"done_%=:\n"
"mov %[endPtr], r1\n" // output endPtr back to C code
: [endPtr]"=r"(endPtr)
: [framebuffer]"r"(framebufferEnd), [prevFramebuffer]"r"(prevFramebuffer+(framebufferEnd-framebuffer)), [framebufferBegin]"r"(framebuffer)
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
);
return endPtr - framebuffer;
}
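// Computes a single bounding rectangle covering every pixel that changed between prevFramebuffer
// and framebuffer, and emits it as one Span through 'head'. The search runs in four phases: scan
// forward to find the topmost changed scanline, scan backward to find the bottommost changed
// scanline, then narrow in from the left and the right between those two scanlines to find the
// horizontal extents of the changed area.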
void DiffFramebuffersToSingleChangedRectangle(uint16_t *framebuffer, uint16_t *prevFramebuffer, Span *&head)
{
int minY = 0;
int minX = -1;
const int stride = gpuFramebufferScanlineStrideBytes>>1; // Stride as uint16 elements.
const int WidthAligned4 = (uint32_t)gpuFrameWidth & ~3u;
uint16_t *scanline = framebuffer;
uint16_t *prevScanline = prevFramebuffer;
static const bool framebufferSizeCompatibleWithCoarseDiff = gpuFramebufferScanlineStrideBytes == gpuFrameWidth*2 && gpuFramebufferScanlineStrideBytes*gpuFrameHeight % 32 == 0;
if (framebufferSizeCompatibleWithCoarseDiff)
{
int numPixels = gpuFrameWidth*gpuFrameHeight;
int firstDiff = coarse_linear_diff(framebuffer, prevFramebuffer, framebuffer + numPixels);
if (firstDiff == numPixels)
return; // No pixels changed, nothing to do.
// Coarse diff compares 8 adjacent pixels at a time and returns an index aligned down to an 8-pixel boundary, at or before the first changed pixel.
// Refine it to the exact first changed pixel here.
while(framebuffer[firstDiff] == prevFramebuffer[firstDiff]) ++firstDiff;
minX = firstDiff % gpuFrameWidth;
minY = firstDiff / gpuFrameWidth;
}
else
{
while(minY < gpuFrameHeight)
{
int x = 0;
// diff 4 pixels at a time
for(; x < WidthAligned4; x += 4)
{
uint64_t diff = *(uint64_t*)(scanline+x) ^ *(uint64_t*)(prevScanline+x);
if (diff)
{
minX = x + (__builtin_ctzll(diff) >> 4);
goto found_top;
}
}
// tail unaligned 0-3 pixels one by one
for(; x < gpuFrameWidth; ++x)
{
uint16_t diff = *(scanline+x) ^ *(prevScanline+x);
if (diff)
{
minX = x;
goto found_top;
}
}
scanline += stride;
prevScanline += stride;
++minY;
}
return; // No pixels changed, nothing to do.
}
found_top:
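// At this point minY is the topmost changed scanline and minX is the first changed pixel on it.
// minX is not necessarily the left edge of the final rectangle; the left/right edge searches
// below (together with the SWAPU32 fixup) establish the true horizontal extents.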
int maxX = -1;
int maxY = gpuFrameHeight-1;
if (framebufferSizeCompatibleWithCoarseDiff)
{
int numPixels = gpuFrameWidth*gpuFrameHeight;
int firstDiff = coarse_backwards_linear_diff(framebuffer, prevFramebuffer, framebuffer + numPixels);
// Coarse diff compares 8 adjacent pixels at a time and returns an index rounded up to an 8-pixel granularity, at or after the last changed pixel.
// Refine it backwards to the exact last changed pixel here.
while(firstDiff > 0 && framebuffer[firstDiff] == prevFramebuffer[firstDiff]) --firstDiff;
maxX = firstDiff % gpuFrameWidth;
maxY = firstDiff / gpuFrameWidth;
}
else
{
scanline = framebuffer + (gpuFrameHeight - 1)*stride;
prevScanline = prevFramebuffer + (gpuFrameHeight - 1)*stride; // (same scanline from previous frame, not preceding scanline)
while(maxY >= minY)
{
int x = gpuFrameWidth-1;
// tail unaligned 0-3 pixels one by one
for(; x >= WidthAligned4; --x)
{
if (scanline[x] != prevScanline[x])
{
maxX = x;
goto found_bottom;
}
}
// diff 4 pixels at a time
x = x & ~3u;
for(; x >= 0; x -= 4)
{
uint64_t diff = *(uint64_t*)(scanline+x) ^ *(uint64_t*)(prevScanline+x);
if (diff)
{
maxX = x + 3 - (__builtin_clzll(diff) >> 4);
goto found_bottom;
}
}
scanline -= stride;
prevScanline -= stride;
--maxY;
}
}
found_bottom:
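// At this point maxY is the bottommost changed scanline and maxX is the last changed pixel on it.
// lastScanEndX remembers where the update ends on that final scanline, so the produced task does
// not need to cover the full rectangle width on its last row.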
scanline = framebuffer + minY*stride;
prevScanline = prevFramebuffer + minY*stride;
int lastScanEndX = maxX;
if (minX > maxX) SWAPU32(minX, maxX);
int leftX = 0;
while(leftX < minX)
{
uint16_t *s = scanline + leftX;
uint16_t *prevS = prevScanline + leftX;
for(int y = minY; y <= maxY; ++y)
{
if (*s != *prevS)
goto found_left;
s += stride;
prevS += stride;
}
++leftX;
}
found_left:
int rightX = gpuFrameWidth-1;
while(rightX > maxX)
{
uint16_t *s = scanline + rightX;
uint16_t *prevS = prevScanline + rightX;
for(int y = minY; y <= maxY; ++y)
{
if (*s != *prevS)
goto found_right;
s += stride;
prevS += stride;
}
--rightX;
}
found_right:
head = spans;
head->x = leftX;
head->endX = rightX+1;
head->lastScanEndX = lastScanEndX+1;
head->y = minY;
head->endY = maxY+1;
#if defined(ALIGN_DIFF_TASKS_FOR_32B_CACHE_LINES) && defined(ALL_TASKS_SHOULD_DMA)
// Make sure the task is a multiple of 32 bytes wide so a fast DMA copy
// algorithm can be used later on. Currently this is only exploited in dma.cpp when the ALL_TASKS_SHOULD_DMA
// option is enabled, so the alignment is only applied in that configuration.
head->x = MAX(0, ALIGN_DOWN(head->x, 16));
head->endX = MIN(gpuFrameWidth, ALIGN_UP(head->endX, 16));
head->lastScanEndX = ALIGN_UP(head->lastScanEndX, 16);
#endif
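// The size counts full rectangle rows for all but the last scanline, plus the partial last
// scanline that only extends up to lastScanEndX.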
head->size = (head->endX-head->x)*(head->endY-head->y-1) + (head->lastScanEndX - head->x);
head->next = 0;
}
#endif
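// Diffs the two framebuffers scanline by scanline at a coarse 4-pixel (64-bit) granularity,
// producing one single-scanline Span per contiguous run of changed pixels. Span start and end
// positions are refined to exact pixel coordinates with ctz/clz on the differing 64-bit words,
// but runs separated by fewer than four identical pixels may get fused into one span. If
// interlacedDiff is set, only every second scanline (selected by interlacedFieldParity) is examined.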
void DiffFramebuffersToScanlineSpansFastAndCoarse4Wide(uint16_t *framebuffer, uint16_t *prevFramebuffer, bool interlacedDiff, int interlacedFieldParity, Span *&head)
{
int numSpans = 0;
int y = interlacedDiff ? interlacedFieldParity : 0;
int yInc = interlacedDiff ? 2 : 1;
// If doing an interlaced update, skip over every second scanline.
int scanlineInc = interlacedDiff ? (gpuFramebufferScanlineStrideBytes>>2) : (gpuFramebufferScanlineStrideBytes>>3);
uint64_t *scanline = (uint64_t *)(framebuffer + y*(gpuFramebufferScanlineStrideBytes>>1));
uint64_t *prevScanline = (uint64_t *)(prevFramebuffer + y*(gpuFramebufferScanlineStrideBytes>>1)); // (same scanline from previous frame, not preceding scanline)
const int W = gpuFrameWidth>>2;
Span *span = spans;
while(y < gpuFrameHeight)
{
uint16_t *scanlineStart = (uint16_t *)scanline;
for(int x = 0; x < W;)
{
if (scanline[x] != prevScanline[x])
{
uint16_t *spanStart = (uint16_t *)(scanline + x) + (__builtin_ctzll(scanline[x] ^ prevScanline[x]) >> 4);
++x;
// We've found a start of a span of different pixels on this scanline, now find where this span ends
uint16_t *spanEnd;
for(;;)
{
if (x < W)
{
if (scanline[x] != prevScanline[x])
{
++x;
continue;
}
else
{
spanEnd = (uint16_t *)(scanline + x) + 1 - (__builtin_clzll(scanline[x-1] ^ prevScanline[x-1]) >> 4);
++x;
break;
}
}
else
{
spanEnd = scanlineStart + gpuFrameWidth;
break;
}
}
// Submit the span update task
span->x = spanStart - scanlineStart;
span->endX = span->lastScanEndX = spanEnd - scanlineStart;
span->y = y;
span->endY = y+1;
span->size = spanEnd - spanStart;
span->next = span+1;
++span;
++numSpans;
}
else
{
++x;
}
}
y += yInc;
scanline += scanlineInc;
prevScanline += scanlineInc;
}
if (numSpans > 0)
{
head = &spans[0];
spans[numSpans-1].next = 0;
}
else
head = 0;
}
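// Exact per-pixel variant of the scanline diff above: spans begin and end precisely at changed
// pixels, and a span is only terminated once more than SPAN_MERGE_THRESHOLD consecutive unchanged
// pixels have been seen, which merges nearby changes on the same scanline into one span up front.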
void DiffFramebuffersToScanlineSpansExact(uint16_t *framebuffer, uint16_t *prevFramebuffer, bool interlacedDiff, int interlacedFieldParity, Span *&head)
{
int numSpans = 0;
int y = interlacedDiff ? interlacedFieldParity : 0;
int yInc = interlacedDiff ? 2 : 1;
// If doing an interlaced update, skip over every second scanline.
int scanlineInc = interlacedDiff ? gpuFramebufferScanlineStrideBytes : (gpuFramebufferScanlineStrideBytes>>1);
int scanlineEndInc = scanlineInc - gpuFrameWidth;
uint16_t *scanline = framebuffer + y*(gpuFramebufferScanlineStrideBytes>>1);
uint16_t *prevScanline = prevFramebuffer + y*(gpuFramebufferScanlineStrideBytes>>1); // (same scanline from previous frame, not preceding scanline)
while(y < gpuFrameHeight)
{
uint16_t *scanlineStart = scanline;
uint16_t *scanlineEnd = scanline + gpuFrameWidth;
while(scanline < scanlineEnd)
{
uint16_t *spanStart;
uint16_t *spanEnd;
int numConsecutiveUnchangedPixels = 0;
if (scanline + 1 < scanlineEnd)
{
uint32_t diff = (*(uint32_t *)scanline) ^ (*(uint32_t *)prevScanline);
scanline += 2;
prevScanline += 2;
if (diff == 0) // Both 1st and 2nd pixels are the same
continue;
if ((diff & 0xFFFF) == 0) // 1st pixels are the same, 2nd pixels are not
{
spanStart = scanline - 1;
spanEnd = scanline;
}
else // 1st pixels are different
{
spanStart = scanline - 2;
if ((diff & 0xFFFF0000u) != 0) // 2nd pixels are different?
{
spanEnd = scanline;
}
else
{
spanEnd = scanline - 1;
numConsecutiveUnchangedPixels = 1;
}
}
// We've found a start of a span of different pixels on this scanline, now find where this span ends
while(scanline < scanlineEnd)
{
if (*scanline++ != *prevScanline++)
{
spanEnd = scanline;
numConsecutiveUnchangedPixels = 0;
}
else
{
if (++numConsecutiveUnchangedPixels > SPAN_MERGE_THRESHOLD)
break;
}
}
}
else // handle the single last pixel on the row
{
if (*scanline++ == *prevScanline++)
break;
spanStart = scanline - 1;
spanEnd = scanline;
}
// Submit the span update task
Span *span = spans + numSpans;
span->x = spanStart - scanlineStart;
span->endX = span->lastScanEndX = spanEnd - scanlineStart;
span->y = y;
span->endY = y+1;
span->size = spanEnd - spanStart;
if (numSpans > 0) span[-1].next = span;
else head = span;
span->next = 0;
++numSpans;
}
y += yInc;
scanline += scanlineEndInc;
prevScanline += scanlineEndInc;
}
}
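// Walks the span list (which is ordered by nondecreasing Span::y) and merges pairs of spans
// whenever combining them into their bounding rectangle would waste at most SPAN_MERGE_THRESHOLD
// pixels that did not actually change (and, if MAX_SPI_TASK_SIZE is defined, the merged task
// still fits under that limit). Fewer, larger tasks reduce per-task overhead at the cost of
// re-sending a few unchanged pixels.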
void MergeScanlineSpanList(Span *listHead)
{
for(Span *i = listHead; i; i = i->next)
{
Span *prev = i;
for(Span *j = i->next; j; j = j->next)
{
// If the spans i and j are vertically apart, don't attempt to merge span i any further, since all spans >= j will also be farther vertically apart.
// (the list is nondecreasing with respect to Span::y)
if (j->y > i->endY) break;
// Compute the bounds that merging spans i and j would produce, and how many unchanged pixels the merged span would waste
int x = MIN(i->x, j->x);
int y = MIN(i->y, j->y);
int endX = MAX(i->endX, j->endX);
int endY = MAX(i->endY, j->endY);
int lastScanEndX = (endY > i->endY) ? j->lastScanEndX : ((endY > j->endY) ? i->lastScanEndX : MAX(i->lastScanEndX, j->lastScanEndX));
int newSize = (endX-x)*(endY-y-1) + (lastScanEndX - x);
int wastedPixels = newSize - i->size - j->size;
if (wastedPixels <= SPAN_MERGE_THRESHOLD
#ifdef MAX_SPI_TASK_SIZE
&& newSize*SPI_BYTESPERPIXEL <= MAX_SPI_TASK_SIZE
#endif
)
{
i->x = x;
i->y = y;
i->endX = endX;
i->endY = endY;
i->lastScanEndX = lastScanEndX;
i->size = newSize;
prev->next = j->next;
j = prev;
}
else // Not merging - travel to next node remembering where we came from
prev = j;
}
}
}
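// For illustration, a minimal sketch of how a caller might drive these diff functions and walk the
// resulting span list. The DIFF_USAGE_SKETCH guard and the SubmitSpanForTransfer() helper are
// hypothetical and not part of this codebase; the real consumer of the span list lives elsewhere
// in the program.
#ifdef DIFF_USAGE_SKETCH
static void SubmitSpanForTransfer(uint16_t *framebuffer, Span *s)
{
  // A real implementation would queue the rectangle (s->x, s->y)-(s->endX, s->endY) for transfer
  // to the display; here we only illustrate how the produced list is traversed.
  (void)framebuffer; (void)s;
}

static void DiffAndSubmit(uint16_t *framebuffer, uint16_t *prevFramebuffer)
{
  Span *head = 0;
  DiffFramebuffersToScanlineSpansExact(framebuffer, prevFramebuffer, /*interlacedDiff=*/false, /*interlacedFieldParity=*/0, head);
  MergeScanlineSpanList(head);
  for(Span *s = head; s; s = s->next)
    SubmitSpanForTransfer(framebuffer, s);
}
#endif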