~eliasnaur/gio

02185461610ec5339bd70007db826a0c3363a439 — Elias Naur
gpu/shaders: import compute programs from github.com/linebender/piet-gpu

The piet-gpu project is dual-licensed under the Apache 2.0 and MIT licenses,
and the shaders themselves are also offered under the terms of the Unlicense.
See

https://github.com/linebender/piet-gpu#license-and-contributions, as of commit
72e2dfab3da8ae1adf7a0fb056b71ccbc4cfa29a:

"The piet-gpu project is dual-licensed under both Apache 2.0 and MIT licenses.

In addition, the shaders are provided under the terms of the Unlicense. The
intent is for this research to be used in as broad a context as possible."

Signed-off-by: Elias Naur <mail@eliasnaur.com>
A gpu/shaders/annotated.h => gpu/shaders/annotated.h +239 -0
@@ -0,0 +1,239 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct AnnoFillRef {
    uint offset;
};

struct AnnoFillTextureRef {
    uint offset;
};

struct AnnoStrokeRef {
    uint offset;
};

struct AnnoClipRef {
    uint offset;
};

struct AnnotatedRef {
    uint offset;
};

struct AnnoFill {
    vec4 bbox;
    uint rgba_color;
};

#define AnnoFill_size 20

AnnoFillRef AnnoFill_index(AnnoFillRef ref, uint index) {
    return AnnoFillRef(ref.offset + index * AnnoFill_size);
}

struct AnnoFillTexture {
    vec4 bbox;
    vec4 mat;
    vec2 translate;
    uvec2 uv_bounds;
};

#define AnnoFillTexture_size 48

AnnoFillTextureRef AnnoFillTexture_index(AnnoFillTextureRef ref, uint index) {
    return AnnoFillTextureRef(ref.offset + index * AnnoFillTexture_size);
}

struct AnnoStroke {
    vec4 bbox;
    uint rgba_color;
    float linewidth;
};

#define AnnoStroke_size 24

AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) {
    return AnnoStrokeRef(ref.offset + index * AnnoStroke_size);
}

struct AnnoClip {
    vec4 bbox;
};

#define AnnoClip_size 16

AnnoClipRef AnnoClip_index(AnnoClipRef ref, uint index) {
    return AnnoClipRef(ref.offset + index * AnnoClip_size);
}

#define Annotated_Nop 0
#define Annotated_Stroke 1
#define Annotated_Fill 2
#define Annotated_FillTexture 3
#define Annotated_BeginClip 4
#define Annotated_EndClip 5
#define Annotated_size 52

AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
    return AnnotatedRef(ref.offset + index * Annotated_size);
}
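
// Illustrative example (not part of the generated interface): an Annotated
// element is one 32-bit tag word followed by the variant body at byte
// offset 4, so a consumer dispatches on the tag and then reads the body:
//
//     uint tag = Annotated_tag(a, ref);
//     switch (tag) {
//     case Annotated_Fill:
//         AnnoFill fill = Annotated_Fill_read(a, ref);
//         // ... render the fill ...
//         break;
//     }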

AnnoFill AnnoFill_read(Alloc a, AnnoFillRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    AnnoFill s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.rgba_color = raw4;
    return s;
}

void AnnoFill_write(Alloc a, AnnoFillRef ref, AnnoFill s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
    write_mem(a, ix + 4, s.rgba_color);
}

AnnoFillTexture AnnoFillTexture_read(Alloc a, AnnoFillTextureRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    uint raw6 = read_mem(a, ix + 6);
    uint raw7 = read_mem(a, ix + 7);
    uint raw8 = read_mem(a, ix + 8);
    uint raw9 = read_mem(a, ix + 9);
    uint raw10 = read_mem(a, ix + 10);
    uint raw11 = read_mem(a, ix + 11);
    AnnoFillTexture s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.mat = vec4(uintBitsToFloat(raw4), uintBitsToFloat(raw5), uintBitsToFloat(raw6), uintBitsToFloat(raw7));
    s.translate = vec2(uintBitsToFloat(raw8), uintBitsToFloat(raw9));
    s.uv_bounds = uvec2(raw10, raw11);
    return s;
}

void AnnoFillTexture_write(Alloc a, AnnoFillTextureRef ref, AnnoFillTexture s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
    write_mem(a, ix + 4, floatBitsToUint(s.mat.x));
    write_mem(a, ix + 5, floatBitsToUint(s.mat.y));
    write_mem(a, ix + 6, floatBitsToUint(s.mat.z));
    write_mem(a, ix + 7, floatBitsToUint(s.mat.w));
    write_mem(a, ix + 8, floatBitsToUint(s.translate.x));
    write_mem(a, ix + 9, floatBitsToUint(s.translate.y));
    write_mem(a, ix + 10, s.uv_bounds.x);
    write_mem(a, ix + 11, s.uv_bounds.y);
}

AnnoStroke AnnoStroke_read(Alloc a, AnnoStrokeRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    AnnoStroke s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.rgba_color = raw4;
    s.linewidth = uintBitsToFloat(raw5);
    return s;
}

void AnnoStroke_write(Alloc a, AnnoStrokeRef ref, AnnoStroke s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
    write_mem(a, ix + 4, s.rgba_color);
    write_mem(a, ix + 5, floatBitsToUint(s.linewidth));
}

AnnoClip AnnoClip_read(Alloc a, AnnoClipRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    AnnoClip s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    return s;
}

void AnnoClip_write(Alloc a, AnnoClipRef ref, AnnoClip s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
}

uint Annotated_tag(Alloc a, AnnotatedRef ref) {
    return read_mem(a, ref.offset >> 2);
}

AnnoStroke Annotated_Stroke_read(Alloc a, AnnotatedRef ref) {
    return AnnoStroke_read(a, AnnoStrokeRef(ref.offset + 4));
}

AnnoFill Annotated_Fill_read(Alloc a, AnnotatedRef ref) {
    return AnnoFill_read(a, AnnoFillRef(ref.offset + 4));
}

AnnoFillTexture Annotated_FillTexture_read(Alloc a, AnnotatedRef ref) {
    return AnnoFillTexture_read(a, AnnoFillTextureRef(ref.offset + 4));
}

AnnoClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) {
    return AnnoClip_read(a, AnnoClipRef(ref.offset + 4));
}

AnnoClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) {
    return AnnoClip_read(a, AnnoClipRef(ref.offset + 4));
}

void Annotated_Nop_write(Alloc a, AnnotatedRef ref) {
    write_mem(a, ref.offset >> 2, Annotated_Nop);
}

void Annotated_Stroke_write(Alloc a, AnnotatedRef ref, AnnoStroke s) {
    write_mem(a, ref.offset >> 2, Annotated_Stroke);
    AnnoStroke_write(a, AnnoStrokeRef(ref.offset + 4), s);
}

void Annotated_Fill_write(Alloc a, AnnotatedRef ref, AnnoFill s) {
    write_mem(a, ref.offset >> 2, Annotated_Fill);
    AnnoFill_write(a, AnnoFillRef(ref.offset + 4), s);
}

void Annotated_FillTexture_write(Alloc a, AnnotatedRef ref, AnnoFillTexture s) {
    write_mem(a, ref.offset >> 2, Annotated_FillTexture);
    AnnoFillTexture_write(a, AnnoFillTextureRef(ref.offset + 4), s);
}

void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, AnnoClip s) {
    write_mem(a, ref.offset >> 2, Annotated_BeginClip);
    AnnoClip_write(a, AnnoClipRef(ref.offset + 4), s);
}

void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoClip s) {
    write_mem(a, ref.offset >> 2, Annotated_EndClip);
    AnnoClip_write(a, AnnoClipRef(ref.offset + 4), s);
}


A gpu/shaders/backdrop.comp => gpu/shaders/backdrop.comp +108 -0
@@ -0,0 +1,108 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Propagation of tile backdrop for filling.
//
// Each thread reads one path element and calculates the number of spanned
// tile rows based on the bounding box.
// In a second step, the workgroup loops over the tile rows of its elements
// in parallel. For each row, the per-tile backdrop computed by the previous
// (coarse path segment) kernel is read and propagated from left to right
// (prefix summed).
//
// Output state:
//  - Each path element has an array of tiles covering the whole path, based
//    on its bounding box
//  - Each tile of a path element contains the 'backdrop' and a list of
//    subdivided path segments
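//
// Illustrative example: if one tile row has per-tile backdrop deltas
// [1, 0, -1, 0] coming out of the coarse path segment kernel, the in-place
// prefix sum below turns them into [1, 1, 0, 0]; a nonzero summed backdrop
// means that tile starts (at its top-left corner) inside the fill.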

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
#define BACKDROP_WG (1 << LG_BACKDROP_WG)

layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "annotated.h"
#include "tile.h"

shared uint sh_row_count[BACKDROP_WG];
shared Alloc sh_row_alloc[BACKDROP_WG];
shared uint sh_row_width[BACKDROP_WG];

void main() {
    if (mem_error != NO_ERROR) {
        return;
    }

    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);

    // Work assignment: 1 thread : 1 path element
    uint row_count = 0;
    if (element_ix < conf.n_elements) {
        uint tag = Annotated_tag(conf.anno_alloc, ref);
        switch (tag) {
        case Annotated_Fill:
        case Annotated_FillTexture:
        case Annotated_BeginClip:
            PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
            Path path = Path_read(conf.tile_alloc, path_ref);
            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
            row_count = path.bbox.w - path.bbox.y;
            // Paths that don't cross tile top edges don't have backdrops.
            // Don't apply the optimization to paths that may cross the y = 0
            // top edge but are clipped to a single row.
            if (row_count == 1 && path.bbox.y > 0) {
                // Note: this can probably be expanded to width = 2 as
                // long as it doesn't cross the left edge.
                row_count = 0;
            }
            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
            sh_row_alloc[th_ix] = path_alloc;
        }
    }

    sh_row_count[th_ix] = row_count;
    // Prefix sum of sh_row_count
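    // This is a Hillis-Steele style inclusive scan in shared memory: after
    // round i, each slot holds the sum of itself and the 2^i preceding
    // slots, e.g. counts [2, 0, 3, 1] become [2, 2, 5, 6].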
    for (uint i = 0; i < LG_BACKDROP_WG; i++) {
        barrier();
        if (th_ix >= (1 << i)) {
            row_count += sh_row_count[th_ix - (1 << i)];
        }
        barrier();
        sh_row_count[th_ix] = row_count;
    }
    barrier();
    // Work assignment: 1 thread : 1 path element row
    uint total_rows = sh_row_count[BACKDROP_WG - 1];
    for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) {
        // Binary search to find element
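        // (i.e. the largest el_ix with sh_row_count[el_ix - 1] <= row, so
        // `row` falls within that element's range of tile rows)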
        uint el_ix = 0;
        for (uint i = 0; i < LG_BACKDROP_WG; i++) {
            uint probe = el_ix + ((BACKDROP_WG / 2) >> i);
            if (row >= sh_row_count[probe - 1]) {
                el_ix = probe;
            }
        }
        uint width = sh_row_width[el_ix];
        if (width > 0) {
            // Process one row sequentially
            // Read backdrop value per tile and prefix sum it
            Alloc tiles_alloc = sh_row_alloc[el_ix];
            uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0);
            uint tile_el_ix = (tiles_alloc.offset >> 2) + 1 + seq_ix * 2 * width;
            uint sum = read_mem(tiles_alloc, tile_el_ix);
            for (uint x = 1; x < width; x++) {
                tile_el_ix += 2;
                sum += read_mem(tiles_alloc, tile_el_ix);
                write_mem(tiles_alloc, tile_el_ix, sum);
            }
        }
    }
}

A gpu/shaders/binning.comp => gpu/shaders/binning.comp +152 -0
@@ -0,0 +1,152 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The binning stage of the pipeline.
//
// Each workgroup processes N_TILE paths.
// Each thread processes one path and calculates an N_TILE_X x N_TILE_Y coverage mask
// based on the path bounding box to bin the paths.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

layout(local_size_x = N_TILE, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "annotated.h"
#include "bins.h"

// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
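
// Illustrative example: assuming N_TILE_X = 16 and TILE_WIDTH_PX = 16 (the
// actual values come from setup.h), a bin spans 256 pixels horizontally and
// SX = 1/256, so multiplying a pixel x coordinate by SX yields a bin column.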

// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
#define INFINITY (1.0 / 0.0)

// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
// Bitmaps are sliced (256 bits into N_SLICE = 8 32-bit submaps).
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared Alloc sh_chunk_alloc[N_TILE];
shared bool sh_alloc_failed;

void main() {
    if (mem_error != NO_ERROR) {
        return;
    }

    uint my_n_elements = conf.n_elements;
    uint my_partition = gl_WorkGroupID.x;

    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }
    if (gl_LocalInvocationID.x == 0) {
        sh_alloc_failed = false;
    }
    barrier();

    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
    uint tag = Annotated_Nop;
    if (element_ix < my_n_elements) {
        tag = Annotated_tag(conf.anno_alloc, ref);
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (tag) {
    case Annotated_Fill:
    case Annotated_FillTexture:
    case Annotated_Stroke:
    case Annotated_BeginClip:
    case Annotated_EndClip:
        // Note: we take advantage of the fact that these drawing elements
        // have the bbox at the same place in their layout.
        AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
        x0 = int(floor(fill.bbox.x * SX));
        y0 = int(floor(fill.bbox.y * SY));
        x1 = int(ceil(fill.bbox.z * SX));
        y1 = int(ceil(fill.bbox.w * SY));
        break;
    }

    // At this point, we run an iterator over the coverage area,
    // trying to keep divergence low.
    // Right now, it's just a bbox, but we'll get finer with
    // segments.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
    y1 = clamp(y1, y0, int(height_in_bins));
    if (x0 == x1) y1 = y0;
    int x = x0, y = y0;
    uint my_slice = gl_LocalInvocationID.x / 32;
    uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
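    // E.g. invocation 37 owns slice 37 / 32 = 1 and bit 37 % 32 = 5; it sets
    // that bit in its slice's submap for every bin its bbox touches.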
    while (y < y1) {
        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }

    barrier();
    // Allocate output segments.
    uint element_count = 0;
    for (uint i = 0; i < N_SLICE; i++) {
        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        count[i][gl_LocalInvocationID.x] = element_count;
    }
    // element_count is number of elements covering bin for this invocation.
    Alloc chunk_alloc = new_alloc(0, 0);
    if (element_count != 0) {
        // TODO: aggregate atomic adds (subgroup is probably fastest)
        MallocResult chunk = malloc(element_count * BinInstance_size);
        chunk_alloc = chunk.alloc;
        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
        if (chunk.failed) {
            sh_alloc_failed = true;
        }
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
    write_mem(conf.bin_alloc, out_ix, element_count);
    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);

    barrier();
    if (sh_alloc_failed) {
        return;
    }

    // Use a strategy similar to the Laine & Karras paper: loop over the bbox
    // of bins touched by this element.
    x = x0;
    y = y0;
    while (y < y1) {
        uint bin_ix = y * width_in_bins + x;
        uint out_mask = bitmaps[my_slice][bin_ix];
        if ((out_mask & my_mask) != 0) {
            uint idx = bitCount(out_mask & (my_mask - 1));
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
            Alloc out_alloc = sh_chunk_alloc[bin_ix];
            uint out_offset = out_alloc.offset + idx * BinInstance_size;
            BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
        }
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
}

A gpu/shaders/bins.h => gpu/shaders/bins.h +31 -0
@@ -0,0 +1,31 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct BinInstanceRef {
    uint offset;
};

struct BinInstance {
    uint element_ix;
};

#define BinInstance_size 4

BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
    return BinInstanceRef(ref.offset + index * BinInstance_size);
}

BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    BinInstance s;
    s.element_ix = raw0;
    return s;
}

void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.element_ix);
}


A gpu/shaders/coarse.comp => gpu/shaders/coarse.comp +430 -0
@@ -0,0 +1,430 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The coarse rasterizer stage of the pipeline.
//
// As input we have the ordered partitions of paths from the binning phase and
// the annotated tile list of segments and backdrop per path.
//
// Each workgroup operates on one bin, stream-compacting the elements
// that fall into that bin.
//
// As output we have an ordered command stream per tile; every tile a path
// touches (backdrop + segment list) is encoded.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

layout(local_size_x = N_TILE, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "annotated.h"
#include "bins.h"
#include "tile.h"
#include "ptcl.h"

#define LG_N_PART_READ (7 + LG_WG_FACTOR)
#define N_PART_READ (1 << LG_N_PART_READ)

shared uint sh_elements[N_TILE];

// Number of elements in the partition; prefix sum.
shared uint sh_part_count[N_PART_READ];
shared Alloc sh_part_elements[N_PART_READ];

shared uint sh_bitmaps[N_SLICE][N_TILE];

shared uint sh_tile_count[N_TILE];
// The width of the tile rect for the element, intersected with this bin
shared uint sh_tile_width[N_TILE];
shared uint sh_tile_x0[N_TILE];
shared uint sh_tile_y0[N_TILE];

// These are set up so base + tile_y * stride + tile_x points to a Tile.
shared uint sh_tile_base[N_TILE];
shared uint sh_tile_stride[N_TILE];

#ifdef MEM_DEBUG
// Store allocs only when MEM_DEBUG to save shared memory traffic.
shared Alloc sh_tile_alloc[N_TILE];

void write_tile_alloc(uint el_ix, Alloc a) {
    sh_tile_alloc[el_ix] = a;
}

Alloc read_tile_alloc(uint el_ix) {
    return sh_tile_alloc[el_ix];
}
#else
void write_tile_alloc(uint el_ix, Alloc a) {
    // No-op
}

Alloc read_tile_alloc(uint el_ix) {
    // All memory.
    return new_alloc(0, memory.length()*4);
}
#endif

// Perhaps cmd_alloc should be a global? This is a style question.
bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset < cmd_limit) {
        return true;
    }
    MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC);
    if (new_cmd.failed) {
        return false;
    }
    CmdJump jump = CmdJump(new_cmd.alloc.offset);
    Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
    cmd_alloc = new_cmd.alloc;
    cmd_ref = CmdRef(cmd_alloc.offset);
    cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
    return true;
}
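
// A tile's command list is thus a linked list of PTCL_INITIAL_ALLOC-sized
// chunks: when a chunk fills up, alloc_cmd writes a CmdJump to a freshly
// allocated chunk and continues there. The 2 * Cmd_size headroom in
// cmd_limit leaves room for that jump and for the trailing Cmd_End.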

void main() {
    if (mem_error != NO_ERROR) {
        return;
    }

    // Could use either linear or 2d layouts for both dispatch and
    // invocations within the workgroup. We'll use variables to abstract.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
    uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x;
    uint partition_ix = 0;
    uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
    uint th_ix = gl_LocalInvocationID.x;

    // Coordinates of top left of bin, in tiles.
    uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
    uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;

    // Per-tile state
    uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
    uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
    uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
    // The nesting depth of the clip stack
    uint clip_depth = 0;
    // State for the "clip zero" optimization. If it's nonzero, then we are
    // currently in a clip for which the entire tile has an alpha of zero, and
    // the value is the depth after the "begin clip" of that element.
    uint clip_zero_depth = 0;
    // State for the "clip one" optimization. If bit `i` is set, then that means
    // that the clip pushed at depth `i` has an alpha of all one.
    uint clip_one_mask = 0;
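    // Illustrative example: a BeginClip tile with no segments and zero
    // backdrop is transparent everywhere, so everything up to the matching
    // EndClip can be suppressed (clip_zero_depth); a tile with no segments
    // but nonzero backdrop is fully opaque, so the clip push/pop itself can
    // be elided (clip_one_mask).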

    // I'm sure we can figure out how to do this with at least one fewer register...
    // Items up to rd_ix have been read from sh_elements
    uint rd_ix = 0;
    // Items up to wr_ix have been written into sh_elements
    uint wr_ix = 0;
    // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
    uint part_start_ix = 0;
    uint ready_ix = 0;

    while (true) {
        for (uint i = 0; i < N_SLICE; i++) {
            sh_bitmaps[i][th_ix] = 0;
        }

        // parallel read of input partitions
        do {
            if (ready_ix == wr_ix && partition_ix < n_partitions) {
                part_start_ix = ready_ix;
                uint count = 0;
                if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
                    uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
                    count = read_mem(conf.bin_alloc, in_ix);
                    uint offset = read_mem(conf.bin_alloc, in_ix + 1);
                    sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size);
                }
                // prefix sum of counts
                for (uint i = 0; i < LG_N_PART_READ; i++) {
                    if (th_ix < N_PART_READ) {
                        sh_part_count[th_ix] = count;
                    }
                    barrier();
                    if (th_ix < N_PART_READ) {
                        if (th_ix >= (1 << i)) {
                            count += sh_part_count[th_ix - (1 << i)];
                        }
                    }
                    barrier();
                }
                if (th_ix < N_PART_READ) {
                    sh_part_count[th_ix] = part_start_ix + count;
                }
                barrier();
                ready_ix = sh_part_count[N_PART_READ - 1];
                partition_ix += N_PART_READ;
            }
            // use binary search to find element to read
            uint ix = rd_ix + th_ix;
            if (ix >= wr_ix && ix < ready_ix) {
                uint part_ix = 0;
                for (uint i = 0; i < LG_N_PART_READ; i++) {
                    uint probe = part_ix + ((N_PART_READ / 2) >> i);
                    if (ix >= sh_part_count[probe - 1]) {
                        part_ix = probe;
                    }
                }
                ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
                Alloc bin_alloc = sh_part_elements[part_ix];
                BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset);
                BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix));
                sh_elements[th_ix] = inst.element_ix;
            }
            barrier();

            wr_ix = min(rd_ix + N_TILE, ready_ix);
        } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));

        // We've done the merge and filled the buffer.

        // Read one element, compute coverage.
        uint tag = Annotated_Nop;
        uint element_ix;
        AnnotatedRef ref;
        if (th_ix + rd_ix < wr_ix) {
            element_ix = sh_elements[th_ix];
            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
            tag = Annotated_tag(conf.anno_alloc, ref);
        }

        // Bounding box of element in pixel coordinates.
        uint tile_count;
        switch (tag) {
        case Annotated_Fill:
        case Annotated_FillTexture:
        case Annotated_Stroke:
        case Annotated_BeginClip:
        case Annotated_EndClip:
            // We have one "path" for each element, even if the element isn't
            // actually a path (currently EndClip, but images etc in the future).
            uint path_ix = element_ix;
            Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
            uint stride = path.bbox.z - path.bbox.x;
            sh_tile_stride[th_ix] = stride;
            int dx = int(path.bbox.x) - int(bin_tile_x);
            int dy = int(path.bbox.y) - int(bin_tile_y);
            int x0 = clamp(dx, 0, N_TILE_X);
            int y0 = clamp(dy, 0, N_TILE_Y);
            int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X);
            int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y);
            sh_tile_width[th_ix] = uint(x1 - x0);
            sh_tile_x0[th_ix] = x0;
            sh_tile_y0[th_ix] = y0;
            tile_count = uint(x1 - x0) * uint(y1 - y0);
            // base relative to bin
            uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
            sh_tile_base[th_ix] = base;
            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
            write_tile_alloc(th_ix, path_alloc);
            break;
        default:
            tile_count = 0;
            break;
        }

        // Prefix sum of sh_tile_count
        sh_tile_count[th_ix] = tile_count;
        for (uint i = 0; i < LG_N_TILE; i++) {
            barrier();
            if (th_ix >= (1 << i)) {
                tile_count += sh_tile_count[th_ix - (1 << i)];
            }
            barrier();
            sh_tile_count[th_ix] = tile_count;
        }
        barrier();
        uint total_tile_count = sh_tile_count[N_TILE - 1];
        for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) {
            // Binary search to find element
            uint el_ix = 0;
            for (uint i = 0; i < LG_N_TILE; i++) {
                uint probe = el_ix + ((N_TILE / 2) >> i);
                if (ix >= sh_tile_count[probe - 1]) {
                    el_ix = probe;
                }
            }
            AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
            uint tag = Annotated_tag(conf.anno_alloc, ref);
            uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
            uint width = sh_tile_width[el_ix];
            uint x = sh_tile_x0[el_ix] + seq_ix % width;
            uint y = sh_tile_y0[el_ix] + seq_ix / width;
            bool include_tile;
            if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
                include_tile = true;
            } else {
                Tile tile = Tile_read(read_tile_alloc(el_ix), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
                // Include the path in the tile if
                // - the tile contains at least a segment (tile offset non-zero)
                // - the tile is completely covered (backdrop non-zero)
                include_tile = tile.tile.offset != 0 || tile.backdrop != 0;
            }
            if (include_tile) {
                uint el_slice = el_ix / 32;
                uint el_mask = 1 << (el_ix & 31);
                atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
            }
        }

        barrier();

        // Output non-segment elements for this tile. The thread does a sequential walk
        // through the non-segment elements.
        uint slice_ix = 0;
        uint bitmap = sh_bitmaps[0][th_ix];
        while (true) {
            if (bitmap == 0) {
                slice_ix++;
                if (slice_ix == N_SLICE) {
                    break;
                }
                bitmap = sh_bitmaps[slice_ix][th_ix];
                if (bitmap == 0) {
                    continue;
                }
            }
            uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
            uint element_ix = sh_elements[element_ref_ix];

            // Clear LSB
            bitmap &= bitmap - 1;

            // At this point, we read the element again from global memory.
            // If that turns out to be expensive, maybe we can pack it into
            // shared memory (or perhaps just the tag).
            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
            tag = Annotated_tag(conf.anno_alloc, ref);

            if (clip_zero_depth == 0) {
                switch (tag) {
                case Annotated_Fill:
                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                        break;
                    }
                    if (tile.tile.offset != 0) {
                        CmdFill cmd_fill;
                        cmd_fill.tile_ref = tile.tile.offset;
                        cmd_fill.backdrop = tile.backdrop;
                        cmd_fill.rgba_color = fill.rgba_color;
                        Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
                    } else {
                        Cmd_Solid_write(cmd_alloc, cmd_ref, CmdSolid(fill.rgba_color));
                    }
                    cmd_ref.offset += Cmd_size;
                    break;
                case Annotated_FillTexture:
                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    AnnoFillTexture fill_tex = Annotated_FillTexture_read(conf.anno_alloc, ref);
                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                        break;
                    }
                    if (tile.tile.offset != 0) {
                        CmdFillTexture cmd_fill_tex;
                        cmd_fill_tex.tile_ref = tile.tile.offset;
                        cmd_fill_tex.backdrop = tile.backdrop;
                        cmd_fill_tex.mat = fill_tex.mat;
                        cmd_fill_tex.translate = fill_tex.translate;
                        cmd_fill_tex.uv_bounds = fill_tex.uv_bounds;
                        Cmd_FillTexture_write(cmd_alloc, cmd_ref, cmd_fill_tex);
                    } else {
                        CmdSolidTexture cmd_solid_tex;
                        cmd_solid_tex.mat = fill_tex.mat;
                        cmd_solid_tex.translate = fill_tex.translate;
                        cmd_solid_tex.uv_bounds = fill_tex.uv_bounds;
                        Cmd_SolidTexture_write(cmd_alloc, cmd_ref, cmd_solid_tex);
                    }
                    cmd_ref.offset += Cmd_size;
                    break;
                case Annotated_BeginClip:
                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    if (tile.tile.offset == 0 && tile.backdrop == 0) {
                        clip_zero_depth = clip_depth + 1;
                    } else if (tile.tile.offset == 0 && clip_depth < 32) {
                        clip_one_mask |= (1 << clip_depth);
                    } else {
                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                            break;
                        }
                        if (tile.tile.offset != 0) {
                            CmdBeginClip cmd_begin_clip;
                            cmd_begin_clip.tile_ref = tile.tile.offset;
                            cmd_begin_clip.backdrop = tile.backdrop;
                            Cmd_BeginClip_write(cmd_alloc, cmd_ref, cmd_begin_clip);
                        } else {
                            // TODO: here is where a bunch of optimization magic should happen
                            float alpha = tile.backdrop == 0 ? 0.0 : 1.0;
                            Cmd_BeginSolidClip_write(cmd_alloc, cmd_ref, CmdBeginSolidClip(alpha));
                        }
                        cmd_ref.offset += Cmd_size;
                        if (clip_depth < 32) {
                            clip_one_mask &= ~(1 << clip_depth);
                        }
                    }
                    clip_depth++;
                    break;
                case Annotated_EndClip:
                    clip_depth--;
                    if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                            break;
                        }
                        Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(1.0));
                        cmd_ref.offset += Cmd_size;
                    }
                    break;
                case Annotated_Stroke:
                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    AnnoStroke stroke = Annotated_Stroke_read(conf.anno_alloc, ref);
                    CmdStroke cmd_stroke;
                    cmd_stroke.tile_ref = tile.tile.offset;
                    cmd_stroke.half_width = 0.5 * stroke.linewidth;
                    cmd_stroke.rgba_color = stroke.rgba_color;
                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                        break;
                    }
                    Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
                    cmd_ref.offset += Cmd_size;
                    break;
                }
            } else {
                // In "clip zero" state, suppress all drawing
                switch (tag) {
                case Annotated_BeginClip:
                    clip_depth++;
                    break;
                case Annotated_EndClip:
                    if (clip_depth == clip_zero_depth) {
                        clip_zero_depth = 0;
                    }
                    clip_depth--;
                    break;
                }
            }
        }
        barrier();

        rd_ix += N_TILE;
        if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
    }
    if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
        Cmd_End_write(cmd_alloc, cmd_ref);
    }
}

A gpu/shaders/elements.comp => gpu/shaders/elements.comp +441 -0
@@ -0,0 +1,441 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The element processing stage, first in the pipeline.
//
// This stage is primarily about applying transforms and computing bounding
// boxes. It is organized as a scan over the input elements, producing
// annotated output elements.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define N_ROWS 4
#define WG_SIZE 32
#define LG_WG_SIZE 5
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(set = 0, binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

// It would be better to use the Vulkan memory model than
// "volatile" but shooting for compatibility here rather
// than doing things right.
layout(set = 0, binding = 3) volatile buffer StateBuf {
    uint part_counter;
    uint[] state;
};

#include "scene.h"
#include "state.h"
#include "annotated.h"
#include "pathseg.h"

#define StateBuf_stride (4 + 2 * State_size)

StateRef state_aggregate_ref(uint partition_ix) {
    return StateRef(4 + partition_ix * StateBuf_stride);
}

StateRef state_prefix_ref(uint partition_ix) {
    return StateRef(4 + partition_ix * StateBuf_stride + State_size);
}

uint state_flag_index(uint partition_ix) {
    return partition_ix * (StateBuf_stride / 4);
}
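
// Per-partition record layout in state[], as a sketch (State_size comes from
// state.h): one flag word, then the aggregate State, then the prefix State;
// the byte offset 4 in the two refs above skips the record's flag word.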

// These correspond to X, A, P respectively in the prefix sum paper
// (Merrill & Garland, "Single-pass Parallel Prefix Scan with Decoupled
// Look-back").
#define FLAG_NOT_READY 0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY 2

#define FLAG_SET_LINEWIDTH 1
#define FLAG_SET_BBOX 2
#define FLAG_RESET_BBOX 4

// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate)
State combine_state(State a, State b) {
    State c;
    c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
        c.bbox = a.bbox;
    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
    {
        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
    }
    // It would be more concise to cast to matrix types; ah well.
    c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
    c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
    c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
    c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
    c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
    c.path_count = a.path_count + b.path_count;
    c.pathseg_count = a.pathseg_count + b.pathseg_count;
    return c;
}
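
// Illustrative example: combine_state(a, b) where both are pure transforms
// yields the composed transform "apply b, then a" (c.mat = a.mat * b.mat as
// 2x2 matrices); this is what lets the scan below turn a stream of relative
// transforms and segments into absolute, transformed bounding boxes.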

State map_element(ElementRef ref) {
    // TODO: it would *probably* be more efficient to make the memory read patterns less
    // divergent, though it would be more wasted memory.
    uint tag = Element_tag(ref);
    State c;
    c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    c.mat = vec4(1.0, 0.0, 0.0, 1.0);
    c.translate = vec2(0.0, 0.0);
    c.linewidth = 1.0; // TODO should be 0.0
    c.flags = 0;
    c.path_count = 0;
    c.pathseg_count = 0;
    switch (tag) {
    case Element_FillLine:
    case Element_StrokeLine:
        LineSeg line = Element_FillLine_read(ref);
        c.bbox.xy = min(line.p0, line.p1);
        c.bbox.zw = max(line.p0, line.p1);
        c.pathseg_count = 1;
        break;
    case Element_FillQuad:
    case Element_StrokeQuad:
        QuadSeg quad = Element_FillQuad_read(ref);
        c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
        c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
        c.pathseg_count = 1;
        break;
    case Element_FillCubic:
    case Element_StrokeCubic:
        CubicSeg cubic = Element_FillCubic_read(ref);
        c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
        c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
        c.pathseg_count = 1;
        break;
    case Element_Fill:
    case Element_FillTexture:
    case Element_Stroke:
    case Element_BeginClip:
        c.flags = FLAG_RESET_BBOX;
        c.path_count = 1;
        break;
    case Element_EndClip:
        c.path_count = 1;
        break;
    case Element_SetLineWidth:
        SetLineWidth lw = Element_SetLineWidth_read(ref);
        c.linewidth = lw.width;
        c.flags = FLAG_SET_LINEWIDTH;
        break;
    case Element_Transform:
        Transform t = Element_Transform_read(ref);
        c.mat = t.mat;
        c.translate = t.translate;
        break;
    }
    return c;
}

// Get the bounding box of a circle transformed by the matrix into an ellipse.
vec2 get_linewidth(State st) {
    // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
    return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw));
}

// We should be able to use an array of structs but the NV shader compiler
// doesn't seem to like it :/
//shared State sh_state[WG_SIZE];
shared vec4 sh_mat[WG_SIZE];
shared vec2 sh_translate[WG_SIZE];
shared vec4 sh_bbox[WG_SIZE];
shared float sh_width[WG_SIZE];
shared uint sh_flags[WG_SIZE];
shared uint sh_path_count[WG_SIZE];
shared uint sh_pathseg_count[WG_SIZE];

shared uint sh_part_ix;
shared State sh_prefix;

void main() {
    if (mem_error != NO_ERROR) {
        return;
    }

    State th_state[N_ROWS];
    // Determine partition to process by atomic counter (described in Section
    // 4.4 of prefix sum paper).
    if (gl_LocalInvocationID.x == 0) {
        sh_part_ix = atomicAdd(part_counter, 1);
    }
    barrier();
    uint part_ix = sh_part_ix;

    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
    ElementRef ref = ElementRef(ix * Element_size);

    th_state[0] = map_element(ref);
    for (uint i = 1; i < N_ROWS; i++) {
        // discussion question: would it be faster to load using more coherent patterns
        // into thread memory? This is kinda strided.
        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
    }
    State agg = th_state[N_ROWS - 1];
    sh_mat[gl_LocalInvocationID.x] = agg.mat;
    sh_translate[gl_LocalInvocationID.x] = agg.translate;
    sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
    sh_width[gl_LocalInvocationID.x] = agg.linewidth;
    sh_flags[gl_LocalInvocationID.x] = agg.flags;
    sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
    sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1 << i)) {
            State other;
            uint ix = gl_LocalInvocationID.x - (1 << i);
            other.mat = sh_mat[ix];
            other.translate = sh_translate[ix];
            other.bbox = sh_bbox[ix];
            other.linewidth = sh_width[ix];
            other.flags = sh_flags[ix];
            other.path_count = sh_path_count[ix];
            other.pathseg_count = sh_pathseg_count[ix];
            agg = combine_state(other, agg);
        }
        barrier();
        sh_mat[gl_LocalInvocationID.x] = agg.mat;
        sh_translate[gl_LocalInvocationID.x] = agg.translate;
        sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
        sh_width[gl_LocalInvocationID.x] = agg.linewidth;
        sh_flags[gl_LocalInvocationID.x] = agg.flags;
        sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
        sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
    }

    State exclusive;
    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
    exclusive.translate = vec2(0.0, 0.0);
    exclusive.linewidth = 1.0; //TODO should be 0.0
    exclusive.flags = 0;
    exclusive.path_count = 0;
    exclusive.pathseg_count = 0;

    // Publish aggregate for this partition
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        // Note: with memory model, we'd want to generate the atomic store version of this.
        State_write(state_aggregate_ref(part_ix), agg);
        uint flag = FLAG_AGGREGATE_READY;
        memoryBarrierBuffer();
        if (part_ix == 0) {
            State_write(state_prefix_ref(part_ix), agg);
            flag = FLAG_PREFIX_READY;
        }
        state[state_flag_index(part_ix)] = flag;
        if (part_ix != 0) {
            // step 4 of paper: decoupled lookback
            uint look_back_ix = part_ix - 1;

            State their_agg;
            uint their_ix = 0;
            while (true) {
                flag = state[state_flag_index(look_back_ix)];
                if (flag == FLAG_PREFIX_READY) {
                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
                    exclusive = combine_state(their_prefix, exclusive);
                    break;
                } else if (flag == FLAG_AGGREGATE_READY) {
                    their_agg = State_read(state_aggregate_ref(look_back_ix));
                    exclusive = combine_state(their_agg, exclusive);
                    look_back_ix--;
                    their_ix = 0;
                    continue;
                }
                // else spin

                // Unfortunately there's no guarantee of forward progress of other
                // workgroups, so compute a bit of the aggregate before trying again.
                // In the worst case, spinning stops when the aggregate is complete.
                ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
                State s = map_element(ref);
                if (their_ix == 0) {
                    their_agg = s;
                } else {
                    their_agg = combine_state(their_agg, s);
                }
                their_ix++;
                if (their_ix == PARTITION_SIZE) {
                    exclusive = combine_state(their_agg, exclusive);
                    if (look_back_ix == 0) {
                        break;
                    }
                    look_back_ix--;
                    their_ix = 0;
                }
            }

            // step 5 of paper: compute inclusive prefix
            State inclusive_prefix = combine_state(exclusive, agg);
            sh_prefix = exclusive;
            State_write(state_prefix_ref(part_ix), inclusive_prefix);
            memoryBarrierBuffer();
            flag = FLAG_PREFIX_READY;
            state[state_flag_index(part_ix)] = flag;
        }
    }
    barrier();
    if (part_ix != 0) {
        exclusive = sh_prefix;
    }

    State row = exclusive;
    if (gl_LocalInvocationID.x > 0) {
        uint ix = gl_LocalInvocationID.x - 1;
        State other;
        other.mat = sh_mat[ix];
        other.translate = sh_translate[ix];
        other.bbox = sh_bbox[ix];
        other.linewidth = sh_width[ix];
        other.flags = sh_flags[ix];
        other.path_count = sh_path_count[ix];
        other.pathseg_count = sh_pathseg_count[ix];
        row = combine_state(row, other);
    }
    for (uint i = 0; i < N_ROWS; i++) {
        State st = combine_state(row, th_state[i]);

        // Here we read again from the original scene. There may be
        // gains to be had from stashing in shared memory or possibly
        // registers (though register pressure is an issue).
        ElementRef this_ref = Element_index(ref, i);
        uint tag = Element_tag(this_ref);
        switch (tag) {
        case Element_FillLine:
        case Element_StrokeLine:
            LineSeg line = Element_StrokeLine_read(this_ref);
            vec2 p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
            vec2 p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
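            // Degree-elevate the line to an exact cubic, with the interior
            // control points at the 1/3 and 2/3 points of the segment, so
            // later stages only handle cubics.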
            PathStrokeCubic path_cubic;
            path_cubic.p0 = p0;
            path_cubic.p1 = mix(p0, p1, 1.0 / 3.0);
            path_cubic.p2 = mix(p1, p0, 1.0 / 3.0);
            path_cubic.p3 = p1;
            path_cubic.path_ix = st.path_count;
            if (tag == Element_StrokeLine) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
            PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
            uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic;
            write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag);
            PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
            break;
        case Element_FillQuad:
        case Element_StrokeQuad:
            QuadSeg quad = Element_StrokeQuad_read(this_ref);
            p0 = st.mat.xy * quad.p0.x + st.mat.zw * quad.p0.y + st.translate;
            p1 = st.mat.xy * quad.p1.x + st.mat.zw * quad.p1.y + st.translate;
            vec2 p2 = st.mat.xy * quad.p2.x + st.mat.zw * quad.p2.y + st.translate;
            // path_cubic is declared in the line case above; reuse it here.
            path_cubic.p0 = p0;
            path_cubic.p1 = mix(p1, p0, 1.0 / 3.0);
            path_cubic.p2 = mix(p1, p2, 1.0 / 3.0);
            path_cubic.p3 = p2;
            path_cubic.path_ix = st.path_count;
            if (tag == Element_StrokeQuad) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
            out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic;
            write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag);
            PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
            break;
        case Element_FillCubic:
        case Element_StrokeCubic:
            CubicSeg cubic = Element_StrokeCubic_read(this_ref);
            // path_cubic is declared in the line case above; reuse it here.
            path_cubic.p0 = st.mat.xy * cubic.p0.x + st.mat.zw * cubic.p0.y + st.translate;
            path_cubic.p1 = st.mat.xy * cubic.p1.x + st.mat.zw * cubic.p1.y + st.translate;
            path_cubic.p2 = st.mat.xy * cubic.p2.x + st.mat.zw * cubic.p2.y + st.translate;
            path_cubic.p3 = st.mat.xy * cubic.p3.x + st.mat.zw * cubic.p3.y + st.translate;
            path_cubic.path_ix = st.path_count;
            if (tag == Element_StrokeCubic) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
            out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic;
            write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag);
            PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
            break;
        case Element_Stroke:
            Stroke stroke = Element_Stroke_read(this_ref);
            AnnoStroke anno_stroke;
            anno_stroke.rgba_color = stroke.rgba_color;
            vec2 lw = get_linewidth(st);
            anno_stroke.bbox = st.bbox + vec4(-lw, lw);
            anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
            AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_Stroke_write(conf.anno_alloc, out_ref, anno_stroke);
            break;
        case Element_Fill:
            Fill fill = Element_Fill_read(this_ref);
            AnnoFill anno_fill;
            anno_fill.rgba_color = fill.rgba_color;
            anno_fill.bbox = st.bbox;
            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_Fill_write(conf.anno_alloc, out_ref, anno_fill);
            break;
        case Element_FillTexture:
            FillTexture fill_tex = Element_FillTexture_read(this_ref);
            AnnoFillTexture anno_fill_tex;
            anno_fill_tex.uv_bounds = fill_tex.uv_bounds;
            anno_fill_tex.bbox = st.bbox;
            anno_fill_tex.mat = st.mat;
            anno_fill_tex.translate = st.translate;
            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_FillTexture_write(conf.anno_alloc, out_ref, anno_fill_tex);
            break;
        case Element_BeginClip:
            Clip begin_clip = Element_BeginClip_read(this_ref);
            AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox);
            // This is the absolute bbox; it has already been transformed during encoding.
            anno_begin_clip.bbox = begin_clip.bbox;
            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_BeginClip_write(conf.anno_alloc, out_ref, anno_begin_clip);
            break;
        case Element_EndClip:
            Clip end_clip = Element_EndClip_read(this_ref);
            // This bbox is expected to be the same as the begin one.
            AnnoClip anno_end_clip = AnnoClip(end_clip.bbox);
            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
            break;
        }
    }
}

A gpu/shaders/kernel4.comp => gpu/shaders/kernel4.comp +302 -0
@@ -0,0 +1,302 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// This is "kernel 4" in a 4-kernel pipeline. It renders the commands
// in the per-tile command list to an image.

// Right now, this kernel writes the output through a storage image, but a
// better plan is to use a texture, which isn't done yet because of limited
// support.

#version 450
#extension GL_GOOGLE_include_directive : enable
#ifdef VULKAN
#extension GL_EXT_nonuniform_qualifier : enable
#endif

#include "mem.h"
#include "setup.h"

#define CHUNK 8
#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK)
layout(local_size_x = TILE_WIDTH_PX, local_size_y = CHUNK_DY) in;
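
// Each invocation shades a column of CHUNK pixels strided CHUNK_DY rows
// apart: pixel k sits at y + k * CHUNK_DY. E.g. assuming TILE_HEIGHT_PX = 16
// (the actual value comes from setup.h), CHUNK_DY = 2 and invocation (x, y)
// covers pixels (x, y), (x, y + 2), ..., (x, y + 14).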

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;

#ifdef VULKAN
layout(set = 0, binding = 3) uniform sampler2D textures[];
#else
layout(set = 0, binding = 3) uniform sampler2D atlas;
#endif

#include "ptcl.h"
#include "tile.h"

#define BLEND_STACK_SIZE 4

// Layout of a clip scratch frame:
// Each frame is WIDTH * HEIGHT 32-bit words, then a link reference.

// Link offset and frame size in 32-bit words.
#define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
#define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)
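
// Illustrative example: assuming 32x16 pixel tiles (the actual dimensions
// come from setup.h), a frame is 32 * 16 = 512 words of saved per-pixel
// state plus one link word, so each alloc_clip_buf call requests
// CLIP_BUF_SIZE * 4 = 2052 bytes.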

shared MallocResult sh_clip_alloc;

// Allocate a scratch buffer for clipping.
MallocResult alloc_clip_buf(uint link) {
    if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
        MallocResult m = malloc(CLIP_BUF_SIZE * 4);
        if (!m.failed) {
            write_mem(m.alloc, (m.alloc.offset >> 2) + CLIP_LINK_OFFSET, link);
        }
        sh_clip_alloc = m;
    }
    barrier();
    return sh_clip_alloc;
}

// Calculate coverage based on backdrop + coverage of each line segment
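// The accumulated signed winding is reduced with min(abs(area), 1.0) at the
// end, which amounts to the non-zero winding fill rule.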
float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
    // backdrop would arguably be better stored as float, but the int-to-float
    // conversion here is cheap.
    float area[CHUNK];
    for (uint k = 0; k < CHUNK; k++) area[k] = float(backdrop);
    TileSegRef tile_seg_ref = TileSegRef(tile_ref);
    do {
        TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
        for (uint k = 0; k < CHUNK; k++) {
            vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY));
            vec2 start = seg.origin - my_xy;
            vec2 end = start + seg.vector;
            vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
            if (window.x != window.y) {
                vec2 t = (window - start.y) / seg.vector.y;
                vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
                float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
                float xmax = max(xs.x, xs.y);
                float b = min(xmax, 1.0);
                float c = max(b, 0.0);
                float d = max(xmin, 0.0);
                float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
                area[k] += a * (window.x - window.y);
            }
            area[k] += sign(seg.vector.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
        }
        tile_seg_ref = seg.next;
    } while (tile_seg_ref.offset != 0);
    for (uint k = 0; k < CHUNK; k++) {
        area[k] = min(abs(area[k]), 1.0);
    }
    return area;
}

vec4[CHUNK] fillTexture(vec2 xy, CmdSolidTexture cmd_tex) {
    vec2 uvmin = unpackUnorm2x16(cmd_tex.uv_bounds.x);
    vec2 uvmax = unpackUnorm2x16(cmd_tex.uv_bounds.y);
    vec4 rgba[CHUNK];
    for (uint i = 0; i < CHUNK; i++) {
        float dy = float(i * CHUNK_DY);
        vec2 uv = vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5);
        uv = cmd_tex.mat.xy * uv.x + cmd_tex.mat.zw * uv.y + cmd_tex.translate;
        uv = clamp(uv, uvmin, uvmax);
#ifdef VULKAN
        vec4 fg_rgba = textureGrad(textures[0], uv, cmd_tex.mat.xy, cmd_tex.mat.zw);
#else
        vec4 fg_rgba = textureGrad(atlas, uv, cmd_tex.mat.xy, cmd_tex.mat.zw);
#endif
        rgba[i] = fg_rgba;
    }
    return rgba;
}

vec3 tosRGB(vec3 rgb) {
    bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308));
    vec3 below = vec3(12.92)*rgb;
    vec3 above = vec3(1.055)*pow(rgb, vec3(0.41666)) - vec3(0.055);
    return mix(below, above, cutoff);
}

// unpacksRGB unpacks a color in the sRGB color space to a vec4 in the linear color
// space.
vec4 unpacksRGB(uint srgba) {
    vec4 color = unpackUnorm4x8(srgba).wzyx;
    // Formula from EXT_sRGB.
    vec3 rgb = color.rgb;
    bvec3 cutoff = greaterThanEqual(rgb, vec3(0.04045));
    vec3 below = rgb/vec3(12.92);
    vec3 above = pow((rgb + vec3(0.055))/vec3(1.055), vec3(2.4));
    rgb = mix(below, above, cutoff);
    return vec4(rgb, color.a);
}

// packsRGB packs a color in the linear color space into its 8-bit sRGB equivalent.
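// packsRGB(unpacksRGB(c)) round-trips c up to 8-bit rounding; alpha stays
// linear in both directions.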
uint packsRGB(vec4 rgba) {
    rgba = vec4(tosRGB(rgba.rgb), rgba.a);
    return packUnorm4x8(rgba.wzyx);
}

void main() {
    if (mem_error != NO_ERROR) {
        return;
    }

    uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);

    uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
    vec2 xy = vec2(xy_uint);
    vec3 rgb[CHUNK];
    float mask[CHUNK];
    uint blend_stack[BLEND_STACK_SIZE][CHUNK];
    uint blend_spill = 0;
    uint blend_sp = 0;
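    // blend_sp is the clip/blend stack depth; blend_spill counts the entries
    // currently spilled from the in-register stack to memory.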
    Alloc clip_tos = new_alloc(0, 0);
    for (uint i = 0; i < CHUNK; i++) {
        rgb[i] = vec3(0.5);
#ifdef VULKAN
        if (xy_uint.x < 1024 && xy_uint.y < 1024) {
            rgb[i] = texture(textures[gl_WorkGroupID.x / 64], vec2(xy_uint.x, xy_uint.y + CHUNK_DY * i) / 1024.0).rgb;
        }
#endif
        mask[i] = 1.0;
    }

    while (true) {
        uint tag = Cmd_tag(cmd_alloc, cmd_ref);
        if (tag == Cmd_End) {
            break;
        }
        switch (tag) {
        case Cmd_Circle:
            CmdCircle circle = Cmd_Circle_read(cmd_alloc, cmd_ref);
            vec4 fg_rgba = unpacksRGB(circle.rgba_color);
            for (uint i = 0; i < CHUNK; i++) {
                float dy = float(i * CHUNK_DY);
                float r = length(vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5) - circle.center.xy);
                float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
                rgb[i] = mix(rgb[i], fg_rgba.rgb, mask[i] * alpha * fg_rgba.a);
            }
            break;
        case Cmd_Stroke:
            // Calculate distance field from all the line segments in this tile.
            CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref);
            float df[CHUNK];
            for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
            TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
            do {
                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
                vec2 line_vec = seg.vector;
                for (uint k = 0; k < CHUNK; k++) {
                    vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
                    dpos.y += float(k * CHUNK_DY);
                    float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
                    df[k] = min(df[k], length(line_vec * t - dpos));
                }
                tile_seg_ref = seg.next;
            } while (tile_seg_ref.offset != 0);
            fg_rgba = unpacksRGB(stroke.rgba_color);
            for (uint k = 0; k < CHUNK; k++) {
                float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
                rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * alpha * fg_rgba.a);
            }
            break;
        case Cmd_Fill:
            CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref);
            float area[CHUNK];
            area = computeArea(xy, fill.backdrop, fill.tile_ref);
            fg_rgba = unpacksRGB(fill.rgba_color);
            for (uint k = 0; k < CHUNK; k++) {
                rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * area[k] * fg_rgba.a);
            }
            break;
        case Cmd_FillTexture:
            CmdFillTexture fill_tex = Cmd_FillTexture_read(cmd_alloc, cmd_ref);
            area = computeArea(xy, fill_tex.backdrop, fill_tex.tile_ref);
            vec4 rgba[CHUNK] = fillTexture(xy, CmdSolidTexture(fill_tex.mat, fill_tex.translate, fill_tex.uv_bounds));
            for (uint k = 0; k < CHUNK; k++) {
                rgb[k] = mix(rgb[k], rgba[k].rgb, mask[k] * area[k] * rgba[k].a);
            }
            break;
        case Cmd_BeginClip:
        case Cmd_BeginSolidClip:
            uint blend_slot = blend_sp % BLEND_STACK_SIZE;
            if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
                // spill to scratch buffer
                MallocResult m = alloc_clip_buf(clip_tos.offset);
                if (m.failed) {
                    return;
                }
                clip_tos = m.alloc;
                uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
                for (uint k = 0; k < CHUNK; k++) {
                    write_mem(clip_tos, base_ix + k * TILE_WIDTH_PX * CHUNK_DY, blend_stack[blend_slot][k]);
                }
                blend_spill++;
            }
            if (tag == Cmd_BeginClip) {
                CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_alloc, cmd_ref);
                area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
                for (uint k = 0; k < CHUNK; k++) {
                    blend_stack[blend_slot][k] = packsRGB(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
                }
            } else {
                CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_alloc, cmd_ref);
                float solid_alpha = begin_solid_clip.alpha;
                for (uint k = 0; k < CHUNK; k++) {
                    blend_stack[blend_slot][k] = packsRGB(vec4(rgb[k], solid_alpha));
                }
            }
            blend_sp++;
            break;
        case Cmd_EndClip:
            CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
            blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
            if (blend_sp == blend_spill) {
                uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
                for (uint k = 0; k < CHUNK; k++) {
                    blend_stack[blend_slot][k] = read_mem(clip_tos, base_ix + k * TILE_WIDTH_PX * CHUNK_DY);
                }
                clip_tos.offset = read_mem(clip_tos, (clip_tos.offset >> 2) + CLIP_LINK_OFFSET);
                blend_spill--;
            }
            blend_sp--;
            for (uint k = 0; k < CHUNK; k++) {
                vec4 rgba = unpacksRGB(blend_stack[blend_slot][k]);
                rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a);
            }
            break;
        case Cmd_Solid:
            CmdSolid solid = Cmd_Solid_read(cmd_alloc, cmd_ref);
            fg_rgba = unpacksRGB(solid.rgba_color);
            for (uint k = 0; k < CHUNK; k++) {
                rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * fg_rgba.a);
            }
            break;
        case Cmd_SolidTexture:
            CmdSolidTexture solid_tex = Cmd_SolidTexture_read(cmd_alloc, cmd_ref);
            rgba = fillTexture(xy, solid_tex);
            for (uint k = 0; k < CHUNK; k++) {
                rgb[k] = mix(rgb[k], rgba[k].rgb, mask[k] * rgba[k].a);
            }
            break;
        case Cmd_SolidMask:
            CmdSolidMask solid_mask = Cmd_SolidMask_read(cmd_alloc, cmd_ref);
            for (uint k = 0; k < CHUNK; k++) {
                mask[k] = solid_mask.mask;
            }
            break;
        case Cmd_Jump:
            cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
            cmd_alloc.offset = cmd_ref.offset;
            continue;
        }
        cmd_ref.offset += Cmd_size;
    }

    for (uint i = 0; i < CHUNK; i++) {
        imageStore(image, ivec2(xy_uint.x, xy_uint.y + CHUNK_DY * i), vec4(tosRGB(rgb[i]), 1.0));
    }
}

A gpu/shaders/mem.h => gpu/shaders/mem.h +120 -0
@@ 0,0 1,120 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

layout(set = 0, binding = 0) buffer Memory {
    // offset into memory of the next allocation, initialized by the user.
    uint mem_offset;
    // mem_error tracks the status of memory accesses and is initialized to
    // NO_ERROR by the user. ERR_MALLOC_FAILED is reported for insufficient
    // memory. If MEM_DEBUG is defined, the following errors are also reported:
    // - ERR_OUT_OF_BOUNDS for out-of-bounds reads and writes.
    // - ERR_UNALIGNED_ACCESS for memory accesses not aligned to 32-bit words.
    uint mem_error;
    uint[] memory;
};

// Uncomment this line to add the size field to Alloc and enable memory checks.
// Note that the Allocs embedded in the Config struct in setup.h grow size
// fields as well, so the user-supplied Config must match.
//#define MEM_DEBUG

#define NO_ERROR 0
#define ERR_MALLOC_FAILED 1
#define ERR_OUT_OF_BOUNDS 2
#define ERR_UNALIGNED_ACCESS 3

#define Alloc_size 8

// Alloc represents a memory allocation.
struct Alloc {
    // offset in bytes into memory.
    uint offset;
#ifdef MEM_DEBUG
    // size in bytes of the allocation.
    uint size;
#endif
};

struct MallocResult {
    Alloc alloc;
    // failed is true if the allocation overflowed memory.
    bool failed;
};

// new_alloc synthesizes an Alloc from an offset and size derived elsewhere,
// without allocating new memory.
Alloc new_alloc(uint offset, uint size) {
    Alloc a;
    a.offset = offset;
#ifdef MEM_DEBUG
    a.size = size;
#endif
    return a;
}

// malloc allocates size bytes of memory.
MallocResult malloc(uint size) {
    MallocResult r;
    r.failed = false;
    uint offset = atomicAdd(mem_offset, size);
    r.alloc = new_alloc(offset, size);
    if (offset + size > memory.length() * 4) {
        r.failed = true;
        atomicMax(mem_error, ERR_MALLOC_FAILED);
        return r;
    }
#ifdef MEM_DEBUG
    if ((size & 3) != 0) {
        r.failed = true;
        atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
        return r;
    }
#endif
    return r;
}
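
// Usage sketch (illustrative only; not called from this header):
//   MallocResult m = malloc(16); // four 32-bit words
//   if (!m.failed) {
//       write_mem(m.alloc, m.alloc.offset >> 2, 42u);
//   }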

// touch_mem checks whether access to the memory word at offset is valid.
// If MEM_DEBUG is defined, it reports ERR_OUT_OF_BOUNDS and returns false
// when offset falls outside alloc; otherwise it always returns true.
// Note that offset is in words.
bool touch_mem(Alloc alloc, uint offset) {
#ifdef MEM_DEBUG
    if (offset < alloc.offset/4 || offset >= (alloc.offset + alloc.size)/4) {
        atomicMax(mem_error, ERR_OUT_OF_BOUNDS);
        return false;
    }
#endif
    return true;
}

// write_mem writes val to memory at offset.
// Note that offset is in words.
void write_mem(Alloc alloc, uint offset, uint val) {
    if (!touch_mem(alloc, offset)) {
        return;
    }
    memory[offset] = val;
}

// read_mem reads the value from memory at offset.
// Note that offset is in words.
uint read_mem(Alloc alloc, uint offset) {
    if (!touch_mem(alloc, offset)) {
        return 0;
    }
    uint v = memory[offset];
    return v;
}

// slice_mem returns a sub-allocation inside another. Note that offset and size
// are in bytes, relative to a.offset.
Alloc slice_mem(Alloc a, uint offset, uint size) {
#ifdef MEM_DEBUG
    if ((offset & 3) != 0 || (size & 3) != 0) {
        atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
        return Alloc(0, 0);
    }
    if (offset + size > a.size) {
        // slice_mem is sometimes used to form slices outside the parent
        // bounds; such slices are never written to.
        return Alloc(0, 0);
    }
#endif
    return new_alloc(a.offset + offset, size);
}
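
// For example, kernel4.comp carves the per-tile command list out of the ptcl
// allocation:
//   Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);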

A gpu/shaders/path_coarse.comp => gpu/shaders/path_coarse.comp +282 -0
@@ 0,0 1,282 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Coarse rasterization of path segments: flattens curves to lines, allocates
// tile segments and initializes the per-tile segment lists and backdrops.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_COARSE_WG 5
#define COARSE_WG (1 << LG_COARSE_WG)

layout(local_size_x = COARSE_WG, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "pathseg.h"
#include "tile.h"

// scale factors for converting pixel coordinates to tile coordinates
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))

#define ACCURACY 0.25
#define Q_ACCURACY (ACCURACY * 0.1)
#define REM_ACCURACY (ACCURACY - Q_ACCURACY)
#define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY)
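// ACCURACY is the total flattening tolerance in pixels. Q_ACCURACY is the
// share budgeted to the cubic-to-quadratic conversion (MAX_HYPOT2 bounds the
// resulting error), and REM_ACCURACY is the remainder, spent on flattening
// the quadratics to lines.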

vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) {
    float mt = 1.0 - t;
    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
}

vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) {
    float mt = 1.0 - t;
    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
}

struct SubdivResult {
    float val;
    float a0;
    float a2;
};
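// val is proportional to the number of line segments needed to flatten the
// quadratic within tolerance; a0 and a2 are the approximate parabola
// integrals at the endpoints, reused when placing the subdivision points.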

/// An approximation to $\int (1 + 4x^2)^{-1/4} \, dx$
///
/// This is used for flattening curves.
#define D 0.67
float approx_parabola_integral(float x) {
    return x * inversesqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
}

/// An approximation to the inverse parabola integral.
#define B 0.39
float approx_parabola_inv_integral(float x) {
    return x * sqrt(1.0 - B + (B * B + 0.25 * x * x));
}

SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
    vec2 d01 = p1 - p0;
    vec2 d12 = p2 - p1;
    vec2 dd = d01 - d12;
    float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
    float x0 = (d01.x * dd.x + d01.y * dd.y) / cross;
    float x2 = (d12.x * dd.x + d12.y * dd.y) / cross;
    float scale = abs(cross / (length(dd) * (x2 - x0)));

    float a0 = approx_parabola_integral(x0);
    float a2 = approx_parabola_integral(x2);
    float val = 0.0;
    if (scale < 1e9) {
        float da = abs(a2 - a0);
        float sqrt_scale = sqrt(scale);
        if (sign(x0) == sign(x2)) {
            val = da * sqrt_scale;
        } else {
            float xmin = sqrt_tol / sqrt_scale;
            val = sqrt_tol * da / approx_parabola_integral(xmin);
        }
    }
    return SubdivResult(val, a0, a2);
}

void main() {
    if (mem_error != NO_ERROR) {
        return;
    }

    uint element_ix = gl_GlobalInvocationID.x;
    PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);

    uint tag = PathSeg_Nop;
    if (element_ix < conf.n_pathseg) {
        tag = PathSeg_tag(conf.pathseg_alloc, ref);
    }
    switch (tag) {
    case PathSeg_FillCubic:
    case PathSeg_StrokeCubic:
        PathStrokeCubic cubic = PathSeg_StrokeCubic_read(conf.pathseg_alloc, ref);
        vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
        float err = err_v.x * err_v.x + err_v.y * err_v.y;
        // The number of quadratics.
        uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
        // Iterate over quadratics and tote up the estimated number of segments.
        float val = 0.0;
        vec2 qp0 = cubic.p0;
        float step = 1.0 / float(n_quads);
        for (uint i = 0; i < n_quads; i++) {
            float t = float(i + 1) * step;
            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
            SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
            val += params.val;

            qp0 = qp2;
        }
        uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);

        uint path_ix = cubic.path_ix;
        Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
        Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
        ivec4 bbox = ivec4(path.bbox);
        vec2 p0 = cubic.p0;
        qp0 = cubic.p0;
        float v_step = val / float(n);
        int n_out = 1;
        float val_sum = 0.0;
        for (uint i = 0; i < n_quads; i++) {
            float t = float(i + 1) * step;
            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
            SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
            float u0 = approx_parabola_inv_integral(params.a0);
            float u2 = approx_parabola_inv_integral(params.a2);
            float uscale = 1.0 / (u2 - u0);
            float target = float(n_out) * v_step;
            while (n_out == n || target < val_sum + params.val) {
                vec2 p1;
                if (n_out == n) {
                    p1 = cubic.p3;
                } else {
                    float u = (target - val_sum) / params.val;
                    float a = mix(params.a0, params.a2, u);
                    float au = approx_parabola_inv_integral(a);
                    float t = (au - u0) * uscale;
                    p1 = eval_quad(qp0, qp1, qp2, t);
                }

                // Output line segment

                // Bounding box of element in pixel coordinates.
                float xmin = min(p0.x, p1.x) - cubic.stroke.x;
                float xmax = max(p0.x, p1.x) + cubic.stroke.x;
                float ymin = min(p0.y, p1.y) - cubic.stroke.y;
                float ymax = max(p0.y, p1.y) + cubic.stroke.y;
                float dx = p1.x - p0.x;
                float dy = p1.y - p0.y;
                // Set up for per-scanline coverage formula, below.
                float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
                float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
                float b = invslope; // Note: assumes square tiles; non-square tiles would need a scale factor here.
                float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;

                int x0 = int(floor(xmin * SX));
                int x1 = int(floor(xmax * SX) + 1);
                int y0 = int(floor(ymin * SY));
                int y1 = int(floor(ymax * SY) + 1);

                x0 = clamp(x0, bbox.x, bbox.z);
                y0 = clamp(y0, bbox.y, bbox.w);
                x1 = clamp(x1, bbox.x, bbox.z);
                y1 = clamp(y1, bbox.y, bbox.w);
                float xc = a + b * float(y0);
                int stride = bbox.z - bbox.x;
                int base = (y0 - bbox.y) * stride - bbox.x;
                // TODO: can be tighter, use c to bound width
                uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
                // Consider using subgroups to aggregate atomic add.
                MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
                if (tile_alloc.failed) {
                    return;
                }
                uint tile_offset = tile_alloc.alloc.offset;

                TileSeg tile_seg;

                int xray = int(floor(p0.x*SX));
                int last_xray = int(floor(p1.x*SX));
                if (p0.y > p1.y) {
                    int tmp = xray;
                    xray = last_xray;
                    last_xray = tmp;
                }
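                // After the swap, xray is the tile column of the endpoint
                // with the smaller y and last_xray that of the larger y, so
                // the scanline loop below walks the segment top to bottom.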
                for (int y = y0; y < y1; y++) {
                    float tile_y0 = float(y * TILE_HEIGHT_PX);
                    int xbackdrop = max(xray + 1, bbox.x);
                    if (tag == PathSeg_FillCubic && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z) {
                        int backdrop = p1.y < p0.y ? 1 : -1;
                        TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
                        uint tile_el = tile_ref.offset >> 2;
                        if (touch_mem(path_alloc, tile_el + 1)) {
                            atomicAdd(memory[tile_el + 1], backdrop);
                        }
                    }

                    // next_xray is the xray for the next scanline; the line segment intersects
                    // all tiles between xray and next_xray.
                    int next_xray = last_xray;
                    if (y < y1 - 1) {
                        float tile_y1 = float((y + 1) * TILE_HEIGHT_PX);
                        float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
                        next_xray = int(floor(x_edge*SX));
                    }

                    int min_xray = min(xray, next_xray);
                    int max_xray = max(xray, next_xray);
                    int xx0 = min(int(floor(xc - c)), min_xray);
                    int xx1 = max(int(ceil(xc + c)), max_xray + 1);
                    xx0 = clamp(xx0, x0, x1);
                    xx1 = clamp(xx1, x0, x1);

                    for (int x = xx0; x < xx1; x++) {
                        float tile_x0 = float(x * TILE_WIDTH_PX);
                        TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
                        uint tile_el = tile_ref.offset >> 2;
                        uint old = 0;
                        if (touch_mem(path_alloc, tile_el)) {
                            old = atomicExchange(memory[tile_el], tile_offset);
                        }
                        tile_seg.origin = p0;
                        tile_seg.vector = p1 - p0;
                        float y_edge = 0.0;
                        if (tag == PathSeg_FillCubic) {
                            y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
                            if (min(p0.x, p1.x) < tile_x0) {
                                vec2 p = vec2(tile_x0, y_edge);
                                if (p0.x > p1.x) {
                                    tile_seg.vector = p - p0;
                                } else {
                                    tile_seg.origin = p;
                                    tile_seg.vector = p1 - p;
                                }
                                // kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
                                // Nudge zeroes towards the intended sign.
                                if (tile_seg.vector.x == 0) {
                                    tile_seg.vector.x = sign(p1.x - p0.x)*1e-9;
                                }
                            }
                            if (x <= min_xray || max_xray < x) {
                                // Reject inconsistent intersections.
                                y_edge = 1e9;
                            }
                        }
                        tile_seg.y_edge = y_edge;
                        tile_seg.next.offset = old;
                        TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg);
                        tile_offset += TileSeg_size;
                    }
                    xc += b;
                    base += stride;
                    xray = next_xray;
                }

                n_out += 1;
                target += v_step;
                p0 = p1;
            }
            val_sum += params.val;

            qp0 = qp2;
        }

        break;
    }
}

A gpu/shaders/pathseg.h => gpu/shaders/pathseg.h +255 -0
@@ 0,0 1,255 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct PathFillLineRef {
    uint offset;
};

struct PathStrokeLineRef {
    uint offset;
};

struct PathFillCubicRef {
    uint offset;
};

struct PathStrokeCubicRef {
    uint offset;
};

struct PathSegRef {
    uint offset;
};

struct PathFillLine {
    vec2 p0;
    vec2 p1;
    uint path_ix;
};

#define PathFillLine_size 20

PathFillLineRef PathFillLine_index(PathFillLineRef ref, uint index) {
    return PathFillLineRef(ref.offset + index * PathFillLine_size);
}

struct PathStrokeLine {
    vec2 p0;
    vec2 p1;
    uint path_ix;
    vec2 stroke;
};

#define PathStrokeLine_size 28

PathStrokeLineRef PathStrokeLine_index(PathStrokeLineRef ref, uint index) {
    return PathStrokeLineRef(ref.offset + index * PathStrokeLine_size);
}

struct PathFillCubic {
    vec2 p0;
    vec2 p1;
    vec2 p2;
    vec2 p3;
    uint path_ix;
};

#define PathFillCubic_size 36

PathFillCubicRef PathFillCubic_index(PathFillCubicRef ref, uint index) {
    return PathFillCubicRef(ref.offset + index * PathFillCubic_size);
}

struct PathStrokeCubic {
    vec2 p0;
    vec2 p1;
    vec2 p2;
    vec2 p3;
    uint path_ix;
    vec2 stroke;
};

#define PathStrokeCubic_size 44

PathStrokeCubicRef PathStrokeCubic_index(PathStrokeCubicRef ref, uint index) {
    return PathStrokeCubicRef(ref.offset + index * PathStrokeCubic_size);
}

#define PathSeg_Nop 0
#define PathSeg_FillLine 1
#define PathSeg_StrokeLine 2
#define PathSeg_FillCubic 3
#define PathSeg_StrokeCubic 4
#define PathSeg_size 48
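// PathSeg_size is the 4-byte tag plus the payload of the largest variant,
// PathStrokeCubic (44 bytes).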

PathSegRef PathSeg_index(PathSegRef ref, uint index) {
    return PathSegRef(ref.offset + index * PathSeg_size);
}

PathFillLine PathFillLine_read(Alloc a, PathFillLineRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    PathFillLine s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.path_ix = raw4;
    return s;
}

void PathFillLine_write(Alloc a, PathFillLineRef ref, PathFillLine s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
    write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
    write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
    write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
    write_mem(a, ix + 4, s.path_ix);
}

PathStrokeLine PathStrokeLine_read(Alloc a, PathStrokeLineRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    uint raw6 = read_mem(a, ix + 6);
    PathStrokeLine s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.path_ix = raw4;
    s.stroke = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6));
    return s;
}

void PathStrokeLine_write(Alloc a, PathStrokeLineRef ref, PathStrokeLine s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
    write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
    write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
    write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
    write_mem(a, ix + 4, s.path_ix);
    write_mem(a, ix + 5, floatBitsToUint(s.stroke.x));
    write_mem(a, ix + 6, floatBitsToUint(s.stroke.y));
}

PathFillCubic PathFillCubic_read(Alloc a, PathFillCubicRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    uint raw6 = read_mem(a, ix + 6);
    uint raw7 = read_mem(a, ix + 7);
    uint raw8 = read_mem(a, ix + 8);
    PathFillCubic s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
    s.path_ix = raw8;
    return s;
}

void PathFillCubic_write(Alloc a, PathFillCubicRef ref, PathFillCubic s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
    write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
    write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
    write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
    write_mem(a, ix + 4, floatBitsToUint(s.p2.x));
    write_mem(a, ix + 5, floatBitsToUint(s.p2.y));
    write_mem(a, ix + 6, floatBitsToUint(s.p3.x));
    write_mem(a, ix + 7, floatBitsToUint(s.p3.y));
    write_mem(a, ix + 8, s.path_ix);
}

PathStrokeCubic PathStrokeCubic_read(Alloc a, PathStrokeCubicRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    uint raw6 = read_mem(a, ix + 6);
    uint raw7 = read_mem(a, ix + 7);
    uint raw8 = read_mem(a, ix + 8);
    uint raw9 = read_mem(a, ix + 9);
    uint raw10 = read_mem(a, ix + 10);
    PathStrokeCubic s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
    s.path_ix = raw8;
    s.stroke = vec2(uintBitsToFloat(raw9), uintBitsToFloat(raw10));
    return s;
}

void PathStrokeCubic_write(Alloc a, PathStrokeCubicRef ref, PathStrokeCubic s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
    write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
    write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
    write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
    write_mem(a, ix + 4, floatBitsToUint(s.p2.x));
    write_mem(a, ix + 5, floatBitsToUint(s.p2.y));
    write_mem(a, ix + 6, floatBitsToUint(s.p3.x));
    write_mem(a, ix + 7, floatBitsToUint(s.p3.y));
    write_mem(a, ix + 8, s.path_ix);
    write_mem(a, ix + 9, floatBitsToUint(s.stroke.x));
    write_mem(a, ix + 10, floatBitsToUint(s.stroke.y));
}

uint PathSeg_tag(Alloc a, PathSegRef ref) {
    return read_mem(a, ref.offset >> 2);
}

PathFillLine PathSeg_FillLine_read(Alloc a, PathSegRef ref) {
    return PathFillLine_read(a, PathFillLineRef(ref.offset + 4));
}

PathStrokeLine PathSeg_StrokeLine_read(Alloc a, PathSegRef ref) {
    return PathStrokeLine_read(a, PathStrokeLineRef(ref.offset + 4));
}

PathFillCubic PathSeg_FillCubic_read(Alloc a, PathSegRef ref) {
    return PathFillCubic_read(a, PathFillCubicRef(ref.offset + 4));
}

PathStrokeCubic PathSeg_StrokeCubic_read(Alloc a, PathSegRef ref) {
    return PathStrokeCubic_read(a, PathStrokeCubicRef(ref.offset + 4));
}

void PathSeg_Nop_write(Alloc a, PathSegRef ref) {
    write_mem(a, ref.offset >> 2, PathSeg_Nop);
}

void PathSeg_FillLine_write(Alloc a, PathSegRef ref, PathFillLine s) {
    write_mem(a, ref.offset >> 2, PathSeg_FillLine);
    PathFillLine_write(a, PathFillLineRef(ref.offset + 4), s);
}

void PathSeg_StrokeLine_write(Alloc a, PathSegRef ref, PathStrokeLine s) {
    write_mem(a, ref.offset >> 2, PathSeg_StrokeLine);
    PathStrokeLine_write(a, PathStrokeLineRef(ref.offset + 4), s);
}

void PathSeg_FillCubic_write(Alloc a, PathSegRef ref, PathFillCubic s) {
    write_mem(a, ref.offset >> 2, PathSeg_FillCubic);
    PathFillCubic_write(a, PathFillCubicRef(ref.offset + 4), s);
}

void PathSeg_StrokeCubic_write(Alloc a, PathSegRef ref, PathStrokeCubic s) {
    write_mem(a, ref.offset >> 2, PathSeg_StrokeCubic);
    PathStrokeCubic_write(a, PathStrokeCubicRef(ref.offset + 4), s);
}


A gpu/shaders/ptcl.h => gpu/shaders/ptcl.h +549 -0
@@ 0,0 1,549 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct CmdCircleRef {
    uint offset;
};

struct CmdLineRef {
    uint offset;
};

struct CmdStrokeRef {
    uint offset;
};

struct CmdFillRef {
    uint offset;
};

struct CmdFillTextureRef {
    uint offset;
};

struct CmdBeginClipRef {
    uint offset;
};

struct CmdBeginSolidClipRef {
    uint offset;
};

struct CmdEndClipRef {
    uint offset;
};

struct CmdSolidRef {
    uint offset;
};

struct CmdSolidTextureRef {
    uint offset;
};

struct CmdSolidMaskRef {
    uint offset;
};

struct CmdJumpRef {
    uint offset;
};

struct CmdRef {
    uint offset;
};

struct CmdCircle {
    vec2 center;
    float radius;
    uint rgba_color;
};

#define CmdCircle_size 16

CmdCircleRef CmdCircle_index(CmdCircleRef ref, uint index) {
    return CmdCircleRef(ref.offset + index * CmdCircle_size);
}

struct CmdLine {
    vec2 start;
    vec2 end;
};

#define CmdLine_size 16

CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
    return CmdLineRef(ref.offset + index * CmdLine_size);
}

struct CmdStroke {
    uint tile_ref;
    float half_width;
    uint rgba_color;
};

#define CmdStroke_size 12

CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
    return CmdStrokeRef(ref.offset + index * CmdStroke_size);
}

struct CmdFill {
    uint tile_ref;
    int backdrop;
    uint rgba_color;
};

#define CmdFill_size 12

CmdFillRef CmdFill_index(CmdFillRef ref, uint index) {
    return CmdFillRef(ref.offset + index * CmdFill_size);
}

struct CmdFillTexture {
    uint tile_ref;
    int backdrop;
    vec4 mat;
    vec2 translate;
    uvec2 uv_bounds;
};

#define CmdFillTexture_size 40

CmdFillTextureRef CmdFillTexture_index(CmdFillTextureRef ref, uint index) {
    return CmdFillTextureRef(ref.offset + index * CmdFillTexture_size);
}

struct CmdBeginClip {
    uint tile_ref;
    int backdrop;
};

#define CmdBeginClip_size 8

CmdBeginClipRef CmdBeginClip_index(CmdBeginClipRef ref, uint index) {
    return CmdBeginClipRef(ref.offset + index * CmdBeginClip_size);
}

struct CmdBeginSolidClip {
    float alpha;
};

#define CmdBeginSolidClip_size 4

CmdBeginSolidClipRef CmdBeginSolidClip_index(CmdBeginSolidClipRef ref, uint index) {
    return CmdBeginSolidClipRef(ref.offset + index * CmdBeginSolidClip_size);
}

struct CmdEndClip {
    float alpha;
};

#define CmdEndClip_size 4

CmdEndClipRef CmdEndClip_index(CmdEndClipRef ref, uint index) {
    return CmdEndClipRef(ref.offset + index * CmdEndClip_size);
}

struct CmdSolid {
    uint rgba_color;
};

#define CmdSolid_size 4

CmdSolidRef CmdSolid_index(CmdSolidRef ref, uint index) {
    return CmdSolidRef(ref.offset + index * CmdSolid_size);
}

struct CmdSolidTexture {
    vec4 mat;
    vec2 translate;
    uvec2 uv_bounds;
};

#define CmdSolidTexture_size 32

CmdSolidTextureRef CmdSolidTexture_index(CmdSolidTextureRef ref, uint index) {
    return CmdSolidTextureRef(ref.offset + index * CmdSolidTexture_size);
}

struct CmdSolidMask {
    float mask;
};

#define CmdSolidMask_size 4

CmdSolidMaskRef CmdSolidMask_index(CmdSolidMaskRef ref, uint index) {
    return CmdSolidMaskRef(ref.offset + index * CmdSolidMask_size);
}

struct CmdJump {
    uint new_ref;
};

#define CmdJump_size 4

CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) {
    return CmdJumpRef(ref.offset + index * CmdJump_size);
}

#define Cmd_End 0
#define Cmd_Circle 1
#define Cmd_Line 2
#define Cmd_Fill 3
#define Cmd_FillTexture 4
#define Cmd_BeginClip 5
#define Cmd_BeginSolidClip 6
#define Cmd_EndClip 7
#define Cmd_Stroke 8
#define Cmd_Solid 9
#define Cmd_SolidMask 10
#define Cmd_SolidTexture 11
#define Cmd_Jump 12
#define Cmd_size 44
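// Cmd_size is the 4-byte tag plus the payload of the largest variant,
// CmdFillTexture (40 bytes).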

CmdRef Cmd_index(CmdRef ref, uint index) {
    return CmdRef(ref.offset + index * Cmd_size);
}

CmdCircle CmdCircle_read(Alloc a, CmdCircleRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    CmdCircle s;
    s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.radius = uintBitsToFloat(raw2);
    s.rgba_color = raw3;
    return s;
}

void CmdCircle_write(Alloc a, CmdCircleRef ref, CmdCircle s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.center.x));
    write_mem(a, ix + 1, floatBitsToUint(s.center.y));
    write_mem(a, ix + 2, floatBitsToUint(s.radius));
    write_mem(a, ix + 3, s.rgba_color);
}

CmdLine CmdLine_read(Alloc a, CmdLineRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    CmdLine s;
    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    return s;
}

void CmdLine_write(Alloc a, CmdLineRef ref, CmdLine s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.start.x));
    write_mem(a, ix + 1, floatBitsToUint(s.start.y));
    write_mem(a, ix + 2, floatBitsToUint(s.end.x));
    write_mem(a, ix + 3, floatBitsToUint(s.end.y));
}

CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    CmdStroke s;
    s.tile_ref = raw0;
    s.half_width = uintBitsToFloat(raw1);
    s.rgba_color = raw2;
    return s;
}

void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.tile_ref);
    write_mem(a, ix + 1, floatBitsToUint(s.half_width));
    write_mem(a, ix + 2, s.rgba_color);
}

CmdFill CmdFill_read(Alloc a, CmdFillRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    CmdFill s;
    s.tile_ref = raw0;
    s.backdrop = int(raw1);
    s.rgba_color = raw2;
    return s;
}

void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.tile_ref);
    write_mem(a, ix + 1, uint(s.backdrop));
    write_mem(a, ix + 2, s.rgba_color);
}

CmdFillTexture CmdFillTexture_read(Alloc a, CmdFillTextureRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    uint raw6 = read_mem(a, ix + 6);
    uint raw7 = read_mem(a, ix + 7);
    uint raw8 = read_mem(a, ix + 8);
    uint raw9 = read_mem(a, ix + 9);
    CmdFillTexture s;
    s.tile_ref = raw0;
    s.backdrop = int(raw1);
    s.mat = vec4(uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.translate = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
    s.uv_bounds = uvec2(raw8, raw9);
    return s;
}

void CmdFillTexture_write(Alloc a, CmdFillTextureRef ref, CmdFillTexture s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.tile_ref);
    write_mem(a, ix + 1, uint(s.backdrop));
    write_mem(a, ix + 2, floatBitsToUint(s.mat.x));
    write_mem(a, ix + 3, floatBitsToUint(s.mat.y));
    write_mem(a, ix + 4, floatBitsToUint(s.mat.z));
    write_mem(a, ix + 5, floatBitsToUint(s.mat.w));
    write_mem(a, ix + 6, floatBitsToUint(s.translate.x));
    write_mem(a, ix + 7, floatBitsToUint(s.translate.y));
    write_mem(a, ix + 8, s.uv_bounds.x);
    write_mem(a, ix + 9, s.uv_bounds.y);
}

CmdBeginClip CmdBeginClip_read(Alloc a, CmdBeginClipRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    CmdBeginClip s;
    s.tile_ref = raw0;
    s.backdrop = int(raw1);
    return s;
}

void CmdBeginClip_write(Alloc a, CmdBeginClipRef ref, CmdBeginClip s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.tile_ref);
    write_mem(a, ix + 1, uint(s.backdrop));
}

CmdBeginSolidClip CmdBeginSolidClip_read(Alloc a, CmdBeginSolidClipRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    CmdBeginSolidClip s;
    s.alpha = uintBitsToFloat(raw0);
    return s;
}

void CmdBeginSolidClip_write(Alloc a, CmdBeginSolidClipRef ref, CmdBeginSolidClip s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.alpha));
}

CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    CmdEndClip s;
    s.alpha = uintBitsToFloat(raw0);
    return s;
}

void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.alpha));
}

CmdSolid CmdSolid_read(Alloc a, CmdSolidRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    CmdSolid s;
    s.rgba_color = raw0;
    return s;
}

void CmdSolid_write(Alloc a, CmdSolidRef ref, CmdSolid s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.rgba_color);
}

CmdSolidTexture CmdSolidTexture_read(Alloc a, CmdSolidTextureRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    uint raw6 = read_mem(a, ix + 6);
    uint raw7 = read_mem(a, ix + 7);
    CmdSolidTexture s;
    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.uv_bounds = uvec2(raw6, raw7);
    return s;
}

void CmdSolidTexture_write(Alloc a, CmdSolidTextureRef ref, CmdSolidTexture s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.mat.x));
    write_mem(a, ix + 1, floatBitsToUint(s.mat.y));
    write_mem(a, ix + 2, floatBitsToUint(s.mat.z));
    write_mem(a, ix + 3, floatBitsToUint(s.mat.w));
    write_mem(a, ix + 4, floatBitsToUint(s.translate.x));
    write_mem(a, ix + 5, floatBitsToUint(s.translate.y));
    write_mem(a, ix + 6, s.uv_bounds.x);
    write_mem(a, ix + 7, s.uv_bounds.y);
}

CmdSolidMask CmdSolidMask_read(Alloc a, CmdSolidMaskRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    CmdSolidMask s;
    s.mask = uintBitsToFloat(raw0);
    return s;
}

void CmdSolidMask_write(Alloc a, CmdSolidMaskRef ref, CmdSolidMask s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.mask));
}

CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    CmdJump s;
    s.new_ref = raw0;
    return s;
}

void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.new_ref);
}

uint Cmd_tag(Alloc a, CmdRef ref) {
    return read_mem(a, ref.offset >> 2);
}

CmdCircle Cmd_Circle_read(Alloc a, CmdRef ref) {
    return CmdCircle_read(a, CmdCircleRef(ref.offset + 4));
}

CmdLine Cmd_Line_read(Alloc a, CmdRef ref) {
    return CmdLine_read(a, CmdLineRef(ref.offset + 4));
}

CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) {
    return CmdFill_read(a, CmdFillRef(ref.offset + 4));
}

CmdFillTexture Cmd_FillTexture_read(Alloc a, CmdRef ref) {
    return CmdFillTexture_read(a, CmdFillTextureRef(ref.offset + 4));
}

CmdBeginClip Cmd_BeginClip_read(Alloc a, CmdRef ref) {
    return CmdBeginClip_read(a, CmdBeginClipRef(ref.offset + 4));
}

CmdBeginSolidClip Cmd_BeginSolidClip_read(Alloc a, CmdRef ref) {
    return CmdBeginSolidClip_read(a, CmdBeginSolidClipRef(ref.offset + 4));
}

CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) {
    return CmdEndClip_read(a, CmdEndClipRef(ref.offset + 4));
}

CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) {
    return CmdStroke_read(a, CmdStrokeRef(ref.offset + 4));
}

CmdSolid Cmd_Solid_read(Alloc a, CmdRef ref) {
    return CmdSolid_read(a, CmdSolidRef(ref.offset + 4));
}

CmdSolidMask Cmd_SolidMask_read(Alloc a, CmdRef ref) {
    return CmdSolidMask_read(a, CmdSolidMaskRef(ref.offset + 4));
}

CmdSolidTexture Cmd_SolidTexture_read(Alloc a, CmdRef ref) {
    return CmdSolidTexture_read(a, CmdSolidTextureRef(ref.offset + 4));
}

CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) {
    return CmdJump_read(a, CmdJumpRef(ref.offset + 4));
}

void Cmd_End_write(Alloc a, CmdRef ref) {
    write_mem(a, ref.offset >> 2, Cmd_End);
}

void Cmd_Circle_write(Alloc a, CmdRef ref, CmdCircle s) {
    write_mem(a, ref.offset >> 2, Cmd_Circle);
    CmdCircle_write(a, CmdCircleRef(ref.offset + 4), s);
}

void Cmd_Line_write(Alloc a, CmdRef ref, CmdLine s) {
    write_mem(a, ref.offset >> 2, Cmd_Line);
    CmdLine_write(a, CmdLineRef(ref.offset + 4), s);
}

void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) {
    write_mem(a, ref.offset >> 2, Cmd_Fill);
    CmdFill_write(a, CmdFillRef(ref.offset + 4), s);
}

void Cmd_FillTexture_write(Alloc a, CmdRef ref, CmdFillTexture s) {
    write_mem(a, ref.offset >> 2, Cmd_FillTexture);
    CmdFillTexture_write(a, CmdFillTextureRef(ref.offset + 4), s);
}

void Cmd_BeginClip_write(Alloc a, CmdRef ref, CmdBeginClip s) {
    write_mem(a, ref.offset >> 2, Cmd_BeginClip);
    CmdBeginClip_write(a, CmdBeginClipRef(ref.offset + 4), s);
}

void Cmd_BeginSolidClip_write(Alloc a, CmdRef ref, CmdBeginSolidClip s) {
    write_mem(a, ref.offset >> 2, Cmd_BeginSolidClip);
    CmdBeginSolidClip_write(a, CmdBeginSolidClipRef(ref.offset + 4), s);
}

void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) {
    write_mem(a, ref.offset >> 2, Cmd_EndClip);
    CmdEndClip_write(a, CmdEndClipRef(ref.offset + 4), s);
}

void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) {
    write_mem(a, ref.offset >> 2, Cmd_Stroke);
    CmdStroke_write(a, CmdStrokeRef(ref.offset + 4), s);
}

void Cmd_Solid_write(Alloc a, CmdRef ref, CmdSolid s) {
    write_mem(a, ref.offset >> 2, Cmd_Solid);
    CmdSolid_write(a, CmdSolidRef(ref.offset + 4), s);
}

void Cmd_SolidMask_write(Alloc a, CmdRef ref, CmdSolidMask s) {
    write_mem(a, ref.offset >> 2, Cmd_SolidMask);
    CmdSolidMask_write(a, CmdSolidMaskRef(ref.offset + 4), s);
}

void Cmd_SolidTexture_write(Alloc a, CmdRef ref, CmdSolidTexture s) {
    write_mem(a, ref.offset >> 2, Cmd_SolidTexture);
    CmdSolidTexture_write(a, CmdSolidTextureRef(ref.offset + 4), s);
}

void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) {
    write_mem(a, ref.offset >> 2, Cmd_Jump);
    CmdJump_write(a, CmdJumpRef(ref.offset + 4), s);
}


A gpu/shaders/scene.h => gpu/shaders/scene.h +320 -0
@@ 0,0 1,320 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct LineSegRef {
    uint offset;
};

struct QuadSegRef {
    uint offset;
};

struct CubicSegRef {
    uint offset;
};

struct FillRef {
    uint offset;
};

struct FillTextureRef {
    uint offset;
};

struct StrokeRef {
    uint offset;
};

struct SetLineWidthRef {
    uint offset;
};

struct TransformRef {
    uint offset;
};

struct ClipRef {
    uint offset;
};

struct ElementRef {
    uint offset;
};

struct LineSeg {
    vec2 p0;
    vec2 p1;
};

#define LineSeg_size 16

LineSegRef LineSeg_index(LineSegRef ref, uint index) {
    return LineSegRef(ref.offset + index * LineSeg_size);
}

struct QuadSeg {
    vec2 p0;
    vec2 p1;
    vec2 p2;
};

#define QuadSeg_size 24

QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) {
    return QuadSegRef(ref.offset + index * QuadSeg_size);
}

struct CubicSeg {
    vec2 p0;
    vec2 p1;
    vec2 p2;
    vec2 p3;
};

#define CubicSeg_size 32

CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) {
    return CubicSegRef(ref.offset + index * CubicSeg_size);
}

struct Fill {
    uint rgba_color;
};

#define Fill_size 4

FillRef Fill_index(FillRef ref, uint index) {
    return FillRef(ref.offset + index * Fill_size);
}

struct FillTexture {
    uvec2 uv_bounds;
};

#define FillTexture_size 8

FillTextureRef FillTexture_index(FillTextureRef ref, uint index) {
    return FillTextureRef(ref.offset + index * FillTexture_size);
}

struct Stroke {
    uint rgba_color;
};

#define Stroke_size 4

StrokeRef Stroke_index(StrokeRef ref, uint index) {
    return StrokeRef(ref.offset + index * Stroke_size);
}

struct SetLineWidth {
    float width;
};

#define SetLineWidth_size 4

SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) {
    return SetLineWidthRef(ref.offset + index * SetLineWidth_size);
}

struct Transform {
    vec4 mat;
    vec2 translate;
};

#define Transform_size 24

TransformRef Transform_index(TransformRef ref, uint index) {
    return TransformRef(ref.offset + index * Transform_size);
}

struct Clip {
    vec4 bbox;
};

#define Clip_size 16

ClipRef Clip_index(ClipRef ref, uint index) {
    return ClipRef(ref.offset + index * Clip_size);
}

#define Element_Nop 0
#define Element_StrokeLine 1
#define Element_FillLine 2
#define Element_StrokeQuad 3
#define Element_FillQuad 4
#define Element_StrokeCubic 5
#define Element_FillCubic 6
#define Element_Stroke 7
#define Element_Fill 8
#define Element_SetLineWidth 9
#define Element_Transform 10
#define Element_BeginClip 11
#define Element_EndClip 12
#define Element_FillTexture 13
#define Element_size 36

ElementRef Element_index(ElementRef ref, uint index) {
    return ElementRef(ref.offset + index * Element_size);
}

LineSeg LineSeg_read(LineSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    LineSeg s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    return s;
}

QuadSeg QuadSeg_read(QuadSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    uint raw4 = scene[ix + 4];
    uint raw5 = scene[ix + 5];
    QuadSeg s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    return s;
}

CubicSeg CubicSeg_read(CubicSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    uint raw4 = scene[ix + 4];
    uint raw5 = scene[ix + 5];
    uint raw6 = scene[ix + 6];
    uint raw7 = scene[ix + 7];
    CubicSeg s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
    return s;
}

Fill Fill_read(FillRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    Fill s;
    s.rgba_color = raw0;
    return s;
}

FillTexture FillTexture_read(FillTextureRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    FillTexture s;
    s.uv_bounds = uvec2(raw0, raw1);
    return s;
}

Stroke Stroke_read(StrokeRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    Stroke s;
    s.rgba_color = raw0;
    return s;
}

SetLineWidth SetLineWidth_read(SetLineWidthRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    SetLineWidth s;
    s.width = uintBitsToFloat(raw0);
    return s;
}

Transform Transform_read(TransformRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    uint raw4 = scene[ix + 4];
    uint raw5 = scene[ix + 5];
    Transform s;
    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    return s;
}

Clip Clip_read(ClipRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
    uint raw1 = scene[ix + 1];
    uint raw2 = scene[ix + 2];
    uint raw3 = scene[ix + 3];
    Clip s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    return s;
}

uint Element_tag(ElementRef ref) {
    return scene[ref.offset >> 2];
}

LineSeg Element_StrokeLine_read(ElementRef ref) {
    return LineSeg_read(LineSegRef(ref.offset + 4));
}

LineSeg Element_FillLine_read(ElementRef ref) {
    return LineSeg_read(LineSegRef(ref.offset + 4));
}

QuadSeg Element_StrokeQuad_read(ElementRef ref) {
    return QuadSeg_read(QuadSegRef(ref.offset + 4));
}

QuadSeg Element_FillQuad_read(ElementRef ref) {
    return QuadSeg_read(QuadSegRef(ref.offset + 4));
}

CubicSeg Element_StrokeCubic_read(ElementRef ref) {
    return CubicSeg_read(CubicSegRef(ref.offset + 4));
}

CubicSeg Element_FillCubic_read(ElementRef ref) {
    return CubicSeg_read(CubicSegRef(ref.offset + 4));
}

Stroke Element_Stroke_read(ElementRef ref) {
    return Stroke_read(StrokeRef(ref.offset + 4));
}

Fill Element_Fill_read(ElementRef ref) {
    return Fill_read(FillRef(ref.offset + 4));
}

SetLineWidth Element_SetLineWidth_read(ElementRef ref) {
    return SetLineWidth_read(SetLineWidthRef(ref.offset + 4));
}

Transform Element_Transform_read(ElementRef ref) {
    return Transform_read(TransformRef(ref.offset + 4));
}

Clip Element_BeginClip_read(ElementRef ref) {
    return Clip_read(ClipRef(ref.offset + 4));
}

Clip Element_EndClip_read(ElementRef ref) {
    return Clip_read(ClipRef(ref.offset + 4));
}

FillTexture Element_FillTexture_read(ElementRef ref) {
    return FillTexture_read(FillTextureRef(ref.offset + 4));
}


A gpu/shaders/setup.h => gpu/shaders/setup.h +38 -0
@@ 0,0 1,38 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Various constants for the sizes of groups and tiles.

// Much of this will be made dynamic in various ways, but for now it's easiest
// to hardcode everything and keep it in one place.

// A LG_WG_FACTOR of n scales workgroup sizes by 2^n. Use 0 for a
// maximum workgroup size of 128, or 1 for a maximum size of 256.
#define LG_WG_FACTOR 0
#define WG_FACTOR (1<<LG_WG_FACTOR)

#define TILE_WIDTH_PX 32
#define TILE_HEIGHT_PX 32

#define PTCL_INITIAL_ALLOC 1024

// These should probably be renamed and/or reworked. In the binning
// kernel, they represent the number of bins. Also, the workgroup size
// of that kernel is equal to the number of bins, but should probably
// be more flexible (it's 512 in the K&L paper).
#define N_TILE_X 16
#define N_TILE_Y (8 * WG_FACTOR)
#define N_TILE (N_TILE_X * N_TILE_Y)
#define LG_N_TILE (7 + LG_WG_FACTOR)
#define N_SLICE (N_TILE / 32)
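// With LG_WG_FACTOR 0 these evaluate to N_TILE_X 16, N_TILE_Y 8, N_TILE 128,
// LG_N_TILE 7 and N_SLICE 4.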

struct Config {
    uint n_elements; // paths
    uint n_pathseg;
    uint width_in_tiles;
    uint height_in_tiles;
    Alloc tile_alloc;
    Alloc bin_alloc;
    Alloc ptcl_alloc;
    Alloc pathseg_alloc;
    Alloc anno_alloc;
};

A gpu/shaders/state.h => gpu/shaders/state.h +69 -0
@@ 0,0 1,69 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct StateRef {
    uint offset;
};

struct State {
    vec4 mat;
    vec2 translate;
    vec4 bbox;
    float linewidth;
    uint flags;
    uint path_count;
    uint pathseg_count;
};

#define State_size 56
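
// The 56-byte size is 14 words: mat (4) + translate (2) + bbox (4) +
// linewidth (1) + flags (1) + path_count (1) + pathseg_count (1),
// matching the 14 raw words in State_read/State_write below.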

StateRef State_index(StateRef ref, uint index) {
    return StateRef(ref.offset + index * State_size);
}

State State_read(StateRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = state[ix + 0];
    uint raw1 = state[ix + 1];
    uint raw2 = state[ix + 2];
    uint raw3 = state[ix + 3];
    uint raw4 = state[ix + 4];
    uint raw5 = state[ix + 5];
    uint raw6 = state[ix + 6];
    uint raw7 = state[ix + 7];
    uint raw8 = state[ix + 8];
    uint raw9 = state[ix + 9];
    uint raw10 = state[ix + 10];
    uint raw11 = state[ix + 11];
    uint raw12 = state[ix + 12];
    uint raw13 = state[ix + 13];
    State s;
    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
    s.linewidth = uintBitsToFloat(raw10);
    s.flags = raw11;
    s.path_count = raw12;
    s.pathseg_count = raw13;
    return s;
}

void State_write(StateRef ref, State s) {
    uint ix = ref.offset >> 2;
    state[ix + 0] = floatBitsToUint(s.mat.x);
    state[ix + 1] = floatBitsToUint(s.mat.y);
    state[ix + 2] = floatBitsToUint(s.mat.z);
    state[ix + 3] = floatBitsToUint(s.mat.w);
    state[ix + 4] = floatBitsToUint(s.translate.x);
    state[ix + 5] = floatBitsToUint(s.translate.y);
    state[ix + 6] = floatBitsToUint(s.bbox.x);
    state[ix + 7] = floatBitsToUint(s.bbox.y);
    state[ix + 8] = floatBitsToUint(s.bbox.z);
    state[ix + 9] = floatBitsToUint(s.bbox.w);
    state[ix + 10] = floatBitsToUint(s.linewidth);
    state[ix + 11] = s.flags;
    state[ix + 12] = s.path_count;
    state[ix + 13] = s.pathseg_count;
}


A gpu/shaders/tile.h => gpu/shaders/tile.h +111 -0
@@ 0,0 1,111 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Code auto-generated by piet-gpu-derive

struct PathRef {
    uint offset;
};

struct TileRef {
    uint offset;
};

struct TileSegRef {
    uint offset;
};

struct Path {
    uvec4 bbox;
    TileRef tiles;
};

#define Path_size 12
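
// Path_size is 3 words: the bbox is stored as four 16-bit tile coordinates
// packed two per word, followed by one word for the tiles offset (see
// Path_read/Path_write below).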

PathRef Path_index(PathRef ref, uint index) {
    return PathRef(ref.offset + index * Path_size);
}

struct Tile {
    TileSegRef tile;
    int backdrop;
};

#define Tile_size 8

TileRef Tile_index(TileRef ref, uint index) {
    return TileRef(ref.offset + index * Tile_size);
}

struct TileSeg {
    vec2 origin;
    vec2 vector;
    float y_edge;
    TileSegRef next;
};

#define TileSeg_size 24

TileSegRef TileSeg_index(TileSegRef ref, uint index) {
    return TileSegRef(ref.offset + index * TileSeg_size);
}

Path Path_read(Alloc a, PathRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    Path s;
    s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16);
    s.tiles = TileRef(raw2);
    return s;
}

void Path_write(Alloc a, PathRef ref, Path s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.bbox.x | (s.bbox.y << 16));
    write_mem(a, ix + 1, s.bbox.z | (s.bbox.w << 16));
    write_mem(a, ix + 2, s.tiles.offset);
}

Tile Tile_read(Alloc a, TileRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    Tile s;
    s.tile = TileSegRef(raw0);
    s.backdrop = int(raw1);
    return s;
}

void Tile_write(Alloc a, TileRef ref, Tile s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, s.tile.offset);
    write_mem(a, ix + 1, uint(s.backdrop));
}

TileSeg TileSeg_read(Alloc a, TileSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
    uint raw5 = read_mem(a, ix + 5);
    TileSeg s;
    s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.y_edge = uintBitsToFloat(raw4);
    s.next = TileSegRef(raw5);
    return s;
}

void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) {
    uint ix = ref.offset >> 2;
    write_mem(a, ix + 0, floatBitsToUint(s.origin.x));
    write_mem(a, ix + 1, floatBitsToUint(s.origin.y));
    write_mem(a, ix + 2, floatBitsToUint(s.vector.x));
    write_mem(a, ix + 3, floatBitsToUint(s.vector.y));
    write_mem(a, ix + 4, floatBitsToUint(s.y_edge));
    write_mem(a, ix + 5, s.next.offset);
}


A gpu/shaders/tile_alloc.comp => gpu/shaders/tile_alloc.comp +109 -0
@@ 0,0 1,109 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Allocation and initialization of tiles for paths.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
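
// With LG_WG_FACTOR = 0 (see setup.h) this is a workgroup of
// 1 << 7 = 128 invocations, one element per invocation.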

layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "annotated.h"
#include "tile.h"

// Scale factors for converting pixel coordinates to tile coordinates.
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
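
// For example, with TILE_WIDTH_PX = 32 a bbox spanning x in [10, 70) px
// covers tiles floor(10 * SX) = 0 up to ceil(70 * SX) = 3 (exclusive),
// as computed in main() below.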

shared uint sh_tile_count[TILE_ALLOC_WG];
shared MallocResult sh_tile_alloc;

void main() {
    if (mem_error != NO_ERROR) {
        return;
    }

    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
    PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);

    uint tag = Annotated_Nop;
    if (element_ix < conf.n_elements) {
        tag = Annotated_tag(conf.anno_alloc, ref);
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (tag) {
    case Annotated_Fill:
    case Annotated_FillTexture:
    case Annotated_Stroke:
    case Annotated_BeginClip:
    case Annotated_EndClip:
        // Note: we take advantage of the fact that fills, strokes, and
        // clips have compatible layout.
        AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
        x0 = int(floor(fill.bbox.x * SX));
        y0 = int(floor(fill.bbox.y * SY));
        x1 = int(ceil(fill.bbox.z * SX));
        y1 = int(ceil(fill.bbox.w * SY));
        break;
    }
    x0 = clamp(x0, 0, int(conf.width_in_tiles));
    y0 = clamp(y0, 0, int(conf.height_in_tiles));
    x1 = clamp(x1, 0, int(conf.width_in_tiles));
    y1 = clamp(y1, 0, int(conf.height_in_tiles));

    Path path;
    path.bbox = uvec4(x0, y0, x1, y1);
    uint tile_count = (x1 - x0) * (y1 - y0);
    if (tag == Annotated_EndClip) {
        // We don't actually allocate tiles for an end clip, but we do want
        // the path structure (especially the bbox) allocated for it.
        tile_count = 0;
    }

    sh_tile_count[th_ix] = tile_count;
    uint total_tile_count = tile_count;
    // Prefix sum of sh_tile_count
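    // This is a Hillis-Steele inclusive scan over the workgroup: after
    // LG_TILE_ALLOC_WG doubling steps, sh_tile_count[th_ix] holds the sum
    // of tile counts for invocations 0..th_ix, so the last entry is the
    // workgroup total.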
    for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
        barrier();
        if (th_ix >= (1 << i)) {
            total_tile_count += sh_tile_count[th_ix - (1 << i)];
        }
        barrier();
        sh_tile_count[th_ix] = total_tile_count;
    }
    if (th_ix == TILE_ALLOC_WG - 1) {
        sh_tile_alloc = malloc(total_tile_count * Tile_size);
    }
    barrier();
    MallocResult alloc_start = sh_tile_alloc;
    if (alloc_start.failed) {
        return;
    }

    if (element_ix < conf.n_elements) {
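        // Convert the inclusive scan to an exclusive one: the predecessor's
        // inclusive sum is this invocation's starting tile index.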
        uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
        Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count);
        path.tiles = TileRef(tiles_alloc.offset);
        Path_write(conf.tile_alloc, path_ref, path);
    }

    // Zero out allocated tiles efficiently
    uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
    uint start_ix = alloc_start.alloc.offset >> 2;
    for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
        // Note: this interleaved layout is significantly faster than
        // using Tile_write.
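        // Each invocation writes words strided by TILE_ALLOC_WG, so
        // consecutive invocations touch consecutive words in memory,
        // which allows coalesced access.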
        write_mem(alloc_start.alloc, start_ix + i, 0);
    }
}