~eliasnaur/gio: gpu: [compute] add compute renderer specific decoding of ops

3 files changed, 424 insertions(+), 200 deletions(-)

M gpu/caches.go
M gpu/compute.go
M gpu/gpu.go

M gpu/caches.go => gpu/caches.go +0 -2

@@ 26,8 26,6 @@ type opCache struct {
 
 type opCacheValue struct {
 	data pathData
-	// computePath is the encoded path for compute.
-	computePath encoder
 
 	bounds f32.Rectangle
 	// the fields below are handled by opCache

M gpu/compute.go => gpu/compute.go +407 -137

@@ 8,6 8,7 @@ import (
 	"fmt"
 	"image"
 	"image/color"
+	"math"
 	"math/bits"
 	"time"
 	"unsafe"


@@ 16,19 17,21 @@ import (
 	"gioui.org/gpu/internal/driver"
 	"gioui.org/internal/byteslice"
 	"gioui.org/internal/f32color"
+	"gioui.org/internal/opconst"
 	"gioui.org/internal/ops"
 	"gioui.org/internal/scene"
 	"gioui.org/layout"
 	"gioui.org/op"
+	"gioui.org/op/clip"
 )
 
 type compute struct {
 	ctx driver.Device
-	enc encoder
 
-	drawOps       drawOps
+	collector     collector
+	enc           encoder
 	texOps        []textureOp
-	cache         *resourceCache
+	viewport      image.Point
 	maxTextureDim int
 
 	programs struct {


@@ 105,6 108,59 @@ type materialUniforms struct {
 	pos   [2]float32
 }
 
+type collector struct {
+	profile      bool
+	reader       ops.Reader
+	states       []encoderState
+	clear        bool
+	clearColor   f32color.RGBA
+	clipCache    []clipState
+	clipCmdCache []clipCmd
+	paintOps     []paintOp
+}
+
+type paintOp struct {
+	clipStack []clipCmd
+	state     encoderState
+}
+
+// clipCmd describes a clipping command ready to be used for the compute
+// pipeline.
+type clipCmd struct {
+	// union of the bounds of the operations that are clipped.
+	union    f32.Rectangle
+	state    *clipState
+	relTrans f32.Affine2D
+}
+
+type encoderState struct {
+	t         f32.Affine2D
+	relTrans  f32.Affine2D
+	clip      *clipState
+	intersect f32.Rectangle
+
+	matType materialType
+	// Current paint.ImageOp
+	image imageOpData
+	// Current paint.ColorOp, if any.
+	color color.NRGBA
+
+	// Current paint.LinearGradientOp.
+	stop1  f32.Point
+	stop2  f32.Point
+	color1 color.NRGBA
+	color2 color.NRGBA
+}
+
+type clipState struct {
+	bounds    f32.Rectangle
+	absBounds f32.Rectangle
+	pathVerts []byte
+	parent    *clipState
+	relTrans  f32.Affine2D
+	stroke    clip.StrokeStyle
+}
+
 // materialVertex describes a vertex of a quad used to render a transformed
 // material.
 type materialVertex struct {


@@ 118,13 174,15 @@ type textureKey struct {
 	transform f32.Affine2D
 }
 
-// textureOp represents an imageOp that requires texture space.
+// textureOp represents a paintOp that requires texture space.
 type textureOp struct {
 	// sceneIdx is the index in the scene that contains the fill image command
 	// that corresponds to the operation.
 	sceneIdx int
-	key      textureKey
 	img      imageOpData
+	key      textureKey
+	// off is the integer offset, separated from key.transform to increase cache hit rate.
+	off image.Point
 
 	// pos is the position of the untransformed image in the images texture.
 	pos image.Point


@@ 173,6 231,9 @@ type memoryHeader struct {
 	mem_error  uint32
 }
 
+// rectangle is an oriented rectangle.
+type rectangle [4]f32.Point
+
 // GPU structure sizes and constants.
 const (
 	tileWidthPx       = 32


@@ 205,7 266,6 @@ func newCompute(ctx driver.Device) (*compute, error) {
 	}
 	g := &compute{
 		ctx:           ctx,
-		cache:         newResourceCache(),
 		maxTextureDim: maxDim,
 		conf:          new(config),
 		memHeader:     new(memoryHeader),


@@ 243,9 303,6 @@ func newCompute(ctx driver.Device) (*compute, error) {
 	g.materials.uniBuf = buf
 	g.materials.prog.SetVertexUniforms(buf)
 
-	g.drawOps.pathCache = newOpCache()
-	g.drawOps.compute = true
-
 	buf, err = ctx.NewBuffer(driver.BufferBindingShaderStorage, int(unsafe.Sizeof(config{})))
 	if err != nil {
 		g.Release()


@@ 277,30 334,33 @@ func newCompute(ctx driver.Device) (*compute, error) {
 }
 
 func (g *compute) Collect(viewport image.Point, ops *op.Ops) {
-	g.drawOps.reset(g.cache, viewport)
-	g.drawOps.collect(g.ctx, g.cache, ops, viewport)
-	for _, img := range g.drawOps.allImageOps {
-		expandPathOp(img.path, img.clip)
-	}
-	g.encode(viewport)
+	g.viewport = viewport
+	g.collector.reset()
+	g.enc.reset()
+	g.texOps = g.texOps[:0]
+
+	// Flip Y-axis.
+	flipY := f32.Affine2D{}.Scale(f32.Pt(0, 0), f32.Pt(1, -1)).Offset(f32.Pt(0, float32(viewport.Y)))
+	g.collector.collect(ops, flipY, viewport)
+	g.collector.encode(viewport, &g.enc, &g.texOps)
 }
 
 func (g *compute) Clear(col color.NRGBA) {
-	g.drawOps.clear = true
-	g.drawOps.clearColor = f32color.LinearFromSRGB(col)
+	g.collector.clear = true
+	g.collector.clearColor = f32color.LinearFromSRGB(col)
 }
 
 func (g *compute) Frame() error {
-	viewport := g.drawOps.viewport
+	viewport := g.viewport
 	tileDims := image.Point{
 		X: (viewport.X + tileWidthPx - 1) / tileWidthPx,
 		Y: (viewport.Y + tileHeightPx - 1) / tileHeightPx,
 	}
 
-	defFBO := g.ctx.BeginFrame(g.drawOps.clear, viewport)
+	defFBO := g.ctx.BeginFrame(g.collector.clear, viewport)
 	defer g.ctx.EndFrame()
 
-	if g.drawOps.profile && g.timers.t == nil && g.ctx.Caps().Features.Has(driver.FeatureTimers) {
+	if g.collector.profile && g.timers.t == nil && g.ctx.Caps().Features.Has(driver.FeatureTimers) {
 		t := &g.timers
 		t.t = newTimers(g.ctx)
 		t.materials = g.timers.t.newTimer()


@@ 327,10 387,8 @@ func (g *compute) Frame() error {
 	}
 	g.ctx.BindFramebuffer(defFBO)
 	g.blitOutput(viewport)
-	g.cache.frame()
-	g.drawOps.pathCache.frame()
 	t := &g.timers
-	if g.drawOps.profile && t.t.ready() {
+	if g.collector.profile && t.t.ready() {
 		mat := t.materials.Elapsed
 		et, tat, pct, bbt := t.elements.Elapsed, t.tileAlloc.Elapsed, t.pathCoarse.Elapsed, t.backdropBinning.Elapsed
 		ct, k4t := t.coarse.Elapsed, t.kernel4.Elapsed


@@ 344,7 402,7 @@ func (g *compute) Frame() error {
 		blit = blit.Round(q)
 		t.profile = fmt.Sprintf("ft:%7s mat: %7s et:%7s tat:%7s pct:%7s bbt:%7s ct:%7s k4t:%7s blit:%7s", ft, mat, et, tat, pct, bbt, ct, k4t, blit)
 	}
-	g.drawOps.clear = false
+	g.collector.clear = false
 	return nil
 }
 


@@ 359,7 417,7 @@ func (g *compute) Profile() string {
 func (g *compute) blitOutput(viewport image.Point) {
 	t := g.timers.blit
 	t.begin()
-	if !g.drawOps.clear {
+	if !g.collector.clear {
 		g.ctx.BlendFunc(driver.BlendFactorOne, driver.BlendFactorOneMinusSrcAlpha)
 		g.ctx.SetBlend(true)
 		defer g.ctx.SetBlend(false)


@@ 371,20 429,6 @@ func (g *compute) blitOutput(viewport image.Point) {
 	t.end()
 }
 
-func (g *compute) encode(viewport image.Point) {
-	g.texOps = g.texOps[:0]
-	g.enc.reset()
-
-	// Flip Y-axis.
-	flipY := f32.Affine2D{}.Scale(f32.Pt(0, 0), f32.Pt(1, -1)).Offset(f32.Pt(0, float32(viewport.Y)))
-	g.enc.transform(flipY)
-	if g.drawOps.clear {
-		g.enc.rect(f32.Rectangle{Max: layout.FPt(viewport)})
-		g.enc.fillColor(f32color.NRGBAToRGBA(g.drawOps.clearColor.SRGB()))
-	}
-	g.encodeOps(flipY, viewport, g.drawOps.allImageOps)
-}
-
 func (g *compute) renderMaterials() error {
 	m := &g.materials
 	m.quads = m.quads[:0]


@@ 394,7 438,7 @@ restart:
 	for {
 		for _, op := range g.texOps {
 			if off, exists := m.offsets[op.key]; exists {
-				g.enc.setFillImageOffset(op.sceneIdx, off)
+				g.enc.setFillImageOffset(op.sceneIdx, off.Sub(op.off))
 				continue
 			}
 			quad, bounds := g.materialQuad(op.key.transform, op.img, op.pos)


@@ 434,7 478,7 @@ restart:
 				m.offsets = make(map[textureKey]image.Point)
 			}
 			m.offsets[op.key] = offset
-			g.enc.setFillImageOffset(op.sceneIdx, offset)
+			g.enc.setFillImageOffset(op.sceneIdx, offset.Sub(op.off))
 		}
 		break
 	}


@@ 636,86 680,13 @@ func min(p1, p2 f32.Point) f32.Point {
 	return p
 }
 
-func (g *compute) encodeOps(trans f32.Affine2D, viewport image.Point, ops []imageOp) {
-	for _, op := range ops {
-		bounds := layout.FRect(op.clip)
-		// clip is the union of all drawing affected by the clipping
-		// operation. TODO: tighten.
-		clip := f32.Rect(0, 0, float32(viewport.X), float32(viewport.Y))
-		nclips := g.encodeClipStack(clip, bounds, op.path, false)
-		m := op.material
-		switch m.material {
-		case materialTexture:
-			t := trans.Mul(m.trans)
-			g.texOps = append(g.texOps, textureOp{
-				sceneIdx: len(g.enc.scene),
-				img:      m.data,
-				key: textureKey{
-					transform: t,
-					handle:    m.data.handle,
-				},
-			})
-			// Add fill command, its offset is resolved and filled in renderMaterials.
-			g.enc.fillImage(0)
-		case materialColor:
-			g.enc.fillColor(f32color.NRGBAToRGBA(op.material.color.SRGB()))
-		case materialLinearGradient:
-			// TODO: implement.
-			g.enc.fillColor(f32color.NRGBAToRGBA(op.material.color1.SRGB()))
-		default:
-			panic("not implemented")
-		}
-		if op.path != nil && op.path.path {
-			g.enc.fillMode(scene.FillModeNonzero)
-			g.enc.transform(op.path.trans.Invert())
-		}
-		// Pop the clip stack.
-		for i := 0; i < nclips; i++ {
-			g.enc.endClip(clip)
-		}
-	}
-}
-
-// encodeClips encodes a stack of clip paths and return the stack depth.
-func (g *compute) encodeClipStack(clip, bounds f32.Rectangle, p *pathOp, begin bool) int {
-	nclips := 0
-	if p != nil && p.parent != nil {
-		nclips += g.encodeClipStack(clip, bounds, p.parent, true)
-		nclips += 1
-	}
-	isStroke := p.stroke.Width > 0
-	if p != nil && p.path {
-		if isStroke {
-			g.enc.fillMode(scene.FillModeStroke)
-			g.enc.lineWidth(p.stroke.Width)
-		}
-		pathData, _ := g.drawOps.pathCache.get(p.pathKey)
-		g.enc.transform(p.trans)
-		g.enc.append(pathData.computePath)
-	} else {
-		g.enc.rect(bounds)
-	}
-	if begin {
-		g.enc.beginClip(clip)
-		if isStroke {
-			g.enc.fillMode(scene.FillModeNonzero)
-		}
-		if p != nil && p.path {
-			g.enc.transform(p.trans.Invert())
-		}
-	}
-	return nclips
-}
-
-func encodePath(verts []byte) encoder {
-	var enc encoder
+func (enc *encoder) encodePath(verts []byte) {
 	for len(verts) >= scene.CommandSize+4 {
 		cmd := ops.DecodeCommand(verts[4:])
 		enc.scene = append(enc.scene, cmd)
 		enc.npathseg++
 		verts = verts[scene.CommandSize+4:]
 	}
-	return enc
 }
 
 func (g *compute) render(tileDims image.Point) error {


@@ 731,12 702,13 @@ func (g *compute) render(tileDims image.Point) error {
 		return fmt.Errorf("gpu: output too large (%dx%d)", tileDims.X*tileWidthPx, tileDims.Y*tileHeightPx)
 	}
 
+	enc := &g.enc
 	// Pad scene with zeroes to avoid reading garbage in elements.comp.
-	scenePadding := partitionSize - len(g.enc.scene)%partitionSize
-	g.enc.scene = append(g.enc.scene, make([]scene.Command, scenePadding)...)
+	scenePadding := partitionSize - len(enc.scene)%partitionSize
+	enc.scene = append(enc.scene, make([]scene.Command, scenePadding)...)
 
 	realloced := false
-	scene := byteslice.Slice(g.enc.scene)
+	scene := byteslice.Slice(enc.scene)
 	if s := len(scene); s > g.buffers.scene.size {
 		realloced = true
 		paddedCap := s * 11 / 10


@@ 770,19 742,19 @@ func (g *compute) render(tileDims image.Point) error {
 	}
 
 	*g.conf = config{
-		n_elements:      uint32(g.enc.npath),
-		n_pathseg:       uint32(g.enc.npathseg),
+		n_elements:      uint32(enc.npath),
+		n_pathseg:       uint32(enc.npathseg),
 		width_in_tiles:  uint32(tileDims.X),
 		height_in_tiles: uint32(tileDims.Y),
-		tile_alloc:      malloc(g.enc.npath * pathSize),
-		bin_alloc:       malloc(round(g.enc.npath, wgSize) * binSize),
+		tile_alloc:      malloc(enc.npath * pathSize),
+		bin_alloc:       malloc(round(enc.npath, wgSize) * binSize),
 		ptcl_alloc:      malloc(tileDims.X * tileDims.Y * ptclInitialAlloc),
-		pathseg_alloc:   malloc(g.enc.npathseg * pathsegSize),
-		anno_alloc:      malloc(g.enc.npath * annoSize),
-		trans_alloc:     malloc(g.enc.ntrans * transSize),
+		pathseg_alloc:   malloc(enc.npathseg * pathsegSize),
+		anno_alloc:      malloc(enc.npath * annoSize),
+		trans_alloc:     malloc(enc.ntrans * transSize),
 	}
 
-	numPartitions := (g.enc.numElements() + 127) / 128
+	numPartitions := (enc.numElements() + 127) / 128
 	// clearSize is the atomic partition counter plus flag and 2 states per partition.
 	clearSize := 4 + numPartitions*stateStride
 	if clearSize > g.buffers.state.size {


@@ 825,20 797,20 @@ func (g *compute) render(tileDims image.Point) error {
 		t.elements.end()
 		t.tileAlloc.begin()
 		g.ctx.BindProgram(g.programs.tileAlloc)
-		g.ctx.DispatchCompute((g.enc.npath+wgSize-1)/wgSize, 1, 1)
+		g.ctx.DispatchCompute((enc.npath+wgSize-1)/wgSize, 1, 1)
 		g.ctx.MemoryBarrier()
 		t.tileAlloc.end()
 		t.pathCoarse.begin()
 		g.ctx.BindProgram(g.programs.pathCoarse)
-		g.ctx.DispatchCompute((g.enc.npathseg+31)/32, 1, 1)
+		g.ctx.DispatchCompute((enc.npathseg+31)/32, 1, 1)
 		g.ctx.MemoryBarrier()
 		t.pathCoarse.end()
 		t.backdropBinning.begin()
 		g.ctx.BindProgram(g.programs.backdrop)
-		g.ctx.DispatchCompute((g.enc.npath+wgSize-1)/wgSize, 1, 1)
+		g.ctx.DispatchCompute((enc.npath+wgSize-1)/wgSize, 1, 1)
 		// No barrier needed between backdrop and binning.
 		g.ctx.BindProgram(g.programs.binning)
-		g.ctx.DispatchCompute((g.enc.npath+wgSize-1)/wgSize, 1, 1)
+		g.ctx.DispatchCompute((enc.npath+wgSize-1)/wgSize, 1, 1)
 		g.ctx.MemoryBarrier()
 		t.backdropBinning.end()
 		t.coarse.begin()


@@ 901,12 873,6 @@ func (g *compute) resizeOutput(size image.Point) error {
 }
 
 func (g *compute) Release() {
-	if g.drawOps.pathCache != nil {
-		g.drawOps.pathCache.release()
-	}
-	if g.cache != nil {
-		g.cache.release()
-	}
 	progs := []driver.Program{
 		g.programs.elements,
 		g.programs.tileAlloc,


@@ 1062,9 1028,11 @@ func (e *encoder) setFillImageOffset(index int, offset image.Point) {
 	e.scene[index][2] = uint32(uint16(x)) | uint32(uint16(y))<<16
 }
 
-func (e *encoder) fillImage(index int) {
+func (e *encoder) fillImage(index int) int {
+	idx := len(e.scene)
 	e.scene = append(e.scene, scene.FillImage(index))
 	e.npath++
+	return idx
 }
 
 func (e *encoder) line(start, end f32.Point) {


@@ 1076,3 1044,305 @@ func (e *encoder) quad(start, ctrl, end f32.Point) {
 	e.scene = append(e.scene, scene.Quad(start, ctrl, end))
 	e.npathseg++
 }
+
+func (c *collector) reset() {
+	c.profile = false
+	c.clipCache = c.clipCache[:0]
+	c.clipCmdCache = c.clipCmdCache[:0]
+	c.paintOps = c.paintOps[:0]
+}
+
+func (c *collector) addClip(state *encoderState, viewport, bounds f32.Rectangle, path []byte, stroke clip.StrokeStyle) {
+	// Rectangle clip regions.
+	if len(path) == 0 {
+		transView := transformBounds(state.t.Invert(), viewport)
+		// If the rectangular clip contains the viewport it can be discarded.
+		if transView.In(bounds) {
+			return
+		}
+		// If the rectangular clip region contains a previous path it can be discarded.
+		p := state.clip
+		t := state.relTrans.Invert()
+		for p != nil {
+			// rect is the parent bounds transformed relative to the rectangle.
+			rect := transformBounds(t, p.bounds)
+			if rect.In(bounds) {
+				return
+			}
+			t = p.relTrans.Invert().Mul(t)
+			p = p.parent
+		}
+	}
+
+	absBounds := transformBounds(state.t, bounds).Bounds()
+	c.clipCache = append(c.clipCache, clipState{
+		parent:    state.clip,
+		bounds:    bounds,
+		absBounds: absBounds,
+		relTrans:  state.relTrans,
+		stroke:    stroke,
+		pathVerts: path,
+	})
+	state.intersect = state.intersect.Intersect(absBounds)
+	state.clip = &c.clipCache[len(c.clipCache)-1]
+	state.relTrans = f32.Affine2D{}
+}
+
+func (c *collector) collect(root *op.Ops, trans f32.Affine2D, viewport image.Point) {
+	fview := f32.Rectangle{Max: layout.FPt(viewport)}
+	c.reader.Reset(root)
+	state := encoderState{
+		color:     color.NRGBA{A: 0xff},
+		intersect: fview,
+		t:         trans,
+		relTrans:  trans,
+	}
+	r := &c.reader
+	var (
+		pathData []byte
+		str      clip.StrokeStyle
+	)
+	c.save(opconst.InitialStateID, state)
+	for encOp, ok := r.Decode(); ok; encOp, ok = r.Decode() {
+		switch opconst.OpType(encOp.Data[0]) {
+		case opconst.TypeProfile:
+			c.profile = true
+		case opconst.TypeTransform:
+			dop := ops.DecodeTransform(encOp.Data)
+			state.t = state.t.Mul(dop)
+			state.relTrans = state.relTrans.Mul(dop)
+		case opconst.TypeStroke:
+			str = decodeStrokeOp(encOp.Data)
+		case opconst.TypePath:
+			encOp, ok = r.Decode()
+			if !ok {
+				panic("unexpected end of path operation")
+			}
+			pathData = encOp.Data[opconst.TypeAuxLen:]
+
+		case opconst.TypeClip:
+			var op clipOp
+			op.decode(encOp.Data)
+			c.addClip(&state, fview, op.bounds, pathData, str)
+			pathData = nil
+			str = clip.StrokeStyle{}
+		case opconst.TypeColor:
+			state.matType = materialColor
+			state.color = decodeColorOp(encOp.Data)
+		case opconst.TypeLinearGradient:
+			state.matType = materialLinearGradient
+			op := decodeLinearGradientOp(encOp.Data)
+			state.stop1 = op.stop1
+			state.stop2 = op.stop2
+			state.color1 = op.color1
+			state.color2 = op.color2
+		case opconst.TypeImage:
+			state.matType = materialTexture
+			state.image = decodeImageOp(encOp.Data, encOp.Refs)
+		case opconst.TypePaint:
+			paintState := state
+			if paintState.matType == materialTexture {
+				// Clip to the bounds of the image, to hide other images in the atlas.
+				bounds := paintState.image.src.Bounds()
+				c.addClip(&paintState, fview, layout.FRect(bounds), nil, clip.StrokeStyle{})
+			}
+			if paintState.intersect.Empty() {
+				break
+			}
+
+			// If the paint is a uniform opaque color that takes up the whole
+			// screen, it covers all previous paints and we can discard all
+			// rendering commands recorded so far.
+			if paintState.clip == nil && paintState.matType == materialColor && paintState.color.A == 255 {
+				c.clearColor = f32color.LinearFromSRGB(paintState.color).Opaque()
+				c.clear = true
+				c.paintOps = c.paintOps[:0]
+				break
+			}
+
+			// Flatten clip stack.
+			p := paintState.clip
+			startIdx := len(c.clipCmdCache)
+			for p != nil {
+				c.clipCmdCache = append(c.clipCmdCache, clipCmd{state: p, relTrans: p.relTrans})
+				p = p.parent
+			}
+			clipStack := c.clipCmdCache[startIdx:]
+			c.paintOps = append(c.paintOps, paintOp{
+				clipStack: clipStack,
+				state:     paintState,
+			})
+		case opconst.TypeSave:
+			id := ops.DecodeSave(encOp.Data)
+			c.save(id, state)
+		case opconst.TypeLoad:
+			id, mask := ops.DecodeLoad(encOp.Data)
+			s := c.states[id]
+			if mask&opconst.TransformState != 0 {
+				state.t = s.t
+			}
+			if mask&^opconst.TransformState != 0 {
+				state = s
+			}
+		}
+	}
+	for i := range c.paintOps {
+		op := &c.paintOps[i]
+		// For each clip, cull rectangular clip regions that contain its
+		// (transformed) bounds. addClip already handled the converse case.
+		// TODO: do better than O(n²) to efficiently deal with deep stacks.
+		for i := 0; i < len(op.clipStack)-1; i++ {
+			cl := op.clipStack[i]
+			p := cl.state
+			r := transformBounds(cl.relTrans, p.bounds)
+			for j := i + 1; j < len(op.clipStack); j++ {
+				cl2 := op.clipStack[j]
+				p2 := cl2.state
+				if len(p2.pathVerts) == 0 && r.In(p2.bounds) {
+					op.clipStack = append(op.clipStack[:j], op.clipStack[j+1:]...)
+					j--
+					op.clipStack[j].relTrans = cl2.relTrans.Mul(op.clipStack[j].relTrans)
+				}
+				r = transformRect(cl2.relTrans, r)
+			}
+		}
+	}
+}
+
+func (c *collector) encode(viewport image.Point, enc *encoder, texOps *[]textureOp) {
+	fview := f32.Rectangle{Max: layout.FPt(viewport)}
+	fillMode := scene.FillModeNonzero
+	if c.clear {
+		enc.rect(fview)
+		enc.fillColor(f32color.NRGBAToRGBA(c.clearColor.SRGB()))
+	}
+	for _, op := range c.paintOps {
+		// Fill in clip bounds, which the shaders expect to be the union
+		// of all affected bounds.
+		var union f32.Rectangle
+		for i, cl := range op.clipStack {
+			union = union.Union(cl.state.absBounds)
+			op.clipStack[i].union = union
+		}
+
+		var inv f32.Affine2D
+		for i := len(op.clipStack) - 1; i >= 0; i-- {
+			cl := op.clipStack[i]
+			if str := cl.state.stroke; str.Width > 0 {
+				enc.fillMode(scene.FillModeStroke)
+				enc.lineWidth(str.Width)
+				fillMode = scene.FillModeStroke
+			} else if fillMode != scene.FillModeNonzero {
+				enc.fillMode(scene.FillModeNonzero)
+				fillMode = scene.FillModeNonzero
+			}
+			enc.transform(cl.relTrans)
+			inv = inv.Mul(cl.relTrans)
+			if len(cl.state.pathVerts) == 0 {
+				enc.rect(cl.state.bounds)
+			} else {
+				enc.encodePath(cl.state.pathVerts)
+			}
+			if i != 0 {
+				enc.beginClip(cl.union)
+			}
+		}
+		if op.state.clip == nil {
+			// No clipping; fill the entire view.
+			enc.rect(fview)
+		}
+
+		switch op.state.matType {
+		case materialTexture:
+			// Add fill command. Its offset is resolved and filled in renderMaterials.
+			idx := enc.fillImage(0)
+			sx, hx, ox, hy, sy, oy := op.state.t.Elems()
+			// Separate integer offset from transformation. TextureOps that have identical transforms
+			// except for their integer offsets can share a transformed image.
+			intx, fracx := math.Modf(float64(ox))
+			inty, fracy := math.Modf(float64(oy))
+			t := f32.NewAffine2D(sx, hx, float32(fracx), hy, sy, float32(fracy))
+			*texOps = append(*texOps, textureOp{
+				sceneIdx: idx,
+				img:      op.state.image,
+				off:      image.Pt(int(intx), int(inty)),
+				key: textureKey{
+					transform: t,
+					handle:    op.state.image.handle,
+				},
+			})
+		case materialColor:
+			enc.fillColor(f32color.NRGBAToRGBA(op.state.color))
+		case materialLinearGradient:
+			// TODO: implement.
+			enc.fillColor(f32color.NRGBAToRGBA(op.state.color1))
+		default:
+			panic("not implemented")
+		}
+		enc.transform(inv.Invert())
+		// Pop the clip stack, except the first entry used for fill.
+		for i := 1; i < len(op.clipStack); i++ {
+			cl := op.clipStack[i]
+			enc.endClip(cl.union)
+		}
+	}
+}
+
+func (c *collector) save(id int, state encoderState) {
+	if extra := id - len(c.states) + 1; extra > 0 {
+		c.states = append(c.states, make([]encoderState, extra)...)
+	}
+	c.states[id] = state
+}
+
+func transformBounds(t f32.Affine2D, bounds f32.Rectangle) rectangle {
+	return rectangle{
+		t.Transform(bounds.Min), t.Transform(f32.Pt(bounds.Max.X, bounds.Min.Y)),
+		t.Transform(bounds.Max), t.Transform(f32.Pt(bounds.Min.X, bounds.Max.Y)),
+	}
+}
+
+func transformRect(t f32.Affine2D, r rectangle) rectangle {
+	var tr rectangle
+	for i, c := range r {
+		tr[i] = t.Transform(c)
+	}
+	return tr
+}
+
+func (r rectangle) In(b f32.Rectangle) bool {
+	for _, c := range r {
+		inside := b.Min.X <= c.X && c.X <= b.Max.X &&
+			b.Min.Y <= c.Y && c.Y <= b.Max.Y
+		if !inside {
+			return false
+		}
+	}
+	return true
+}
+
+func (r rectangle) Contains(b f32.Rectangle) bool {
+	return true
+}
+
+func (r rectangle) Bounds() f32.Rectangle {
+	bounds := f32.Rectangle{
+		Min: f32.Pt(math.MaxFloat32, math.MaxFloat32),
+		Max: f32.Pt(-math.MaxFloat32, -math.MaxFloat32),
+	}
+	for _, c := range r {
+		if c.X < bounds.Min.X {
+			bounds.Min.X = c.X
+		}
+		if c.Y < bounds.Min.Y {
+			bounds.Min.Y = c.Y
+		}
+		if c.X > bounds.Max.X {
+			bounds.Max.X = c.X
+		}
+		if c.Y > bounds.Max.Y {
+			bounds.Max.Y = c.Y
+		}
+	}
+	return bounds
+}

M gpu/gpu.go => gpu/gpu.go +17 -61

@@ 80,10 80,7 @@ type drawOps struct {
 	viewport   image.Point
 	clear      bool
 	clearColor f32color.RGBA
-	// allImageOps is the combined list of imageOps and
-	// zimageOps, in drawing order.
-	allImageOps []imageOp
-	imageOps    []imageOp
+	imageOps   []imageOp
 	// zimageOps are the rectangle clipped opaque images
 	// that can use fast front-to-back rendering with z-test
 	// and no blending.


@@ 92,9 89,6 @@ type drawOps struct {
 	pathOpCache []pathOp
 	qs          quadSplitter
 	pathCache   *opCache
-	// hack for the compute renderer to access
-	// converted path data.
-	compute bool
 }
 
 type drawState struct {


@@ 127,10 121,6 @@ type pathOp struct {
 	pathVerts []byte
 	parent    *pathOp
 	place     placement
-
-	// For compute
-	trans  f32.Affine2D
-	stroke clip.StrokeStyle
 }
 
 type imageOp struct {


@@ 174,9 164,6 @@ type material struct {
 	// For materialTypeTexture.
 	data    imageOpData
 	uvTrans f32.Affine2D
-
-	// For the compute backend.
-	trans f32.Affine2D
 }
 
 // clipOp is the shadow of clip.Op.


@@ 794,7 781,6 @@ func (d *drawOps) reset(cache *resourceCache, viewport image.Point) {
 	d.cache = cache
 	d.viewport = viewport
 	d.imageOps = d.imageOps[:0]
-	d.allImageOps = d.allImageOps[:0]
 	d.zimageOps = d.zimageOps[:0]
 	d.pathOps = d.pathOps[:0]
 	d.pathOpCache = d.pathOpCache[:0]


@@ 815,14 801,9 @@ func (d *drawOps) collect(ctx driver.Device, cache *resourceCache, root *op.Ops,
 	for _, p := range d.pathOps {
 		if v, exists := d.pathCache.get(p.pathKey); !exists || v.data.data == nil {
 			data := buildPath(ctx, p.pathVerts)
-			var computePath encoder
-			if d.compute {
-				computePath = encodePath(p.pathVerts)
-			}
 			d.pathCache.put(p.pathKey, opCacheValue{
-				data:        data,
-				bounds:      p.bounds,
-				computePath: computePath,
+				data:   data,
+				bounds: p.bounds,
 			})
 		}
 		p.pathVerts = nil


@@ 834,14 815,12 @@ func (d *drawOps) newPathOp() *pathOp {
 	return &d.pathOpCache[len(d.pathOpCache)-1]
 }
 
-func (d *drawOps) addClipPath(state *drawState, aux []byte, auxKey opKey, bounds f32.Rectangle, off f32.Point, tr f32.Affine2D, stroke clip.StrokeStyle) {
+func (d *drawOps) addClipPath(state *drawState, aux []byte, auxKey opKey, bounds f32.Rectangle, off f32.Point) {
 	npath := d.newPathOp()
 	*npath = pathOp{
 		parent: state.cpath,
 		bounds: bounds,
 		off:    off,
-		trans:  tr,
-		stroke: stroke,
 	}
 	state.cpath = npath
 	if len(aux) > 0 {


@@ 853,7 832,7 @@ func (d *drawOps) addClipPath(state *drawState, aux []byte, auxKey opKey, bounds
 	}
 }
 
-// split a transform into two parts, one which is pur offset and the
+// split a transform into two parts, one which is pure offset and the
 // other representing the scaling, shearing and rotation part
 func splitTransform(t f32.Affine2D) (srs f32.Affine2D, offset f32.Point) {
 	sx, hx, ox, hy, sy, oy := t.Elems()


@@ 924,9 903,7 @@ loop:
 						quads.aux, trans, op.outline, str,
 					)
 					op.bounds = bounds
-					if !d.compute {
-						quads.aux = pathData
-					}
+					quads.aux = pathData
 					// add it to the cache, without GPU data, so the transform can be
 					// reused.
 					d.pathCache.put(quads.key, opCacheValue{bounds: op.bounds})


@@ 937,7 914,7 @@ loop:
 				quads.key.SetTransform(trans) // TODO: This call has no effect.
 			}
 			state.clip = state.clip.Intersect(op.bounds.Add(off))
-			d.addClipPath(&state, quads.aux, quads.key, op.bounds, off, state.t, str)
+			d.addClipPath(&state, quads.aux, quads.key, op.bounds, off)
 			quads = quadsOp{}
 			str = clip.StrokeStyle{}
 


@@ 978,16 955,15 @@ loop:
 				// this transformed rectangle.
 				k := opKey{Key: encOp.Key}
 				k.SetTransform(trans) // TODO: This call has no effect.
-				d.addClipPath(&state, clipData, k, bnd, off, state.t, clip.StrokeStyle{})
+				d.addClipPath(&state, clipData, k, bnd, off)
 			}
 
 			bounds := boundRectF(cl)
-			mat := state.materialFor(bnd, off, partialTrans, bounds, state.t)
+			mat := state.materialFor(bnd, off, partialTrans, bounds)
 
 			if bounds.Min == (image.Point{}) && bounds.Max == d.viewport && state.rect && mat.opaque && (mat.material == materialColor) {
 				// The image is a uniform opaque color and takes up the whole screen.
 				// Scrap images up to and including this image and set clear color.
-				d.allImageOps = d.allImageOps[:0]
 				d.zimageOps = d.zimageOps[:0]
 				d.imageOps = d.imageOps[:0]
 				z = 0


@@ 1011,7 987,6 @@ loop:
 				material: mat,
 			}
 
-			d.allImageOps = append(d.allImageOps, img)
 			if state.rect && img.material.opaque {
 				d.zimageOps = append(d.zimageOps, img)
 			} else {


@@ 1049,7 1024,7 @@ func expandPathOp(p *pathOp, clip image.Rectangle) {
 	}
 }
 
-func (d *drawState) materialFor(rect f32.Rectangle, off f32.Point, partTrans f32.Affine2D, clip image.Rectangle, trans f32.Affine2D) material {
+func (d *drawState) materialFor(rect f32.Rectangle, off f32.Point, partTrans f32.Affine2D, clip image.Rectangle) material {
 	var m material
 	switch d.matType {
 	case materialColor:


@@ 1084,7 1059,6 @@ func (d *drawState) materialFor(rect f32.Rectangle, off f32.Point, partTrans f32
 		sr.Max.Y -= float32(dr.Max.Y-clip.Max.Y) * sdy / dy
 		uvScale, uvOffset := texSpaceTransform(sr, sz)
 		m.uvTrans = partTrans.Mul(f32.Affine2D{}.Scale(f32.Point{}, uvScale).Offset(uvOffset))
-		m.trans = trans
 		m.data = d.image
 	}
 	return m


@@ 1429,31 1403,13 @@ func (d *drawOps) boundsForTransformedRect(r f32.Rectangle, tr f32.Affine2D) (au
 
 	// build the GPU vertices
 	l := len(d.vertCache)
-	if !d.compute {
-		d.vertCache = append(d.vertCache, make([]byte, vertStride*4*4)...)
-		aux = d.vertCache[l:]
-		encodeQuadTo(aux, 0, corners[0], corners[0].Add(corners[1]).Mul(0.5), corners[1])
-		encodeQuadTo(aux[vertStride*4:], 0, corners[1], corners[1].Add(corners[2]).Mul(0.5), corners[2])
-		encodeQuadTo(aux[vertStride*4*2:], 0, corners[2], corners[2].Add(corners[3]).Mul(0.5), corners[3])
-		encodeQuadTo(aux[vertStride*4*3:], 0, corners[3], corners[3].Add(corners[0]).Mul(0.5), corners[0])
-		fillMaxY(aux)
-	} else {
-		d.vertCache = append(d.vertCache, make([]byte, (scene.CommandSize+4)*4)...)
-		aux = d.vertCache[l:]
-		buf := aux
-		bo := binary.LittleEndian
-		bo.PutUint32(buf, 0) // Contour
-		ops.EncodeCommand(buf[4:], scene.Line(r.Min, f32.Pt(r.Max.X, r.Min.Y)))
-		buf = buf[4+scene.CommandSize:]
-		bo.PutUint32(buf, 0)
-		ops.EncodeCommand(buf[4:], scene.Line(f32.Pt(r.Max.X, r.Min.Y), r.Max))
-		buf = buf[4+scene.CommandSize:]
-		bo.PutUint32(buf, 0)
-		ops.EncodeCommand(buf[4:], scene.Line(r.Max, f32.Pt(r.Min.X, r.Max.Y)))
-		buf = buf[4+scene.CommandSize:]
-		bo.PutUint32(buf, 0)
-		ops.EncodeCommand(buf[4:], scene.Line(f32.Pt(r.Min.X, r.Max.Y), r.Min))
-	}
+	d.vertCache = append(d.vertCache, make([]byte, vertStride*4*4)...)
+	aux = d.vertCache[l:]
+	encodeQuadTo(aux, 0, corners[0], corners[0].Add(corners[1]).Mul(0.5), corners[1])
+	encodeQuadTo(aux[vertStride*4:], 0, corners[1], corners[1].Add(corners[2]).Mul(0.5), corners[2])
+	encodeQuadTo(aux[vertStride*4*2:], 0, corners[2], corners[2].Add(corners[3]).Mul(0.5), corners[3])
+	encodeQuadTo(aux[vertStride*4*3:], 0, corners[3], corners[3].Add(corners[0]).Mul(0.5), corners[0])
+	fillMaxY(aux)
 
 	// establish the transform mapping from bounds rectangle to transformed corners
 	var P1, P2, P3 f32.Point