taskgraph/graph.go at master · thought-machine/taskgraph · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
package taskgraph

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"regexp"
	"runtime/debug"
	"sort"
	"strings"
	"time"

	set "github.com/deckarep/golang-set/v2"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
	"go.opentelemetry.io/otel/trace/noop"
	"golang.org/x/sync/errgroup"
)

// taskLimit is the maximum number of tasks accepted by the WithTasks option; passing more causes
// ErrTooManyTasks to be returned.
const taskLimit = 1000

// Sentinel errors returned by this package. Apart from ErrTooManyTasks, they are wrapped (using %w)
// with the offending names/keys before being returned.
var (
	// ErrExposedKeyNotProvided is returned from Graph.AsTask() when a key requested to be exposed is
	// not provided by any task in the graph.
	ErrExposedKeyNotProvided = errors.New("key(s) exposed but not provided by graph")

	// ErrDuplicateTaskNames is returned from New() if multiple tasks with the same name are passed to
	// it.
	ErrDuplicateTaskNames = errors.New("duplicate task names")

	// ErrDuplicateProvidedKeys is returned from New() if multiple tasks provide the same key.
	ErrDuplicateProvidedKeys = errors.New("keys provided by multiple tasks")

	// ErrGraphCycle is returned from New() if there is a cycle in the graph tasks (i.e. if a task A
	// depends on a key which is produced by some task B which depends indirectly on a key produced by
	// task A).
	ErrGraphCycle = errors.New("found cycle in graph")

	// ErrTooManyTasks is returned from New() (via the WithTasks option) if too many tasks are passed
	// to it. This is a sanity check to avoid taking too long to check for cycles. The limit could be
	// increased if the cycle checking is optimised.
	ErrTooManyTasks = wrapStackErrorf("too many tasks in graph (limit %d)", taskLimit)

	// ErrMissingInputs is returned from Graph.Check() and Graph.Run() if the provided inputs do not
	// satisfy all of the graph's dependencies (i.e. all task dependencies that are not provided by
	// some other task in the graph).
	ErrMissingInputs = errors.New("missing inputs")
)

// A Graph represents a declarative workflow of tasks.
type Graph interface {
	// Check whether the given input bindings are sufficient to run the graph.
	//
	// This is intended to be run by a genrule at build time to assert that all keys required by tasks
	// in the graph are provided either as an input or by some other task in the graph. It also checks
	// that there are no duplicate inputs.
	Check(inputs ...Binding) error

	// Run executes the task graph with the given inputs, returning a Binder containing the bound
	// values from all tasks (but not any of the input bindings).
	//
	// It is advisable to set a timeout on the passed context, although it is up to the individual
	// tasks to listen for context cancellation.
	Run(ctx context.Context, inputs ...Binding) (Binder, error)

	// AsTask produces a Task which runs this Graph in full to allow composition of graphs. The task
	// depends on all keys which are required by any task within it and not provided by any task
	// within it. The task provides only the key IDs passed to this method; and only their bindings
	// will be available in the result of any graph the task is included in (any bindings produced by
	// tasks within this graph whose IDs were not passed to this method will be suppressed).
	//
	// Bindings for the exposed keys are added to the binder of the parent task as soon as they are
	// generated by tasks within this graph, which means that tasks outside this graph which depend on
	// the exposed keys can start running as soon as the producing task completes, rather than waiting
	// for this entire task to complete.
	AsTask(exposeKeys ...ID) (Task, error)

	// Graphviz produces a graphviz representation of the graph, with the tasks as nodes and the
	// dependencies as edges. This output can be passed into tools like
	// https://dreampuf.github.io/GraphvizOnline or https://dot-to-ascii.ggerganov.com/ to view the
	// structure of the graph.
	//
	// The includeInputs parameter controls whether graph inputs are included in the output; including
	// them tends to make the graph significantly more complicated and harder for the graphviz engine
	// to lay out in a useful way.
	Graphviz(includeInputs bool) string
}

// runState is the state of a single graph execution: the Binder which tasks read their
// dependencies from and store their bindings to, plus the channels used to wake waiting tasks.
type runState struct {
	Binder
	// signals holds one channel per graph node, keyed by the node's id. A node receives on its
	// channel while waiting for dependencies, and closes it when it starts executing.
	signals map[string]chan struct{}
}

// signal wakes the node registered under childID by sending on its signal channel, so that the
// node can re-check whether its dependencies are available. It blocks until the send is received
// or ctx is done (returning ctx.Err() in the latter case), and returns an error if no channel is
// registered for childID.
func (rs *runState) signal(ctx context.Context, childID string) (err error) {
	// A node closes its own channel when it starts executing, which may happen before we get to
	// signal it if it started slowly and found all of its dependencies already available. The
	// resulting "send on closed channel" panic is harmless and is swallowed; any other panic is
	// turned into an error.
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}
		if strings.Contains(fmt.Sprintf("%v", recovered), "send on closed channel") {
			return
		}
		err = wrapStackErrorf("recovered from panic in signal(): %v\n%s", recovered, debug.Stack())
	}()

	ch, found := rs.signals[childID]
	if !found {
		return wrapStackErrorf("signal channel missing for id %q", childID)
	}

	select {
	case <-ctx.Done():
		return ctx.Err()
	case ch <- struct{}{}:
		return nil
	}
}

// graphNode wraps a single Task together with its position in the graph.
type graphNode struct {
	// id is a sanitized version of task.Name() which is safe to use in graphviz.
	id              string
	task            Task
	// dependents lists (without repeats) the nodes whose tasks depend on at least one key provided
	// by this node's task; they are signalled once this task has executed.
	dependents      []*graphNode
	// dependentsByKey lists, for each key provided by this node's task, the nodes whose tasks depend
	// on that key.
	dependentsByKey map[ID][]*graphNode
	// tracer starts the span which wraps each execution of the task.
	tracer          trace.Tracer
	// logger receives debug messages about the task's progress.
	logger          Logger
}

const (
	// traceTaskgraphAbsentKeysPrefix is the prefix of the span attribute recorded for each binding
	// a task returns with Absent status; the binding's ID is appended to form the attribute key.
	traceTaskgraphAbsentKeysPrefix = "taskgraph.absent_keys."
)

// Execute the task against the binder provided in the runState.
//
// This assumes that all of the task's dependencies have been bound; it is the responsibility of the
// task to declare its dependencies, and of the framework to check those dependencies have been
// bound before calling this function.
//
// As a sanity check, this function asserts that the task returns bindings for exactly the key IDs
// which it has declared to provide. Any missing or extra bindings will cause an error to be
// returned.
//
// The execution is wrapped in a tracing span named after the task. Failures to execute the task,
// to store its bindings, or to match its Provides declaration are recorded on that span, and each
// binding returned with Absent status is recorded as a span attribute.
//
// Once the task has been executed successfully, its dependents are signalled so that they can check
// if they are ready to run.
func (gn *graphNode) execute(ctx context.Context, rs *runState) (err error) {
	// We close the channel used for signalling this task to prevent other tasks deadlocking trying to
	// signal it. Starting a task implies that all of its dependencies are ready, and so it *should*
	// never be signalled again, but if a task starts slowly, it may see that all of its dependencies
	// are available and start executing without receiving from the signal channel.
	close(rs.signals[gn.id])

	tCtx, span := gn.tracer.Start(ctx, gn.task.Name())
	defer span.End()

	gn.logger.Debugf("Starting task %s", gn.task.Name())
	defer gn.logger.Debugf("Finished task %s", gn.task.Name())

	bindings, err := gn.task.Execute(tCtx, rs)
	if err != nil {
		span.RecordError(err)
		return wrapStackErrorf("task %s: %w", gn.task.Name(), err)
	}
	if err := rs.Store(bindings...); err != nil {
		span.RecordError(err)
		return wrapStackErrorf("task %s: %w", gn.task.Name(), err)
	}

	provides := gn.task.Provides()

	// Keys the task declared but which are still not bound after storing its bindings.
	var missing []string
	for _, p := range provides {
		if !rs.Has(p) {
			missing = append(missing, p.String())
		}
	}

	// Bindings the task returned for keys it did not declare, and the errors carried by any
	// bindings returned as Absent.
	var extra, bindingErrs []string
	providesSet := set.NewSet[ID](provides...)
	for _, binding := range bindings {
		if !providesSet.Contains(binding.ID()) {
			extra = append(extra, binding.ID().String())
		}

		if binding.Status() == Absent {
			err := binding.Error()
			if err != nil {
				bindingErrs = append(bindingErrs, fmt.Sprintf("[%s: %s]", binding.ID().String(), err))
			}

			span.SetAttributes(
				attribute.String(
					traceTaskgraphAbsentKeysPrefix+binding.ID().String(),
					fmt.Sprintf("%v", err),
				),
			)
		}
	}

	if len(extra) > 0 || len(missing) > 0 {
		mismatchErr := wrapStackErrorf(
			"task %s: mismatch between task Provides declaration and returned bindings: missing bindings [%s], got extra bindings [%s]",
			gn.task.Name(),
			strings.Join(missing, ", "),
			strings.Join(extra, ", "),
		)
		span.RecordError(mismatchErr)
		return mismatchErr
	}

	// Absent bindings are not a task failure, so their errors are only logged.
	if len(bindingErrs) > 0 {
		gn.logger.Debugf(
			"task %s has binding errors: %s",
			gn.task.Name(),
			strings.Join(bindingErrs, ", "),
		)
	}

	for _, dependent := range gn.dependents {
		gn.logger.Debugf("task %s signalling dependent %s\n", gn.task.Name(), dependent.task.Name())
		if err := rs.signal(tCtx, dependent.id); err != nil {
			return err
		}
	}
	return nil
}

// canExecute reports whether every key the task depends on is already present in b.
func (gn *graphNode) canExecute(b Binder) bool {
	required := gn.task.Depends()
	return b.Has(required...)
}

// Returns a function which can be passed to errgroup.Group.Go. That function checks if the task can
// be executed immediately; if not, it waits to be signalled, checking whether its dependencies are
// available each time it receives a signal until the task can be executed.
//
// If ctx is done while the task is still waiting, the function returns ctx.Err() (as signal() does)
// rather than nil, so that a run which was cancelled before every task executed is never reported
// as a success. When the cancellation was caused by another task in the same errgroup failing,
// that task's error has already been recorded by the group and is the one returned from Wait().
func (gn *graphNode) runFunc(ctx context.Context, rs *runState) func() error {
	return func() error {
		if gn.canExecute(rs) {
			gn.logger.Debugf("task %s starting immediately\n", gn.task.Name())
			return gn.execute(ctx, rs)
		}
		gn.logger.Debugf("task %s has dependencies missing; cannot start immediately\n",
			gn.task.Name())
		signal, ok := rs.signals[gn.id]
		if !ok {
			return wrapStackErrorf("signal channel missing for id %q", gn.id)
		}
		for {
			select {
			case <-signal:
				if gn.canExecute(rs) {
					gn.logger.Debugf("task %s starting\n", gn.task.Name())
					return gn.execute(ctx, rs)
				}
				gn.logger.Debugf("task %s still has dependencies missing\n", gn.task.Name())
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}
}

// graph is the implementation of Graph built by New.
type graph struct {
	// name is used as the span name for Run, the metric label for execution latency, and the task
	// name when the graph is wrapped by AsTask.
	name                         string
	tasks                        []Task
	// allDependencies is the union of the keys depended on by the tasks, and allProvided the union
	// of the keys they provide; their difference is the set of inputs the graph requires.
	allDependencies, allProvided set.Set[ID]
	// nodes holds one graphNode per task.
	nodes                        []*graphNode
	tracer                       trace.Tracer
	logger                       Logger
}

// buildInputBinder stores the given inputs in a new Binder and checks that they cover every key
// which some task depends on but no task in the graph provides.
//
// It returns an error if storing the inputs fails (reported as a duplicate input), or an error
// wrapping ErrMissingInputs which lists the missing keys in sorted order so that the message is
// the same on every call.
func (g *graph) buildInputBinder(inputs ...Binding) (Binder, error) {
	b := NewBinder()

	if err := b.Store(inputs...); err != nil {
		return nil, wrapStackErrorf("duplicate input: %w", err)
	}

	var missingInputs []string
	for requiredInput := range g.allDependencies.Difference(g.allProvided).Iter() {
		if !b.Has(requiredInput) {
			missingInputs = append(missingInputs, requiredInput.String())
		}
	}
	if len(missingInputs) > 0 {
		// Set iteration order is not stable; sort for a deterministic error message.
		sort.Strings(missingInputs)
		return nil, wrapStackErrorf("%w: %s", ErrMissingInputs, strings.Join(missingInputs, ", "))
	}

	return b, nil
}

// Check is Graph.Check. It builds (and discards) the input binder, reporting any error found while
// doing so.
func (g *graph) Check(inputs ...Binding) error {
	if _, err := g.buildInputBinder(inputs...); err != nil {
		return err
	}
	return nil
}

// Run is Graph.Run.
//
// The inputs are validated and stored in a base binder, and the tasks run against an overlay of
// that binder so that only the bindings produced by tasks end up in the returned Binder. The run is
// wrapped in a tracing span named after the graph, and its duration (in whole milliseconds) is
// observed on the executionLatency metric, labelled with the graph name and "success" or "error".
func (g *graph) Run(ctx context.Context, inputs ...Binding) (b Binder, err error) {
	began := time.Now()
	defer func() {
		// err is the named result, so this sees whatever error is being returned.
		outcome := "error"
		if err == nil {
			outcome = "success"
		}
		observer := executionLatency.WithLabelValues(g.name, outcome)
		observer.Observe(float64(time.Since(began) / time.Millisecond))
	}()

	inputBinder, err := g.buildInputBinder(inputs...)
	if err != nil {
		return nil, err
	}

	produced := NewBinder()
	layered := &overlayBinder{base: inputBinder, overlay: produced}

	tCtx, span := g.tracer.Start(ctx, g.name)
	defer span.End()

	runErr := g.runWithBinder(tCtx, layered)
	if runErr != nil {
		span.RecordError(runErr)
		return nil, runErr
	}
	return produced, nil
}

// Sets up the per-run state of the graph, and runs all of the tasks in their own goroutines until
// all have terminated. If any task returns an error, the entire graph run is cancelled.
//
// If ctx is done before all tasks have terminated, ctx.Err() is returned without waiting for the
// remaining tasks.
func (g *graph) runWithBinder(ctx context.Context, binder Binder) error {
	rs := &runState{
		Binder:  binder,
		signals: map[string]chan struct{}{},
	}
	for _, gn := range g.nodes {
		rs.signals[gn.id] = make(chan struct{})
	}

	// errgroup always cancels the derived context before returning from Wait(), so the select below
	// must listen to the parent context's Done() channel.
	eg, egCtx := errgroup.WithContext(ctx)

	for _, gn := range g.nodes {
		eg.Go(gn.runFunc(egCtx, rs))
	}

	// The channel is buffered so that the goroutine below can always deliver the result of Wait()
	// and exit. With an unbuffered channel it would block forever (leaking the goroutine) whenever
	// the select returns through the ctx.Done() case, as nothing would be left to receive from it.
	errCh := make(chan error, 1)

	go func() {
		errCh <- eg.Wait()
	}()

	select {
	case err := <-errCh:
		return err
	case <-ctx.Done():
		return ctx.Err()
	}
}

// AsTask is Graph.AsTask.
//
// The returned task is named after the graph, depends on every key required but not provided by
// the graph's tasks, and provides exposeKeys. An error wrapping ErrExposedKeyNotProvided is
// returned if any of exposeKeys is not provided by a task in the graph; the offending keys are
// listed in sorted order so that the message is the same on every call.
func (g *graph) AsTask(exposeKeys ...ID) (Task, error) {
	depends := g.allDependencies.Difference(g.allProvided).ToSlice()
	exposeSet := set.NewSet[ID](exposeKeys...)
	if difference := exposeSet.Difference(g.allProvided); difference.Cardinality() > 0 {
		missing := make([]string, 0, difference.Cardinality())
		for id := range difference.Iter() {
			missing = append(missing, id.String())
		}
		// Set iteration order is not stable; sort for a deterministic error message.
		sort.Strings(missing)
		return nil, wrapStackErrorf("%w: %s", ErrExposedKeyNotProvided, strings.Join(missing, ", "))
	}

	return NewTask(g.name, func(ctx context.Context, external Binder) ([]Binding, error) {
		gtb := &graphTaskBinder{
			internal:   NewBinder(),
			external:   external,
			exposeKeys: exposeSet,
		}

		if err := g.runWithBinder(ctx, gtb); err != nil {
			return nil, err
		}

		// Defensive sanity check that all the keys which should be exposed were bound and stored in the
		// external Binder. Missing bindings should never happen, and if it does it implies a fault in
		// the AsTask logic.
		var missing []string
		for _, id := range exposeKeys {
			binding := external.Get(id)
			if binding.Status() == Pending {
				missing = append(missing, id.String())
			}
		}
		if len(missing) > 0 {
			return nil, wrapStackErrorf(
				"exposed key(s) not bound after graph execution: %s",
				strings.Join(missing, ", "),
			)
		}

		// The exposed keys are added to the external binder via the graphTaskBinder, so we don't return
		// any bindings here (as to do so would cause a duplicate binding error).
		return nil, nil
	}, depends, exposeKeys), nil
}

// Graphviz is Graph.Graphviz.
//
// Every task becomes a node labelled with its name, and every key passed between two tasks becomes
// an edge labelled with that key. Provided keys which no task depends on are drawn as diamond
// "Output" nodes and, when includeInputs is set, dependencies which no task provides are drawn as
// diamond "Input" nodes. Node and edge lines are sorted so that the output is stable.
func (g *graph) Graphviz(includeInputs bool) string {
	var (
		nodeLines []string
		edgeLines []string
	)

	for _, gn := range g.nodes {
		nodeLines = append(nodeLines, fmt.Sprintf("  %s [label=\"%s\"];", gn.id, gn.task.Name()))

		if includeInputs {
			for _, dep := range gn.task.Depends() {
				if g.allProvided.Contains(dep) {
					// Supplied by another task, so drawn as a task-to-task edge below.
					continue
				}
				inputID := fmt.Sprintf("%s_input_%s", gn.id, dep.id)
				inputNode := fmt.Sprintf("  %s [label=\"Input - %s\", shape=diamond];", inputID, dep)
				nodeLines = append(nodeLines, inputNode)
				edgeLines = append(edgeLines, fmt.Sprintf("  %s -> %s;", inputID, gn.id))
			}
		}

		for key, dependents := range gn.dependentsByKey {
			for _, dependent := range dependents {
				edge := fmt.Sprintf("  %s -> %s [label=\"%s\"];", gn.id, dependent.id, key)
				edgeLines = append(edgeLines, edge)
			}
		}

		for _, provided := range gn.task.Provides() {
			if g.allDependencies.Contains(provided) {
				continue
			}
			outputID := fmt.Sprintf("%s_output_%s", gn.id, provided)
			outputNode := fmt.Sprintf("  %s [label=\"Output\", shape=diamond];", outputID)
			nodeLines = append(nodeLines, outputNode)
			outputEdge := fmt.Sprintf("  %s -> %s [label=\"%s\"];", gn.id, outputID, provided)
			edgeLines = append(edgeLines, outputEdge)
		}
	}

	sort.Strings(nodeLines)
	sort.Strings(edgeLines)

	var out bytes.Buffer
	fmt.Fprintln(&out, "digraph G {")
	fmt.Fprintln(&out, strings.Join(nodeLines, "\n"))
	fmt.Fprintln(&out)
	fmt.Fprintln(&out, strings.Join(edgeLines, "\n"))
	fmt.Fprintln(&out, "}")
	return out.String()
}

// Logger is the logging interface used by the graph. Only debug-level, printf-style messages are
// written to it.
type Logger interface {
	Debugf(format string, args ...interface{})
}

// graphOptions collects the settings applied by the GraphOption functions passed to New.
type graphOptions struct {
	// tasks is set by WithTasks.
	tasks  []Task
	// tracer is set by WithTracer; New starts with a no-op tracer.
	tracer trace.Tracer
	// logger is set by WithLogger; New substitutes a default if it is left nil.
	logger Logger
}

// A GraphOption is used to configure a new Graph. If an option returns an error, New stops and
// returns that error.
type GraphOption func(opts *graphOptions) error

// WithTasks sets the tasks which form the graph, flattening the given task sets into a single
// list. The option fails with ErrTooManyTasks if that list holds more than taskLimit tasks.
func WithTasks(tasks ...TaskSet) GraphOption {
	return func(o *graphOptions) error {
		flattened := taskset(tasks).Tasks()
		o.tasks = flattened
		if len(flattened) > taskLimit {
			return ErrTooManyTasks
		}
		return nil
	}
}

// WithTracer sets the tracer used to start a span for each graph run and for each task executed
// within it.
func WithTracer(tracer trace.Tracer) GraphOption {
	return func(o *graphOptions) error {
		o.tracer = tracer
		return nil
	}
}

// WithLogger sets the logger which receives the graph's debug messages.
func WithLogger(logger Logger) GraphOption {
	return func(o *graphOptions) error {
		o.logger = logger
		return nil
	}
}

// New creates a new Graph. Exactly one WithTasks option should be passed.
//
// An error is returned if:
//   - any option returns an error (e.g. ErrTooManyTasks from WithTasks);
//   - any task has an empty name or location;
//   - multiple tasks share a name, or have different names which sanitize to the same node id
//     (ErrDuplicateTaskNames);
//   - a key is provided by multiple tasks (ErrDuplicateProvidedKeys);
//   - the dependencies between the tasks form a cycle (ErrGraphCycle).
//
// The names and keys listed in these errors are sorted so that the messages are deterministic.
//
// Ideally, Graphs should be created on program startup, rather than creating them dynamically.
func New(name string, opts ...GraphOption) (Graph, error) {
	o := &graphOptions{
		tracer: noop.NewTracerProvider().Tracer("github.com/thought-machine/taskgraph"),
	}

	for _, opt := range opts {
		if err := opt(o); err != nil {
			return nil, err
		}
	}

	if o.logger == nil {
		o.logger = log
	}

	g := &graph{
		name:            name,
		tasks:           o.tasks,
		allDependencies: set.NewSet[ID](),
		allProvided:     set.NewSet[ID](),
		tracer:          o.tracer,
		logger:          o.logger,
	}

	// provideTasks maps each provided key to the tasks providing it, taskLocations maps each task
	// name to the locations of the tasks using it, and nodesByDep maps each key to the nodes whose
	// tasks depend on it.
	provideTasks := map[string][]string{}
	taskLocations := map[string][]string{}
	nodesByDep := map[ID][]*graphNode{}

	var badTaskErrs error
	for _, t := range g.tasks {
		if t.Name() == "" || t.Location() == "" {
			badTaskErrs = errors.Join(
				badTaskErrs,
				fmt.Errorf("tasks must have a name and location: (%s, %s)", t.Name(), t.Location()),
			)
		}
		node := &graphNode{
			id:              sanitizeTaskName(t.Name()),
			task:            t,
			dependentsByKey: map[ID][]*graphNode{},
			tracer:          g.tracer,
			logger:          g.logger,
		}
		g.nodes = append(g.nodes, node)

		taskLocations[t.Name()] = append(taskLocations[t.Name()], t.Location())

		g.allDependencies.Append(t.Depends()...)
		for _, dep := range t.Depends() {
			nodesByDep[dep] = append(nodesByDep[dep], node)
		}

		g.allProvided.Append(t.Provides()...)
		for _, id := range t.Provides() {
			provideTasks[id.String()] = append(
				provideTasks[id.String()],
				fmt.Sprintf("%s - %s", t.Name(), t.Location()),
			)
		}
	}
	if badTaskErrs != nil {
		return nil, badTaskErrs
	}

	var duplicateTaskNames []string
	for name, locations := range taskLocations {
		if len(locations) > 1 {
			duplicateTaskNames = append(
				duplicateTaskNames,
				fmt.Sprintf("%s (%s)", name, strings.Join(locations, ", ")),
			)
		}
	}
	if len(duplicateTaskNames) > 0 {
		// Map iteration order is random; sort for a deterministic error message.
		sort.Strings(duplicateTaskNames)
		return nil, wrapStackErrorf(
			"%w: %s",
			ErrDuplicateTaskNames,
			strings.Join(duplicateTaskNames, ", "),
		)
	}

	// Task names are unique at this point, but the per-run signal channels are keyed by the
	// sanitized node id. Two names which sanitize to the same id (e.g. "a.b" and "a_b") would share
	// one channel, which each of the two nodes closes when it starts executing, so the second close
	// would panic at run time. Reject such graphs here instead.
	namesByID := make(map[string][]string, len(g.nodes))
	for _, node := range g.nodes {
		namesByID[node.id] = append(namesByID[node.id], node.task.Name())
	}
	var collidingIDs []string
	for id, names := range namesByID {
		if len(names) > 1 {
			collidingIDs = append(
				collidingIDs,
				fmt.Sprintf("%s (%s)", id, strings.Join(names, ", ")),
			)
		}
	}
	if len(collidingIDs) > 0 {
		sort.Strings(collidingIDs)
		return nil, wrapStackErrorf(
			"%w: task names are not distinct after sanitization: %s",
			ErrDuplicateTaskNames,
			strings.Join(collidingIDs, ", "),
		)
	}

	var duplicateProvides []string
	for id, tasks := range provideTasks {
		if len(tasks) > 1 {
			duplicateProvides = append(
				duplicateProvides,
				fmt.Sprintf("%s (%s)", id, strings.Join(tasks, ", ")),
			)
		}
	}
	if len(duplicateProvides) > 0 {
		sort.Strings(duplicateProvides)
		return nil, wrapStackErrorf(
			"%w: %s",
			ErrDuplicateProvidedKeys,
			strings.Join(duplicateProvides, ", "),
		)
	}

	// Link each node to the nodes depending on the keys it provides. A dependent appears once in
	// dependents however many of the node's keys it uses, but once per key in dependentsByKey.
	for _, node := range g.nodes {
		seen := map[string]bool{}
		for _, p := range node.task.Provides() {
			for _, dependent := range nodesByDep[p] {
				if !seen[dependent.task.Name()] {
					seen[dependent.task.Name()] = true
					node.dependents = append(node.dependents, dependent)
				}
				node.dependentsByKey[p] = append(node.dependentsByKey[p], dependent)
			}
		}
	}

	// For each node in the graph, we do a depth first search and check if we reach a node we have
	// seen before. This is an O(n^3) algorithm which may require optimisation if we see large graphs,
	// but a basic benchmark suggests that a 1000 node graph can be checked in ~485ms in the worst
	// case of a perfectly linear graph, which is an acceptable cost for something which should only
	// be done once at program startup.
	//
	// A seemingly obvious optimisation would be to only check for cycles starting from "source" nodes
	// which do not depend on any key provided by a task in the graph, but that would miss simple
	// loops. A more promising optimisation would be to cache which nodes are reachable from each
	// node, to avoid repeatedly traversing parts of the graph.
	for _, node := range g.nodes {
		if err := checkCycle(node, nil); err != nil {
			return nil, err
		}
	}

	return g, nil
}

// sanitizeRegex matches each run of one or more characters other than ASCII letters and digits.
var sanitizeRegex = regexp.MustCompile("[^a-zA-Z0-9]+")

// sanitizeTaskName returns name with every run of characters other than ASCII letters and digits
// collapsed into a single underscore.
func sanitizeTaskName(name string) string {
	const placeholder = "_"
	return sanitizeRegex.ReplaceAllLiteralString(name, placeholder)
}

// checkCycle performs a depth-first search over the dependents of node, and returns an error
// wrapping ErrGraphCycle if the search reaches a task whose name is already on the current path.
// path holds the names of the tasks visited on the way to node (nil when starting a search); the
// error message lists the tasks which form the cycle.
//
// Nodes whose dependents have been fully explored without finding a cycle are remembered for the
// duration of the call and are not explored again. Without this, every distinct path through the
// graph is walked separately, which takes exponential time when many paths converge (e.g. several
// layers of tasks, each depending on every task in the previous layer).
func checkCycle(node *graphNode, path []string) error {
	return checkCycleMemo(node, path, map[*graphNode]bool{})
}

// checkCycleMemo is the recursive implementation of checkCycle. acyclic records the nodes which are
// already known not to lead to a cycle.
func checkCycleMemo(node *graphNode, path []string, acyclic map[*graphNode]bool) error {
	if acyclic[node] {
		return nil
	}
	name := node.task.Name()
	for i := len(path) - 1; i >= 0; i-- {
		if path[i] == name {
			// Build the cycle in a fresh slice so that the caller's backing array is not written to.
			cycle := append(append([]string(nil), path[i:]...), name)
			return wrapStackErrorf("%w: %s", ErrGraphCycle, strings.Join(cycle, " -> "))
		}
	}
	path = append(path, name)
	for _, dependent := range node.dependents {
		if err := checkCycleMemo(dependent, path, acyclic); err != nil {
			return err
		}
	}
	// Everything reachable from node has been explored without revisiting the path.
	acyclic[node] = true
	return nil
}