A classic problem in complex manufacturing is forecasting how long it will take to build something. A Boeing 747 has an estimated 6 million individual parts. Every build, inspection, or purchase forms a complex web of dependencies, making it unclear where the bottlenecks are, and making it difficult to forecast production schedules. Software tools should make it easy to not only predict completion dates, but also to experiment with different strategies to speed up production.
When it comes to software tools, iteration time is key for a pleasant and productive workflow, so that users can focus on hard manufacturing problems, and not be waiting on slow software.
For companies with less mature production cycles, even just surfacing the dependencies in an understandable way, and showing how they change with modifications to a manufacturing plan, can be extremely valuable.
A key part of any scheduling algorithm is sorting the production graph in dependency order. This is called a topological sort.
In order to provide the best user experience possible, I benchmarked three approaches:
1. SQL (Recursive CTE)
Just like in real-world engineering, predicting the behavior of complex systems like relational databases is hard, so coming up with quick experiments to test your hypotheses is crucial to making good decisions.
┌─────────────────┬───────────┐
│ Nodes + Edges │ Time │
├─────────────────┼───────────┤
│ 10 + 20 │ 1ms │
│ 100 + 2,500 │ 70ms │
│ 200 + 10,000 │ 900ms │
│ 500 + 62,000 │ 32,000ms │
└─────────────────┴───────────┘
2. PL/pgSQL (stored procedure)
┌─────────────────┬───────────┐
│ Nodes + Edges │ Time │
├─────────────────┼───────────┤
│ 10 + 20 │ 5ms │
│ 100 + 2,500 │ 40ms │
│ 200 + 10,000 │ 240ms │
│ 500 + 62,000 │ 4,000ms │
└─────────────────┴───────────┘
3. Golang (in-memory sort)
┌─────────────────┬─────────────────────────┐
│ Nodes + Edges │ Time │
├─────────────────┼─────────────────────────┤
│ 10 + 20 │ 1ms (1ms network) │
│ 100 + 2,500 │ 1ms (1ms network) │
│ 200 + 10,000 │ 3ms (2ms network) │
│ 500 + 62,000 │ 26ms (22ms network) │
└─────────────────┴─────────────────────────┘
The recursive CTE ended up being unusably slow for large graphs, taking over 30 seconds for the largest one. Meanwhile plpgsql is faster than I gave it credit for. On small to medium graphs it could be sufficient, but it still doesn't perform well with large graphs. The Golang sort was incredibly fast: even on large graphs the sort part only took 4ms, compared to the recursive CTE which took 32 seconds (roughly 8,000x slower). In hindsight, I overestimated the network overhead relative to the slowness of plpgsql and SQL. The ideal approach would hold the graph in memory, so that incremental changes to the graph don't trigger a network roundtrip each time. With a few more optimizations to my quick and dirty Golang implementation I could probably get the graph updating with every keystroke.
-- Topological sort of the `edges` table using Kahn's algorithm.
-- DESTRUCTIVE: rows are deleted from `edges` as nodes are emitted; on
-- success the table is left empty.  Raises an exception if edges remain
-- after the loop (i.e. the graph contains a cycle).
-- Returns node ids one per row, in dependency order.
create or replace function topological_sort()
returns table (sorted_node int) as $$
declare
n int; -- node currently being emitted
m int; -- NOTE(review): declared but never used
edge_count int; -- edges remaining after the loop; > 0 means a cycle
s int[] := ( -- set of nodes with no incoming edges
select array_agg(distinct e1.val)
from edges e1
left join edges e2 on e1.val=e2.next
where e2.val is null);
l int[] := '{}'; -- list of sorted elements
temp int[] := '{}'; -- nodes whose in-degree just dropped to zero
counter int := 0; -- NOTE(review): incremented but never read
begin
-- array_length() returns NULL for an empty array, so this loops until s is empty.
while array_length(s, 1) is not null loop
n := s[1];
s := array_remove(s, n);
l := l || n;
counter := counter + 1;
-- Delete n's outgoing edges, capturing their targets so we can check
-- which of them have no incoming edges left.
drop table if exists deleted_edges;
create temp table deleted_edges as
with d_cte as (
delete from edges
where val=n
returning next
)
select next from d_cte;
-- A target with no remaining incoming edge becomes a new source.
select array_agg(distinct de.next)
into temp
from deleted_edges de
left join edges e2 on e2.next=de.next
where e2.val is null;
s := s || temp; -- concatenating a NULL array is a no-op, so "no new sources" is safe
end loop;
-- Kahn's termination check: any leftover edge implies a cycle.
select count(*) into edge_count from edges;
if edge_count > 0 then
raise exception 'graph has at least one cycle';
end if;
return query select unnest(l);
end;
$$ language plpgsql;
-- Benchmark harness: for each graph size, generate a random DAG in `edges`,
-- then time (1) the recursive-CTE sort and (2) the plpgsql topological_sort().
-- Results are written to `results_cte` / `results_proc`; timings go to the log.
do $$
declare
i int := 1;
j int := 2;
branching_chance int=50; -- percent chance of creating each candidate edge i->j
depths int[] := array[10, 100, 200, 500];
depth int;
start_time timestamp;
end_time timestamp;
begin
-- loop over test sizes
foreach depth in array depths loop
-- Set up test data
drop table if exists edges;
create table edges (
val int not null,
next int not null
);
truncate table edges; -- NOTE(review): redundant — the table was just created empty
-- Edges only ever point from a lower id to a higher id, so the generated
-- graph is acyclic by construction.
for i in 1..depth loop
for j in i+1..depth loop
insert into edges
(val, next)
select i, j
where floor(random() * 100) < branching_chance;
end loop;
end loop;
create index val_idx on edges (val) include (next);
create index next_idx on edges (next) include (val);
raise log 'NUM EDGES: %', (select count(*) from edges);
-- Approach 1: recursive CTE.  Computes each node's maximum depth from a
-- root; ordering by that depth yields a valid topological order.
-- NOTE(review): only `val` values are emitted, so sink nodes (no outgoing
-- edges) never appear in results_cte — unlike topological_sort().
start_time := clock_timestamp();
drop table if exists results_cte;
create temp table results_cte as
select * from (
with recursive graph as (
select e.val, e.next, 0 depth
from edges e
left join edges e2 on e2.next=e.val
where e2.next is null
union
select e.val, e.next, g.depth + 1 depth
from graph g
join edges e on g.next=e.val
)
select g.val, max(g.depth) max_depth
from graph g
group by g.val
order by max_depth
) a;
end_time := clock_timestamp();
raise log 'CTE: %ms',
round(extract(epoch from (end_time - start_time)) * 1000, 2);
-- Approach 2: plpgsql function.  Destructive: it empties `edges`, which is
-- fine here because the next loop iteration rebuilds the table.
start_time := clock_timestamp();
drop table if exists results_proc;
create table results_proc as
select sorted_node from topological_sort();
end_time := clock_timestamp();
raise log 'PLPGSQL: %ms',
round(extract(epoch from (end_time - start_time)) * 1000, 2);
end loop;
end;$$;
package main
import (
"container/list"
"context"
"fmt"
"log"
"os"
"time"
"github.com/jackc/pgx/v5"
)
// Must terminates the program via log.Fatal when err is non-nil.
// Intended for top-level setup steps where no recovery is possible.
func Must(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
// Edge is one row of the `edges` table: a directed dependency edge
// Val -> Next (topo_sort places Val before Next in its output).
type Edge struct {
	Val  int // source node id
	Next int // target node id; sorted after Val
}
// main benchmarks the in-memory Go approach: for each graph size it
// regenerates a random DAG in Postgres, pulls every edge over the wire,
// and reports the query time vs. the in-memory sort time separately.
func main() {
	ctx := context.Background()
	conn, err := pgx.Connect(ctx, os.Getenv("DATABASE_URL"))
	Must(err)
	defer conn.Close(ctx)
	depths := []int{10, 100, 200, 500, 2000}
	for _, depth := range depths {
		// Set up test data.  Edges only point from lower to higher ids, so
		// the graph is acyclic by construction.  depth is a trusted int from
		// the slice above, so Sprintf interpolation is safe here.
		_, err := conn.Exec(ctx, fmt.Sprintf(`
do $$ begin
drop table if exists edges;
create table edges (
val int not null,
next int not null
);
for i in 1..%v loop
for j in i+1..%v loop
insert into edges
(val, next)
select i, j
where floor(random() * 100) < 50;
end loop;
end loop;
create index val_idx on edges (val) include (next);
create index next_idx on edges (next) include (val);
end; $$;
`, depth, depth))
		Must(err)
		// QueryRow defers its error to Scan, so the Scan error is the one
		// that must be checked (the original re-checked a stale err and
		// ignored Scan's return value).
		var numEdges int
		Must(conn.QueryRow(ctx, "select count(*) from edges").Scan(&numEdges))
		fmt.Printf("NUM EDGES: %v\n", numEdges)
		start := time.Now()
		rows, err := conn.Query(ctx, "select val, next from edges;")
		Must(err)
		edges, err := pgx.CollectRows(rows, pgx.RowToStructByName[Edge])
		Must(err)
		queryTime := time.Since(start)
		sortStart := time.Now()
		topo_sort(edges)
		sortTime := time.Since(sortStart)
		// TODO: write them back to DB
		totalTime := time.Since(start)
		fmt.Printf("Query: %v Sort: %v Total: %v\n", queryTime, sortTime, totalTime)
	}
}
// topo_sort orders the nodes of the edge list so that every edge's Val
// appears before its Next, using Kahn's algorithm: repeatedly emit a node
// with in-degree zero and decrement the in-degree of its children.
//
// The relative order of independent nodes is unspecified (the initial
// frontier is filled by map iteration, which is randomized in Go).  Nodes
// appearing in no edge are unknown to this function, and if the graph has a
// cycle the nodes on it are silently omitted — callers needing cycle
// detection should compare len(result) against their node count.
//
// Fix vs. original: the `parents` map was built on every call (O(E) extra
// time and memory) but never read; it has been removed.
func topo_sort(edges []Edge) []int {
	children := make(map[int][]int) // node -> nodes that depend on it
	inDegree := make(map[int]int)   // node -> count of incoming edges
	for _, e := range edges {
		children[e.Val] = append(children[e.Val], e.Next)
		// Ensure source-only nodes are present with a zero count so they
		// can seed the frontier below.
		if _, ok := inDegree[e.Val]; !ok {
			inDegree[e.Val] = 0
		}
		inDegree[e.Next]++
	}
	// Frontier of ready nodes (current in-degree zero).
	frontier := list.New()
	for node, deg := range inDegree {
		if deg == 0 {
			frontier.PushFront(node)
		}
	}
	result := make([]int, 0, len(inDegree)) // pre-sized: one slot per known node
	for frontier.Front() != nil {
		el := frontier.Front()
		frontier.Remove(el)
		n := el.Value.(int)
		result = append(result, n)
		for _, child := range children[n] {
			inDegree[child]--
			if inDegree[child] == 0 {
				frontier.PushBack(child)
			}
		}
	}
	return result
}