Adrian Prantl
TU Vienna
For this course, inspired by this paper I wrote an optimal instruction scheduler for the VEX processor family using the great clpfd Prolog library.
Sources are here: sched.pl!
#!/usr/bin/pl -q -t main -f
% -*- prolog -*-
% sched.pl -- an optimal instruction scheduler for the 4xVEX
%
% Copyright (C) June 2008, Adrian Prantl
%
% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program. If not, see <http://www.gnu.org/licenses/>
:- use_module(library(ugraphs)),
use_module(library(clpfd)).
%-----------------------------------------------------------------------
% Basic Block Schedule
%-----------------------------------------------------------------------
% Machine description
opcode_latency(mpyll, 3).
opcode_latency(Load, 3) :- is_load(Load), !.
opcode_latency(Store, 1) :- is_store(Store), !.
opcode_latency(_, 1).
branch_delay(asm_op(_, return, _, _, _), 4).
branch_delay(asm_op(_, goto, _, _, _), 4).
branch_delay(asm_op(_, br, _, _, _), 4).
branch_delay(asm_op(_, brf, _, _, _), 4).
barrier(label(_)).
barrier(asm_op(_, call, _)).
barrier(asm_op(_, return, _)).
barrier(asm_op(_, goto, _)).
barrier(asm_op(_, br, _)).
barrier(asm_op(_, brf, _)).
is_load('ldb').
is_load('ldb.d').
is_load('ldbu').
is_load('ldbu.d').
is_load('ldh').
is_load('ldh.d').
is_load('ldhu').
is_load('ldhu.d').
is_load('ldw').
is_load('ldw.d').
is_load('ldb.s').
is_load('ldb.s.d').
is_load('ldb.su').
is_load('ldb.su.d').
is_load('ldh.s').
is_load('ldh.s.d').
is_load('ldhu.s').
is_load('ldhu.s.d').
is_load('ldw.s').
is_load('ldw.s.d').
is_store(pft).
is_store(stb).
is_store(sth).
is_store(stw).
is_store('pft.s').
is_store('stb.s').
is_store('sth.s').
is_store('stw.s').
waw_constr(asm_op(_,OpCode,_,_,Pos1), def(O), def(O,asm_op(_,_,_,_,Pos2))) :- !,
opcode_latency(OpCode, Latency),
Pos2 #>= Pos1 + Latency.
waw_constr(_,_,_).
raw_constr(asm_op(_,OpCode,_,_,Pos1), def(O), use(O,asm_op(_,_,_,_,Pos2))) :- !,
opcode_latency(OpCode, Latency),
Pos2 #>= Pos1 + Latency.
raw_constr(_,_,_).
war_constr(asm_op(_,OpCode,_,_,Pos1), use(U), def(U,asm_op(_,_,_,_,Pos2))) :- !,
opcode_latency(OpCode, Latency),
Pos2 #>= Pos1 + Latency - 1.
war_constr(_,_,_).
collect_d(Op, Last, Defs, Uses) :-
Op = asm_op(N, OpCode, [D|Ds], Us, Pos1),
maplist(waw_constr(Op, D), Defs),
maplist(raw_constr(Op, D), [use(_,asm_op(_,_,_,_,Last))|Uses]),
collect_d(asm_op(N, OpCode, Ds, Us, Pos1), Last, Defs, Uses).
collect_d(asm_op(_, _, [], _, _), _, _, _).
collect_u(Op, Last, Defs, Uses) :-
Op = asm_op(N, OpCode, Ds, [U|Us], Pos1),
maplist(war_constr(Op, U), Defs),
collect_d(asm_op(N, OpCode, Ds, Us, Pos1), Last, Defs, Uses).
collect_u(asm_op(_, _, _, [], _), _, _, _).
% Pos = cycle
% Id = Pos*4 + Slot
instruction_constraints([Op|Ops], Last, Defs, Uses, [Pos|Ps], [Id|Ids]) :-
Op = asm_op(_,_,_,_, Pos), !,
Pos #>= 0, Pos #=< Last,
Slot in 0..3,
Id #= Pos*4+Slot,
collect_d(Op, Last, Defs, Uses),
collect_u(Op, Last, Defs, Uses),
(branch_delay(Op, Delay) ->
Slot #= 0,
Pos #= Last-Delay-1
; true),
update_defuse(Op, Defs, Uses, Defs1, Uses1),
instruction_constraints(Ops, Last, Defs1, Uses1, Ps, Ids).
instruction_constraints([label(_,_,_,Pos)|Ops], Last, Defs, Uses,
[Pos|Ps], [Id|Ids]) :-
% Schedule label behind last insn
Pos #>= 0,
Pos #= Last-1,
Id #= Pos*4,
instruction_constraints(Ops, Last, Defs, Uses, Ps, Ids).
instruction_constraints([], _, _, _, [], []).
block_schedule([], []).
block_schedule(OpsReversed, _) :-
instruction_constraints(OpsReversed, Last, [], [], Ps, Ids),
length(OpsReversed, N),
Last #> N/4,
Last #< N*4+4,
all_different(Ids),
append([[Last], Ids, Ps], Constraints),
%maplist(print_dom,Constraints),
once(labeling([min, up, step, min(Last)], Constraints)),
print_schedule(OpsReversed, Last).
print_dom(Var) :-
fd_dom(Var, Dom),
write('Dom('), write(Var), write(') = '), writeln(Dom).
print_schedule(Ops, End) :-
print_schedule(0, End, Ops), !.
print_schedule(N, End, Ops) :-
N < End,
%write('Cycle '), write(N), nl,
print_cycle(N, Ops),
N1 is N + 1,
print_schedule(N1, End, Ops).
print_schedule(_, _, _).
print_defuses([X|Ds]) :-
X =.. [_,R], format(' ~w', [R]),
( Ds = [_|_] -> write(',') ; true ),
print_defuses(Ds).
print_defuses([]).
print_cycle(N, Ops) :- print_slot(0, N, Ops).
print_slot(4, _, _) :- !, format('~t~4|;;~n').
print_slot(Slot, N, Ops) :-
select(label(Name, _, _, N), Ops, Ops1)
-> format('~w:~n',[Name])
; Ops1 = Ops,
(select(asm_op(Num, OpCode, Ds, Us, N), Ops1, Ops2)
-> (format('~t~4|~w', [OpCode]),
append(Ds, Us, DUs),
print_defuses(DUs))
; (format('~t~4|nop'), Ops2 = [], Num = -1)),
format('~t~48|## insn ~d, slot ~d, orig. ~d~n', [N, Slot, Num]),
S1 is Slot + 1,
print_slot(S1, N, Ops2).
%-----------------------------------------------------------------------
% DataDependencyGraph
%-----------------------------------------------------------------------
% The DDG is actually not used by the scheduler, as it is implicitly
% modelled with the constraints. It was, however, part of the
% assignment and thus, here it is!
get_memdefs(asm_op(_, Opc, _), [def(mem)]) :- barrier(Opc), !.
get_memdefs(asm_op(_, Opc, _), [def(mem)]) :- is_store(Opc), !.
get_memdefs(_, []).
get_memuses(asm_op(_, Opc, _),
[use(mem), use('$r.0'), use('$r.1'), use('$r.2'),
use('$r.3'), use('$r.4'), use('$r.5'), use('$r.6')]) :-
barrier(Opc), !.
get_memuses(asm_op(_, Opc, _), [use(mem)]) :- is_load(Opc), !.
get_memuses(_, []).
waw_edge(Op, Defs, [Op-wav_edge(Op,Op1),wav_edge(Op,Op1)-Op1]) :-
Op = asm_op(_,_,Ds,_,_),
member(def(O), Ds),
member(def(O, Op1), Defs).
raw_edge(Op, Uses, [Op-raw_edge(Op,Op1),raw_edge(Op,Op1)-Op1]) :-
Op = asm_op(_,_,Ds,_,_),
member(def(O), Ds),
member(use(O, Op1), Uses).
war_edge(Op, Defs, [Op-war_edge(Op,Op1),war_edge(Op,Op1)-Op1]) :-
Op = asm_op(_,_,_,Us,_),
member(use(U), Us),
member(def(U, Op1), Defs).
bb_ddg(Ops, Ops1, DDG) :-
bb_ddg(Ops, [], [], [], Ops1, DDG).
bb_ddg([Op|Ops], DDG, Defs, Uses,
[Op1|Ops_O], DDG_O) :-
Op = asm_op(N, OpCode, DUs),
findall(def(O), member(def(O), DUs), Ds),
findall(use(U), member(use(U), DUs), Us),
get_memdefs(Op, MemDefs), append(MemDefs, Ds, Ds1),
get_memuses(Op, MemUses), append(MemUses, Us, Us1),
Op1= asm_op(N, OpCode, Ds1, Us1, _Pos), % Pos will be bound by the scheduler
write('## '), writeln(Op1),
add_vertices(DDG, [Op1], DDG1),
% Write-After-Write
findall(Edge1, waw_edge(Op1, Defs, Edge1), WAW_Edges),
% Read-After-Write
findall(Edge2, raw_edge(Op1, Uses, Edge2), RAW_Edges),
% Write-After-Read
findall(Edge3, war_edge(Op1, Defs, Edge3), WAR_Edges),
% Update
append([WAW_Edges, RAW_Edges, WAR_Edges], E2),
flatten(E2, Edges),
add_edges(DDG1, Edges, DDG2),
update_defuse(Op1, Defs, Uses, Defs1, Uses1), !,
% Rest
bb_ddg(Ops, DDG2, Defs1, Uses1,
Ops_O, DDG_O).
bb_ddg([Op|Ops], DDG, Defs, Uses, [Op1|Ops_O], DDG_O) :-
Op = label(N),
Op1= label(N, Defs, Uses, _Pos),
add_vertices(DDG, [Op1], DDG1),
bb_ddg(Ops, DDG1, Defs, Uses, Ops_O, DDG_O).
bb_ddg([], DDG, _, _, [], DDG).
% Assert that the free Variables are in fact one and the same
eq_pos(Pos, def(_, asm_op(_, _, _, Pos))).
eq_pos(Pos, def(_, label(_, _, _, Pos))).
eq_pos(Pos, use(_, asm_op(_, _, _, Pos))).
eq_pos(Pos, use(_, label(_, _, _, Pos))).
add_def(Op, def(O), def(O, Op)).
add_use(Op, use(U), use(U, Op)).
def_templ(def(O), def(O, _)).
update_defuse(Op, Defs, Uses, Defs1, Uses2) :-
Op = asm_op(_, _, Ds, Us, _),
%findall(def(O, Op), member(def(O), DUs), NewDefs),
%maplist(eq_pos(Pos), NewDefs), !,
maplist(add_def(Op), Ds, NewDefs),
list_to_ord_set(NewDefs, NewDefs1),
ord_union(Defs, NewDefs1, Defs1),
%findall(use(O, _), member(def(O), DUs), OldUses),
maplist(def_templ, Ds, OldUses),
delete_all(OldUses, Uses, Uses1), !,
%findall(use(U, Op), member(use(U), DUs), NewUses),
%maplist(eq_pos(Pos), NewUses), !,
maplist(add_use(Op), Us, NewUses),
list_to_ord_set(NewUses, NewUses1),
ord_union(Uses1, NewUses1, Uses2).
% Some useless List-Management, because findall/3 needs too much global memory
delete_all([X|Xs], List, List1) :-
delete_all1(X, List, List0),
delete_all(Xs, List0, List1).
delete_all([], List, List).
delete_all1(E, [X|Xs], Ys) :-
copy_term(E, E1),
E1 = X, !,
delete_all1(E, Xs, Ys).
delete_all1(E, [X|Xs], [X|Ys]) :-
delete_all1(E, Xs, Ys).
delete_all1(_, [], []).
%-----------------------------------------------------------------------
% GRAPH Printing
%-----------------------------------------------------------------------
display(N) :- write(N).
dump_graph(Method, Filename, Graph) :-
open(Filename, write, _, [alias(outstrm)]),
call(Method, outstrm, Graph),
close(outstrm).
% grahviz/2: Dump an ugraph in dotty syntax
viz_edge(Stream, Edge) :-
Edge = N1-N2,
write(Stream, '"'), with_output_to(Stream, display(N1)), write(Stream, '"'),
write(Stream, ' -> '),
write(Stream, '"'), with_output_to(Stream, display(N2)), write(Stream, '"'),
write(Stream, ';\n').
graphviz(Stream, G) :-
edges(G, E),
write(Stream, 'digraph G {\n'),
maplist(viz_edge(Stream), E),
write(Stream, '}\n').
% vcg/2: Dump an ugraph in VCG syntax
vcg_node(Stream, Node) :-
write(Stream, ' node: { '),
write(Stream, 'title: '),
write(Stream, '"'), with_output_to(Stream, display(Node)), write(Stream, '"'),
write(Stream, ' }\n').
vcg_edge(Stream, Edge) :-
Edge = N1-N2,
write(Stream, ' edge: {\n'),
write(Stream, ' sourcename: '),
write(Stream, '"'), with_output_to(Stream, display(N1)), write(Stream, '"'),
write(Stream, '\n'),
write(Stream, ' targetname: '),
write(Stream, '"'), with_output_to(Stream, display(N2)), write(Stream, '"'),
write(Stream, '\n'),
write(Stream, ' }\n').
vcg(Stream, G) :-
write(Stream, 'graph: {\n'),
vertices(G, V1),
reverse(V1, V),
maplist(vcg_node(Stream), V),
edges(G, E1),
reverse(E1, E),
maplist(vcg_edge(Stream), E),
write(Stream, '}\n').
%-----------------------------------------------------------------------
%
program_schedule(vex_program(Functions), vex_program(FunctionsS)) :-
append(Fs, [null], Functions),
maplist(function_schedule, Fs, FunctionsS).
function_schedule(function(Name, Ops),
function(Name, OpsS)) :-
write('## Scheduling function '), writeln(Name),
append(Ops1, [null], Ops),
find_basicblocks(Ops1, OpList),
maplist(bb_schedule, OpList, OpsS).
find_basicblocks([Op|Ops], [[Op]|Bbs]) :-
barrier(Op), !,
find_basicblocks(Ops, Bbs).
find_basicblocks([Op|Ops], [[Op|Bb]|Bbs]) :- !,
find_basicblocks(Ops, [Bb|Bbs]).
find_basicblocks([], [[]]).
bb_schedule(Ops, OpsS) :-
reverse(Ops, OpsR),
bb_ddg(OpsR, OpsR1, DDG),
% Debug output
retract(counter(X)), Y is X+1, assert(counter(Y)),
concat_atom(['ddg-', X, '.vcg'], '', VCGFile),
dump_graph(vcg, VCGFile, DDG),
block_schedule(OpsR1, OpsS).
main :-
assert(counter(0)),
% Read input file
open('program.pl', read, _, [alias(rstrm)]),
read_term(rstrm, Program, []),
close(rstrm),
program_schedule(Program, _).