Adrian Prantl
TU Vienna

Optimal Instruction Scheduling

An optimal instruction scheduler for VLIW architectures

For this course, inspired by this paper I wrote an optimal instruction scheduler for the VEX processor family using the great clpfd Prolog library.


Sources are here: sched.pl!

    

#!/usr/bin/pl -q -t main -f
% -*- prolog -*-

% sched.pl -- an optimal instruction scheduler for the 4xVEX
%
% Copyright (C) June 2008, Adrian Prantl
%
%   This program is free software: you can redistribute it and/or modify
%   it under the terms of the GNU General Public License as published by
%   the Free Software Foundation, either version 3 of the License, or
%   (at your option) any later version.
%
%   This program is distributed in the hope that it will be useful,
%   but WITHOUT ANY WARRANTY; without even the implied warranty of
%   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
%   GNU General Public License for more details.
%
%   You should have received a copy of the GNU General Public License
%   along with this program.  If not, see <http://www.gnu.org/licenses/>


:- use_module(library(ugraphs)),
   use_module(library(clpfd)).

%-----------------------------------------------------------------------
% Basic Block Schedule
%-----------------------------------------------------------------------

% Machine description
opcode_latency(mpyll, 3).
opcode_latency(Load, 3) :- is_load(Load), !.
opcode_latency(Store, 1) :- is_store(Store), !.
opcode_latency(_, 1).

branch_delay(asm_op(_, return, _, _, _), 4).
branch_delay(asm_op(_, goto, _, _, _), 4).
branch_delay(asm_op(_, br, _, _, _), 4).
branch_delay(asm_op(_, brf, _, _, _), 4).

barrier(label(_)).
barrier(asm_op(_, call, _)).
barrier(asm_op(_, return, _)).
barrier(asm_op(_, goto, _)).
barrier(asm_op(_, br, _)).
barrier(asm_op(_, brf, _)).

is_load('ldb').
is_load('ldb.d').
is_load('ldbu').
is_load('ldbu.d').
is_load('ldh').
is_load('ldh.d').
is_load('ldhu').
is_load('ldhu.d').
is_load('ldw').
is_load('ldw.d').

is_load('ldb.s').
is_load('ldb.s.d').
is_load('ldb.su').
is_load('ldb.su.d').
is_load('ldh.s').
is_load('ldh.s.d').
is_load('ldhu.s').
is_load('ldhu.s.d').
is_load('ldw.s').
is_load('ldw.s.d').

is_store(pft).
is_store(stb).
is_store(sth).
is_store(stw).
is_store('pft.s').
is_store('stb.s').
is_store('sth.s').
is_store('stw.s').

waw_constr(asm_op(_,OpCode,_,_,Pos1), def(O), def(O,asm_op(_,_,_,_,Pos2))) :- !,
  opcode_latency(OpCode, Latency),
  Pos2 #>= Pos1 + Latency.
waw_constr(_,_,_).

raw_constr(asm_op(_,OpCode,_,_,Pos1), def(O), use(O,asm_op(_,_,_,_,Pos2))) :- !,
  opcode_latency(OpCode, Latency),
  Pos2 #>= Pos1 + Latency.
raw_constr(_,_,_).

war_constr(asm_op(_,OpCode,_,_,Pos1), use(U), def(U,asm_op(_,_,_,_,Pos2))) :- !,
  opcode_latency(OpCode, Latency),
  Pos2 #>= Pos1 + Latency - 1.
war_constr(_,_,_).


collect_d(Op, Last, Defs, Uses) :-
  Op = asm_op(N, OpCode, [D|Ds], Us, Pos1),
  maplist(waw_constr(Op, D), Defs),
  maplist(raw_constr(Op, D), [use(_,asm_op(_,_,_,_,Last))|Uses]),
  collect_d(asm_op(N, OpCode, Ds, Us, Pos1), Last, Defs, Uses).
collect_d(asm_op(_, _, [], _, _), _, _, _).

collect_u(Op, Last, Defs, Uses) :-
  Op = asm_op(N, OpCode, Ds, [U|Us], Pos1),
  maplist(war_constr(Op, U), Defs),
  collect_d(asm_op(N, OpCode, Ds, Us, Pos1), Last, Defs, Uses).
collect_u(asm_op(_, _, _, [], _), _, _, _).


% Pos = cycle
% Id = Pos*4 + Slot
instruction_constraints([Op|Ops], Last, Defs, Uses, [Pos|Ps], [Id|Ids]) :-
  Op = asm_op(_,_,_,_, Pos), !,
  Pos #>= 0, Pos #=< Last,
  Slot in 0..3,
  Id #= Pos*4+Slot,
  collect_d(Op, Last, Defs, Uses),
  collect_u(Op, Last, Defs, Uses),

  (branch_delay(Op, Delay) ->
   Slot #= 0,
   Pos #= Last-Delay-1
  ; true),

  update_defuse(Op, Defs, Uses, Defs1, Uses1),
  instruction_constraints(Ops, Last, Defs1, Uses1, Ps, Ids).

instruction_constraints([label(_,_,_,Pos)|Ops], Last, Defs, Uses,
                        [Pos|Ps], [Id|Ids]) :-
  % Schedule label behind last insn
  Pos #>= 0,
  Pos #= Last-1,
  Id #= Pos*4,
  instruction_constraints(Ops, Last, Defs, Uses, Ps, Ids).

instruction_constraints([], _, _, _, [], []).

block_schedule([], []).
block_schedule(OpsReversed, _) :-
  instruction_constraints(OpsReversed, Last, [], [], Ps, Ids),
  length(OpsReversed, N),
  Last #> N/4,
  Last #< N*4+4,
  all_different(Ids),
  append([[Last], Ids, Ps], Constraints),
  %maplist(print_dom,Constraints),
  once(labeling([min, up, step, min(Last)], Constraints)),
  print_schedule(OpsReversed, Last).

print_dom(Var) :-
  fd_dom(Var, Dom),
  write('Dom('), write(Var), write(') = '), writeln(Dom).

print_schedule(Ops, End) :-
  print_schedule(0, End, Ops), !.

print_schedule(N, End, Ops) :-
  N < End,
  %write('Cycle '), write(N), nl,
  print_cycle(N, Ops),
  N1 is N + 1,
  print_schedule(N1, End, Ops).
print_schedule(_, _, _).

print_defuses([X|Ds]) :-
  X =.. [_,R], format(' ~w', [R]),
  ( Ds = [_|_] -> write(',') ; true ),
  print_defuses(Ds).
print_defuses([]).

print_cycle(N, Ops) :- print_slot(0, N, Ops).
print_slot(4, _, _) :- !, format('~t~4|;;~n').
print_slot(Slot, N, Ops) :-
  select(label(Name, _, _, N), Ops, Ops1)
  -> format('~w:~n',[Name])
  ;  Ops1 = Ops,
  
  (select(asm_op(Num, OpCode, Ds, Us, N), Ops1, Ops2)
  -> (format('~t~4|~w', [OpCode]),
      append(Ds, Us, DUs),
      print_defuses(DUs))
  ; (format('~t~4|nop'), Ops2 = [], Num = -1)),

  format('~t~48|## insn ~d, slot ~d, orig. ~d~n', [N, Slot, Num]),
  
  S1 is Slot + 1,
  print_slot(S1, N, Ops2).


%-----------------------------------------------------------------------
% DataDependencyGraph
%-----------------------------------------------------------------------

% The DDG is actually not used by the scheduler, as it is implicitly
% modelled with the constraints. It was, however, part of the
% assignment and thus, here it is!

get_memdefs(asm_op(_, Opc, _), [def(mem)]) :- barrier(Opc), !.
get_memdefs(asm_op(_, Opc, _), [def(mem)]) :- is_store(Opc), !.
get_memdefs(_, []).

get_memuses(asm_op(_, Opc, _),
            [use(mem), use('$r.0'), use('$r.1'), use('$r.2'),
             use('$r.3'), use('$r.4'), use('$r.5'), use('$r.6')]) :-
  barrier(Opc), !.

get_memuses(asm_op(_, Opc, _), [use(mem)]) :- is_load(Opc), !.
get_memuses(_, []).

waw_edge(Op, Defs, [Op-wav_edge(Op,Op1),wav_edge(Op,Op1)-Op1]) :-
  Op = asm_op(_,_,Ds,_,_),
  member(def(O), Ds),
  member(def(O, Op1), Defs).

raw_edge(Op, Uses, [Op-raw_edge(Op,Op1),raw_edge(Op,Op1)-Op1]) :-
  Op = asm_op(_,_,Ds,_,_),
  member(def(O), Ds),
  member(use(O, Op1), Uses).

war_edge(Op, Defs, [Op-war_edge(Op,Op1),war_edge(Op,Op1)-Op1]) :-
  Op = asm_op(_,_,_,Us,_),
  member(use(U), Us),
  member(def(U, Op1), Defs).

bb_ddg(Ops, Ops1, DDG) :-
  bb_ddg(Ops, [], [], [], Ops1, DDG).

bb_ddg([Op|Ops], DDG, Defs, Uses,
       [Op1|Ops_O], DDG_O) :-
  Op = asm_op(N, OpCode, DUs),
  findall(def(O), member(def(O), DUs), Ds),
  findall(use(U), member(use(U), DUs), Us),
  get_memdefs(Op, MemDefs), append(MemDefs, Ds, Ds1),
  get_memuses(Op, MemUses), append(MemUses, Us, Us1),
  Op1= asm_op(N, OpCode, Ds1, Us1, _Pos),  % Pos will be bound by the scheduler
  write('## '), writeln(Op1),

  add_vertices(DDG, [Op1], DDG1),

  % Write-After-Write
  findall(Edge1, waw_edge(Op1, Defs, Edge1), WAW_Edges),

  % Read-After-Write
  findall(Edge2, raw_edge(Op1, Uses, Edge2), RAW_Edges),

  % Write-After-Read
  findall(Edge3, war_edge(Op1, Defs, Edge3), WAR_Edges),
  
  % Update
  append([WAW_Edges, RAW_Edges, WAR_Edges], E2),
  flatten(E2, Edges),
  add_edges(DDG1, Edges, DDG2),

  update_defuse(Op1, Defs, Uses, Defs1, Uses1), !,

  % Rest
  bb_ddg(Ops, DDG2, Defs1, Uses1,
         Ops_O, DDG_O).

bb_ddg([Op|Ops], DDG, Defs, Uses, [Op1|Ops_O], DDG_O) :-
  Op = label(N),
  Op1= label(N, Defs, Uses, _Pos),
  add_vertices(DDG, [Op1], DDG1),
  bb_ddg(Ops, DDG1, Defs, Uses, Ops_O, DDG_O).

bb_ddg([], DDG, _, _, [], DDG).

% Assert that the free Variables are in fact one and the same
eq_pos(Pos, def(_, asm_op(_, _, _, Pos))).
eq_pos(Pos, def(_,  label(_, _, _, Pos))).
eq_pos(Pos, use(_, asm_op(_, _, _, Pos))).
eq_pos(Pos, use(_,  label(_, _, _, Pos))).

add_def(Op, def(O), def(O, Op)).
add_use(Op, use(U), use(U, Op)).
def_templ(def(O), def(O, _)).

update_defuse(Op, Defs, Uses, Defs1, Uses2) :-
  Op = asm_op(_, _, Ds, Us, _),
  %findall(def(O, Op), member(def(O), DUs), NewDefs),
  %maplist(eq_pos(Pos), NewDefs), !,
  maplist(add_def(Op), Ds, NewDefs),
  list_to_ord_set(NewDefs, NewDefs1),
  ord_union(Defs, NewDefs1, Defs1),

  %findall(use(O, _), member(def(O), DUs), OldUses),
  maplist(def_templ, Ds, OldUses),  
  delete_all(OldUses, Uses, Uses1), !,

  %findall(use(U, Op), member(use(U), DUs), NewUses),
  %maplist(eq_pos(Pos), NewUses), !,
  maplist(add_use(Op), Us, NewUses),
  list_to_ord_set(NewUses, NewUses1),
  ord_union(Uses1, NewUses1, Uses2).


% Some useless List-Management, because findall/3 needs too much global memory
delete_all([X|Xs], List, List1) :-
  delete_all1(X, List, List0),
  delete_all(Xs, List0, List1).
delete_all([], List, List).

delete_all1(E, [X|Xs], Ys) :-
  copy_term(E, E1),
  E1 = X, !,
  delete_all1(E, Xs, Ys).
delete_all1(E, [X|Xs], [X|Ys]) :-
  delete_all1(E, Xs, Ys).
delete_all1(_, [], []).

%-----------------------------------------------------------------------
% GRAPH Printing
%-----------------------------------------------------------------------

display(N) :- write(N).

dump_graph(Method, Filename, Graph) :-  
  open(Filename, write, _, [alias(outstrm)]),
  call(Method, outstrm, Graph),
  close(outstrm).

% grahviz/2: Dump an ugraph in dotty syntax
viz_edge(Stream, Edge) :-
  Edge = N1-N2,
  write(Stream, '"'), with_output_to(Stream, display(N1)), write(Stream, '"'),
  write(Stream, ' -> '),
  write(Stream, '"'), with_output_to(Stream, display(N2)), write(Stream, '"'),
  write(Stream, ';\n').

graphviz(Stream, G) :-
  edges(G, E),
  write(Stream, 'digraph G {\n'),
  maplist(viz_edge(Stream), E),
  write(Stream, '}\n').

% vcg/2: Dump an ugraph in VCG syntax

vcg_node(Stream, Node) :-
  write(Stream, '  node: { '),
  write(Stream, 'title: '),
  write(Stream, '"'), with_output_to(Stream, display(Node)), write(Stream, '"'),
  write(Stream, ' }\n').

vcg_edge(Stream, Edge) :-
  Edge = N1-N2,
  write(Stream, '  edge: {\n'),
  write(Stream, '    sourcename: '),
  write(Stream, '"'), with_output_to(Stream, display(N1)), write(Stream, '"'),
  write(Stream, '\n'),
  write(Stream, '    targetname: '),
  write(Stream, '"'), with_output_to(Stream, display(N2)), write(Stream, '"'),
  write(Stream, '\n'),
  write(Stream, '  }\n').


vcg(Stream, G) :-
  write(Stream, 'graph: {\n'),
  vertices(G, V1),
  reverse(V1, V),
  maplist(vcg_node(Stream), V),
  edges(G, E1),
  reverse(E1, E),
  maplist(vcg_edge(Stream), E),
  write(Stream, '}\n').

%-----------------------------------------------------------------------

%

program_schedule(vex_program(Functions), vex_program(FunctionsS)) :-
  append(Fs, [null], Functions),
  maplist(function_schedule, Fs, FunctionsS).

function_schedule(function(Name, Ops),
                  function(Name, OpsS)) :-
  write('## Scheduling function '), writeln(Name),
  append(Ops1, [null], Ops),
  find_basicblocks(Ops1, OpList),
  maplist(bb_schedule, OpList, OpsS).

find_basicblocks([Op|Ops], [[Op]|Bbs]) :-
  barrier(Op), !,
  find_basicblocks(Ops, Bbs).
find_basicblocks([Op|Ops], [[Op|Bb]|Bbs]) :- !,
  find_basicblocks(Ops, [Bb|Bbs]).
find_basicblocks([], [[]]).

bb_schedule(Ops, OpsS) :-
  reverse(Ops, OpsR),
  bb_ddg(OpsR, OpsR1, DDG),

  % Debug output
  retract(counter(X)), Y is X+1, assert(counter(Y)),
  concat_atom(['ddg-', X, '.vcg'], '', VCGFile),
  dump_graph(vcg, VCGFile, DDG),
  block_schedule(OpsR1, OpsS).

main :-
  assert(counter(0)),
  % Read input file
  open('program.pl', read, _, [alias(rstrm)]),
  read_term(rstrm, Program, []),
  close(rstrm),
  program_schedule(Program, _).


Complang Group
Adrian Prantl
   Scheduling
Sitemap
Faculty of Informatics
Vienna University of Technology
top | HTML 4.01 | last update: 2014-06-07 (Adrian)