/** Chisel black box backed by the Verilog source `/vsrc/GCDMMIOBlackBox.v`.
  *
  * The ports are supplied by the `HasGCDIO` mix-in (declared elsewhere).
  *
  * @param w value passed to the Verilog module as its `WIDTH` parameter
  */
class GCDMMIOBlackBox(val w: Int)
    extends BlackBox(Map("WIDTH" -> IntParam(w)))
    with HasBlackBoxResource
    with HasGCDIO {
  // Register the Verilog implementation as a resource of this black box.
  addResource("/vsrc/GCDMMIOBlackBox.v")
}
Stack of Perl and macros in SystemVerilog ⮕ new RTL-level DSLs
A rich ecosystem of design languages enables higher productivity, but the languages are difficult to compose and integrate.
Full system evaluation at RTL-level.
SoC level parameterization[1] and integration of external IP.
How do we move towards the next paradigm?
We resort to performance models because writing and evaluating RTL is hard
Sum a vector of $M$ inputs, each $N$ bits wide
// Sums M inputs of N bits each with a chain of adders.
//   a   : M-element array of N-bit inputs
//   sum : N-bit total of all elements (carries beyond N bits are dropped)
module MyModule
#(parameter N, M)
(
  input [N-1:0] a [0:M-1],
  output [N-1:0] sum
);
  // tmp[i] holds the running total a[0] + ... + a[i].
  // It is a net (not a reg) because it is driven by continuous assignments.
  wire [N-1:0] tmp [0:M-1];

  assign tmp[0] = a[0];
  // The last running total is the full sum; index M would be past the last
  // element written by the generate loop below.
  assign sum = tmp[M-1];

  genvar i;
  generate
    for (i = 1; i < M; i = i + 1) begin : gen_partial_sum
      assign tmp[i] = tmp[i-1] + a[i];
    end
  endgenerate
endmodule
/** Chisel module that adds together every element of an input vector.
  *
  * @param N bit width of each input element and of the `sum` output
  * @param M number of elements in the input vector
  */
class MyModule(N: Int, M: Int) extends Module {
  val io = IO(new Bundle {
    val a   = Vec(M, Input(UInt(N.W)))
    val sum = Output(UInt(N.W))
  })

  // Combine the elements pairwise with `+` and drive the result onto the output.
  io.sum := io.a.reduce { (acc, elem) => acc + elem }
}
Leverage Scala's collection operators to make working with sequences easy
; FIRRTL for MyModule with four 4-bit inputs: a chain of add/tail pairs.
; Each `add` result is passed through `tail(..., 1)` before the next addition.
circuit MyModule :
  module MyModule :
    input clock : Clock
    input reset : UInt<1>
    output io : { flip a : UInt<4>[4], sum : UInt<4>}

    node _io_sum_T = add(io.a[0], io.a[1]) @[MyModule.scala 14:27]
    node _io_sum_T_1 = tail(_io_sum_T, 1) @[MyModule.scala 14:27]
    node _io_sum_T_2 = add(_io_sum_T_1, io.a[2]) @[MyModule.scala 14:27]
    node _io_sum_T_3 = tail(_io_sum_T_2, 1) @[MyModule.scala 14:27]
    node _io_sum_T_4 = add(_io_sum_T_3, io.a[3]) @[MyModule.scala 14:27]
    node _io_sum_T_5 = tail(_io_sum_T_4, 1) @[MyModule.scala 14:27]
    io.sum <= _io_sum_T_5 @[MyModule.scala 14:10]
Demo: Integrating IP from another design language to perform full system SoC-level evaluation
// Calyx component with one 4-bit input port `in` and one 4-bit output port `out`.
// The contents of the cells, wires and control sections are elided ("...") in this listing.
component CalyxSumBlackBox(in: 4) -> (out: 4) {
cells { ... }
wires { ... }
control { ... }
}
Receive a 4-bit input, accumulate it three times over multiple cycles (yielding 3× the input), then output the result
/** Port bundle used on the Chisel side for the Calyx sum component.
  *
  * @param nBits width of the `in` and `out` data ports
  */
class CalyxSumIO(nBits: Int) extends Bundle {
  // Clock and reset are ordinary input ports of the bundle.
  val clk   = Input(Clock())
  val reset = Input(Bool())

  // Operand and the `go` control input.
  val in = Input(UInt(nBits.W))
  val go = Input(Bool())

  // Result and the `done` status output.
  val out  = Output(UInt(nBits.W))
  val done = Output(Bool())
}
/** Chisel black box whose implementation is the file `/vsrc/aggregator.sv`.
  *
  * @param nBits width used for the data ports of the [[CalyxSumIO]] bundle
  */
class CalyxSumBlackBox(nBits: Int) extends BlackBox with HasBlackBoxResource {
  val io = IO(new CalyxSumIO(nBits))

  // Register the SystemVerilog implementation as a resource of this black box.
  addResource("/vsrc/aggregator.sv")
}
/** Configuration of the Calyx sum MMIO device.
  *
  * @param address base address of the device's register region
  * @param qDepth  depth of the input and output queues in the wrapper
  * @param nBits   width of the operand and result
  * @param nSum    stored by the wrapper as `nSum`; not otherwise used in this listing
  */
case class CalyxSumParams(
  address: BigInt = 0x5000,
  qDepth: Int = 4,
  nBits: Int = 4,
  nSum: Int = 3
)
// Config key holding the optional Calyx sum device parameters; defaults to None.
case object CalyxSumKey extends Field[Option[CalyxSumParams]](None)
/** Memory-mapped wrapper around [[CalyxSumBlackBox]], attached through a TileLink register node.
  *
  * Operands are buffered in an input queue and results in an output queue. The register map,
  * relative to `params.address`, is:
  *   - 0x00 (read):  1 bit, input queue `enq.ready`
  *   - 0x04 (write): `nBits` bits, enqueue an operand into the input queue
  *   - 0x08 (read):  1 bit, output queue `deq.valid`
  *   - 0x0C (read):  `nBits` bits, dequeue a result from the output queue
  *
  * @param params    base address, queue depth and data width of the device
  * @param beatBytes passed to the register node as its `beatBytes`
  */
class CalyxSumMMIOWrapper(
  params: CalyxSumParams, beatBytes: Int
)(
  implicit p: Parameters
) extends ClockSinkDomain(ClockSinkParameters())(p) {
  val device = new SimpleDevice("calyx-sum", Seq("ucbbar,calyx-sum"))
  val node = TLRegisterNode(
    Seq(AddressSet(params.address, 4096 - 1)),
    device,
    "reg/control",
    beatBytes = beatBytes)

  val nBits = params.nBits
  val nSum = params.nSum

  override lazy val module = new MMIOWrapperImpl

  class MMIOWrapperImpl extends Impl with HasCalyxSumTopIO {
    val io = IO(new CalyxSumTopIO)

    withClockAndReset(clock, reset) {
      val bb = Module(new CalyxSumBlackBox(nBits))
      val in_q = Module(new Queue(UInt(nBits.W), params.qDepth))
      val out_q = Module(new Queue(UInt(nBits.W), params.qDepth))

      // `go` is high while the black box is working on the operand at the head of in_q.
      val go = RegInit(false.B)

      when (!go) {
        // Start only when an operand is waiting and the result will have somewhere to go.
        when (in_q.io.count > 0.U && out_q.io.enq.ready) {
          go := true.B
        }
      } .otherwise {
        when (bb.io.done) {
          go := false.B
        }
      }

      bb.io.clk := clock
      bb.io.reset := reset.asBool
      bb.io.go := go
      bb.io.in := in_q.io.deq.bits

      // On `done`, the operand is consumed and the result is pushed in the same cycle.
      in_q.io.deq.ready := bb.io.done
      out_q.io.enq.bits := bb.io.out
      out_q.io.enq.valid := bb.io.done
      io.done := bb.io.done

      // Space in out_q was checked before starting, so the push must be accepted.
      when (bb.io.done) {
        assert(out_q.io.enq.ready)
      }

      node.regmap(
        0x00 -> Seq(RegField.r(1, in_q.io.enq.ready)),
        0x04 -> Seq(RegField.w(nBits, in_q.io.enq)),
        0x08 -> Seq(RegField.r(1, out_q.io.deq.valid)),
        0x0C -> Seq(RegField.r(nBits, out_q.io.deq))
      )
    }
  }
}
/** Subsystem mix-in that attaches the Calyx sum MMIO device when `CalyxSumKey` is set.
  *
  * `calyx_sum_done` is `Some(...)` wrapping a top-level `calyx_sum_done` output port when the
  * device is instantiated, and `None` when `CalyxSumKey` is `None`.
  */
trait CanHaveMMIOCalyxSum { this: BaseSubsystem =>
  private val pbus = locateTLBusWrapper(PBUS)

  val calyx_sum_done = p(CalyxSumKey).map { params =>
    val cs = LazyModule(new CalyxSumMMIOWrapper(params, pbus.beatBytes)(p))

    // Clock the wrapper from the bus it hangs off and hook up its register node.
    cs.clockNode := pbus.fixedClockNode
    pbus.coupleTo("calyx_sum_mmio_wrapper") {
      cs.node := TLFragmenter(pbus.beatBytes, pbus.blockBytes) := _
    }

    // Add port to DigitalTop (just for fun)
    InModuleBody {
      val donePort = IO(Output(Bool())).suggestName("calyx_sum_done")
      donePort := cs.module.io.done
      donePort
    }
  }
}
/** `ChipyardSystem` extended with the optional Calyx sum MMIO device mix-in. */
class DigitalTop(implicit p: Parameters)
    extends ChipyardSystem
    // Enables optionally adding a Calyx generated module as a MMIO device
    with chipyard.example.CanHaveMMIOCalyxSum {
  override lazy val module = new DigitalTopModule(this)
}
/** Config fragment that sets `CalyxSumKey` to the default [[CalyxSumParams]]. */
class WithCalyxSum extends Config((_, _, _) => {
  // None of the three config views is consulted; the key maps to a fixed value.
  case CalyxSumKey => Some(CalyxSumParams())
})
/** Config composed of the Calyx sum fragment, one big Rocket core and Chipyard's abstract config. */
class CalyxSumRocketConfig
    extends Config(
      new chipyard.example.WithCalyxSum ++
        new freechips.rocketchip.subsystem.WithNBigCores(1) ++
        new chipyard.config.AbstractConfig
    )
At this point, the SoC level configuration is finished
/* Register addresses of the Calyx sum device: a base address plus four word offsets. */
#define CALYX_SUM_BASE     0x5000
#define CALYX_SUM_ENQ_RDY  (CALYX_SUM_BASE + 0x0) /* input side: ready flag    */
#define CALYX_SUM_ENQ_BITS (CALYX_SUM_BASE + 0x4) /* input side: operand write */
#define CALYX_SUM_DEQ_VAL  (CALYX_SUM_BASE + 0x8) /* output side: valid flag   */
#define CALYX_SUM_DEQ_BITS (CALYX_SUM_BASE + 0xC) /* output side: result read  */
/* Reads the ENQ_RDY register, logs its value, and returns 1 if it is non-zero, else 0. */
static inline int calyx_sum_enq_ready() {
  const int ready_flag = reg_read32(CALYX_SUM_ENQ_RDY);

  printf("calyx_sum_enq_ready: %d\n", ready_flag);
  return ready_flag ? 1 : 0;
}
/* Spins until the device reports ready, then writes the low 4 bits of `val` to ENQ_BITS. */
static inline void calyx_sum_send_input(int val) {
  /* Busy-wait; each poll also prints the ready flag. */
  while (calyx_sum_enq_ready() == 0) {
  }

  printf("sending input: %d\n", val);
  reg_write32(CALYX_SUM_ENQ_BITS, val & 0xf);
  printf("sending input done\n");
}
/* Reads the DEQ_VAL register, logs its value, and returns 1 if it is non-zero, else 0. */
static inline int calyx_sum_deq_valid() {
  const int valid_flag = reg_read32(CALYX_SUM_DEQ_VAL);

  printf("calyx_sum_deq_val: %d\n", valid_flag);
  return valid_flag ? 1 : 0;
}
/* Spins until the device reports a valid result, then returns the value read from DEQ_BITS. */
static inline int calyx_sum_get_output() {
  for (;;) {
    if (calyx_sum_deq_valid()) {
      break;
    }
  }
  return reg_read32(CALYX_SUM_DEQ_BITS);
}
#define TEST_SIZE 3

/*
 * Sends each test operand to the device and checks that the result is three
 * times the operand. Returns 1 on the first mismatch, 0 if every check passes.
 */
int main() {
  int test_inputs[TEST_SIZE] = {1, 2, 3};
  int i = 0;

  while (i < TEST_SIZE) {
    const int operand = test_inputs[i];

    calyx_sum_send_input(operand);
    const int out = calyx_sum_get_output();
    const int expect = operand * 3;

    if (out != expect) {
      printf("expect %d got %d\n", expect, out);
      return 1;
    }
    i++;
  }

  printf("[*] Test success!\n");
  return 0;
}
# Build in the tests directory (presumably produces calyx-sum.riscv; confirm against the tests Makefile).
cd chipyard/tests
make
# Return to the previous working directory.
cd -
# From the Verilator directory, run the test binary with the CalyxSumRocketConfig configuration.
cd chipyard/sims/verilator
make -j$(nproc) run-binary CONFIG=CalyxSumRocketConfig BINARY=../../tests/calyx-sum.riscv
We still have a long way to go
Our take on design considerations for the next generation of hardware design languages and tools
// Interface sketch only: the methods have no bodies, so this is not compilable as written.
// Each overload pairs an RTL design with a design of another kind (HLS or ALS) and returns
// an optional Edge.
object ElaborationInterop {
def compose(rtl : RTL, hls: HLS): Option[Edge]
def compose(rtl : RTL, als: ALS): Option[Edge]
}
// Three 4-bit wires with identical declarations; only the names hint at the
// intended interpretation of the bits.
wire [3:0] uint_val;
wire [3:0] one_hot;
wire [3:0] bit_vector;
// Sketch of distinct hardware-level types (pseudo-code; not compilable Scala as written).
// UInt is listed twice, once with a Binary and once with a OneHot representation.
case class Clock
case class Reset
case class PriorityMux
case class UInt(repr=Binary)
case class UInt(repr=OneHot)
case class BitVector
The source of this presentation can be found here: https://github.com/vighneshiyer/publications
FIRRTL's combinational loop detection pass
// Excerpt of a transform that reports combinational loops; "..." marks elided code.
class CheckCombLoops
extends Transform
with RegisteredTransform
with DependencyAPIMigration {
...
// Records the dependencies introduced by one statement into `deps`.
// Dispatches on the statement type; any statement not matched explicitly is
// handled by recursing into its child statements.
private def getStmtDeps(
simplifiedModules: mutable.Map[String, AbstractConnMap],
deps: MutableConnMap
)(s: Statement
): Unit = s match {
case Connect(info, loc, expr) => ...
case w: DefWire => ...
case DefNode(info, name, value) =>
...
// `lhs` is defined in the elided code above.
getExprDeps(deps, lhs, info)(value)
// Only memories with zero read latency are handled by this case.
case m: DefMemory if (m.readLatency == 0) => ...
case i: WDefInstance => ...
case _ => s.foreach(getStmtDeps(simplifiedModules, deps))
}
// Builds a dependency graph for each module and appends a CombLoopException
// to `errors` for every loop found in it.
private def run(state: CircuitState) = {
...
topoSortedModules.foreach {
...
case m: Module =>
// Seed the graph with one vertex per port, then walk the module's statements.
val portSet = m.ports.map(p => LogicNode(p.name)).toSet
val internalDeps = new MutableDiGraph[LogicNode] with MutableEdgeData[LogicNode, Info]
portSet.foreach(internalDeps.addVertex(_))
m.foreach(getStmtDeps(simplifiedModuleGraphs, internalDeps))
moduleGraphs(m.name) = internalDeps
// Also keep a version of the graph simplified with respect to the port set.
simplifiedModuleGraphs(m.name) = moduleGraphs(m.name).simplify(portSet)
// Find combinational nodes with self-edges; this is *NOT* the same as length-1 SCCs!
for (unitLoopNode <- internalDeps.getVertices.filter(v => internalDeps.getEdges(v).contains(v))) {
errors.append(new CombLoopException(m.info, m.name, Seq(unitLoopNode.name)))
}
// Every strongly connected component with more than one vertex is reported as a loop.
for (scc <- internalDeps.findSCCs.filter(_.length > 1)) {
val sccSubgraph = internalDeps.subgraph(scc.toSet)
val cycle = findCycleInSCC(sccSubgraph)
// Sanity check: consecutive vertices of the cycle must be joined by an edge in the graph.
(cycle.zip(cycle.tail)).foreach({ case (a, b) => require(internalDeps.getEdges(a).contains(b)) })
// Reverse to make sure LHS comes after RHS, print repeated vertex at start for legibility
val intuitiveCycle = cycle.reverse
val repeatedInitial = prettyPrintAbsoluteRef(Seq(m.name), intuitiveCycle.head)
val expandedCycle = expandInstancePaths(m.name, moduleGraphs, moduleDeps, Seq(m.name), intuitiveCycle)
errors.append(new CombLoopException(m.info, m.name, repeatedInitial +: expandedCycle))
}
case m => throwInternalError(s"Module ${m.name} has unrecognized type")
}
...
}
}
We are traversing the statements to build a graph of the nodes.
We are traversing the graph once again to check for comb loops.
// Abbreviated listing of IR node declarations; constructor parameters are elided as "(...)".
case class Mux(...)
case class UIntLiteral(...)
case class SIntLiteral(...)
case class DefWire(...)
case class DefRegister(...)
case class DefInstance(...)
case class DefMemory(...)
abstract class PrimOp extends FirrtlNode
// The same abbreviated listing, extended with four additional node types at the end
// (OH, PriorityMux, BoolLiteral, DecoupledInterface).
case class Mux(...)
case class UIntLiteral(...)
case class SIntLiteral(...)
case class DefWire(...)
case class DefRegister(...)
case class DefInstance(...)
case class DefMemory(...)
abstract class PrimOp extends FirrtlNode
case class OH(...)
case class PriorityMux(...)
case class BoolLiteral(...)
case class DecoupledInterface(...)
"Spec-first" and "design-first" methodologies need to meet in the middle