//
//				smac18x24x20x2.v - Pipelined Multiplier-Accumulator
//
//					(C) Copyright 2004-2009 John B. Stephensen
//
// This Verilog source file and all its derivatives are licensed only for
// personal non-profit educational use in the Amateur Radio Service and
// the license is not transferrable. The information is provided as-is for
// experimental purposes and the author does not warranty its freedom
// from defects or its suitability for any specific application.
//
// This module is designed to process 2 channels of data serially and accumulate the results
// independently. It has a multiplexed 18-bit data input (dixy) and a 24-bit coefficient
// input (cixy) with a 20-bit multiplexed output and consists of an 18 x 24 bit multiplier,
// a 42-bit accumulator and a rounder. The accumulator consists of 3 pipelined
// 14-bit accumulators and has 1 extra bit for overflow during summation. The upper bit
// is discarded and the next 21 bits of the accumulator are rounded to 20 bits on the
// output. IV indicates a valid input and enables accumulation. IFV marks the final valid
// input and a delayed version writes the adder output into the 2 rounder sections and zeros
// the accumulator. This module has a 6-stage pipeline as follows.
//
// Stage	Operation
//		1	data and coefficient stored in A and B registers in multiplier
//		2	multiply data by coefficient and place product in P register
//		3	accumulate lower bits and save middle and upper bits
//		4	add lower partial products, accumulate middle bits and save upper bits
//		5	add upper partial products, accumulate upper bits and round lower bits
//		6	round upper bits and save lower rounded bits
// 
// 94 slices and 2 multipliers are used. The maximum clock rate is 193 MHz.
//
// Normal Warnings:
//		Signals <ppl<2:0>>, <ppl<35>> and <ppu<7:0>> are assigned but never used.
//
// History:
//		M001	1-17-07	modified for Spartan-3E by removing external data latch between RAMB 
//							and multiplier (now uses internal multiplier block input latches).
//		M002	3-9-08	Renamed and removed RAM to allow re-use in shift register based designs
//							and modified multiplier to use cascade output and input.
//		M003	3-27-08	Removed ISE 9.x IP adders
//		M004	4-26-08	Added overflow
//		M005	2-3-08	changed reset logic to keep OV low
//		M006	2-11-09	stall pipeline if iv = 0 to prevent FIFO overflow when interpolation > 15
//		M007	2-13-09	remove registers following multipliers to cut pipeline delay (229->213 MHz)
//		M008	6-6-09	add saturation logic to rounder to eliminate problems caused by clipping input
//		M009	4-26-10	saturate at +/-262143
//
module smac18x24x20x2(cixy, dixy, iv, ifv, oe, doxy, ov, ovf, clk, rst);
	input [23:0] cixy;		// 24-bit coefficient input (stage 1)
	input [17:0] dixy;		// 18-bit data input (stage 1)
	input iv;				// input valid and y input (low forces the multiplier input registers to zero)
	input ifv;				// final data and coefficient input
	input oe;				// enable output (stall if 0) - clock enable for every pipeline stage
	output [19:0] doxy;		// 20-bit accumulated output
	output ov;				// output valid and Y output
	output ovf;				// overflow flag
	input clk;				// master clock
	input rst;				// reset adder/accumulator
// data paths
wire [35:0] ppl,ppu;	// multiplier lower and upper partial products
// data path
reg [6:1] z;			// control signal delay
wire [31:0] ppl1;		// partial products
wire [27:0] ppu1;
reg [31:28] ppl2;		// delayed partial products
reg [27:14] ppu2;
wire [13:0] p2,p3;	// sum of partial products
wire s;					// PPL[31] = sign bit
wire [13:0] s1;		// accumulator outputs
wire [13:0] s2;
wire [13:0] s3;
reg [13:0] t1;			// accumulator buffers
reg [13:0] t2;
reg [13:0] t3;
reg [13:6] s2d;		// delayed outputs from middle accumulator (13+8=21)
wire c0,c1,c2;			// intermediate accumulator carry/borrow bits
wire sign,ovflo;		// sign and overflow flags from accumulator
wire [19:0] sum;		// sum of products from accumulator truncated to 20 bits
reg ovff;				// overflow flag
// pipeline for the final-input flag: ifv enters at z[1] and moves up one position on
// every enabled clock. z[3], z[4] and z[5] zero the lower, middle and upper accumulator
// buffers in turn, z[5] also samples the overflow flag and z[6] is the output valid flag.
// Note that z has no reset term; it is not affected by rst.
always @ (posedge clk)
begin
	if (oe) z <= {z[5:1],ifv};	// Zero accumulator delay - also output valid flag
end
// multiply 18-bit data by 24-bit coefficient - 2 clocks
// lower 14 coefficient bits (18 signed X 14 unsigned)
// The coefficient bits are padded with a leading zero (so the A operand is never negative)
// and three trailing zeros; the trailing zeros are removed again when ppl1 is extracted.
wire [17:0] cascade;	// path between multiplier B ports
MULT18X18SIO #(
	.B_INPUT("DIRECT"),
	.AREG(1),
	.BREG(1),
	.PREG(1)
	) mul0 (
	.A({1'b0,cixy[13:0],3'b000}),
	.B(dixy),
	.BCOUT(cascade),
	.P(ppl),
	.CLK(clk),
	.CEA(oe),
	.CEB(oe),
	.CEP(oe),
	.RSTA(~iv),
	.RSTB(~iv),	// invalid input is zero
	.RSTP(rst)
	);
// upper 10 coefficient bits (18 signed X 10 signed)
// The data operand arrives over the cascade path from mul0, so no B register is used here.
MULT18X18SIO #(
	.B_INPUT("CASCADE"),
	.AREG(1),
	.BREG(0),
	.PREG(1)
	) mul1 (
	.A({cixy[23:14],8'h00}),
	.BCIN(cascade),
	.P(ppu),
	.CLK(clk),
	.CEA(oe),
	.CEB(1'b1),
	.CEP(oe),
	.RSTA(~iv),
	.RSTB(~iv),	// invalid input is zero
	.RSTP(rst)
	);
// drop the padding bits added on the A operands (3 for mul0, 8 for mul1)
assign ppl1 = ppl[34:3];								// partial product delay 1
assign ppu1 = ppu[35:8];
// accumulator data pipeline delays - extra register on products cuts propagation delay 0.5 ns
// Each register is cleared by rst and otherwise only updates while oe is high.
// The accumulator buffers are loaded with zero instead of the running sum when the
// matching tap of the final-input pipeline (z[3], z[4], z[5]) is set.
always @ (posedge clk)
begin
	if (rst) ppl2 <= 0;
	else if (oe) ppl2 <= ppl1[31:28];								// partial product delay 2
	if (rst) ppu2 <= 0;
	else if (oe) ppu2 <= ppu1[27:14];
	if (rst) t3 <= 0;
	else if (oe) t3 <= (z[5]) ? 14'b00000000000000 : s3;	// MSW accumulator
	if (rst) t2 <= 0;
	else if (oe) t2 <= (z[4]) ? 14'b00000000000000 : s2;
	if (rst) t1 <= 0;
	else if (oe) t1 <= (z[3]) ? 14'b00000000000000 : s1;	// LSW accumulator
end
// NOTE(review): add14s and inc20s are defined outside this file; the comments below assume
// they are registered 14-bit adder / 20-bit incrementer blocks whose Q output holds while
// CE is low and clears on SCLR - confirm against their definitions.
// add lower partial products
add14s add2 (
	.A(ppu1[13:0]),
	.B(ppl1[27:14]),
	.Q(p2),
	.CLK(clk),
	.CI(1'b0),
	.CO(c0),
	.CE(oe),
	.SCLR(rst)
	);
// add upper partial products
// the top 4 bits of the lower product are sign-extended to 14 bits before the addition
assign s = ppl2[31];
add14s add3 (
	.A(ppu2[27:14]),
	.B({s,s,s,s,s,s,s,s,s,s,ppl2[31:28]}),
	.Q(p3),
	.CLK(clk),
	.CI(c0),	// propagate carry
	.CO(),
	.CE(oe),
	.SCLR(rst)
	);
// accumulate products in three 14 bit steps to reduce carry propagation effects
add14s acc1 (
	.A(t1),
	.B(ppl1[13:0]),
	.Q(s1),
	.CLK(clk),
	.CI(1'b0),
	.CO(c1),
	.CE(oe),
	.SCLR(rst)
	);
// middle 14 bits added next
add14s acc2 (
	.A(t2),
	.B(p2),
	.Q(s2),
	.CLK(clk),
	.CI(c1),		// propagate carry
	.CO(c2),
	.CE(oe),
	.SCLR(rst)
	);
// upper 14 bits added last
add14s acc3 (
	.A(t3),
	.B(p3),
	.Q(s3),
	.CLK(clk),
	.CI(c2),		// propagate carry
	.CO(),
	.CE(oe),
	.SCLR(rst)
	);
// delay middle bits of accumulated sum to match upper bits (13+8=21)
// This register is clock-enabled by oe like every other pipeline register in the module.
// Without the enable it would keep loading s2 during a stall while s3 is held, so the
// rounder would be fed middle bits that no longer belong to the same sample as s3.
always @ (posedge clk)
begin
	if (oe) s2d <= s2[13:6];
end
// truncate to 20 bits, detect positive or negative overflow of lower 20 bits and saturate output
// The saturated constant ends in a zero bit, so adding the rounding carry to it cannot
// wrap around (assuming inc20s computes A + CI).
assign sign = s3[13];	// most significant or sign bit
assign ovflo = (s3[13] ^ s3[12]);	// overflow: top two accumulator bits disagree
assign sum = ovflo ?
	{sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,1'b0}
	: {s3[12:0],s2d[13:7]};	// sum of products
// rounder: the bit just below the 20-bit result is used as the carry input
inc20s rnd1 (
	.A(sum),
	.CI(s2d[6]),	// round up
	.CO(),
	.Q(doxy),
	.CLK(clk),
	.CE(oe),
	.SCLR(rst)
	);
// output is flagged valid only while the pipeline is enabled
assign ov = z[6] & oe;
// check for overflow - sampled on the enabled clock where z[5] is set and held until
// the next such clock or until rst
always @ (posedge clk)
begin
	if (rst) ovff <= 0;
	else if (oe & z[5]) ovff <= ovflo;
end
assign ovf = ovff;
endmodule
