//
//			sfir512x18x24x2.v - Finite Impulse Response Filter using RAM Buffer 
//
//					(C) Copyright 2004-2006 John B. Stephensen
//
// This Verilog source file and all its derivatives are licensed only for
// personal non-profit educational use in the Amateur Radio Service and
// the license is not transferrable. The information is provided as-is for
// experimental purposes and the author does not warranty its freedom
// from defects or its suitability for any specific application.
//
// This is a MAC engine for I/Q FIR filters that processes 2 samples per
// instruction. SCLK is the coefficient clock and DCLK is the data clock.
// Samples are stored in a 1Kx18 RAM and instructions in a 512x36 ROM.
// This module executes 36-bit instructions formatted as follows:
//
//   35   34   33   32          9   8     0
// +----+----+----+---------------+---------+
// |    | E  | W  |  COEFFICIENT  |  INDEX  |
// +----+----+----+---------------+---------+
//
//	INDEX - 9-bit offset from base address (backwards in time)
//	COEFFICIENT - 24-bit multiplier
//	W - write out accumulated result and then zero accumulator
//	E - end of filter (place one instruction before end)
//
// The base address is set when the first instruction is executed by loading
// the current sample address minus any offset. Actual addresses in data storage
// RAM are unknown. The offset may be used to adjust the sample point to the
// optimal point in the bit cell for digital demodulation. See component files
// for more information.
//
// I/O is serial X and Y samples represented as twos complement 18-bit signed
// fractions. DIXY is the input and DOXY is the output. X samples are present when
// SCLK is 1 and Y samples are present when SCLK is 0. IV and OV indicate that the
// data on DIXY and DOXY is valid. One filter tap is processed per SCLK and one
// data sample is processed per DCLK to minimize the number of multipliers required.
// The DEC input determines the number of samples to be processed in each convolution.
// OFS is the offset, in samples, from the most recent sample in the buffer to the
// first sample used in the convolution. The sample index goes backwards in time.
//
// The coefficient ROM is loaded from PDATA in 9-bit words and read in 18-bit words.
// PRST resets the coefficient address to zero and PWR increments the address on
// each write. 36-bit instructions are assembled over two accesses. This normally
// does not decrease execution speed as each instruction processes two samples.
// Time delays are set to match smac18x24x2. This adds 4 clocks to the 6 clock MAC
// time delay as follows:
//
//	Clock	Action 1										Action 2
//		1	Increment counter to even address
//		2	Read program RAM							Increment counter to odd address	
//		3	Store LSW instruction					Read Program RAM
//		4	Subtract index from base address		Store MSW instruction
//		5	multiply data by coefficient (A & B regs)
//		6	multiply data by coefficient (P reg)
//		7	accumulate lower bits and save middle and upper bits
//		8	add lower partial products, accumulate middle bits and save upper bits
//		9	add upper partial products, accumulate upper bits and round lower bits
//	  10	round upper bits and save lower rounded bits
//
// The instruction counter will stop at 512 and issue NOP (all zero) instructions.
// The instruction counter will restart at zero when sufficient data is available. 
//
// 179 slices, 2 block RAMs and 2 multipliers are used with 3x14 MAC. Maximum
// clock speeds are 128 MHz for SCLK, 197 MHz for DCLK and 252 MHz for PCLK.
//
// Normal Warnings:
//		Signals <ppl<2:0>>, <ppl<35>> and <ppu<7:0>> are assigned but never used.
//		Signals <size<3:0>> and <size1<7:0>> are assigned but never used.
//
// History:
//		M001	8-27-06	speed up reset by asserting SI during reset
//		M002	1-17-07	convert to Spartan-3E (new uctr IP)
//		M003	5-13-08	remove two coefficient option and use common 42-bit MAC
//		M004	2-11-09	add pipeline stall logic and align X/Y samples to SCLK
//		M005	2-13-09	use slower MAC unit (226 -> 213 MHz, 192 -> 171 slices)
//		M006	5-14-09	allow buffering of 256 samples in RAM
//		M007	6-6-09	add saturation logic to rounder in smac18x24x2
//		M008	4-26-10	change to SMAC18X24 changes max. DCLK (200->197 MHz)
//
module sfir512x18x24x2 (rfd, dixy, iv, oe, doxy, ov, ovf,
								sclk, dclk, mrst,
								dec, pdata, pclk, pwr, prst);
	// ------------------------------------------------------------------
	// Ports
	// ------------------------------------------------------------------
	// sample stream
	output        rfd;		// ready for data (deasserted while FULL is set)
	input  [17:0] dixy;		// data input (serial X and Y samples)
	input         iv;		// input valid (X when sclk = 1 and Y when sclk = 0)
	input         oe;		// enable output (stalls pipeline if 0)
	output [17:0] doxy;		// data output (serial X and Y samples)
	output        ov;		// output valid
	output        ovf;		// overflow
	// clocks and reset
	input         sclk;		// 1x master clock
	input         dclk;		// 2x master clock
	input         mrst;		// master reset
	// configuration
	input  [5:0]  dec;		// decimation modulus minus one
	input  [8:0]  pdata;	// program RAM input (in 9-bit 1/4-instruction words)
	input         pclk;		// program RAM input clock
	input         pwr;		// program RAM write enable (also increments address)
	input         prst;		// reset configuration parameter counters
	//	output [23:0] coef;

	// ------------------------------------------------------------------
	// Internal signals (numeric suffix = pipeline stage of the signal)
	// ------------------------------------------------------------------
	reg  [4:1]  r;			// shift register delaying MRST for the MAC reset
	reg  [10:0] pctr;		// program RAM write address (9-bit words)
	reg  [8:0]  sctr;		// sample-pair write counter
	reg  [8:0]  base1;		// base address of the current convolution
	reg  [8:0]  base2;		// BASE1 captured at START, used for addressing
	wire [8:0]  size;		// outstanding samples:     sctr - base1
	wire [8:0]  size1;		// outstanding samples - 1: sctr - base1 - 1
	reg         full;		// 16 or more samples outstanding
	wire        ready;		// output enabled and at least one sample outstanding
	wire        start;		// begin a new convolution
	reg  [9:0]  cctr0;		// instruction counter; bit 9 set = halted at 512
	reg         inh;		// forces instruction RAM output to zero (NOP)
	wire [17:0] rom1;		// instruction RAM output (one 18-bit half-instruction)
	reg  [8:0]  indx2;		// INDEX field: negative offset from base
	reg  [17:9] temp2;		// lower 9 bits of the coefficient, held for assembly
	reg  [8:0]  daddr3;		// data RAM read address
	reg  [23:0] coef3;		// assembled 24-bit coefficient
	reg  [23:0] coef4;		// coefficient delayed to match data RAM access
	wire [17:0] data4;		// sample read from the buffer RAM
	reg         w3, w4;		// W (write result) flag and its delay
	reg         e3, e4;		// E (end of filter) flag and its delay
	reg         e6;			// E flag held while waiting for READY

	// ------------------------------------------------------------------
	// Flow control (combinational)
	// ------------------------------------------------------------------
	// Both differences wrap modulo 512; SIZE1 uses the ones complement of
	// BASE1 so that it is one less than SIZE.
	assign size  = sctr - base1;
	assign size1 = sctr + (~base1);
	// proceed only when output is enabled and SIZE1 is non-negative
	assign ready = oe & ~size1[8];
	// start when idle, or when the end flag is present or pending
	assign start = ready & (e4 | e6 | cctr0[9]);
	// request data from the source until 16 or more entries are unprocessed
	assign rfd   = ~full;

	// ------------------------------------------------------------------
	// Program loading (PCLK domain)
	// ------------------------------------------------------------------
	// write address: cleared by PRST, advanced by every write
	always @ (posedge pclk)
	begin
		if (prst)
			pctr <= 11'd0;
		else if (pwr)
			pctr <= pctr + 11'd1;
	end

	// ------------------------------------------------------------------
	// Reset delay, sample accounting and sequencing (rising SCLK)
	// ------------------------------------------------------------------
	// R[4] is the reset seen by the MAC; the delay allows coef and data
	// to be zeroed first
	always @ (posedge sclk)
	begin
		r <= {r[3:1], mrst};
	end

	// buffer-full flag: any of the upper five bits of SIZE set
	always @ (posedge sclk)
	begin
		full <= |size[8:4];
	end

	// every valid sample pair advances the write address
	always @ (posedge sclk)
	begin
		if (mrst)
			sctr <= 9'd0;
		else if (iv)
			sctr <= sctr + 9'd1;
	end

	// end of filter advances the base by the decimation factor
	// (DEC holds the factor minus one)
	always @ (posedge sclk)
	begin
		if (mrst)
			base1 <= 9'd0;
		else if (e3)
			base1 <= base1 + {3'b000, dec} + 9'd1;
	end

	// base used for address generation is captured when a convolution starts
	always @ (posedge sclk)
	begin
		if (mrst)
			base2 <= 9'd0;
		else if (start)
			base2 <= base1;
	end

	// instruction counter: idle (512) after reset, zero on START,
	// otherwise counts while enabled and stops at 512
	always @ (posedge sclk)
	begin
		if (mrst)
			cctr0 <= 10'd512;
		else if (start)
			cctr0 <= 10'd0;
		else if (oe & ~cctr0[9])
			cctr0 <= cctr0 + 10'd1;
	end

	// NOP insertion: set by reset, otherwise follows the stall condition
	always @ (posedge sclk)
	begin
		if (mrst)
			inh <= 1'b1;
		else
			inh <= ~oe;
	end

	// ------------------------------------------------------------------
	// Instruction assembly
	// ------------------------------------------------------------------
	// cycle 2 (rising SCLK): first half-instruction - index plus the
	// lower 9 coefficient bits, which are held until the second half arrives
	always @ (posedge sclk)
	begin
		if (mrst)
		begin
			indx2 <= 9'b000000000;
			temp2 <= 9'b000000000;
		end
		else
		begin
			indx2 <= rom1[8:0];
			temp2 <= rom1[17:9];
		end
	end

	// cycle 3 (falling SCLK): second half-instruction completes the
	// coefficient and supplies the W and E flags; the data address is
	// formed in the same step (BASE is at least 1 less than SCTR)
	always @ (negedge sclk)
	begin
		daddr3 <= base2 - indx2;
		if (mrst)
		begin
			coef3 <= 24'h000000;
			w3    <= 1'b0;
			e3    <= 1'b0;
		end
		else
		begin
			coef3 <= {rom1[14:0], temp2};	// full coefficient
			w3    <= rom1[15];				// write sum of products to output
			e3    <= rom1[16];				// end of filter
		end
	end

	// cycle 4 (rising SCLK): delay coefficient and flags to compensate
	// for the data RAM access
	always @ (posedge sclk)
	begin
		coef4 <= coef3;
		w4    <= w3;
		e4    <= e3;
	end

	// hold the end flag until READY allows the next convolution to start
	always @ (posedge sclk)
	begin
		if (mrst)
			e6 <= 1'b0;
		else
			e6 <= (e4 | e6) & ~ready;
	end

	// ------------------------------------------------------------------
	// Coefficient RAM - Port A = 9-bit write, Port B = 18-bit read
	// Port B output is reset (NOP) while the program is halted or stalled
	// ------------------------------------------------------------------
	RAMB16_S9_S18 cram (
		// port A: program load
		.CLKA(pclk),
		.ENA(1'b1),
		.WEA(pwr),
		.SSRA(1'b0),
		.ADDRA(pctr),						// program input address
		.DIA(pdata[7:0]),					// program data input
		.DIPA(pdata[8]),
		// port B: instruction fetch, ~SCLK selects the half-instruction
		.CLKB(dclk),
		.ENB(1'b1),
		.WEB(1'b0),
		.SSRB(inh|cctr0[9]),				// M001
		.ADDRB({cctr0[8:0],~sclk}),			// coefficient output address
		.DIB(16'hFFFF),
		.DIPB(2'b11),
		.DOB({rom1[16:9],rom1[7:0]}),		// program data output
		.DOPB({rom1[17],rom1[8]})
		);

	// ------------------------------------------------------------------
	// 1Kx18 buffer RAM for data samples - write port A, read port B
	// ------------------------------------------------------------------
	RAMB16_S18_S18 dram (
		// port A: sample write
		.CLKA(dclk),						// common clocks
		.ENA(1'b1),
		.WEA(iv),							// write data
		.SSRA(1'b0),
		.ADDRA({sctr,~sclk}),				// data input address
		.DIA(dixy[15:0]),					// data input
		.DIPA(dixy[17:16]),
		// port B: sample read
		.CLKB(dclk),
		.ENB(1'b1),
		.WEB(1'b0),
		.SSRB(1'b0),
		.ADDRB({daddr3,sclk}),				// data output address
		.DIB(16'hFFFF),
		.DIPB(2'b11),
		.DOB(data4[15:0]),					// data output
		.DOPB(data4[17:16])
		);

	// ------------------------------------------------------------------
	// Serial multiplier-accumulator for rectangular (I/Q) data
	// ------------------------------------------------------------------
	smac18x24x2 mac (
		.clk(dclk),
		.rst(r[4]),		// delayed reset
		.cixy(coef4),
		.dixy(data4),
		.iv(1'b1),
		.ifv(w4),
		.oe(1'b1),
		.doxy(doxy),
		.ov(ov),
		.ovf(ovf)
		);
endmodule
