//
//					 fir128x16x18x4.v - FIR Filter Peripheral
//
//					(C) Copyright 2008-2009 John B. Stephensen
//
// This Verilog source file and all its derivatives are licensed only for
// personal non-profit educational use in the Amateur Radio Service and
// the license is not transferrable. The information is provided as-is for
// experimental purposes and the author does not warranty its freedom
// from defects or its suitability for any specific application.
//
// This module processes 2 channels of information and implements 4 filters with up
// to 127 taps each. The filters' impulse responses must be symmetrical. Writing to
// output port 0 loads samples into data memory. Writing the starting address of the
// filter coefficient set to output port 5 starts the filter. The results can be read
// on input port 0. When interpolating filters are implemented, multiple phases are
// present and each can be read individually on input ports 0-6.
//
//	Output Ports:
//		 15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		|                             Data                              | 0
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		|                        LSW Coefficient                        | 1
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		|                                                       |  MSB  | 2
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		|                   |    INT    |             TAPS              | 3
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		|                                       |  LEN  |     BASE      | 4
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		|                           |     Coefficient Base Address      | 5
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		|                     Reset Address Counter                     | 6
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//
//		INT: interpolation factor
//		TAPS: set filter length
//		BASE: base address of data buffer
//		LEN: length of data buffer (0=32, 1=64, 2=128, 3=256)
//
//	Input Ports:
//		 15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		|                             Data                              | 0-6
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		|OVF|                                                       |RDY| 7
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//
//		RDY: output ready
//		OVF: accumulator overflow
//
// 107 MHz maximum clock frequency with 1 RAM, 1 multiplier and 119 slices used.
// Maximum propagation delay is in 36-bit data accumulator. Calculation of output
// values requires one clock cycle per tap plus an additional 4 clock cycles for
// pipeline delays.
//
//   Phs	Description
//		0	increment CCTR
//		1	read RAM
//		2	multiply
//		3	accumulate
//
// History:
//		3-3-09	latch output for 2-cycle input inst.
//		3-18-09	restart accumulator on first coefficient(s) to eliminate CPU data read delay
//		6-15-09	add saturation logic
//		6-30-09	switch data address source when counter zero instead of on writes to improve timing
//
// FIR filter peripheral. Port-mapped configuration registers drive a dual-port
// block RAM (port A = coefficients, port B = sample data), one 18x18 multiplier
// and an accumulator held in the srl16x36e instance. The rounded or saturated
// accumulator value, or a status word, is registered onto DOUT.
//
// Ports:
//   ioaddr - port address; selects the register on writes and the accumulator
//            entry (or the status word when 7) on reads
//   iocs   - address valid; qualifies the write-port decodes s0..s6
//   din    - input data for configuration, coefficient and sample writes
//   iowr   - data valid; qualifies every write strobe
//   dout   - registered output data
//   clk    - master clock (all registers are clocked on its rising edge)
//   rst    - master reset (synchronously clears the configuration registers
//            and the coefficient counter)
module fir128x16x18x4 (
    input [2:0] ioaddr,	// port address
    input iocs,			// address valid
    input [15:0] din,	// input data
    input iowr,			// data valid
    output [15:0] dout,	// output data
    input clk,				// master clock
    input rst				// master reset
    );
// internal signals
// s6 is declared explicitly here; it was previously only an implicit net, which
// fails to elaborate under `default_nettype none.
wire s0,s1,s2,s3,s4,s5,s6;	// port select signals
wire write,start,rdy;	// data valid, start convolution & convolution complete
reg rstacc;					// restart accumulator
wire rstcia;				// reset coefficient input address counter
wire accen;					// enable accumulator writes
reg [2:1] v;				// accumulator valid delay
wire load;					// load coefficient
wire cz,czd;				// coefficient counter is zero and delayed version
wire ovf,sign;				// accumulator overflow and sign bits
reg [1:0] msc;				// most significant coefficient bits
reg [8:5] base;			// buffer base address / 32
reg [1:0] length;			// buffer length
reg [2:0] intrp,dint;	// interpolation factor and interpolation counter
wire dtc;					// interpolation terminal count
reg [8:0] actr;			// address counter for loading coefficients
reg [7:0] taps;			// number of taps in filter
wire [7:0] dctr;			// data counter (address for next sample)
reg [7:0] dindx;			// data index counter (offset from last sample for read)
wire [7:0] dofs;			// offset from buffer base to sample to be read/written (dindx/dctr)
reg [7:0] cctr;			// coefficient counter (127 max.)
reg [8:0] caddr;			// coefficient address (512 possible locations)
wire [8:0] aaddr,baddr;	// port A and B addresses
wire [17:0] data,coef;	// RAM outputs and multiplier inputs
wire [35:0] prod;			// multiplier output
wire [2:0] saddr;			// shift register address (accumulators)
wire [35:0] accin,accout;	// data accumulator
wire [15:0] rnd;			// rounded result
reg [15:0] omux;			// output multiplexer
// decode I/O port addresses (each select is qualified by IOCS)
assign s0 = iocs & (ioaddr == 0);
assign s1 = iocs & (ioaddr == 1);
assign s2 = iocs & (ioaddr == 2);
assign s3 = iocs & (ioaddr == 3);
assign s4 = iocs & (ioaddr == 4);
assign s5 = iocs & (ioaddr == 5);
assign s6 = iocs & (ioaddr == 6);
// configuration registers
// each register is cleared by RST and otherwise loaded from DIN when its port is written
always @ (posedge clk)
begin
	if (rst) taps <= 0;
	else if (s3 & iowr) taps <= din[7:0];		// filter taps
	if (rst) intrp <= 0;
	else if (s3 & iowr) intrp <= din[10:8];	// interpolation factor - 1
	if (rst) base <= 0;
	else if (s4 & iowr) base <= din[3:0];		// data buffer base address
	if (rst) length <= 0;
	else if (s4 & iowr) length <= din[5:4];	// data buffer length
	if (rst|rstcia) actr <= 0;						// port 6 write also clears the load address
	else if (load) actr <= actr + 1;				// coef. address counter
	if (rst) msc <= 0;
	else if (s2 & iowr) msc <= din[1:0];		// MS 2 bits of coefficient
end
assign write = s0 & iowr;	// write data to port 0
assign load = s1 & iowr;	// load coefficients via port 1
assign start = s5 & iowr;	// start on write to port 5
assign rstcia = s6 & iowr;	// reset address counter on write to port 6
// data counter increments when new sample arrives and points at new address
// 2-16 counters are implemented in distributed RAM
RAM16X8SP ctr (
	.A(base[8:5]),	// one counter for each possible base address
	.D(dctr + 8'h01),
	.O(dctr),
	.WE(write),	// increment when writing data to buffer
	.WCLK(clk)
	);
// multiplexer selects number of bits to use from base address and counter
// counter is DCTR if not filtering and DINDX (DCTR - tap number - 1) when filtering
assign dofs = cz ? dctr : dindx;
MUX4X9 damux (
	.D0({base[8:5],dofs[4:0]}),
	.D1({base[8:6],dofs[5:0]}),
	.D2({base[8:7],dofs[6:0]}),
	.D3({base[8],dofs[7:0]}),
	.S(length),
	.Y(baddr)
	);
// coefficient counter (CCTR) is loaded with TAPS on start, decrements and stops at zero
// coefficient address counter (CADDR) increments while CCTR is non-zero
// data interpolation counter (DINT) increments until the number of interpolation phases is reached
// data index counter (DINDX) decrements when coefficients for all phases have been processed
// data address is new address - data offset
// note: only CCTR is cleared by RST; CADDR, DINT and DINDX are initialised by START
always @ (posedge clk)
begin
	if (rst) cctr <= 0;
	else if (start) cctr <= taps;		// number of taps
	else if (~cz) cctr <= cctr - 1;
	if (start) caddr <= din[8:0];		// coefficient address
	else if (~cz) caddr <= caddr + 1;
	if (start|dtc) dint <= 0;			// data interpolation phase counter
	else if (~cz) dint <= dint + 1;
	if (start) dindx <= dctr - 1;	// data address decrements when all phases generated
	else if (dtc & ~cz) dindx <= dindx - 1;
end
assign cz = ~|cctr;				// stop when 0 reached
assign dtc = (dint == intrp);	// interpolation done when interpolation factor reached
// coefficient and data RAM - 1 clock delay
// load enables writing of coefficient sets
// zero output when counting inhibited
assign aaddr = load ? actr : caddr;	// port A addressed by load counter while loading
RAMB16_S18_S18 dram (
	.ADDRA({1'b0,aaddr}),		// coefficient address
	.ADDRB({1'b1,baddr}),		// data address
	.DIA(din),						// 18-bit coefficient input
	.DIPA(msc),
	.DIB(din),						// 16-bit data input
	.DIPB({din[15],din[15]}),	// sign-extend sample to 18 bits
	.DOA(coef[15:0]),				// 18-bit coefficient output
	.DOPA(coef[17:16]),
	.DOB(data[15:0]),				// 18-bit data output (3 sign bits)
	.DOPB(data[17:16]),
	.CLKA(clk),						// common clocks
	.CLKB(clk),
	.WEA(load),						// write coefficient
	.WEB(write),					// write data
	.ENA(1'b1),						// always enabled
	.ENB(1'b1),
	.SSRA(rst|cz),					// zero output when no coefficient
	.SSRB(rst|cz)
	);
// multiply 18-bit data by 18-bit coefficient -  1 clock delay
MULT18X18SIO #(
	.AREG(0),
	.BREG(0),
	.PREG(1)
	) mul (
	.A(data),
	.B(coef),
	.P(prod),
	.CEA(1'b1),	// always enabled
	.CEB(1'b1),
	.CEP(1'b1),
	.CLK(clk),
	.RSTA(rst),	// reset
	.RSTB(rst),
	.RSTP(rst)
	);
// restart accumulator on first coefficient then accumulate results
// switch shift register address to external counter when done accumulating
assign accen = ~v[2];						// modify contents only when convolving
assign saddr = accen ? intrp : ioaddr;	// INTRP while convolving, port address otherwise
assign accin = prod + (rstacc ? 36'h000000000 : accout);
srl16x36e sr (
	.a({1'b0,saddr}),
	.d(accin),
	.y(accout),
	.ce(accen),
	.clk(clk)
	);
// use variable length shift register to time accumulator load
// start 2 clocks after counter is first not zero and
// end 2 clocks plus interpolation factor later
srl16x1e dly (
	.a({1'b0,intrp}),	// delay by interpolation factor
	.d(v[1]),			// but start after 2 clocks
	.y(czd),
	.ce(1'b1),
	.clk(clk)
	);
// overflow if accumulated value extends into upper 3 bits
// (any of bits 34..32 differing from the sign bit 35)
assign sign = accout[35];
assign ovf = (sign ^ accout[34]) | (sign ^ accout[33]) | (sign ^ accout[32]);
// delay output valid to account for RAM, multiplier and accumulator delays
always @ (posedge clk)
begin
	v <= {v[1],cz};			// v[1] = CZ delayed 1 clock, v[2] = CZ delayed 2 clocks
	rstacc <= ~v[1] & czd;	// restart accumulation
end
// round result towards zero or saturate if overflow
// saturation value is {sign, 15 copies of ~sign}; SIGN is then added as the LSB
assign rnd = (ovf ?
	{sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign,~sign}
	: accout[32:17]) 
	+ {15'b000000000000000,sign};
// connect outputs
assign rdy = v[2];	// output valid after RAM and multiplier delays
// port 7 returns status (OVF in bit 15, RDY in bit 0); all other ports return the result
always @ (posedge clk) omux <= (ioaddr == 7) ? {ovf,14'b00000000000000,rdy} : rnd;
assign dout = omux;
endmodule
