//
//					CIC4H56.v - 4-stage Cascaded Integrator Comb Filter
//
//					  (C) Copyright 2007-2010 John B. Stephensen
//
// This Verilog source file and all its derivatives are licensed only for
// personal non-profit educational use in the Amateur Radio Service and
// the license is not transferrable. The information is provided as-is for
// experimental purposes and the author does not warranty its freedom
// from defects or its suitability for any specific application.
//
// This module implements a switchable CIC interpolator or CIC decimator. Interpolation
// and decimation by 8-720 is possible. RDI and TDO are the input from the mixer and
// output to the mixer for receiving and transmitting, respectively. Data is sampled
// synchronous to SCLK and multiplexed at the DCLK rate. RDO and TDI are decimated data
// for receiving and input data for interpolation for transmitting, respectively. TIE
// requests new data on TDI. One X or Y sample is present on each DCLK cycle for TDO.
// SCLK high indicates X and SCLK low indicates Y.
//
// Four 56-bit integrators and a 4-stage 28-bit comb filter are used in the receive path.
// Filter gain is the decimation factor to the fourth power, so filter gain is 10K to 160G
// for decimation factors from 10-640. 38 extra bits are provided and full-scale output
// can be reached by adjusting the input gain to 256G/(filter gain). This results in gain
// settings from 25M to 1.6 for decimation by 10-640.
//
// A 4-stage 28-bit comb filter and four 56-bit integrators are used in the transmit path.
// Filter gain is the interpolation factor to the third power, so filter gain is 1000 to 256M
// for interpolation factors from 10-640. 28 extra bits are provided and full-scale output
// can be reached by adjusting the input gain to 256M/(filter gain). This results in gain
// settings from 256K to 1 for interpolation by 10-640. The multiplier gain must be less
// than 64 to allow for 4 bits of growth in the differentiator section.
//
// There are 2 configuration registers. One controls gain with 10 bits of integer gain and
// 6 bits of fractional gain. The other contains 4-bits of binary exponent and the 10-bit
// decimation/interpolation factor. The multiplier gain should be between 1 and 1024 for
// receiving and be between 1 and 64 for transmitting. The shift value is 0-15 for reception
// and 0-12 for transmission. Thus, an 18-bit input can be adjusted to use the lower 18 to
// 43 bits of the 56-bit accumulator input. Decimation factors from 10 to 640 may be used.
//
// Note that several components must be instantiated instead of inferred to prevent
// optimization that can slow operation and/or disrupt I/O timing. 
//
// 521 slices and 1 multiplier are used. 198 MHz maximum DCLK rate. 169 MHz maximum SCLK
// rate. 16 DCLK delay. SCLK is 1/2 DCLK frequency with positive-going edges aligned.
//
// Normal Warnings:
//		<prod<35:34>> and <prod<5:0>> are assigned but never used.
//
// History:
//		1-11-10	create from CIC4H (48-bit design)
//
module CIC4H56(
	input  [17:0] rdi,		// receiver data input - X and Y time-multiplexed (X while SCLK high)
	input  [17:0] tdix,		// transmitter data input - X sample (parallel with TDIY)
	input  [17:0] tdiy,		// transmitter data input - Y sample
	output        tie,		// request for new transmitter input
	output [17:0] rdo,		// receiver data output - serial X and Y
	output [17:0] tdo,		// transmitter data output - serial X and Y
	output        rov,		// receiver output valid
	input         sclk,		// sample clock
	input         dclk,		// data clock
	input         rst,		// master reset (sync. to SCLK)
	input         xmt,		// 0=receive (RDI->RDO), 1=transmit (TDI->TDO)
	input  [15:0] cin,		// configuration data (sync. to DCLK)
	input  [1:0]  cwr		// configuration write strobes (bit 0=gain, bit 1=dec/int)
	);

// ---------------------------------------------------------------------------
// Internal signals, grouped by pipeline section
// ---------------------------------------------------------------------------
// configuration
reg  [3:0]  g;			// gain exponent (selects shifter setting)
reg  [9:0]  n;			// decimation/interpolation factor
// gain multiplier
wire [17:0] mux0;		// multiplier input multiplexer output
wire [35:0] prod;		// multiplier output
// integrator input shifter
wire [27:0] mux1;		// integrator input multiplexer output
reg  [29:0] shift1;		// after shift by 0,1,2,3 bits
reg  [41:0] shift4;		// after shift by 0,4,8,12 bits
reg         s1,s4;		// sign bit carried along with shift1 / shift4
wire [27:0] acc0l;		// LS 28 bits from shifter
reg  [14:0] acc0h;		// MS 15 bits from shifter, delayed 1 clock
wire        s;			// sign of acc0h, replicated 13 times at the adder input
// integrators (each 56 bits wide, split into 28-bit low/high halves)
wire [27:0] acc1al,acc2al,acc3al,acc4al;	// adder outputs, low halves
wire [27:0] acc1ah,acc2ah,acc3ah,acc4ah;	// adder outputs, high halves
reg  [27:0] acc1bl,acc2bl,acc3bl,acc4bl;	// 2nd channel storage, low halves
reg  [27:0] acc1bh,acc2bh,acc3bh,acc4bh;	// 2nd channel storage, high halves
wire        c1,c2,c3,c4;					// carries from low to high halves
// comb filter
wire [27:0] dif0;										// comb filter input
reg  [27:0] dif1,dif2,dif3,dif4;						// subtractor outputs
reg  [27:0] dly1a,dly1b,dly2a,dly2b,dly3a,dly3b,dly4a,dly4b;	// delay storage
// sequencing
reg  [9:0]  i;			// sample counter
wire        tc;			// terminal count
reg         e1,e2,e3,e4,e5,e6;	// terminal count delayed by 1..6 DCLK cycles

// ---------------------------------------------------------------------------
// Configuration register: a write with cwr[1] set captures the gain exponent
// and the decimation/interpolation factor from CIN. (The gain mantissa is
// held inside the multiplier's B register, loaded by cwr[0].)
// ---------------------------------------------------------------------------
always @ (posedge dclk)
begin
	if (cwr[1])
	begin
		n <= cin[9:0];		// decimation/interpolation factor
		g <= cin[15:12];	// gain binary exponent
	end
end

// ---------------------------------------------------------------------------
// Multiplier input multiplexer
// Receive data arrives already X/Y multiplexed; the transmitter inputs are
// multiplexed here using SCLK as the select.
// ---------------------------------------------------------------------------
MUX4X18S mmux (
	.CLK(dclk),
	.CE(1'b1),
	.RST(1'b0),
	.S({xmt,sclk}),
	.D0(rdi),	// receive: already multiplexed with X/Y while SCLK high/low
	.D1(rdi),
	.D2(tdiy),	// transmit: data following negative-going edge of SCLK
	.D3(tdix),	// transmit: data following positive-going edge of SCLK
	.Q(mux0)
	);

// ---------------------------------------------------------------------------
// Gain control - 2 clock delay
// I/O synchronized to negative-going edges of SCLK
// ---------------------------------------------------------------------------
MULT18X18SIO mult (
	.CLK(dclk),
	.CEA(1'b1),
	.CEB(cwr[0]),	// write gain mantissa to B register
	.CEP(1'b1),
	.RSTA(rst),		// reset input A register
	.RSTB(1'b0),	// don't reset gain register
	.RSTP(rst),		// reset output register
	.A(mux0),		// multiplexed transmit or receive data
	.B({2'b00,cin}),
	.P(prod)
	);

// ---------------------------------------------------------------------------
// Integrator input multiplexer - 1 clock delay
// Selects the differentiator output (transmit) or the multiplier output
// (receive). On transmit the output is zeroed between samples, i.e. whenever
// e6 is low. Output is synchronized to positive-going edges of SCLK due to
// the even number of registers.
// Instantiated instead of inferred to prevent optimization.
// ---------------------------------------------------------------------------
MUX2X28S imux (
	.CLK(dclk),
	.CE(1'b1),
	.RST(xmt & ~e6),
	.S(xmt),
	.D0(prod[33:6]),	// receive mult = 0-1024
	.D1(dif4),			// transmit
	.Q(mux1)
	);

// ---------------------------------------------------------------------------
// Integrator input shifter - 2 clock delay
// The first stage shifts left by g[1:0] bits and the second by 4*g[3:2] bits;
// vacated upper positions are filled with the sign. The most significant
// bits are then delayed by 1 further clock.
// ---------------------------------------------------------------------------
always @ (posedge dclk)
begin
	// stage 1: shift by 0..3 bits, remember the sign
	s1 <= mux1[27];
	case (g[1:0])
	2'b00: shift1 <= {{3{mux1[27]}},mux1[26:0]};
	2'b01: shift1 <= {{2{mux1[27]}},mux1[26:0],1'b0};
	2'b10: shift1 <= {mux1,2'b00};
	2'b11: shift1 <= {mux1[26:0],3'b000};
	default: shift1 <= {30{1'bx}};
	endcase
	// stage 2: shift by 0,4,8,12 bits, sign travels alongside
	s4 <= s1;
	case (g[3:2])
	2'b00: shift4 <= {{12{s1}},shift1};
	2'b01: shift4 <= {{8{s1}},shift1,4'h0};
	2'b10: shift4 <= {{4{s1}},shift1,8'h00};
	2'b11: shift4 <= {shift1,12'h000};
	default: shift4 <= {42{1'bx}};
	endcase
	// stage 3: upper part only
	acc0h <= {s4,shift4[41:28]};	// MS 15 bits
end
assign acc0l = shift4[27:0];	// LS 28 bits
assign s = acc0h[14];			// source of the 13 extra sign bits

// ---------------------------------------------------------------------------
// 4 pipelined integrators with dual accumulators
// Each integrator is a pair of 28-bit adders; the carry out of the low half
// feeds the carry in of the high half.
// ---------------------------------------------------------------------------
// Integrator 1
add28s add1l (
	.CLK(dclk),
	.CE(1'b1),
	.SCLR(rst),
	.A(acc0l),	// input
	.B(acc1bl),
	.CI(1'b0),
	.CO(c1),
	.Q(acc1al)
	);
add28s add1h (
	.CLK(dclk),
	.CE(1'b1),
	.SCLR(rst),
	.A({{13{s}},acc0h}),	// sign-extend 15 bits to 28
	.B(acc1bh),
	.CI(c1),
	.CO(),
	.Q(acc1ah)
	);
// Integrator 2
add28s add2l (
	.CLK(dclk),
	.CE(1'b1),
	.SCLR(rst),
	.A(acc1al),
	.B(acc2bl),
	.CI(1'b0),
	.CO(c2),
	.Q(acc2al)
	);
add28s add2h (
	.CLK(dclk),
	.CE(1'b1),
	.SCLR(rst),
	.A(acc1ah),
	.B(acc2bh),
	.CI(c2),
	.CO(),
	.Q(acc2ah)
	);
// Integrator 3
add28s add3l (
	.CLK(dclk),
	.CE(1'b1),
	.SCLR(rst),
	.A(acc2al),
	.B(acc3bl),
	.CI(1'b0),
	.CO(c3),
	.Q(acc3al)
	);
add28s add3h (
	.CLK(dclk),
	.CE(1'b1),
	.SCLR(rst),
	.A(acc2ah),
	.B(acc3bh),
	.CI(c3),
	.CO(),
	.Q(acc3ah)
	);
// Integrator 4
add28s add4l (
	.CLK(dclk),
	.CE(1'b1),
	.SCLR(rst),
	.A(acc3al),
	.B(acc4bl),
	.CI(1'b0),
	.CO(c4),
	.Q(acc4al)
	);
add28s add4h (
	.CLK(dclk),
	.CE(1'b1),
	.SCLR(rst),
	.A(acc3ah),
	.B(acc4bh),
	.CI(c4),
	.CO(),
	.Q(acc4ah)
	);

// 2nd channel of each accumulator: one extra register on every adder output,
// fed back to the adder's B input.
always @ (posedge dclk)
begin
	if (rst)
	begin
		acc1bl <= 0;
		acc1bh <= 0;
		acc2bl <= 0;
		acc2bh <= 0;
		acc3bl <= 0;
		acc3bh <= 0;
		acc4bl <= 0;
		acc4bh <= 0;
	end
	else
	begin
		acc1bl <= acc1al;
		acc1bh <= acc1ah;
		acc2bl <= acc2al;
		acc2bh <= acc2ah;
		acc3bl <= acc3al;
		acc3bh <= acc3ah;
		acc4bl <= acc4al;
		acc4bh <= acc4ah;
	end
end

// ---------------------------------------------------------------------------
// Sample counter
// Counts SCLK cycles; when i equals n the terminal count is asserted and the
// counter returns to 0 on the next SCLK edge.
// ---------------------------------------------------------------------------
always @ (posedge sclk)
begin
	if (tc | rst) i <= 0;	// interpolation/decimation counter
	else i <= i + 1;
end
assign tc = (i == n);

// Delay terminal count to enable differentiators (6-deep DCLK shift register)
// even-numbered taps synchronized to SCLK positive-going edge
// odd-numbered taps synchronized to SCLK negative-going edge
always @ (posedge dclk)
begin
	{e6,e5,e4,e3,e2,e1} <= {e5,e4,e3,e2,e1,tc};
end

// ---------------------------------------------------------------------------
// Differentiator input multiplexer
// Instantiated instead of inferred to prevent delay-increasing optimization.
// Output synchronized to SCLK due to even number of registers following the
// input port.
// ---------------------------------------------------------------------------
MUX2X28S dmux (
	.CLK(dclk),
	.CE(1'b1),
	.RST(1'b0),
	.S(xmt),
	.D0(acc4ah),		// receive: integrator output
	.D1(prod[33:6]),	// transmit: multiplier = 0-1024
	.Q(dif0)
	);

// ---------------------------------------------------------------------------
// Comb Filter
// 4 differentiators, each with a 2-register delay on the negative input to
// support 2 channels. Stage k is enabled by e(k+1). Note that rst clears the
// subtractor outputs but only *enables* the delay registers, so during reset
// they load from the stage input instead of being cleared directly.
// ---------------------------------------------------------------------------
always @ (posedge dclk)
begin
	// stage 1
	if (rst) dif1 <= 0;
	else if (e2) dif1 <= dif0 - dly1b;
	if (rst|e2)
	begin
		dly1a <= dif0;
		dly1b <= dly1a;
	end
	// stage 2
	if (rst) dif2 <= 0;
	else if (e3) dif2 <= dif1 - dly2b;
	if (rst|e3)
	begin
		dly2a <= dif1;
		dly2b <= dly2a;
	end
	// stage 3
	if (rst) dif3 <= 0;
	else if (e4) dif3 <= dif2 - dly3b;
	if (rst|e4)
	begin
		dly3a <= dif2;
		dly3b <= dly3a;
	end
	// stage 4
	if (rst) dif4 <= 0;
	else if (e5) dif4 <= dif3 - dly4b;
	if (rst|e5)
	begin
		dly4a <= dif3;
		dly4b <= dly4a;
	end
end

// ---------------------------------------------------------------------------
// Outputs
// ---------------------------------------------------------------------------
assign tie = e2;			// e2 doubles as the transmitter input sample request
assign rov = e6;			// receiver output valid
assign rdo = dif4[27:10];	// drop lower 10 bits
assign tdo = acc4ah[17:0];	// drop upper 10 bits
endmodule
