//
//		dft256x16.v - 32-256 point FFT/IFFT with Buffer and Control Logic
//
//					(C) Copyright 2004-2010 John B. Stephensen
//
// This Verilog source file and all its derivatives are licensed only for
// personal non-profit educational use in the Amateur Radio Service and
// the license is not transferrable. The information is provided as-is for
// experimental purposes and the author does not warranty its freedom
// from defects or its suitability for any specific application.
//
// The interface to the FFT/IFFT engine is via a dual-port shared RAM. The I/O address
// is incremented after each read or write. The receiver input (RDIX & RDIY) and
// transmitter output (TDOX & TDOY) are 32-bits wide and carry rectangular data. RIV
// initiates a write during reception and TOE initiates a read during transmission.
// TOV indicates that the data is valid and the buffer is not empty.
//
//	Transmitter input and receiver output are buffered in a second shared RAM that can be
// directly accessed by the CPU. The transmitter input (TDIM & TDIP) consists of a 3-bit
// magnitude (off/-36/.../-6/0dBFS) and 4-bit phase (22.5-degree increments). The receiver
// output consists of a 7-bit signed logarithmic magnitude with a range of +/-96 dB and
// a 5-bit phase with a range of +/-Pi. The RAM is organized into 4 buffers that are used
// sequentially and provides 256 words for each FFT result or IFFT input.
//
// FFT configuration data is mapped onto a 16-bit data bus as 5 registers. FFT Size is the FFT
// or IFFT size in powers of two. FFT length is the number of samples-1 to process in the FFT.
// The cyclic prefix (CP) length is the size of the inter-symbol guard interval. RUN initiates
// the FFT or IFFT and RST resets the base address register to 0. The fourth register (3) is
// written to enable transmission of each symbol. This prevents spurs if the host processor
// stalls. The fifth register (4) loads the FFT scaling schedule.
//
//	    15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
//	   +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  0 |FWD|                                       | 0 |    FFT Size   | (Resets FFT engine)
//	   +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  1 |   |         CP Length         |         FFT Length - 1        |
//    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//	 2 |                                               |SYN|SOF|RST|RUN|
//    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  3 |                                                               | Transmit Symbol
//    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  4 |   7   |   6   |   5   |   4   |   3   |   2   |   1   |   0   | Scaling Factors (0-3 right shifts)
//    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//
//		FFT Size - number of samples in FFT/IFFT as power of 2 (5=32, 6=64, 7=128, 8=256)
//		FFT Length - integer number of samples in FFT/IFFT
//		FWD - 0 = IFFT, 1 = FFT
//		SYN - enable FFT sample synchronization based on phase correlation (sync input)
//		SOF - enable FFT start of frame on null symbol detection (SOF input)
//		RST - reset base address
//		RUN - manually enable FFT/IFFT (resets address counter to 0 when reset)
// 
// There are 3 control signals that enter and leave the module. XMT puts the module in transmit
// mode. If enabled, SYNC sets the address of the first sample and initiates the FFT. DONE goes
// active for one clock period when the FFT or IFFT is complete and all data is unloaded.
//
// Two status registers are provided. The first provides the current base address and the second
// provides status bits:
//
//	    15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
//	   +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  0 | 0   0   1   0   0   0 | Base  | 0   0   0   0   0   0   0   0 | Current Buffer Address
//	   +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  1 |INT|BSY|EOF|RUN|SOF| 0   0 |            Sync. Phase            | Status
//	   +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//
//		INT - FFT/IFFT complete interrupt (reset when read)
//		BSY - FFT engine busy
//		EOF - end of frame (DCD off at end of sampling period)
//		RUN - sampling data
//		SOF - start of frame (null symbol detected)
//
// The subcarrier data buffers are formatted so that the DC subcarrier is in the center of the
// buffer by inverting the MSB of the address from the FFT module. The following example shows
// the format for a 32-point FFT with 12 LSB and 12 USB subcarriers used for data transmission:
//
//     1 1 1 1 1 1 1                 -   +                 1 1 1 1 1 1
//     6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 Subcarrier
//    +---------------------------------------------------------------+
//    |0 0 0 0 D D D D D D D D D D D D C D D D D D D D D D D D D 0 0 0|
//    +---------------------------------------------------------------+
//     7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 Address (base 16)
//     0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
//
//		C = carrier
//		D = data subcarriers
//		0 = unused subcarriers
//
// The data stored in these buffers is formatted so that the upper byte is the subcarrier magnitude
// and the lower byte is the subcarrier phase. Received phase is 5 bits with 11.25 degree resolution
// and transmitted phase is 4 bits with 22.5 degree resolution. Received magnitude is 7 bits with
// 1.5 dB resolution. Transmitted magnitude is 2 bits with 6 dB resolution. A value of 0 is a
// magnitude of 0 and 1 through 3 specify magnitudes of -12 to 0 dBFS.
//
//     15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
//    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//    |         Magnitude         | 0 |       Phase       | 0   0   0 | Receive
//    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//    |  Mag. |                       |     Phase     |               | Transmit
//    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//
// The maximum clock rate is 132 MHz with 4 block RAMs, 2 multipliers and 888 slices used (19% XC3S500E).
// 1 multiplier may be disabled in Spartan-3E FPGAs due to buffer RAM use of shared resources.
//
//	History:
//		1-29-08	modified to use v5.0 Radix 2 Lite FFT with 18-bit data, 17-bit coef. and cyclic prefix generation
//		2-23-08	change subcarrier interface to 4-bit (TX) or 16-bit (RX) polar data with internal conversion
//		5-23-08	eliminated redundant external configuration registers
//		2-6-09	configure via CPU output ports and add DMA interface
//		2-20-09	allow 16PSK with 4 magnitude levels and use 10-bit memory address with three 256-word buffers
//		4-19-09	generate CP from data in buffer RAM (saves 74 slices)
//		5-24-09	use run bit for multiplexing tuner inputs - add tov
//		6-14-09	4-bit magnitude with 3 dB steps - LSB of 1 is +3 dB
//		1-4-09	add sync and sof inputs to automatically start FFT
//		3-11-10	add DCD input and start and end flags
//		4-17-10	4 fixed 256 word buffers (saves 15 slices and simplifies driver software) and tx delay
//		5-4-10	gate RAM output to prevent turn-on spurs
//		5-12-10	add txen output
//		5-16-10	16-bit version (18-bits => 931 slices)
//		5-17-10	fixed scaling in FFT (was 908 slices)
//
// Normal Warnings:
//		Signals <phsin<3:0>> and <magin<5:0>> are assigned but never used.
//		Input <ioaddr<2>> is never used.
//		Latch bit <status<10>> has a constant value of 0
//
module dft256x16( iocs, ioaddr, din, iowr, dout, iord,
						mcs, maddr, min, mout, mwr, mclk,
						toe, tdox, tdoy, tov, ten,
						rdix, rdiy, riv,
						clk, rst, xmt, sof, sync, dcd, intr
						);
	// ---- CPU register (I/O) port ----
	input iocs;						// register block select
	input [2:0] ioaddr;			// register number
	input [15:0] din;				// register write data
	input iowr;						// write strobe
	output [15:0] dout;			// read data (registered status word)
	input iord;						// read strobe
	// ---- CPU/DMA port of the subcarrier buffer RAM ----
	input mcs;						// RAM port enable
	input [9:0] maddr;			// word address
	input [15:0] min;				// write data
	output [15:0] mout;			// read data
	input mwr;						// write enable
	input mclk;						// clock for this RAM port
	// ---- transmitter sample output ----
	input toe;						// request next transmit sample
	output [15:0] tdox,tdoy;	// rectangular sample read from the sample RAM
	output tov;						// (xmt & toe) delayed by two clocks
	output ten;						// copy of the IFFT run bit
	// ---- receiver sample input ----
	input [15:0] rdix,rdiy;		// rectangular receive sample
	input riv;						// receive sample valid
	// ---- clock, reset and mode/timing controls ----
	input clk,rst;					// master clock and synchronous reset
	input xmt;						// 0 = receive, 1 = transmit
	input sof;						// null symbol detected (start of frame)
	input sync;						// symbol timing synchronization pulse
	input dcd;						// data carrier detect
	output intr;					// FFT/IFFT complete interrupt flag

// ---------------------------------------------------------------------------
// Internal signals
// ---------------------------------------------------------------------------
// register decode strobes
wire iowstb;						// any register write (iocs & iowr)
wire wr0,wr1,wr2,wr3,wr4,rd1;
// configuration state
reg [3:0] fftsize;				// log2 of transform size (local copy for address logic)
reg [7:0] fftlen;					// last sample index of a symbol (length - 1)
reg [6:0] cplen;					// cyclic prefix length
reg runrx,runtx;					// receive / transmit run bits
wire run;							// either run bit set
reg ensof,ensyn;					// enable SOF start / enable SYNC counter reset
wire rstctr;						// holds sample counter at zero
wire rstbase;						// clears buffer base address
reg txe;								// transmit symbol enable
reg [2:1] txed;					// txe history, shifted once per symbol
// sample RAM addressing
reg swap;							// selects which RAM half faces the I/O side
wire ce;								// sample counter enable
reg [8:0] sctr,sphs;				// sample counter and its value captured at sync
wire [7:5] amask;					// upper address bit mask for short transforms
wire [7:0] maddra,maddrb;		// port A (I/O) and port B (FFT) addresses
wire tc;								// sample counter has reached fftlen
// FFT data path
reg [15:0] rx1,ry1,rx2,ry2;	// two register stages after the sample RAM
wire [15:0] rx,ry,tx,ty;		// RAM read data (R) and polar-to-rect output (T)
wire [15:0] nx,ny;				// selected FFT input
wire [15:0] kx,ky;				// FFT output
wire [7:0] magin,phsin;			// polar word read from subcarrier RAM
wire [7:0] phsout,magout;		// polar word produced from FFT output
wire pmov;							// polar output valid (subcarrier RAM write enable)
reg pmov1;							// pmov delayed one clock for edge detection
wire fftdone;						// falling edge of pmov
wire bufadv;						// buffer complete: start (transmit) or fftdone (receive)
wire [7:0] naddr,kaddr;			// input and output indices from the FFT core
reg [1:0] base;					// current subcarrier buffer (1 of 4)
wire [1:0] base1;					// base as reported to the CPU
reg [7:0] a1,a2,a3,a4,a5,a6;	// kaddr delay line
wire [7:0] a7;						// index selected for the subcarrier RAM
reg [7:0] indx;					// registered, centered subcarrier index
wire fftdv;							// FFT output data valid
wire start;							// last sample of a symbol is being transferred
wire unload;						// FFT edone looped back to its unload input
reg [2:1] v;						// transmit output valid delay
reg fftint,sf,ef;					// sticky status flags
wire busy;							// FFT core busy
reg [15:0] status;				// registered read-back word

// ---------------------------------------------------------------------------
// Register address decode
// ---------------------------------------------------------------------------
assign iowstb = iocs & iowr;
assign wr0 = iowstb & (ioaddr == 0);	// transform size and direction
assign wr1 = iowstb & (ioaddr == 1);	// transform length and CP length
assign wr2 = iowstb & (ioaddr == 2);	// run/reset/timing-mode bits
assign wr3 = iowstb & (ioaddr == 3);	// transmit next symbol
assign wr4 = iowstb & (ioaddr == 4);	// scaling schedule
assign rd1 = iocs & iord & (ioaddr[0] == 1);	// status read (any odd address)

// ---------------------------------------------------------------------------
// Configuration registers
// ---------------------------------------------------------------------------
// size and length fields have no reset; they hold the last value written
always @ (posedge clk)
begin
	if (wr0) fftsize <= din[3:0];
	if (wr1)
	begin
		fftlen <= din[7:0];
		cplen <= din[14:8];
	end
end
// mode bits written through register 2 and cleared by rst
always @ (posedge clk)
begin
	if (rst)
	begin
		runtx <= 0;
		ensof <= 0;
		ensyn <= 0;
	end
	else if (wr2)
	begin
		runtx <= din[0];
		ensof <= din[2];
		ensyn <= din[3];
	end
end
// receive run bit: forced low while SOF start is disabled, otherwise
// set by the first sof pulse and held
always @ (posedge clk)
begin
	if (~ensof) runrx <= 0;
	else runrx <= (ensof & sof) | runrx;
end
// transmit symbol enable: set by a write to register 3 and cleared when a
// symbol boundary (start) passes without a new write; txed keeps the last
// two values sampled at symbol boundaries. Both are held at zero while
// runtx is clear.
always @ (posedge clk)
begin
	if (~runtx)
	begin
		txe <= 0;
		txed <= 2'b00;
	end
	else
	begin
		txe <= wr3 | (txe & ~start);
		if (start) txed <= {txed[1],txe};
	end
end
assign run = runtx|runrx;
assign rstbase = wr2 & din[1];
assign rstctr = (ensyn & sync) | ~run;

// ---------------------------------------------------------------------------
// Sample counter and sample RAM addressing
// ---------------------------------------------------------------------------
// The counter advances once per transferred sample. When it reaches fftlen
// (tc) the next value loaded is {2'b11,~cplen}+1, i.e. minus the CP length
// as a 9-bit two's complement number, and the RAM halves are exchanged.
assign ce = xmt ? toe : riv;
assign tc = (sctr == {1'b0,fftlen});
assign start = ce & tc;
always @ (posedge clk)
begin
	if (~run) swap <= 1'b0;
	else if (ce) swap <= tc ^ swap;
end
always @ (posedge clk)
begin
	if (rstctr) sctr <= 0;
	else if (ce) sctr <= (tc ? {2'b11,~cplen} : sctr) + 1;
end
// transmit data valid follows the output request by two clocks
always @ (posedge clk)
begin
	v <= {v[1],xmt & toe};
end
assign tov = v[2];
// Address bits 7:5 are masked to the transform size so that negative
// counter values (cyclic prefix) wrap onto the end of the symbol.
//    Size:   5   6   7   8   (32/64/128/256 points)
//    Mask: 000 001 011 111
assign amask = {(fftsize > 7),(fftsize > 6),(fftsize > 5)};
assign maddra = {sctr[7:5] & amask,sctr[4:0]};
assign maddrb = xmt ? kaddr : naddr;
// Sample RAM, 512 x 32: port A faces the sample I/O, port B faces the FFT.
// swap selects opposite halves for the two ports.
// NOTE(review): WEA is not qualified by xmt, so riv asserted during transmit
// would write port A - confirm the receiver is idle in transmit mode.
RAMB16_S36_S36 tram (
	.ADDRA({swap,maddra}),
	.ADDRB({~swap,maddrb}),
	.DIA({rdix,rdiy}),		// receive samples in
	.DIPA(4'hF),
	.DIB({kx,ky}),				// FFT core output in
	.DIPB(4'hF),
	.DOA({tdox,tdoy}),		// transmit samples out
	.DOPA(),
	.DOB({rx,ry}),				// samples to FFT core
	.DOPB(),
	.CLKA(clk),
	.CLKB(clk),
	.WEA(riv & ~sctr[8]),	// no write while counter is negative (CP)
	.WEB(xmt & fftdv),		// store FFT output in transmit mode only
	.ENA(1'b1),
	.ENB(1'b1),
	.SSRA(~txed[2]),			// output reset unless delayed transmit enable set
	.SSRB(1'b0)
	);

// ---------------------------------------------------------------------------
// FFT core and rectangular/polar conversion
// ---------------------------------------------------------------------------
// two extra register stages on the RAM read data before the FFT input mux
always @ (posedge clk)
begin
	{rx2,ry2} <= {rx1,ry1};
	{rx1,ry1} <= {rx,ry};
end
assign nx = xmt ? tx : rx2;
assign ny = xmt ? ty : ry2;
// The transform is started on the last sample of a symbol; in receive mode
// only while dcd is asserted. Size and direction are loaded from din by a
// write to register 0, the scaling schedule by a write to register 4.
fft256x16 fft (
	.xn_re(nx),
	.xn_im(ny),
	.xn_index(naddr),
	.xk_re(kx),
	.xk_im(ky),
	.xk_index(kaddr),
	.start(start & (dcd|xmt)),
	.unload(unload),			// driven by edone below
	.rfd(),
	.busy(busy),
	.dv(fftdv),
	.edone(unload),
	.done(),
	.fwd_inv(din[15]),
	.fwd_inv_we(wr0),
	.nfft(din[4:0]),
	.nfft_we(wr0),
	.scale_sch(din),
	.scale_sch_we(wr4),
	.clk(clk)
	);
// Polar to rectangular for transmit (original notes: 1 clock delay, 16PSK
// at -54 to -15 dBFS). The 14-bit result is sign-extended to 16 bits.
// NOTE(review): 4 magnitude bits are taken from magin[7:4] here, while the
// file header text describes 3-bit and 2-bit transmit magnitude - confirm
// which description is current.
p2r14 p2r (
	.mag(magin[7:4]),
	.phs(phsin[7:4]),
	.dox(tx[13:0]),
	.doy(ty[13:0]),
	.clk(clk)
	);
assign tx[15:14] = {2{tx[13]}};
assign ty[15:14] = {2{ty[13]}};
// Rectangular to polar for receive (original notes: 7 clock delay);
// input is valid only for FFT output in receive mode
rect2polar16 r2p (
	.dix(kx),
	.diy(ky),
	.iv(fftdv & ~xmt),
	.mag(magout),
	.phs(phsout),
	.ov(pmov),
	.clk(clk),
	.rst(rst)
	);

// ---------------------------------------------------------------------------
// Subcarrier buffer addressing
// ---------------------------------------------------------------------------
// kaddr passes through six registers plus the indx register (seven in all)
// on receive; on transmit naddr goes straight to the indx register.
always @ (posedge clk)
begin
	{a6,a5,a4,a3,a2,a1} <= {a5,a4,a3,a2,a1,kaddr};
end
assign a7 = (xmt ? naddr : a6);
// The MSB of the index (for the configured size) is extended up to bit 7
// and bit 7 itself is inverted, which puts index 0 in the buffer center.
// Only the two LSBs of fftsize are decoded: 8 -> 0, 5 -> 1, 6 -> 2, 7 -> 3.
always @ (posedge clk)
begin
	case (fftsize[1:0])
	2'd0: indx[7:5] <= {~a7[7],a7[6],a7[5]};	// 256 points
	2'd1: indx[7:5] <= {~a7[4],a7[4],a7[4]};	// 32 points
	2'd2: indx[7:5] <= {~a7[5],a7[5],a7[5]};	// 64 points
	2'd3: indx[7:5] <= {~a7[6],a7[6],a7[5]};	// 128 points
	default indx[7:5] <= 3'bxxx;
	endcase
	indx[4:0] <= a7[4:0];
end
// The buffer number advances at each symbol start on transmit and after
// each completed FFT unload on receive.
assign bufadv = xmt ? start : fftdone;
always @ (posedge clk)
begin
	if (rst|rstbase) base <= 0;
	else if (bufadv) base <= base + 1;
end
// Subcarrier RAM, 1024 x 16: port A is the CPU/DMA side on mclk, port B is
// the FFT side on clk. Each word is {magnitude,phase}.
RAMB16_S18_S18 sram (
	.ADDRA(maddr),
	.DIA(min),
	.DIPA(2'b11),
	.DOA(mout),
	.CLKA(mclk),
	.WEA(mwr),
	.ENA(mcs),
	.SSRA(1'b0),
	.ADDRB({base,indx}),
	.DIB({magout,phsout}),
	.DIPB(2'b11),
	.DOB({magin,phsin}),
	.CLKB(clk),
	.WEB(pmov),					// write polar FFT results
	.ENB(1'b1),
	.SSRB(1'b0)
	);

// ---------------------------------------------------------------------------
// Status flags and read-back
// ---------------------------------------------------------------------------
// fftdone is a one-clock pulse on the falling edge of pmov
always @ (posedge clk)
begin
	pmov1 <= pmov;
end
assign fftdone = pmov1 & ~pmov;
// sticky flags: each is set by its event and cleared by a status read
always @ (posedge clk)
begin
	if (rst)
	begin
		fftint <= 0;
		sf <= 0;
		ef <= 0;
	end
	else
	begin
		fftint <= bufadv | (fftint & ~rd1);			// buffer complete
		sf <= (ensof & sof) | (sf & ~rd1);			// start of frame seen
		ef <= (start & ~dcd) | (ef & ~rd1);			// symbol ended without DCD
	end
end
// remember where in the symbol the sync pulse arrived
always @ (posedge clk)
begin
	if (sync) sphs <= sctr;
end
// The buffer number reported to the CPU is base+1 on transmit and base-1
// (base+3 modulo 4) on receive.
assign base1 = base + {~xmt,1'b1};
// Read data is registered every clock from ioaddr[0]:
//   odd address  - status flags and sync phase
//   even address - buffer address 0x2000 + 0x100 * base1
always @ (posedge clk)
begin
	status <= ioaddr[0] ? {fftint,busy,ef,run,sf,2'b00,sphs} : {6'b001000,base1,8'h00};
end
assign dout = status;
assign intr = fftint;
assign ten = runtx;
endmodule
