//
//			dds28.v - 32-bit Direct Digital Frequency Synthesizer
//
//					(C) Copyright 2004-2010 John B. Stephensen
//
// This Verilog source file and all its derivatives are licensed only for
// personal non-profit educational use in the Amateur Radio Service and
// the license is not transferrable. The information is provided as-is for
// experimental purposes and the author does not warranty its freedom
// from defects or its suitability for any specific application.
//
// The 32-bit DDS frequency is set by FRQ. DOXY is the 18-bit sequential -cosine and
// sine waveform with -cosine present while SCLK is 1 and sine when SCLK is 0 (the
// cosine has been negated since M002).
//
// The DDS uses a 1024-entry ROM with 18-bit amplitude samples of a sine wave over 360
// degrees of phase. Spurious signals due to truncation of the amplitude to 18 bits
// are -6*b - 20 = -128 dBc. Spurs due to phase truncation would be -6*b = -60 dBc but
// the table entries are linearly interpolated to produce an output with about
// -112 dBc worst-case spurs. Calculations use a 10-stage pipeline:
// 
//	Phase	Action
//	  -1	Increment lower phase accumulator
//		0	Increment upper phase accumulator
//		1	Add 0 or 90 degrees
//		2	Generate addresses for adjacent entries
//		3	Look up sine wave in ROM
//		4  Latch ROM output
//		5	Subtract next from current entry
//		6	Multiply by phase difference
//		7  (continued)
//		8	Round product (offset from lower entry)
//	   9	Interpolate between entries by adding offset	
// 
// 116 slices plus 1 RAM are used. Maximum clock speed is 203 MHz for DCLK and 194 MHz
// for SCLK. Longest delay is 18-bit rounder. 
// 
// Normal warnings:
//		Signals <adj7<35>> and <adj7<16:0>> are assigned but never used.
// 
// History:
//		M001	6-15-08	adapt to ISE 10.1 and sync. cosine/sine output to half-freq. clock
//		M002	1-22-10	use -cos to eliminate inversion on receive path (adder in transmit path)
//
// dds28 - phase accumulator plus linearly interpolated sine lookup.
//
// Ports:
//   doxy  18-bit amplitude output, driven from the last pipeline register (lo9)
//   sclk  sample clock; clocks the phase accumulator and is also sampled as
//         data on DCLK edges to choose between the two phase offsets
//   dclk  master clock for the whole lookup/interpolation pipeline
//   rst   synchronous (to SCLK) clear of the phase accumulator
//   frq   32-bit phase increment added to the accumulator on every SCLK edge
//
// Sine1k2 and MULT18X18SIO are defined outside this file; their latencies are
// taken from the original comments and are not verifiable from this source.
module dds28(doxy,sclk,dclk,rst,frq);
    output [17:0] doxy;	// DDS sequential I and Q amplitude output
    input sclk;			// sample clock
	 input dclk;			// master clock
	 input rst;				// reset accumulator
    input [31:0] frq;	// frequency input - must be synchronized to SCLK
// internal signals
// The numeric suffix on most names is the pipeline phase at which the value is valid.
reg [17:0] lo9;			// LO output (9 clock delay)
reg [31:0] phase;			// phase accumulator
reg [31:5] p0;				// delayed phase (the 5 LSBs of the accumulator are not used)
reg [21:5] p1b;			// delay lower phase accumulator output (17-bit interpolation fraction)
reg [29:22] p1m;			// delayed phase accumulator output (middle 8 address bits)
reg [31:30] p1t;			// adjusted phase accumulator output (quadrant bits after offset)
reg [31:22] p2a,p2b;		// ROM address and address+1 input
reg [21:5] p2,p3,p4,p5;	// interpolation fraction delayed to line up with dif5
wire [17:0] ea4,eb4;		// ROM entries: base and next
reg [17:0] ea5,ea6,ea7,ea8;	// delayed base ROM entry
reg [17:0] dif5;			// difference between ROM entries
wire [35:0] adj7;			// interpolation adjustment (full multiplier product)
reg [17:0] adj8;			// rounded adjustment
// increment phase accumulator at half data output rate
// rst is sampled on the SCLK edge, so the clear is synchronous; the sum wraps
// modulo 2^32 because phase is 32 bits wide
always @ (posedge sclk)
begin
	if (rst) phase <= 0;
	else phase <= phase + frq;
end
// 10-bit incrementer addresses next ROM entry
// 2-bit incrementer adjusts phase by 90 degrees
always @ (posedge dclk)
begin
	// delayed phase until negative edge of SCLK
	// p0 only loads on DCLK edges where SCLK is sampled high and holds otherwise,
	// so each accumulator value is presented to the following stage for two DCLK cycles
	if (sclk) p0[31:5] <= phase[31:5];
	// adjust phase by 0 or -Pi/2: 2'b00 is added when SCLK is sampled high and
	// 2'b11 (three quadrants, i.e. minus one quadrant modulo 4) when it is sampled low
	p1b <= p0[21:5];						// bottom 17 bits
	p1m <= p0[29:22];						// middle 8 bits
	p1t <= p0[31:30] + {~sclk,~sclk}; // top 2 bits used to select sin/-cos
	// add 1 to the 10-bit address to form 2 addresses; p2b is 10 bits wide so
	// the increment wraps from the last table entry back to entry 0
	p2a <= {p1t,p1m};							// port A address
	p2b <= {p1t,p1m} + 10'b0000000001;	// port B address
	// delay lower bits so that p5 (3 clocks after p2) matches the subtractor output
	p2 <= p1b;	// aligned with ROM address
	p3 <= p2;	// aligned with ROM output
	p4 <= p3;	// aligned with ROM latch output
	p5 <= p4;	// aligned with subtractor output (dif5)
end
// sine ROM - 10-bit phase input, 18-bit amplitude output
// port A is base entry and port B is next entry
// ROM consists of two 2Kx9 block RAM - 2 clock delay
// NOTE(review): Sine1k2 is not defined in this file; table contents and the
// 2-clock latency are as stated by the original author - confirm against its source
Sine1k2 ddsrom (
	.aa(p2a),	// address in phase 2
	.ab(p2b),
	.clk(dclk),	// master clock
	.rst(1'b0),	// ROM reset input tied inactive
	.da(ea4),	// base entry; named for phase 4 (the original comment said phase 3)
	.db(eb4)		// next entry
	);
// latch outputs and generate difference between entries - sync with p4
// the registered difference dif5 is one clock later and pairs with p5
always @ (posedge dclk)
begin
	dif5 <= eb4 - ea4;
end
// multiply low-order phase bits by difference between entries
// NOTE(review): MULT18X18SIO is a Xilinx primitive not defined here; it is
// presumably a signed 18x18 multiplier, in which case the 1'b0 prepended to p5
// keeps the B operand non-negative - confirm against the library guide
MULT18X18SIO #(
	.AREG(1),		// instantiate all registers - 2 clock delay
	.BREG(1),
	.PREG(1)
	) mul (
	.A(dif5),		// difference between adjacent entries
	.B({1'b0,p5}),	// lower phase bits
	.P(adj7),		// amount to adjust ROM output (valid in phase 7)
	.CLK(dclk),
	.CEA(1'b1),		// enable all registers
	.CEB(1'b1),
	.CEP(1'b1),
	.RSTA(1'b0),	// don't reset
	.RSTB(1'b0),
	.RSTP(1'b0)
	);
// delay lower entry to account for subtractor and multiplier
// interpolate ROM output by adding difference between entries
// scaled by fractional address bits to base entry
// round result towards zero (truncate positive result and increment negative results)
always @ (posedge dclk)
begin
	ea5 <= ea4;	// sync with dif5 (difference between entries)
	ea6 <= ea5;	// intermediate delay stage
	ea7 <= ea6;	// sync with adj7 (multiplier output)
	ea8 <= ea7; // sync with adj8 (rounded)
	// adj7[34:17] is the product with the 17 fraction bits dropped; adj7[34] is
	// the top bit of that field and is added as 1 LSB whenever the field is negative.
	// adj7[35] and adj7[16:0] are intentionally unused (see "Normal warnings" above)
	adj8 <= adj7[34:17] + {17'b00000000000000000,adj7[34]};
	lo9 <= ea8 + adj8;	// add interpolation adjustment to lower entry
end
// connect outputs
assign doxy = lo9;
endmodule
