//
//							16-bit Arithmetic Logic Unit
//
//					(C) Copyright 2007-2010 John B. Stephensen
//
// This Verilog source file and all its derivatives are licensed only for
// personal non-profit educational use in the Amateur Radio Service and
// the license is not transferrable. The information is provided as-is for
// experimental purposes and the author does not warranty its freedom
// from defects or its suitability for any specific application.
//
// The ALU has 8 inputs:
//		A - Data from read/write port of register file
//		B - Data from read-only port of register file
//		C - 8-bit constant value from instruction field
//		D0 - Data from I/O ports (source determined externally)
//		D1 - Data from memory (source determined externally)
//		D7 - Data from instruction (source determined externally)
//		OP - operation type (output multiplexer selecting logic block)
//		MOD - operation modifier (input to logic block)
//
// The IV input signals that data is valid. The instruction to be executed is encoded in
// the OP and MOD inputs. OP selects a functional unit to carry out the instruction by
// selecting the output of 1 of 8 units. MOD specifies the type of operation to be carried
// out in the selected functional unit. The following functions are provided:
//
//		1) 16-bit arithmetic with register A and register B
//		2) Increment and decrement register A
//		3) Complex 8x8-bit arithmetic with register A and register B 
//		4) 16-bit logical operations on register A and register B
//		5) Single bit operations on bits in register A
//		6) Shift register A right or left by 1 bit
//		7) Copy 8 bit field from register B to register A and optionally extend sign
//		8) 16x16-bit multiply and 32-bit accumulate
//		9) 16-bit integer and fractional division
//		10) Load data from memory, I/O port or instruction field
//
//	The ALU executes the following arithmetic and logical instructions:
//
//		 15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 0   1 | 1   1 |             Data              |   Register A  | MVI
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   0 | 0   0 |             Data              |   Register A  | ADI
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   0 | 0   1 |             Data              |   Register A  | CPI
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   0 | 1   0   0 |    MOD    |   Register B  |   Register A  | MUL
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   0 | 1   0   1 | 0   0 |MOD|   Register B  |   Register A  | DIV
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   0 | 1   0   1 | 1   0   0 |   Bit Select  |   Register A  | TST
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   0 | 1   0   1 | 1   0   1 |             Data              | LDH
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   0 | 1   0   1 | 1   1   0 | x   x   x | C | x   x   x   x | STC/CLC
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   0 | 1   1   0 |MOD| 1   1 |   Register B  |   Register A  | COMP
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   1 | 0   1   0 |    MOD    | Reg. B/Const. |   Register A  | 16-bit Math
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   1 | 0   1   1 | 0 |  MOD  | Reg. B/Const. |   Register A  | 8-bit Math
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   1 | 0   1   1 | 1 |  MOD  | Reg. B/Const. |   Register A  | 4-bit Math
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   1 | 1   0   0 |    MOD    | Reg. B/Const. |   Register A  | Logic
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   1 | 1   0   1 |    MOD    |   Register B  |   Register A  | SXL/SXH/RR4/RR8/RR12/REV
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   1 | 1   0   1 | 1   1   0 |      MOD      |   Register A  |	SHR
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   1 | 1   0   1 | 1   1   1 |      MOD      |   Register A  |	SHL
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//		| 1   1 | 1   1   0 |   MOD     | x   x   x   x |   Register A  | LOAD P/Q/R
//		+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//
// With multiplier and divider: 248 slices used (5% XC3S500E). 106 MHz maximum clock rate.
// No multiplier or divider: 172 slices used (3% XC3S500E). 106 MHz maximum clock rate.
// Maximum combinatorial logic delay is in 16-bit adder/subtracter.
//
// Normal warnings:
//		Signals <prod<35:34>> and <prod<1:0>> are assigned but never used.
//
// History:
//		M001	4-16-08	Reduced propagation delay by adding duplicate arithmetic and logic units.
//		M002	6-8-08	Removed convolutional encoder (more efficient as I/O device)
//		M003	8-5-08	Moved output multiplexer ports to allow 16-bit instruction format
//		M004	8-9-08	32-bit unsigned divider logic replaces 16-bit fractional divider
//		M005	8-12-08	Removed barrel shifter and replaced with byte swap and bit shifter (-112 slices)
//		M006	9-29-08	Allow comparison and bit test operations by inhibiting storage in register
//		M007	3-24-09	Added mask instruction
//		M008	3-28-08	Add set/reset carry and link bit instructions - move w/o op decode to this module
//		M009	7-13-09	Add overflow bit and use carry bit in shifter (no link bit)
//		M010	7-16-09	Fixed rotate left bug
//		M011	7-18-09	replace complex math unit with immediate math unit for ADI and CPI with 8-bit constants
//		M012	3-21-10	fix carry/zero enable logic
//		M013	3-25-10	change comparison op codes
//
// ALU16 - 16-bit arithmetic logic unit with an embedded 16x16 register file.
//
// Ports:
//		a, b		- current contents of the registers addressed by pdata[3:0] and pdata[7:4]
//		d0, d1		- external data (I/O ports, memory) routed to register A through the
//					  output multiplexer
//		pdata		- 16-bit instruction word; all operation decoding is done in this module
//		zero, carry, ovflo - registered status flags
//		clk, rst	- clock and synchronous reset for the flags and the divider
//
// The result selected by the output multiplexer (f) is written back to register A
// under control of the nibble-wide write enables (we).
module ALU16(a, b, d0, d1, pdata, zero, carry, ovflo, clk, rst);
    output [15:0] a,b;			// data from registers A and B
	 input [15:0] d0,d1;			// data from I/O ports and memory
	 input [15:0] pdata;			// instruction
	 output zero,carry,ovflo;	// zero, carry and overflow flags for math and shift operations
	 input clk,rst;				// clock and reset for carry bit and multiplier
// multiplier/divider generation control
`define muldiv
// internal signals
wire [15:0] c;		// 16-bit constant
wire [7:0] id;		// 8-bit immediate data
reg hv;				// last instruction was a load-high-byte (LDH)
reg [7:0] hd;		// saved high byte value
wire cp;				// propagate carry flag
reg ci;				// carry flag for adder/subtractor
wire co,coi;		// carry out bit adder/subtractor
wire zr,zi;			// zero detector outputs
reg z;				// arithmetic zero flag
wire [3:0] bitsel;// bit select
wire bitmux;		// selected bit
reg v;				// overflow bit
wire sa,sb;			// sign bits of the effective adder operands (for overflow detection)
wire sv;				// shift enable
wire si;				// shift end bit
wire [2:0] fsel;	// function multiplexer select input
wire [15:0] f2,f3,f4,f5,f6;	// ALU functional unit outputs
wire [15:0] f;		// function multiplexer output
wire ldh,mvi,adi,cpi,cmp,tst,mv,dv,csr;	// decoded operations
wire math,imath,rmath,shift;
wire aluop;			// ALU operations
wire [3:0] we;		// write to register A (by nibble)
wire [1:0] typ;	// instruction type
wire [2:0] op,mod;// ALU operation and modifier
wire [3:0] aa,ab;	// register address fields
// assign fields for load and data manipulation instructions
assign aa = pdata[3:0];						// A register address
assign ab = pdata[7:4];						// B register address
assign mod = pdata[10:8];					// operation modifier
assign op = pdata[13:11];					// operation code field
assign typ = pdata[15:14];					// instruction type field
assign id = pdata[11:4];					// immediate data field
// decode write-only operations
assign mvi = (typ == 1) & (op[2:1] == 3);					// move immediate data
assign adi = (typ == 2) & (op[2:1] == 0);					// add immediate data
assign cpi = (typ == 2) & (op[2:1] == 1);					// compare immediate data
assign mv = (typ == 2) & (op == 4);							// multiply operations
assign dv = (typ == 2) & (op == 5) & (mod[2:1] == 0);	// divide operations (mod = 0 or 1)
assign tst = (typ == 2) & (op == 5) & (mod == 4);		// bit testing operations
assign ldh = (typ == 2) & (op == 5) & (mod == 5);		// load high byte
assign csr = (typ == 2) & (op == 5) & (mod == 6);		// carry bit operations
// NOTE(review): the instruction table in the file header shows COMP with op field 110,
// but this decode uses op == 5 - confirm which encoding is intended.
assign cmp = (typ == 2) & (op == 5) & (mod[1:0] == 3); // comparison operations (mod = 3 or 7)
assign math = (typ == 3) & (op == 2) & (mod != 0);	// register negate/add/subtract operations
assign shift = (typ == 3) & (op == 5) & (mod[2:1] == 2'b11); // shift left/right
assign aluop = (typ == 3);										// ALU read/write operation
// ALU does arithmetic, logical and shifting operations in 1 clock cycle.
// An LDH instruction latches an 8-bit high byte; on the following instruction that byte
// replaces the sign extension of the 8-bit immediate field to form the 16-bit constant.
always @ (posedge clk)
begin
	hv <= ldh;			// high byte valid for exactly one following instruction
	hd <= pdata[7:0];	// high byte value
end
assign c = {(hv ? hd : {8{id[7]}}),id};	// saved high byte or sign extension, then immediate
// Dual-port memory containing data (A/B) and address (B) registers.
// Registers are written on move immediate, add immediate and all type-3 (ALU) instructions.
// Upper nibbles are masked for narrow math (op == 3): mod[2] set writes only we[0],
// mod[2] clear writes we[1:0].
// NOTE(review): presumably we[0] is the least significant nibble - confirm against RAM16X16D.
assign we[0] = mvi | adi | aluop;
assign we[1] = we[0] & ((op != 3) | ~mod[2]);	// nibble-wide operations excluded
assign we[2] = we[1] & (op != 3);	// byte and nibble-wide operations excluded
assign we[3] = we[2];
RAM16X16D regfile (
	.AA(aa),		// 1st operand address
	.AB(ab),		// 2nd operand address
	.DA(f),		// 1st operand input
	.YA(a),		// 1st operand output
	.YB(b),		// 2nd operand output
	.WE(we),		// write results by nibble
	.WCLK(clk)	// write clock (read is asynchronous)
	);
// 16-bit adder or subtractor using registers A and B or B only
// propagate carry only for 16-bit math
assign cp = (op == 2) & mod[2];
addsubmovneg16a aur (
	.A(a),						// accumulator register
	.B(b),						// another register
	.CI(cp ? ci : mod[0]),	// carry input is previous carry flag, or SUB bit when not propagating
	.CO(co),						// carry output
	.SUB(mod[0]),				// select add (0) or subtract (1)
	.ENA(mod[1]),				// select zero or A input
	.Y(f2)						// result
	);
// detect zero output
zero16 zdr (
	.A(f2),
	.Y(zr)
	); 
// carry flag is set/reset by carry out bit during register-to-register math
// carry flag is set/reset by bit multiplexer data when shifting or testing bits
// check for zero and 2's complement overflow during 16-bit math
assign rmath = cmp | math;	// register to register math operations
assign imath = adi|cpi;		// immediate data to register math operations
assign sv = tst | shift;	// bit manipulation operations
// Signs of the operands as actually presented to the adder core. The A operand is
// forced to zero when ENA (mod[1]) is low and the B operand is complemented when
// SUB (mod[0]) is high, so the overflow test must use these effective signs rather
// than the raw register signs (which are only correct for a plain add).
// Assumes addsubmovneg16a behaves as its port comments describe (A + ~B + CI for subtract).
assign sa = mod[1] & a[15];
assign sb = mod[0] ^ b[15];
always @ (posedge clk)
begin
	if (rst) ci <= 1'b0;
	else if (rmath|imath|sv|csr) ci <= (rmath & co)|(imath & coi)|(sv & bitmux)|(csr & c[0]);	// carry out or shift out
	if (rst) z <= 1'b0;
	else if (rmath|imath) z <= rmath ? zr : zi;	// save zero result for add, subtract and compare operations
	if (rst) v <= 1'b0;
	else if (rmath) v <= (sa & sb & ~f2[15])|(~sa & ~sb & f2[15]);	// operands share a sign that the result lacks
end
assign carry = ci;
assign zero = z;
assign ovflo = v;
// 16-bit adder or subtractor using register A and constant
addsubmovneg16a aui (
	.A(a),		// accumulator
	.B(c),		// immediate data
	.CI(cpi),	// set carry for subtract
	.CO(coi),
	.SUB(cpi),	// subtract during comparison
	.ENA(~mvi),	// add to zero for move immediate
	.Y(f3)
	);
// detect zero output
zero16 zdi (
	.A(f3),
	.Y(zi)
	);
// logic unit operating on register A, register B and a 4-bit constant; function chosen by mod
lu16a lu (
	.a(a),
	.b(b),
	.c(c[3:0]),
	.op(mod),
	.y(f4)
	);
// Select bits to test or append to shifted word
// c[3:0] is instruction bits 7-4: the bit number for TST, or the shift modifier
assign bitsel = tst ? c[3:0] : {4{c[0]}};	// select bit 0 or bit 15 when shifting
MUX16 smux1 (
	.D(a),		// sample reg. A bits
	.S(bitsel),	// select bit
	.Y(bitmux)	// output to carry flag
	);
// shift logic for end bits
MUX4 smux16 (
	.D({c[1],ci,a[15],a[0]}),	// select 0, 1, carry bit, bit 15 or bit 0 (M009)
	.S(c[3:2]),						// inst. bits 7-6 select shifter input
	.Y(si)
	);
// shift multiplexer
MUX8X16 dmux (
	.D0({b[7],b[7],b[7],b[7],b[7],b[7],b[7],b[7],b[7:0]}),				// extend sign of lower byte
	.D1({b[15],b[15],b[15],b[15],b[15],b[15],b[15],b[15],b[15:8]}),	// extend sign of upper byte
	.D2({b[3:0],a[15:4]}),		// rotate right and insert 4 bits
	.D3({b[7:0],a[15:8]}),		// rotate right and insert 8 bits
	.D4({b[11:0],a[15:12]}),	// rotate right and insert 12 bits
	.D5({b[0],b[1],b[2],b[3],b[4],b[5],b[6],b[7],b[8],b[9],b[10],b[11],b[12],b[13],b[14],b[15]}), // reverse bits
	.D6({si,a[15:1]}),			// shift right by 1 bit
	.D7({a[14:0],si}),			// shift left by 1 bit
	.S(mod[2:0]),
	.Y(f5)
	);
// output multiplexer - use optimized multiplexers
// operation field selects port except for immediate operations (forced to port 3)
// and narrow math (op == 3), which reuses the register arithmetic unit on port 2
assign fsel = (mvi|adi|cpi) ? 3'b011 : ((op == 3'b011) ? 3'b010 : op);
MUX8X16 fmux (
	.D0(d0),		// I/O ports
	.D1(d1),		// memory
	.D2(f2),		// register arithmetic
	.D3(f3),		// immediate arithmetic
	.D4(f4),		// logic
	.D5(f5),		// shift
	.D6(f6),		// multiply/divide
	.D7(16'hFFFF),		// unused
	.S(fsel),
	.Y(f)
	);
// select whether multiplier and divider are generated
`ifdef muldiv
// internal signals
wire [31:0] sum;	// sum of products
wire [15:0] q,r;	// quotient and remainder from divider
// multiplier/accumulator
wire [35:0] prod;	// full product
reg mv0;				// latch sum of products
reg mod0,mod1;		// delay operation modifier
// multiplier - 1 clock delay
MULT18X18SIO #(
	.AREG(0),		// enable output register only
	.BREG(0),
	.PREG(1)
	) mult (
	.A({mod[2] & a[15],a,1'b0}),	// 16-bit operand in bits 16:1; bit 17 is the sign when mod[2]
	.B({mod[2] & b[15],b,1'b0}),	// selects signed inputs, otherwise zero (unsigned)
	.P(prod),
	.CLK(clk),
	.CEP(mv),		// save product
	.RSTP(1'b0)		// never reset
	);
// delay data valid and modifier by 1 clock to match product
always @ (posedge clk)
begin
	mv0 <= mv;
	mod0 <= mod[0];
	mod1 <= mod[1];
end
// accumulator - no delay on output - data registered internally
// load accumulator during multiplication if MUL or MULN instruction
// add or subtract during MAC or MSUB instruction
acc32a acc (
	.B(prod[33:2]),	// drop the two zero LSBs produced by the 1-bit left-shifted inputs
	.SUB(mod0),
	.ACC(mod1),
	.Y(sum),
	.CE(mv0),
	.CLK(clk),
	.SCLR(1'b0)
	);
// 16-bit divider
div16 div (
	.ain(a),			// numerator and denominator input
	.bin(b),
	.iv(dv),			// load data and start
	.mod(mod[0]),	// 1=division with remainder, 0=generate fractional remainder
	.qout(q),		// quotient
	.rout(r),		// remainder
	.clk(clk),
	.rst(rst)
	);
// Retrieve product, quotient and remainder 
MUX4X16 pqrmux (
	.D0(sum[15:0]), 
	.D1(sum[31:16]), 
	.D2(q), 
	.D3(r), 
	.S(mod[1:0]), 
	.Y(f6)
	);
`else
// No multiplier or divider
assign f6 = 16'hFFFF;
`endif
endmodule
