******************************************************************************
* (c) 2003 University of Florida
* Computational Neuro Engineering Laboratory
*    www.cnel.ufl.edu
* Applied Digital Hardware Research Laboratory
*    www.add.ece.ufl.edu
*
* Author:  Scott A. Morrison
* E-Mail:  scott@cnel.ufl.edu
*
* Advisor: Dr. Jose Principe (CNEL)
* E-Mail:  principe@cnel.ufl.edu
*
* Advisor: Dr. Karl Gugel (ADD Lab)
* E-Mail:  gugel@ecel.ufl.edu
*
* Date Created:		31 January 2003
* Date Modified:	17 March 2003
*
******************************************************************************
*	filename = nlms.asm
*	Normalized Least Mean Squared with Weight Decay
*	Purpose: To perform realtime NLMS processing on neural data.
*	Inputs = 105 Dimensional neural data (1 dimension is for bias)
*	Outputs = 3 dimensional hand position
******************************************************************************

	.text						; program section
	.include "lms_glb.asm"		; global variables file        

*	Entry point: configure the status register, the data page and the
*	XF0/XF1 pin directions, then fall through into START.
*	NOTE(review): the first ST write is a load (LDI), not an OR, so every
*	ST bit that is not set in 1000h is written as 0 -- confirm that this is
*	intended and not just a side effect of the cache clear.
LMS_START						; begin LMS processing of BMI data
	LDI		1000h, ST			; ST <= 1000h: clear the cache, just in case
	OR		00100000000000b,ST	; enable the cache (OR keeps the bit set above)
	
	LDP		memi_addr			; page becomes internal SRAM; memi_addr is not
								; defined in the visible part of this file
								; (presumably it comes from lms_glb.asm)
	OR		00000010b,IOF   	; make XF0 Output
	OR		00100000b,IOF		; make XF1 Output	
                   
******************************************************************************
*	Initialize address pointers to all Matrices and Vectors
*	
START
	XOR		01000000b,IOF		; flip the XF1 output bit once per full pass

*	Restart one pass over the data window.
*	Y, E and D hold three words (x,y,z) per time sample, so the first slot
*	that has a full set of LAGS input lags behind it is 3*(LAGS-1) words in.
*	X is consumed from its very first word.

	LDI		(WIN-LAGS),AR0		; main loop counter starts at WIN-LAGS
	STI		AR0,@MYCOUNT

	LDI		@E_ADDR,AR3			; E_PTR <= E + 3*(LAGS-1)
	ADDI	(3*LAGS-3),AR3
	STI		AR3,@E_PTR

	LDI		@Y_ADDR,AR3			; Y_PTR <= Y + 3*(LAGS-1)
	ADDI	(3*LAGS-3),AR3
	STI		AR3,@Y_PTR

	LDI		@X_ADDR,AR3			; X_PTR <= X (first input sample)
	STI		AR3,@X_PTR

	LDI		@D_ADDR,AR3			; D_PTR <= D + 3*(LAGS-1)
	ADDI	(3*LAGS-3),AR3
	STI		AR3,@D_PTR

******************************************************************************
*	Calculate the Output Y(n)=W(n)'*X(n)
*	

LOOP1
	XOR		00000100b,IOF   ; TOGGLE XF0 for timing

*	Output pass for one time sample: three dot products of the tap input
*	with Wx, Wy, Wz, plus the power (sum of squares) of the tap input.
*	Register use:
*	  AR3         --> current tap input x, stepped once per tap
*	  AR4,AR5,AR6 --> Wx, Wy, Wz, stepped once per tap
*	  R2,R3,R5    running sums for yx, yy, yz
*	  R0,R1,R4    newest Wx*x, Wy*x, Wz*x products
*	  R6          running sum of x*x (kept in a register like the other
*	              sums; written to POWER once, after the loop)
*	  R7          newest x*x product

	LDI		@X_PTR,AR3		; Load current X pointer
	LDI		@Wx_ADDR,AR4	; AR4 <= Address of Wx
	LDI		@Wy_ADDR,AR5	; AR5 <= Address of Wy
	LDI		@Wz_ADDR,AR6	; AR6 <= Address of Wz

	LDF		0.0,R0			; Clear R0
	LDF		0.0,R1			; Clear R1
	LDF		0.0,R2			; Clear R2
	LDF		0.0,R3			; Clear R3
	LDF		0.0,R4			; Clear R4
	LDF		0.0,R5			; Clear R5
	LDF		0.0,R6			; Clear R6 (power accumulator)
	
		LDI		(LAGS*CHAN-1),RC	; block runs LAGS*CHAN times (one per tap)
		RPTB	HERE1				; Repeat this block to calculate output

		MPYF3	*AR4++,*AR3,R0		; R0 <= Wx*x; the parallel add folds in
||		ADDF3	R0,R2,R2			; the R0 product from the previous pass
		MPYF3	*AR5++,*AR3,R1		; R1 <= Wy*x; same scheme, one pass behind
||		ADDF3	R1,R3,R3		

		MPYF3	*AR6++,*AR3,R4		; R4 <= Wz*x, non-parallel
		ADDF	R4,R5				; accumulated in the same pass

		MPYF3	*AR3,*AR3++,R7		; R7 <= x*x, then step to the next input
HERE1	ADDF	R7,R6				; accumulate the power of the input
	
		ADDF	R0,R2				; do last accumulate
		ADDF	R1,R3				; do last accumulate
									; last accumulate not necessary for Wz
		STF		R6,@POWER			; POWER <= sum(x*x), read back by the
									; weight-update section
	
*	We went through the whole tap-input, now store results to Y(n)
	LDI		@Y_PTR,AR0		; load Y pointer
	STF		R2,*AR0++		; store Yx(n)
	STF		R3,*AR0++		; store Yy(n)
	STF		R5,*AR0++		; store Yz(n)
	STI		AR0,@Y_PTR		; store Y pointer back to memory for next time
							; next, error calculation will use Y from R2,R3,R5
							; next, error calculation will use Y from R2,R3,R5

******************************************************************************
*	Calculate the Error E(n)=D(n)-Y(n)
*	
	; Error for this time sample.  R2, R3, R5 still hold yx, yy, yz from the
	; output pass; each error word is written through AR0 and the desired
	; values are read through AR6.  R7 is scratch.
	LDI		@E_PTR,AR0		; AR0 --> e(n)
	LDI		@D_PTR,AR6		; AR6 --> d(n)
	SUBF	R2,*AR6++,R7	; ex(n) = dx(n)-yx(n)
	STF		R7,*AR0++
	SUBF	R3,*AR6++,R7	; ey(n) = dy(n)-yy(n)
	STF		R7,*AR0++
	SUBF	R5,*AR6++,R7	; ez(n) = dz(n)-yz(n)
	STF		R7,*AR0++
	
	STI		AR6,@D_PTR		; store changed D pointer for next time
							; E_PTR is deliberately NOT stored here: the weight
							; update re-reads e(n) through the old E_PTR and
							; advances it afterwards
	
******************************************************************************
*	Update the Weight Matrix
*

*	Find 1/(1+q)
	; Weight update for one time sample, for every tap and each of x,y,z:
	;     W <= DECAY*W + (ETA/(1+q)) * e(n) * x
	; where q is the input power that the output pass left in POWER.
	; The factor ETA/(1+q)*e(n) is the same for every tap, so it is formed
	; once before the loop and the loop does one multiply per axis for it.
	LDF		@POWER,R0		; load the power of x(n)
	ADDF	1.0,R0			; R0=R0+1.0 (R0=q+1)
	CALL	FPINV			; R0=1/R0 (R0=1/(1+q)); FPINV is defined outside
							; the visible part of this file
	STF		R0,@POWER		; store back to memory

	LDI		@X_PTR,AR3		; Load current X pointer
	LDI		@Wx_ADDR,AR4	; AR4 --> Wx
	LDI		@Wy_ADDR,AR5	; AR5 --> Wy
	LDI		@Wz_ADDR,AR6	; AR6 --> Wz
	LDI		@E_PTR,AR7		; AR7 --> e(n) 
	
	LDF		*AR7++,R3		; R3 <= ex(n)
	LDF		*AR7++,R4		; R4 <= ey(n)
	LDF		*AR7++,R5		; R5 <= ez(n)
	STI		AR7,@E_PTR		; save new E pointer to memory
	
	LDF		@POWER,R6		; R6 <= 1/(1+q)
	MPYF	@ETA,R6			; R6 <= ETA * 1/(1+q)
	MPYF	R6,R3			; R3 <= ETA * 1/(1+q) * ex(n)
	MPYF	R6,R4			; R4 <= ETA * 1/(1+q) * ey(n)
	MPYF	R6,R5			; R5 <= ETA * 1/(1+q) * ez(n)
	LDF		@DECAY,R7		; R7 <= DECAY (the per-update weight multiplier)

	LDI		(LAGS*CHAN-1),RC	; block runs LAGS*CHAN times (one per tap)
	RPTB	HERE2			; Repeat this block to update weights

	LDF		*AR3++,R2		; R2 <= x for this tap, step to the next input
	
	MPYF3	R2,R3,R0		; R0 <= ETA * 1/(1+q) * ex(n) * x
	MPYF3	R7,*AR4,R1		; R1 <= DECAY * Wx
	ADDF	R1,R0			; R0 <= DECAY * Wx + ETA * 1/(1+q) * ex(n) * x
	STF		R0,*AR4++		; update weight in memory, increment pointer
	
	MPYF3	R2,R4,R0		; R0 <= ETA * 1/(1+q) * ey(n) * x
	MPYF3	R7,*AR5,R1		; R1 <= DECAY * Wy
	ADDF	R1,R0			; R0 <= DECAY * Wy + ETA * 1/(1+q) * ey(n) * x
	STF		R0,*AR5++		; update weight in memory, increment pointer

	MPYF3	R2,R5,R0		; R0 <= ETA * 1/(1+q) * ez(n) * x
	MPYF3	R7,*AR6,R1		; R1 <= DECAY * Wz
	ADDF	R1,R0			; R0 <= DECAY * Wz + ETA * 1/(1+q) * ez(n) * x
HERE2	STF		R0,*AR6++		; update weight in memory, increment pointer

	LDI		@X_PTR,AR0		; load current X pointer		
	ADDI	(CHAN),AR0		; move pointer to next input time (CHAN words on)
	STI		AR0,@X_PTR		; store X pointer for next use

******************************************************************************
*	Done with this iteration, loop back for next iteration
*
	; Main loop control.  MYCOUNT is set to WIN-LAGS at START and the branch
	; is taken while the decremented value is still >= 0, so the LOOP1 body
	; runs WIN-LAGS+1 times per pass.
	LDI		@MYCOUNT,AR0		; retrieve counter from memory
	SUBI	1,AR0
	CMPI	0,AR0				; explicit compare supplies the flags for BGED
	BGED	LOOP1				; delayed branch: the next three instructions
								; are executed before control reaches LOOP1
	STI		AR0,@MYCOUNT		; put counter back into memory
	NOP
	NOP
*	Delayed Branch happens here!

	BR		START			; Loop forever for timing
	
	RETSU			; return to DSP OS -- never reached while the
					; unconditional BR START above is in place
LMS_END				; end of LMS routine

*************************************************************************
************************* END OF EXECUTABLE CODE ************************
*************************************************************************

*************************************************************************
*	Stored in Internal Memory
*************************************************************************
	.data			; initialized data section
	
* MSK and FP_TABLE are not referenced by any instruction in the visible
* part of this file; presumably they are used by FPINV and by the
* floating-point format conversion code defined elsewhere -- confirm.
MSK	.word	0FF7FFFFFH	; for fp_invert routine

FP_TABLE
	.word	0FF800000h	; table for C33FP <==> IEEEFP conversion
	.word	0FF000000h
	.word	07F000000h
	.word	080000000h
	.word	081000000h
	.word	07F800000h
	.word	000400000h
	.word	0007FFFFFh
	.word	07F7FFFFFh

* Global declarations for LMS data
* NOTE(review): D_END is declared global here but no D_END label appears
* in the visible part of this file (it may be defined inside d.asm).
	.global	X,X_END,Y,Y_END,E,E_END,D,D_END,X_PTR,MYCOUNT

* Assembly-time constants
WIN		.set	200		; Window size (number of 300ms time samples)
LAGS	.set	10		; Number of time lags for each computation
CHAN	.set	(104+1)	; Number of neuron channels (1 is for bias)
WINIT	.set	0.001	; Weight initialization
						; NOTE(review): WINIT is not referenced in the
						; visible code; the weight tables use a literal

MYCOUNT	.word	0		; Main counter for LMS loop (set to WIN-LAGS at START)

* Fixed base addresses, read with direct (@) addressing by the code
Wx_ADDR	.word	Wx		; Constant pointer to top of Wx
Wy_ADDR	.word	Wy		; Constant pointer to top of Wy
Wz_ADDR	.word	Wz		; Constant pointer to top of Wz
X_ADDR	.word	X		; Constant pointer to top of X
Y_ADDR	.word	Y		; Constant pointer to top of Y
D_ADDR	.word	D		; Constant pointer to top of D
E_ADDR	.word	E		; Constant pointer to top of E

* Running pointers, re-initialised at START and advanced once per time sample
X_PTR	.word	X		; Changeable pointer to X
D_PTR	.word	D		; Changeable pointer to D
Y_PTR	.word	Y		; Changeable pointer to Y 
E_PTR	.word	E		; Changeable pointer to E

POWER	.float	0.0		; Power of the input at each time; reused to hold
						; 1/(1+power) during the weight update

ETA		.float	0.25		; LMS step size
DECAY	.float	(1-0.005)	; Weight decay parameter: the value stored here is
							; the factor each weight is multiplied by on
							; every update, not the decay amount itself

Y 							; Output Vector, initialize to zeros
		.loop (WIN*3)		; three words (x,y,z) per time sample
		.float 0.0
		.endloop
Y_END

D		.include "d.asm"	; Desired Vector, monkey data (contents come from
							; d.asm, which is not visible here)

E 							; Error Vector, initialize to zeros
		.loop (WIN*3)		; three words (x,y,z) per time sample
		.float 0.0
		.endloop
E_END

		.sect "weights"
		.global	Wx,Wx_END,Wy,Wy_END,Wz,Wz_END
Wx 							; Weight vector for x output: LAGS*CHAN words
		.loop (LAGS*CHAN)	; (10*105 with LAGS=10 and CHAN=104+1)
		.float 0.001
		.endloop
Wx_END
Wy 							; Weight vector for y output: LAGS*CHAN words
		.loop (LAGS*CHAN)
		.float 0.001
		.endloop
Wy_END
Wz 							; Weight vector for z output: LAGS*CHAN words
		.loop (LAGS*CHAN)
		.float 0.001
		.endloop
Wz_END

		.sect "input"
X		.include "x.asm"	; Input Matrix, contents come from x.asm.
							; NOTE(review): this was labelled 100x104, but the
							; code steps CHAN words per time sample and reads
							; up to WIN*CHAN words per pass -- confirm x.asm
							; holds at least that much data
X_END

*************************************************************************
*	End Data Section
*************************************************************************
	
		.end			; end of file