******************************************************************************
* (c) 2003 University of Florida
* Computational Neuro Engineering Laboratory
*   www.cnel.ufl.edu
* Applied Digital Hardware Research Laboratory
*   www.add.ece.ufl.edu
*
* Author:  Scott A. Morrison
* E-Mail:  scott@cnel.ufl.edu
*
* Advisor: Dr. Jose Principe (CNEL)
* E-Mail:  principe@cnel.ufl.edu
*
* Advisor: Dr. Karl Gugel (ADD Lab)
* E-Mail:  gugel@ecel.ufl.edu
*
* Date Created:  31 January 2003
* Date Modified: 17 March 2003
*
******************************************************************************
* filename = nlms.asm
* Normalized Least Mean Squared with Weight Decay
* Purpose: To perform realtime NLMS processing on neural data.
* Inputs  = 105 dimensional neural data (1 dimension is for bias)
* Outputs = 3 dimensional hand position
*
* Per-iteration computation (n = current time step):
*   y(n)  = W(n)' * x(n)                    (three outputs: x, y, z)
*   q     = sum of x(n)^2 over all LAGS*CHAN taps
*   e(n)  = d(n) - y(n)
*   W(n+1)= DECAY * W(n) + ETA * 1/(1+q) * x(n) * e(n)
*
* External names used but not defined in this file: memi_addr, FPINV
* (presumably supplied by lms_glb.asm -- NOTE(review): confirm), plus the
* contents of d.asm and x.asm.
******************************************************************************

        .text                           ; program section
        .include "lms_glb.asm"          ; global variables file

* Named bit masks for the ST and IOF register writes below.  Values are the
* literals the code previously used inline; meanings are taken from the
* original inline comments.
ST_CACHE_CLR    .set    1000h               ; "clear the cache"
ST_CACHE_EN     .set    00100000000000b     ; "enable the cache" (= 0800h)
IOF_XF0_OUT     .set    00000010b           ; "make XF0 Output"
IOF_XF1_OUT     .set    00100000b           ; "make XF1 Output"
IOF_XF0_TOGGLE  .set    00000100b           ; XOR mask: "TOGGLE XF0"
IOF_XF1_TOGGLE  .set    01000000b           ; XOR mask: "TOGGLE XF1"

LMS_START                               ; begin LMS processing of BMI data
        ; NOTE(review): LDI replaces the whole ST value, not just the
        ; cache-clear bit -- confirm that resetting the other status bits
        ; here is intended.
        LDI     ST_CACHE_CLR,ST         ; clear the cache, just in case
        OR      ST_CACHE_EN,ST          ; enable the cache
        LDP     memi_addr               ; page becomes internal SRAM
        OR      IOF_XF0_OUT,IOF         ; make XF0 an output
        OR      IOF_XF1_OUT,IOF         ; make XF1 an output

******************************************************************************
* Initialize address pointers to all Matrices and Vectors
* (re-entered from the bottom of the file after every full pass)
*
START   XOR     IOF_XF1_TOGGLE,IOF      ; toggle XF1 once per full pass
        LDI     @X_ADDR,AR3             ; reset x pointer
        STI     AR3,@X_PTR              ; start at x(1)
        LDI     @Y_ADDR,AR3             ; reset y pointer
        ADDI    (3*(LAGS-1)),AR3        ; skip LAGS-1 triples: start at y(LAGS)
        STI     AR3,@Y_PTR              ;
        LDI     @E_ADDR,AR3             ; reset e pointer
        ADDI    (3*(LAGS-1)),AR3        ; start at e(LAGS)
        STI     AR3,@E_PTR              ;
        LDI     @D_ADDR,AR3             ; reset d pointer
        ADDI    (3*(LAGS-1)),AR3        ; start at d(LAGS)
        STI     AR3,@D_PTR              ;

******************************************************************************
* Prepare for LMS Loop
*
* MYCOUNT starts at WIN-LAGS (190 with WIN=200, LAGS=10).  The loop-back test
* at the bottom is "counter-1 >= 0", so LOOP1 executes WIN-LAGS+1 times.
*
        LDI     (WIN-LAGS),AR0
        STI     AR0,@MYCOUNT            ; main counter <= WIN-LAGS

******************************************************************************
* Calculate the Output Y(n)=W(n)'*X(n)
*
* Register use inside the RPTB block:
*   AR3 -> x tap,  AR4/AR5/AR6 -> Wx/Wy/Wz tap
*   R0,R1 = latest Wx*x, Wy*x products (added one iteration late by the
*           parallel ADDF3, hence the two extra ADDFs after the loop)
*   R2,R3,R5 = running sums for the x, y, z outputs
*   R7 / POWER = running sum of x^2
*
LOOP1   XOR     IOF_XF0_TOGGLE,IOF      ; toggle XF0 for timing
        LDI     @X_PTR,AR3              ; load current X pointer
        LDI     @Wx_ADDR,AR4            ; AR4 <= address of Wx
        LDI     @Wy_ADDR,AR5            ; AR5 <= address of Wy
        LDI     @Wz_ADDR,AR6            ; AR6 <= address of Wz
        LDF     0.0,R0                  ; clear R0
        LDF     0.0,R1                  ; clear R1
        LDF     0.0,R2                  ; clear R2
        LDF     0.0,R3                  ; clear R3
        LDF     0.0,R4                  ; clear R4
        LDF     0.0,R5                  ; clear R5
        STF     R0,@POWER               ; zero the power accumulation
        LDI     (LAGS*CHAN-1),RC        ; repeat LAGS*CHAN (10*105) times
        RPTB    HERE1                   ; repeat this block to calculate output
        MPYF3   *AR4++,*AR3,R0          ; Wx*x, parallel multiply/accumulate
||      ADDF3   R0,R2,R2                ;   (adds the previous Wx*x product)
        MPYF3   *AR5++,*AR3,R1          ; Wy*x, parallel multiply/accumulate
||      ADDF3   R1,R3,R3                ;   (adds the previous Wy*x product)
        MPYF3   *AR6++,*AR3,R4          ; Wz*x, non-parallel multiply
        ADDF    R4,R5                   ;   and accumulate
        ; NOTE(review): both operands use AR3; the loop relies on the
        ; post-increment being applied so AR3 advances one tap per pass.
        MPYF3   *AR3,*AR3++,R7          ; x*x, then step to the next tap
        ADDF    @POWER,R7               ; add running power
HERE1   STF     R7,@POWER               ; store running power (end of block)
        ADDF    R0,R2                   ; do last accumulate for Wx
        ADDF    R1,R3                   ; do last accumulate for Wy
                                        ; last accumulate not necessary for Wz

* We went through the whole tap-input, now store results to Y(n)
        LDI     @Y_PTR,AR0              ; load Y pointer
        STF     R2,*AR0++               ; store Yx(n)
        STF     R3,*AR0++               ; store Yy(n)
        STF     R5,*AR0++               ; store Yz(n)
        STI     AR0,@Y_PTR              ; save Y pointer for next time
                                        ; error calculation uses Y from R2,R3,R5

******************************************************************************
* Calculate the Error E(n)=D(n)-Y(n)
*
        LDI     @E_PTR,AR0              ; AR0 --> e(n)
        LDI     @D_PTR,AR6              ; AR6 --> d(n)
        SUBF    R2,*AR6++,R7            ; ex(n) = dx(n)-yx(n)
        STF     R7,*AR0++
        SUBF    R3,*AR6++,R7            ; ey(n) = dy(n)-yy(n)
        STF     R7,*AR0++
        SUBF    R5,*AR6++,R7            ; ez(n) = dz(n)-yz(n)
        STF     R7,*AR0++
        STI     AR6,@D_PTR              ; save advanced D pointer for next time
                                        ; E_PTR is NOT saved yet: the weight
                                        ; update re-reads e(n) through it

******************************************************************************
* Update the Weight Matrix
*
* Find 1/(1+q)
* FPINV is defined outside this file; per the original comment it returns
* 1/R0 in R0.  Every working register is reloaded after the call.
        LDF     @POWER,R0               ; load the power of x(n)
        ADDF    1.0,R0                  ; R0 = q + 1
        CALL    FPINV                   ; R0 = 1/R0 = 1/(1+q)
        STF     R0,@POWER               ; POWER now holds 1/(1+q)

* Register use inside the RPTB block:
*   AR3 -> x tap,  AR4/AR5/AR6 -> Wx/Wy/Wz tap
*   R2 = x tap,  R3/R4/R5 = ex/ey/ez,  R6 = ETA/(1+q),  R7 = DECAY
        LDI     @X_PTR,AR3              ; load current X pointer
        LDI     @Wx_ADDR,AR4            ; AR4 --> Wx
        LDI     @Wy_ADDR,AR5            ; AR5 --> Wy
        LDI     @Wz_ADDR,AR6            ; AR6 --> Wz
        LDI     @E_PTR,AR7              ; AR7 --> e(n)
        LDF     *AR7++,R3               ; R3 <= ex(n)
        LDF     *AR7++,R4               ; R4 <= ey(n)
        LDF     *AR7++,R5               ; R5 <= ez(n)
        STI     AR7,@E_PTR              ; save advanced E pointer
        LDF     @POWER,R6               ; R6 <= 1/(1+q)
        MPYF    @ETA,R6                 ; R6 <= ETA * 1/(1+q)
        LDF     @DECAY,R7               ; R7 <= DECAY (stored as 1-0.005)
        LDI     (LAGS*CHAN-1),RC        ; repeat LAGS*CHAN (10*105) times
        RPTB    HERE2                   ; repeat this block to update weights
        LDF     *AR3++,R2               ; R2 <= x tap, step to next tap

        MPYF3   R2,R3,R0                ; R0 <= x*ex
        MPYF    R6,R0                   ; R0 <= ETA/(1+q) * x*ex
        MPYF3   R7,*AR4,R1              ; R1 <= DECAY * Wx
        ADDF    R1,R0                   ; R0 <= DECAY*Wx + ETA/(1+q)*x*ex
        STF     R0,*AR4++               ; write Wx back, step pointer

        MPYF3   R2,R4,R0                ; R0 <= x*ey
        MPYF    R6,R0                   ; R0 <= ETA/(1+q) * x*ey
        MPYF3   R7,*AR5,R1              ; R1 <= DECAY * Wy
        ADDF    R1,R0                   ; R0 <= DECAY*Wy + ETA/(1+q)*x*ey
        STF     R0,*AR5++               ; write Wy back, step pointer

        MPYF3   R2,R5,R0                ; R0 <= x*ez
        MPYF    R6,R0                   ; R0 <= ETA/(1+q) * x*ez
        MPYF3   R7,*AR6,R1              ; R1 <= DECAY * Wz
        ADDF    R1,R0                   ; R0 <= DECAY*Wz + ETA/(1+q)*x*ez
HERE2   STF     R0,*AR6++               ; write Wz back, step pointer

        LDI     @X_PTR,AR0              ; load current X pointer
        ADDI    (CHAN),AR0              ; slide input window by one time step
        STI     AR0,@X_PTR              ; store X pointer for next use

******************************************************************************
* Done with this iteration, loop back for next iteration
*
* BGED is a delayed branch: the three instructions after it (STI, NOP, NOP)
* are executed before control transfers to LOOP1.
*
        LDI     @MYCOUNT,AR0            ; retrieve counter from memory
        SUBI    1,AR0
        CMPI    0,AR0                   ; sets the flags tested by BGED
        BGED    LOOP1                   ; loop while counter >= 0
        STI     AR0,@MYCOUNT            ; put counter back into memory
        NOP
        NOP
* Delayed Branch happens here!

        BR      START                   ; loop forever for timing
        RETSU                           ; return to DSP OS (not reached while
                                        ; the BR above is in place)
LMS_END                                 ; end of LMS routine

*************************************************************************
************************* END OF EXECUTABLE CODE ************************
*************************************************************************

*************************************************************************
* Stored in Internal Memory
*************************************************************************
        .data                           ; initialized data section

MSK      .word  0FF7FFFFFH              ; for fp_invert routine
FP_TABLE .word  0FF800000h              ; table for C33FP <==> IEEEFP conversion
         .word  0FF000000h
         .word  07F000000h
         .word  080000000h
         .word  081000000h
         .word  07F800000h
         .word  000400000h
         .word  0007FFFFFh
         .word  07F7FFFFFh

* Global declarations for LMS data
        .global X,X_END,Y,Y_END,E,E_END,D,D_END,X_PTR,MYCOUNT

WIN     .set    200                     ; Window size (number of 300ms time samples)
LAGS    .set    10                      ; Number of time lags for each computation
CHAN    .set    (104+1)                 ; Number of neuron channels (1 is for bias)
WINIT   .set    0.001                   ; Weight initialization value
                                        ; NOTE(review): not referenced; the
                                        ; weight tables below repeat 0.001

MYCOUNT .word   0                       ; Main counter for LMS loop
Wx_ADDR .word   Wx                      ; Constant pointer to top of Wx
Wy_ADDR .word   Wy                      ; Constant pointer to top of Wy
Wz_ADDR .word   Wz                      ; Constant pointer to top of Wz
X_ADDR  .word   X                       ; Constant pointer to top of X
Y_ADDR  .word   Y                       ; Constant pointer to top of Y
D_ADDR  .word   D                       ; Constant pointer to top of D
E_ADDR  .word   E                       ; Constant pointer to top of E
X_PTR   .word   X                       ; Changeable pointer to X
D_PTR   .word   D                       ; Changeable pointer to D
Y_PTR   .word   Y                       ; Changeable pointer to Y
E_PTR   .word   E                       ; Changeable pointer to E
POWER   .float  0.0                     ; Input power q, then 1/(1+q), per step
ETA     .float  0.25                    ; LMS step size
DECAY   .float  (1-0.005)               ; Weight retention factor (1 - decay rate)

Y                                       ; Output Vector (WIN*3), initialize to zeros
        .loop   (WIN*3)
        .float  0.0
        .endloop
Y_END

D       .include "d.asm"                ; Desired Vector, monkey data

E                                       ; Error Vector (WIN*3), initialize to zeros
        .loop   (WIN*3)
        .float  0.0
        .endloop
E_END

        .sect   "weights"
        .global Wx,Wx_END,Wy,Wy_END,Wz,Wz_END

Wx                                      ; Weight vector, LAGS*CHAN (10*105) entries
        .loop   (LAGS*CHAN)
        .float  0.001
        .endloop
Wx_END

Wy      .loop   (LAGS*CHAN)             ; Weight vector, LAGS*CHAN entries
        .float  0.001
        .endloop
Wy_END

Wz      .loop   (LAGS*CHAN)             ; Weight vector, LAGS*CHAN entries
        .float  0.001
        .endloop
Wz_END

        .sect   "input"
        ; NOTE(review): the original comment gave the size as 100x104, but
        ; with WIN=200 and CHAN=105 the loop above reads WIN*CHAN values
        ; starting at X -- confirm x.asm provides that many.
X       .include "x.asm"                ; Input Matrix
X_END

*************************************************************************
* End Data Section
*************************************************************************
        .end                            ; end of file