% J.C. Spall, Sept. 1998
% twoSGconstrained
% Code for evaluation of second-order SGSA (2SGSA) versus first-order
% SGSA (1SGSA, as in Chap. 5 of ISSO). Code is for comparative evaluation purposes; hence,
% it includes much that is not required for a basic implementation. Further, it is
% in no way "optimized" for efficiency or generality; this is strictly research
% code for the purpose of getting a basic idea of how 2SGSA works.
%
% Code includes the capability for initializing 2SGSA by running 1SGSA for
% N iterations. This code allows for averaging of the
% SG gradients and Hessian estimates at EACH iteration after the
% initial N iterations, where only 1SGSA is used for estimating theta. We use "thetaH"
% for the 2SGSA recursion and "theta" for the 1SGSA recursion.
%
% UPDATE MAR. 2006: Feedback and weighting versions of this code are available from the author;
% these can generally provide enhanced performance. (Reference: Spall, J. C. (2006), "Feedback and
% Weighting Mechanisms for Improving Jacobian (Hessian) Estimates in the Adaptive Simultaneous
% Perturbation Algorithm," Proceedings of the American Control Conference, 14-16 June 2006,
% Minneapolis, MN, paper ThB09.1 in CD-ROM.)
%
clear all
close all
global p z sigma          % globals shared with the loss/gradient measurement functions
p=10;                     % dimension of theta
% meas. noise standard deviation; multiplies all elements of N(0,I) noise vector
sigma=0;
% value of numerator in a_k sequence for all iterations of 1SGSA
% and first N iterations in the initialization of 2SGSA
aN=20*1.4;
% value of numerator in a_k in 2SGSA (beginning at N+1);
% the number 23.622 below is relevant to N=500 and A=50
% (i.e., it is such that 23.622/(N+1+A)^.501 = 1)
a=.1*23.622;
A=50;
c=.001;
alpha=1;
gamma=.5;
n=1000;                   % total number of 2SGSA iterations (including the N initialization iterations)
N=500;                    % number of initial 1SGSA iterations used to initialize 2SGSA
% number of cases (replications) of 2SGSA and 1SGSA
cases=1;
% number of individual gradient/Hessian estimates to be averaged at each iter.
gH_avg=1;
avg=1;                    % number of loss evaluations averaged in the "blocking" test below
rand('seed',31415297)
randn('seed',311113)
thetamin=-5*ones(p,1);    % Lower bounds on theta (in unconstrained case, set to large neg. no.)
thetamax=5*ones(p,1);     % Upper bounds on theta (in unconstrained case, set to large pos. no.)
%
% The loop 1:cases below is for doing multiple cases for use in averaging to
% evaluate the relative performance.
%
% The loop 1:N below uses the standard 1SG form to initialize 2SGSA.
%
% Lines below initialize various recursions for the gradient/Hess. averaging
% and for final error reporting based on the average of the solutions for
% "cases" replications.
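%
% Summary of the two recursions compared below (added note; Y_k(.) denotes a noisy gradient
% measurement from loss4thgrad, with gain sequences a_k, c_k as set above):
%   1SGSA:  theta  <- theta  - a_k*Y_k(theta)
%   2SGSA:  thetaH <- thetaH - a_k*(Hbarbar\Y_k(thetaH)), where Hbarbar is a positive-definite
%           transformation of Hbar, the running average of per-iteration Hessian estimates formed
%           from gradient measurements at thetaH +/- c_k*delta (delta a vector of Bernoulli +/-1
%           perturbations), and Y_k(thetaH) may be averaged over gH_avg measurements.
%           Both recursions are projected componentwise onto [thetamin, thetamax].
%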
meanHbar=0;
errtheta=0; errthetaIA=0; errthetaH=0;
losstheta=0; lossthetaIA=0; lossthetaH=0;
lossthetasq=0; lossthetaIAsq=0; lossthetaHsq=0;
tolerancetheta=10;
toleranceloss=0;
truetheta=zeros(p,1);
theta_0=ones(p,1);
loss4thorder(theta_0)     % this is a particular loss function used in some studies (with loss4thgrad)
% DUMMY STATEMENT FOR SETTING DIMENSIONS OF Hhat (AVOIDS OCCASIONAL ERROR MESSAGES)
Hhat=eye(p);
for j=1:cases
   caseiter=j
   % INITIALIZATION OF PARAMETER AND HESSIAN ESTIMATES
   theta=theta_0;
   thetaH=theta;
   Hbar=.1*eye(p);
   %*********2SG*********
   % INITIAL N ITERATIONS OF 1SG PRIOR TO 2SG ITERATIONS
   for k=1:N
      ak=aN/(k+A)^alpha;
      Gk=loss4thgrad(thetaH);
      thetaHlag=thetaH;
      thetaH=thetaH-ak*Gk;
      % Checking for constraints below
      thetaH=min(thetaH,thetamax);
      thetaH=max(thetaH,thetamin);
      if max(abs(thetaHlag-thetaH)) > tolerancetheta;
         thetaH=thetaHlag;
      end
   end
   lossold=0;
   for i=1:avg
      lossold=lossold+loss4thnoise(thetaH);
   end
   %
   % 2SG ITERATIONS FOLLOWING INITIALIZATION
   %
   for k=N+1:n
      ak=a/(k+A-N)^alpha;
      ck=c/k^gamma;
      ghatinput=0;
      Hhatinput=0;
      % GENERATION OF AVERAGED GRADIENT AND HESSIAN (NO AVERAGING IF gH_avg=1)
      for m=1:gH_avg
         delta=2*round(rand(p,1))-1;
         thetaplus=thetaH+ck*delta;
         thetaminus=thetaH-ck*delta;
         ghatplus=loss4thgrad(thetaplus);
         ghatminus=loss4thgrad(thetaminus);
         % STATEMENT PROVIDING AN AVERAGE OF GRAD. MEASUREMENTS (AT thetaH) PER ITERATION
         ghatinput=((m-1)/m)*ghatinput+loss4thgrad(thetaH)/m;
         deltaghat=ghatplus-ghatminus;
         % Column i of Hhat is deltaghat(i)./(2*ck*delta), i.e., Hhat(j,i)=deltaghat(i)/(2*ck*delta(j));
         % symmetrizing below gives the per-iteration Hessian estimate
         for i=1:p
            Hhat(:,i)=deltaghat(i)./(2*ck*delta);
         end
         Hhat=.5*(Hhat+Hhat');
         Hhatinput=((m-1)/m)*Hhatinput+Hhat/m;
      end
      Hbar=((k-N)/(k-N+1))*Hbar+Hhatinput/(k-N+1);
      % THE THETA UPDATE (FORM BELOW USES GAUSSIAN ELIMINATION TO AVOID
      % DIRECT COMPUTATION OF HESSIAN INVERSE)
      Hbarbar=sqrtm(Hbar*Hbar+.000001*eye(p)/k);
      thetaHlag=thetaH;
      thetaH=thetaH-ak*(Hbarbar\ghatinput);
      % Checking for constraints below
      thetaH=min(thetaH,thetamax);
      thetaH=max(thetaH,thetamin);
      % Steps below perform "blocking" step with "avg" no. of loss evaluations
      lossnew=0;
      for i=1:avg
         lossnew=lossnew+loss4thnoise(thetaH);
      end
      if lossnew/avg > lossold/avg-toleranceloss;
         thetaH=thetaHlag;
      else
         lossold1=lossold;
         lossold=lossnew;
      end
      if max(abs(thetaHlag-thetaH)) > tolerancetheta;
         lossold=lossold1;
         thetaH=thetaHlag;
      end
   end
   thetaH
   Hbar
   loss4thorder(thetaH)
   %
   %********1SGSA*************
   % The iterations below are the basic SG iterations, using the same aN/(k+A)^(.) form of
   % gain sequence as the 1:N block above (where 2SGSA is not fully engaged) and the same
   % number of loss gradient measurements. Each loss evaluation in 2SGSA is also counted
   % as having the same cost as a gradient evaluation (hence the multiplier "4" vs. "3" in
   % the "for" loops below). The overall loop is broken into two parts to accommodate a
   % sliding window of the last IA iterates for an iterate-averaging solution.
   %
   IA=200;                % amount of iterate averaging
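   % Accounting for the evaluation budget used below (added note): each pass of the inner m-loop
   % above makes 3 gradient measurements (at thetaH and at thetaH +/- ck*delta), and the blocking
   % loss evaluations are counted as one further gradient-equivalent per pass, giving
   % 4*(n-N)*gH_avg for the 2SG phase; the N initialization iterations contribute one gradient
   % measurement each. Hence the 1SGSA run below is allotted N + 4*(n-N)*gH_avg iterations
   % (one gradient measurement per iteration), split so the last IA iterates can be averaged.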
   for k=1:N+4*(n-N)*gH_avg-IA
      ak=aN/(k+A)^.602;
      Gk=loss4thgrad(theta);
      thetalag=theta;
      theta=theta-ak*Gk;
      % Checking for constraints below
      theta=min(theta,thetamax);
      theta=max(theta,thetamin);
      if max(abs(thetalag-theta)) > tolerancetheta;
         theta=thetalag;
      end
   end
   thetabar=0;
   % Final IA iterations, accumulating the iterates for the iterate-averaging solution
   for k=N+4*(n-N)*gH_avg-IA+1:N+4*(n-N)*gH_avg
      ak=aN/(k+A)^.602;
      Gk=loss4thgrad(theta);
      thetalag=theta;
      theta=theta-ak*Gk;
      % Checking for constraints below
      theta=min(theta,thetamax);
      theta=max(theta,thetamin);
      if max(abs(thetalag-theta)) > tolerancetheta;
         theta=thetalag;
      end
      thetabar=thetabar+theta;
   end
   theta
   thetabar=thetabar/IA;
   % ACCUMULATION OF TERMINAL ERROR AND LOSS STATISTICS FOR THIS REPLICATION
   meanHbar=meanHbar+Hbar;
   errtheta=errtheta+(theta-truetheta)'*(theta-truetheta);
   errthetaIA=errthetaIA+(thetabar-truetheta)'*(thetabar-truetheta);
   errthetaH=errthetaH+(thetaH-truetheta)'*(thetaH-truetheta);
   lossthetasq=lossthetasq+loss4thorder(theta)^2;
   lossthetaIAsq=lossthetaIAsq+loss4thorder(thetabar)^2;
   lossthetaHsq=lossthetaHsq+loss4thorder(thetaH)^2;
   losstheta=losstheta+loss4thorder(theta);
   lossthetaIA=lossthetaIA+loss4thorder(thetabar);
   lossthetaH=lossthetaH+loss4thorder(thetaH);
end
meanHbar/cases
% normalized results of 1SGSA and 2SGSA (RMS terminal error relative to the initial error norm)
norm_theta=((errtheta/cases)^.5)/((theta_0-truetheta)'*(theta_0-truetheta))^.5
norm_thetaIA=((errthetaIA/cases)^.5)/((theta_0-truetheta)'*(theta_0-truetheta))^.5
norm_thetaH=((errthetaH/cases)^.5)/((theta_0-truetheta)'*(theta_0-truetheta))^.5
% standard dev. of the mean of the normalized loss values; these are multiplied by
% (cases/(cases-1))^.5 to account for the loss of a degree of freedom in the standard
% deviation calculation before use with a t-test
if cases > 1;
   (cases^(-.5))*((cases/(cases-1))^.5)*(lossthetasq/(cases*loss4thorder(theta_0)^2)-(losstheta/(cases*loss4thorder(theta_0)))^2)^.5
   (cases^(-.5))*((cases/(cases-1))^.5)*(lossthetaIAsq/(cases*loss4thorder(theta_0)^2)-(lossthetaIA/(cases*loss4thorder(theta_0)))^2)^.5
   (cases^(-.5))*((cases/(cases-1))^.5)*(lossthetaHsq/(cases*loss4thorder(theta_0)^2)-(lossthetaH/(cases*loss4thorder(theta_0)))^2)^.5
end
norm_losstheta=losstheta/(cases*loss4thorder(theta_0))
norm_lossthetaIA=lossthetaIA/(cases*loss4thorder(theta_0))
norm_lossthetaH=lossthetaH/(cases*loss4thorder(theta_0))
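%
% The loss-related functions loss4thorder, loss4thgrad, and loss4thnoise are not defined in this
% file; in the original distribution they are separate .m files. The sketches below are
% placeholders so the script can be run stand-alone (as local functions, MATLAB R2016b or later):
% they ASSUME the skewed-quartic loss commonly used as a test function in ISSO and a simple
% additive Gaussian noise model scaled by the global sigma, which may differ from the functions
% used in the original studies (the global z declared above is not used by these placeholders).
%
function L=loss4thorder(theta)
% Noise-free skewed-quartic loss (assumed form): L = y'y + 0.1*sum(y.^3) + 0.01*sum(y.^4), y = B*theta
global p
B=triu(ones(p))/p;
y=B*theta;
L=y'*y+.1*sum(y.^3)+.01*sum(y.^4);
end
%
function g=loss4thgrad(theta)
% Noisy measurement of the gradient of the loss above; additive N(0,sigma^2*I) noise (assumed model)
global p sigma
B=triu(ones(p))/p;
y=B*theta;
g=2*(B'*y)+.3*(B'*(y.^2))+.04*(B'*(y.^3))+sigma*randn(p,1);
end
%
function L=loss4thnoise(theta)
% Noisy loss measurement used in the "blocking" test; additive N(0,sigma^2) noise (assumed model)
global sigma
L=loss4thorder(theta)+sigma*randn;
end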