% Starter script for solving composite optimization problems using
% the Accelerated Composite Gradient Method (ACGM), introduced in:
%
% [FV19] M. I. Florea and S. A. Vorobyov, "An accelerated composite
%        gradient method for large-scale composite objective problems,"
%        IEEE Trans. Signal Process., vol. 67, pp. 444–459, 2019.
%
% [FV20] M. I. Florea and S. A. Vorobyov,
%        "A generalized accelerated composite gradient method:
%        Uniting Nesterov's fast gradient method and FISTA,"
%        IEEE Trans. Signal Process., vol. 68, pp. 3033-3048, 2020.

%   Author: Mihai I. Florea, Aalto University
%   Version: 10 June 2021

% In this script we set up an instance of the l1-regularized logistic
% regression problem (L1LR) as tested in [FV20]. We run monotone ACGM
% and backtracking FISTA and compare the rates of convergence measured
% in iterations using a simple plot.


% WARNING! Running this script closes all the windows, erases all the
% variables from memory and clears the command window.
% If this behavior is not desired, the following 3 commands should be
% commented out.
clc();          % wipe the command window
clear('all');   % drop every variable held in memory
close('all');   % dismiss all open figure windows

% = = = = = = = [SECTION 1] OPTIMIZATION PROBLEM SETUP = = = = = = = %

% The problem instance is loaded from the data file "L1LR.dat".
% This code can be replaced with a custom problem setup procedure
% to suit your application.
% Announce the setup phase, create the problem object and read the
% variable "data" from the MAT-formatted file "L1LR.dat".
fprintf('Setting up the problem...\r');
pb          = Problem_L1LR;
data_struct = load('L1LR.dat', 'data', '-mat');
data        = data_struct.data;

% Hand the stored instance over to the problem object, field by field.
pb.m  = data.m;
pb.n  = data.n;
pb.A  = data.A;
pb.y  = data.y;
pb.l1 = data.l1;
pb.l2 = 0;

% Both arguments of set_mu are zero for this instance; the Lipschitz
% constant is a quarter of the value stored in data.A_s2.
pb.set_mu(0, 0);
pb.L = 0.25 * data.A_s2;
fprintf('done.\r');

% = = = = = = = [SECTION 2] ACGM PARAMETER SETUP = = = = = = = %

% = Non problem-specific algorithm parameters (should not be changed) = %

MAX_BACKTRACKS  = 50;   % Cap on the backtracking attempts made in one iteration
ALGO_EPS        = 1e-9; % Numerical tolerance used to detect the border case
                        % and to test the Lipschitz estimate for feasibility
L_INC_THRESHOLD = 1.02; % Backtracking is switched off automatically whenever
                        % r_u (see [FV20]) does not exceed this threshold

% = = = = = Problem-specific algorithm parameters = = = = = %

K        = 200;   % Iteration budget
monotone = 1;     % 1 enforces monotonicity, 0 switches it off
L_0      = pb.L;  % Lipschitz constant estimate used at start-up

L_inc    = 2;     % r_u, see [FV20]
L_dec    = 0.9;   % r_d, see [FV20]

x_0      = data.x_0;          % x_0 of [FV20], the starting point
F_0      = pb.F_val(x_0, []); % Composite objective evaluated at x_0;
                              % this value is allowed to be infinite.

gamma_0  = 1; % \gamma_0; keep at 1 unless the ACGM code below is being
              % used to implement a variant of Nesterov's Fast Gradient
              % Method
A_0      = 0; % A_0; zero accommodates an infeasible x_0. With a feasible
              % x_0, larger values of A_0 sometimes speed up convergence.
mu       = pb.mu;     % \mu, the global strong convexity
                      % parameter (s.c.p.) made available to ACGM
mu_Psi   = pb.mu_Psi; % \mu_{Psi}, the s.c.p. of the regularizer

% = = = Variables that record algorithm state = = = %

F_ACGM  = zeros(K, 1); % Objective value after each of the K iterations
% Note that the zeroth iteration is not stored because F(x_0) may be infinite

% (optional) add here initializations for variables that record
% various state parameters, such as the Lipschitz constant estimates

% = = = = = = = [SECTION 3] ACGM IMPLEMENTATION = = = = = = = %

% LIST OF VARIABLES:
%
% VARIABLE NAME | MEANING DURING ITERATION k as per [FV20]
% --------------------------------------------------------
% x_new         | x_{k + 1}
% x_old         | x_{k}
% x_anc         | x_{k - 1}
% y             | y_{k + 1}
% y_step        | y_{k + 1} - (1 / L_{k + 1}) \nabla f(y_{k + 1})
% d             | d_{k + 1}
% tau           | 1 / L_{k + 1}
% alpha         | 1 - q_{k} t_{k}^2
% beta          | \beta_k
% omega         | \omega_k
% q_new         | q_{k + 1}
% q_old         | q_{k}
% t_new         | t_{k + 1}
% t_old         | t_{k}
% L_new         | L_{k + 1}
% L_old         | L_{k}
% F_val_x_new   | F(x_{k + 1})
% F_val_x_old   | F(x_{k})
% f_val_x_new   | f(x_{k + 1})
% f_val_y       | f(y_{k + 1})
% f_grad_y      | \nabla f(y_{k + 1})
% A_new         | A_{k + 1}
% A_old         | A_{k}
% A_scale       | \gamma_0 - A_0 \mu
% --------------------------------------------------------

fprintf('Running ACGM...\r');

% -------- Initialize algorithm state (k = 0) -------- %

% Both the current and the previous iterate start at x_0, so the first
% extrapolation step has nothing to extrapolate from.
x_old       = x_0;
x_anc       = x_0;
F_val_x_old = F_0;
L_old       = L_0;

% A_scale = \gamma_0 - A_0 \mu; its sign and magnitude select the mode below.
A_scale     = gamma_0 - A_0 * mu;

% Automatically detect from the input parameters if we are dealing with
% the border case corresponding to gamma_0 == A_0 * mu
border_case = 0; % border_case equals 1 in the border case, 0 otherwise
if (A_scale < 0)
    % The two messages are printed back to back; the trailing space on the
    % first one keeps the sentences from running together in the output.
    fprintf('Input parameter error: gamma_0 < A_0 * mu ! ');
    fprintf('Cannot formulate the convergence guarantee.\r');
    return;
elseif (A_scale < ALGO_EPS) % A_scale is approximately zero
    border_case = 1;
    % A_old is only needed before the first iteration in the border case,
    % where A_new is obtained recursively from A_old.
    A_old       = A_0;
end

% q_0 and t_0, computed from the initial Lipschitz estimate
q_old = mu / (L_old + mu_Psi);
t_old = sqrt((L_old + mu_Psi) * A_0 / gamma_0);

% The direction vector d is only used by the monotone variant
if monotone == 1
    d = zeros(size(x_0));
end

% = = = Iterate = = = %

for k = 1:K                          % k is the iteration counter
    % Start the search from the previous estimate scaled by r_d
    L_new      = L_old * L_dec;
    ls_success = false;              % becomes true once an estimate is accepted
    for b = 1:MAX_BACKTRACKS         % b is the backtrack counter
        tau     = 1.0 / L_new;
        alpha   = 1 - q_old * t_old ^ 2;
        q_new   = mu / (L_new + mu_Psi);
        t_new   = 0.5 * ( alpha + sqrt(alpha * alpha + ...
                  4 * (L_new + mu_Psi) / (L_old + mu_Psi) * t_old ^ 2) );
        if monotone == 0
            % Extrapolate along the difference of the last two iterates
            beta  = ( (t_old - 1) / t_new ) * ...
                    ( (1 - q_new * t_new) / (1 - q_new) );
            y     = x_old + beta * (x_old - x_anc);
        else
            % Extrapolate along the stored direction d
            omega =  (1 - q_new * t_new) / ((1 - q_new) * t_new);
            y     = x_old + omega * d;
        end

        % Proximal gradient step taken from y with step size tau
        f_grad_y    = pb.f_grad(y);
        y_step      = y - tau * f_grad_y;
        x_new       = pb.prox(y_step, tau);
        f_val_x_new = pb.f_val(x_new);
        f_val_y     = pb.f_val(y);

        % Backtracking disabled: accept the very first candidate
        if (L_inc <= L_INC_THRESHOLD)
            ls_success = true;
            break;
        end
        % pb.LSSC decides whether the current estimate L_new is accepted
        if pb.LSSC(x_new, y, f_val_x_new, f_val_y, f_grad_y, L_new)
            ls_success = true;
            break;
        else
            % (optional) you may add here code to record backtracks
            L_new = L_new * L_inc;
        end
    end
    % The search fails when the estimate grew beyond its admissible bound or
    % when every permitted attempt was rejected. An explicit flag is used
    % because the value of b alone cannot distinguish "accepted on the last
    % attempt" from "all attempts exhausted".
    if (L_new > max(pb.L * L_inc, L_0) + ALGO_EPS) || ~ls_success
        fprintf('Error: Line-search failed at iteration [%4d]\r', k);
        return;
    end

    % = = = Update the convergence guarantee = = = %
    if border_case == 0
        A_new = A_scale * (t_new ^ 2) / ...
               ( (L_new + mu_Psi) * (1 - q_new * t_new ^ 2) );
    else
        % Border case: A_new is obtained recursively from A_old. The factor
        % below equals 1 / (1 - sqrt(q_new)), since
        % q_new = mu / (L_new + mu_Psi). sqrt_mu has to be computed here
        % because it is not assigned anywhere else in the script.
        sqrt_L_p_new = sqrt(L_new + mu_Psi);
        sqrt_mu      = sqrt(mu);
        A_new = sqrt_L_p_new / (sqrt_L_p_new - sqrt_mu) * A_old;
        A_old = A_new;
    end

    % = = = Test the monotone condition = = = %
    F_val_x_new = pb.F_val(x_new, f_val_x_new);
    if monotone == 1
        overshoot = (F_val_x_new > F_val_x_old);
        % d uses the factor t_new on overshoot and (t_new - 1) otherwise;
        % it must be formed before x_new is possibly reverted below.
        d         = (t_new - (~overshoot)) * (x_new - x_old);
        if overshoot
            % (optional) you may add here code to record overshoots
            % Reject the candidate: keep the previous iterate and its value
            x_new       = x_old;
            F_val_x_new = F_val_x_old;
        end
    end

    % = = = Record new algorithm state = = = %

    % (optional) you may add here code to record other algorithm parameters
    F_ACGM(k) = F_val_x_new;

    % = = Update the old state for the next iteration = = %
    x_anc = x_old;
    x_old = x_new;
    if monotone == 1
        F_val_x_old = F_val_x_new;
    end
    q_old = q_new;
    t_old = t_new;
    L_old = L_new;
    A_old = A_new;
end

fprintf('done.\r');
% = = = = = = = =  End of ACGM implementation = = = = = = = = %

% = = = = = = = [SECTION 4] FISTA IMPLEMENTATION = = = = = = = %
% (Backtracking FISTA implementation is included for comparison)

fprintf('Running FISTA...\r');

F_FISTA = zeros(K, 1); % Objective value after each of the K iterations

% Initial state: all points coincide with x_0 and t starts at 1
x_old   = x_0;
x_anc   = x_0;   % maintained for symmetry with ACGM; not read in this section
y       = x_0;
L_old   = L_0;
t_old   = 1.0;

for k = 1:K
    % The search starts from the previous estimate (no decrease step here)
    L_new      = L_old;
    f_grad_y   = pb.f_grad(y);
    ls_success = false;              % becomes true once an estimate is accepted

    for b = 1:MAX_BACKTRACKS
        % Proximal gradient step taken from y with step size tau
        tau         = 1.0 / L_new;
        y_step      = y - tau * f_grad_y;
        x_new       = pb.prox(y_step, tau);
        f_val_x_new = pb.f_val(x_new);
        % NOTE(review): y does not change inside this loop, so this call
        % could be hoisted next to f_grad_y if pb.f_val is free of side
        % effects - confirm against the Problem_L1LR implementation.
        f_val_y     = pb.f_val(y);

        % Backtracking disabled: accept the very first candidate
        if (L_inc <= L_INC_THRESHOLD)
            ls_success = true;
            break;
        end
        % pb.LSSC decides whether the current estimate L_new is accepted
        if pb.LSSC(x_new, y, f_val_x_new, f_val_y, f_grad_y, L_new)
            ls_success = true;
            break;
        else
            % (optional) you may add here code to record backtracks
            L_new = L_new * L_inc;
        end
    end
    % The search fails when the estimate grew beyond its admissible bound or
    % when every permitted attempt was rejected. An explicit flag is used
    % because the value of b alone cannot distinguish "accepted on the last
    % attempt" from "all attempts exhausted".
    if (L_new > max(pb.L * L_inc, L_0) + ALGO_EPS) || ~ls_success
        fprintf('Error: Line-search failed at iteration [%4d]\r', k);
        return;
    end

    % Momentum update and extrapolated point for the next iteration
    t_new = 0.5 * (1 + sqrt(1 + 4 * t_old * t_old));
    y     = x_new + ((t_old - 1) ./ t_new) .* (x_new - x_old);

    % (optional) you may add here code to record other algorithm parameters
    F_FISTA(k) = pb.F_val(x_new, f_val_x_new);

    % = = Update the old state for the next iteration = = %
    x_anc = x_old;
    x_old = x_new;
    L_old = L_new;
    t_old = t_new;
end

fprintf('done.\r');
% = = = = = = = End of Backtracking FISTA implementation = = = = = = = %

% = = = = = = = [SECTION 5] RESULT DISPLAY = = = = = = = %

fprintf('Plotting the results...\r');

% The reference optimal value ships with the data file "L1LR.dat".
% In a custom implementation it can be estimated by running monotone
% ACGM for a large number of iterations.
F_star = data.f_star;

% Both curves are prefixed with the zeroth iteration. Drop that entry if
% the starting point is not known to be feasible.
figure('Name', 'Iteration convergence rate');
semilogy(0:K, [F_0 - F_star; F_FISTA - F_star], ...
    'Color', 'red', 'LineWidth', 1.5, 'LineStyle', '-');
hold('on');
semilogy(0:K, [F_0 - F_star; F_ACGM  - F_star], ...
    'Color', 'black', 'LineWidth', 1.5, 'LineStyle', '-');
hold('off');

% Axis annotation and a frameless legend in the lower-left corner
xlabel('Iteration k');
ylabel('F(x_k) - F^*');
legend('FISTA-BT', 'ACGM', 'Location', 'SouthWest');
legend('boxoff');

% (optional) you may add here plots related to other algorithm state
% parameters, such as the Lipschitz constant estimate at each iteration.

fprintf('done.\r');
