From 3e7facaa84a719b0b59a6ef19bad1d63c767bd56 Mon Sep 17 00:00:00 2001 From: Dominique Orban Date: Sat, 19 Apr 2025 10:58:17 -0400 Subject: [PATCH] results for R2 --- paper/defs.sty | 2 +- paper/report.bib | 91 +++++++++ paper/report.tex | 502 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 584 insertions(+), 11 deletions(-) diff --git a/paper/defs.sty b/paper/defs.sty index d6d8e98..78360e2 100644 --- a/paper/defs.sty +++ b/paper/defs.sty @@ -58,7 +58,7 @@ %\renewcommand{\year}{2020} % If you don't want the current year. \newcommand{\cahiernumber}{00} % Insert your Cahier du GERAD number. -\usepackage[margin=1in]{geometry} +\usepackage[margin=1.5in]{geometry} \usepackage[T1]{fontenc} \usepackage{amsthm} % must come before newpxtext and newpxmath \usepackage{amssymb} diff --git a/paper/report.bib b/paper/report.bib index e69de29..a21cf8e 100644 --- a/paper/report.bib +++ b/paper/report.bib @@ -0,0 +1,91 @@ +@Article{ aravkin-baraldi-orban-2022, + Author = {Aravkin, A Y and Baraldi, R and Orban, D}, + Title = {A Proximal Quasi-{N}ewton Trust-Region Method for Nonsmooth Regularized Optimization}, + Journal = siopt, + Year = 2022, + Volume = 32, + Number = 2, + Pages = {900--929}, + doi = {10.1137/21M1409536}, + abstract = { We develop a trust-region method for minimizing the sum of a smooth term (f) and a nonsmooth term (h), both of which can be nonconvex. Each iteration of our method minimizes a possibly nonconvex model of (f + h) in a trust region. The model coincides with (f + h) in value and subdifferential at the center. We establish global convergence to a first-order stationary point when (f) satisfies a smoothness condition that holds, in particular, when it has a Lipschitz-continuous gradient, and (h) is proper and lower semicontinuous. The model of (h) is required to be proper, lower semi-continuous and prox-bounded. 
Under these weak assumptions, we establish a worst-case (O(1/\epsilon^2)) iteration complexity bound that matches the best known complexity bound of standard trust-region methods for smooth optimization. We detail a special instance, named TR-PG, in which we use a limited-memory quasi-Newton model of (f) and compute a step with the proximal gradient method, + resulting in a practical proximal quasi-Newton method. We establish similar convergence properties and complexity bound for a quadratic regularization variant, named R2, and provide an interpretation as a proximal gradient method with adaptive step size for nonconvex problems. R2 may also be used to compute steps inside the trust-region method, resulting in an implementation named TR-R2. We describe our Julia implementations and report numerical results on inverse problems from sparse optimization and signal processing. Both TR-PG and TR-R2 exhibit promising performance and compare favorably with two linesearch proximal quasi-Newton methods based on convex models. }, +} + +@article{aravkin-baraldi-orban-2024, + author = {Aravkin, Aleksandr Y. and Baraldi, Robert and Orban, Dominique}, + title = {A {L}evenberg–{M}arquardt Method for Nonsmooth Regularized Least Squares}, + journal = sisc, + volume = {46}, + number = {4}, + pages = {A2557--A2581}, + year = {2024}, + doi = {10.1137/22M1538971}, + preprint = {https://www.gerad.ca/en/papers/G-2022-58/view}, + abstract = { Abstract. We develop a Levenberg–Marquardt method for minimizing the sum of a smooth nonlinear least-squares term \(f(x) = \frac{1}{2} \|F(x)\|\_2^2\) and a nonsmooth term \(h\). Both \(f\) and \(h\) may be nonconvex. Steps are computed by minimizing the sum of a regularized linear least-squares model and a model of \(h\) using a first-order method such as the proximal gradient method. 
We establish global convergence to a first-order stationary point under the assumptions that \(F\) and its Jacobian are Lipschitz continuous and \(h\) is proper and lower semicontinuous. In the worst case, our method performs \(O(\epsilon^{-2})\) iterations to bring a measure of stationarity below \(\epsilon \in (0, 1)\). We also derive a trust-region variant that enjoys similar asymptotic worst-case iteration complexity as a special case of the trust-region algorithm of Aravkin, Baraldi, and Orban [SIAM J. Optim., 32 (2022), pp. 900–929]. We report numerical results on three examples: a group-lasso basis-pursuit denoise example, a nonlinear support vector machine, and parameter estimation in a neuroscience application. To implement those examples, we describe in detail how to evaluate proximal operators for separable \(h\) and for the group lasso with trust-region constraint. In all cases, the Levenberg–Marquardt methods perform fewer outer iterations than either a proximal gradient method with adaptive step length or a quasi-Newton trust-region method, neither of which exploit the least-squares structure of the problem. Our results also highlight the need for more sophisticated subproblem solvers than simple first-order methods. } +} + +@TechReport{ aravkin-baraldi-leconte-orban-2021, + Author = {Aravkin, Aleksandr Y. and Baraldi, Robert and Leconte, Geoffroy and Orban, Dominique}, + Title = {Corrigendum: A proximal quasi-{N}ewton trust-region method for nonsmooth regularized optimization}, + Institution = gerad, + Year = 2024, + Type = {Cahier}, + Number = {G-2021-12-SM}, + Address = gerad-address, + Pages = {1--3}, + doi = {10.13140/RG.2.2.36250.45768}, +} + +@Book{ cartis-gould-toint-2022, + Author = {Cartis, Coralia and Gould, Nicholas I. M. 
and Toint, {\relax Ph}ilippe L.}, + Title = {Evaluation Complexity of Algorithms for Nonconvex Optimization}, + Publisher = siam, + Year = 2022, + Series = {MOS-SIAM Series on Optimization}, + Address = siam-address, + doi = {10.1137/1.9781611976991}, + Number = 30, +} + +@techreport{diouane-habiboullah-orban-2024a, + author = {Y. Diouane and M. L. Habiboullah and D. Orban}, + pages = {}, + title = {Complexity of trust-region methods in the presence of unbounded {H}essian approximations}, + year = {2024}, + number = {G-2024-43}, + type = {Cahier}, + institution = gerad, + address = gerad-address, + doi = {}, + preprint = {https://www.gerad.ca/en/papers/G-2024-43}, +} + +@techreport{diouane-habiboullah-orban-2024b, + author = {Y. Diouane and M. L. Habiboullah and D. Orban}, + pages = {}, + title = {A Proximal Modified Quasi-{N}ewton Method for Nonsmooth Regularized Optimization}, + year = {2024}, + number = {G-2024-64}, + type = {Cahier}, + institution = gerad, + address = gerad-address, + doi = {10.13140/RG.2.2.21140.51840}, + preprint = {https://www.gerad.ca/en/papers/G-2024-64}, +} + +@incollection{wright-2018, + author = {S. J. Wright}, + title = {Optimization Algorithms for Data Analysis}, + booktitle = {The Mathematics of Data}, + publisher = ams, + editor = {M. W. Mahoney and J. C. Duchi and A. C. 
Gilbert}, + volume = {25}, + number = {}, + series = {IAS/Park City Mathematics Series}, + chapter = {2}, + pages = {49--98}, + address = ams-address, + edition = {1st}, + year = {2018}, + doi = {10.1090/pcms/025/00830}, +} diff --git a/paper/report.tex b/paper/report.tex index 7e0fcae..5f9290f 100644 --- a/paper/report.tex +++ b/paper/report.tex @@ -1,8 +1,16 @@ \documentclass[10pt]{article} \usepackage{defs} +% https://tex.stackexchange.com/a/4881/2701 +\makeatletter +\newcommand*{\eqdef}{\mathrel{\vcenter{\baselineskip0.5ex \lineskiplimit0pt + \hbox{\scriptsize.}\hbox{\scriptsize.}}}% + =} +\makeatother +\newcommand{\skcp}{s_{k, \textup{cp}}} + \newcommand{\papertitle}{% - Insert Your Title Here + Complexity of Methods for General Optimization in the Presence of Convexity } % For debugging. @@ -11,9 +19,9 @@ % Meta-information for the PDF file generated.. \hypersetup{ pdftitle={\papertitle}, - pdfauthor={Author One and Author Two}, - pdfsubject={Report Subject}, - pdfkeywords={Keyword1, keyword2, keyword3}, + pdfauthor={Youssef Diouane and Mohamed Laghdaf Habiboullah and Dominique Orban}, + pdfsubject={Worst-Case Evaluation Complexity}, + pdfkeywords={complexity, convexity}, } % \usepackage{newunicodechar} @@ -21,8 +29,13 @@ \title{\papertitle} \author{% - Author One\footnote{% - GERAD and Department of Mathematics and Industrial Engineering, Polytechnique Montr\'eal. E-mail: \href{mailto:geoffroy.leconte@polymtl.ca}{geoffroy.leconte@polymtl.ca}. + Youssef Diouane\footnote{% + GERAD and Department of Mathematics and Industrial Engineering, Polytechnique Montr\'eal. E-mail: \href{mailto:youssef.diouane@polymtl.ca}{youssef.diouane@polymtl.ca}. + } + \thanks{Research supported by an NSERC Discovery grant.} + \and + Mohamed L. Habiboullah\footnote{% + GERAD and Department of Mathematics and Industrial Engineering, Polytechnique Montr\'eal. E-mail: \href{mailto:mohamed.habiboullah@polymtl.ca}{mohamed.habiboullah@polymtl.ca}. 
} \and Dominique Orban\footnote{% @@ -38,6 +51,11 @@ \thispagestyle{mytitlepage} \begin{abstract} + Methods for unconstrained convex optimization are typically more straightforward than methods designed for the same problem class but in which the objective may be nonconvex. + Accordingly, the former benefit from stronger convergence results and more favorable worst-case complexity results. + But do methods designed for general problems have worse worst-case complexity when applied to convex problems than methods designed for convex problems? + If so, how much worse? + In this research, we study and provide answers to these legitimate questions. \end{abstract} % Résumé en français pour le Cahier du GERAD @@ -46,21 +64,477 @@ \pagestyle{myheadings} +\tableofcontents + \section{Introduction}% \label{sec:introduction} +We consider the unconstrained problem +\begin{equation}% + \label{eq:nlo} + \minimize{x \in \R^n} \ f(x), +\end{equation} +where \(f: \R^n \to \R\) is \(C^1\), and study the worst-case evaluation complexity of methods initially designed for general \(f\) when applied to convex \(f\). +We assume that \(\nabla f\) is Lipschitz-continuous with constant \(L \geq 0\). + \subsection*{Related research} \subsection*{Notation} +For a set \(\mathcal{S}\), we denote by \(|\mathcal{S}|\) its cardinality. + \section{Background}% \label{sec:background} +\citet{cartis-gould-toint-2022} and \citet{diouane-habiboullah-orban-2024a} study the worst-case evaluation complexity of trust-region and regularization methods when applied to convex \(f\) under the assumption that there exists \(R \geq 1\) such that the level set \(\{x \in \R^n \mid f(x) \leq f(x_0)\}\) is contained within the ball centered at \(x_\star\) of radius \(R\). +Such a bounded level set assumption is not particularly related to strict or strong convexity. 
+For instance, the function of one variable +\[ + f(x) = + \begin{cases} + x^2 & \text{ if } x < 0 \\ + 0 & \text{ if } x \geq 0, + \end{cases} +\] +has a well-defined \(x_\star = 0\) (or, indeed, any \(x \geq 0\)), is convex but not strictly convex, but does not satisfy the assumption. +The function \(f(x) = x^4\) is strictly, but not strongly, convex and satisfies the assumption. +The function \(f(x) = |x|\) is not strictly convex but satisfies the assumption. +Any smooth and strongly convex \(f\) for which a (global) minimum is attained satisfies the assumption. +Indeed, for such \(f\), there exists \(\mu > 0\) such that, for all \(y \in \R^n\), +\[ + f(y) \geq f(x_\star) + \nabla f(x_\star)^T (y - x_\star) + \tfrac{1}{2} \mu \|y - x_\star\|^2 = f(x_\star) + \tfrac{1}{2} \mu \|y - x_\star\|^2. +\] +Thus, if \(x\) is such that \(f(x) \leq f(x_0)\), +\[ + \tfrac{1}{2} \mu \|x - x_\star\|^2 \leq f(x) - f(x_\star) \leq f(x_0) - f(x_\star), +\] +so that we may pick \(R = \max(1, \, \sqrt{2 (f(x_0) - f(x_\star)) / \mu})\). + +If \(f\) is strongly convex with modulus \(\mu > 0\), it possesses a unique minimizer \(x_\star\) \citep[Theorem~\(3.3.14\)]{wright-2018}. +For all \(x\), \(y \in \R^n\), +\[ + f(y) \geq f(x) + \nabla f(x)^T (y - x) + \tfrac{1}{2} \mu \|y - x\|^2. +\] +We minimize each side with respect to \(y\), note that the minimum of the right-hand side is attained for \(y = x - \mu^{-1} \nabla f(x)\), and obtain +\[ + f(x_\star) \geq f(x) - \tfrac{1}{2} \mu^{-1} \|\nabla f(x)\|^2. +\] +Thus, for all \(x \in \R^n\), +\begin{equation}% + \label{eq:strongly-convex-gradient} + \|\nabla f(x)\|^2 \geq 2 \mu (f(x) - f(x_\star)). +\end{equation} + +In addition, if \(f\) is strongly convex with modulus \(\mu > 0\) and \(\nabla f\) is Lipschitz-continuous with constant \(L \geq 0\), then \(L \geq \mu\). 
+Indeed, for all \(x\), \(s \in \R^n\), +\begin{equation}% + \label{eq:strongly-convex-quadratic-bounds} + f(x) + \nabla f(x)^T s + \tfrac{1}{2} \mu \|s\|^2 \leq f(x + s) \leq f(x) + \nabla f(x)^T s + \tfrac{1}{2} L \|s\|^2. +\end{equation} +If \(x = x_\star\) is a stationary point, hence a global minimizer,~\eqref{eq:strongly-convex-quadratic-bounds} can be written +\[ + \tfrac{1}{2} \mu \|x - x_\star\|^2 \leq f(x) - f(x_\star) \leq \tfrac{1}{2} L \|x - x_\star\|^2 + \quad \text{for all} \quad x \in \R^n. +\] + +\section{Preliminary Results}% +\label{sec:prelims} + +Because \(\nabla f\) is Lipschitz-continuous, for all \(x\), \(s \in \R^n\), +\begin{equation}% + \label{eq:lipschitz} + |f(x + s) - f(x) - \nabla f(x)^T s| \leq \tfrac{1}{2} L \|s\|^2. +\end{equation} + +Consider a method that generates iterates according to \(x_{k+1} = x_k + s_k\) where \(s_k\) is the step at iteration \(k\). +Let \(x_\star\) be fixed. +We will use the identity +\begin{equation}% + \label{eq:diff-distance} + \|x_k - x_\star\|^2 - \|x_{k+1} - x_\star\|^2 = + \|x_k - x_\star\|^2 - \|x_k + s_k - x_\star\|^2 = + -2 s_k^T (x_k - x_\star) - \|s_k\|^2 +\end{equation} +repeatedly. + +We consider methods for~\eqref{eq:nlo} that generate \(\{x_k\}\) such that \(\{f(x_k)\}\) is nonincreasing. +Because not all steps are accepted, we denote \(q_k\) the index of the \(k\)-th iteration where the step is accepted, i.e., \(x_{q_k + 1} = x_{q_k} + s_{q_k}\). +Such an iteration is called \emph{successful}. +Note that \(q_{k+1} \geq q_k + 1\), and therefore, \(\{q_k\}\) is increasing. +On a successful iteration \(k\), \(f(x_{k+1}) < f(x_k)\). +On an \emph{unsuccessful} iteration \(k\), the step \(s_k\) is rejected and \(x_{k+1} = x_k\). +Therefore, +\begin{equation}% + \label{eq:qk-identity} + x_{q_k + 1} = x_{q_{k + 1}} + \quad (k \in \N). +\end{equation} +We refer to a method possessing the features above as a \emph{descent} method. 
+ +The following result appears inside the proof of \citep[Theorem~\(4.3.1\)]{wright-2018} but does not require convexity of \(f\). + +\begin{lemma}% + \label{lem:1/N-complexity} + Consider a descent method for~\eqref{eq:nlo}. + Let \(x_\star \in \R^n\) be such that \(f(x_k) \geq f(x_\star)\) for all \(k\). + Assume that there exists a constant \(C > 0\) such that, for all \(k\), + \begin{equation}% + \label{eq:decrease} + f(x_{q_k + 1}) \leq f(x_\star) + C (\|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2). + \end{equation} + For all integer \(N > 0\), + \begin{equation}% + \label{eq:1/N-complexity} + f(x_{q_N}) - f(x_\star) \leq \frac{C}{N} \|x_0 - x_\star\|^2. + \end{equation} +\end{lemma} + +\begin{proof} + Let \(N > 0\). + We sum~\eqref{eq:decrease} over \(k = 0, \ldots, N-1\), use~\eqref{eq:qk-identity} to recover a telescoping sum, and obtain + \begin{align*} + \sum_{k=0}^{N-1} (f(x_{q_k+1}) - f(x_\star)) & \leq C \sum_{k=0}^{N-1} (\|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2) + \\ & = C \sum_{k=0}^{N-1} (\|x_{q_k} - x_\star\|^2 - \|x_{q_{k + 1}} - x_\star\|^2) + \\ & = C (\|x_0 - x_\star\|^2 - \|x_{q_N} - x_\star\|^2) + \\ & \leq C \|x_0 - x_\star\|^2, + \end{align*} + The result follows by noting that because \(\{f(x_k)\}\) is nonincreasing, + \begin{align*} + f(x_{q_N}) - f(x_\star) \leq \frac{1}{N} \sum_{k=0}^{N-1} (f(x_{q_k + 1}) - f(x_\star)). + \tag*{\qedhere} + \end{align*} +\end{proof} + +\section{R2}% +\label{sec:r2} + +It is instructive to begin with a first-order method. +The main differences between the analysis of R2 and that of \citet[\S\(4.3\)]{wright-2018} are that R2 is not a fixed-steplength method and not all steps are accepted, i.e., sufficient decrease may not occur at every iteration. +The main advantage of R2 over fixed-steplength methods is that it removes the need to know the Lipschitz constant of \(\nabla f\). 
+At each iteration, R2 forms the first-order approximation +\begin{equation}% + \label{eq:phi-r2} + \varphi(s; x_k) \eqdef f(x_k) + \nabla f(x_k)^T s \approx f(x_k + s), +\end{equation} +and computes a step by minimizing the model +\begin{equation}% + \label{eq:model-r2} + m(s; x_k, \sigma_k) \eqdef \varphi(s; x_k) + \tfrac{1}{2} \sigma_k \|s\|^2, +\end{equation} +where \(\sigma_k > 0\) is a regularization parameter. +The situation here is sufficiently simple that the explicit formula +\begin{equation}% + \label{eq:step-r2} + s_k \eqdef -\sigma_k^{-1} \nabla f(x_k) +\end{equation} +emerges for the step. +The R2 algorithm is stated as \Cref{alg:r2}. + +\begin{algorithm} + \caption[caption]{% + R2: Quadratic Regularization.% + \label{alg:r2} + } + \begin{algorithmic}[1] + \State Choose constants \(0 < \eta_1 \leq \eta_2 < 1\) and \(0 < \gamma_3 \leq 1 < \gamma_1 \leq \gamma_2\). + \State% + \label{alg:r2-init} + Choose \(x_0 \in \R^n\), \(\sigma_0 > 0\), compute \(f(x_0)\). + \For{\(k = 0, 1, \dots\)} + \State Compute the step \(s_k \eqdef -\sigma_k^{-1} \nabla f(x_k)\). + \State Compute the ratio + \[ + \rho_k := + \frac{ + f(x_k) - f(x_k + s_k) + }{ + \varphi(0; x_k) - \varphi(s_k; x_k) + }. + \] + \State If \(\rho_k \geq \eta_1\), set \(x_{k+1} = x_k + s_k\). Otherwise, set \(x_{k+1} = x_k\). + \State% + \label{alg:r2-update-sigma} + Update the regularization parameter according to + \[ + \sigma_{k+1} \in + \begin{cases} + \begin{aligned} + & [\gamma_3 \sigma_k, \, \sigma_k] & & \text{ if } \rho_k \geq \eta_2, & & \quad \text{very successful iteration} \\ + & [\sigma_k, \, \gamma_1 \sigma_k] & & \text{ if } \eta_1 \leq \rho_k < \eta_2, & & \quad \text{successful iteration} \\ + & [\gamma_1 \sigma_k, \, \gamma_2 \sigma_k] & & \text{ if } \rho_k < \eta_1. 
& & \quad \text{unsuccessful iteration} + \end{aligned} + \end{cases} + \] + \EndFor + \end{algorithmic} +\end{algorithm} + +\subsection{Worst-Case Complexity for Convex \(\boldmath f\)} + +In the denominator of \(\rho_k\), note that +\begin{equation}% + \label{eq:diff-phi-r2} + \varphi(0; x_k) - \varphi(s_k; x_k) = \sigma_k^{-1} \|\nabla f(x_k)\|^2 = \sigma_k \|s_k\|^2. +\end{equation} + +Under Lipschitz-continuity of \(\nabla f\), \citet[Theorem~\(6.2\)]{aravkin-baraldi-orban-2022} establish that\footnote{By~\eqref{eq:lipschitz}, the constant \(\kappa_{\mathrm{m}}\) in \citet[Theorem~\(6.2\)]{aravkin-baraldi-orban-2022} can be taken as \(\tfrac{1}{2} L\). +See also \citet{aravkin-baraldi-leconte-orban-2021}.} +\begin{equation}% + \label{eq:sigma_max-r2} + \sigma_k \leq \sigma_{\max} \eqdef \max(\sigma_0, \frac{\gamma_2}{1 - \eta_2} L) > 0 \quad \text{for all } k \in \N. +\end{equation} + +\begin{theorem}% + \label{thm:complexity-r2-convex} + Assume that \(f\) is convex, that there exists a (global) minimizer \(x_\star\) and that \(\eta_1 \geq \tfrac{1}{2}\) in \Cref{alg:r2}. + The sequence \(\{\|x_{q_k} - x_\star\|\}\) is nonincreasing and for all integer \(N > 0\), + \begin{equation}% + \label{eq:1/N-complexity-r2} + f(x_{q_N}) - f(x_\star) \leq \frac{\sigma_{\max}}{2N} \|x_0 - x_\star\|^2. + \end{equation} +\end{theorem} + +\begin{proof} + Let \(k \in \N\) and consider successful iteration \(q_k\). + The mechanism of \Cref{alg:r2},~\eqref{eq:diff-phi-r2} and our assumption that \(\eta_1 \geq \tfrac{1}{2}\) imply that + \[ + f(x_{q_k}) - f(x_{q_k + 1}) \geq \eta_1 (\varphi(0; x_{q_k}) - \varphi(s_{q_k}; x_{q_k})) \geq \tfrac{1}{2} \sigma_{q_k} \|s_{q_k}\|^2. + \] + By convexity of \(f\), \(f(x_\star) \geq f(x_{q_k}) + \nabla f(x_{q_k})^T (x_\star - x_{q_k})\). 
+ Thus, + \begin{align*} + f(x_{q_k + 1}) & \leq f(x_{q_k}) - \tfrac{1}{2} \sigma_{q_k} \|s_{q_k}\|^2 + \\ & \leq f(x_\star) + \nabla f(x_{q_k})^T (x_{q_k} - x_\star) - \tfrac{1}{2} \sigma_{q_k} \|s_{q_k}\|^2 + \\ & = f(x_\star) - \sigma_{q_k} s_{q_k}^T (x_{q_k} - x_\star) - \tfrac{1}{2} \sigma_{q_k} \|s_{q_k}\|^2. + \end{align*} + We may now use~\eqref{eq:diff-distance} and~\eqref{eq:sigma_max-r2} to obtain + \[ + f(x_{q_k + 1}) \leq f(x_\star) + \tfrac{1}{2} \sigma_{\max} (\|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2). + \] + The majoration using~\eqref{eq:sigma_max-r2} is valid because the factor of \(\sigma_{q_k}\) above is + \[ + \tfrac{1}{2} (\|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2) \geq \frac{ f(x_{q_k + 1}) - f(x_\star) }{ \sigma_{q_k} } \geq 0, + \] + which also shows that \(\{\|x_{q_k} - x_\star\|\}\) is nonincreasing. + The result follows from the application of \Cref{lem:1/N-complexity} with \(C = \tfrac{1}{2} \sigma_{\max}\). +\end{proof} + +Because \Cref{thm:complexity-r2-convex} requires \(\eta_1 \geq \tfrac{1}{2}\) and the convergence analysis of \Cref{alg:r2} requires \(\eta_1 < \eta_2 < 1 < \gamma_2\), the most favorable constant in the right-hand side of~\eqref{eq:1/N-complexity-r2} occurs when \(\eta_2\) is close to \(\tfrac{1}{2}\) and \(\gamma_2\) is close to \(1\), in which case \(\sigma_{\max}\) is close to \(2L\). +Thus \Cref{thm:complexity-r2-convex} suggests that the complexity of \Cref{alg:r2} could be worse than that of the constant-steplength gradient method by a factor of at least two. +In addition, we must take unsuccessful iterations into account. -\section{Implementation and numerical experiments}% -\label{sec:numerical} +At a general iteration \(k\), let \(j(k) \eqdef \max \{i \in \N \mid q_i \leq k\}\) be the number of successful iterations so far. 
+For each unsuccessful iteration \(i\), \(\sigma_{i + 1} \geq \gamma_1 \sigma_i\) while for any successful iteration \(i\), \(\sigma_{i + 1} \geq \gamma_3 \sigma_i\). +Thus, +\[ + \sigma_0 \gamma_1^{k - q_{j(k)}} \gamma_3^{q_{j(k)}} \leq \sigma_k \leq \sigma_{\max}, +\] +and therefore, taking logarithms on both sides, the number of unsuccessful iterations so far is +\begin{equation}% + \label{eq:num-unsuccessful} + k - q_{j(k)} \leq \frac{ \log(\sigma_{\max} / \sigma_0) }{ \log(\gamma_1) } + q_{j(k)} \frac{ |\log(\gamma_3)| }{ \log(\gamma_1) } = \log_{\gamma_1}(\sigma_{\max} / \sigma_0) + q_{j(k)} |\log_{\gamma_1}(\gamma_3)|. +\end{equation} + +It is possible to do away with the assumption that \(\eta_1 \geq \tfrac{1}{2}\) in \Cref{thm:complexity-r2-convex} provided we introduce a minimum acceptable value of \(\sigma_k\). +For a given \(\sigma_{\min} > 0\), assume we choose \(\sigma_0 \geq \sigma_{\min}\) at \Cref{alg:r2-init} and update \(\sigma_{k+1} \in [\max(\sigma_{\min}, \, \gamma_3 \sigma_k), \, \sigma_k]\) at \Cref{alg:r2-update-sigma} of \Cref{alg:r2}. +We then ensure that +\begin{equation}% + \label{eq:sigma-min} + \sigma_{\min} \leq \sigma_k \leq \sigma_{\max} \quad \text{for all } k \in \N. +\end{equation} +Accordingly, we obtain the following bound. + +\begin{theorem}% + \label{thm:complexity-r2-convex-sigma-min} + Assume that \(f\) is convex, that there exists a (global) minimizer \(x_\star\) and that \Cref{alg:r2} has been modified to enforce~\eqref{eq:sigma-min}. + For all integer \(N > 0\), + \begin{equation}% + \label{eq:1/N-complexity-r2-sigma-min} + f(x_{q_N}) - f(x_\star) \leq \frac{ \sigma_{\max} }{ N } \left( \|x_0 - x_\star\|^2 + \frac{ (1 - \eta_1) }{ \eta_1 \sigma_{\min} } (f(x_0) - f(x_\star)) \right). + \end{equation} +\end{theorem} + +\begin{proof} + Let \(k \in \N\) and consider successful iteration \(q_k\). 
+ The mechanism of \Cref{alg:r2} implies that + \[ + f(x_{q_k}) - f(x_{q_k + 1}) \geq \eta_1 \sigma_{q_k} \|s_{q_k}\|^2 = \sigma_{q_k} \|s_{q_k}\|^2 - (1 - \eta_1) \sigma_{q_k} \|s_{q_k}\|^2. + \] + We use convexity as in the proof of \Cref{thm:complexity-r2-convex}, and obtain + \begin{align*} + f(x_{q_k + 1}) & \leq f(x_{q_k}) - \sigma_{q_k} \|s_k\|^2 + (1 - \eta_1) \sigma_{q_k} \|s_{q_k}\|^2 + \\ & \leq f(x_\star) - \sigma_{q_k} s_{q_k}^T (x_{q_k} - x_\star) - \sigma_{q_k} \|s_{q_k}\|^2 + (1 - \eta_1) \sigma_{q_k} \|s_{q_k}\|^2 + \\ & \leq f(x_\star) - 2 \sigma_{q_k} s_{q_k}^T (x_{q_k} - x_\star) - \sigma_{q_k} \|s_{q_k}\|^2 + (1 - \eta_1) \sigma_{q_k} \|s_{q_k}\|^2, + \end{align*} + where the last inequality also follows from convexity: \(-\sigma_{q_k} s_{q_k}^T (x_{q_k} - x_\star) = \nabla f(x_{q_k})^T (x_{q_k} - x_\star) \geq f(x_{q_k}) - f(x_\star) \geq 0\). + We divide both sides by \(\sigma_{q_k}\), use the fact that \(\sigma_{q_k} \leq \sigma_{\max}\), and invoke~\eqref{eq:diff-distance} again to obtain + \begin{align*} + \frac{ f(x_{q_k}) - f(x_\star) }{ \sigma_{\max} } \leq \frac{ f(x_{q_k}) - f(x_\star) }{ \sigma_{q_k} } & \leq - 2 s_{q_k}^T (x_{q_k} - x_\star) - \|s_{q_k}\|^2 + (1 - \eta_1) \|s_{q_k}\|^2 + \\ & = \|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2 + (1 - \eta_1) \|s_{q_k}\|^2. + \end{align*} + Now, for each \(q_k\), we may use the fact that \(\sigma_k \geq \sigma_{\min} > 0\) to deduce + \[ + f(x_{q_k}) - f(x_{q_k + 1}) \geq \eta_1 \sigma_{q_k} \|s_{q_k}\|^2 \geq \eta_1 \sigma_{\min} \|s_{q_k}\|^2, + \] + so that + \[ + \frac{ f(x_{q_k}) - f(x_\star) }{ \sigma_{\max} } \leq \|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2 + \frac{ 1 - \eta_1 }{\eta_1 \sigma_{\min} } (f(x_{q_k}) - f(x_{q_k + 1})). 
+ \] + We continue as in the proof of \Cref{lem:1/N-complexity}, but this time, the final term above contributes an extra term to the complexity bound: + \[ + \sum_{k = 0}^{N - 1} (f(x_{q_k}) - f(x_\star)) \leq \sigma_{\max} \|x_0 - x_\star\|^2 + \frac{ (1 - \eta_1) \sigma_{\max} }{ \eta_1 \sigma_{\min} } (f(x_0) - f(x_\star)), + \] + which yields~\eqref{eq:1/N-complexity-r2-sigma-min}. +\end{proof} + +\Cref{thm:complexity-r2-convex-sigma-min} shows that removing the assumption that \(\eta_1 \geq \tfrac{1}{2}\) comes at the cost of a worse constant term in the complexity bound. +In addition, it is no longer clear whether \(\{\|x_{q_k} - x_\star\|\}\) is nonincreasing. +In fact, generally, it is not, as the following example shows. +Consider \(f(x) = e^{x / \alpha} + e^{-\alpha x}\) with \(\alpha = 10\), and \(x_k = 0\). +Here, \(x_\star = 10 \log(100) / 101 \approx 0.456\). +We have \(f(x_k) = 2\) and \(f'(x_k) = -99/10\). +With \(\sigma_k = 99 / 10 = 9.9\), we compute \(s_k = -\sigma_k^{-1} f'(x_k) = 1\), hence, \(x_k + s_k = 1\). +The model decrease is \(\sigma_k \|s_k\|^2 = 9.9\). +With \(f(1) \approx 1.1\), the actual decrease is \(2 - f(1) \approx 0.9\). +The step \(s_k\) will be accepted for any \(\eta_1 \lesssim 0.9 / 9.9 \approx 0.09\). +However, \(x_{k+1} = 1\) is further away from \(x_\star\) than \(x_k\) was. +With \(\eta_1 \geq \tfrac{1}{2}\), \(s_k\) would be rejected and \(\sigma_k\) would increase to compute a shorter step. + +\subsection{Sharpness of the Complexity Bound for Convex \(\boldmath f\)} + +Idea: use convex interpolation. + +\subsection{Worst-Case Complexity for Strongly Convex \(\boldmath f\)} + +\begin{theorem}% + \label{thm:complexity-r2-strongly-convex} + Assume that \(f\) is strongly convex with modulus \(\mu > 0\) and that \(\nabla f\) is Lipschitz-continuous with constant \(L\). 
+ Assume also that either + \begin{enumerate} + \item \(\sigma_0 > \gamma_2 L / (1 - \eta_2)\) and \(\eta_2 \geq \tfrac{1}{2}\), or + \item \(\sigma_0 \leq \gamma_2 L / (1 - \eta_2)\) and \(\eta_1 < \sigma_0 / (2 \mu)\) + \end{enumerate} + in \Cref{alg:r2}. + For each \(k \in \N\), + \begin{equation} + \label{eq:complexity-r2-strongly-convex} + f(x_{q_k + 1}) - f(x_\star) \leq C (f(x_{q_k}) - f(x_\star)), + \quad \text{where} \quad 0 < C \eqdef 1 - 2 \mu \eta_1 \sigma_{\max}^{-1} < 1. + \end{equation} +\end{theorem} + +\begin{proof} + For each successful iteration \(q_k\), + \[ + f(x_{q_k}) - f(x_{q_k + 1}) \geq \eta_1 \sigma_{q_k}^{-1} \|\nabla f(x_k)\|^2 \geq 2 \mu \eta_1 \sigma_{\max}^{-1} (f(x_{q_k}) - f(x_\star)), + \] + where we used the fact that \(\sigma_{q_k} \leq \sigma_{\max}\) and~\eqref{eq:strongly-convex-gradient}. + Equivalently, + \[ + f(x_{q_k + 1}) \leq f(x_{q_k}) - 2 \mu \eta_1 \sigma_{\max}^{-1} (f(x_{q_k}) - f(x_\star)). + \] + We obtain the desired result by subtracting \(f(x_\star)\) from both sides. + The two alternative assumptions ensure that \(0 < C < 1\) given~\eqref{eq:sigma_max-r2}. +\end{proof} + +In practice, small values of \(\eta_1\) and large values of \(\eta_2\) tend to perform well. +In view of the assumptions of \Cref{thm:complexity-r2-strongly-convex} and the fact that \(L\) is generally unknown, it is safe to select \(\eta_1 < \sigma_0 / (2 \mu)\) and \(\eta_2 \geq \tfrac{1}{2}\). + +\section{R2N}% +\label{sec:r2n} + +Method R2N \citep{diouane-habiboullah-orban-2024b} expands upon R2 by using a quadratic approximation of \(f\) in~\eqref{eq:phi-r2}: +\begin{equation}% + \label{eq:phi-r2n} + \varphi(s; x_k) \eqdef f(x_k) + \nabla f(x_k)^T s + \tfrac{1}{2} s^T B_k s, +\end{equation} +where \(B_k = B_k^T \in \R^{n \times n}\). + +At each iteration, a step \(s_k\) is computed based on model~\eqref{eq:model-r2} where \(\varphi(s; x_k)\) is as in~\eqref{eq:phi-r2n}. 
+Step \(s_k\) need not be an approximate minimizer of~\eqref{eq:model-r2}, but need only result in a decrease at least equal to that resulting from the \emph{Cauchy step} \(\skcp\), which is a step in the direction \(-\nabla f(x_k)\) with a steplength chosen to ensure decrease. +In \Cref{alg:r2n}, said steplength \(\nu_k\) is chosen as in \citep{diouane-habiboullah-orban-2024b}, but other choices are possible. +However, the precise definition of the Cauchy steplength does not affect the worst-case complexity. + +\begin{algorithm}[ht]% + \caption[caption]{% + \label{alg:r2n} + R2N\@: A modified Quasi-Newton method. + } + \begin{algorithmic}[1]% + \State Choose constants \(0 < \theta_1 < 1 < \theta_2 \), \(0 < \eta_1 \leq \eta_2 < 1\) and \(0 < \gamma_3 \leq 1 < \gamma_1 \leq \gamma_2\). + \State Choose \(x_0 \in \R^n\), \(\sigma_0 > 0\). + \For{\(k = 0, 1, \dots\)} + \State% + \label{alg:r2n:Bk} + Choose \(B_k = B_k^T \in \R^{n \times n}\). + \State% + \label{alg:r2n:step-nuk} + Compute \(\nu_k \eqdef \theta_1 / (\|B_k\| + \sigma_k)\). + \State% + \label{alg:r2n:step-switch} + Compute \(\skcp \eqdef -\nu_k \nabla f(x_k)\). + \State% + \label{alg:r2n:step-computation} + Compute a step \(s_k\) such that \(m(s_k; x_k, \sigma_k) \le m(\skcp; x_k, \sigma_k)\). + \State% + \label{alg:r2n:step-comparison} + If \(\|s_k\| > \theta_2 \; \|\skcp\|\), reset \(s_k = \skcp\). + \State% + \label{alg:r2n:step-rhok} + Compute the ratio + \[ + \rho_k := + \frac{ + f(x_k) - f(x_k + s_k) + }{ + \varphi(0; x_k) - \varphi(s_k; x_k) + }. + \] + \State% + \label{alg:r2n:step-accept}% + If \(\rho_k \geq \eta_1\), set \(x_{k+1} = x_k + s_k\). + Otherwise, set \(x_{k+1} = x_k\). 
+ \State% + \label{alg:r2n:step-update}% + Update the regularization parameter according to + \[ + \sigma_{k+1} \in + \begin{cases} + \begin{aligned} + & [\gamma_3 \sigma_k, \, \sigma_k] & & \text{ if } \rho_k \geq \eta_2, & & \quad \text{very successful iteration} \\ + & [\sigma_k, \, \gamma_1 \sigma_k] & & \text{ if } \eta_1 \leq \rho_k < \eta_2, & & \quad \text{successful iteration} \\ + & [\gamma_1 \sigma_k, \, \gamma_2 \sigma_k] & & \text{ if } \rho_k < \eta_1. & & \quad \text{unsuccessful iteration} + \end{aligned} + \end{cases} + \] + \EndFor + \end{algorithmic} +\end{algorithm} + +\subsection{Worst-Case Complexity} + +\begin{assumption}% + \label{asm:Bk-bounded} + There exists \(\kappa_B > 0\) such that \(\|B_k\| \leq \kappa_B\) for all \(k \in \N\). +\end{assumption} + +When \(f\) is convex, it stands to reason to choose \(B_k \succeq 0\) in \Cref{alg:r2n} so that~\eqref{eq:model-r2} is also convex in \(s\). + +\begin{assumption}% + \label{asm:sk-exact-r2n} + For all \(k \in \N\), \(s_k\) solves \((B_k + \sigma_k I) s_k = -\nabla f(x_k)\) in \Cref{alg:r2n} and \(B_k \succeq 0\). +\end{assumption} + +Under \Cref{asm:sk-exact-r2n}, +\begin{equation}% + \label{eq:diff-phi-r2n} + \varphi(0; x_k) - \varphi(s_k; x_k) = s_k^T (B_k + \sigma_k I) s_k - \tfrac{1}{2} s_k^T B_k s_k = s_k^T (\tfrac{1}{2} B_k + \sigma_k I) s_k \geq \sigma_k \|s_k\|^2. +\end{equation} + +Under \Cref{asm:Bk-bounded},~\eqref{eq:sigma_max-r2} continues to hold with the same value of \(\sigma_{\max}\).\footnote{The proof is the same as that of \citep[Theorem~\(4.1\)]{aravkin-baraldi-orban-2024}. 
+Model Assumption~\(6.1\) of \citep{diouane-habiboullah-orban-2024b} is unnecessarily loose under \Cref{asm:Bk-bounded} so that extra constants appear in the \(\sigma_{\max}\) that follows from \citep[Lemma~\(6.1\)]{diouane-habiboullah-orban-2024b}.} + +\subsection{Sharpness of the Complexity Bound} \section{Discussion and future work}% \label{sec:discussion} @@ -72,4 +546,12 @@ \subsection*{Acknowledgements} \bibliography{abbrv,report} \normalsize +\clearpage +\appendix +\section{Open Questions} + +\begin{enumerate} + \item Can we remove the assumption that there exists \(x_\star\) that attains \(f_{\mathrm{low}}\)? +\end{enumerate} + \end{document}