From 3e7facaa84a719b0b59a6ef19bad1d63c767bd56 Mon Sep 17 00:00:00 2001 From: Dominique Orban Date: Sat, 19 Apr 2025 10:58:17 -0400 Subject: [PATCH] results for R2 --- paper/defs.sty | 2 +- paper/report.bib | 91 +++++++++ paper/report.tex | 502 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 584 insertions(+), 11 deletions(-) diff --git a/paper/defs.sty b/paper/defs.sty index d6d8e98..78360e2 100644 --- a/paper/defs.sty +++ b/paper/defs.sty @@ -58,7 +58,7 @@ %\renewcommand{\year}{2020} % If you don't want the current year. \newcommand{\cahiernumber}{00} % Insert your Cahier du GERAD number. -\usepackage[margin=1in]{geometry} +\usepackage[margin=1.5in]{geometry} \usepackage[T1]{fontenc} \usepackage{amsthm} % must come before newpxtext and newpxmath \usepackage{amssymb} diff --git a/paper/report.bib b/paper/report.bib index e69de29..a21cf8e 100644 --- a/paper/report.bib +++ b/paper/report.bib @@ -0,0 +1,91 @@ +@Article{ aravkin-baraldi-orban-2022, + Author = {Aravkin, A Y and Baraldi, R and Orban, D}, + Title = {A Proximal Quasi-{N}ewton Trust-Region Method for Nonsmooth Regularized Optimization}, + Journal = siopt, + Year = 2022, + Volume = 32, + Number = 2, + Pages = {900--929}, + doi = {10.1137/21M1409536}, + abstract = { We develop a trust-region method for minimizing the sum of a smooth term (f) and a nonsmooth term (h), both of which can be nonconvex. Each iteration of our method minimizes a possibly nonconvex model of (f + h) in a trust region. The model coincides with (f + h) in value and subdifferential at the center. We establish global convergence to a first-order stationary point when (f) satisfies a smoothness condition that holds, in particular, when it has a Lipschitz-continuous gradient, and (h) is proper and lower semicontinuous. The model of (h) is required to be proper, lower semi-continuous and prox-bounded. 
Under these weak assumptions, we establish a worst-case (O(1/\epsilon^2)) iteration complexity bound that matches the best known complexity bound of standard trust-region methods for smooth optimization. We detail a special instance, named TR-PG, in which we use a limited-memory quasi-Newton model of (f) and compute a step with the proximal gradient method, + resulting in a practical proximal quasi-Newton method. We establish similar convergence properties and complexity bound for a quadratic regularization variant, named R2, and provide an interpretation as a proximal gradient method with adaptive step size for nonconvex problems. R2 may also be used to compute steps inside the trust-region method, resulting in an implementation named TR-R2. We describe our Julia implementations and report numerical results on inverse problems from sparse optimization and signal processing. Both TR-PG and TR-R2 exhibit promising performance and compare favorably with two linesearch proximal quasi-Newton methods based on convex models. }, +} + +@article{aravkin-baraldi-orban-2024, + author = {Aravkin, Aleksandr Y. and Baraldi, Robert and Orban, Dominique}, + title = {A {L}evenberg–{M}arquardt Method for Nonsmooth Regularized Least Squares}, + journal = sisc, + volume = {46}, + number = {4}, + pages = {A2557--A2581}, + year = {2024}, + doi = {10.1137/22M1538971}, + preprint = {https://www.gerad.ca/en/papers/G-2022-58/view}, + abstract = { Abstract. We develop a Levenberg–Marquardt method for minimizing the sum of a smooth nonlinear least-squares term \(f(x) = \frac{1}{2} \|F(x)\|\_2^2\) and a nonsmooth term \(h\). Both \(f\) and \(h\) may be nonconvex. Steps are computed by minimizing the sum of a regularized linear least-squares model and a model of \(h\) using a first-order method such as the proximal gradient method. 
We establish global convergence to a first-order stationary point under the assumptions that \(F\) and its Jacobian are Lipschitz continuous and \(h\) is proper and lower semicontinuous. In the worst case, our method performs \(O(\epsilon^{-2})\) iterations to bring a measure of stationarity below \(\epsilon \in (0, 1)\). We also derive a trust-region variant that enjoys similar asymptotic worst-case iteration complexity as a special case of the trust-region algorithm of Aravkin, Baraldi, and Orban [SIAM J. Optim., 32 (2022), pp. 900–929]. We report numerical results on three examples: a group-lasso basis-pursuit denoise example, a nonlinear support vector machine, and parameter estimation in a neuroscience application. To implement those examples, we describe in detail how to evaluate proximal operators for separable \(h\) and for the group lasso with trust-region constraint. In all cases, the Levenberg–Marquardt methods perform fewer outer iterations than either a proximal gradient method with adaptive step length or a quasi-Newton trust-region method, neither of which exploit the least-squares structure of the problem. Our results also highlight the need for more sophisticated subproblem solvers than simple first-order methods. } +} + +@TechReport{ aravkin-baraldi-leconte-orban-2021, + Author = {Aravkin, Aleksandr Y. and Baraldi, Robert and Leconte, Geoffroy and Orban, Dominique}, + Title = {Corrigendum: A proximal quasi-{N}ewton trust-region method for nonsmooth regularized optimization}, + Institution = gerad, + Year = 2024, + Type = {Cahier}, + Number = {G-2021-12-SM}, + Address = gerad-address, + Pages = {1--3}, + doi = {10.13140/RG.2.2.36250.45768}, +} + +@Book{ cartis-gould-toint-2022, + Author = {Cartis, Coralia and Gould, Nicholas I. M. 
and Toint, {\relax Ph}ilippe L.}, + Title = {Evaluation Complexity of Algorithms for Nonconvex Optimization}, + Publisher = siam, + Year = 2022, + Series = {MOS-SIAM Series on Optimization}, + Address = siam-address, + doi = {10.1137/1.9781611976991}, + Number = 30, +} + +@techreport{diouane-habiboullah-orban-2024a, + author = {Y. Diouane and M. L. Habiboullah and D. Orban}, + pages = {}, + title = {Complexity of trust-region methods in the presence of unbounded {H}essian approximations}, + year = {2024}, + number = {G-2024-43}, + type = {Cahier}, + institution = gerad, + address = gerad-address, + doi = {}, + preprint = {https://www.gerad.ca/en/papers/G-2024-43}, +} + +@techreport{diouane-habiboullah-orban-2024b, + author = {Y. Diouane and M. L. Habiboullah and D. Orban}, + pages = {}, + title = {A Proximal Modified Quasi-{N}ewton Method for Nonsmooth Regularized Optimization}, + year = {2024}, + number = {G-2024-64}, + type = {Cahier}, + institution = gerad, + address = gerad-address, + doi = {10.13140/RG.2.2.21140.51840}, + preprint = {https://www.gerad.ca/en/papers/G-2024-64}, +} + +@incollection{wright-2018, + author = {S. J. Wright}, + title = {Optimization Algorithms for Data Analysis}, + booktitle = {The Mathematics of Data}, + publisher = ams, + editor = {M. W. Mahoney and J. C. Duchi and A. C. 
Gilbert}, + volume = {25}, + number = {}, + series = {IAS/Park City Mathematics Series}, + chapter = {2}, + pages = {49--98}, + address = ams-address, + edition = {1st}, + year = {2018}, + doi = {10.1090/pcms/025/00830}, +} diff --git a/paper/report.tex b/paper/report.tex index 7e0fcae..5f9290f 100644 --- a/paper/report.tex +++ b/paper/report.tex @@ -1,8 +1,16 @@ \documentclass[10pt]{article} \usepackage{defs} +% https://tex.stackexchange.com/a/4881/2701 +\makeatletter +\newcommand*{\eqdef}{\mathrel{\vcenter{\baselineskip0.5ex \lineskiplimit0pt + \hbox{\scriptsize.}\hbox{\scriptsize.}}}% + =} +\makeatother +\newcommand{\skcp}{s_{k, \textup{cp}}} + \newcommand{\papertitle}{% - Insert Your Title Here + Complexity of Methods for General Optimization in the Presence of Convexity } % For debugging. @@ -11,9 +19,9 @@ % Meta-information for the PDF file generated.. \hypersetup{ pdftitle={\papertitle}, - pdfauthor={Author One and Author Two}, - pdfsubject={Report Subject}, - pdfkeywords={Keyword1, keyword2, keyword3}, + pdfauthor={Youssef Diouane and Mohamed Laghdaf Habiboullah and Dominique Orban}, + pdfsubject={Worst-Case Evaluation Complexity}, + pdfkeywords={complexity, convexity}, } % \usepackage{newunicodechar} @@ -21,8 +29,13 @@ \title{\papertitle} \author{% - Author One\footnote{% - GERAD and Department of Mathematics and Industrial Engineering, Polytechnique Montr\'eal. E-mail: \href{mailto:geoffroy.leconte@polymtl.ca}{geoffroy.leconte@polymtl.ca}. + Youssef Diouane\footnote{% + GERAD and Department of Mathematics and Industrial Engineering, Polytechnique Montr\'eal. E-mail: \href{mailto:youssef.diouane@polymtl.ca}{youssef.diouane@polymtl.ca}. + } + \thanks{Research supported by an NSERC Discovery grant.} + \and + Mohamed L. Habiboullah\footnote{% + GERAD and Department of Mathematics and Industrial Engineering, Polytechnique Montr\'eal. E-mail: \href{mailto:mohamed.habiboullah@polymtl.ca}{mohamed.habiboullah@polymtl.ca}. 
} \and Dominique Orban\footnote{% @@ -38,6 +51,11 @@ \thispagestyle{mytitlepage} \begin{abstract} + Methods for unconstrained convex optimization are typically more straightforward than methods designed for the same problem class but in which the objective may be nonconvex. + Accordingly, the former benefit from stronger convergence results and more favorable worst-case complexity results. + But do methods designed for general problems have worse worst-case complexity when applied to convex problems than methods designed for convex problems? + If so, how much worse? + In this research, we study and provide answers to these legitimate questions. \end{abstract} % Résumé en français pour le Cahier du GERAD @@ -46,21 +64,477 @@ \pagestyle{myheadings} +\tableofcontents + \section{Introduction}% \label{sec:introduction} +We consider the unconstrained problem +\begin{equation}% + \label{eq:nlo} + \minimize{x \in \R^n} \ f(x), +\end{equation} +where \(f: \R^n \to \R\) is \(C^1\), and study the worst-case evaluation complexity of methods initially designed for general \(f\) when applied to convex \(f\). +We assume that \(\nabla f\) is Lipschitz-continuous with constant \(L \geq 0\). + \subsection*{Related research} \subsection*{Notation} +For a set \(\mathcal{S}\), we denote by \(|\mathcal{S}|\) its cardinality. + \section{Background}% \label{sec:background} +\citet{cartis-gould-toint-2022} and \citet{diouane-habiboullah-orban-2024a} study the worst-case evaluation complexity of trust-region and regularization methods when applied to convex \(f\) under the assumption that there exists \(R \geq 1\) such that the level set \(\{x \in \R^n \mid f(x) \leq f(x_0)\}\) is contained within the ball centered at \(x_\star\) of radius \(R\). +Such a bounded level set assumption is not particularly related to strict or strong convexity. 
+For instance, the function of one variable +\[ + f(x) = + \begin{cases} + x^2 & \text{ if } x < 0 \\ + 0 & \text{ if } x \geq 0, + \end{cases} +\] +has a well-defined \(x_\star = 0\) (or, indeed, any \(x \geq 0\)), is convex but not strictly convex, but does not satisfy the assumption. +The function \(f(x) = x^4\) is strictly, but not strongly, convex and satisfies the assumption. +The function \(f(x) = |x|\) is not strictly convex but satisfies the assumption. +Any smooth and strongly convex \(f\) for which a (global) minimum is attained satisfies the assumption. +Indeed, for such \(f\), there exists \(\mu > 0\) such that, for all \(y \in \R^n\), +\[ + f(y) \geq f(x_\star) + \nabla f(x_\star)^T (y - x_\star) + \tfrac{1}{2} \mu \|y - x_\star\|^2 = f(x_\star) + \tfrac{1}{2} \mu \|y - x_\star\|^2. +\] +Thus, if \(x\) is such that \(f(x) \leq f(x_0)\), +\[ + \tfrac{1}{2} \mu \|x - x_\star\|^2 \leq f(x) - f(x_\star) \leq f(x_0) - f(x_\star), +\] +so that we may pick \(R = \max(1, \, \sqrt{2 (f(x_0) - f(x_\star)) / \mu})\). + +If \(f\) is strongly convex with modulus \(\mu > 0\), it possesses a unique minimizer \(x_\star\) \citep[Theorem~\(3.3.14\)]{wright-2018}. +For all \(x\), \(y \in \R^n\), +\[ + f(y) \geq f(x) + \nabla f(x)^T (y - x) + \tfrac{1}{2} \mu \|y - x\|^2. +\] +We minimize each side with respect to \(y\), note that the minimum of the right-hand side is attained for \(y = x - \mu^{-1} \nabla f(x)\), and obtain +\[ + f(x_\star) \geq f(x) - \tfrac{1}{2} \mu^{-1} \|\nabla f(x)\|^2. +\] +Thus, for all \(x \in \R^n\), +\begin{equation}% + \label{eq:strongly-convex-gradient} + \|\nabla f(x)\|^2 \geq 2 \mu (f(x) - f(x_\star)). +\end{equation} + +In addition, if \(f\) is strongly convex with modulus \(\mu > 0\) and \(\nabla f\) is Lipschitz-continuous with constant \(L \geq 0\), then \(L \geq \mu\). 
+Indeed, for all \(x\), \(s \in \R^n\), +\begin{equation}% + \label{eq:strongly-convex-quadratic-bounds} + f(x) + \nabla f(x)^T s + \tfrac{1}{2} \mu \|s\|^2 \leq f(x + s) \leq f(x) + \nabla f(x)^T s + \tfrac{1}{2} L \|s\|^2. +\end{equation} +If \(x = x_\star\) is a stationary point, hence a global minimizer,~\eqref{eq:strongly-convex-quadratic-bounds} can be written +\[ + \tfrac{1}{2} \mu \|x - x_\star\|^2 \leq f(x) - f(x_\star) \leq \tfrac{1}{2} L \|x - x_\star\|^2 + \quad \text{for all} \quad x \in \R^n. +\] + +\section{Preliminary Results}% +\label{sec:prelims} + +Because \(\nabla f\) is Lipschitz-continuous, for all \(x\), \(s \in \R^n\), +\begin{equation}% + \label{eq:lipschitz} + |f(x + s) - f(x) - \nabla f(x)^T s| \leq \tfrac{1}{2} L \|s\|^2. +\end{equation} + +Consider a method that generates iterates according to \(x_{k+1} = x_k + s_k\) where \(s_k\) is the step at iteration \(k\). +Let \(x_\star\) be fixed. +We will use the identity +\begin{equation}% + \label{eq:diff-distance} + \|x_k - x_\star\|^2 - \|x_{k+1} - x_\star\|^2 = + \|x_k - x_\star\|^2 - \|x_k + s_k - x_\star\|^2 = + -2 s_k^T (x_k - x_\star) - \|s_k\|^2 +\end{equation} +repeatedly. + +We consider methods for~\eqref{eq:nlo} that generate \(\{x_k\}\) such that \(\{f(x_k)\}\) is nonincreasing. +Because not all steps are accepted, we denote \(q_k\) the index of the \(k\)-th iteration where the step is accepted, i.e., \(x_{q_k + 1} = x_{q_k} + s_{q_k}\). +Such an iteration is called \emph{successful}. +Note that \(q_{k+1} \geq q_k + 1\), and therefore, \(\{q_k\}\) is increasing. +On a successful iteration \(k\), \(f(x_{k+1}) < f(x_k)\). +On an \emph{unsuccessful} iteration \(k\), the step \(s_k\) is rejected and \(x_{k+1} = x_k\). +Therefore, +\begin{equation}% + \label{eq:qk-identity} + x_{q_k + 1} = x_{q_{k + 1}} + \quad (k \in \N). +\end{equation} +We refer to a method possessing the features above as a \emph{descent} method. 
+ +The following result appears inside the proof of \citep[Theorem~\(4.3.1\)]{wright-2018} but does not require convexity of \(f\). + +\begin{lemma}% + \label{lem:1/N-complexity} + Consider a descent method for~\eqref{eq:nlo}. + Let \(x_\star \in \R^n\) be such that \(f(x_k) \geq f(x_\star)\) for all \(k\). + Assume that there exists a constant \(C > 0\) such that, for all \(k\), + \begin{equation}% + \label{eq:decrease} + f(x_{q_k + 1}) \leq f(x_\star) + C (\|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2). + \end{equation} + For all integer \(N > 0\), + \begin{equation}% + \label{eq:1/N-complexity} + f(x_{q_N}) - f(x_\star) \leq \frac{C}{N} \|x_0 - x_\star\|^2. + \end{equation} +\end{lemma} + +\begin{proof} + Let \(N > 0\). + We sum~\eqref{eq:decrease} over \(k = 0, \ldots, N-1\), use~\eqref{eq:qk-identity} to recover a telescoping sum, and obtain + \begin{align*} + \sum_{k=0}^{N-1} (f(x_{q_k+1}) - f(x_\star)) & \leq C \sum_{k=0}^{N-1} (\|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2) + \\ & = C \sum_{k=0}^{N-1} (\|x_{q_k} - x_\star\|^2 - \|x_{q_{k + 1}} - x_\star\|^2) + \\ & = C (\|x_0 - x_\star\|^2 - \|x_{q_N} - x_\star\|^2) + \\ & \leq C \|x_0 - x_\star\|^2, + \end{align*} + The result follows by noting that because \(\{f(x_k)\}\) is nonincreasing, + \begin{align*} + f(x_{q_N}) - f(x_\star) \leq \frac{1}{N} \sum_{k=0}^{N-1} (f(x_{q_k + 1}) - f(x_\star)). + \tag*{\qedhere} + \end{align*} +\end{proof} + +\section{R2}% +\label{sec:r2} + +It is instructive to begin with a first-order method. +The main differences between the analysis of R2 and that of \citet[\S\(4.3\)]{wright-2018} are that R2 is not a fixed-steplength method and not all steps are accepted, i.e., sufficient decrease may not occur at every iteration. +The main advantage of R2 over fixed-steplength methods is that it removes the need to know the Lipschitz constant of \(\nabla f\). 
+At each iteration, R2 forms the first-order approximation +\begin{equation}% + \label{eq:phi-r2} + \varphi(s; x_k) \eqdef f(x_k) + \nabla f(x_k)^T s \approx f(x_k + s), +\end{equation} +and computes a step by minimizing the model +\begin{equation}% + \label{eq:model-r2} + m(s; x_k, \sigma_k) \eqdef \varphi(s; x_k) + \tfrac{1}{2} \sigma_k \|s\|^2, +\end{equation} +where \(\sigma_k > 0\) is a regularization parameter. +The situation here is sufficiently simple that the explicit formula +\begin{equation}% + \label{eq:step-r2} + s_k \eqdef -\sigma_k^{-1} \nabla f(x_k) +\end{equation} +emerges for the step. +The R2 algorithm is stated as \Cref{alg:r2}. + +\begin{algorithm} + \caption[caption]{% + R2: Quadratic Regularization.% + \label{alg:r2} + } + \begin{algorithmic}[1] + \State Choose constants \(0 < \eta_1 \leq \eta_2 < 1\) and \(0 < \gamma_3 \leq 1 < \gamma_1 \leq \gamma_2\). + \State% + \label{alg:r2-init} + Choose \(x_0 \in \R^n\), \(\sigma_0 > 0\), compute \(f(x_0)\). + \For{\(k = 0, 1, \dots\)} + \State Compute the step \(s_k \eqdef -\sigma_k^{-1} \nabla f(x_k)\). + \State Compute the ratio + \[ + \rho_k := + \frac{ + f(x_k) - f(x_k + s_k) + }{ + \varphi(0; x_k) - \varphi(s_k; x_k) + }. + \] + \State If \(\rho_k \geq \eta_1\), set \(x_{k+1} = x_k + s_k\). Otherwise, set \(x_{k+1} = x_k\). + \State% + \label{alg:r2-update-sigma} + Update the regularization parameter according to + \[ + \sigma_{k+1} \in + \begin{cases} + \begin{aligned} + & [\gamma_3 \sigma_k, \, \sigma_k] & & \text{ if } \rho_k \geq \eta_2, & & \quad \text{very successful iteration} \\ + & [\sigma_k, \, \gamma_1 \sigma_k] & & \text{ if } \eta_1 \leq \rho_k < \eta_2, & & \quad \text{successful iteration} \\ + & [\gamma_1 \sigma_k, \, \gamma_2 \sigma_k] & & \text{ if } \rho_k < \eta_1. 
& & \quad \text{unsuccessful iteration} + \end{aligned} + \end{cases} + \] + \EndFor + \end{algorithmic} +\end{algorithm} + +\subsection{Worst-Case Complexity for Convex \(\boldmath f\)} + +In the denominator of \(\rho_k\), note that +\begin{equation}% + \label{eq:diff-phi-r2} + \varphi(0; x_k) - \varphi(s_k; x_k) = \sigma_k^{-1} \|\nabla f(x_k)\|^2 = \sigma_k \|s_k\|^2. +\end{equation} + +Under Lipschitz-continuity of \(\nabla f\), \citet[Theorem~\(6.2\)]{aravkin-baraldi-orban-2022} establish that\footnote{By~\eqref{eq:lipschitz}, the constant \(\kappa_{\mathrm{m}}\) in \citet[Theorem~\(6.2\)]{aravkin-baraldi-orban-2022} can be taken as \(\tfrac{1}{2} L\). +See also \citet{aravkin-baraldi-leconte-orban-2021}.} +\begin{equation}% + \label{eq:sigma_max-r2} + \sigma_k \leq \sigma_{\max} \eqdef \max(\sigma_0, \frac{\gamma_2}{1 - \eta_2} L) > 0 \quad \text{for all } k \in \N. +\end{equation} + +\begin{theorem}% + \label{thm:complexity-r2-convex} + Assume that \(f\) is convex, that there exists a (global) minimizer \(x_\star\) and that \(\eta_1 \geq \tfrac{1}{2}\) in \Cref{alg:r2}. + The sequence \(\{\|x_{q_k} - x_\star\|\}\) is nonincreasing and for all integer \(N > 0\), + \begin{equation}% + \label{eq:1/N-complexity-r2} + f(x_{q_N}) - f(x_\star) \leq \frac{\sigma_{\max}}{2N} \|x_0 - x_\star\|^2. + \end{equation} +\end{theorem} + +\begin{proof} + Let \(k \in \N\) and consider successful iteration \(q_k\). + The mechanism of \Cref{alg:r2},~\eqref{eq:diff-phi-r2} and our assumption that \(\eta_1 \geq \tfrac{1}{2}\) imply that + \[ + f(x_{q_k}) - f(x_{q_k + 1}) \geq \eta_1 (\varphi(0; x_{q_k}) - \varphi(s_{q_k}; x_{q_k})) \geq \tfrac{1}{2} \sigma_{q_k} \|s_{q_k}\|^2. + \] + By convexity of \(f\), \(f(x_\star) \geq f(x_{q_k}) + \nabla f(x_{q_k})^T (x_\star - x_{q_k})\). 
+ Thus, + \begin{align*} + f(x_{q_k + 1}) & \leq f(x_{q_k}) - \tfrac{1}{2} \sigma_{q_k} \|s_{q_k}\|^2 + \\ & \leq f(x_\star) + \nabla f(x_{q_k})^T (x_{q_k} - x_\star) - \tfrac{1}{2} \sigma_{q_k} \|s_{q_k}\|^2 + \\ & = f(x_\star) - \sigma_{q_k} s_{q_k}^T (x_{q_k} - x_\star) - \tfrac{1}{2} \sigma_{q_k} \|s_{q_k}\|^2. + \end{align*} + We may now use~\eqref{eq:diff-distance} and~\eqref{eq:sigma_max-r2} to obtain + \[ + f(x_{q_k + 1}) \leq f(x_\star) + \tfrac{1}{2} \sigma_{\max} (\|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2). + \] + The majoration using~\eqref{eq:sigma_max-r2} is valid because the factor of \(\sigma_{q_k}\) above is + \[ + \tfrac{1}{2} (\|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2) \geq \frac{ f(x_{q_k + 1}) - f(x_\star) }{ \sigma_{q_k} } \geq 0, + \] + which also shows that \(\{\|x_{q_k} - x_\star\|\}\) is nonincreasing. + The result follows from the application of \Cref{lem:1/N-complexity} with \(C = \tfrac{1}{2} \sigma_{\max}\). +\end{proof} + +Because \Cref{thm:complexity-r2-convex} requires \(\eta_1 \geq \tfrac{1}{2}\) and the convergence analysis of \Cref{alg:r2} requires \(\eta_1 < \eta_2 < 1 < \gamma_2\), the most favorable constant in the right-hand side of~\eqref{eq:1/N-complexity-r2} occurs when \(\eta_2\) is close to \(\tfrac{1}{2}\) and \(\gamma_2\) is close to \(1\), in which case \(\sigma_{\max}\) is close to \(2L\). +Thus \Cref{thm:complexity-r2-convex} suggests that the complexity of \Cref{alg:r2} could be worse than that of the constant-steplength gradient method by a factor of at least two. +In addition, we must take unsuccessful iterations into account. -\section{Implementation and numerical experiments}% -\label{sec:numerical} +At a general iteration \(k\), let \(j(k) \eqdef \max \{i \in \N \mid q_i \leq k\}\) be the number of successful iterations so far. 
+For each unsuccessful iteration \(i\), \(\sigma_{i + 1} \geq \gamma_1 \sigma_i\) while for any successful iteration \(i\), \(\sigma_{i + 1} \geq \gamma_3 \sigma_i\). +Thus, +\[ + \sigma_0 \gamma_1^{k - q_{j(k)}} \gamma_3^{q_{j(k)}} \leq \sigma_k \leq \sigma_{\max}, +\] +and therefore, taking logarithms on both sides, the number of unsuccessful iterations so far is +\begin{equation}% + \label{eq:num-unsuccessful} + k - q_{j(k)} \leq \frac{ \log(\sigma_{\max} / \sigma_0) }{ \log(\gamma_1) } + q_{j(k)} \frac{ |\log(\gamma_3)| }{ \log(\gamma_1) } = \log_{\gamma_1}(\sigma_{\max} / \sigma_0) + q_{j(k)} |\log_{\gamma_1}(\gamma_3)|. +\end{equation} + +It is possible to do away with the assumption that \(\eta_1 \geq \tfrac{1}{2}\) in \Cref{thm:complexity-r2-convex} provided we introduce a minimum acceptable value of \(\sigma_k\). +For a given \(\sigma_{\min} > 0\), assume we choose \(\sigma_0 \geq \sigma_{\min}\) at \Cref{alg:r2-init} and update \(\sigma_{k+1} \in [\max(\sigma_{\min}, \, \gamma_3 \sigma_k), \, \sigma_k]\) at \Cref{alg:r2-update-sigma} of \Cref{alg:r2}. +We then ensure that +\begin{equation}% + \label{eq:sigma-min} + \sigma_{\min} \leq \sigma_k \leq \sigma_{\max} \quad \text{for all } k \in \N. +\end{equation} +Accordingly, we obtain the following bound. + +\begin{theorem}% + \label{thm:complexity-r2-convex-sigma-min} + Assume that \(f\) is convex, that there exists a (global) minimizer \(x_\star\) and that \Cref{alg:r2} has been modified to enforce~\eqref{eq:sigma-min}. + For all integer \(N > 0\), + \begin{equation}% + \label{eq:1/N-complexity-r2-sigma-min} + f(x_{q_N}) - f(x_\star) \leq \frac{ \sigma_{\max} }{ N } \left( \|x_0 - x_\star\|^2 + \frac{ (1 - \eta_1) }{ \eta_1 \sigma_{\min} } (f(x_0) - f(x_\star)) \right). + \end{equation} +\end{theorem} + +\begin{proof} + Let \(k \in \N\) and consider successful iteration \(q_k\). 
+ The mechanism of \Cref{alg:r2} implies that + \[ + f(x_{q_k}) - f(x_{q_k + 1}) \geq \eta_1 \sigma_{q_k} \|s_{q_k}\|^2 = \sigma_{q_k} \|s_{q_k}\|^2 - (1 - \eta_1) \sigma_{q_k} \|s_{q_k}\|^2. + \] + We use convexity as in the proof of \Cref{thm:complexity-r2-convex}, and obtain + \begin{align*} + f(x_{q_k + 1}) & \leq f(x_{q_k}) - \sigma_{q_k} \|s_k\|^2 + (1 - \eta_1) \sigma_{q_k} \|s_{q_k}\|^2 + \\ & \leq f(x_\star) - \sigma_{q_k} s_{q_k}^T (x_{q_k} - x_\star) - \sigma_{q_k} \|s_{q_k}\|^2 + (1 - \eta_1) \sigma_{q_k} \|s_{q_k}\|^2 + \\ & \leq f(x_\star) - 2 \sigma_{q_k} s_{q_k}^T (x_{q_k} - x_\star) - \sigma_{q_k} \|s_{q_k}\|^2 + (1 - \eta_1) \sigma_{q_k} \|s_{q_k}\|^2, + \end{align*} + where the last inequality also follows from convexity: \(-\sigma_{q_k} s_{q_k}^T (x_{q_k} - x_\star) = \nabla f(x_{q_k})^T (x_{q_k} - x_\star) \geq f(x_{q_k}) - f(x_\star) \geq 0\). + We divide both sides by \(\sigma_{q_k}\), use the fact that \(\sigma_{q_k} \leq \sigma_{\max}\), and invoke~\eqref{eq:diff-distance} again to obtain + \begin{align*} + \frac{ f(x_{q_k}) - f(x_\star) }{ \sigma_{\max} } \leq \frac{ f(x_{q_k}) - f(x_\star) }{ \sigma_{q_k} } & \leq - 2 s_{q_k}^T (x_{q_k} - x_\star) - \|s_{q_k}\|^2 + (1 - \eta_1) \|s_{q_k}\|^2 + \\ & = \|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2 + (1 - \eta_1) \|s_{q_k}\|^2. + \end{align*} + Now, for each \(q_k\), we may use the fact that \(\sigma_k \geq \sigma_{\min} > 0\) to deduce + \[ + f(x_{q_k}) - f(x_{q_k + 1}) \geq \eta_1 \sigma_{q_k} \|s_{q_k}\|^2 \geq \eta_1 \sigma_{\min} \|s_{q_k}\|^2, + \] + so that + \[ + \frac{ f(x_{q_k}) - f(x_\star) }{ \sigma_{\max} } \leq \|x_{q_k} - x_\star\|^2 - \|x_{q_k + 1} - x_\star\|^2 + \frac{ 1 - \eta_1 }{\eta_1 \sigma_{\min} } (f(x_{q_k}) - f(x_{q_k + 1})). 
+ \] + We continue as in the proof of \Cref{lem:1/N-complexity}, but this time, the final term above contributes an extra term to the complexity bound: + \[ + \sum_{k = 0}^{N - 1} (f(x_{q_k}) - f(x_\star)) \leq \sigma_{\max} \|x_0 - x_\star\|^2 + \frac{ (1 - \eta_1) \sigma_{\max} }{ \eta_1 \sigma_{\min} } (f(x_0) - f(x_\star)), + \] + which yields~\eqref{eq:1/N-complexity-r2-sigma-min}. +\end{proof} + +\Cref{thm:complexity-r2-convex-sigma-min} shows that removing the assumption that \(\eta_1 \geq \tfrac{1}{2}\) comes at the cost of a worse constant term in the complexity bound. +In addition, it is no longer clear whether \(\{\|x_{q_k} - x_\star\|\}\) is nonincreasing. +In fact, generally, it is not, as the following example shows. +Consider \(f(x) = e^{x / \alpha} + e^{-\alpha x}\) with \(\alpha = 10\), and \(x_k = 0\). +Here, \(x_\star = 10 \log(100) / 101 \approx 0.456\). +We have \(f(x_k) = 2\) and \(f'(x_k) = -99/10\). +With \(\sigma_k = 99 / 10 = 9.9\), we compute \(s_k = -\sigma_k^{-1} f'(x_k) = 1\), hence, \(x_k + s_k = 1\). +The model decrease is \(\sigma_k \|s_k\|^2 = 9.9\). +With \(f(1) \approx 1.1\), the actual decrease is \(2 - f(1) \approx 0.9\). +The step \(s_k\) will be accepted for any \(\eta_1 \lesssim 0.9 / 9.9 \approx 0.09\). +However, \(x_{k+1} = 1\) is further away from \(x_\star\) than \(x_k\) was. +With \(\eta_1 \geq \tfrac{1}{2}\), \(s_k\) would be rejected and \(\sigma_k\) would increase to compute a shorter step. + +\subsection{Sharpness of the Complexity Bound for Convex \(\boldmath f\)} + +Idea: use convex interpolation. + +\subsection{Worst-Case Complexity for Strongly Convex \(\boldmath f\)} + +\begin{theorem}% + \label{thm:complexity-r2-strongly-convex} + Assume that \(f\) is strongly convex with modulus \(\mu > 0\) and that \(\nabla f\) is Lipschitz-continuous with constant \(L\). 
+ Assume also that either + \begin{enumerate} + \item \(\sigma_0 > \gamma_2 L / (1 - \eta_2)\) and \(\eta_2 \geq \tfrac{1}{2}\), or + \item \(\sigma_0 \leq \gamma_2 L / (1 - \eta_2)\) and \(\eta_1 < \sigma_0 / (2 \mu)\) + \end{enumerate} + in \Cref{alg:r2}. + For each \(k \in \N\), + \begin{equation} + \label{eq:complexity-r2-strongly-convex} + f(x_{q_k + 1}) - f(x_\star) \leq C (f(x_{q_k}) - f(x_\star)), + \quad \text{where} \quad 0 < C \eqdef 1 - 2 \mu \eta_1 \sigma_{\max}^{-1} < 1. + \end{equation} +\end{theorem} + +\begin{proof} + For each successful iteration \(q_k\), + \[ + f(x_{q_k}) - f(x_{q_k + 1}) \geq \eta_1 \sigma_{q_k}^{-1} \|\nabla f(x_k)\|^2 \geq 2 \mu \eta_1 \sigma_{\max}^{-1} (f(x_{q_k}) - f(x_\star)), + \] + where we used the fact that \(\sigma_{q_k} \leq \sigma_{\max}\) and~\eqref{eq:strongly-convex-gradient}. + Equivalently, + \[ + f(x_{q_k + 1}) \leq f(x_{q_k}) - 2 \mu \eta_1 \sigma_{\max}^{-1} (f(x_{q_k}) - f(x_\star)). + \] + We obtain the desired result by subtracting \(f(x_\star)\) from both sides. + The two alternative assumptions ensure that \(0 < C < 1\) given~\eqref{eq:sigma_max-r2}. +\end{proof} + +In practice, small values of \(\eta_1\) and large values of \(\eta_2\) tend to perform well. +In view of the assumptions of \Cref{thm:complexity-r2-strongly-convex} and the fact that \(L\) is generally unknown, it is safe to select \(\eta_1 < \sigma_0 / (2 \mu)\) and \(\eta_2 \geq \tfrac{1}{2}\). + +\section{R2N}% +\label{sec:r2n} + +Method R2N \citep{diouane-habiboullah-orban-2024b} expands upon R2 by using a quadratic approximation of \(f\) in~\eqref{eq:phi-r2}: +\begin{equation}% + \label{eq:phi-r2n} + \varphi(s; x_k) \eqdef f(x_k) + \nabla f(x_k)^T s + \tfrac{1}{2} s^T B_k s, +\end{equation} +where \(B_k = B_k^T \in \R^{n \times n}\). + +At each iteration, a step \(s_k\) is computed based on model~\eqref{eq:model-r2} where \(\varphi(s; x_k)\) is as in~\eqref{eq:phi-r2n}. 
+Step \(s_k\) need not be an approximate minimizer of~\eqref{eq:model-r2}, but need only result in a decrease at least equal to that resulting from the \emph{Cauchy step} \(\skcp\), which is a step in the direction \(-\nabla f(x_k)\) with a steplength chosen to ensure decrease. +In \Cref{alg:r2n}, said steplength \(\nu_k\) is chosen as in \citep{diouane-habiboullah-orban-2024b}, but other choices are possible. +However, the precise definition of the Cauchy steplength does not affect the worst-case complexity. + +\begin{algorithm}[ht]% + \caption[caption]{% + \label{alg:r2n} + R2N\@: A modified Quasi-Newton method. + } + \begin{algorithmic}[1]% + \State Choose constants \(0 < \theta_1 < 1 < \theta_2 \), \(0 < \eta_1 \leq \eta_2 < 1\) and \(0 < \gamma_3 \leq 1 < \gamma_1 \leq \gamma_2\). + \State Choose \(x_0 \in \R^n\), \(\sigma_0 > 0\). + \For{\(k = 0, 1, \dots\)} + \State% + \label{alg:r2n:Bk} + Choose \(B_k = B_k^T \in \R^{n \times n}\). + \State% + \label{alg:r2n:step-nuk} + Compute \(\nu_k \eqdef \theta_1 / (\|B_k\| + \sigma_k)\). + \State% + \label{alg:r2n:step-switch} + Compute \(\skcp \eqdef -\nu_k \nabla f(x_k)\). + \State% + \label{alg:r2n:step-computation} + Compute a step \(s_k\) such that \(m(s_k; x_k, \sigma_k) \le m(\skcp; x_k, \sigma_k)\). + \State% + \label{alg:r2n:step-comparison} + If \(\|s_k\| > \theta_2 \; \|\skcp\|\), reset \(s_k = \skcp\). + \State% + \label{alg:r2n:step-rhok} + Compute the ratio + \[ + \rho_k := + \frac{ + f(x_k) - f(x_k + s_k) + }{ + \varphi(0; x_k) - \varphi(s_k; x_k) + }. + \] + \State% + \label{alg:r2n:step-accept}% + If \(\rho_k \geq \eta_1\), set \(x_{k+1} = x_k + s_k\). + Otherwise, set \(x_{k+1} = x_k\). 
+ \State% + \label{alg:r2n:step-update}% + Update the regularization parameter according to + \[ + \sigma_{k+1} \in + \begin{cases} + \begin{aligned} + & [\gamma_3 \sigma_k, \, \sigma_k] & & \text{ if } \rho_k \geq \eta_2, & & \quad \text{very successful iteration} \\ + & [\sigma_k, \, \gamma_1 \sigma_k] & & \text{ if } \eta_1 \leq \rho_k < \eta_2, & & \quad \text{successful iteration} \\ + & [\gamma_1 \sigma_k, \, \gamma_2 \sigma_k] & & \text{ if } \rho_k < \eta_1. & & \quad \text{unsuccessful iteration} + \end{aligned} + \end{cases} + \] + \EndFor + \end{algorithmic} +\end{algorithm} + +\subsection{Worst-Case Complexity} + +\begin{assumption}% + \label{asm:Bk-bounded} + There exists \(\kappa_B > 0\) such that \(\|B_k\| \leq \kappa_B\) for all \(k \in \N\). +\end{assumption} + +When \(f\) is convex, it stands to reason to choose \(B_k \succeq 0\) in \Cref{alg:r2n} so that~\eqref{eq:model-r2} is also convex in \(s\). + +\begin{assumption}% + \label{asm:sk-exact-r2n} + For all \(k \in \N\), \(s_k\) solves \((B_k + \sigma_k I) s_k = -\nabla f(x_k)\) in \Cref{alg:r2n} and \(B_k \succeq 0\). +\end{assumption} + +Under \Cref{asm:sk-exact-r2n}, +\begin{equation}% + \label{eq:diff-phi-r2n} + \varphi(0; x_k) - \varphi(s_k; x_k) = s_k^T (B_k + \sigma_k I) s_k - \tfrac{1}{2} s_k^T B_k s_k = s_k^T (\tfrac{1}{2} B_k + \sigma_k I) s_k \geq \sigma_k \|s_k\|^2. +\end{equation} + +Under \Cref{asm:Bk-bounded},~\eqref{eq:sigma_max-r2} continues to hold with the same value of \(\sigma_{\max}\).\footnote{The proof is the same as that of \citep[Theorem~\(4.1\)]{aravkin-baraldi-orban-2024}. 
+Model Assumption~\(6.1\) of \citep{diouane-habiboullah-orban-2024b} is unnecessarily loose under \Cref{asm:Bk-bounded} so that extra constants appear in the \(\sigma_{\max}\) that follows from \citep[Lemma~\(6.1\)]{diouane-habiboullah-orban-2024b}.} + +\subsection{Sharpness of the Complexity Bound} \section{Discussion and future work}% \label{sec:discussion} @@ -72,4 +546,12 @@ \subsection*{Acknowledgements} \bibliography{abbrv,report} \normalsize +\clearpage +\appendix +\section{Open Questions} + +\begin{enumerate} + \item Can we remove the assumption that there exists \(x_\star\) that attains \(f_{\mathrm{low}}\)? +\end{enumerate} + \end{document}