\documentclass[JEP,XML,SOM,Unicode,NoEqCountersInSection,published]{cedram}
\datereceived{2022-09-07}
\dateaccepted{2023-03-20}
\dateepreuves{2023-03-22}

\usepackage[scr=rsfs,cal=euler]{mathalfa}
\newcommand{\psfrac}[2]{\sfrac{(#1)}{#2}}

\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{example}{Example}
\newtheorem{corollary}{Corollary}

\theoremstyle{definition}
\newtheorem{remark}{Remark}
\newtheorem{question}{Question}

\newcommand{\dN}{\mathbb {N}}
\newcommand{\dZ}{\mathbb {Z}}
\newcommand{\dR}{\mathbb {R}}
\newcommand{\EE}{{\mathbb{E}}}
\newcommand{\PP}{{\mathbb{P}}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cF}{\mathcal {F}}
\newcommand{\lip}{\textup{\textsc{lip}}}
\newcommand{\dtv}{d_{\textup{\textsc{tv}}}}
\newcommand{\tv}{\textup{\textsc{tv}}}
\newcommand{\trel}{\mathrm{t}_{\textup{\textsc{rel}}}}
\newcommand{\tmix}{\mathrm{t}_{\textup{\textsc{mix}}}}
\newcommand{\pmin}{P_{\mathrm{min}}}
\DeclareMathOperator{\diam}{diam}
\DeclareMathOperator{\dist}{dist}

\datepublished{2023-03-24}
\begin{document}
\frontmatter
\title{Mixing time and expansion of non-negatively~curved Markov chains}

\author[\initial{F.} \lastname{Münch}]{\firstname{Florentin} \lastname{Münch}}
\address{Max-Planck-Institut für Mathematik in den Naturwissenschaften\\
Inselstr. 22, 04103 Leipzig, Germany}
\email{muench@mis.mpg.de}
\urladdr{https://sites.google.com/view/florentin-muench/home/}

\author[\initial{J.} \lastname{Salez}]{\firstname{Justin} \lastname{Salez}}
\address{Université Paris-Dauphine \& PSL, CEREMADE\\
Place du Maréchal de Lattre de Tassigny, F-75775 Paris Cedex 16, France}
\email{justin.salez@dauphine.psl.eu}
\urladdr{https://www.ceremade.dauphine.fr/~salez/}

\begin{abstract}
We establish three remarkable consequences of non-negative curvature for sparse Markov chains. First, their conductance decreases logarithmically with the number of states. Second, their displacement is at least diffusive until the mixing time. Third, they never exhibit the cutoff phenomenon. The first result provides a nearly sharp quantitative answer to a classical question of Ollivier, Milman \& Naor. The second settles a conjecture of Lee and Peres for graphs with non-negative curvature. The third offers a striking counterpoint to the recently established cutoff for non-negatively curved chains with uniform expansion.
\end{abstract}

\subjclass{60J10}

\keywords{Markov chains, random walks, expansion, discrete curvature, mixing times, cutoff phenomenon}

\altkeywords{Chaînes de Markov, marches aléatoires, expansion, courbure discrète, temps de mélange, phénomène de cutoff}

\alttitle{Temps de mélange et expansion des chaînes de Markov en courbure positive}

\begin{altabstract}
Nous établissons trois conséquences remarquables de la courbure positive pour les chaînes de Markov. D'abord, la conductance de ces chaînes décroît logarithmiquement avec la taille de l'espace. Ensuite, leur déplacement est diffusif jusqu'au temps de mélange. Enfin, le phénomène de cutoff ne peut pas se produire. Le premier résultat fournit une réponse quantitative presqu'optimale à une question classique d'Ollivier, Milman et Naor. Le second confirme une conjecture de Lee et Peres, dans le cas particulier des graphes à courbure positive. Le troisième offre un contraste frappant avec les résultats positifs récents concernant le cutoff pour les chaînes de Markov ayant à la fois une courbure positive et une expansion uniforme.
\end{altabstract}
\maketitle
\vspace*{-\baselineskip}
\tableofcontents
\mainmatter

\section{Introduction}
In Riemannian geometry, a lower bound on the Ricci curvature classically implies an array of powerful estimates for the underlying manifold, including diameter bounds, volume growth, comparison principles, splitting theorems, spectral estimates, and concentration inequalities \cite{jost2017a}. Over the past decade, those remarkable implications have motivated the development of non-smooth analogues of curvature that can be applied to discrete geometries
\cite{schmuckenschlager1998curvature,lin2010ricci,
forman2003bochner,erbar2012ricci,ollivier2009ricci,
jost2021characterizations,najman2017modern, munch2017ollivier}.
In particular, \hbox{Ollivier}~\cite{ollivier2009ricci} proposed a transportation-based definition that makes sense on arbitrary metric spaces, hence in particular on graphs and Markov chains.
Informally, a metric space has non-negative Ollivier-Ricci curvature if balls are at least as close to each other as their centers are. The simplest example of a finite non-negatively curved graph is a cycle. It is classical that this graph has poor expansion, that the random walk on it exhibits a diffusive behavior, and that its mixing time is of the same order as the inverse spectral gap. The aim of the present paper is to show that those three properties are in fact shared by all sparse Markov chains with non-negative curvature. Before we state our results in full generality, let us describe their content in the simple but important special case of random walk on graphs.

\subsubsection*{Non-negatively curved graphs.}Let $G=(V,E)$ be a finite simple graph, and let~$P$ denote the random-walk transition matrix of $G$. Thus, $P$ acts on any \hbox{function $f{:}\, V\!\to\!\dR$} as follows:
\begin{equation*}
(Pf)(x) = \frac{1}{\deg(x)}\sum_{y\sim x}f(y),
\end{equation*}
where the notation $y\sim x$ indicates that $\{x,y\}\in E$. Following Ollivier \cite{ollivier2009ricci,ollivier2010survey}, we~say that $G$ has \emph{non-negative curvature} if $P$ contracts the Lipschitz norm, \ie
\begin{equation*}
\|Pf\|_{\lip} \le \|f\|_{\lip},
\end{equation*}
where $\|f\|_{\lip}:=\max_{y\sim x }{|f(y)-f(x)|}$.
This fundamental property is satisfied by many natural families of graphs, including all Abelian Cayley graphs and, more generally, all Cayley graphs whose generating set is conjugacy-invariant.
Additional details, including a more effective formulation in terms of couplings, will be provided in the next section when we discuss curvature for general Markov chains.

\subsubsection*{Expansion.} Our first result concerns the expansion of graphs with non-negative curvature. Write $\partial A$ for the edge-boundary of a set $A\subseteq V$, and $\deg(A)$ for the sum of the degrees of all vertices in $A$. With this notation, the \emph{conductance} (also known as Cheeger constant, or bottleneck ratio) of $G$ is
\begin{equation*}
\Phi := \min\Bigl\{\frac{|\partial A|}{\deg(A)}\colon A\subseteq V,0<\deg(A)\le |E|\Bigr\}.
\end{equation*}
Sequences of bounded-degree graphs whose size diverges but whose conductance remains bounded away from zero are famously known as \emph{expanders}. Whether such graphs can have non-negative curvature is an important question, which explicitly appears in a survey by Ollivier \cite[Probl.\,T]{ollivier2010survey}, and is therein attributed to Milman and Naor. The problem has remained open until very recently, when a negative answer was given by the second author \cite{salez2021sparse}. Specifically, the latter used the notion of entropy for graph limits to prove that non-negative curvature and expansion are incompatible “at infinity”, and the conclusion was then transferred to finite graphs using a compactness argument. A clear drawback of this approach is its non-quantitative nature. In particular, the second author asked for a direct, quantitative relation between volume, degree and expansion on non-negatively curved graphs. This is precisely the content of our first main result.
\begin{theorem}[Poor expansion]\label{th:nonexp}If $G$ has non-negative curvature, then
\begin{equation*}
\Phi \le c\sqrt{\frac{d\log d}{\log n}},
\end{equation*}
where $n$ is the number of vertices, $d$ the maximum degree, and $c$ a universal constant.
\end{theorem}
In other words, large graphs can not simultaneously enjoy non-negative curvature and uniform expansion unless their maximum degree grows at least like $\log n/\log\log n$. We note that this is sharp up to the $\log\log n$ correction. Indeed, a celebrated result of Alon and Roichman asserts that random Cayley graphs with logarithmic degrees have uniform expansion with high probability \cite{MR1262979}, and specializing this result to random Cayley graphs of Abelian groups produces examples of non-negatively curved graphs with logarithmic degrees and uniform expansion.

\subsubsection*{Mixing times.} Our second result is a complete determination of the order of magnitude of the mixing time of all vertex-transitive graphs with bounded degrees and non-negative curvature. Suppose that $G$ is vertex-transitive, with degree $d$ and volume $n$. Fix an arbitrary origin $x\in V$ (the choice is irrelevant, by transitivity), and consider the lazy simple random walk on $G$ started at $x$, \ie the Markov chain $(X_t)_{t\ge 0}$ on $V$ with initial condition $X_0=x$ and transition matrix $(P+I)/2$. The \emph{mixing time} of $G$ is a fundamental graph-theoretical parameter, defined as follows \cite{MR3726904}:
\begin{equation*}
\tmix := \min\Bigl\{t\in\dN\colon \max_{A\subseteq V}\Bigl|\PP(X_t\in A)-\frac{|A|}{n}\Bigr|\le \frac 14\Bigr\}.
\end{equation*}
An important, closely related quantity is the so-called \emph{relaxation time}
\begin{equation*}
\trel := \frac{1}{1-\lambda_{2}},
\end{equation*}
where $1=\lambda_1>\lambda_2\ge\ldots\ge\lambda_n$ denote the ordered eigenvalues of $P$. It is classical that $\tmix \ge \trel$, and that this inequality can be off by a factor as large as $\log n$, as is the case for expanders (see \cite{MR3726904}).

\begin{theorem}[Mixing times]\label{th:graphmix}All vertex-transitive graphs with non-negative curvature satisfy
\begin{equation*}
\tmix \ \asymp_d \ \trel \ \asymp_d \ \frac{1}{\Phi^2},
\end{equation*}
where the notation $a\asymp_d b$ means that the ratio $a/b$ is bounded from above and below by positive constants that depend only on the degree $d$.
\end{theorem}
This has the following remarkable consequence. For a sequence of graphs $(G_n)_{n\ge 1}$, the condition
\begin{equation*}
\frac{\tmix(G_n)}{\trel(G_n)} \xrightarrow[n\to\infty]{} +\infty
\end{equation*}
is known as the \emph{product condition}. It is well-known to be necessary (see \cite[Prop.\,18.4]{MR3726904}) for the occurrence of the so-called \emph{cutoff phenomenon}, a celebrated but still mysterious phase transition in the approach to equilibrium of certain Markov chains (see \cite[Chap.\,18]{MR3726904} for the precise definition). Thus, Theorem \ref{th:graphmix} implies that vertex-transitive graphs with fixed degree and non-negative curvature never exhibit cutoff. This stands in stark contrast with recent results due to the second author, showing that many non-negatively curved graphs with logarithmic degree do exhibit cutoff \cite{salez2021cutoff}. Interestingly, the conclusion of Theorem \ref{th:graphmix} is known to hold for fixed-degree Cayley graphs of \emph{moderate growth} \cite{MR1254308}. This geometric condition was later shown to be equivalent to the much simpler requirement that the diameter is algebraically large in the volume~\cite{MR3439705} (see the recent paper \cite{MR4253426} for an extension to vertex-transitive graphs). This raises the following question. A positive answer would be surprisingly strong, but we have not been able to produce any counter-example.
\begin{question}[Moderate growth?]Do all non-negatively curved graphs with degree at most $d$ satisfy
\begin{equation*}
\diam(G) \ge \varepsilon_d n^{\varepsilon_d},
\end{equation*}
where $n$ is the number of vertices, and $\varepsilon_d>0$ a constant depending only on $d$?
\end{question}

Indeed, this question was answered affirmatively in case of a modified Barky Emery curvature dimension condition in \cite{bauer2015li}, and later by the first author in case of the weaker, unmodified Bakry Emery curvature dimension condition \cite{munch2019li}.

\subsubsection*{Diffusivity.}Finally, our last result concerns the speed of random walk on vertex-transitive graphs with non-negative curvature. Many infinite graphs such as the line $\dZ$ are known to exhibit a \emph{diffusive} behavior, in the sense that the typical graph distance between $X_t$ and $X_0$ grows like $\sqrt{t}$. On a finite graph, the distance to the starting point can of course no longer grow indefinitely with time, but one may still hope for a diffusive behavior on \emph{appropriate time-scales}. This vague statement was recently given a powerful rigorous content by Lee and Peres \cite{MR3127886}, who showed that the simple random walk on any finite vertex-transitive graph satisfies the diffusive lower-bound
\begin{equation*}
\EE \bigl[\dist(X_0,X_t) \bigr] \ge c\sqrt{\frac{t}{d}}
\end{equation*} for all $t\in[d,\trel]$, where $c>0$ is a universal constant. The graph $\dZ_2^{d}\times \dZ_n$ shows that this lower-bound is sharp. However, the authors conjectured that the time-scale on which the diffusive behavior remains valid should actually be much longer, namely, of order $\tmix$ \cite[Conj.\,2.5]{MR3127886}.
Our second result confirms this prediction in the case of non-negatively curved graphs.
\begin{theorem}[Diffusive lower-bound]\label{th:speed}If $G$ is vertex-transitive and non-negatively curved, then
\begin{equation*}
\EE \bigl[\dist(X_0,X_t) \bigr] \ge c\sqrt{\frac{t}{d}},
\end{equation*}
for $t\in[d,\tmix]$, where $c$ is a universal constant.
\end{theorem}

We emphasize that our estimates are not restricted to simple random walks on graphs. Analogous results will be stated for general Markov chains with non-negative curvature. In particular, neither reversibility, nor even the symmetry of the support of $P$ are actually required for a version of Theorem \ref{th:nonexp} to hold.
Ollivier curvature with respect to a directed metric has been explored before in \cite{yamada2016ricci,ozawa2020geometric,ozawa2022heat,
eidi2020ollivierJournal}. However the specific consequences of non-negative curvature seem to be unexplored
to the best of our knowledge. Our general results are exposed in Section \ref{sec:main} below, and are proved in Section \ref{sec:proofs}.

\section{Main results}
\label{sec:main}

In the remainder of the paper, we consider an arbitrary, irreducible stochastic matrix $P$ on a finite state space $V$. A natural measure of the ``distance'' from a state $x\in V$ to a state $y\in V$ is the minimum number of transitions needed for the chain to move from $x$ to $y$, namely
\begin{equation*}
\dist(x,y) := \min\{k\in\dN\colon P^k(x,y)>0\}.
\end{equation*}
This quantity is not necessarily symmetric, but it clearly satisfies the two other axioms of a distance. We may then use optimal transport to extend this notion to probability measures as follows: write $\cP(V)$ for the set of probability measures on $V$, and define $W\colon \cP(V)\times\cP(V)\to\dR_+$ by
\begin{equation*}
W(\mu,\nu) := \inf_{\substack{X\sim\mu\\Y\sim\nu}}
\EE\bigl[\dist(X,Y)\bigr],
\end{equation*}
where the infimum runs over all possible random pairs $(X,Y)$ whose marginals are $\mu$ and $\nu$. Again, this quantity is not necessarily symmetric, but it always satisfies the two other axioms of a distance.
Due to Kantorovich duality \cite[Th.\,5.10 \& Particular Case 5.4]{villani2009optimal}, we can write
\[
W(\mu,\nu) = \sup \left\{ \nu f - \mu f : \|f\|_{\lip} \leq 1 \right\}
\]
with
\[
\|f\|_\lip := \sup_{y\sim x} f(y)-f(x).
\]
Finally, we say that $P$ has \emph{non-negative curvature} if it is a contraction under $W$, \ie
\begin{equation}
\label{def:curvature}
\forall \mu,\nu\in\cP(V),\quad
W(\mu P,\nu P) \le W(\mu,\nu).
\end{equation}
Ollivier curvature with a non-symmetric distance has been studied in \cite{yamada2016ricci,ozawa2020geometric,eidi2020ollivierJournal,ozawa2022heat}.
Due to Kantorovich duality and as in the introduction, non-negative curvature is equivalent to
$
\|Pf\|_\lip \leq \|f\|_\lip.
$
By convexity, it is in fact sufficient to check property \eqref{def:curvature} on Dirac masses $\mu=\delta_x,\nu=\delta_y$, $x,y\in V$. Moreover, by the triangle inequality, we may further restrict our attention to the case where $y$ is a neighbor of $x$ (by which we mean that $\dist(x,y)=1$ and which we denote by $y\sim x$), \ie
\begin{equation}
\label{def:local}
\forall y\sim x, \quad
W\left(P(x,\cdot),P(y,\cdot)\right) \le 1.
\end{equation}
This local condition is easily verified in practice. For example, it holds for random walks on Abelian groups and, more generally, random walks with a conjugacy-invariant support, as we now explain.
\begin{example}[Random walks on groups] Suppose that $V$ is a group, and fix $\mu\in\cP(V)$. By definition, the \emph{random walk} on $V$ with increment distribution $\mu$ is the Markov chain whose transitions correspond to left-multiplication by a $\mu-$distributed element, \ie
$
P(x,y) := \mu(y x^{-1}).
$
This chain has non-negative curvature as soon as the set $S:=\{z\in V\colon\mu(z)>0\}$ is \emph{conjugacy-invariant}, \ie
\begin{equation}
\label{assume:conjugacy}
\forall z\in V,\quad z S z^{-1} = S.
\end{equation}
Indeed, this assumption implies that $\dist(zx,zy)=\dist(x,y)$ for all $x,y,z\in V$. In~particular, if $Z$ denotes a random variable with law $\mu$, then the ``obvious'' coupling of $P(x,\cdot)$ and $P(y,\cdot)$ given by $X:=Zx$ and $Y:=Zy$ verifies \eqref{def:local}. Note that the condition \eqref{assume:conjugacy} trivially holds if the group is Abelian. An emblematic non-Abelian example is the transposition walk on the symmetric group \cite{MR3936154}.
\end{example}

To avoid periodicity issues, we now assume that $P$ is lazy, \ie $P(x,x)\ge 1/2$ for all $x\in V$. This is more than enough to guarantee that the chain mixes, \ie
\begin{equation*}
\forall x,y\in V,\quad P^t(x,y) \xrightarrow[t\to\infty]{} \pi(y),
\end{equation*}
where $\pi=\pi P$ denotes the unique invariant distribution. Quantifying the speed at which this convergence to equilibrium occurs is a fundamental question, with many applications \cite{MR3726904,MR2341319}. Formally, this amounts to estimating the so-called \emph{mixing time}:
\begin{equation*}
\tmix := \min\{t\ge 0\colon \dtv(t)\le \sfrac{1}{4}\}, \quad\textrm{ where }\quad \dtv(t) := \max_{x\in V}\bigl\|P^t(x,\cdot)-\pi\bigr\|_{\tv}.
\end{equation*}
Here $\|\mu-\nu\|_{\tv}$ denotes the \emph{total-variation distance} between $\mu,\nu\in\cP(V)$, defined as
\begin{equation*}
\left\|\mu-\nu\right\|_{\tv} = \max_{A\subseteq V}|\mu(A)-\nu(A)| =\frac{1}{2}\sum_{x\in V}|\mu(x)-\nu(x)| =\inf_{\substack{X\sim \mu\\Y\sim\nu}}\PP(X\ne Y),
\end{equation*}
where the infimum in the last expression runs over all possible couplings $(X,Y)$ of $\mu$ and $\nu$. Thus, a natural way to estimate mixing times is to exhibit good couplings, and this is precisely where curvature enters the play. Indeed, an elementary but crucial reformulation of the non-negative curvature assumption \eqref{def:curvature} is that the trajectories $(X_t)_{t\ge 0}$ and $(Y_t)_{t\ge 0}$ emanating from any two states $X_0=x$ and $Y_0=y$ can be coupled in such a way that their distance $t\mapsto \dist(X_t,Y_t)$ forms a super-martingale. When combined with an appropriate diffusive estimate for super-martingales, this observation turns out to imply the following $O(1/\sqrt{t})$ decay for the total-variation distance between the laws of $X_t$ and $Y_t$.
\begin{theorem}[Total-variation decay]\label{th:tvdecay}If $P$ is lazy and non-negatively curved, then,
\begin{equation*}
\left\|P^t(x,\cdot)-P^t(y,\cdot)\right\|_{\tv} \le \dist(x,y)\sqrt{\frac{10\,}{{(t+1)\pmin }}},
\end{equation*}
for all $x,y\in V$ and all $t\ge 0$, where $\pmin$ denotes the smallest non-zero entry of $P$.
\end{theorem}
Variants of this result have appeared in a number of works, under various forms \cite{MR2316551,lin2015equivalent,munch2019non,salez2021sparse}. However, all proofs use the fact that the increments of the process $t\mapsto \dist(X_t,Y_t)$ are uniformly bounded (by $2$), and this property may dramatically fail in our more general setup where the metric is directed. Nevertheless, the conclusion turns out to remain valid, and a proof is presented in Section \ref{sec:tvdecay}. The most ``obvious'' application of Theorem \ref{th:tvdecay} consists in taking a maximum over all states $x,y\in V$ to obtain the following mixing-time estimate, which is new in our directed setup.
\begin{corollary}[Diameter bound]\label{co:diam}If $P$ is lazy and non-negatively curved, then
\begin{equation*}
\tmix \le \frac{160\,(\diam)^2}{\pmin},
\end{equation*}
where $\diam:=\max_{x,y}\dist(x,y)$ denotes the diameter of the state space.
\end{corollary}
While interesting in its own right, this estimate is actually not the key to the new results mentioned in the Introduction. Our main finding is that a significantly finer estimate can be deduced from Theorem \ref{th:tvdecay} provided we replace the \emph{worst-case} mixing time by its \emph{average} version:
\begin{equation*}
\tmix^{\sharp} := \min\{t \ge 0\colon \dtv^\sharp(t)\le \sfrac 14\},\quad\textrm{ where }\quad \dtv^\sharp(t):=\sum_{x\in V}\pi(x)\bigl\|P^t(x,\cdot)-\pi\bigr\|_{\tv}.
\end{equation*}
\begin{remark}[Transitive chains]Obtaining a bound on $\tmix^\sharp$ rather than $\tmix$ is not a huge drawback. For example, we have $\tmix^\sharp=\tmix$ for all random walks on groups and, more generally, for all \emph{transitive} chains ($P$ is transitive if for each $x,y \in V$, there is a bijection $f\colon V\to V$ which maps $x$ to $y$ and preserves the transition kernel, \ie $P(f(u),f(v))=P(u,v)$ for all $u,v \in V$).
\end{remark}
Throughout the paper, we let $X=(X_t)_{t\ge 0}$ denote a Markov chain with transition matrix $P$ starting from stationarity ($X_0\sim \pi$). Our main new estimate on $\tmix^\sharp$ depends on two statistics of this chain. The first one is the \emph{mean displacement} in $t$ steps: \begin{equation*}
\EE \bigl[\dist(X_0,X_t) \bigr] = \sum_{x,y\in V}\pi(x)P^t(x,y)\dist(x,y).
\end{equation*}
The second is the \emph{escape probability} in $t$ steps, \ie the conductance of $P^t$:
\begin{align*}
\Phi(P^t) & = \min\Bigl\{\PP\left(X_t\notin A|X_0\in A\right)\colon A\subseteq V,0<\pi(A)\le \frac 12 \Bigr\}\\
& = \min\biggl\{\frac{1}{\pi(A)}\sum_{x\in A}\sum_{y\in A^c}\pi(x)P^t(x,y)\colon A\subseteq V,0<\pi(A)\le \frac 12\biggr\}.
\end{align*}
\begin{theorem}[Main estimate]\label{th:main} If $P$ is lazy and non-negatively curved, then
\begin{equation}
\label{infimum}
\tmix^\sharp \le \frac{160}{\pmin}\,\inf_{t\ge 1} \Bigl\{\frac{\EE[\dist(X_0,X_t)]}{\Phi(P^t)} \Bigr\}^2,
\end{equation}
where we recall that $\pmin$ denotes the smallest non-zero entry of $P$.
\end{theorem}

Theorem \ref{th:main} has a number of notable consequences, which we now enumerate. The simplest one is an ``average'' version of Corollary \ref{co:diam}, obtained by sending $t\to\infty$ in the infimum \eqref{infimum}:
\begin{equation}
\label{effectivediam}
\tmix^\sharp \le \frac{640\,(\diam^\sharp)^2}{\pmin},
\end{equation}
where $
\diam^\sharp := \sum_{x,y\in V}\pi(x)\pi(y)\dist(x,y)
$ denotes the \emph{effective diameter}. Note that the latter can be significantly smaller than the true diameter appearing in Corollary~\ref{co:diam} (consider, e.g., the biased random walk on a segment). A much more refined consequence of Theorem \ref{th:main} is obtained by taking $t=1$ in the infimum \eqref{infimum}: writing $\Phi=\Phi(P)$, we readily obtain the following surprising bound.
\begin{corollary}[Conductance bound]\label{co:conductance}If $P$ is lazy and non-negatively curved, then
\begin{equation*}
\tmix^\sharp \le \frac{40}{\pmin\Phi^2}.
\end{equation*}
\end{corollary}
This offers a considerable improvement over \eqref{effectivediam} in situations where the effective diameter diverges while the conductance remains bounded away from $0$ (consider, e.g., random walk on a random Abelian Cayley graph with logarithmic degree). More importantly, by virtue of an elementary combinatorial lower-bound on $\tmix^\sharp$ (see, e.g., \cite[\S 7.1.1]{MR3726904}), Corollary \ref{co:conductance} implies the quantitative non-existence of non-negatively curved expanders promised in Theorem \ref{th:nonexp}. For general chains, we will show that $\tmix^\sharp$ can be bounded below by $\diam^\sharp$, leading to the following result.
\begin{corollary}[Poor expansion]\label{co:expansion}If $P$ is non-negatively curved, then
\begin{equation*}
\Phi \le \frac{19}{\sqrt{\pmin\diam^\sharp}}.
\end{equation*}
\end{corollary}
Thus, non-negatively curved chains which are large ($\diam^\sharp\gg 1$) and sparse ($\pmin$ bounded away from $0$) must have poor expansion ($\Phi\ll 1$). This constitutes a precise quantitative answer to the Markov-chain generalization of the question of Milman, Naor and Ollivier \cite[Probl.\,T]{ollivier2010survey}. Note that there are examples of sparse chains with non-negative curvature and arbitrarily many states (consider, e.g., a biased random walk on a segment). However, the fact that their effective diameter is bounded forces their stationary measure to concentrate on a bounded number of states.

Corollary \ref{co:conductance} is sharp in the important case where $P$ is transitive, reversible and sparse. Indeed, we have the classical lower-bound $\tmix\ge\trel$, where $\trel:=(1-\lambda_2)^{-1}$ denotes the inverse spectral gap of $P$ (see, e.g., \cite{MR3726904}), and the first author established in \cite{munch2019non} the Buser inequality
\begin{equation*}
\trel \ge \frac{\pmin}{12\Phi^2},
\end{equation*}
for any non-negatively curved, reversible chain. When combined with Corollary \ref{co:conductance}, this yields the following result, of which Theorem \ref{th:graphmix} is clearly a special case.
\begin{corollary}[No cutoff for sparse chains]\label{co:cutoff}Fix $p\in(0,1)$. Then, any lazy reversible transitive chain with non-negative curvature and $\pmin\ge p$ satisfies
\begin{equation*}
\tmix \ \asymp_p \ \trel \ \asymp_p \ \frac{1}{\Phi^2},
\end{equation*}
where the notation $a\asymp_p b$ means that the ratio $a/b$ is bounded from above and below by positive constants that depend only on $p$. In particular, no family of such chains can exhibit cutoff.
\end{corollary}
An important observation here is that the transitivity of the chain is only used to ensure that $\tmix=\tmix^\sharp$. Consequently, Corollary \ref{co:cutoff} extends to any collection of chains which are ``spatially homogeneous'' in the mild sense that $\tmix\asymp\tmix^\sharp$.

Finally, a last notable consequence of Theorem \ref{th:main} is that the expected displacement of the chain over short time-scales is already substantial. More precisely, assuming that $P$ is reversible, we have
\begin{equation*}
\Phi(P^t) \ge \frac{1-\lambda_2^t}{2}\ge \frac{1-e^{-\sfrac{t}{\trel}}}{2},
\end{equation*}
and the right-hand side is at least $\psfrac{1-e^{-1}}{2}$ for all $t\ge \trel$, yielding the following estimate.

\begin{corollary}[Fast escape]\label{co:escape}If $P$ is lazy, reversible and non-negatively curved, then
\begin{equation*}
\forall t\ge\trel,\quad \EE \bigl[\dist(X_0,X_t) \bigr] \ge \frac{\sqrt{\tmix^\sharp\pmin}}{41},
\end{equation*}
where we recall that $\EE \bigl[\dist(X_0,X_t) \bigr]=\sum_{x,y}\pi(x)P^t(x,y)\dist(x,y)$.
\end{corollary}
For reversible transitive chains, Lee and Peres \cite{MR3127886} proved the diffusive lower-bound
\begin{equation*}
\EE \bigl[\dist(X_0,X_t) \bigr] \ge c\sqrt{t\pmin},
\end{equation*} for all $t$ such that $\pmin^{-1}\le t\le \trel$, where $c>0$ is a universal constant. They conjectured that this diffusive lower-bound should remain valid until the mixing time \cite[Conj.\,2.5]{MR3127886}. Corollary \ref{co:escape} readily implies that this is true in the non-negatively curved case, and Theorem \ref{th:speed} follows as a special case.

\section{Proofs}\label{sec:proofs}
Section \ref{sec:conductance} below is devoted to the proof of our main result, namely the relation between conductance, displacement and mixing times (Theorem \ref{th:main}). The latter exploits the diffusive total-variation decay of non-negatively curved chains (Theorem~\ref{th:tvdecay}), which will be proved independently in Section \ref{sec:tvdecay}.
Once Theorem \ref{th:main} is established, all announced corollaries follow effortlessly, except for Corollary~\ref{co:expansion}: the latter requires a lower bound on the average mixing time in terms of the effective diameter, which we prove in Section \ref{sec:diameter}.

\subsection{Mixing time vs. conductance}
\label{sec:conductance}

In this section, we prove Theorem \ref{th:main}.
We will make crucial use of Theorem \ref{th:tvdecay}, as well as the following $L^1$ version of Cheeger's inequality. An important remark is that the latter holds without any assumption on the transition matrix $P$, and will thus also apply to powers of $P$.
\begin{lemma}[$L^1$ analogue of Cheeger's inequality]\label{lm:L1} If $f\colon V\to\dR$ satisfies $\pi f=0$, then,
\begin{equation*}
\sum_{x\in V}\pi(x)\left|f(x)\right| \le \frac{1}{\Phi(P)}\sum_{x,y\in V}\pi(x)P(x,y)\left|f(y)-f(x)\right|.
\end{equation*}
\end{lemma}
\begin{proof} Upon replacing $f$ with $-f$, we may assume that $\pi(f\ge 0)\ge 1/2$. For any $t\ge 0$, we may take $A=\{f\ge t\}$ in the definition of $\Phi(P)$ to obtain
\begin{equation*}
\Phi(P) \sum_{x\in V}\pi(x){\bf 1}_{(f(x)\ge t)} \le \sum_{x,y\in V}\pi(x)P(x,y){\bf 1}_{(f(y)\le t<f(x))}.
\end{equation*}
Integrating over $t\in\dR_+$ and interchanging the sum and integral, we obtain
\begin{equation*}
\Phi(P)\sum_{x\in V}\pi(x)f_+(x) \le \sum_{x,y\in V}\pi(x)P(x,y)(f_+(x)-f_+(y))_+,
\end{equation*}
where $a_+:=\max(0,a)$ denotes the positive part of $a$. Now, since $f$ is centered under~$\pi$, the left-hand side does not change if we replace $f_+(x)$ by $|f(x)|/2$. Similarly, since any gradient is centered under the measure $(x,y)\mapsto \pi(x)P(x,y)$, the right-hand side does not change if we replace $\left(f_+(x)-f_+(y)\right)_+$ by $\left|f_+(x)-f_+(y)\right|/2$. Finally, observe that $\left|f_+(x)-f_+(y)\right| \le|f(x)-f(y)|$.
\end{proof}
\begin{proof}[Proof of Theorem \ref{th:main}]Fix $s\ge 1$. Lemma \ref{lm:L1} applied to $P^s$ instead of $P$ gives
\begin{equation*}
\sum_{x\in V}\pi(x)\left|f(x)\right| \le \frac{1}{\Phi(P^s)}\sum_{x,y\in V}\pi(x)P^s(x,y)|f(x)-f(y)|,
\end{equation*}
for any centered observable $f\colon V\to\dR$. Now, fix $t\in\dN$ and $z\in V$, and let us apply this to the observable $f(x):=P^t(x,z)-\pi(z)$, which is centered because $\pi P^t=\pi$. We readily obtain
\begin{equation*}
\sum_{x\in V}\pi(x)\left|P^t(x,z)-\pi(z)\right| \le \frac{1}{\Phi(P^s)}\sum_{x,y\in V}\pi(x)P^s(x,y)\bigl|P^t(x,z)-P^t(y,z)\bigr|.
\end{equation*}
We may now sum over all $z\in V$ and use Theorem \ref{th:tvdecay} to get
\begin{align*}
\dtv^\sharp(t) & \le \frac{1}{\Phi(P^s)}\sum_{x,y\in V}\pi(x)P^s(x,y)\left\|P^t(x,\cdot)-P^t(y,\cdot)\right\|_{\textsc{tv}}\\
& \le \sqrt{\frac{10}{(t+1)\pmin}}\,\frac{\EE\left[\dist(X_0,X_s)\right]}{\Phi(P^s)}.
\end{align*}
Finally, choosing $t$ so that the right-hand side is smaller than $1/4$ shows that
\begin{equation*}
\tmix^\sharp \le \frac{160}{\pmin}\Bigl(\frac{\EE\left[\dist(X_0,X_s)\right]}{\Phi(P^s)} \Bigr)^2.
\end{equation*}
The result follows by taking an infimum over all $s\ge 1$.
\end{proof}

\subsection{Effective diameter vs. conductance}\label{sec:diameter}
Here we prove Corollary \ref{co:expansion}. We will use the following concentration inequality.
\begin{lemma}[Concentration inequality]\label{lm:concentration}For any $f\colon V\to\dR$ and $a>0$,
\begin{equation*}
\pi\left(f\ge \pi f+ a\right) \le \frac{1}{a\Phi}\min \Bigl(\max_{y\sim x}(f(y)-f(x))_+,\max_{y\sim x}(f(x)-f(y))_+ \Bigr),
\end{equation*}
and the same holds for $\pi(f\le \pi f- a)$.
\end{lemma}
We remark that if ``$\sim$'' is symmetric, then the minimum is the Lipschitz constant of $f$.

\begin{proof}
Upon replacing $f$ by $f-\pi f$ if necessary, we may assume that $f$ is centered under $\pi$, \ie $\pi f=0$. Now, we use Markov's inequality and Lemma \ref{lm:L1} to write
\begin{align*}
\pi\left(f\ge a\right) & \le \frac{1}{a}\sum_{x\in V}\pi(x)f_+(x) = \frac{1}{2a}\sum_{x\in V}\pi(x)|f(x)|\\
& \le \frac{1}{2a\Phi}\sum_{x,y\in V}\pi(x)P(x,y)\left|f(x)-f(y)\right|\\
& = \frac{1}{a\Phi}\sum_{x,y\in V}\pi(x)P(x,y)\left(f(x)-f(y)\right)_+\\
& \le \frac{1}{a\Phi}\max_{y\sim x}(f(x)-f(y))_+.
\end{align*}
Note that, since the function $(x,y)\mapsto f(x)-f(y)$ is centered under the measure $(x,y)\mapsto \pi(x)P(x,y)$, the integral of its positive part equals that of its negative part. This establishes the first claim, and the second is obtained by replacing $f$ with $-f$.
\end{proof}
We will use this lemma to prove the following lower-bound on the mixing time.
\begin{lemma}[Diameter lower-bound]\label{lm:diam}For any lazy chain $P$, we have
\[
\tmix^\sharp \ge \diam^\sharp-\frac{4}{\Phi}.
\]
\end{lemma}
\begin{proof}Let us first note that for a lazy chain, Lemma \ref{lm:concentration} holds with the better constant $2\Phi$ instead of $\Phi$ (just apply the lemma to the non-lazy chain $2P-I$). Now, fix $x\in V$ and $t\in\dN$, and write $B_x(t):=\{y\in V\colon \dist(x,y)\le t\}$. By definition,
\begin{align*}
\|P^t(x,\cdot)-\pi\|_\tv & = \max_{A\subseteq V}|P^t(x,A)-\pi(A)|\\
& \ge 1-\pi\left(B_x(t)\right)\\
& = 1-\PP \bigl(\dist(x,Y)\le t \bigr),
\end{align*}
where $Y$ denotes a $\pi-$distributed random variable. Averaging over $x\in V$, we obtain
\begin{equation*}
\dtv^\sharp(t) \ge 1-\PP\bigl(\dist(X,Y)\le t \bigr),
\end{equation*}
where $X$ is $\pi-$distributed and independent of $Y$. The claim follows if we can show
\begin{equation}
\label{toshow}
\PP \bigl(\dist(X,Y)\le \diam^\sharp-\sfrac{4}{\Phi} \bigr) \le \frac 12.
\end{equation}
Now, for fixed $x\in V$, the function $f\colon y\mapsto \dist(x,y)$ satisfies $f(z)\le f(y)+1$ whenever $z\sim y$, by the triangle inequality. Thus, Lemma \ref{lm:concentration} ensures that
\begin{equation*}
\PP\Bigl(\dist(X,Y)\le \EE \bigl[\dist(X,Y)|X \bigr]-\sfrac{2}{\Phi}\Bigr) \le \frac{1}{4}.
\end{equation*}
On the other hand, by the triangle inequality again, the function $f\colon x\mapsto \EE[\dist(x,Y)]$ satisfies $f(x)\le f(y)+1$ whenever $y\sim x$, and $\pi f=\EE[\dist(X,Y)]=\diam^\sharp$. Thus, Lemma \ref{lm:concentration} yields
\begin{equation*}
\PP\left(\EE \bigl[\dist(X,Y)|X \bigr]\le \diam^\sharp-\sfrac{2}{\Phi}\right) \le \frac{1}{4}.
\end{equation*}
Combining those two estimates readily yields \eqref{toshow}.
\end{proof}

\begin{proof}[Proof of Corollary \ref{co:expansion}]If $P$ is lazy and non-negatively curved, then Corollary \ref{co:conductance} and Lemma \ref{lm:diam} give
\begin{equation*}
\diam^\sharp \le \frac{40}{\Phi^2\pmin}+\frac{4}{\Phi} \le \frac{41}{\Phi^2\pmin},
\end{equation*}
because $\pmin,\Phi\le \frac 12$. If $P$ is not lazy, we apply the above result to $(P+I)/2$. The latter is still non-negatively curved, but its conductance and minimal entry are half those of $P$, so we loose a factor of $8$ and obtain
\begin{equation*}
\diam^\sharp \le \frac{328}{\Phi^2\pmin}.
\end{equation*}
This readily implies the claim, because $\sqrt{328}<19$.
\end{proof}
\subsection{Diffusive total-variation decay}
\label{sec:tvdecay}
In this section, we prove Theorem \ref{th:tvdecay}. Fix two distinct states $x\ne y$, and recall that
\begin{equation*}
W\left(P(x,\cdot),P(y,\cdot)\right) = \inf_{\chi}\Bigl\{\textstyle\sum_{u,v\in V}\chi(u,v)\,\dist(u,v)\Bigr\},
\end{equation*}
where the infimum runs over all probability distributions $\chi\in\cP(V^2)$ with marginals $P(x,\cdot)$ and $P(y,\cdot)$. Minimizers are called optimal couplings. As in \cite{MR2316551,munch2019non}, our first task consists in showing that they can be chosen so as to assign a ``decent'' probability to the ``good'' set
\begin{equation*}
\Gamma := \bigl\{(u,v)\in V^2\colon \dist(u,v)<\dist(x,y) \bigr\}.
\end{equation*}
\begin{lemma}[Good optimal couplings] \label{lm:coupling}If $P$ is lazy and $x\ne y$, then there is an optimal coupling $\chi$ of $P(x,\cdot),P(y,\cdot)$ such that $\chi\left(\Gamma\right) \ge \pmin.$
\end{lemma}
\begin{proof}
By compactness, we can find an optimal coupling $\chi$ which, among all optimal couplings, maximizes $\chi(\Gamma)$. Suppose for a contradiction that this ``doubly optimal'' coupling satisfies $\chi\left(\Gamma\right)< \pmin$. The set ${A}:=\{u\sim x\colon (u,y)\in \Gamma\}$ is not empty, since it contains the first vertex on a geodesic from $x$ to $y$. Thus,
$\chi(A\times V) = P(x,A) \ge \pmin>\chi(\Gamma)$.
This forces $\chi((A\times V)\setminus\Gamma)>0$, \ie
\begin{equation}
\label{exists:ab}\exists (x_0,y_0)\in (A\times V)\setminus\Gamma,\quad \chi(x_0,y_0) \ge \varepsilon,
\end{equation}
for some $\varepsilon>0$. On the other hand, we have
$
\chi(A\times\{y\}) + \chi(A^c\times\{y\}) = P(y,y)\ge\sfrac{1}{2}.
$
This forces $\chi(A^c\times\{y\}) > 0$, because $\chi(A\times\{y\})\le \chi(\Gamma)<\pmin\le \sfrac{1}{2}$. In other words,
\begin{equation}
\label{exists:z}
\exists x_1\in A^c,\quad \chi(x_1,y) \ge \varepsilon,
\end{equation}
provided $\varepsilon>0$ is chosen small enough. We now use the vertices $x_0,y_0,x_1$ found at \eqref{exists:ab}--\eqref{exists:z} to construct a new coupling $\widetilde{\chi}$ which contradicts the optimality of $\chi$. For all $(u,v)\in V^2$, we set
\begin{equation*}
\widetilde{\chi}(u,v) := 
\begin{cases}
\chi(u,v) & \textrm{if }u\notin\{x_0,x_1\}\textrm{ and }v\notin\{y_0,y\};\\
\chi(u,v)-\varepsilon &\textrm{if }(u,v)=(x_0,y_0)\textrm{ or }(u,v)=(x_1,y);\\
\chi(u,v)+\varepsilon & \textrm{if }(u,v)=(x_0,y)\textrm{ or }(u,v)=(x_1,y_0).
\end{cases}
\end{equation*}
By construction, $\widetilde{\chi}$ is non-negative and has the same marginals as $\chi$. Thus, it is a coupling of $P(x,\cdot)$ and $P(y,\cdot)$. This coupling is moreover optimal, since
\begin{align*}
 \sum_{u,v\in V}\dist(u,v)&\left(\widetilde{\chi}(u,v)-\chi(u,v)\right) \\[-8pt] & = \varepsilon\left(\dist(x_0,y)+\dist(x_1,y_0)-\dist(x_0,y_0)-\dist(x_1,y)\right)\\
& \le \varepsilon\left(\dist(x,y)-1+\dist(x_1,y_0)-\dist(x,y)-\dist(x_1,y)\right)\\
& \le 0,
\end{align*}
where we have successively used $x_0\in A$, $(x_0,y_0)\notin\Gamma$, and the triangle inequality $\dist(x_1,y_0)\le \dist(x_1,y)+\dist(y,y_0)$. Finally, $\Gamma$ contains $(x_1,y)$ but not $(x_0,y_0),(x_1,y)$, so we have
$
\widetilde{\chi}(\Gamma) \ge \chi(\Gamma)+\varepsilon,
$
contradicting the double optimality of $\chi$.
\end{proof}

Our second ingredient is a diffusive hitting-time estimate for super-martingales. Results of this sort are standard, but usually require a uniform bound on the increments, a property which fails in our directed setting ($\dist(x,x')=\dist(y,y')=1$ no longer implies $\dist(x',y')\le \dist(x,y)+2$).
\begin{lemma}[Hitting-time estimate]\label{lm:super}Let $(Z_t)_{t\ge 0}$ be a discrete-time, $\dN-$valued super-martingale with $Z_0=z_0\in\dN$, and set $\tau:=\min\{t\ge 0\colon Z_t= 0\}$. Assume that almost-surely,
\begin{equation*}
\PP\left(Z_{t+1}\ne Z_t | \cF_t\right) \ge p {\bf 1}_{(\tau>t)},
\end{equation*}
for all $t\ge 0$, where $(\cF_t)_{t\ge 0}$ denotes the underlying filtration. Then, for all $t\ge 1$,
\begin{equation*}
\PP(\tau\ge t) \le z_0\sqrt{\frac{10}{pt}}.
\end{equation*}
\end{lemma}
\begin{proof}Our starting point is the following easily verified inequality: for all $x\in\dR$,
\begin{equation*}
e^{-\sfrac{x}{2}} \ge 1-\frac{x}{2} + \frac{1 \wedge x^2}{10}.
\end{equation*}
In particular, for all $t\in\dN$ and $\lambda\in[0,1]$, we have on the event $\{\tau>t\}$,
\begin{align*}
\EE\left[e^{-\frac{\lambda}{2}(Z_{t+1}-Z_t)}|\cF_t\right] & \ge \EE\Bigl[1-\frac{\lambda}{2}(Z_{t+1}-Z_t)+ \frac{1\wedge (\lambda Z_{t+1}-\lambda Z_t)^2}{10}\Big|\cF_t\Bigr]\\
& \ge 1+\frac{p\lambda^2}{10},
\end{align*}
where the second line uses our assumptions on $Z$. It inductively follows that for $t\in\dN$,
\begin{equation*}
\EE \Bigl[e^{-\frac{\lambda}{2} (Z_{t\wedge\tau}-Z_0)} \Bigl(1+\frac{p\lambda^2}{10} \Bigr)^{-t\wedge \tau} \Bigr] \ge 1.
\end{equation*}
Since $Z_{t\wedge\tau}-Z_0\ge -Z_0=-z_0$, we deduce that
\begin{equation*}
\EE \Bigl[\Bigl(1+\frac{p\lambda^2}{10} \Bigr)^{-t\wedge \tau} \Bigr] \ge e^{-\sfrac{\lambda z_0}{2}} \ge 1-\frac{\lambda z_0}{2}.
\end{equation*}
In particular,
\begin{align*}
{\lambda z_0} & \ge 2\EE \Bigl[1-\Bigl(1+\frac{p\lambda^2}{10} \Bigr)^{-t\wedge \tau} \Bigr]\\
& \ge 2 \Bigl(1-\Bigl(1+\frac{p\lambda^2}{10} \Bigr)^{-t} \Bigr)\PP(\tau\ge t)\\
& \ge 2 \Bigl(1-\Bigl(1+\frac{p t\lambda^2}{10} \Bigr)^{-1} \Bigr)\PP(\tau\ge t).
\end{align*}
The result now readily follows by choosing $\lambda= \sqrt{10/pt}$. Note that this choice satisfies $\lambda\in[0,1]$ only when $10/pt\le 1$, but the conclusion trivially holds when $10/pt> 1$.
\end{proof}
\begin{proof}[Proof of Theorem \ref{th:tvdecay}]For each $(x,y)\in V$, let $\chi_{x,y}$ denote an optimal coupling of $P(x,\cdot)$ and $P(y,\cdot)$ satisfying the condition in Lemma \ref{lm:coupling}, and let us define a transition matrix $K$ on $V^2$ by
\begin{equation*}
K((x,y),(u,v)) := \chi_{x,y}(u,v).
\end{equation*}
Finally, consider a Markov chain $(X_t,Y_t)_{t\ge 0}$ with transition matrix $K$, and let us use the notation $\EE_{(x,y)}[\cdot]$ to indicate that $(X_0,Y_0)=(x,y)$. By construction, we have for all $x,y\in V$,
\begin{align*}
\EE_{(x,y)} \bigl[\dist(X_{1},Y_{1}) \bigr] & = W\left(P(x,\cdot),P(y,\cdot)\right)\le \dist(x,y);\\
\PP_{(x,y)}\bigl[\dist(X_1,Y_1)<\dist(x,y)\bigr] & \ge \pmin{\bf 1}_{(x\ne y)}.
\end{align*}
By the Markov property, the first condition implies that the process $Z:=(Z_t)_{t\ge 0}$ defined by
$
Z_t := \dist(X_t,Y_t)
$
is a super-martingale with respect to the natural filtration $(\cF_t)_{t\ge 0}$ of $(X_t,Y_t)_{t\ge 0}$, and the second implies that $\PP(Z_{t+1}\ne Z_t|\cF_t)\ge\pmin{\bf 1}_{Z_t\ne 0}$. Thus, Lemma \ref{lm:super} applies and yields
\begin{equation*}
\PP_{x,y}(X_t\ne Y_t) \le \dist(x,y)\sqrt{\frac{10}{ (t+1)\pmin}}.
\end{equation*}
The result follows, since $\PP_{(x,y)}\left((X_t,Y_t)=(\cdot,\cdot)\right)$ is a coupling of $P^t(x,\cdot)$ and $P^t(y,\cdot)$.
\end{proof}

\backmatter
\bibliographystyle{jepplain+eid}
\bibliography{munch-salez}
\end{document}