diff --git a/.travis.yml b/.travis.yml index c418edfd402cefe9a28ade11b39d08828ce3d903..44baae0044a93f0b62ffcafd97477eab17dd8cd6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,43 +7,44 @@ matrix: # this is the "master" one: it is going to update the manual every time # Variable PLUMED_ALL_TESTS=yes ensures that all the regtests can be run. # If some of them are not enabled, the whole suite will fail. - - os: linux + - name: Run all tests and build doc. All features enabled (system blas and lapack + asmjit) + os: linux dist: trusty sudo: required env: PLUMED_CC=mpicc PLUMED_CXX=mpic++ MAKEDOC=yes PLUMED_ALL_TESTS=yes LAPACK=yes ASMJIT=673dcefa -# coverage scan - - os: linux + - name: Run all tests and scan coverage + os: linux dist: trusty sudo: required env: PLUMED_CC=mpicc PLUMED_CXX=mpic++ MAKECOVERAGE=yes PLUMED_ALL_TESTS=yes -# the following are with debug flags - - os: linux + - name: Debug flags, no MPI + os: linux dist: trusty sudo: required env: PLUMED_CC=gcc PLUMED_CXX=g++ CONFIG_FLAGS="--enable-debug --enable-debug-glibcxx" - - os: linux + - name: Debug flags, MPI + os: linux dist: trusty sudo: required env: PLUMED_CC=mpicc PLUMED_CXX=mpic++ CONFIG_FLAGS="--enable-debug --enable-debug-glibcxx" -# cppcheck - - os: linux + - name: Cppcheck and codecheck + os: linux dist: trusty sudo: required env: CPPCHECK=yes CPPCHECK_VERSION=1.87 -# test using external blas with internal lapack - - os: linux + - name: External blas with internal lapack + os: linux if: branch =~ ^test- OR type IN(pull_request) dist: trusty sudo: required env: PLUMED_CC=mpicc PLUMED_CXX=mpic++ PLUMED_CXXFLAGS=-O3 LAPACK=yes CONFIG_FLAGS="--disable-external-lapack" -# test using docker image. -# currently it means ubuntu 17, with gcc6 - - sudo: required + - name: Docker image (ubuntu 17, with gcc6) + sudo: required if: branch =~ ^test- OR type IN(pull_request) services: docker env: PLUMED_DOCKER=yes -# osx serial - - os: osx + - name: MacOS, serial build + os: osx if: branch =~ ^test- OR type IN(pull_request) osx_image: xcode7.3 env: PLUMED_CC=clang PLUMED_CXX=clang++ PLUMED_CXXFLAGS=-O3 @@ -51,8 +52,8 @@ matrix: cache: directories: - $HOME/.ccache -# osx parallel - - os: osx + - name: MacOS, MPI build + os: osx if: branch =~ ^test- OR type IN(pull_request) osx_image: xcode7.3 env: PLUMED_CC=mpicc PLUMED_CXX=mpic++ PLUMED_CXXFLAGS=-O3 @@ -60,24 +61,24 @@ matrix: cache: directories: - $HOME/.ccache -# osx serial macports - - os: osx + - name: MacPorts, default modules + os: osx if: branch =~ ^test- OR type IN(pull_request) osx_image: xcode7.3 env: PLUMED_MACPORTS="plumed" PLUMED_PYTHON="2.7 3.6 3.7" cache: directories: - $HOME/.macports-ci-ccache -# osx serial macports debug variant - - os: osx + - name: MacPorts, all modules + os: osx if: branch =~ ^test- OR type IN(pull_request) osx_image: xcode7.3 env: PLUMED_MACPORTS="plumed +allmodules" PLUMED_PYTHON="2.7 3.6 3.7" cache: directories: - $HOME/.macports-ci-ccache -# osx serial macports debug variant - - os: osx + - name: MacPorts, all modules (xcode 8.3) + os: osx osx_image: xcode8.3 env: PLUMED_MACPORTS="plumed +allmodules" PLUMED_PYTHON="2.7 3.6 3.7" cache: @@ -110,6 +111,7 @@ install: - export INCLUDE="$HOME/opt/include:$HOME/opt/arrayfire/include:$INCLUDE" - export LIBRARY_PATH="$HOME/opt/lib:$HOME/opt/arrayfire/lib:$LIBRARY_PATH" - export LD_LIBRARY_PATH="$HOME/opt/lib:$HOME/opt/arrayfire/lib:$LD_LIBRARY_PATH" + - export PYTHONPATH="$HOME/opt/lib/plumed/python:$PYTHONPATH" # Setting the TMPDIR in this way allegedly prevents problems with the compilation of # PLUMED + Python on macos - export TMPDIR="/tmp" @@ -149,13 +151,14 @@ install: # install aspell - if test "$MAKEDOC" == yes ; then sudo apt-get install -y aspell aspell-en ; fi # install lcov - - if test "$MAKECOVERAGE" == yes ; then ./.travis/install.lcov v1.13 ; fi + - if test "$MAKECOVERAGE" == yes ; then ./.travis/install.lcov v1.14 ; fi # install numpy and cython for python interface # only for linux and homebrew (no macports) - if test "$PLUMED_CXX" ; then sudo pip install --upgrade pip ; sudo pip install numpy ; sudo pip install Cython ; + sudo pip install nose ; fi # then replace doxygen with the desired version # I use 1.8.10 instead of 1.8.11 since it looks like 1.8.11 has troubles with @@ -209,11 +212,11 @@ script: # build using macports - if test -n "$PLUMED_MACPORTS" ; then - sudo port -N -k install $PLUMED_MACPORTS ; + travis_wait sudo port -N -k install $PLUMED_MACPORTS ; plumed config show ; ./macports-ci ccache --save ; for p in $PLUMED_PYTHON ; do - sudo port -N install py${p//./}-plumed ; + travis_wait sudo port -N install py${p//./}-plumed ; done fi @@ -236,13 +239,17 @@ script: - if test "$MAKECOVERAGE" == yes ; then make -C developer-doc coverage ; fi - if test "$MAKEDOC" == yes ; then make doc >/dev/null ; fi - if test "$PLUMED_CXX" ; then make -C regtest checkfail ; fi +# OpenBLAS is needed for numpy; I install it separately since it takes a long time - if test -n "$PLUMED_MACPORTS" ; then - sudo port -N -d test $PLUMED_MACPORTS ; + travis_wait sudo port -N -d test $PLUMED_MACPORTS ; + travis_wait sudo port -N install OpenBLAS ; for p in $PLUMED_PYTHON ; do - sudo port install py${p//./}-nose ; - nosetests-$p -v -w python ; + travis_wait sudo port test py${p//./}-plumed ; done fi + - if test "$PLUMED_CXX" ; then + nosetests -v -w python ; + fi # CPPCHECK: # this is required so as to have all the include files in place:
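The Travis changes above move the MacPorts Python checks to `port test` and run the wrapper tests directly with nose (`nosetests -v -w python`). For orientation, nose collects `test_*` functions from `test_*.py` files under the directory given with `-w`; a minimal sketch of such a test, assuming only that the `plumed` module exposes the `Plumed` class (an illustration, not the repository's actual suite):

```python
# Hypothetical nose-style test; nose picks up functions named test_*
# from files named test_*.py under the directory given with -w.
import plumed

def test_create():
    # Instantiating the wrapper loads the PLUMED kernel, so this passes
    # only if both the module and the library are correctly installed.
    p = plumed.Plumed()
    assert p is not None
```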
diff --git a/CHANGES/v2.3.md b/CHANGES/v2.3.md index 996580b3149d3043d389ba42a82b3441c9637fc1..7b8160a8846776db22cbe43140d0aa4025f4a408 100644 --- a/CHANGES/v2.3.md +++ b/CHANGES/v2.3.md @@ -262,7 +262,7 @@ For users: - Fixed some openMP regressions (some related to the whole code and some specific to Coordination and Multicolvar); these were compiler dependent so not all users may have experienced them - Fixed an issue with \ref CS2BACKBONE when more than 2 chains were used - Fixed memory leak in \ref RDC. -- Fixed segmentation fault with more than two CVs in reweighting \ref METAD (see \issue{399}, thanks to fiskissimo). +- Fixed segmentation fault with more than two CVs in reweighting \ref METAD (see \issue{399}, thanks to Fiskissimo). For developers: - Small fix in LDFLAGS when enabling coverage. diff --git a/CHANGES/v2.4.md b/CHANGES/v2.4.md index 284ebf66d09337061d4deea4c570c3ac613000cf..554f6458683248f00e89e780acfccd7b07ee711a 100644 --- a/CHANGES/v2.4.md +++ b/CHANGES/v2.4.md @@ -207,14 +207,16 @@ For users: - Fixed some performance regression issues with OpenMP - Updated NAMD patches to version 2.12 and 2.13. Old patches have been removed. - GROMACS patch for gromacs-2018.4. - - Fixed a threadsafety issue using forces on \ref HISTOGRAM + - Fixed a thread safety issue using forces on \ref HISTOGRAM - Fixed error message suggesting wrong actions (see \issue{421}). For developers: - All fixes done in version 2.3.8 - CPPCHECK updated to 1.85 -## Version 2.4.5 (to be released) +## Version 2.4.5 (Apr 1, 2019) + +\plumednotmaintained For users: - Fixed an inconsistency in parsing of braces. @@ -222,4 +224,13 @@ For users: including spaces (e.g. with `FILE={/path with space/file}`). Notice that this invalidates syntax such as `ATOMS={1}{2}{3}{4}`. See more at \issue{434}. + - Fixed \ref simplemd so as to call "runFinalJobs" at the end of the simulation. + - GROMACS patch for gromacs-2016.6. + - GROMACS patch for gromacs-2018.6. + - Added aliases for some actions/options containing dashes (`-`) in their name. This will improve + backward compatibility when these actions/options are removed (see \issue{449}). + +## Version 2.4.6 () For users: + - Fixed a bug in \ref COORDINATIONNUMBER where derivatives were wrong when using R_POWER > 2, thanks to @MoleOrbitalHybridAnalyst for spotting and fixing it
diff --git a/CHANGES/v2.5.md b/CHANGES/v2.5.md index c096912b6f6598e8e8be00e8ee20bf603652e169..a0c5d77808eb9e7167da2ea2f70a1266980c5d2e 100644 --- a/CHANGES/v2.5.md +++ b/CHANGES/v2.5.md @@ -13,8 +13,8 @@ Changes from version 2.4 which are relevant for users: to set the `LD_LIBRARY_PATH` variable correctly for the linux executable to work. The procedure has been tested well on OSX and Linux, but could give problems on other platforms. Please report possible problems on the mailing list. - \ref driver now stops correctly when using \ref COMMITTOR. If you want to continue the analysis, use the `NOSTOP` flag in \ref COMMITTOR. - - \ref METAD the calculation of the reweigthing factor is now activated by CALC_RCT instead of REWEIGHTING_NGRID and REWEIGHTING_NHILLS, the frequency of update can be set - by RCT_USTRIDE, the default value is 1 and should be ok for most of the cases + - \ref METAD the calculation of the reweighting factor is now activated by CALC_RCT instead of REWEIGHTING_NGRID and REWEIGHTING_NHILLS; the frequency of update can be set + by RCT_USTRIDE, the default value is 1 and should be OK in most cases - Fixed sign in Cartesian components of \ref PUCKERING with 6 membered rings (thanks to Carol Simoes and Javi Iglesias). - New actions: @@ -62,7 +62,7 @@ Changes from version 2.4 which are relevant for users: - \ref SAXS has an additional implementation based on Bessel functions that can be faster for large systems (new keyword BESSEL) - \ref SAXS keyword SCEXP has been renamed into SCALEINT - \ref SAXS includes the MARTINI bead structure factors for Proteins and Nucleic Acids - - \ref SAXS includes a GPU implementation based on arrayfire (need to be linked at compile time) that can be activated with GPU + - \ref SAXS includes a GPU implementation based on ArrayFire (needs to be linked at compile time) that can be activated with GPU - \ref METAINFERENCE and all related methods have a new keyword REGRES_ZERO to scale data using a linear scale fit - \ref CALIBER new bias to perform Maximum Caliber replica-averaged restrained simulations @@ -94,8 +94,8 @@ Changes from version 2.4 which are relevant for users: - Implemented bash autocompletion, see \ref BashAutocompletion. - \ref MOLINFO now allows selecting atoms from chains with a numeric ID (see \issue{320}). - Removed the patch for GMX 5.1.4 - - LAMMPS patch has been finally removed. Notice that LAMMPS natively supports PLUMED now. - - AMBER patch has been finally removed. Notice that AMBER (sander module) natively supports PLUMED starting with version 15. + - LAMMPS patch has finally been removed. Notice that LAMMPS has native support for PLUMED now. + - AMBER patch has finally been removed. Notice that AMBER (sander module) has native support for PLUMED starting from version 15. - \ref RMSD calculation has been optimized. This should positively affect the performance of CVs where many RMSD values are computed on small groups of atoms, such as secondary structure variables. - In \ref METAD, when using a bias factor equal to one (no bias) the `rct` component is set to zero rather than to one. @@ -152,12 +152,35 @@ Changes from version 2.4 which are relevant for developers: for testing, the default choice is the typical one used on the respective operating system. - On OSX, `plumed` and `libplumed.dylib` will find `libplumedKernel.dylib` using `@loader_path`. - Using CXX compiler to link the main program. -- plumed can be compiled with arrayfire to enable for gpu code. \ref SAXS collective variable is available as part of the isdb module to provide an example of a gpu implementation for a CV +- plumed can be compiled with ArrayFire to enable GPU code. The \ref SAXS collective variable is available as part of the isdb module to provide an example of a GPU implementation for a CV -## Version 2.5.1 (to be released) +## Version 2.5.1 (Apr 1, 2019) For users: - in \ref SAXS the keyword ADDEXP is removed. Furthermore, SAXS intensities are automatically normalised for I(0)=1; in case experimental data are provided, the intensity is rescaled with the intensity of the lowest q provided. As a consequence SCALEINT is only needed for additional adjustments. - gromacs patch updated to gromacs 2018.5 -- Fixed building of python interface on MacOS Mojave (see \issue{445}, thansk for Omar Valsson). +- Fixed a bug in gromacs patch that was resulting in an incorrect number of threads (0) being set when not explicitly using `-ntomp` on the + command line or setting `OMP_NUM_THREADS` (see \issue{446}). To apply this fix you need to re-patch gromacs. + Notice that setting the number of threads to zero might lead to inconsistent results when using secondary structure variables + or other multicolvars. +- Fixed PLUMED so that when zero threads are selected from gromacs (see previous fix) the number of used threads is set to 1. + This fix allows using a GROMACS executable patched with PLUMED 2.5.0 and linked at runtime with PLUMED 2.5.1 without introducing + errors. However, re-patching is preferred since it selects the correct number of threads. +- Python wrappers: + - Fixed building of python interface on MacOS Mojave (see \issue{445}, thanks to Omar Valsson). + - Numpy is not required anymore at build time (though it is required at runtime for our tests). + - Raw python arrays can be passed as an alternative to Numpy ndarrays. + +## Version 2.5.2 () + +For users: +- New shortcuts are available for selecting protein atoms: `@chi2-#`, `@chi3-#`, `@chi4-#` and `@chi5-#` +- Fixed performance of \ref CUSTOM when having zero derivatives with respect to some arguments. +- New --parse-only option in \ref driver to check the validity of a plumed input file +- New patch for GROMACS 2019.2 +- Module VES: Fixed performance of \ref BF_CUSTOM for basis functions with linear terms (e.g. having zero derivatives). +- Python wrappers: + - Python module is now always named `plumed` irrespective of program prefix and suffix. Notice + that the python module is installed inside the `lib/program_name` directory and thus it is not necessary to + use `program_name` in order to install multiple modules side by side.
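The 2.5.1 and 2.5.2 notes above state that the Python module is now always importable as `plumed` and that raw Python arrays are accepted as an alternative to NumPy ndarrays. A minimal sketch of what that looks like through the wrapper's `cmd()` interface; the command strings are PLUMED's standard ones, while the system size, file name, and values are illustrative assumptions:

```python
# Sketch of the Python wrapper usage described above (illustrative values).
import array
import plumed

p = plumed.Plumed()
p.cmd("setNatoms", 2)
p.cmd("setPlumedDat", "plumed.dat")  # hypothetical input file
p.cmd("setMDEngine", "python")
p.cmd("setTimestep", 1.0)
p.cmd("init")
# As of 2.5.1 a raw array works where a NumPy ndarray was previously
# required, e.g. positions as a flat buffer of 2*3 doubles. A full MD
# step would also pass masses, box, and forces before calling "calc".
pos = array.array("d", [0.0, 0.0, 0.0, 1.0, 0.0, 0.0])
p.cmd("setStep", 0)
p.cmd("setPositions", pos)
```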
diff --git a/VERSION b/VERSION index a2d28c799d12c8cec17e7cea2c3b6d5b0cda4fe5..5f01ae601addf8b25118c1fd01eebff39d1d7514 100644 --- a/VERSION +++ b/VERSION @@ -5,4 +5,4 @@ # (this is same as gromacs) # Notice that "plumed info --version" will return only 2.X # and "plumed info --long-version" will return the full string -2.5.0 +2.5.1 diff --git a/configure b/configure index 113b119d92e97f25d0cac838e2699058968a50ef..cafcbc3fd17b5c0643dd4a305f3aae44409ef635 100755 --- a/configure +++ b/configure @@ -8760,8 +8760,8 @@ done then { $as_echo "$as_me:${as_lineno-$LINENO}: Python executable is $PYTHON_BIN" >&5 $as_echo "$as_me: Python executable is $PYTHON_BIN" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: checking support for required python modules (setuptools, cython, numpy, subprocess, os, shutil)" >&5 -$as_echo_n "checking support for required python modules (setuptools, cython, numpy, subprocess, os, shutil)... " >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: checking support for required python modules (setuptools, cython, subprocess, os, shutil)" >&5 +$as_echo_n "checking support for required python modules (setuptools, cython, subprocess, os, shutil)... " >&6; } testimport=" from setuptools import setup from setuptools import Extension @@ -8770,7 +8770,6 @@ import os import os.path import sys from shutil import copyfile -import numpy from Cython.Build import cythonize " if echo "$testimport" | "$PYTHON_BIN" 1>/dev/null 2>/dev/null; then diff --git a/configure.ac b/configure.ac index 73a717ce7621307381f933eaa44d1d0c12d877c2..582cb0da1607f3812a89fffd95a70f5a78c38dcb 100644 --- a/configure.ac +++ b/configure.ac @@ -760,7 +760,7 @@ if test $python == true ; then if test -n "$PYTHON_BIN" then AC_MSG_NOTICE([Python executable is $PYTHON_BIN]) - AC_MSG_CHECKING([support for required python modules (setuptools, cython, numpy, subprocess, os, shutil)]) + AC_MSG_CHECKING([support for required python modules (setuptools, cython, subprocess, os, shutil)]) testimport=" from setuptools import setup from setuptools import Extension @@ -769,7 +769,6 @@ import os import os.path import sys from shutil import copyfile -import numpy from Cython.Build import cythonize " if echo "$testimport" | "$PYTHON_BIN" 1>/dev/null 2>/dev/null; then diff --git a/macports/PortfilePython.in b/macports/PortfilePython.in index 7260ee3ef55d81888615808f32638c2bddb23aa4..31d990ef64d457dbd670713a8e8df1860b034dee 100644 --- a/macports/PortfilePython.in +++ b/macports/PortfilePython.in @@ -6,7 +6,7 @@ PortGroup python 1.0 version @_VERSION_@ revision @_REVISION_@ name py-plumed -categories science +categories python platforms darwin categories-append science @@ -16,11 +16,14 @@ maintainers {gmail.com:giovanni.bussi @GiovanniBussi} openmaintainer description Python wrappers for plumed. long_description ${description} They allow the plumed library to be directly used from python. +homepage http://www.plumed.org @_FETCH_@ python.versions 27 36 37 - +# when building from a git repository, we cannot set worksrcdir otherwise +# the whole repo is cloned at that position. 
We thus explicitly set worksrcdir +# at specific stages: pre-configure { worksrcdir ${distname}/python } @@ -29,17 +32,25 @@ pre-test { worksrcdir ${distname}/python } +# python setup is located in python subdir of plumed repository +# worksrcdir ${distname}/python + if {${name} ne ${subport}} { +# setup the wrappers so that by default they load MacPorts plumed library build.env-append plumed_default_kernel=${prefix}/lib/libplumedKernel.dylib \ plumed_macports=yes - depends_build-append port:py${python.version}-cython + depends_build-append port:py${python.version}-cython \ + port:py${python.version}-setuptools + + depends_lib-append path:${prefix}/lib/libplumedKernel.dylib:plumed - depends_lib-append port:py${python.version}-numpy \ - path:${prefix}/lib/libplumedKernel.dylib:plumed + depends_test-append port:py${python.version}-nose \ + port:py${python.version}-numpy - depends_test-append port:py${python.version}-nose + test.cmd nosetests-${python.branch} + test.target test.run yes } diff --git a/patches/gromacs-2016.5.config b/patches/gromacs-2016.6.config similarity index 100% rename from patches/gromacs-2016.5.config rename to patches/gromacs-2016.6.config diff --git a/patches/gromacs-2016.5.diff/src/gromacs/CMakeLists.txt b/patches/gromacs-2016.6.diff/src/gromacs/CMakeLists.txt similarity index 100% rename from patches/gromacs-2016.5.diff/src/gromacs/CMakeLists.txt rename to patches/gromacs-2016.6.diff/src/gromacs/CMakeLists.txt diff --git a/patches/gromacs-2016.5.diff/src/gromacs/CMakeLists.txt.preplumed b/patches/gromacs-2016.6.diff/src/gromacs/CMakeLists.txt.preplumed similarity index 100% rename from patches/gromacs-2016.5.diff/src/gromacs/CMakeLists.txt.preplumed rename to patches/gromacs-2016.6.diff/src/gromacs/CMakeLists.txt.preplumed diff --git a/patches/gromacs-2016.5.diff/src/gromacs/mdlib/force.cpp b/patches/gromacs-2016.6.diff/src/gromacs/mdlib/force.cpp similarity index 100% rename from patches/gromacs-2016.5.diff/src/gromacs/mdlib/force.cpp rename to patches/gromacs-2016.6.diff/src/gromacs/mdlib/force.cpp diff --git a/patches/gromacs-2016.5.diff/src/gromacs/mdlib/force.cpp.preplumed b/patches/gromacs-2016.6.diff/src/gromacs/mdlib/force.cpp.preplumed similarity index 100% rename from patches/gromacs-2016.5.diff/src/gromacs/mdlib/force.cpp.preplumed rename to patches/gromacs-2016.6.diff/src/gromacs/mdlib/force.cpp.preplumed diff --git a/patches/gromacs-2016.5.diff/src/gromacs/mdlib/minimize.cpp b/patches/gromacs-2016.6.diff/src/gromacs/mdlib/minimize.cpp similarity index 100% rename from patches/gromacs-2016.5.diff/src/gromacs/mdlib/minimize.cpp rename to patches/gromacs-2016.6.diff/src/gromacs/mdlib/minimize.cpp diff --git a/patches/gromacs-2016.5.diff/src/gromacs/mdlib/minimize.cpp.preplumed b/patches/gromacs-2016.6.diff/src/gromacs/mdlib/minimize.cpp.preplumed similarity index 100% rename from patches/gromacs-2016.5.diff/src/gromacs/mdlib/minimize.cpp.preplumed rename to patches/gromacs-2016.6.diff/src/gromacs/mdlib/minimize.cpp.preplumed diff --git a/patches/gromacs-2016.5.diff/src/programs/mdrun/md.cpp b/patches/gromacs-2016.6.diff/src/programs/mdrun/md.cpp similarity index 100% rename from patches/gromacs-2016.5.diff/src/programs/mdrun/md.cpp rename to patches/gromacs-2016.6.diff/src/programs/mdrun/md.cpp diff --git a/patches/gromacs-2016.5.diff/src/programs/mdrun/md.cpp.preplumed b/patches/gromacs-2016.6.diff/src/programs/mdrun/md.cpp.preplumed similarity index 100% rename from patches/gromacs-2016.5.diff/src/programs/mdrun/md.cpp.preplumed rename to 
patches/gromacs-2016.6.diff/src/programs/mdrun/md.cpp.preplumed diff --git a/patches/gromacs-2016.5.diff/src/programs/mdrun/mdrun.cpp b/patches/gromacs-2016.6.diff/src/programs/mdrun/mdrun.cpp similarity index 100% rename from patches/gromacs-2016.5.diff/src/programs/mdrun/mdrun.cpp rename to patches/gromacs-2016.6.diff/src/programs/mdrun/mdrun.cpp diff --git a/patches/gromacs-2016.5.diff/src/programs/mdrun/mdrun.cpp.preplumed b/patches/gromacs-2016.6.diff/src/programs/mdrun/mdrun.cpp.preplumed similarity index 100% rename from patches/gromacs-2016.5.diff/src/programs/mdrun/mdrun.cpp.preplumed rename to patches/gromacs-2016.6.diff/src/programs/mdrun/mdrun.cpp.preplumed diff --git a/patches/gromacs-2016.5.diff/src/programs/mdrun/repl_ex.cpp b/patches/gromacs-2016.6.diff/src/programs/mdrun/repl_ex.cpp similarity index 100% rename from patches/gromacs-2016.5.diff/src/programs/mdrun/repl_ex.cpp rename to patches/gromacs-2016.6.diff/src/programs/mdrun/repl_ex.cpp diff --git a/patches/gromacs-2016.5.diff/src/programs/mdrun/repl_ex.cpp.preplumed b/patches/gromacs-2016.6.diff/src/programs/mdrun/repl_ex.cpp.preplumed similarity index 100% rename from patches/gromacs-2016.5.diff/src/programs/mdrun/repl_ex.cpp.preplumed rename to patches/gromacs-2016.6.diff/src/programs/mdrun/repl_ex.cpp.preplumed diff --git a/patches/gromacs-2016.5.diff/src/programs/mdrun/repl_ex.h b/patches/gromacs-2016.6.diff/src/programs/mdrun/repl_ex.h similarity index 100% rename from patches/gromacs-2016.5.diff/src/programs/mdrun/repl_ex.h rename to patches/gromacs-2016.6.diff/src/programs/mdrun/repl_ex.h diff --git a/patches/gromacs-2016.5.diff/src/programs/mdrun/repl_ex.h.preplumed b/patches/gromacs-2016.6.diff/src/programs/mdrun/repl_ex.h.preplumed similarity index 100% rename from patches/gromacs-2016.5.diff/src/programs/mdrun/repl_ex.h.preplumed rename to patches/gromacs-2016.6.diff/src/programs/mdrun/repl_ex.h.preplumed diff --git a/patches/gromacs-2016.5.diff/src/programs/mdrun/runner.cpp b/patches/gromacs-2016.6.diff/src/programs/mdrun/runner.cpp similarity index 100% rename from patches/gromacs-2016.5.diff/src/programs/mdrun/runner.cpp rename to patches/gromacs-2016.6.diff/src/programs/mdrun/runner.cpp diff --git a/patches/gromacs-2016.5.diff/src/programs/mdrun/runner.cpp.preplumed b/patches/gromacs-2016.6.diff/src/programs/mdrun/runner.cpp.preplumed similarity index 100% rename from patches/gromacs-2016.5.diff/src/programs/mdrun/runner.cpp.preplumed rename to patches/gromacs-2016.6.diff/src/programs/mdrun/runner.cpp.preplumed diff --git a/patches/gromacs-2018.5.config b/patches/gromacs-2018.6.config similarity index 100% rename from patches/gromacs-2018.5.config rename to patches/gromacs-2018.6.config diff --git a/patches/gromacs-2018.5.diff/src/gromacs/CMakeLists.txt b/patches/gromacs-2018.6.diff/src/gromacs/CMakeLists.txt similarity index 100% rename from patches/gromacs-2018.5.diff/src/gromacs/CMakeLists.txt rename to patches/gromacs-2018.6.diff/src/gromacs/CMakeLists.txt diff --git a/patches/gromacs-2018.5.diff/src/gromacs/CMakeLists.txt.preplumed b/patches/gromacs-2018.6.diff/src/gromacs/CMakeLists.txt.preplumed similarity index 100% rename from patches/gromacs-2018.5.diff/src/gromacs/CMakeLists.txt.preplumed rename to patches/gromacs-2018.6.diff/src/gromacs/CMakeLists.txt.preplumed diff --git a/patches/gromacs-2018.5.diff/src/gromacs/mdlib/force.cpp b/patches/gromacs-2018.6.diff/src/gromacs/mdlib/force.cpp similarity index 99% rename from patches/gromacs-2018.5.diff/src/gromacs/mdlib/force.cpp rename to 
patches/gromacs-2018.6.diff/src/gromacs/mdlib/force.cpp index 07e1337af3e096184aa387b5d6caf66748fca510..6279fa85dab4112c1d627bbd8128591e16d12c7e 100644 --- a/patches/gromacs-2018.5.diff/src/gromacs/mdlib/force.cpp +++ b/patches/gromacs-2018.6.diff/src/gromacs/mdlib/force.cpp @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -824,7 +824,7 @@ void sum_dhdl(gmx_enerdata_t *enerd, gmx::ArrayRef<const real> lambda, t_lambda enerpart_lambda += dlam*enerd->term[F_DVDL_CONSTR]; } - if (j == efptMASS) + if (j == efptMASS && !fepvals->separate_dvdl[j]) { enerpart_lambda += dlam*enerd->term[F_DKDL]; } diff --git a/patches/gromacs-2018.5.diff/src/gromacs/mdlib/force.cpp.preplumed b/patches/gromacs-2018.6.diff/src/gromacs/mdlib/force.cpp.preplumed similarity index 99% rename from patches/gromacs-2018.5.diff/src/gromacs/mdlib/force.cpp.preplumed rename to patches/gromacs-2018.6.diff/src/gromacs/mdlib/force.cpp.preplumed index c939c4e8a7f13553ec52fde621a8812dc35727b7..4332b3f99d31ab5ebf6f5cd66f1b321d73d66c87 100644 --- a/patches/gromacs-2018.5.diff/src/gromacs/mdlib/force.cpp.preplumed +++ b/patches/gromacs-2018.6.diff/src/gromacs/mdlib/force.cpp.preplumed @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -810,7 +810,7 @@ void sum_dhdl(gmx_enerdata_t *enerd, gmx::ArrayRef<const real> lambda, t_lambda enerpart_lambda += dlam*enerd->term[F_DVDL_CONSTR]; } - if (j == efptMASS) + if (j == efptMASS && !fepvals->separate_dvdl[j]) { enerpart_lambda += dlam*enerd->term[F_DKDL]; } diff --git a/patches/gromacs-2018.5.diff/src/gromacs/mdlib/minimize.cpp b/patches/gromacs-2018.6.diff/src/gromacs/mdlib/minimize.cpp similarity index 100% rename from patches/gromacs-2018.5.diff/src/gromacs/mdlib/minimize.cpp rename to patches/gromacs-2018.6.diff/src/gromacs/mdlib/minimize.cpp diff --git a/patches/gromacs-2018.5.diff/src/gromacs/mdlib/minimize.cpp.preplumed b/patches/gromacs-2018.6.diff/src/gromacs/mdlib/minimize.cpp.preplumed similarity index 100% rename from patches/gromacs-2018.5.diff/src/gromacs/mdlib/minimize.cpp.preplumed rename to patches/gromacs-2018.6.diff/src/gromacs/mdlib/minimize.cpp.preplumed diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/md.cpp b/patches/gromacs-2018.6.diff/src/programs/mdrun/md.cpp similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/md.cpp rename to patches/gromacs-2018.6.diff/src/programs/mdrun/md.cpp diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/md.cpp.preplumed b/patches/gromacs-2018.6.diff/src/programs/mdrun/md.cpp.preplumed similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/md.cpp.preplumed rename to patches/gromacs-2018.6.diff/src/programs/mdrun/md.cpp.preplumed diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/mdrun.cpp b/patches/gromacs-2018.6.diff/src/programs/mdrun/mdrun.cpp similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/mdrun.cpp rename to patches/gromacs-2018.6.diff/src/programs/mdrun/mdrun.cpp diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/mdrun.cpp.preplumed b/patches/gromacs-2018.6.diff/src/programs/mdrun/mdrun.cpp.preplumed similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/mdrun.cpp.preplumed rename to patches/gromacs-2018.6.diff/src/programs/mdrun/mdrun.cpp.preplumed diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/repl_ex.cpp b/patches/gromacs-2018.6.diff/src/programs/mdrun/repl_ex.cpp similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/repl_ex.cpp rename to patches/gromacs-2018.6.diff/src/programs/mdrun/repl_ex.cpp diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/repl_ex.cpp.preplumed b/patches/gromacs-2018.6.diff/src/programs/mdrun/repl_ex.cpp.preplumed similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/repl_ex.cpp.preplumed rename to patches/gromacs-2018.6.diff/src/programs/mdrun/repl_ex.cpp.preplumed diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/repl_ex.h b/patches/gromacs-2018.6.diff/src/programs/mdrun/repl_ex.h similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/repl_ex.h rename to patches/gromacs-2018.6.diff/src/programs/mdrun/repl_ex.h diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/repl_ex.h.preplumed b/patches/gromacs-2018.6.diff/src/programs/mdrun/repl_ex.h.preplumed similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/repl_ex.h.preplumed rename to patches/gromacs-2018.6.diff/src/programs/mdrun/repl_ex.h.preplumed diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/runner.cpp b/patches/gromacs-2018.6.diff/src/programs/mdrun/runner.cpp similarity index 99% rename from 
patches/gromacs-2018.5.diff/src/programs/mdrun/runner.cpp rename to patches/gromacs-2018.6.diff/src/programs/mdrun/runner.cpp index 25a8230eb42715d61e272b2852b176ef3d812a33..f8a336a638e6e3c0f11e89df143be3730b159b82 100644 --- a/patches/gromacs-2018.5.diff/src/programs/mdrun/runner.cpp +++ b/patches/gromacs-2018.6.diff/src/programs/mdrun/runner.cpp @@ -1342,7 +1342,10 @@ int Mdrunner::mdrunner() /* detect plumed API version */ int pversion=0; plumed_cmd(plumedmain,"getApiVersion",&pversion); - if(pversion>5) plumed_cmd(plumedmain,"setNumOMPthreads",&hw_opt.nthreads_omp); + if(pversion>5) { + int nth = gmx_omp_nthreads_get(emntDefault); + plumed_cmd(plumedmain,"setNumOMPthreads",&nth); + } } /* END PLUMED */ diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/runner.cpp.preplumed b/patches/gromacs-2018.6.diff/src/programs/mdrun/runner.cpp.preplumed similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/runner.cpp.preplumed rename to patches/gromacs-2018.6.diff/src/programs/mdrun/runner.cpp.preplumed diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/runner.h b/patches/gromacs-2018.6.diff/src/programs/mdrun/runner.h similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/runner.h rename to patches/gromacs-2018.6.diff/src/programs/mdrun/runner.h diff --git a/patches/gromacs-2018.5.diff/src/programs/mdrun/runner.h.preplumed b/patches/gromacs-2018.6.diff/src/programs/mdrun/runner.h.preplumed similarity index 100% rename from patches/gromacs-2018.5.diff/src/programs/mdrun/runner.h.preplumed rename to patches/gromacs-2018.6.diff/src/programs/mdrun/runner.h.preplumed diff --git a/patches/gromacs-2019.2.config b/patches/gromacs-2019.2.config new file mode 100644 index 0000000000000000000000000000000000000000..4348d93fd2f9c9db2567ee332f7ba123c58b7abc --- /dev/null +++ b/patches/gromacs-2019.2.config @@ -0,0 +1,33 @@ + + +function plumed_preliminary_test(){ +# check if the README contains the word GROMACS and if gromacs has already been configured + grep -q GROMACS README 1>/dev/null 2>/dev/null +} + +function plumed_patch_info(){ +cat << EOF +PLUMED can be incorporated into gromacs using the standard patching procedure. +Patching must be done in the gromacs root directory _before_ the cmake command is invoked. + +On clusters you may want to patch gromacs using the static version of plumed; in this case, +building gromacs can result in multiple errors. One possible solution is to configure gromacs +with these additional options: + +cmake -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON + +To enable PLUMED in a gromacs simulation one should use +mdrun with an extra -plumed flag. The flag can be used to +specify the name of the PLUMED input file, e.g.: + +gmx mdrun -plumed plumed.dat + +For more information on gromacs you should visit http://www.gromacs.org + +EOF +} + +plumed_before_patch(){ + plumed_patch_info +} + diff --git a/patches/gromacs-2019.2.diff/src/gromacs/CMakeLists.txt b/patches/gromacs-2019.2.diff/src/gromacs/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e010a571882d91e7e92a10e29a76328ac0f2bd0d --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/CMakeLists.txt @@ -0,0 +1,448 @@ +# +# This file is part of the GROMACS molecular simulation package.
+# +# Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by +# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, +# and including many others, as listed in the AUTHORS file in the +# top-level source directory and at http://www.gromacs.org. +# +# GROMACS is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License +# as published by the Free Software Foundation; either version 2.1 +# of the License, or (at your option) any later version. +# +# GROMACS is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with GROMACS; if not, see +# http://www.gnu.org/licenses, or write to the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# If you want to redistribute modifications to GROMACS, please +# consider that scientific software is very special. Version +# control is crucial - bugs must be traceable. We will be happy to +# consider code for inclusion in the official distribution, but +# derived work must not be called official GROMACS. Details are found +# in the README & COPYING files - if they are missing, get the +# official version at http://www.gromacs.org. +# +# To help us fund GROMACS development, we humbly ask that you cite +# the research papers on the package. Check out http://www.gromacs.org. + +include(${CMAKE_SOURCE_DIR}/Plumed.cmake) + +set(LIBGROMACS_SOURCES) + +if (GMX_CLANG_CUDA) + include(gmxClangCudaUtils) +endif() + +set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES) +set_property(GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES) +set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS) +set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE) + +add_library(libgromacs_external OBJECT "") +if(CMAKE_COMPILER_IS_GNUCXX) + # Keep quiet about e.g. linearalgebra module + target_compile_options(libgromacs_external PRIVATE ${CXXFLAGS_NO_STRINGOP_TRUNCATION}) +endif() + +add_library(libgromacs_generated OBJECT "") +if (BUILD_SHARED_LIBS) + set_target_properties(libgromacs_external PROPERTIES POSITION_INDEPENDENT_CODE true) + set_target_properties(libgromacs_generated PROPERTIES POSITION_INDEPENDENT_CODE true) +endif() + +function (_gmx_add_files_to_property PROPERTY) + foreach (_file ${ARGN}) + if (IS_ABSOLUTE "${_file}") + set_property(GLOBAL APPEND PROPERTY ${PROPERTY} ${_file}) + else() + set_property(GLOBAL APPEND PROPERTY ${PROPERTY} + ${CMAKE_CURRENT_LIST_DIR}/${_file}) + endif() + endforeach() +endfunction () + +function (gmx_add_libgromacs_sources) + _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN}) +endfunction () + +# TODO Reconsider this, as the CUDA driver API is probably a simpler +# approach, at least for the build system. 
See Redmine #2530 +function (gmx_compile_cpp_as_cuda) + _gmx_add_files_to_property(GMX_LIBGROMACS_GPU_IMPL_SOURCES ${ARGN}) +endfunction () + +function (gmx_install_headers) + if (NOT GMX_BUILD_MDRUN_ONLY) + file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR}) + install(FILES ${ARGN} + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${_dest}" + COMPONENT development) + endif() + _gmx_add_files_to_property(GMX_INSTALLED_HEADERS ${ARGN}) +endfunction () + +function (gmx_write_installed_header_list) + get_property(_list GLOBAL PROPERTY GMX_INSTALLED_HEADERS) + string(REPLACE ";" "\n" _list "${_list}") + # TODO: Make this only update the file timestamp if the contents actually change. + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/installed-headers.txt "${_list}") +endfunction() + +add_subdirectory(gmxlib) +add_subdirectory(mdlib) +add_subdirectory(applied-forces) +add_subdirectory(listed-forces) +add_subdirectory(commandline) +add_subdirectory(domdec) +add_subdirectory(ewald) +add_subdirectory(fft) +add_subdirectory(gpu_utils) +add_subdirectory(hardware) +add_subdirectory(linearalgebra) +add_subdirectory(math) +add_subdirectory(mdrun) +add_subdirectory(mdrunutility) +add_subdirectory(mdtypes) +add_subdirectory(onlinehelp) +add_subdirectory(options) +add_subdirectory(pbcutil) +add_subdirectory(random) +add_subdirectory(restraint) +add_subdirectory(tables) +add_subdirectory(taskassignment) +add_subdirectory(timing) +add_subdirectory(topology) +add_subdirectory(trajectory) +add_subdirectory(utility) +add_subdirectory(fileio) +add_subdirectory(swap) +add_subdirectory(essentialdynamics) +add_subdirectory(pulling) +add_subdirectory(awh) +add_subdirectory(simd) +add_subdirectory(imd) +add_subdirectory(compat) +add_subdirectory(mimic) +if (NOT GMX_BUILD_MDRUN_ONLY) + add_subdirectory(gmxana) + add_subdirectory(gmxpreprocess) + add_subdirectory(correlationfunctions) + add_subdirectory(statistics) + add_subdirectory(analysisdata) + add_subdirectory(selection) + add_subdirectory(trajectoryanalysis) + add_subdirectory(energyanalysis) + add_subdirectory(tools) +endif() + +get_property(PROPERTY_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES) +list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES} ${PROPERTY_SOURCES}) + +# This would be the standard way to include thread_mpi, but +# we want libgromacs to link the functions directly +#if(GMX_THREAD_MPI) +# add_subdirectory(thread_mpi) +#endif() +#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB}) + +tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src) +target_sources(libgromacs_external PRIVATE ${THREAD_MPI_SOURCES}) + +configure_file(version.h.cmakein version.h) +gmx_install_headers( + analysisdata.h + commandline.h + options.h + random.h + selection.h + trajectoryanalysis.h + utility.h + ${CMAKE_CURRENT_BINARY_DIR}/version.h + ) + +# This code is here instead of utility/CMakeLists.txt, because CMake +# custom commands and source file properties can only be set in the directory +# that contains the target that uses them. +# TODO: Generate a header instead that can be included from baseversion.c. +# That probably simplifies things somewhat. 
+set(GENERATED_VERSION_FILE utility/baseversion-gen.cpp) +gmx_configure_version_file( + utility/baseversion-gen.cpp.cmakein ${GENERATED_VERSION_FILE} + REMOTE_HASH + EXTRA_VARS + GMX_SOURCE_DOI + ) +list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE} + $<TARGET_OBJECTS:libgromacs_external> + $<TARGET_OBJECTS:libgromacs_generated>) + +# Mark some shared GPU implementation files to compile with CUDA if needed +if (GMX_USE_CUDA) + get_property(LIBGROMACS_GPU_IMPL_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES) + set_source_files_properties(${LIBGROMACS_GPU_IMPL_SOURCES} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) +endif() + +# set up CUDA compilation with clang +if (GMX_CLANG_CUDA) + foreach (_file ${LIBGROMACS_SOURCES}) + get_filename_component(_ext ${_file} EXT) + get_source_file_property(_cuda_source_format ${_file} CUDA_SOURCE_PROPERTY_FORMAT) + if ("${_ext}" STREQUAL ".cu" OR _cuda_source_format) + gmx_compile_cuda_file_with_clang(${_file}) + endif() + endforeach() +endif() + +if (GMX_USE_CUDA) + # Work around FindCUDA that prevents using target_link_libraries() + # with keywords otherwise... + set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES}) + if (NOT GMX_CLANG_CUDA) + cuda_add_library(libgromacs ${LIBGROMACS_SOURCES}) + else() + add_library(libgromacs ${LIBGROMACS_SOURCES}) + endif() + target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES}) +else() + add_library(libgromacs ${LIBGROMACS_SOURCES}) +endif() + +if (GMX_USE_OPENCL) + option(GMX_EXTERNAL_CLFFT "True if an external clFFT is required to be used" FALSE) + mark_as_advanced(GMX_EXTERNAL_CLFFT) + + # Default to using clFFT found on the system + # switch to quiet at the second run. + if (DEFINED clFFT_LIBRARY) + set (clFFT_FIND_QUIETLY TRUE) + endif() + find_package(clFFT) + if (NOT clFFT_FOUND) + if (GMX_EXTERNAL_CLFFT) + message(FATAL_ERROR "Did not find required external clFFT library, consider setting clFFT_ROOT_DIR") + endif() + + if(MSVC) + message(FATAL_ERROR +"An OpenCL build was requested with Visual Studio compiler, but GROMACS +requires clFFT, which was not found on your system. GROMACS does bundle +clFFT to help with building for OpenCL, but that clFFT has not yet been +ported to the more recent versions of that compiler that GROMACS itself +requires. Thus for now, OpenCL is not available with MSVC and the internal +build of clFFT in GROMACS 2019. Either change compiler, try installing +a clFFT package, or use the latest GROMACS 2018 point release.") + endif() + + # Fall back on the internal version + set (_clFFT_dir ../external/clFFT/src) + add_subdirectory(${_clFFT_dir} clFFT-build) + target_sources(libgromacs PRIVATE + $<TARGET_OBJECTS:clFFT> + ) + target_include_directories(libgromacs SYSTEM PRIVATE ${_clFFT_dir}/include) + # Use the magic variable for how to link any library needed for + # dlopen, etc. which is -ldl where needed, and empty otherwise + # (e.g. Windows, BSD, Mac). + target_link_libraries(libgromacs PRIVATE "${CMAKE_DL_LIBS}") + else() + target_link_libraries(libgromacs PRIVATE clFFT) + endif() +endif() + +# Recent versions of gcc and clang give warnings on scanner.cpp, which +# is a generated source file. These are awkward to suppress inline, so +# we do it in the compilation command (after testing that the compiler +# supports the suppressions). Same issue exists for nonbonded kernels +# so we supress them for all generated files. 
+include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-Wno-unused -Wno-unused-parameter" HAS_NO_UNUSED) +check_cxx_compiler_flag(-Wno-missing-declarations HAS_NO_MISSING_DECL) +check_cxx_compiler_flag(-Wno-missing-prototypes HAS_NO_MISSING_PROTO) +check_cxx_compiler_flag(/wd4101 HAS_NO_MSVC_UNUSED) +if (NOT MSVC) + check_cxx_compiler_flag(-wd1419 HAS_DECL_IN_SOURCE) +endif() +if (HAS_NO_UNUSED) + target_compile_options(libgromacs_generated PRIVATE "-Wno-unused;-Wno-unused-parameter") +endif() +if (HAS_NO_MISSING_DECL) + target_compile_options(libgromacs_generated PRIVATE "-Wno-missing-declarations") +endif() +# TODO The group scheme kernels don't use proper function prototype +# declarations, and clang warns about such use, which we suppress +# rather than fix. We would prefer to use no suppressions. However +# other compilers do not support such a warning suppression for C++ +# source files, and issue warnings about that. Remove the use of +# -Wno-missing-prototypes here and above when the group scheme is +# removed. +if (HAS_NO_MISSING_PROTO AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + target_compile_options(libgromacs_generated PRIVATE "-Wno-missing-prototypes") +endif() +if (HAS_NO_MSVC_UNUSED) + target_compile_options(libgromacs_generated PRIVATE "/wd4101") +endif() +if (HAS_DECL_IN_SOURCE) + target_compile_options(libgromacs_generated PRIVATE "-wd1419") +endif() + +if(SIMD_AVX_512_CXX_SUPPORTED AND NOT ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512_KNL")) + # Since we might be overriding -march=core-avx2, add a flag so we don't warn for this specific file. + # On KNL this can cause illegal instruction because the compiler might use non KNL AVX instructions + # with the SIMD_AVX_512_CXX_FLAGS flags. + set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS} ${CXX_NO_UNUSED_OPTION_WARNING_FLAGS}") +endif() + +gmx_setup_tng_for_libgromacs() + +target_link_libraries(libgromacs + PRIVATE + ${EXTRAE_LIBRARIES} + ${GMX_EXTRA_LIBRARIES} + ${GMX_COMMON_LIBRARIES} + ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES} + ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} + ${OpenCL_LIBRARIES} + ${GMX_STDLIB_LIBRARIES} + PUBLIC + ${GMX_PUBLIC_LIBRARIES} + ${PLUMED_LOAD} + ) +set_target_properties(libgromacs PROPERTIES + OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}" + SOVERSION ${LIBRARY_SOVERSION_MAJOR} + VERSION ${LIBRARY_VERSION} + COMPILE_FLAGS "${OpenMP_C_FLAGS}") + +gmx_manage_lmfit() +target_link_libraries(libgromacs PRIVATE lmfit) + +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^6\.0") + target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Weverything ${IGNORED_CLANG_ALL_WARNINGS}>) +endif() +if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/analyze /analyze:stacksize 70000 + #Control flow warnings are disabled because the commond line output is insufficient. There is no tool + #to convert the xml report to e.g. HTML and even in Visual Studio the viewer doesn't work with cmake support. + /wd6001 #unitialized memory + /wd6011 #derefencing NULL + /wd6053 #prior call not zero-terminate + /wd6054 #might not be zero-terminated + /wd6385 #reading invalid data + /wd6386 #buffer overrun + /wd6387 #could be '0' + /wd28199 #uninitialized memory + # For compile time constant (e.g. 
templates) the following warnings have flase postives + /wd6239 #(<non-zero> && <expr>) + /wd6240 #(<expr> && <non-zero>) + /wd6294 #Ill-defined for-loop + /wd6326 #comparison of constant with other constant + /wd28020 #expression involving paramter is not true + # Misc + /wd6330 #incorrect type to function (warns for char (instead of unsigned) for isspace/isalpha/isdigit/..)) + /wd6993 #OpenMP ignored + #TODO + /wd6031 #return value ignored (important - mostly warnigns about sscanf) + /wd6244 #hides declaration (known issue - we ingore similar warnings for other compilers) + /wd6246 #hides declaration + > + ) +endif() + +if (GMX_CLANG_TIDY) + set_target_properties(libgromacs PROPERTIES CXX_CLANG_TIDY + "${CLANG_TIDY_EXE};-warnings-as-errors=*") +endif() + +gmx_write_installed_header_list() + +# Only install the library in mdrun-only mode if it is actually necessary +# for the binary +if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS) + install(TARGETS libgromacs + EXPORT libgromacs + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + COMPONENT libraries) +endif() + +if (NOT GMX_BUILD_MDRUN_ONLY) + include(InstallLibInfo.cmake) +endif() + +# Technically, the user could want to do this for an OpenCL build +# using the CUDA runtime, but currently there's no reason to want to +# do that. +if (INSTALL_CUDART_LIB) #can be set manual by user + if (GMX_USE_CUDA) + foreach(CUDA_LIB ${CUDA_LIBRARIES}) + string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB}) + if(IS_CUDART) #libcuda should not be installed + #install also name-links (linker uses those) + file(GLOB CUDA_LIBS ${CUDA_LIB}*) + install(FILES ${CUDA_LIBS} DESTINATION + ${CMAKE_INSTALL_LIBDIR} COMPONENT libraries) + endif() + endforeach() + else() + message(WARNING "INSTALL_CUDART_LIB only makes sense when configuring for CUDA support") + endif() +endif() + +if(GMX_USE_OPENCL) + # Install the utility headers + file(GLOB OPENCL_INSTALLED_FILES + gpu_utils/vectype_ops.clh + gpu_utils/device_utils.clh + ) + install(FILES ${OPENCL_INSTALLED_FILES} + DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/gpu_utils + COMPONENT libraries) + file(GLOB OPENCL_INSTALLED_FILES + pbcutil/ishift.h + ) + install(FILES ${OPENCL_INSTALLED_FILES} + DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/pbcutil + COMPONENT libraries) + + # Install the NB source and headers + file(GLOB OPENCL_INSTALLED_FILES + mdlib/nbnxn_consts.h + ) + install(FILES ${OPENCL_INSTALLED_FILES} + DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib + COMPONENT libraries) + file(GLOB OPENCL_INSTALLED_FILES + mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl + mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh + mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh + mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh + mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen.clh + mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh + mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh + mdlib/nbnxn_ocl/nbnxn_ocl_consts.h + ) + install(FILES ${OPENCL_INSTALLED_FILES} + DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib/nbnxn_ocl + COMPONENT libraries) + + # Install the PME source and headers + file(GLOB OPENCL_INSTALLED_FILES + ewald/pme-spread.clh + ewald/pme-solve.clh + ewald/pme-gather.clh + ewald/pme-gpu-utils.clh + ewald/pme-program.cl + ewald/pme-gpu-types.h + ) + install(FILES ${OPENCL_INSTALLED_FILES} + DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/ewald + COMPONENT libraries) +endif() diff --git a/patches/gromacs-2019.2.diff/src/gromacs/CMakeLists.txt.preplumed 
b/patches/gromacs-2019.2.diff/src/gromacs/CMakeLists.txt.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..f94af553a91047274f879856836782856d62abcb --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/CMakeLists.txt.preplumed @@ -0,0 +1,445 @@ +# +# This file is part of the GROMACS molecular simulation package. +# +# Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by +# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, +# and including many others, as listed in the AUTHORS file in the +# top-level source directory and at http://www.gromacs.org. +# +# GROMACS is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License +# as published by the Free Software Foundation; either version 2.1 +# of the License, or (at your option) any later version. +# +# GROMACS is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with GROMACS; if not, see +# http://www.gnu.org/licenses, or write to the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# If you want to redistribute modifications to GROMACS, please +# consider that scientific software is very special. Version +# control is crucial - bugs must be traceable. We will be happy to +# consider code for inclusion in the official distribution, but +# derived work must not be called official GROMACS. Details are found +# in the README & COPYING files - if they are missing, get the +# official version at http://www.gromacs.org. +# +# To help us fund GROMACS development, we humbly ask that you cite +# the research papers on the package. Check out http://www.gromacs.org. + +set(LIBGROMACS_SOURCES) + +if (GMX_CLANG_CUDA) + include(gmxClangCudaUtils) +endif() + +set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES) +set_property(GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES) +set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS) +set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE) + +add_library(libgromacs_external OBJECT "") +if(CMAKE_COMPILER_IS_GNUCXX) + # Keep quiet about e.g. linearalgebra module + target_compile_options(libgromacs_external PRIVATE ${CXXFLAGS_NO_STRINGOP_TRUNCATION}) +endif() + +add_library(libgromacs_generated OBJECT "") +if (BUILD_SHARED_LIBS) + set_target_properties(libgromacs_external PROPERTIES POSITION_INDEPENDENT_CODE true) + set_target_properties(libgromacs_generated PROPERTIES POSITION_INDEPENDENT_CODE true) +endif() + +function (_gmx_add_files_to_property PROPERTY) + foreach (_file ${ARGN}) + if (IS_ABSOLUTE "${_file}") + set_property(GLOBAL APPEND PROPERTY ${PROPERTY} ${_file}) + else() + set_property(GLOBAL APPEND PROPERTY ${PROPERTY} + ${CMAKE_CURRENT_LIST_DIR}/${_file}) + endif() + endforeach() +endfunction () + +function (gmx_add_libgromacs_sources) + _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN}) +endfunction () + +# TODO Reconsider this, as the CUDA driver API is probably a simpler +# approach, at least for the build system. 
See Redmine #2530 +function (gmx_compile_cpp_as_cuda) + _gmx_add_files_to_property(GMX_LIBGROMACS_GPU_IMPL_SOURCES ${ARGN}) +endfunction () + +function (gmx_install_headers) + if (NOT GMX_BUILD_MDRUN_ONLY) + file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR}) + install(FILES ${ARGN} + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${_dest}" + COMPONENT development) + endif() + _gmx_add_files_to_property(GMX_INSTALLED_HEADERS ${ARGN}) +endfunction () + +function (gmx_write_installed_header_list) + get_property(_list GLOBAL PROPERTY GMX_INSTALLED_HEADERS) + string(REPLACE ";" "\n" _list "${_list}") + # TODO: Make this only update the file timestamp if the contents actually change. + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/installed-headers.txt "${_list}") +endfunction() + +add_subdirectory(gmxlib) +add_subdirectory(mdlib) +add_subdirectory(applied-forces) +add_subdirectory(listed-forces) +add_subdirectory(commandline) +add_subdirectory(domdec) +add_subdirectory(ewald) +add_subdirectory(fft) +add_subdirectory(gpu_utils) +add_subdirectory(hardware) +add_subdirectory(linearalgebra) +add_subdirectory(math) +add_subdirectory(mdrun) +add_subdirectory(mdrunutility) +add_subdirectory(mdtypes) +add_subdirectory(onlinehelp) +add_subdirectory(options) +add_subdirectory(pbcutil) +add_subdirectory(random) +add_subdirectory(restraint) +add_subdirectory(tables) +add_subdirectory(taskassignment) +add_subdirectory(timing) +add_subdirectory(topology) +add_subdirectory(trajectory) +add_subdirectory(utility) +add_subdirectory(fileio) +add_subdirectory(swap) +add_subdirectory(essentialdynamics) +add_subdirectory(pulling) +add_subdirectory(awh) +add_subdirectory(simd) +add_subdirectory(imd) +add_subdirectory(compat) +add_subdirectory(mimic) +if (NOT GMX_BUILD_MDRUN_ONLY) + add_subdirectory(gmxana) + add_subdirectory(gmxpreprocess) + add_subdirectory(correlationfunctions) + add_subdirectory(statistics) + add_subdirectory(analysisdata) + add_subdirectory(selection) + add_subdirectory(trajectoryanalysis) + add_subdirectory(energyanalysis) + add_subdirectory(tools) +endif() + +get_property(PROPERTY_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES) +list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES} ${PROPERTY_SOURCES}) + +# This would be the standard way to include thread_mpi, but +# we want libgromacs to link the functions directly +#if(GMX_THREAD_MPI) +# add_subdirectory(thread_mpi) +#endif() +#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB}) + +tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src) +target_sources(libgromacs_external PRIVATE ${THREAD_MPI_SOURCES}) + +configure_file(version.h.cmakein version.h) +gmx_install_headers( + analysisdata.h + commandline.h + options.h + random.h + selection.h + trajectoryanalysis.h + utility.h + ${CMAKE_CURRENT_BINARY_DIR}/version.h + ) + +# This code is here instead of utility/CMakeLists.txt, because CMake +# custom commands and source file properties can only be set in the directory +# that contains the target that uses them. +# TODO: Generate a header instead that can be included from baseversion.c. +# That probably simplifies things somewhat. 
+set(GENERATED_VERSION_FILE utility/baseversion-gen.cpp) +gmx_configure_version_file( + utility/baseversion-gen.cpp.cmakein ${GENERATED_VERSION_FILE} + REMOTE_HASH + EXTRA_VARS + GMX_SOURCE_DOI + ) +list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE} + $<TARGET_OBJECTS:libgromacs_external> + $<TARGET_OBJECTS:libgromacs_generated>) + +# Mark some shared GPU implementation files to compile with CUDA if needed +if (GMX_USE_CUDA) + get_property(LIBGROMACS_GPU_IMPL_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES) + set_source_files_properties(${LIBGROMACS_GPU_IMPL_SOURCES} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) +endif() + +# set up CUDA compilation with clang +if (GMX_CLANG_CUDA) + foreach (_file ${LIBGROMACS_SOURCES}) + get_filename_component(_ext ${_file} EXT) + get_source_file_property(_cuda_source_format ${_file} CUDA_SOURCE_PROPERTY_FORMAT) + if ("${_ext}" STREQUAL ".cu" OR _cuda_source_format) + gmx_compile_cuda_file_with_clang(${_file}) + endif() + endforeach() +endif() + +if (GMX_USE_CUDA) + # Work around FindCUDA that prevents using target_link_libraries() + # with keywords otherwise... + set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES}) + if (NOT GMX_CLANG_CUDA) + cuda_add_library(libgromacs ${LIBGROMACS_SOURCES}) + else() + add_library(libgromacs ${LIBGROMACS_SOURCES}) + endif() + target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES}) +else() + add_library(libgromacs ${LIBGROMACS_SOURCES}) +endif() + +if (GMX_USE_OPENCL) + option(GMX_EXTERNAL_CLFFT "True if an external clFFT is required to be used" FALSE) + mark_as_advanced(GMX_EXTERNAL_CLFFT) + + # Default to using clFFT found on the system + # switch to quiet at the second run. + if (DEFINED clFFT_LIBRARY) + set (clFFT_FIND_QUIETLY TRUE) + endif() + find_package(clFFT) + if (NOT clFFT_FOUND) + if (GMX_EXTERNAL_CLFFT) + message(FATAL_ERROR "Did not find required external clFFT library, consider setting clFFT_ROOT_DIR") + endif() + + if(MSVC) + message(FATAL_ERROR +"An OpenCL build was requested with Visual Studio compiler, but GROMACS +requires clFFT, which was not found on your system. GROMACS does bundle +clFFT to help with building for OpenCL, but that clFFT has not yet been +ported to the more recent versions of that compiler that GROMACS itself +requires. Thus for now, OpenCL is not available with MSVC and the internal +build of clFFT in GROMACS 2019. Either change compiler, try installing +a clFFT package, or use the latest GROMACS 2018 point release.") + endif() + + # Fall back on the internal version + set (_clFFT_dir ../external/clFFT/src) + add_subdirectory(${_clFFT_dir} clFFT-build) + target_sources(libgromacs PRIVATE + $<TARGET_OBJECTS:clFFT> + ) + target_include_directories(libgromacs SYSTEM PRIVATE ${_clFFT_dir}/include) + # Use the magic variable for how to link any library needed for + # dlopen, etc. which is -ldl where needed, and empty otherwise + # (e.g. Windows, BSD, Mac). + target_link_libraries(libgromacs PRIVATE "${CMAKE_DL_LIBS}") + else() + target_link_libraries(libgromacs PRIVATE clFFT) + endif() +endif() + +# Recent versions of gcc and clang give warnings on scanner.cpp, which +# is a generated source file. These are awkward to suppress inline, so +# we do it in the compilation command (after testing that the compiler +# supports the suppressions). Same issue exists for nonbonded kernels +# so we supress them for all generated files. 
+include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-Wno-unused -Wno-unused-parameter" HAS_NO_UNUSED) +check_cxx_compiler_flag(-Wno-missing-declarations HAS_NO_MISSING_DECL) +check_cxx_compiler_flag(-Wno-missing-prototypes HAS_NO_MISSING_PROTO) +check_cxx_compiler_flag(/wd4101 HAS_NO_MSVC_UNUSED) +if (NOT MSVC) + check_cxx_compiler_flag(-wd1419 HAS_DECL_IN_SOURCE) +endif() +if (HAS_NO_UNUSED) + target_compile_options(libgromacs_generated PRIVATE "-Wno-unused;-Wno-unused-parameter") +endif() +if (HAS_NO_MISSING_DECL) + target_compile_options(libgromacs_generated PRIVATE "-Wno-missing-declarations") +endif() +# TODO The group scheme kernels don't use proper function prototype +# declarations, and clang warns about such use, which we suppress +# rather than fix. We would prefer to use no suppressions. However +# other compilers do not support such a warning suppression for C++ +# source files, and issue warnings about that. Remove the use of +# -Wno-missing-prototypes here and above when the group scheme is +# removed. +if (HAS_NO_MISSING_PROTO AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + target_compile_options(libgromacs_generated PRIVATE "-Wno-missing-prototypes") +endif() +if (HAS_NO_MSVC_UNUSED) + target_compile_options(libgromacs_generated PRIVATE "/wd4101") +endif() +if (HAS_DECL_IN_SOURCE) + target_compile_options(libgromacs_generated PRIVATE "-wd1419") +endif() + +if(SIMD_AVX_512_CXX_SUPPORTED AND NOT ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512_KNL")) + # Since we might be overriding -march=core-avx2, add a flag so we don't warn for this specific file. + # On KNL this can cause illegal instruction because the compiler might use non KNL AVX instructions + # with the SIMD_AVX_512_CXX_FLAGS flags. + set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS} ${CXX_NO_UNUSED_OPTION_WARNING_FLAGS}") +endif() + +gmx_setup_tng_for_libgromacs() + +target_link_libraries(libgromacs + PRIVATE + ${EXTRAE_LIBRARIES} + ${GMX_EXTRA_LIBRARIES} + ${GMX_COMMON_LIBRARIES} + ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES} + ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} + ${OpenCL_LIBRARIES} + ${GMX_STDLIB_LIBRARIES} + PUBLIC + ${GMX_PUBLIC_LIBRARIES} + ) +set_target_properties(libgromacs PROPERTIES + OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}" + SOVERSION ${LIBRARY_SOVERSION_MAJOR} + VERSION ${LIBRARY_VERSION} + COMPILE_FLAGS "${OpenMP_C_FLAGS}") + +gmx_manage_lmfit() +target_link_libraries(libgromacs PRIVATE lmfit) + +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^6\.0") + target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Weverything ${IGNORED_CLANG_ALL_WARNINGS}>) +endif() +if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/analyze /analyze:stacksize 70000 + #Control flow warnings are disabled because the command line output is insufficient. There is no tool + #to convert the xml report to e.g. HTML and even in Visual Studio the viewer doesn't work with cmake support. + /wd6001 #uninitialized memory + /wd6011 #dereferencing NULL + /wd6053 #prior call did not zero-terminate + /wd6054 #might not be zero-terminated + /wd6385 #reading invalid data + /wd6386 #buffer overrun + /wd6387 #could be '0' + /wd28199 #uninitialized memory + # For compile time constants (e.g.
templates) the following warnings have false positives + /wd6239 #(<non-zero> && <expr>) + /wd6240 #(<expr> && <non-zero>) + /wd6294 #Ill-defined for-loop + /wd6326 #comparison of constant with other constant + /wd28020 #expression involving parameter is not true + # Misc + /wd6330 #incorrect type to function (warns for char (instead of unsigned) for isspace/isalpha/isdigit/..)) + /wd6993 #OpenMP ignored + #TODO + /wd6031 #return value ignored (important - mostly warnings about sscanf) + /wd6244 #hides declaration (known issue - we ignore similar warnings for other compilers) + /wd6246 #hides declaration + > + ) +endif() + +if (GMX_CLANG_TIDY) + set_target_properties(libgromacs PROPERTIES CXX_CLANG_TIDY + "${CLANG_TIDY_EXE};-warnings-as-errors=*") +endif() + +gmx_write_installed_header_list() + +# Only install the library in mdrun-only mode if it is actually necessary +# for the binary +if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS) + install(TARGETS libgromacs + EXPORT libgromacs + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + COMPONENT libraries) +endif() + +if (NOT GMX_BUILD_MDRUN_ONLY) + include(InstallLibInfo.cmake) +endif() + +# Technically, the user could want to do this for an OpenCL build +# using the CUDA runtime, but currently there's no reason to want to +# do that. +if (INSTALL_CUDART_LIB) #can be set manually by the user + if (GMX_USE_CUDA) + foreach(CUDA_LIB ${CUDA_LIBRARIES}) + string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB}) + if(IS_CUDART) #libcuda should not be installed + #install also the name-links (the linker uses those) + file(GLOB CUDA_LIBS ${CUDA_LIB}*) + install(FILES ${CUDA_LIBS} DESTINATION + ${CMAKE_INSTALL_LIBDIR} COMPONENT libraries) + endif() + endforeach() + else() + message(WARNING "INSTALL_CUDART_LIB only makes sense when configuring for CUDA support") + endif() +endif() + +if(GMX_USE_OPENCL) + # Install the utility headers + file(GLOB OPENCL_INSTALLED_FILES + gpu_utils/vectype_ops.clh + gpu_utils/device_utils.clh + ) + install(FILES ${OPENCL_INSTALLED_FILES} + DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/gpu_utils + COMPONENT libraries) + file(GLOB OPENCL_INSTALLED_FILES + pbcutil/ishift.h + ) + install(FILES ${OPENCL_INSTALLED_FILES} + DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/pbcutil + COMPONENT libraries) + + # Install the NB source and headers + file(GLOB OPENCL_INSTALLED_FILES + mdlib/nbnxn_consts.h + ) + install(FILES ${OPENCL_INSTALLED_FILES} + DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib + COMPONENT libraries) + file(GLOB OPENCL_INSTALLED_FILES + mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl + mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh + mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh + mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh + mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen.clh + mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh + mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh + mdlib/nbnxn_ocl/nbnxn_ocl_consts.h + ) + install(FILES ${OPENCL_INSTALLED_FILES} + DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib/nbnxn_ocl + COMPONENT libraries) + + # Install the PME source and headers + file(GLOB OPENCL_INSTALLED_FILES + ewald/pme-spread.clh + ewald/pme-solve.clh + ewald/pme-gather.clh + ewald/pme-gpu-utils.clh + ewald/pme-program.cl + ewald/pme-gpu-types.h + ) + install(FILES ${OPENCL_INSTALLED_FILES} + DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/ewald + COMPONENT libraries) +endif() diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdlib/force.cpp
b/patches/gromacs-2019.2.diff/src/gromacs/mdlib/force.cpp new file mode 100644 index 0000000000000000000000000000000000000000..781a63d71a841ccc50b116703f3e088332b091fe --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdlib/force.cpp @@ -0,0 +1,879 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. 
+ */ +#include "gmxpre.h" + +#include "force.h" + +#include "config.h" + +#include <cassert> +#include <cmath> +#include <cstring> + +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/ewald.h" +#include "gromacs/ewald/long-range-correction.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gmxlib/nonbonded/nonbonded.h" +#include "gromacs/listed-forces/listed-forces.h" +#include "gromacs/math/vec.h" +#include "gromacs/math/vecdump.h" +#include "gromacs/mdlib/force_flags.h" +#include "gromacs/mdlib/forcerec-threading.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/qmmm.h" +#include "gromacs/mdlib/rf_util.h" +#include "gromacs/mdlib/wall.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/enerdata.h" +#include "gromacs/mdtypes/forceoutput.h" +#include "gromacs/mdtypes/forcerec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/pbcutil/ishift.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/smalloc.h" + +/* PLUMED */ +#include "../../../Plumed.h" +int plumedswitch=0; +plumed plumedmain; +/* END PLUMED */ + +void ns(FILE *fp, + t_forcerec *fr, + matrix box, + const gmx_groups_t *groups, + gmx_localtop_t *top, + const t_mdatoms *md, + const t_commrec *cr, + t_nrnb *nrnb, + gmx_bool bFillGrid) +{ + int nsearch; + + + if (!fr->ns->nblist_initialized) + { + init_neighbor_list(fp, fr, md->homenr); + } + + nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md, + bFillGrid); + if (debug) + { + fprintf(debug, "nsearch = %d\n", nsearch); + } + + /* Check whether we have to do dynamic load balancing */ + /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0)) + count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr, + &(top->idef),opts->ngener); + */ + if (fr->ns->dump_nl > 0) + { + dump_nblist(fp, cr, fr, fr->ns->dump_nl); + } +} + +static void clearEwaldThreadOutput(ewald_corr_thread_t *ewc_t) +{ + ewc_t->Vcorr_q = 0; + ewc_t->Vcorr_lj = 0; + ewc_t->dvdl[efptCOUL] = 0; + ewc_t->dvdl[efptVDW] = 0; + clear_mat(ewc_t->vir_q); + clear_mat(ewc_t->vir_lj); +} + +static void reduceEwaldThreadOuput(int nthreads, ewald_corr_thread_t *ewc_t) +{ + ewald_corr_thread_t &dest = ewc_t[0]; + + for (int t = 1; t < nthreads; t++) + { + dest.Vcorr_q += ewc_t[t].Vcorr_q; + dest.Vcorr_lj += ewc_t[t].Vcorr_lj; + dest.dvdl[efptCOUL] += ewc_t[t].dvdl[efptCOUL]; + dest.dvdl[efptVDW] += ewc_t[t].dvdl[efptVDW]; + m_add(dest.vir_q, ewc_t[t].vir_q, dest.vir_q); + m_add(dest.vir_lj, ewc_t[t].vir_lj, dest.vir_lj); + } +} + +void do_force_lowlevel(t_forcerec *fr, + const t_inputrec *ir, + const t_idef *idef, + const t_commrec *cr, + const gmx_multisim_t *ms, + t_nrnb *nrnb, + gmx_wallcycle_t wcycle, + const t_mdatoms *md, + rvec x[], + history_t *hist, + rvec *forceForUseWithShiftForces, + gmx::ForceWithVirial *forceWithVirial, + gmx_enerdata_t *enerd, + t_fcdata *fcd, + matrix box, + t_lambda *fepvals, + real *lambda, + const t_graph *graph, + const t_blocka *excl, + rvec mu_tot[], + int flags, + float *cycles_pme) +{ + int i, j; + int donb_flags; + int pme_flags; + t_pbc pbc; + real dvdl_dum[efptNR], dvdl_nb[efptNR]; + +#if GMX_MPI + double t0 = 0.0, t1, t2, t3; /* time 
measurement for coarse load balancing */ +#endif + + set_pbc(&pbc, fr->ePBC, box); + + /* reset free energy components */ + for (i = 0; i < efptNR; i++) + { + dvdl_nb[i] = 0; + dvdl_dum[i] = 0; + } + + /* do QMMM first if requested */ + if (fr->bQMMM) + { + enerd->term[F_EQM] = calculate_QMMM(cr, forceForUseWithShiftForces, fr); + } + + /* Call the short range functions all in one go. */ + +#if GMX_MPI + /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/ +#define TAKETIME FALSE + if (TAKETIME) + { + MPI_Barrier(cr->mpi_comm_mygroup); + t0 = MPI_Wtime(); + } +#endif + + if (ir->nwall) + { + /* foreign lambda component for walls */ + real dvdl_walls = do_walls(*ir, *fr, box, *md, x, + forceWithVirial, lambda[efptVDW], + enerd->grpp.ener[egLJSR], nrnb); + enerd->dvdl_lin[efptVDW] += dvdl_walls; + } + + /* We only do the non-bonded calculation with the group scheme here; the verlet + * calls are done from do_force_cutsVERLET(). */ + if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED)) + { + donb_flags = 0; + /* Add short-range interactions */ + donb_flags |= GMX_NONBONDED_DO_SR; + + /* Currently all group scheme kernels always calculate (shift-)forces */ + if (flags & GMX_FORCE_FORCES) + { + donb_flags |= GMX_NONBONDED_DO_FORCE; + } + if (flags & GMX_FORCE_VIRIAL) + { + donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; + } + if (flags & GMX_FORCE_ENERGY) + { + donb_flags |= GMX_NONBONDED_DO_POTENTIAL; + } + + wallcycle_sub_start(wcycle, ewcsNONBONDED); + do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl, + &enerd->grpp, nrnb, + lambda, dvdl_nb, -1, -1, donb_flags); + + /* If we do foreign lambda and we have soft-core interactions + * we have to recalculate the (non-linear) energy contributions. + */ + if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0) + { + for (i = 0; i < enerd->n_lambda; i++) + { + real lam_i[efptNR]; + + for (j = 0; j < efptNR; j++) + { + lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); + } + reset_foreign_enerdata(enerd); + do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl, + &(enerd->foreign_grpp), nrnb, + lam_i, dvdl_dum, -1, -1, + (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA); + sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); + enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; + } + } + wallcycle_sub_stop(wcycle, ewcsNONBONDED); + } + +#if GMX_MPI + if (TAKETIME) + { + t1 = MPI_Wtime(); + fr->t_fnbf += t1-t0; + } +#endif + + if (fepvals->sc_alpha != 0) + { + enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW]; + } + else + { + enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW]; + } + + if (fepvals->sc_alpha != 0) + + /* even though the coulomb part is linear, we already added it, because we + need to go through the vdw calculation anyway */ + { + enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL]; + } + else + { + enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL]; + } + + if (debug) + { + pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS); + } + + /* Shift the coordinates. Must be done before listed forces and PPPM, + * but is also necessary for SHAKE and update, therefore it can NOT + * be skipped when no listed forces have to be evaluated. + * + * The shifting and PBC code is deliberately not timed, since with + * the Verlet scheme it only takes non-zero time with triclinic + * boxes, and even then the time is around a factor of 100 less + * than the next smallest counter.
+ */ + + + /* Here sometimes we would not need to shift with NBFonly, + * but we do so anyhow for consistency of the returned coordinates. + */ + if (graph) + { + shift_self(graph, box, x); + if (TRICLINIC(box)) + { + inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes); + } + else + { + inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes); + } + } + /* Check whether we need to do listed interactions or correct for exclusions */ + if (fr->bMolPBC && + ((flags & GMX_FORCE_LISTED) + || EEL_RF(fr->ic->eeltype) || EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))) + { + /* TODO There are no electrostatics methods that require this + transformation, when using the Verlet scheme, so update the + above conditional. */ + /* Since all atoms are in the rectangular or triclinic unit-cell, + * only single box vector shifts (2 in x) are required. + */ + set_pbc_dd(&pbc, fr->ePBC, DOMAINDECOMP(cr) ? cr->dd->nc : nullptr, + TRUE, box); + } + + do_force_listed(wcycle, box, ir->fepvals, cr, ms, + idef, x, hist, + forceForUseWithShiftForces, forceWithVirial, + fr, &pbc, graph, enerd, nrnb, lambda, md, fcd, + DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr, + flags); + + + *cycles_pme = 0; + + /* Do long-range electrostatics and/or LJ-PME, including related short-range + * corrections. + */ + if (EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + int status = 0; + real Vlr_q = 0, Vlr_lj = 0; + + /* We reduce all virial, dV/dlambda and energy contributions, except + * for the reciprocal energies (Vlr_q, Vlr_lj) into the same struct. + */ + ewald_corr_thread_t &ewaldOutput = fr->ewc_t[0]; + clearEwaldThreadOutput(&ewaldOutput); + + if (EEL_PME_EWALD(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + /* With the Verlet scheme exclusion forces are calculated + * in the non-bonded kernel. + */ + /* The TPI molecule does not have exclusions with the rest + * of the system and no intra-molecular PME grid + * contributions will be calculated in + * gmx_pme_calc_energy. + */ + if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) || + ir->ewald_geometry != eewg3D || + ir->epsilon_surface != 0) + { + int nthreads, t; + + wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION); + + if (fr->n_tpi > 0) + { + gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions"); + } + + nthreads = fr->nthread_ewc; +#pragma omp parallel for num_threads(nthreads) schedule(static) + for (t = 0; t < nthreads; t++) + { + try + { + ewald_corr_thread_t &ewc_t = fr->ewc_t[t]; + if (t > 0) + { + clearEwaldThreadOutput(&ewc_t); + } + + /* Threading is only supported with the Verlet cut-off + * scheme and then only single particle forces (no + * exclusion forces) are calculated, so we can store + * the forces in the normal, single forceWithVirial->force_ array. 
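+ * (Editor's note, an illustrative aside: each thread t accumulates into
+ * its own fr->ewc_t[t] struct, cleared by clearEwaldThreadOutput() above;
+ * after the parallel region, reduceEwaldThreadOuput() folds the partial
+ * results into fr->ewc_t[0], schematically
+ *     for (t = 1; t < nthreads; t++) { ewc_t[0] += ewc_t[t]; }
+ * so only thread 0's output is read afterwards.)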
+ */ + ewald_LRcorrection(md->homenr, cr, nthreads, t, fr, ir, + md->chargeA, md->chargeB, + md->sqrt_c6A, md->sqrt_c6B, + md->sigmaA, md->sigmaB, + md->sigma3A, md->sigma3B, + (md->nChargePerturbed != 0) || (md->nTypePerturbed != 0), + ir->cutoff_scheme != ecutsVERLET, + excl, x, box, mu_tot, + ir->ewald_geometry, + ir->epsilon_surface, + as_rvec_array(forceWithVirial->force_.data()), + ewc_t.vir_q, ewc_t.vir_lj, + &ewc_t.Vcorr_q, &ewc_t.Vcorr_lj, + lambda[efptCOUL], lambda[efptVDW], + &ewc_t.dvdl[efptCOUL], &ewc_t.dvdl[efptVDW]); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + if (nthreads > 1) + { + reduceEwaldThreadOuput(nthreads, fr->ewc_t); + } + wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); + } + + if (EEL_PME_EWALD(fr->ic->eeltype) && fr->n_tpi == 0) + { + /* This is not in a subcounter because it takes a + negligible and constant-sized amount of time */ + ewaldOutput.Vcorr_q += + ewald_charge_correction(cr, fr, lambda[efptCOUL], box, + &ewaldOutput.dvdl[efptCOUL], + ewaldOutput.vir_q); + } + + if ((EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) && + thisRankHasDuty(cr, DUTY_PME) && (pme_run_mode(fr->pmedata) == PmeRunMode::CPU)) + { + /* Do reciprocal PME for Coulomb and/or LJ. */ + assert(fr->n_tpi >= 0); + if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED)) + { + pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE; + + if (flags & GMX_FORCE_FORCES) + { + pme_flags |= GMX_PME_CALC_F; + } + if (flags & GMX_FORCE_VIRIAL) + { + pme_flags |= GMX_PME_CALC_ENER_VIR; + } + if (fr->n_tpi > 0) + { + /* We don't calculate f, but we do want the potential */ + pme_flags |= GMX_PME_CALC_POT; + } + + /* With domain decomposition we close the CPU side load + * balancing region here, because PME does global + * communication that acts as a global barrier. + */ + if (DOMAINDECOMP(cr)) + { + ddCloseBalanceRegionCpu(cr->dd); + } + + wallcycle_start(wcycle, ewcPMEMESH); + status = gmx_pme_do(fr->pmedata, + 0, md->homenr - fr->n_tpi, + x, + as_rvec_array(forceWithVirial->force_.data()), + md->chargeA, md->chargeB, + md->sqrt_c6A, md->sqrt_c6B, + md->sigmaA, md->sigmaB, + box, cr, + DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0, + DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0, + nrnb, wcycle, + ewaldOutput.vir_q, ewaldOutput.vir_lj, + &Vlr_q, &Vlr_lj, + lambda[efptCOUL], lambda[efptVDW], + &ewaldOutput.dvdl[efptCOUL], + &ewaldOutput.dvdl[efptVDW], + pme_flags); + *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH); + if (status != 0) + { + gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status); + } + + /* We should try to do as little computation after + * this as possible, because parallel PME synchronizes + * the nodes, so we want all load imbalance of the + * rest of the force calculation to be before the PME + * call. DD load balancing is done on the whole time + * of the force call (without PME). + */ + } + if (fr->n_tpi > 0) + { + if (EVDW_PME(ir->vdwtype)) + { + + gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME"); + } + /* Determine the PME grid energy of the test molecule + * with the PME grid potential of the other charges. 
+ */ + gmx_pme_calc_energy(fr->pmedata, fr->n_tpi, + x + md->homenr - fr->n_tpi, + md->chargeA + md->homenr - fr->n_tpi, + &Vlr_q); + } + } + } + + if (!EEL_PME(fr->ic->eeltype) && EEL_PME_EWALD(fr->ic->eeltype)) + { + Vlr_q = do_ewald(ir, x, as_rvec_array(forceWithVirial->force_.data()), + md->chargeA, md->chargeB, + box, cr, md->homenr, + ewaldOutput.vir_q, fr->ic->ewaldcoeff_q, + lambda[efptCOUL], &ewaldOutput.dvdl[efptCOUL], + fr->ewald_table); + } + + /* Note that with separate PME nodes we get the real energies later */ + // TODO it would be simpler if we just accumulated a single + // long-range virial contribution. + forceWithVirial->addVirialContribution(ewaldOutput.vir_q); + forceWithVirial->addVirialContribution(ewaldOutput.vir_lj); + enerd->dvdl_lin[efptCOUL] += ewaldOutput.dvdl[efptCOUL]; + enerd->dvdl_lin[efptVDW] += ewaldOutput.dvdl[efptVDW]; + enerd->term[F_COUL_RECIP] = Vlr_q + ewaldOutput.Vcorr_q; + enerd->term[F_LJ_RECIP] = Vlr_lj + ewaldOutput.Vcorr_lj; + + if (debug) + { + fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n", + Vlr_q, ewaldOutput.Vcorr_q, enerd->term[F_COUL_RECIP]); + pr_rvecs(debug, 0, "vir_el_recip after corr", ewaldOutput.vir_q, DIM); + pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS); + fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n", + Vlr_lj, ewaldOutput.Vcorr_lj, enerd->term[F_LJ_RECIP]); + pr_rvecs(debug, 0, "vir_lj_recip after corr", ewaldOutput.vir_lj, DIM); + } + } + else + { + /* Is there a reaction-field exclusion correction needed? + * With the Verlet scheme, exclusion forces are calculated + * in the non-bonded kernel. + */ + if (ir->cutoff_scheme != ecutsVERLET && EEL_RF(fr->ic->eeltype)) + { + real dvdl_rf_excl = 0; + enerd->term[F_RF_EXCL] = + RF_excl_correction(fr, graph, md, excl, DOMAINDECOMP(cr), + x, forceForUseWithShiftForces, + fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl); + + enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl; + } + } + + if (debug) + { + print_nrnb(debug, nrnb); + } + +#if GMX_MPI + if (TAKETIME) + { + t2 = MPI_Wtime(); + MPI_Barrier(cr->mpi_comm_mygroup); + t3 = MPI_Wtime(); + fr->t_wait += t3-t2; + if (fr->timesteps == 11) + { + char buf[22]; + fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n", + cr->nodeid, gmx_step_str(fr->timesteps, buf), + 100*fr->t_wait/(fr->t_wait+fr->t_fnbf), + (fr->t_fnbf+fr->t_wait)/fr->t_fnbf); + } + fr->timesteps++; + } +#endif + + if (debug) + { + pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); + } + + /* PLUMED */ + if(plumedswitch){ + int plumedNeedsEnergy; + plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); + if(!plumedNeedsEnergy) plumed_cmd(plumedmain,"performCalc",NULL); + } + /* END PLUMED */ +} + +void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) +{ + int i, n2; + + for (i = 0; i < F_NRE; i++) + { + enerd->term[i] = 0; + enerd->foreign_term[i] = 0; + } + + + for (i = 0; i < efptNR; i++) + { + enerd->dvdl_lin[i] = 0; + enerd->dvdl_nonlin[i] = 0; + } + + n2 = ngener*ngener; + if (debug) + { + fprintf(debug, "Creating %d sized group matrix for energies\n", n2); + } + enerd->grpp.nener = n2; + enerd->foreign_grpp.nener = n2; + for (i = 0; (i < egNR); i++) + { + snew(enerd->grpp.ener[i], n2); + snew(enerd->foreign_grpp.ener[i], n2); + } + + if (n_lambda) + { + enerd->n_lambda = 1 + n_lambda; + snew(enerd->enerpart_lambda, enerd->n_lambda); + } + else + { + enerd->n_lambda = 0; + } +} + +void destroy_enerdata(gmx_enerdata_t *enerd) 
+{ + int i; + + for (i = 0; (i < egNR); i++) + { + sfree(enerd->grpp.ener[i]); + } + + for (i = 0; (i < egNR); i++) + { + sfree(enerd->foreign_grpp.ener[i]); + } + + if (enerd->n_lambda) + { + sfree(enerd->enerpart_lambda); + } +} + +static real sum_v(int n, const real v[]) +{ + real t; + int i; + + t = 0.0; + for (i = 0; (i < n); i++) + { + t = t + v[i]; + } + + return t; +} + +void sum_epot(gmx_grppairener_t *grpp, real *epot) +{ + int i; + + /* Accumulate energies */ + epot[F_COUL_SR] = sum_v(grpp->nener, grpp->ener[egCOULSR]); + epot[F_LJ] = sum_v(grpp->nener, grpp->ener[egLJSR]); + epot[F_LJ14] = sum_v(grpp->nener, grpp->ener[egLJ14]); + epot[F_COUL14] = sum_v(grpp->nener, grpp->ener[egCOUL14]); + +/* the lattice part of LR doesn't belong to any group + * and has been added earlier + */ + epot[F_BHAM] = sum_v(grpp->nener, grpp->ener[egBHAMSR]); + + epot[F_EPOT] = 0; + for (i = 0; (i < F_EPOT); i++) + { + if (i != F_DISRESVIOL && i != F_ORIRESDEV) + { + epot[F_EPOT] += epot[i]; + } + } +} + +void sum_dhdl(gmx_enerdata_t *enerd, gmx::ArrayRef<const real> lambda, t_lambda *fepvals) +{ + int index; + + enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW]; /* include dispersion correction */ + enerd->term[F_DVDL] = 0.0; + for (int i = 0; i < efptNR; i++) + { + if (fepvals->separate_dvdl[i]) + { + /* could this be done more readably/compactly? */ + switch (i) + { + case (efptMASS): + index = F_DKDL; + break; + case (efptCOUL): + index = F_DVDL_COUL; + break; + case (efptVDW): + index = F_DVDL_VDW; + break; + case (efptBONDED): + index = F_DVDL_BONDED; + break; + case (efptRESTRAINT): + index = F_DVDL_RESTRAINT; + break; + default: + index = F_DVDL; + break; + } + enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; + if (debug) + { + fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n", + efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); + } + } + else + { + enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; + if (debug) + { + fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n", + efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); + } + } + } + + if (fepvals->separate_dvdl[efptBONDED]) + { + enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR]; + } + else + { + enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR]; + } + + for (int i = 0; i < fepvals->n_lambda; i++) + { + /* note we are iterating over fepvals here! + For the current lam, dlam = 0 automatically, + so we don't need to add anything to the + enerd->enerpart_lambda[0] */ + + /* we don't need to worry about dvdl_lin contributions to dE at + current lambda, because the contributions to the current + lambda are automatically zeroed */ + + double &enerpart_lambda = enerd->enerpart_lambda[i + 1]; + + for (gmx::index j = 0; j < lambda.size(); j++) + { + /* Note that this loop is over all dhdl components, not just the separated ones */ + const double dlam = fepvals->all_lambda[j][i] - lambda[j]; + + enerpart_lambda += dlam*enerd->dvdl_lin[j]; + + /* Constraints can not be evaluated at foreign lambdas, so we add + * a linear extrapolation. This is an approximation, but usually + * quite accurate since constraints change little between lambdas.
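+ * (Editor's illustration of that extrapolation, in the variables used
+ * here: with
+ *     dlam = fepvals->all_lambda[j][i] - lambda[j]
+ * the foreign energy below picks up dlam*enerd->term[F_DVDL_CONSTR],
+ * i.e. E(lambda_i) is approximated by E(lambda) + dlam * dV/dlambda
+ * for the constraint contribution.)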
+ */ + if ((j == efptBONDED && fepvals->separate_dvdl[efptBONDED]) || + (j == efptFEP && !fepvals->separate_dvdl[efptBONDED])) + { + enerpart_lambda += dlam*enerd->term[F_DVDL_CONSTR]; + } + + if (j == efptMASS && !fepvals->separate_dvdl[j]) + { + enerpart_lambda += dlam*enerd->term[F_DKDL]; + } + + if (debug) + { + fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n", + fepvals->all_lambda[j][i], efpt_names[j], + enerpart_lambda - enerd->enerpart_lambda[0], + dlam, enerd->dvdl_lin[j]); + } + } + } + + /* The constraint contribution is now included in other terms, so clear it */ + enerd->term[F_DVDL_CONSTR] = 0; +} + + +void reset_foreign_enerdata(gmx_enerdata_t *enerd) +{ + int i, j; + + /* First reset all foreign energy components. Foreign energies are always called on + neighbor search steps */ + for (i = 0; (i < egNR); i++) + { + for (j = 0; (j < enerd->grpp.nener); j++) + { + enerd->foreign_grpp.ener[i][j] = 0.0; + } + } + + /* potential energy components */ + for (i = 0; (i <= F_EPOT); i++) + { + enerd->foreign_term[i] = 0.0; + } +} + +void reset_enerdata(gmx_enerdata_t *enerd) +{ + int i, j; + + /* First reset all energy components. */ + for (i = 0; (i < egNR); i++) + { + for (j = 0; (j < enerd->grpp.nener); j++) + { + enerd->grpp.ener[i][j] = 0.0; + } + } + for (i = 0; i < efptNR; i++) + { + enerd->dvdl_lin[i] = 0.0; + enerd->dvdl_nonlin[i] = 0.0; + } + + /* Normal potential energy components */ + for (i = 0; (i <= F_EPOT); i++) + { + enerd->term[i] = 0.0; + } + enerd->term[F_DVDL] = 0.0; + enerd->term[F_DVDL_COUL] = 0.0; + enerd->term[F_DVDL_VDW] = 0.0; + enerd->term[F_DVDL_BONDED] = 0.0; + enerd->term[F_DVDL_RESTRAINT] = 0.0; + enerd->term[F_DKDL] = 0.0; + if (enerd->n_lambda > 0) + { + for (i = 0; i < enerd->n_lambda; i++) + { + enerd->enerpart_lambda[i] = 0.0; + } + } + /* reset foreign energy data - separate function since we also call it elsewhere */ + reset_foreign_enerdata(enerd); +} diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdlib/force.cpp.preplumed b/patches/gromacs-2019.2.diff/src/gromacs/mdlib/force.cpp.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..8a7615365140ca43108e1a445b360f36696a93b3 --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdlib/force.cpp.preplumed @@ -0,0 +1,866 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#include "gmxpre.h" + +#include "force.h" + +#include "config.h" + +#include <cassert> +#include <cmath> +#include <cstring> + +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/ewald.h" +#include "gromacs/ewald/long-range-correction.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gmxlib/nonbonded/nonbonded.h" +#include "gromacs/listed-forces/listed-forces.h" +#include "gromacs/math/vec.h" +#include "gromacs/math/vecdump.h" +#include "gromacs/mdlib/force_flags.h" +#include "gromacs/mdlib/forcerec-threading.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/qmmm.h" +#include "gromacs/mdlib/rf_util.h" +#include "gromacs/mdlib/wall.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/enerdata.h" +#include "gromacs/mdtypes/forceoutput.h" +#include "gromacs/mdtypes/forcerec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/pbcutil/ishift.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/smalloc.h" + +void ns(FILE *fp, + t_forcerec *fr, + matrix box, + const gmx_groups_t *groups, + gmx_localtop_t *top, + const t_mdatoms *md, + const t_commrec *cr, + t_nrnb *nrnb, + gmx_bool bFillGrid) +{ + int nsearch; + + + if (!fr->ns->nblist_initialized) + { + init_neighbor_list(fp, fr, md->homenr); + } + + nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md, + bFillGrid); + if (debug) + { + fprintf(debug, "nsearch = %d\n", nsearch); + } + + /* Check whether we have to do dynamic load balancing */ + /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0)) + count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr, + &(top->idef),opts->ngener); + */ + if (fr->ns->dump_nl > 0) + { + dump_nblist(fp, cr, fr, fr->ns->dump_nl); + } +} + +static void clearEwaldThreadOutput(ewald_corr_thread_t *ewc_t) +{ + ewc_t->Vcorr_q = 0; + ewc_t->Vcorr_lj = 0; + ewc_t->dvdl[efptCOUL] = 0; + ewc_t->dvdl[efptVDW] = 0; + clear_mat(ewc_t->vir_q); + clear_mat(ewc_t->vir_lj); +} + +static void reduceEwaldThreadOuput(int nthreads, ewald_corr_thread_t *ewc_t) +{ + ewald_corr_thread_t &dest = ewc_t[0]; + + for (int t = 1; t < nthreads; t++) + { + dest.Vcorr_q += ewc_t[t].Vcorr_q; + dest.Vcorr_lj += ewc_t[t].Vcorr_lj; + dest.dvdl[efptCOUL] += ewc_t[t].dvdl[efptCOUL]; + dest.dvdl[efptVDW] += ewc_t[t].dvdl[efptVDW]; + m_add(dest.vir_q, 
ewc_t[t].vir_q, dest.vir_q); + m_add(dest.vir_lj, ewc_t[t].vir_lj, dest.vir_lj); + } +} + +void do_force_lowlevel(t_forcerec *fr, + const t_inputrec *ir, + const t_idef *idef, + const t_commrec *cr, + const gmx_multisim_t *ms, + t_nrnb *nrnb, + gmx_wallcycle_t wcycle, + const t_mdatoms *md, + rvec x[], + history_t *hist, + rvec *forceForUseWithShiftForces, + gmx::ForceWithVirial *forceWithVirial, + gmx_enerdata_t *enerd, + t_fcdata *fcd, + matrix box, + t_lambda *fepvals, + real *lambda, + const t_graph *graph, + const t_blocka *excl, + rvec mu_tot[], + int flags, + float *cycles_pme) +{ + int i, j; + int donb_flags; + int pme_flags; + t_pbc pbc; + real dvdl_dum[efptNR], dvdl_nb[efptNR]; + +#if GMX_MPI + double t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */ +#endif + + set_pbc(&pbc, fr->ePBC, box); + + /* reset free energy components */ + for (i = 0; i < efptNR; i++) + { + dvdl_nb[i] = 0; + dvdl_dum[i] = 0; + } + + /* do QMMM first if requested */ + if (fr->bQMMM) + { + enerd->term[F_EQM] = calculate_QMMM(cr, forceForUseWithShiftForces, fr); + } + + /* Call the short range functions all in one go. */ + +#if GMX_MPI + /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/ +#define TAKETIME FALSE + if (TAKETIME) + { + MPI_Barrier(cr->mpi_comm_mygroup); + t0 = MPI_Wtime(); + } +#endif + + if (ir->nwall) + { + /* foreign lambda component for walls */ + real dvdl_walls = do_walls(*ir, *fr, box, *md, x, + forceWithVirial, lambda[efptVDW], + enerd->grpp.ener[egLJSR], nrnb); + enerd->dvdl_lin[efptVDW] += dvdl_walls; + } + + /* We only do non-bonded calculation with group scheme here, the verlet + * calls are done from do_force_cutsVERLET(). */ + if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED)) + { + donb_flags = 0; + /* Add short-range interactions */ + donb_flags |= GMX_NONBONDED_DO_SR; + + /* Currently all group scheme kernels always calculate (shift-)forces */ + if (flags & GMX_FORCE_FORCES) + { + donb_flags |= GMX_NONBONDED_DO_FORCE; + } + if (flags & GMX_FORCE_VIRIAL) + { + donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; + } + if (flags & GMX_FORCE_ENERGY) + { + donb_flags |= GMX_NONBONDED_DO_POTENTIAL; + } + + wallcycle_sub_start(wcycle, ewcsNONBONDED); + do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl, + &enerd->grpp, nrnb, + lambda, dvdl_nb, -1, -1, donb_flags); + + /* If we do foreign lambda and we have soft-core interactions + * we have to recalculate the (non-linear) energies contributions. + */ + if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0) + { + for (i = 0; i < enerd->n_lambda; i++) + { + real lam_i[efptNR]; + + for (j = 0; j < efptNR; j++) + { + lam_i[j] = (i == 0 ? 
lambda[j] : fepvals->all_lambda[j][i-1]); + } + reset_foreign_enerdata(enerd); + do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl, + &(enerd->foreign_grpp), nrnb, + lam_i, dvdl_dum, -1, -1, + (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA); + sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); + enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; + } + } + wallcycle_sub_stop(wcycle, ewcsNONBONDED); + } + +#if GMX_MPI + if (TAKETIME) + { + t1 = MPI_Wtime(); + fr->t_fnbf += t1-t0; + } +#endif + + if (fepvals->sc_alpha != 0) + { + enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW]; + } + else + { + enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW]; + } + + if (fepvals->sc_alpha != 0) + + /* even though coulomb part is linear, we already added it, beacuse we + need to go through the vdw calculation anyway */ + { + enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL]; + } + else + { + enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL]; + } + + if (debug) + { + pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS); + } + + /* Shift the coordinates. Must be done before listed forces and PPPM, + * but is also necessary for SHAKE and update, therefore it can NOT + * go when no listed forces have to be evaluated. + * + * The shifting and PBC code is deliberately not timed, since with + * the Verlet scheme it only takes non-zero time with triclinic + * boxes, and even then the time is around a factor of 100 less + * than the next smallest counter. + */ + + + /* Here sometimes we would not need to shift with NBFonly, + * but we do so anyhow for consistency of the returned coordinates. + */ + if (graph) + { + shift_self(graph, box, x); + if (TRICLINIC(box)) + { + inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes); + } + else + { + inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes); + } + } + /* Check whether we need to do listed interactions or correct for exclusions */ + if (fr->bMolPBC && + ((flags & GMX_FORCE_LISTED) + || EEL_RF(fr->ic->eeltype) || EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))) + { + /* TODO There are no electrostatics methods that require this + transformation, when using the Verlet scheme, so update the + above conditional. */ + /* Since all atoms are in the rectangular or triclinic unit-cell, + * only single box vector shifts (2 in x) are required. + */ + set_pbc_dd(&pbc, fr->ePBC, DOMAINDECOMP(cr) ? cr->dd->nc : nullptr, + TRUE, box); + } + + do_force_listed(wcycle, box, ir->fepvals, cr, ms, + idef, x, hist, + forceForUseWithShiftForces, forceWithVirial, + fr, &pbc, graph, enerd, nrnb, lambda, md, fcd, + DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr, + flags); + + + *cycles_pme = 0; + + /* Do long-range electrostatics and/or LJ-PME, including related short-range + * corrections. + */ + if (EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + int status = 0; + real Vlr_q = 0, Vlr_lj = 0; + + /* We reduce all virial, dV/dlambda and energy contributions, except + * for the reciprocal energies (Vlr_q, Vlr_lj) into the same struct. + */ + ewald_corr_thread_t &ewaldOutput = fr->ewc_t[0]; + clearEwaldThreadOutput(&ewaldOutput); + + if (EEL_PME_EWALD(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + /* With the Verlet scheme exclusion forces are calculated + * in the non-bonded kernel. + */ + /* The TPI molecule does not have exclusions with the rest + * of the system and no intra-molecular PME grid + * contributions will be calculated in + * gmx_pme_calc_energy. 
+ */ + if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) || + ir->ewald_geometry != eewg3D || + ir->epsilon_surface != 0) + { + int nthreads, t; + + wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION); + + if (fr->n_tpi > 0) + { + gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions"); + } + + nthreads = fr->nthread_ewc; +#pragma omp parallel for num_threads(nthreads) schedule(static) + for (t = 0; t < nthreads; t++) + { + try + { + ewald_corr_thread_t &ewc_t = fr->ewc_t[t]; + if (t > 0) + { + clearEwaldThreadOutput(&ewc_t); + } + + /* Threading is only supported with the Verlet cut-off + * scheme and then only single particle forces (no + * exclusion forces) are calculated, so we can store + * the forces in the normal, single forceWithVirial->force_ array. + */ + ewald_LRcorrection(md->homenr, cr, nthreads, t, fr, ir, + md->chargeA, md->chargeB, + md->sqrt_c6A, md->sqrt_c6B, + md->sigmaA, md->sigmaB, + md->sigma3A, md->sigma3B, + (md->nChargePerturbed != 0) || (md->nTypePerturbed != 0), + ir->cutoff_scheme != ecutsVERLET, + excl, x, box, mu_tot, + ir->ewald_geometry, + ir->epsilon_surface, + as_rvec_array(forceWithVirial->force_.data()), + ewc_t.vir_q, ewc_t.vir_lj, + &ewc_t.Vcorr_q, &ewc_t.Vcorr_lj, + lambda[efptCOUL], lambda[efptVDW], + &ewc_t.dvdl[efptCOUL], &ewc_t.dvdl[efptVDW]); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + if (nthreads > 1) + { + reduceEwaldThreadOuput(nthreads, fr->ewc_t); + } + wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); + } + + if (EEL_PME_EWALD(fr->ic->eeltype) && fr->n_tpi == 0) + { + /* This is not in a subcounter because it takes a + negligible and constant-sized amount of time */ + ewaldOutput.Vcorr_q += + ewald_charge_correction(cr, fr, lambda[efptCOUL], box, + &ewaldOutput.dvdl[efptCOUL], + ewaldOutput.vir_q); + } + + if ((EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) && + thisRankHasDuty(cr, DUTY_PME) && (pme_run_mode(fr->pmedata) == PmeRunMode::CPU)) + { + /* Do reciprocal PME for Coulomb and/or LJ. */ + assert(fr->n_tpi >= 0); + if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED)) + { + pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE; + + if (flags & GMX_FORCE_FORCES) + { + pme_flags |= GMX_PME_CALC_F; + } + if (flags & GMX_FORCE_VIRIAL) + { + pme_flags |= GMX_PME_CALC_ENER_VIR; + } + if (fr->n_tpi > 0) + { + /* We don't calculate f, but we do want the potential */ + pme_flags |= GMX_PME_CALC_POT; + } + + /* With domain decomposition we close the CPU side load + * balancing region here, because PME does global + * communication that acts as a global barrier. + */ + if (DOMAINDECOMP(cr)) + { + ddCloseBalanceRegionCpu(cr->dd); + } + + wallcycle_start(wcycle, ewcPMEMESH); + status = gmx_pme_do(fr->pmedata, + 0, md->homenr - fr->n_tpi, + x, + as_rvec_array(forceWithVirial->force_.data()), + md->chargeA, md->chargeB, + md->sqrt_c6A, md->sqrt_c6B, + md->sigmaA, md->sigmaB, + box, cr, + DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0, + DOMAINDECOMP(cr) ? 
dd_pme_maxshift_y(cr->dd) : 0, + nrnb, wcycle, + ewaldOutput.vir_q, ewaldOutput.vir_lj, + &Vlr_q, &Vlr_lj, + lambda[efptCOUL], lambda[efptVDW], + &ewaldOutput.dvdl[efptCOUL], + &ewaldOutput.dvdl[efptVDW], + pme_flags); + *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH); + if (status != 0) + { + gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status); + } + + /* We should try to do as little computation after + * this as possible, because parallel PME synchronizes + * the nodes, so we want all load imbalance of the + * rest of the force calculation to be before the PME + * call. DD load balancing is done on the whole time + * of the force call (without PME). + */ + } + if (fr->n_tpi > 0) + { + if (EVDW_PME(ir->vdwtype)) + { + + gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME"); + } + /* Determine the PME grid energy of the test molecule + * with the PME grid potential of the other charges. + */ + gmx_pme_calc_energy(fr->pmedata, fr->n_tpi, + x + md->homenr - fr->n_tpi, + md->chargeA + md->homenr - fr->n_tpi, + &Vlr_q); + } + } + } + + if (!EEL_PME(fr->ic->eeltype) && EEL_PME_EWALD(fr->ic->eeltype)) + { + Vlr_q = do_ewald(ir, x, as_rvec_array(forceWithVirial->force_.data()), + md->chargeA, md->chargeB, + box, cr, md->homenr, + ewaldOutput.vir_q, fr->ic->ewaldcoeff_q, + lambda[efptCOUL], &ewaldOutput.dvdl[efptCOUL], + fr->ewald_table); + } + + /* Note that with separate PME nodes we get the real energies later */ + // TODO it would be simpler if we just accumulated a single + // long-range virial contribution. + forceWithVirial->addVirialContribution(ewaldOutput.vir_q); + forceWithVirial->addVirialContribution(ewaldOutput.vir_lj); + enerd->dvdl_lin[efptCOUL] += ewaldOutput.dvdl[efptCOUL]; + enerd->dvdl_lin[efptVDW] += ewaldOutput.dvdl[efptVDW]; + enerd->term[F_COUL_RECIP] = Vlr_q + ewaldOutput.Vcorr_q; + enerd->term[F_LJ_RECIP] = Vlr_lj + ewaldOutput.Vcorr_lj; + + if (debug) + { + fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n", + Vlr_q, ewaldOutput.Vcorr_q, enerd->term[F_COUL_RECIP]); + pr_rvecs(debug, 0, "vir_el_recip after corr", ewaldOutput.vir_q, DIM); + pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS); + fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n", + Vlr_lj, ewaldOutput.Vcorr_lj, enerd->term[F_LJ_RECIP]); + pr_rvecs(debug, 0, "vir_lj_recip after corr", ewaldOutput.vir_lj, DIM); + } + } + else + { + /* Is there a reaction-field exclusion correction needed? + * With the Verlet scheme, exclusion forces are calculated + * in the non-bonded kernel. 
+ */ + if (ir->cutoff_scheme != ecutsVERLET && EEL_RF(fr->ic->eeltype)) + { + real dvdl_rf_excl = 0; + enerd->term[F_RF_EXCL] = + RF_excl_correction(fr, graph, md, excl, DOMAINDECOMP(cr), + x, forceForUseWithShiftForces, + fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl); + + enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl; + } + } + + if (debug) + { + print_nrnb(debug, nrnb); + } + +#if GMX_MPI + if (TAKETIME) + { + t2 = MPI_Wtime(); + MPI_Barrier(cr->mpi_comm_mygroup); + t3 = MPI_Wtime(); + fr->t_wait += t3-t2; + if (fr->timesteps == 11) + { + char buf[22]; + fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n", + cr->nodeid, gmx_step_str(fr->timesteps, buf), + 100*fr->t_wait/(fr->t_wait+fr->t_fnbf), + (fr->t_fnbf+fr->t_wait)/fr->t_fnbf); + } + fr->timesteps++; + } +#endif + + if (debug) + { + pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); + } + +} + +void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) +{ + int i, n2; + + for (i = 0; i < F_NRE; i++) + { + enerd->term[i] = 0; + enerd->foreign_term[i] = 0; + } + + + for (i = 0; i < efptNR; i++) + { + enerd->dvdl_lin[i] = 0; + enerd->dvdl_nonlin[i] = 0; + } + + n2 = ngener*ngener; + if (debug) + { + fprintf(debug, "Creating %d sized group matrix for energies\n", n2); + } + enerd->grpp.nener = n2; + enerd->foreign_grpp.nener = n2; + for (i = 0; (i < egNR); i++) + { + snew(enerd->grpp.ener[i], n2); + snew(enerd->foreign_grpp.ener[i], n2); + } + + if (n_lambda) + { + enerd->n_lambda = 1 + n_lambda; + snew(enerd->enerpart_lambda, enerd->n_lambda); + } + else + { + enerd->n_lambda = 0; + } +} + +void destroy_enerdata(gmx_enerdata_t *enerd) +{ + int i; + + for (i = 0; (i < egNR); i++) + { + sfree(enerd->grpp.ener[i]); + } + + for (i = 0; (i < egNR); i++) + { + sfree(enerd->foreign_grpp.ener[i]); + } + + if (enerd->n_lambda) + { + sfree(enerd->enerpart_lambda); + } +} + +static real sum_v(int n, const real v[]) +{ + real t; + int i; + + t = 0.0; + for (i = 0; (i < n); i++) + { + t = t + v[i]; + } + + return t; +} + +void sum_epot(gmx_grppairener_t *grpp, real *epot) +{ + int i; + + /* Accumulate energies */ + epot[F_COUL_SR] = sum_v(grpp->nener, grpp->ener[egCOULSR]); + epot[F_LJ] = sum_v(grpp->nener, grpp->ener[egLJSR]); + epot[F_LJ14] = sum_v(grpp->nener, grpp->ener[egLJ14]); + epot[F_COUL14] = sum_v(grpp->nener, grpp->ener[egCOUL14]); + +/* lattice part of LR doesnt belong to any group + * and has been added earlier + */ + epot[F_BHAM] = sum_v(grpp->nener, grpp->ener[egBHAMSR]); + + epot[F_EPOT] = 0; + for (i = 0; (i < F_EPOT); i++) + { + if (i != F_DISRESVIOL && i != F_ORIRESDEV) + { + epot[F_EPOT] += epot[i]; + } + } +} + +void sum_dhdl(gmx_enerdata_t *enerd, gmx::ArrayRef<const real> lambda, t_lambda *fepvals) +{ + int index; + + enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW]; /* include dispersion correction */ + enerd->term[F_DVDL] = 0.0; + for (int i = 0; i < efptNR; i++) + { + if (fepvals->separate_dvdl[i]) + { + /* could this be done more readably/compactly? 
*/ + switch (i) + { + case (efptMASS): + index = F_DKDL; + break; + case (efptCOUL): + index = F_DVDL_COUL; + break; + case (efptVDW): + index = F_DVDL_VDW; + break; + case (efptBONDED): + index = F_DVDL_BONDED; + break; + case (efptRESTRAINT): + index = F_DVDL_RESTRAINT; + break; + default: + index = F_DVDL; + break; + } + enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; + if (debug) + { + fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n", + efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); + } + } + else + { + enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; + if (debug) + { + fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n", + efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); + } + } + } + + if (fepvals->separate_dvdl[efptBONDED]) + { + enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR]; + } + else + { + enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR]; + } + + for (int i = 0; i < fepvals->n_lambda; i++) + { + /* note we are iterating over fepvals here! + For the current lam, dlam = 0 automatically, + so we don't need to add anything to the + enerd->enerpart_lambda[0] */ + + /* we don't need to worry about dvdl_lin contributions to dE at + current lambda, because the contributions to the current + lambda are automatically zeroed */ + + double &enerpart_lambda = enerd->enerpart_lambda[i + 1]; + + for (gmx::index j = 0; j < lambda.size(); j++) + { + /* Note that this loop is over all dhdl components, not just the separated ones */ + const double dlam = fepvals->all_lambda[j][i] - lambda[j]; + + enerpart_lambda += dlam*enerd->dvdl_lin[j]; + + /* Constraints can not be evaluated at foreign lambdas, so we add + * a linear extrapolation. This is an approximation, but usually + * quite accurate since constraints change little between lambdas. + */ + if ((j == efptBONDED && fepvals->separate_dvdl[efptBONDED]) || + (j == efptFEP && !fepvals->separate_dvdl[efptBONDED])) + { + enerpart_lambda += dlam*enerd->term[F_DVDL_CONSTR]; + } + + if (j == efptMASS && !fepvals->separate_dvdl[j]) + { + enerpart_lambda += dlam*enerd->term[F_DKDL]; + } + + if (debug) + { + fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n", + fepvals->all_lambda[j][i], efpt_names[j], + enerpart_lambda - enerd->enerpart_lambda[0], + dlam, enerd->dvdl_lin[j]); + } + } + } + + /* The constrain contribution is now included in other terms, so clear it */ + enerd->term[F_DVDL_CONSTR] = 0; +} + + +void reset_foreign_enerdata(gmx_enerdata_t *enerd) +{ + int i, j; + + /* First reset all foreign energy components. Foreign energies always called on + neighbor search steps */ + for (i = 0; (i < egNR); i++) + { + for (j = 0; (j < enerd->grpp.nener); j++) + { + enerd->foreign_grpp.ener[i][j] = 0.0; + } + } + + /* potential energy components */ + for (i = 0; (i <= F_EPOT); i++) + { + enerd->foreign_term[i] = 0.0; + } +} + +void reset_enerdata(gmx_enerdata_t *enerd) +{ + int i, j; + + /* First reset all energy components. 
*/ + for (i = 0; (i < egNR); i++) + { + for (j = 0; (j < enerd->grpp.nener); j++) + { + enerd->grpp.ener[i][j] = 0.0; + } + } + for (i = 0; i < efptNR; i++) + { + enerd->dvdl_lin[i] = 0.0; + enerd->dvdl_nonlin[i] = 0.0; + } + + /* Normal potential energy components */ + for (i = 0; (i <= F_EPOT); i++) + { + enerd->term[i] = 0.0; + } + enerd->term[F_DVDL] = 0.0; + enerd->term[F_DVDL_COUL] = 0.0; + enerd->term[F_DVDL_VDW] = 0.0; + enerd->term[F_DVDL_BONDED] = 0.0; + enerd->term[F_DVDL_RESTRAINT] = 0.0; + enerd->term[F_DKDL] = 0.0; + if (enerd->n_lambda > 0) + { + for (i = 0; i < enerd->n_lambda; i++) + { + enerd->enerpart_lambda[i] = 0.0; + } + } + /* reset foreign energy data - separate function since we also call it elsewhere */ + reset_foreign_enerdata(enerd); +} diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.cpp b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3702624867055f24915b8d10c98eda650b21eb08 --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.cpp @@ -0,0 +1,255 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! 
\internal \file + * + * \brief This file declares helper functionality for legacy option handling for mdrun + * + * \author Berk Hess <hess@kth.se> + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \author Erik Lindahl <erik@kth.se> + * \author Mark Abraham <mark.j.abraham@gmail.com> + * + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "legacymdrunoptions.h" + +#include "config.h" + +#include <cstring> + +#include "gromacs/gmxlib/network.h" +#include "gromacs/math/functions.h" +#include "gromacs/mdrun/multisim.h" +#include "gromacs/mdrunutility/handlerestart.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/utility/arraysize.h" +#include "gromacs/utility/fatalerror.h" + +namespace gmx +{ + +/*! \brief Return whether the command-line parameter that + * will trigger a multi-simulation is set */ +static bool is_multisim_option_set(int argc, const char *const argv[]) +{ + for (int i = 0; i < argc; ++i) + { + if (strcmp(argv[i], "-multidir") == 0) + { + return true; + } + } + return false; +} + +int LegacyMdrunOptions::updateFromCommandLine(int argc, char **argv, ArrayRef<const char *> desc) +{ + unsigned long PCA_Flags = PCA_CAN_SET_DEFFNM; + // With -multidir, the working directory still needs to be + // changed, so we can't check for the existence of files during + // parsing. It isn't useful to do any completion based on file + // system contents, either. + if (is_multisim_option_set(argc, argv)) + { + PCA_Flags |= PCA_DISABLE_INPUT_FILE_CHECKING; + } + + if (!parse_common_args(&argc, argv, PCA_Flags, + static_cast<int>(filenames.size()), filenames.data(), asize(pa), pa, + static_cast<int>(desc.size()), desc.data(), 0, nullptr, &oenv)) + { + return 0; + } + + // Handle the options that permit the user to either declare + // which compatible GPUs are available for use, or to select a GPU + // task assignment. Either could be in an environment variable (so + // that there is a way to customize it, when using MPI in + // heterogeneous contexts). + { + // TODO Argument parsing can't handle std::string. We should + // fix that by changing the parsing, once more of the roles of + // handling, validating and implementing defaults for user + // command-line options have been separated.
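+        // (Editor's sketch, summarizing rather than adding to the upstream
+        // logic below: the precedence implemented here is
+        //     -gpu_id X and GMX_GPU_ID=Y both set   -> gmx_fatal()
+        //     -gpu_id X only                        -> hw_opt.gpuIdsAvailable = "X"
+        //     GMX_GPU_ID=Y only                     -> hw_opt.gpuIdsAvailable = "Y"
+        // and likewise for -gputasks / GMX_GPUTASKS; finally, setting both
+        // -gpu_id and -gputasks together is also fatal.)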
+ hw_opt.gpuIdsAvailable = gpuIdsAvailable; + hw_opt.userGpuTaskAssignment = userGpuTaskAssignment; + + const char *env = getenv("GMX_GPU_ID"); + if (env != nullptr) + { + if (!hw_opt.gpuIdsAvailable.empty()) + { + gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time"); + } + hw_opt.gpuIdsAvailable = env; + } + + env = getenv("GMX_GPUTASKS"); + if (env != nullptr) + { + if (!hw_opt.userGpuTaskAssignment.empty()) + { + gmx_fatal(FARGS, "GMX_GPUTASKS and -gputasks can not be used at the same time"); + } + hw_opt.userGpuTaskAssignment = env; + } + + if (!hw_opt.gpuIdsAvailable.empty() && !hw_opt.userGpuTaskAssignment.empty()) + { + gmx_fatal(FARGS, "-gpu_id and -gputasks cannot be used at the same time"); + } + } + + hw_opt.thread_affinity = nenum(thread_aff_opt_choices); + + // now check for a multi-simulation + ArrayRef<const std::string> multidir = opt2fnsIfOptionSet("-multidir", + static_cast<int>(filenames.size()), + filenames.data()); + + if (replExParams.exchangeInterval != 0 && multidir.size() < 2) + { + gmx_fatal(FARGS, "Need at least two replicas for replica exchange (use option -multidir)"); + } + + if (replExParams.numExchanges < 0) + { + gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive"); + } + + ms = init_multisystem(MPI_COMM_WORLD, multidir); + + /* Prepare the intra-simulation communication */ + // TODO consolidate this with init_commrec, after changing the + // relative ordering of init_commrec and init_multisystem +#if GMX_MPI + if (ms != nullptr) + { + cr->nnodes = cr->nnodes / ms->nsim; + MPI_Comm_split(MPI_COMM_WORLD, ms->sim, cr->sim_nodeid, &cr->mpi_comm_mysim); + cr->mpi_comm_mygroup = cr->mpi_comm_mysim; + MPI_Comm_rank(cr->mpi_comm_mysim, &cr->sim_nodeid); + MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid); + } +#endif + + if (!opt2bSet("-cpi", + static_cast<int>(filenames.size()), filenames.data())) + { + // If we are not starting from a checkpoint we never allow files to be appended + // to, since that has caused a ton of strange behaviour and bugs in the past. + if (opt2parg_bSet("-append", asize(pa), pa)) + { + // If the user explicitly used the -append option, explain that it is not possible. + gmx_fatal(FARGS, "GROMACS can only append to files when restarting from a checkpoint."); + } + else + { + // If the user did not say anything explicit, just disable appending. 
+            bTryToAppendFiles = FALSE;
+        }
+    }
+
+    ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions;
+
+    continuationOptions.appendFilesOptionSet = opt2parg_bSet("-append", asize(pa), pa);
+
+    handleRestart(cr, ms, bTryToAppendFiles,
+                  static_cast<int>(filenames.size()),
+                  filenames.data(),
+                  &continuationOptions.appendFiles,
+                  &continuationOptions.startedFromCheckpoint);
+
+    mdrunOptions.rerun            = opt2bSet("-rerun",
+                                             static_cast<int>(filenames.size()),
+                                             filenames.data());
+    mdrunOptions.ntompOptionIsSet = opt2parg_bSet("-ntomp", asize(pa), pa);
+
+    domdecOptions.rankOrder    = static_cast<DdRankOrder>(nenum(ddrank_opt_choices));
+    domdecOptions.dlbOption    = static_cast<DlbOption>(nenum(dddlb_opt_choices));
+    domdecOptions.numCells[XX] = roundToInt(realddxyz[XX]);
+    domdecOptions.numCells[YY] = roundToInt(realddxyz[YY]);
+    domdecOptions.numCells[ZZ] = roundToInt(realddxyz[ZZ]);
+
+    /* PLUMED */
+    plumedswitch=0;
+    if (opt2bSet("-plumed", static_cast<int>(filenames.size()), filenames.data())) plumedswitch=1;
+    if(plumedswitch){
+      int real_precision=sizeof(real);
+      real energyUnits=1.0;
+      real lengthUnits=1.0;
+      real timeUnits=1.0;
+
+      if(!plumed_installed()){
+        gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable.");
+      }
+      plumedmain=plumed_create();
+      plumed_cmd(plumedmain,"setRealPrecision",&real_precision);
+      // this is not necessary for gromacs units:
+      plumed_cmd(plumedmain,"setMDEnergyUnits",&energyUnits);
+      plumed_cmd(plumedmain,"setMDLengthUnits",&lengthUnits);
+      plumed_cmd(plumedmain,"setMDTimeUnits",&timeUnits);
+      //
+      plumed_cmd(plumedmain,"setPlumedDat",ftp2fn(efDAT,static_cast<int>(filenames.size()), filenames.data()));
+      plumedswitch=1;
+    }
+    /* PLUMED HREX */
+    if(getenv("PLUMED_HREX")) plumed_hrex=1;
+    if(plumed_hrex){
+      if(!plumedswitch) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) requires -plumed");
+      if(replExParams.exchangeInterval==0) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) requires replica exchange (use -replex)");
+      if(replExParams.numExchanges!=0) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) is not compatible with -nex");
+    }
+    /* END PLUMED HREX */
+
+    /* END PLUMED */
+
+    return 1;
+}
+
+LegacyMdrunOptions::~LegacyMdrunOptions()
+{
+    if (GMX_LIB_MPI)
+    {
+        done_commrec(cr);
+    }
+    done_multisim(ms);
+}
+
+} // namespace gmx
diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..de685e90348bfff377e9fe6cf20efa9fc2d516d6
--- /dev/null
+++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed
@@ -0,0 +1,222 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file declares helper functionality for legacy option handling for mdrun
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "legacymdrunoptions.h"
+
+#include "config.h"
+
+#include <cstring>
+
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/mdrun/multisim.h"
+#include "gromacs/mdrunutility/handlerestart.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/utility/arraysize.h"
+#include "gromacs/utility/fatalerror.h"
+
+namespace gmx
+{
+
+/*! \brief Return whether the command-line parameter that
+ * will trigger a multi-simulation is set */
+static bool is_multisim_option_set(int argc, const char *const argv[])
+{
+    for (int i = 0; i < argc; ++i)
+    {
+        if (strcmp(argv[i], "-multidir") == 0)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+int LegacyMdrunOptions::updateFromCommandLine(int argc, char **argv, ArrayRef<const char *> desc)
+{
+    unsigned long PCA_Flags = PCA_CAN_SET_DEFFNM;
+    // With -multidir, the working directory still needs to be
+    // changed, so we can't check for the existence of files during
+    // parsing. It isn't useful to do any completion based on file
+    // system contents, either.
+    if (is_multisim_option_set(argc, argv))
+    {
+        PCA_Flags |= PCA_DISABLE_INPUT_FILE_CHECKING;
+    }
+
+    if (!parse_common_args(&argc, argv, PCA_Flags,
+                           static_cast<int>(filenames.size()), filenames.data(), asize(pa), pa,
+                           static_cast<int>(desc.size()), desc.data(), 0, nullptr, &oenv))
+    {
+        return 0;
+    }
+
+    // Handle the options that permit the user to either declare
+    // which compatible GPUs are available for use, or to select a GPU
+    // task assignment. Either could be in an environment variable (so
+    // that there is a way to customize it, when using MPI in
+    // heterogeneous contexts).
+    {
+        // TODO Argument parsing can't handle std::string. We should
+        // fix that by changing the parsing, once more of the roles of
+        // handling, validating and implementing defaults for user
+        // command-line options have been separated.
+ hw_opt.gpuIdsAvailable = gpuIdsAvailable; + hw_opt.userGpuTaskAssignment = userGpuTaskAssignment; + + const char *env = getenv("GMX_GPU_ID"); + if (env != nullptr) + { + if (!hw_opt.gpuIdsAvailable.empty()) + { + gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time"); + } + hw_opt.gpuIdsAvailable = env; + } + + env = getenv("GMX_GPUTASKS"); + if (env != nullptr) + { + if (!hw_opt.userGpuTaskAssignment.empty()) + { + gmx_fatal(FARGS, "GMX_GPUTASKS and -gputasks can not be used at the same time"); + } + hw_opt.userGpuTaskAssignment = env; + } + + if (!hw_opt.gpuIdsAvailable.empty() && !hw_opt.userGpuTaskAssignment.empty()) + { + gmx_fatal(FARGS, "-gpu_id and -gputasks cannot be used at the same time"); + } + } + + hw_opt.thread_affinity = nenum(thread_aff_opt_choices); + + // now check for a multi-simulation + ArrayRef<const std::string> multidir = opt2fnsIfOptionSet("-multidir", + static_cast<int>(filenames.size()), + filenames.data()); + + if (replExParams.exchangeInterval != 0 && multidir.size() < 2) + { + gmx_fatal(FARGS, "Need at least two replicas for replica exchange (use option -multidir)"); + } + + if (replExParams.numExchanges < 0) + { + gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive"); + } + + ms = init_multisystem(MPI_COMM_WORLD, multidir); + + /* Prepare the intra-simulation communication */ + // TODO consolidate this with init_commrec, after changing the + // relative ordering of init_commrec and init_multisystem +#if GMX_MPI + if (ms != nullptr) + { + cr->nnodes = cr->nnodes / ms->nsim; + MPI_Comm_split(MPI_COMM_WORLD, ms->sim, cr->sim_nodeid, &cr->mpi_comm_mysim); + cr->mpi_comm_mygroup = cr->mpi_comm_mysim; + MPI_Comm_rank(cr->mpi_comm_mysim, &cr->sim_nodeid); + MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid); + } +#endif + + if (!opt2bSet("-cpi", + static_cast<int>(filenames.size()), filenames.data())) + { + // If we are not starting from a checkpoint we never allow files to be appended + // to, since that has caused a ton of strange behaviour and bugs in the past. + if (opt2parg_bSet("-append", asize(pa), pa)) + { + // If the user explicitly used the -append option, explain that it is not possible. + gmx_fatal(FARGS, "GROMACS can only append to files when restarting from a checkpoint."); + } + else + { + // If the user did not say anything explicit, just disable appending. 
+ bTryToAppendFiles = FALSE; + } + } + + ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + + continuationOptions.appendFilesOptionSet = opt2parg_bSet("-append", asize(pa), pa); + + handleRestart(cr, ms, bTryToAppendFiles, + static_cast<int>(filenames.size()), + filenames.data(), + &continuationOptions.appendFiles, + &continuationOptions.startedFromCheckpoint); + + mdrunOptions.rerun = opt2bSet("-rerun", + static_cast<int>(filenames.size()), + filenames.data()); + mdrunOptions.ntompOptionIsSet = opt2parg_bSet("-ntomp", asize(pa), pa); + + domdecOptions.rankOrder = static_cast<DdRankOrder>(nenum(ddrank_opt_choices)); + domdecOptions.dlbOption = static_cast<DlbOption>(nenum(dddlb_opt_choices)); + domdecOptions.numCells[XX] = roundToInt(realddxyz[XX]); + domdecOptions.numCells[YY] = roundToInt(realddxyz[YY]); + domdecOptions.numCells[ZZ] = roundToInt(realddxyz[ZZ]); + + return 1; +} + +LegacyMdrunOptions::~LegacyMdrunOptions() +{ + if (GMX_LIB_MPI) + { + done_commrec(cr); + } + done_multisim(ms); +} + +} // namespace gmx diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.h b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.h new file mode 100644 index 0000000000000000000000000000000000000000..019a12439e4f9cf6031b8d82ca17f3d6426015c2 --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.h @@ -0,0 +1,300 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! 
\libinternal \file + * + * \brief This file declares helper functionality for legacy option handling for mdrun + * + * \author Berk Hess <hess@kth.se> + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \author Erik Lindahl <erik@kth.se> + * \author Mark Abraham <mark.j.abraham@gmail.com> + * + * \ingroup module_mdrun + * \inlibraryapi + */ +#ifndef GMX_MDRUN_LEGACYMDRUNOPTIONS_H +#define GMX_MDRUN_LEGACYMDRUNOPTIONS_H + +#include "gromacs/commandline/filenm.h" +#include "gromacs/commandline/pargs.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/hardware/hw_info.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdrun/logging.h" + +#include "replicaexchange.h" + +/* PLUMED */ +#include "../../../Plumed.h" +extern int plumedswitch; +extern plumed plumedmain; +/* END PLUMED */ + +/* PLUMED HREX */ +extern int plumed_hrex; +/* END PLUMED HREX */ + +struct gmx_multisim_t; + +namespace gmx +{ + +/*! \libinternal + * \brief This class provides the same command-line option + * functionality to both CLI and API sessions. + * + * This class should not exist, but is necessary now to introduce + * support for the CLI and API without duplicating code. It should be + * eliminated following the TODOs below. + * + * \todo Modules in mdrun should acquire proper option handling so + * that all of these declarations and defaults are local to the + * modules. + * + * \todo Contextual aspects, such as working directory, MPI + * environment, and environment variable handling are more properly + * the role of SimulationContext, and should be moved there */ +class LegacyMdrunOptions +{ + public: + //! Ongoing collection of mdrun options + MdrunOptions mdrunOptions; + //! Options for the domain decomposition. + DomdecOptions domdecOptions; + //! Parallelism-related user options. + gmx_hw_opt_t hw_opt; + //! Command-line override for the duration of a neighbor list with the Verlet scheme. + int nstlist_cmdline = 0; + //! Parameters for replica-exchange simulations. + ReplicaExchangeParameters replExParams; + + //! Filename options to fill from command-line argument values. 
+ std::vector<t_filenm> filenames = + {{{ efTPR, nullptr, nullptr, ffREAD }, + { efTRN, "-o", nullptr, ffWRITE }, + { efCOMPRESSED, "-x", nullptr, ffOPTWR }, + { efCPT, "-cpi", nullptr, ffOPTRD | ffALLOW_MISSING }, + { efCPT, "-cpo", nullptr, ffOPTWR }, + { efSTO, "-c", "confout", ffWRITE }, + { efEDR, "-e", "ener", ffWRITE }, + { efLOG, "-g", "md", ffWRITE }, + { efXVG, "-dhdl", "dhdl", ffOPTWR }, + { efXVG, "-field", "field", ffOPTWR }, + { efXVG, "-table", "table", ffOPTRD }, + { efXVG, "-tablep", "tablep", ffOPTRD }, + { efXVG, "-tableb", "table", ffOPTRDMULT }, + { efTRX, "-rerun", "rerun", ffOPTRD }, + { efXVG, "-tpi", "tpi", ffOPTWR }, + { efXVG, "-tpid", "tpidist", ffOPTWR }, + { efEDI, "-ei", "sam", ffOPTRD }, + { efXVG, "-eo", "edsam", ffOPTWR }, + { efXVG, "-devout", "deviatie", ffOPTWR }, + { efXVG, "-runav", "runaver", ffOPTWR }, + { efXVG, "-px", "pullx", ffOPTWR }, + { efXVG, "-pf", "pullf", ffOPTWR }, + { efXVG, "-ro", "rotation", ffOPTWR }, + { efLOG, "-ra", "rotangles", ffOPTWR }, + { efLOG, "-rs", "rotslabs", ffOPTWR }, + { efLOG, "-rt", "rottorque", ffOPTWR }, + { efMTX, "-mtx", "nm", ffOPTWR }, + { efRND, "-multidir", nullptr, ffOPTRDMULT}, + { efXVG, "-awh", "awhinit", ffOPTRD }, + { efDAT, "-plumed", "plumed", ffOPTRD }, /* PLUMED */ + { efDAT, "-membed", "membed", ffOPTRD }, + { efTOP, "-mp", "membed", ffOPTRD }, + { efNDX, "-mn", "membed", ffOPTRD }, + { efXVG, "-if", "imdforces", ffOPTWR }, + { efXVG, "-swap", "swapions", ffOPTWR }}}; + + //! Print a warning if any force is larger than this (in kJ/mol nm). + real pforce = -1; + + /*! \brief Output context for writing text files + * + * \todo Clarify initialization, ownership, and lifetime. */ + gmx_output_env_t *oenv = nullptr; + + //! Handle to file used for logging. + LogFilePtr logFileGuard = nullptr; + + /*! \brief Command line options, defaults, docs and storage for them to fill. */ + /*! 
\{ */ + rvec realddxyz = {0, 0, 0}; + const char *ddrank_opt_choices[static_cast<int>(DdRankOrder::nr)+1] = + { nullptr, "interleave", "pp_pme", "cartesian", nullptr }; + const char *dddlb_opt_choices[static_cast<int>(DlbOption::nr)+1] = + { nullptr, "auto", "no", "yes", nullptr }; + const char *thread_aff_opt_choices[threadaffNR+1] = + { nullptr, "auto", "on", "off", nullptr }; + const char *nbpu_opt_choices[5] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + const char *pme_opt_choices[5] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + const char *pme_fft_opt_choices[5] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + const char *bonded_opt_choices[5] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + gmx_bool bTryToAppendFiles = TRUE; + const char *gpuIdsAvailable = ""; + const char *userGpuTaskAssignment = ""; + + ImdOptions &imdOptions = mdrunOptions.imdOptions; + + t_pargs pa[49] = { + + { "-dd", FALSE, etRVEC, {&realddxyz}, + "Domain decomposition grid, 0 is optimize" }, + { "-ddorder", FALSE, etENUM, {ddrank_opt_choices}, + "DD rank order" }, + { "-npme", FALSE, etINT, {&domdecOptions.numPmeRanks}, + "Number of separate ranks to be used for PME, -1 is guess" }, + { "-nt", FALSE, etINT, {&hw_opt.nthreads_tot}, + "Total number of threads to start (0 is guess)" }, + { "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi}, + "Number of thread-MPI ranks to start (0 is guess)" }, + { "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp}, + "Number of OpenMP threads per MPI rank to start (0 is guess)" }, + { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme}, + "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" }, + { "-pin", FALSE, etENUM, {thread_aff_opt_choices}, + "Whether mdrun should try to set thread affinities" }, + { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset}, + "The lowest logical core number to which mdrun should pin the first thread" }, + { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride}, + "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" }, + { "-gpu_id", FALSE, etSTR, {&gpuIdsAvailable}, + "List of unique GPU device IDs available to use" }, + { "-gputasks", FALSE, etSTR, {&userGpuTaskAssignment}, + "List of GPU device IDs, mapping each PP task on each node to a device" }, + { "-ddcheck", FALSE, etBOOL, {&domdecOptions.checkBondedInteractions}, + "Check for all bonded interactions with DD" }, + { "-ddbondcomm", FALSE, etBOOL, {&domdecOptions.useBondedCommunication}, + "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" }, + { "-rdd", FALSE, etREAL, {&domdecOptions.minimumCommunicationRange}, + "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" }, + { "-rcon", FALSE, etREAL, {&domdecOptions.constraintCommunicationRange}, + "Maximum distance for P-LINCS (nm), 0 is estimate" }, + { "-dlb", FALSE, etENUM, {dddlb_opt_choices}, + "Dynamic load balancing (with DD)" }, + { "-dds", FALSE, etREAL, {&domdecOptions.dlbScaling}, + "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to " + "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." }, + { "-ddcsx", FALSE, etSTR, {&domdecOptions.cellSizeX}, + "HIDDENA string containing a vector of the relative sizes in the x " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." 
}, + { "-ddcsy", FALSE, etSTR, {&domdecOptions.cellSizeY}, + "HIDDENA string containing a vector of the relative sizes in the y " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." }, + { "-ddcsz", FALSE, etSTR, {&domdecOptions.cellSizeZ}, + "HIDDENA string containing a vector of the relative sizes in the z " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." }, + { "-gcom", FALSE, etINT, {&mdrunOptions.globalCommunicationInterval}, + "Global communication frequency" }, + { "-nb", FALSE, etENUM, {nbpu_opt_choices}, + "Calculate non-bonded interactions on" }, + { "-nstlist", FALSE, etINT, {&nstlist_cmdline}, + "Set nstlist when using a Verlet buffer tolerance (0 is guess)" }, + { "-tunepme", FALSE, etBOOL, {&mdrunOptions.tunePme}, + "Optimize PME load between PP/PME ranks or GPU/CPU (only with the Verlet cut-off scheme)" }, + { "-pme", FALSE, etENUM, {pme_opt_choices}, + "Perform PME calculations on" }, + { "-pmefft", FALSE, etENUM, {pme_fft_opt_choices}, + "Perform PME FFT calculations on" }, + { "-bonded", FALSE, etENUM, {bonded_opt_choices}, + "Perform bonded calculations on" }, + { "-v", FALSE, etBOOL, {&mdrunOptions.verbose}, + "Be loud and noisy" }, + { "-pforce", FALSE, etREAL, {&pforce}, + "Print all forces larger than this (kJ/mol nm)" }, + { "-reprod", FALSE, etBOOL, {&mdrunOptions.reproducible}, + "Try to avoid optimizations that affect binary reproducibility" }, + { "-cpt", FALSE, etREAL, {&mdrunOptions.checkpointOptions.period}, + "Checkpoint interval (minutes)" }, + { "-cpnum", FALSE, etBOOL, {&mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles}, + "Keep and number checkpoint files" }, + { "-append", FALSE, etBOOL, {&bTryToAppendFiles}, + "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" }, + { "-nsteps", FALSE, etINT64, {&mdrunOptions.numStepsCommandline}, + "Run this number of steps (-1 means infinite, -2 means use mdp option, smaller is invalid)" }, + { "-maxh", FALSE, etREAL, {&mdrunOptions.maximumHoursToRun}, + "Terminate after 0.99 times this time (hours)" }, + { "-replex", FALSE, etINT, {&replExParams.exchangeInterval}, + "Attempt replica exchange periodically with this period (steps)" }, + { "-nex", FALSE, etINT, {&replExParams.numExchanges}, + "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). -nex zero or not specified gives neighbor replica exchange." 
}, + { "-reseed", FALSE, etINT, {&replExParams.randomSeed}, + "Seed for replica exchange, -1 is generate a seed" }, + { "-hrex", FALSE, etBOOL, {&plumed_hrex}, /* PLUMED HREX */ + "Enable hamiltonian replica exchange" }, + { "-imdport", FALSE, etINT, {&imdOptions.port}, + "HIDDENIMD listening port" }, + { "-imdwait", FALSE, etBOOL, {&imdOptions.wait}, + "HIDDENPause the simulation while no IMD client is connected" }, + { "-imdterm", FALSE, etBOOL, {&imdOptions.terminatable}, + "HIDDENAllow termination of the simulation from IMD client" }, + { "-imdpull", FALSE, etBOOL, {&imdOptions.pull}, + "HIDDENAllow pulling in the simulation from IMD client" }, + { "-rerunvsite", FALSE, etBOOL, {&mdrunOptions.rerunConstructVsites}, + "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" }, + { "-confout", FALSE, etBOOL, {&mdrunOptions.writeConfout}, + "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" }, + { "-stepout", FALSE, etINT, {&mdrunOptions.verboseStepPrintInterval}, + "HIDDENFrequency of writing the remaining wall clock time for the run" }, + { "-resetstep", FALSE, etINT, {&mdrunOptions.timingOptions.resetStep}, + "HIDDENReset cycle counters after these many time steps" }, + { "-resethway", FALSE, etBOOL, {&mdrunOptions.timingOptions.resetHalfway}, + "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" } + }; + /*! \} */ + + //! Handle to communication object. + t_commrec *cr = nullptr; + //! Multi-simulation object. + gmx_multisim_t *ms = nullptr; + + //! Parses the command-line input and prepares to start mdrun. + int updateFromCommandLine(int argc, char **argv, ArrayRef<const char *> desc); + + ~LegacyMdrunOptions(); +}; + +} // end namespace gmx + +#endif diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..27a6de629a3ef033a86802560cbb175e748ee4c9 --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed @@ -0,0 +1,287 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * + * \brief This file declares helper functionality for legacy option handling for mdrun + * + * \author Berk Hess <hess@kth.se> + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \author Erik Lindahl <erik@kth.se> + * \author Mark Abraham <mark.j.abraham@gmail.com> + * + * \ingroup module_mdrun + * \inlibraryapi + */ +#ifndef GMX_MDRUN_LEGACYMDRUNOPTIONS_H +#define GMX_MDRUN_LEGACYMDRUNOPTIONS_H + +#include "gromacs/commandline/filenm.h" +#include "gromacs/commandline/pargs.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/hardware/hw_info.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdrun/logging.h" + +#include "replicaexchange.h" + +struct gmx_multisim_t; + +namespace gmx +{ + +/*! \libinternal + * \brief This class provides the same command-line option + * functionality to both CLI and API sessions. + * + * This class should not exist, but is necessary now to introduce + * support for the CLI and API without duplicating code. It should be + * eliminated following the TODOs below. + * + * \todo Modules in mdrun should acquire proper option handling so + * that all of these declarations and defaults are local to the + * modules. + * + * \todo Contextual aspects, such as working directory, MPI + * environment, and environment variable handling are more properly + * the role of SimulationContext, and should be moved there */ +class LegacyMdrunOptions +{ + public: + //! Ongoing collection of mdrun options + MdrunOptions mdrunOptions; + //! Options for the domain decomposition. + DomdecOptions domdecOptions; + //! Parallelism-related user options. + gmx_hw_opt_t hw_opt; + //! Command-line override for the duration of a neighbor list with the Verlet scheme. + int nstlist_cmdline = 0; + //! Parameters for replica-exchange simulations. + ReplicaExchangeParameters replExParams; + + //! Filename options to fill from command-line argument values. 
+ std::vector<t_filenm> filenames = + {{{ efTPR, nullptr, nullptr, ffREAD }, + { efTRN, "-o", nullptr, ffWRITE }, + { efCOMPRESSED, "-x", nullptr, ffOPTWR }, + { efCPT, "-cpi", nullptr, ffOPTRD | ffALLOW_MISSING }, + { efCPT, "-cpo", nullptr, ffOPTWR }, + { efSTO, "-c", "confout", ffWRITE }, + { efEDR, "-e", "ener", ffWRITE }, + { efLOG, "-g", "md", ffWRITE }, + { efXVG, "-dhdl", "dhdl", ffOPTWR }, + { efXVG, "-field", "field", ffOPTWR }, + { efXVG, "-table", "table", ffOPTRD }, + { efXVG, "-tablep", "tablep", ffOPTRD }, + { efXVG, "-tableb", "table", ffOPTRDMULT }, + { efTRX, "-rerun", "rerun", ffOPTRD }, + { efXVG, "-tpi", "tpi", ffOPTWR }, + { efXVG, "-tpid", "tpidist", ffOPTWR }, + { efEDI, "-ei", "sam", ffOPTRD }, + { efXVG, "-eo", "edsam", ffOPTWR }, + { efXVG, "-devout", "deviatie", ffOPTWR }, + { efXVG, "-runav", "runaver", ffOPTWR }, + { efXVG, "-px", "pullx", ffOPTWR }, + { efXVG, "-pf", "pullf", ffOPTWR }, + { efXVG, "-ro", "rotation", ffOPTWR }, + { efLOG, "-ra", "rotangles", ffOPTWR }, + { efLOG, "-rs", "rotslabs", ffOPTWR }, + { efLOG, "-rt", "rottorque", ffOPTWR }, + { efMTX, "-mtx", "nm", ffOPTWR }, + { efRND, "-multidir", nullptr, ffOPTRDMULT}, + { efXVG, "-awh", "awhinit", ffOPTRD }, + { efDAT, "-membed", "membed", ffOPTRD }, + { efTOP, "-mp", "membed", ffOPTRD }, + { efNDX, "-mn", "membed", ffOPTRD }, + { efXVG, "-if", "imdforces", ffOPTWR }, + { efXVG, "-swap", "swapions", ffOPTWR }}}; + + //! Print a warning if any force is larger than this (in kJ/mol nm). + real pforce = -1; + + /*! \brief Output context for writing text files + * + * \todo Clarify initialization, ownership, and lifetime. */ + gmx_output_env_t *oenv = nullptr; + + //! Handle to file used for logging. + LogFilePtr logFileGuard = nullptr; + + /*! \brief Command line options, defaults, docs and storage for them to fill. */ + /*! 
\{ */ + rvec realddxyz = {0, 0, 0}; + const char *ddrank_opt_choices[static_cast<int>(DdRankOrder::nr)+1] = + { nullptr, "interleave", "pp_pme", "cartesian", nullptr }; + const char *dddlb_opt_choices[static_cast<int>(DlbOption::nr)+1] = + { nullptr, "auto", "no", "yes", nullptr }; + const char *thread_aff_opt_choices[threadaffNR+1] = + { nullptr, "auto", "on", "off", nullptr }; + const char *nbpu_opt_choices[5] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + const char *pme_opt_choices[5] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + const char *pme_fft_opt_choices[5] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + const char *bonded_opt_choices[5] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + gmx_bool bTryToAppendFiles = TRUE; + const char *gpuIdsAvailable = ""; + const char *userGpuTaskAssignment = ""; + + ImdOptions &imdOptions = mdrunOptions.imdOptions; + + t_pargs pa[48] = { + + { "-dd", FALSE, etRVEC, {&realddxyz}, + "Domain decomposition grid, 0 is optimize" }, + { "-ddorder", FALSE, etENUM, {ddrank_opt_choices}, + "DD rank order" }, + { "-npme", FALSE, etINT, {&domdecOptions.numPmeRanks}, + "Number of separate ranks to be used for PME, -1 is guess" }, + { "-nt", FALSE, etINT, {&hw_opt.nthreads_tot}, + "Total number of threads to start (0 is guess)" }, + { "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi}, + "Number of thread-MPI ranks to start (0 is guess)" }, + { "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp}, + "Number of OpenMP threads per MPI rank to start (0 is guess)" }, + { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme}, + "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" }, + { "-pin", FALSE, etENUM, {thread_aff_opt_choices}, + "Whether mdrun should try to set thread affinities" }, + { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset}, + "The lowest logical core number to which mdrun should pin the first thread" }, + { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride}, + "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" }, + { "-gpu_id", FALSE, etSTR, {&gpuIdsAvailable}, + "List of unique GPU device IDs available to use" }, + { "-gputasks", FALSE, etSTR, {&userGpuTaskAssignment}, + "List of GPU device IDs, mapping each PP task on each node to a device" }, + { "-ddcheck", FALSE, etBOOL, {&domdecOptions.checkBondedInteractions}, + "Check for all bonded interactions with DD" }, + { "-ddbondcomm", FALSE, etBOOL, {&domdecOptions.useBondedCommunication}, + "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" }, + { "-rdd", FALSE, etREAL, {&domdecOptions.minimumCommunicationRange}, + "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" }, + { "-rcon", FALSE, etREAL, {&domdecOptions.constraintCommunicationRange}, + "Maximum distance for P-LINCS (nm), 0 is estimate" }, + { "-dlb", FALSE, etENUM, {dddlb_opt_choices}, + "Dynamic load balancing (with DD)" }, + { "-dds", FALSE, etREAL, {&domdecOptions.dlbScaling}, + "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to " + "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." }, + { "-ddcsx", FALSE, etSTR, {&domdecOptions.cellSizeX}, + "HIDDENA string containing a vector of the relative sizes in the x " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." 
}, + { "-ddcsy", FALSE, etSTR, {&domdecOptions.cellSizeY}, + "HIDDENA string containing a vector of the relative sizes in the y " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." }, + { "-ddcsz", FALSE, etSTR, {&domdecOptions.cellSizeZ}, + "HIDDENA string containing a vector of the relative sizes in the z " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." }, + { "-gcom", FALSE, etINT, {&mdrunOptions.globalCommunicationInterval}, + "Global communication frequency" }, + { "-nb", FALSE, etENUM, {nbpu_opt_choices}, + "Calculate non-bonded interactions on" }, + { "-nstlist", FALSE, etINT, {&nstlist_cmdline}, + "Set nstlist when using a Verlet buffer tolerance (0 is guess)" }, + { "-tunepme", FALSE, etBOOL, {&mdrunOptions.tunePme}, + "Optimize PME load between PP/PME ranks or GPU/CPU (only with the Verlet cut-off scheme)" }, + { "-pme", FALSE, etENUM, {pme_opt_choices}, + "Perform PME calculations on" }, + { "-pmefft", FALSE, etENUM, {pme_fft_opt_choices}, + "Perform PME FFT calculations on" }, + { "-bonded", FALSE, etENUM, {bonded_opt_choices}, + "Perform bonded calculations on" }, + { "-v", FALSE, etBOOL, {&mdrunOptions.verbose}, + "Be loud and noisy" }, + { "-pforce", FALSE, etREAL, {&pforce}, + "Print all forces larger than this (kJ/mol nm)" }, + { "-reprod", FALSE, etBOOL, {&mdrunOptions.reproducible}, + "Try to avoid optimizations that affect binary reproducibility" }, + { "-cpt", FALSE, etREAL, {&mdrunOptions.checkpointOptions.period}, + "Checkpoint interval (minutes)" }, + { "-cpnum", FALSE, etBOOL, {&mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles}, + "Keep and number checkpoint files" }, + { "-append", FALSE, etBOOL, {&bTryToAppendFiles}, + "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" }, + { "-nsteps", FALSE, etINT64, {&mdrunOptions.numStepsCommandline}, + "Run this number of steps (-1 means infinite, -2 means use mdp option, smaller is invalid)" }, + { "-maxh", FALSE, etREAL, {&mdrunOptions.maximumHoursToRun}, + "Terminate after 0.99 times this time (hours)" }, + { "-replex", FALSE, etINT, {&replExParams.exchangeInterval}, + "Attempt replica exchange periodically with this period (steps)" }, + { "-nex", FALSE, etINT, {&replExParams.numExchanges}, + "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). -nex zero or not specified gives neighbor replica exchange." 
}, + { "-reseed", FALSE, etINT, {&replExParams.randomSeed}, + "Seed for replica exchange, -1 is generate a seed" }, + { "-imdport", FALSE, etINT, {&imdOptions.port}, + "HIDDENIMD listening port" }, + { "-imdwait", FALSE, etBOOL, {&imdOptions.wait}, + "HIDDENPause the simulation while no IMD client is connected" }, + { "-imdterm", FALSE, etBOOL, {&imdOptions.terminatable}, + "HIDDENAllow termination of the simulation from IMD client" }, + { "-imdpull", FALSE, etBOOL, {&imdOptions.pull}, + "HIDDENAllow pulling in the simulation from IMD client" }, + { "-rerunvsite", FALSE, etBOOL, {&mdrunOptions.rerunConstructVsites}, + "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" }, + { "-confout", FALSE, etBOOL, {&mdrunOptions.writeConfout}, + "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" }, + { "-stepout", FALSE, etINT, {&mdrunOptions.verboseStepPrintInterval}, + "HIDDENFrequency of writing the remaining wall clock time for the run" }, + { "-resetstep", FALSE, etINT, {&mdrunOptions.timingOptions.resetStep}, + "HIDDENReset cycle counters after these many time steps" }, + { "-resethway", FALSE, etBOOL, {&mdrunOptions.timingOptions.resetHalfway}, + "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" } + }; + /*! \} */ + + //! Handle to communication object. + t_commrec *cr = nullptr; + //! Multi-simulation object. + gmx_multisim_t *ms = nullptr; + + //! Parses the command-line input and prepares to start mdrun. + int updateFromCommandLine(int argc, char **argv, ArrayRef<const char *> desc); + + ~LegacyMdrunOptions(); +}; + +} // end namespace gmx + +#endif diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/md.cpp b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/md.cpp new file mode 100644 index 0000000000000000000000000000000000000000..887526a8d1dff7bdb05905ed784c1e500086501c --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/md.cpp @@ -0,0 +1,1728 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. 
We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the integrator for normal molecular dynamics simulations
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+
+#include <algorithm>
+#include <memory>
+
+#include "gromacs/awh/awh.h"
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/compat/make_unique.h"
+#include "gromacs/domdec/collect.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_network.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/partition.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-load-balancing.h"
+#include "gromacs/fileio/trxio.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdlib/checkpointhandler.h"
+#include "gromacs/mdlib/compute_io.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/ebin.h"
+#include "gromacs/mdlib/expanded.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdebin.h"
+#include "gromacs/mdlib/mdoutf.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/mdsetup.h"
+#include "gromacs/mdlib/membed.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/resethandler.h"
+#include "gromacs/mdlib/shellfc.h"
+#include "gromacs/mdlib/sighandler.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/simulationsignal.h"
+#include "gromacs/mdlib/stophandler.h"
+#include "gromacs/mdlib/tgroup.h"
+#include "gromacs/mdlib/trajectory_writing.h"
+#include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vcm.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdtypes/awh-history.h"
+#include "gromacs/mdtypes/awh-params.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/df_history.h"
+#include "gromacs/mdtypes/energyhistory.h"
+#include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/forcerec.h"
+#include "gromacs/mdtypes/group.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/mdtypes/observableshistory.h"
+#include "gromacs/mdtypes/pullhistory.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/output.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/atoms.h"
+#include "gromacs/topology/idef.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/logger.h"
+#include "gromacs/utility/real.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "integrator.h"
+#include "replicaexchange.h"
+
+#if GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int plumedswitch;
+extern plumed plumedmain;
+/* END PLUMED */
+
+/* PLUMED HREX */
+extern int plumed_hrex;
+/* END PLUMED HREX */
+
+using gmx::SimulationSignaller;
+
+void gmx::Integrator::do_md()
+{
+    // TODO Historically, the EM and MD "integrators" used different
+    // names for the t_inputrec *parameter, but these must have the
+    // same name, now that it's a member of a struct. We use this ir
+    // alias to avoid a large ripple of nearly useless changes.
+    // t_inputrec is being replaced by IMdpOptionsProvider, so this
+    // will go away eventually.
+    t_inputrec             *ir   = inputrec;
+    gmx_mdoutf             *outf = nullptr;
+    int64_t                 step, step_rel;
+    double                  t, t0, lam0[efptNR];
+    gmx_bool                bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
+    gmx_bool                bNS, bNStList, bSimAnn, bStopCM,
+                            bFirstStep, bInitStep, bLastStep = FALSE;
+    gmx_bool                bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
+    gmx_bool                do_ene, do_log, do_verbose;
+    gmx_bool                bMasterState;
+    int                     force_flags, cglo_flags;
+    tensor                  force_vir, shake_vir, total_vir, tmp_vir, pres;
+    int                     i, m;
+    rvec                    mu_tot;
+    t_vcm                  *vcm;
+    matrix                  parrinellorahmanMu, M;
+    gmx_repl_ex_t           repl_ex = nullptr;
+    gmx_localtop_t         *top;
+    t_mdebin               *mdebin = nullptr;
+    gmx_enerdata_t         *enerd;
+    PaddedVector<gmx::RVec> f {};
+    gmx_global_stat_t       gstat;
+    gmx_update_t           *upd   = nullptr;
+    t_graph                *graph = nullptr;
+    gmx_groups_t           *groups;
+    gmx_ekindata_t         *ekind;
+    gmx_shellfc_t          *shellfc;
+    gmx_bool                bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
+    gmx_bool                bTemp, bPres, bTrotter;
+    real                    dvdl_constr;
+    rvec                   *cbuf        = nullptr;
+    int                     cbuf_nalloc = 0;
+    matrix                  lastbox;
+    int                     lamnew = 0;
+    /* for FEP */
+    int                     nstfep = 0;
+    double                  cycles;
+    real                    saved_conserved_quantity = 0;
+    real                    last_ekin                = 0;
+    t_extmass               MassQ;
+    int                   **trotter_seq;
+    char                    sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
+
+    /* PME load balancing data for GPU kernels */
+    pme_load_balancing_t *pme_loadbal      = nullptr;
+    gmx_bool              bPMETune         = FALSE;
+    gmx_bool              bPMETunePrinting = FALSE;
+
+    /* Interactive MD */
+    gmx_bool bIMDstep = FALSE;
+
+    /* PLUMED */
+    int plumedNeedsEnergy=0;
+    int plumedWantsToStop=0;
+    matrix plumed_vir;
+    /* END PLUMED */
+
+    /* Domain decomposition could incorrectly miss a bonded
+       interaction, but checking for that requires a global
+       communication stage, which does not otherwise happen in DD
+       code. So we do that alongside the first global energy reduction
+       after a new DD is made. These variables handle whether the
+       check happens, and the result it returns. */
+    bool shouldCheckNumberOfBondedInteractions = false;
+    int  totalNumberOfBondedInteractions       = -1;
+
+    SimulationSignals signals;
+    // Most global communication stages don't propagate mdrun
+    // signals, and will use this object to achieve that.
+ SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false); + + if (!mdrunOptions.writeConfout) + { + // This is on by default, and the main known use case for + // turning it off is for convenience in benchmarking, which is + // something that should not show up in the general user + // interface. + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -noconfout functionality is deprecated, and may be removed in a future version."); + } + + /* md-vv uses averaged full step velocities for T-control + md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control) + md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */ + bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir))); + + const bool bRerunMD = false; + int nstglobalcomm = mdrunOptions.globalCommunicationInterval; + + nstglobalcomm = check_nstglobalcomm(mdlog, nstglobalcomm, ir, cr); + bGStatEveryStep = (nstglobalcomm == 1); + + groups = &top_global->groups; + + std::unique_ptr<EssentialDynamics> ed = nullptr; + if (opt2bSet("-ei", nfile, fnm) || observablesHistory->edsamHistory != nullptr) + { + /* Initialize essential dynamics sampling */ + ed = init_edsam(mdlog, + opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), + top_global, + ir, cr, constr, + state_global, observablesHistory, + oenv, mdrunOptions.continuationOptions.appendFiles); + } + + /* Initial values */ + init_md(fplog, cr, outputProvider, ir, oenv, mdrunOptions, + &t, &t0, state_global, lam0, + nrnb, top_global, &upd, deform, + nfile, fnm, &outf, &mdebin, + force_vir, shake_vir, total_vir, pres, mu_tot, &bSimAnn, &vcm, wcycle); + + /* Energy terms and groups */ + snew(enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + enerd); + + /* Kinetic energy data */ + snew(ekind, 1); + init_ekindata(fplog, top_global, &(ir->opts), ekind); + /* Copy the cos acceleration to the groups struct */ + ekind->cosacc.cos_accel = ir->cos_accel; + + gstat = global_stat_init(ir); + + /* Check for polarizable models and flexible constraints */ + shellfc = init_shell_flexcon(fplog, + top_global, constr ? constr->numFlexibleConstraints() : 0, + ir->nstcalcenergy, DOMAINDECOMP(cr)); + + { + double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1); + if ((io > 2000) && MASTER(cr)) + { + fprintf(stderr, + "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", + io); + } + } + + /* Set up interactive MD (IMD) */ + init_IMD(ir, cr, ms, top_global, fplog, ir->nstcalcenergy, + MASTER(cr) ? state_global->x.rvec_array() : nullptr, + nfile, fnm, oenv, mdrunOptions); + + // Local state only becomes valid now. 
+ std::unique_ptr<t_state> stateInstance; + t_state * state; + + if (DOMAINDECOMP(cr)) + { + top = dd_init_local_top(top_global); + + stateInstance = compat::make_unique<t_state>(); + state = stateInstance.get(); + dd_init_local_state(cr->dd, state_global, state); + + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, nullptr, FALSE); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + } + else + { + state_change_natoms(state_global, state_global->natoms); + f.resizeWithPadding(state_global->natoms); + /* Copy the pointer to the global state */ + state = state_global; + + snew(top, 1); + mdAlgorithmsSetupAtomData(cr, ir, top_global, top, fr, + &graph, mdAtoms, constr, vsite, shellfc); + + update_realloc(upd, state->natoms); + } + + auto mdatoms = mdAtoms->mdatoms(); + + // NOTE: The global state is no longer used at this point. + // But state_global is still used as temporary storage space for writing + // the global state to file and potentially for replica exchange. + // (Global topology should persist.) + + update_mdatoms(mdatoms, state->lambda[efptMASS]); + + const ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + bool startingFromCheckpoint = continuationOptions.startedFromCheckpoint; + + if (ir->bExpanded) + { + /* Check nstexpanded here, because the grompp check was broken */ + if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0) + { + gmx_fatal(FARGS, "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy"); + } + init_expanded_ensemble(startingFromCheckpoint, ir, state->dfhist); + } + + if (MASTER(cr)) + { + if (startingFromCheckpoint) + { + /* Update mdebin with energy history if appending to output files */ + if (continuationOptions.appendFiles) + { + /* If no history is available (because a checkpoint is from before + * it was written) make a new one later, otherwise restore it. + */ + if (observablesHistory->energyHistory) + { + restore_energyhistory_from_state(mdebin, observablesHistory->energyHistory.get()); + } + } + else if (observablesHistory->energyHistory) + { + /* We might have read an energy history from checkpoint. + * As we are not appending, we want to restart the statistics. + * Free the allocated memory and reset the counts. + */ + observablesHistory->energyHistory = {}; + /* We might have read a pull history from checkpoint. + * We will still want to keep the statistics, so that the files + * can be joined and still be meaningful. + * This means that observablesHistory->pullHistory + * should not be reset. 
+ */ + } + } + if (!observablesHistory->energyHistory) + { + observablesHistory->energyHistory = compat::make_unique<energyhistory_t>(); + } + if (!observablesHistory->pullHistory) + { + observablesHistory->pullHistory = compat::make_unique<PullHistory>(); + } + /* Set the initial energy history in state by updating once */ + update_energyhistory(observablesHistory->energyHistory.get(), mdebin); + } + + preparePrevStepPullCom(ir, mdatoms, state, state_global, cr, startingFromCheckpoint); + + // TODO: Remove this by converting AWH into a ForceProvider + auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms, startingFromCheckpoint, + shellfc != nullptr, + opt2fn("-awh", nfile, fnm), ir->pull_work); + + const bool useReplicaExchange = (replExParams.exchangeInterval > 0); + if (useReplicaExchange && MASTER(cr)) + { + repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, + replExParams); + } + /* PME tuning is only supported in the Verlet scheme, with PME for + * Coulomb. It is not supported with only LJ PME. */ + bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && + !mdrunOptions.reproducible && ir->cutoff_scheme != ecutsGROUP); + if (bPMETune) + { + pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, + *fr->ic, *fr->nbv->listParams, fr->pmedata, use_GPU(fr->nbv), + &bPMETunePrinting); + } + + if (!ir->bContinuation) + { + if (state->flags & (1 << estV)) + { + auto v = makeArrayRef(state->v); + /* Set the velocities of vsites, shells and frozen atoms to zero */ + for (i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->ptype[i] == eptVSite || + mdatoms->ptype[i] == eptShell) + { + clear_rvec(v[i]); + } + else if (mdatoms->cFREEZE) + { + for (m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m]) + { + v[i][m] = 0; + } + } + } + } + } + + if (constr) + { + /* Constrain the initial coordinates and velocities */ + do_constrain_first(fplog, constr, ir, mdatoms, state); + } + if (vsite) + { + /* Construct the virtual sites for the initial configuration */ + construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state->box); + } + } + + if (ir->efep != efepNO) + { + /* Set free energy calculation frequency as the greatest common + * denominator of nstdhdl and repl_ex_nst. */ + nstfep = ir->fepvals->nstdhdl; + if (ir->bExpanded) + { + nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep); + } + if (useReplicaExchange) + { + nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep); + } + } + + /* Be REALLY careful about what flags you set here. You CANNOT assume + * this is the first step, since we might be restarting from a checkpoint, + * and in that case we should not do any modifications to the state. + */ + bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation); + + if (continuationOptions.haveReadEkin) + { + restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate); + } + + cglo_flags = (CGLO_INITIALIZATION | CGLO_TEMPERATURE | CGLO_GSTAT + | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0) + | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) + | (continuationOptions.haveReadEkin ? CGLO_READEKIN : 0)); + + bSumEkinhOld = FALSE; + /* To minimize communication, compute_globals computes the COM velocity + * and the kinetic energy for the velocities without COM motion removed. + * Thus to get the kinetic energy without the COM contribution, we need + * to call compute_globals twice. 
+ */
+ for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
+ {
+ int cglo_flags_iteration = cglo_flags;
+ if (bStopCM && cgloIteration == 0)
+ {
+ cglo_flags_iteration |= CGLO_STOPCM;
+ cglo_flags_iteration &= ~CGLO_TEMPERATURE;
+ }
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+ nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, &nullSignaller, state->box,
+ &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags_iteration
+ | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0));
+ }
+ checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
+ top_global, top, state,
+ &shouldCheckNumberOfBondedInteractions);
+ if (ir->eI == eiVVAK)
+ {
+ /* a second call to get the half step temperature initialized as well */
+ /* we do the same call as above, but turn the pressure off -- internally to
+ compute_globals, this is recognized as a velocity verlet half-step
+ kinetic energy calculation. This minimizes the number of extra variables, but
+ perhaps loses some logic? */
+
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+ nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, &nullSignaller, state->box,
+ nullptr, &bSumEkinhOld,
+ cglo_flags & ~CGLO_PRESSURE);
+ }
+
+ /* Calculate the initial half step temperature, and save the ekinh_old */
+ if (!continuationOptions.startedFromCheckpoint)
+ {
+ for (i = 0; (i < ir->opts.ngtc); i++)
+ {
+ copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+ }
+ }
+
+ /* need to make an initialization call to get the Trotter variables set, as well as other constants for non-trotter
+ temperature control */
+ trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
+
+ if (MASTER(cr))
+ {
+ if (!ir->bContinuation)
+ {
+ if (constr && ir->eConstrAlg == econtLINCS)
+ {
+ fprintf(fplog,
+ "RMS relative constraint deviation after constraining: %.2e\n",
+ constr->rmsd());
+ }
+ if (EI_STATE_VELOCITY(ir->eI))
+ {
+ real temp = enerd->term[F_TEMP];
+ if (ir->eI != eiVV)
+ {
+ /* Result of Ekin averaged over velocities of -half
+ * and +half step, while we only have -half step here.
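+ * Hence the factor of 2 applied to the reported temperature just below.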
+ */
+ temp *= 2;
+ }
+ fprintf(fplog, "Initial temperature: %g K\n", temp);
+ }
+ }
+
+ char tbuf[20];
+ fprintf(stderr, "starting mdrun '%s'\n",
+ *(top_global->name));
+ if (ir->nsteps >= 0)
+ {
+ sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
+ }
+ else
+ {
+ sprintf(tbuf, "%s", "infinite");
+ }
+ if (ir->init_step > 0)
+ {
+ fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+ gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
+ gmx_step_str(ir->init_step, sbuf2),
+ ir->init_step*ir->delta_t);
+ }
+ else
+ {
+ fprintf(stderr, "%s steps, %s ps.\n",
+ gmx_step_str(ir->nsteps, sbuf), tbuf);
+ }
+ fprintf(fplog, "\n");
+ }
+
+ /* PLUMED */
+ if(plumedswitch){
+ /* detect plumed API version */
+ int pversion=0;
+ plumed_cmd(plumedmain,"getApiVersion",&pversion);
+ /* setting kbT is only implemented with api>1 */
+ real kbT=ir->opts.ref_t[0]*BOLTZ;
+ if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT);
+ if(pversion>2){
+ int res=1;
+ if( (continuationOptions.startedFromCheckpoint) ) plumed_cmd(plumedmain,"setRestart",&res);
+ }
+
+ if(ms && ms->nsim>1) {
+ if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&ms->mpi_comm_masters);
+ if(PAR(cr)){
+ if(DOMAINDECOMP(cr)) {
+ plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all);
+ }else{
+ plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim);
+ }
+ }
+ plumed_cmd(plumedmain,"GREX init",NULL);
+ }
+ if(PAR(cr)){
+ if(DOMAINDECOMP(cr)) {
+ plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
+ }
+ }
+ plumed_cmd(plumedmain,"setNatoms",&top_global->natoms);
+ plumed_cmd(plumedmain,"setMDEngine","gromacs");
+ plumed_cmd(plumedmain,"setLog",fplog);
+ real real_delta_t=ir->delta_t;
+ plumed_cmd(plumedmain,"setTimestep",&real_delta_t);
+ plumed_cmd(plumedmain,"init",NULL);
+
+ if(PAR(cr)){
+ if(DOMAINDECOMP(cr)) {
+ int nat_home = dd_numHomeAtoms(*cr->dd);
+ plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
+ plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
+ }
+ }
+ }
+ /* END PLUMED */
+
+ walltime_accounting_start_time(walltime_accounting);
+ wallcycle_start(wcycle, ewcRUN);
+ print_start(fplog, cr, walltime_accounting, "mdrun");
+
+#if GMX_FAHCORE
+ /* safest point to do file checkpointing is here. More general point would be immediately before integrator call */
+ int chkpt_ret = fcCheckPointParallel( cr->nodeid,
+ NULL, 0);
+ if (chkpt_ret == 0)
+ {
+ gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
+ }
+#endif
+
+ /***********************************************************
+ *
+ * Loop over MD steps
+ *
+ ************************************************************/
+
+ bFirstStep = TRUE;
+ /* Skip the first Nose-Hoover integration when we get the state from tpx */
+ bInitStep = !startingFromCheckpoint || EI_VV(ir->eI);
+ bSumEkinhOld = FALSE;
+ bExchanged = FALSE;
+ bNeedRepartition = FALSE;
+
+ bool simulationsShareState = false;
+ int nstSignalComm = nstglobalcomm;
+ {
+ // TODO This implementation of ensemble orientation restraints is nasty because
+ // a user can't just do multi-sim with single-sim orientation restraints.
+ bool usingEnsembleRestraints = (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0)); + bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr)); + + // Replica exchange, ensemble restraints and AWH need all + // simulations to remain synchronized, so they need + // checkpoints and stop conditions to act on the same step, so + // the propagation of such signals must take place between + // simulations, not just within simulations. + // TODO: Make algorithm initializers set these flags. + simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim || (plumedswitch && ms); // PLUMED hack, if we have multiple sim and plumed we usually want them to be in sync + + if (simulationsShareState) + { + // Inter-simulation signal communication does not need to happen + // often, so we use a minimum of 200 steps to reduce overhead. + const int c_minimumInterSimulationSignallingInterval = 200; + nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1)/nstglobalcomm)*nstglobalcomm; + } + } + + auto stopHandler = stopHandlerBuilder->getStopHandlerMD( + compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState, + MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm, + mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting); + + auto checkpointHandler = compat::make_unique<CheckpointHandler>( + compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), + simulationsShareState, ir->nstlist == 0, MASTER(cr), + mdrunOptions.writeConfout, mdrunOptions.checkpointOptions.period); + + const bool resetCountersIsLocal = true; + auto resetHandler = compat::make_unique<ResetHandler>( + compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]), !resetCountersIsLocal, + ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway, + mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting); + + DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion = (DOMAINDECOMP(cr) ? DdOpenBalanceRegionBeforeForceComputation::yes : DdOpenBalanceRegionBeforeForceComputation::no); + DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion = (DOMAINDECOMP(cr) ? DdCloseBalanceRegionAfterForceComputation::yes : DdCloseBalanceRegionAfterForceComputation::no); + + step = ir->init_step; + step_rel = 0; + + // TODO extract this to new multi-simulation module + if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange) + { + if (!multisim_int_all_are_equal(ms, ir->nsteps)) + { + GMX_LOG(mdlog.warning).appendText( + "Note: The number of steps is not consistent across multi simulations,\n" + "but we are proceeding anyway!"); + } + if (!multisim_int_all_are_equal(ms, ir->init_step)) + { + GMX_LOG(mdlog.warning).appendText( + "Note: The initial step is not consistent across multi simulations,\n" + "but we are proceeding anyway!"); + } + } + + /* and stop now if we should */ + bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps)); + while (!bLastStep) + { + + /* Determine if this is a neighbor search step */ + bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); + + if (bPMETune && bNStList) + { + /* PME grid + cut-off optimization with GPUs or PME nodes */ + pme_loadbal_do(pme_loadbal, cr, + (mdrunOptions.verbose && MASTER(cr)) ? 
stderr : nullptr, + fplog, mdlog, + *ir, fr, *state, + wcycle, + step, step_rel, + &bPMETunePrinting); + } + + wallcycle_start(wcycle, ewcSTEP); + + bLastStep = (step_rel == ir->nsteps); + t = t0 + step*ir->delta_t; + + // TODO Refactor this, so that nstfep does not need a default value of zero + if (ir->efep != efepNO || ir->bSimTemp) + { + /* find and set the current lambdas */ + setCurrentLambdasLocal(step, ir->fepvals, lam0, state); + + bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl); + bDoFEP = ((ir->efep != efepNO) && do_per_step(step, nstfep)); + bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) + && (ir->bExpanded) && (step > 0) && (!startingFromCheckpoint)); + } + + bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep && + do_per_step(step, replExParams.exchangeInterval)); + + if (bSimAnn) + { + update_annealing_target_temp(ir, t, upd); + } + + /* Stop Center of Mass motion */ + bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm)); + + /* Determine whether or not to do Neighbour Searching */ + bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition); + + bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS); + + /* do_log triggers energy and virial calculation. Because this leads + * to different code paths, forces can be different. Thus for exact + * continuation we should avoid extra log output. + * Note that the || bLastStep can result in non-exact continuation + * beyond the last step. But we don't consider that to be an issue. + */ + do_log = do_per_step(step, ir->nstlog) || (bFirstStep && !startingFromCheckpoint) || bLastStep; + do_verbose = mdrunOptions.verbose && + (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep); + + if (bNS && !(bFirstStep && ir->bContinuation)) + { + bMasterState = FALSE; + /* Correct the new box if it is too skewed */ + if (inputrecDynamicBox(ir)) + { + if (correct_box(fplog, step, state->box, graph)) + { + bMasterState = TRUE; + } + } + if (DOMAINDECOMP(cr) && bMasterState) + { + dd_collect_state(cr->dd, state, state_global); + } + + if (DOMAINDECOMP(cr)) + { + /* Repartition the domain decomposition */ + dd_partition_system(fplog, mdlog, step, cr, + bMasterState, nstglobalcomm, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, wcycle, + do_verbose && !bPMETunePrinting); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + + /* PLUMED */ + if(plumedswitch){ + int nat_home = dd_numHomeAtoms(*cr->dd); + plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home); + plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data()); + } + /* END PLUMED */ + } + } + + if (MASTER(cr) && do_log) + { + print_ebin_header(fplog, step, t); /* can we improve the information printed here? */ + } + + if (ir->efep != efepNO) + { + update_mdatoms(mdatoms, state->lambda[efptMASS]); + } + + if (bExchanged) + { + + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . 
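+ * The compute_globals() call below recomputes the kinetic energy and
+ * temperature from the freshly exchanged velocities, so that any
+ * T-coupling acts on consistent data.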
*/ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS); + checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + } + clear_mat(force_vir); + + /* PLUMED HREX */ + gmx_bool bHREX = bDoReplEx && plumed_hrex; + + if (plumedswitch && bHREX) { + gmx_enerdata_t *hrex_enerd; + snew(hrex_enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr,ir->fepvals->n_lambda,hrex_enerd); + int repl = -1; + int nrepl = -1; + if (MASTER(cr)){ + repl = replica_exchange_get_repl(repl_ex); + nrepl = replica_exchange_get_nrepl(repl_ex); + } + + if (DOMAINDECOMP(cr)) { + dd_collect_state(cr->dd,state,state_global); + } else { + copy_state_serial(state, state_global); + } + + if(MASTER(cr)){ + if(repl%2==step/replExParams.exchangeInterval%2){ + if(repl-1>=0) exchange_state(ms,repl-1,state_global); + }else{ + if(repl+1<nrepl) exchange_state(ms,repl+1,state_global); + } + } + if (!DOMAINDECOMP(cr)) { + copy_state_serial(state_global, state); + } + if(PAR(cr)){ + if (DOMAINDECOMP(cr)) { + dd_partition_system(fplog,mdlog,step,cr,TRUE,1, + state_global,top_global,ir, + state,&f,mdAtoms,top,fr,vsite,constr, + nrnb,wcycle,FALSE); + } + } + do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, + step, nrnb, wcycle, top, groups, + state->box, state->x.arrayRefWithPadding(), &state->hist, + f.arrayRefWithPadding(), force_vir, mdatoms, hrex_enerd, fcd, + state->lambda, graph, + fr, ppForceWorkload, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr, + GMX_FORCE_STATECHANGED | + GMX_FORCE_DYNAMICBOX | + GMX_FORCE_ALLFORCES | + GMX_FORCE_VIRIAL | + GMX_FORCE_ENERGY | + GMX_FORCE_DHDL | + GMX_FORCE_NS, + ddOpenBalanceRegion, ddCloseBalanceRegion); + + plumed_cmd(plumedmain,"GREX cacheLocalUSwap",&hrex_enerd->term[F_EPOT]); + sfree(hrex_enerd); + + /* exchange back */ + if (DOMAINDECOMP(cr)) { + dd_collect_state(cr->dd,state,state_global); + } else { + copy_state_serial(state, state_global); + } + + if(MASTER(cr)){ + if(repl%2==step/replExParams.exchangeInterval%2){ + if(repl-1>=0) exchange_state(ms,repl-1,state_global); + }else{ + if(repl+1<nrepl) exchange_state(ms,repl+1,state_global); + } + } + + if (!DOMAINDECOMP(cr)) { + copy_state_serial(state_global, state); + } + if(PAR(cr)){ + if (DOMAINDECOMP(cr)) { + dd_partition_system(fplog,mdlog,step,cr,TRUE,1, + state_global,top_global,ir, + state,&f,mdAtoms,top,fr,vsite,constr, + nrnb,wcycle,FALSE); + int nat_home = dd_numHomeAtoms(*cr->dd); + plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home); + plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data()); + } + } + } + /* END PLUMED HREX */ + + checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep); + + /* Determine the energy and pressure: + * at nstcalcenergy steps and at energy output steps (set below). 
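+ * For velocity Verlet after the first step, the virial is also needed
+ * around pressure-coupling steps; see the nstpcouple checks below.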
+ */
+ if (EI_VV(ir->eI) && (!bInitStep))
+ {
+ bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
+ bCalcVir = bCalcEnerStep ||
+ (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+ }
+ else
+ {
+ bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
+ bCalcVir = bCalcEnerStep ||
+ (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+ }
+ bCalcEner = bCalcEnerStep;
+
+ do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
+
+ if (do_ene || do_log || bDoReplEx)
+ {
+ bCalcVir = TRUE;
+ bCalcEner = TRUE;
+ }
+
+ /* Do we need global communication ? */
+ bGStat = (bCalcVir || bCalcEner || bStopCM ||
+ do_per_step(step, nstglobalcomm) ||
+ (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm)));
+
+ force_flags = (GMX_FORCE_STATECHANGED |
+ ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0) |
+ GMX_FORCE_ALLFORCES |
+ (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+ (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+ (bDoFEP ? GMX_FORCE_DHDL : 0)
+ );
+
+ if (shellfc)
+ {
+ /* Now is the time to relax the shells */
+ relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose,
+ enforcedRotation, step,
+ ir, bNS, force_flags, top,
+ constr, enerd, fcd,
+ state, f.arrayRefWithPadding(), force_vir, mdatoms,
+ nrnb, wcycle, graph, groups,
+ shellfc, fr, ppForceWorkload, t, mu_tot,
+ vsite,
+ ddOpenBalanceRegion, ddCloseBalanceRegion);
+ }
+ else
+ {
+ /* The AWH history needs to be saved _before_ doing force calculations where the AWH bias is updated
+ (or the AWH update will be performed twice for one step when continuing). It would be best to
+ call this update function from do_md_trajectory_writing but that would occur after do_force.
+ One would have to divide the update_awh function into one function applying the AWH force
+ and one doing the AWH bias update. The update AWH bias function could then be called after
+ do_md_trajectory_writing (then containing update_awh_history).
+ The checkpointing will probably be moved to the start of the MD loop in the future,
+ which will get rid of this issue. */
+ if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
+ {
+ awh->updateHistory(state_global->awhHistory.get());
+ }
+
+ /* The coordinates (x) are shifted (to get whole molecules)
+ * in do_force.
+ * This is parallelized as well, and does communication too.
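+ * (When PLUMED is enabled, the block below also hands positions, masses
+ * and charges to the plugin, which can then add its bias forces to f and
+ * its contribution to the virial.)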
+ * Check comments in sim_util.c + */ + + /* PLUMED */ + plumedNeedsEnergy=0; + if(plumedswitch){ + int pversion=0; + plumed_cmd(plumedmain,"getApiVersion",&pversion); + long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep); + plumed_cmd(plumedmain,"setPositions",&state->x[0][0]); + plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[0]); + plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[0]); + plumed_cmd(plumedmain,"setBox",&state->box[0][0]); + plumed_cmd(plumedmain,"prepareCalc",NULL); + plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop); + int checkp=0; if(checkpointHandler->isCheckpointingStep()) checkp=1; + if(pversion>3) plumed_cmd(plumedmain,"doCheckPoint",&checkp); + plumed_cmd(plumedmain,"setForces",&f[0][0]); + plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); + if(plumedNeedsEnergy) force_flags |= GMX_FORCE_ENERGY | GMX_FORCE_VIRIAL; + clear_mat(plumed_vir); + plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]); + } + /* END PLUMED */ + do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, + step, nrnb, wcycle, top, groups, + state->box, state->x.arrayRefWithPadding(), &state->hist, + f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd, + state->lambda, graph, + fr, ppForceWorkload, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr, + (bNS ? GMX_FORCE_NS : 0) | force_flags, + ddOpenBalanceRegion, ddCloseBalanceRegion); + /* PLUMED */ + if(plumedswitch){ + if(plumedNeedsEnergy){ + msmul(force_vir,2.0,plumed_vir); + plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]); + plumed_cmd(plumedmain,"performCalc",NULL); + msmul(plumed_vir,0.5,force_vir); + } else { + msmul(plumed_vir,0.5,plumed_vir); + m_add(force_vir,plumed_vir,force_vir); + } + if(bDoReplEx) plumed_cmd(plumedmain,"GREX savePositions",NULL); + if(plumedWantsToStop) ir->nsteps=step_rel+1; + if(bHREX) plumed_cmd(plumedmain,"GREX cacheLocalUNow",&enerd->term[F_EPOT]); + } + /* END PLUMED */ + } + + if (EI_VV(ir->eI) && !startingFromCheckpoint) + /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ + { + rvec *vbuf = nullptr; + + wallcycle_start(wcycle, ewcUPDATE); + if (ir->eI == eiVV && bInitStep) + { + /* if using velocity verlet with full time step Ekin, + * take the first half step only to compute the + * virial for the first step. From there, + * revert back to the initial coordinates + * so that the input is actually the initial step. + */ + snew(vbuf, state->natoms); + copy_rvecn(state->v.rvec_array(), vbuf, 0, state->natoms); /* should make this better for parallelizing? */ + } + else + { + /* this is for NHC in the Ekin(t+dt/2) version of vv */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); + } + + update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, + ekind, M, upd, etrtVELOCITY1, + cr, constr); + + wallcycle_stop(wcycle, ewcUPDATE); + constrain_velocities(step, nullptr, + state, + shake_vir, + constr, + bCalcVir, do_log, do_ene); + wallcycle_start(wcycle, ewcUPDATE); + /* if VV, compute the pressure and constraints */ + /* For VV2, we strictly only need this if using pressure + * control, but we really would like to have accurate pressures + * printed out. + * Think about ways around this in the future? + * For now, keep this choice in comments. 
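+ * (The stricter conditions are preserved just below as commented-out
+ * assignments of bPres and bTemp.)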
+ */ + /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */ + /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/ + bPres = TRUE; + bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK)); + if (bCalcEner && ir->eI == eiVVAK) + { + bSumEkinhOld = TRUE; + } + /* for vv, the first half of the integration actually corresponds to the previous step. + So we need information from the last step in the first half of the integration */ + if (bGStat || do_per_step(step-1, nstglobalcomm)) + { + wallcycle_stop(wcycle, ewcUPDATE); + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + (bGStat ? CGLO_GSTAT : 0) + | (bCalcEner ? CGLO_ENERGY : 0) + | (bTemp ? CGLO_TEMPERATURE : 0) + | (bPres ? CGLO_PRESSURE : 0) + | (bPres ? CGLO_CONSTRAINT : 0) + | (bStopCM ? CGLO_STOPCM : 0) + | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0) + | CGLO_SCALEEKIN + ); + /* explanation of above: + a) We compute Ekin at the full time step + if 1) we are using the AveVel Ekin, and it's not the + initial step, or 2) if we are using AveEkin, but need the full + time step kinetic energy for the pressure (always true now, since we want accurate statistics). + b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in + EkinAveVel because it's needed for the pressure */ + checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + wallcycle_start(wcycle, ewcUPDATE); + } + /* temperature scaling and pressure scaling to produce the extended variables at t+dt */ + if (!bInitStep) + { + if (bTrotter) + { + m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2); + + /* TODO This is only needed when we're about to write + * a checkpoint, because we use it after the restart + * (in a kludge?). But what should we be doing if + * startingFromCheckpoint or bInitStep are true? */ + if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)) + { + copy_mat(shake_vir, state->svir_prev); + copy_mat(force_vir, state->fvir_prev); + } + if (inputrecNvtTrotter(ir) && ir->eI == eiVV) + { + /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */ + enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE); + enerd->term[F_EKIN] = trace(ekind->ekin); + } + } + else if (bExchanged) + { + wallcycle_stop(wcycle, ewcUPDATE); + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . 
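+ * As above, the kinetic energy is recomputed here from the exchanged
+ * velocities before the velocity Verlet update continues.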
*/
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+ wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
+ constr, &nullSignaller, state->box,
+ nullptr, &bSumEkinhOld,
+ CGLO_GSTAT | CGLO_TEMPERATURE);
+ wallcycle_start(wcycle, ewcUPDATE);
+ }
+ }
+ /* if it's the initial step, we performed this first step just to get the constraint virial */
+ if (ir->eI == eiVV && bInitStep)
+ {
+ copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
+ sfree(vbuf);
+ }
+ wallcycle_stop(wcycle, ewcUPDATE);
+ }
+
+ /* compute the conserved quantity */
+ if (EI_VV(ir->eI))
+ {
+ saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
+ if (ir->eI == eiVV)
+ {
+ last_ekin = enerd->term[F_EKIN];
+ }
+ if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+ {
+ saved_conserved_quantity -= enerd->term[F_DISPCORR];
+ }
+ /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */
+ if (ir->efep != efepNO)
+ {
+ sum_dhdl(enerd, state->lambda, ir->fepvals);
+ }
+ }
+
+ /* ######## END FIRST UPDATE STEP ############## */
+ /* ######## If doing VV, we now have v(dt) ###### */
+ if (bDoExpanded)
+ {
+ /* perform extended ensemble sampling in lambda - we don't
+ actually move to the new state before outputting
+ statistics, but if performing simulated tempering, we
+ do update the velocities and the tau_t. */
+
+ lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, state->dfhist, step, state->v.rvec_array(), mdatoms);
+ /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
+ if (MASTER(cr))
+ {
+ copy_df_history(state_global->dfhist, state->dfhist);
+ }
+ }
+
+ /* Now we have the energies and forces corresponding to the
+ * coordinates at time t. We must output all of this before
+ * the update.
+ */
+ do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
+ ir, state, state_global, observablesHistory,
+ top_global, fr,
+ outf, mdebin, ekind, f,
+ checkpointHandler->isCheckpointingStep(),
+ bRerunMD, bLastStep,
+ mdrunOptions.writeConfout,
+ bSumEkinhOld);
+ /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
+ bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, state->x.rvec_array(), ir, t, wcycle);
+
+ /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
+ if (startingFromCheckpoint && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
+ {
+ copy_mat(state->svir_prev, shake_vir);
+ copy_mat(state->fvir_prev, force_vir);
+ }
+
+ stopHandler->setSignal();
+ resetHandler->setSignal(walltime_accounting);
+
+ if (bGStat || !PAR(cr))
+ {
+ /* In parallel we only have to check for checkpointing in steps
+ * where we do global communication,
+ * otherwise the other nodes don't know.
+ */
+ checkpointHandler->setSignal(walltime_accounting);
+ }
+
+ /* ######### START SECOND UPDATE STEP ################# */
+
+ /* at the start of step, randomize or scale the velocities (if vv).
Restriction of Andersen is controlled
+ in preprocessing */
+
+ if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+ {
+ gmx_bool bIfRandomize;
+ bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, upd, constr);
+ /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+ if (constr && bIfRandomize)
+ {
+ constrain_velocities(step, nullptr,
+ state,
+ tmp_vir,
+ constr,
+ bCalcVir, do_log, do_ene);
+ }
+ }
+ /* Box is changed in update() when we do pressure coupling,
+ * but we should still use the old box for energy corrections and when
+ * writing it to the energy file, so it matches the trajectory files for
+ * the same timestep above. Make a copy in a separate array.
+ */
+ copy_mat(state->box, lastbox);
+
+ dvdl_constr = 0;
+
+ wallcycle_start(wcycle, ewcUPDATE);
+ /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+ if (bTrotter)
+ {
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+ /* We can only do Berendsen coupling after we have summed
+ * the kinetic energy or virial. Since this happens
+ * in global_state after update, we should only do it at
+ * step % nstlist = 1 with bGStatEveryStep=FALSE.
+ */
+ }
+ else
+ {
+ update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
+ update_pcouple_before_coordinates(fplog, step, ir, state,
+ parrinellorahmanMu, M,
+ bInitStep);
+ }
+
+ if (EI_VV(ir->eI))
+ {
+ /* velocity half-step update */
+ update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+ ekind, M, upd, etrtVELOCITY2,
+ cr, constr);
+ }
+
+ /* Above, initialize just copies ekinh into ekin,
+ * it doesn't copy position (for VV),
+ * and entire integrator for MD.
+ */
+
+ if (ir->eI == eiVVAK)
+ {
+ /* We probably only need md->homenr, not state->natoms */
+ if (state->natoms > cbuf_nalloc)
+ {
+ cbuf_nalloc = state->natoms;
+ srenew(cbuf, cbuf_nalloc);
+ }
+ copy_rvecn(as_rvec_array(state->x.data()), cbuf, 0, state->natoms);
+ }
+
+ update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+ ekind, M, upd, etrtPOSITION, cr, constr);
+ wallcycle_stop(wcycle, ewcUPDATE);
+
+ constrain_coordinates(step, &dvdl_constr, state,
+ shake_vir,
+ upd, constr,
+ bCalcVir, do_log, do_ene);
+ update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state,
+ cr, nrnb, wcycle, upd, constr, do_log, do_ene);
+ finish_update(ir, mdatoms,
+ state, graph,
+ nrnb, wcycle, upd, constr);
+
+ if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
+ {
+ updatePrevStepPullCom(ir->pull_work, state);
+ }
+
+ if (ir->eI == eiVVAK)
+ {
+ /* erase F_EKIN and F_TEMP here? */
+ /* just compute the kinetic energy at the half step to perform a trotter step */
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, &nullSignaller, lastbox,
+ nullptr, &bSumEkinhOld,
+ (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE
+ );
+ wallcycle_start(wcycle, ewcUPDATE);
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+ /* now we know the scaling, we can compute the positions again */
+ copy_rvecn(cbuf, as_rvec_array(state->x.data()), 0, state->natoms);
+
+ update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+ ekind, M, upd, etrtPOSITION, cr, constr);
+ wallcycle_stop(wcycle, ewcUPDATE);
+
+ /* do we need an extra constraint here?
just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
+ /* are the small terms in the shake_vir here due
+ * to numerical errors, or are they important
+ * physically? I'm thinking they are just errors, but not completely sure.
+ * For now, will call without actually constraining, constr=NULL */
+ finish_update(ir, mdatoms,
+ state, graph,
+ nrnb, wcycle, upd, nullptr);
+ }
+ if (EI_VV(ir->eI))
+ {
+ /* this factor of 2 correction is necessary
+ because half of the constraint force is removed
+ in the vv step, so we have to double it. See
+ the Redmine issue #1255. It is not yet clear
+ if the factor of 2 is exact, or just a very
+ good approximation, and this will be
+ investigated. The next step is to see if this
+ can be done adding a dhdl contribution from the
+ rattle step, but this is somewhat more
+ complicated with the current code. Will be
+ investigated, hopefully for 4.6.3. However,
+ this current solution is much better than
+ having it completely wrong.
+ */
+ enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+ }
+ else
+ {
+ enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+ }
+
+ if (vsite != nullptr)
+ {
+ wallcycle_start(wcycle, ewcVSITECONSTR);
+ if (graph != nullptr)
+ {
+ shift_self(graph, state->box, state->x.rvec_array());
+ }
+ construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(),
+ top->idef.iparams, top->idef.il,
+ fr->ePBC, fr->bMolPBC, cr, state->box);
+
+ if (graph != nullptr)
+ {
+ unshift_self(graph, state->box, state->x.rvec_array());
+ }
+ wallcycle_stop(wcycle, ewcVSITECONSTR);
+ }
+
+ /* ############## IF NOT VV, Calculate globals HERE ############ */
+ /* With Leap-Frog we can skip compute_globals at
+ * non-communication steps, but we need to calculate
+ * the kinetic energy one step before communication.
+ */
+ {
+ // Organize to do inter-simulation signalling on steps if
+ // and when algorithms require it.
+ bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
+
+ if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal)
+ {
+ // Since we're already communicating at this step, we
+ // can propagate intra-simulation signals. Note that
+ // check_nstglobalcomm has the responsibility for
+ // choosing the value of nstglobalcomm that is one way
+ // bGStat becomes true, so we can't get into a
+ // situation where e.g. checkpointing can't be
+ // signalled.
+ bool doIntraSimSignal = true;
+ SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
+
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, &signaller,
+ lastbox,
+ &totalNumberOfBondedInteractions, &bSumEkinhOld,
+ (bGStat ? CGLO_GSTAT : 0)
+ | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
+ | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+ | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+ | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
+ | CGLO_CONSTRAINT
+ | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
+ );
+ checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
+ top_global, top, state,
+ &shouldCheckNumberOfBondedInteractions);
+ }
+ }
+
+ /* ############# END CALC EKIN AND PRESSURE ################# */
+
+ /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+ the virial that should probably be addressed eventually.
state->veta has better properties,
+ but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+ generate the new shake_vir, but test the veta value for convergence. This will take some thought. */
+
+ if (ir->efep != efepNO && !EI_VV(ir->eI))
+ {
+ /* Sum up the foreign energy and dhdl terms for md and sd.
+ Currently done every step so that dhdl is correct in the .edr */
+ sum_dhdl(enerd, state->lambda, ir->fepvals);
+ }
+
+ update_pcouple_after_coordinates(fplog, step, ir, mdatoms,
+ pres, force_vir, shake_vir,
+ parrinellorahmanMu,
+ state, nrnb, upd);
+
+ /* ################# END UPDATE STEP 2 ################# */
+ /* #### We now have r(t+dt) and v(t+dt/2) ############# */
+
+ /* The coordinates (x) were unshifted in update */
+ if (!bGStat)
+ {
+ /* We will not sum ekinh_old,
+ * so signal that we still have to do it.
+ */
+ bSumEkinhOld = TRUE;
+ }
+
+ if (bCalcEner)
+ {
+ /* ######### BEGIN PREPARING EDR OUTPUT ########### */
+
+ /* use the directly determined last velocity, not actually the averaged half steps */
+ if (bTrotter && ir->eI == eiVV)
+ {
+ enerd->term[F_EKIN] = last_ekin;
+ }
+ enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+ if (integratorHasConservedEnergyQuantity(ir))
+ {
+ if (EI_VV(ir->eI))
+ {
+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+ }
+ else
+ {
+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
+ }
+ }
+ /* ######### END PREPARING EDR OUTPUT ########### */
+ }
+
+ /* Output stuff */
+ if (MASTER(cr))
+ {
+ if (fplog && do_log && bDoExpanded)
+ {
+ /* only needed if doing expanded ensemble */
+ PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : nullptr,
+ state_global->dfhist, state->fep_state, ir->nstlog, step);
+ }
+ if (bCalcEner)
+ {
+ upd_mdebin(mdebin, bDoDHDL, bCalcEnerStep,
+ t, mdatoms->tmass, enerd, state,
+ ir->fepvals, ir->expandedvals, lastbox,
+ shake_vir, force_vir, total_vir, pres,
+ ekind, mu_tot, constr);
+ }
+ else
+ {
+ upd_mdebin_step(mdebin);
+ }
+
+ gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
+ gmx_bool do_or = do_per_step(step, ir->nstorireout);
+
+ print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : nullptr,
+ step, t,
+ eprNORMAL, mdebin, fcd, groups, &(ir->opts), awh.get());
+
+ if (ir->bPull)
+ {
+ pull_print_output(ir->pull_work, step, t);
+ }
+
+ if (do_per_step(step, ir->nstlog))
+ {
+ if (fflush(fplog) != 0)
+ {
+ gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+ }
+ }
+ }
+ if (bDoExpanded)
+ {
+ /* Have to do this part _after_ outputting the logfile and the edr file */
+ /* Gets written into the state at the beginning of next loop */
+ state->fep_state = lamnew;
+ }
+ /* Print the remaining wall clock time for the run */
+ if (isMasterSimMasterRank(ms, cr) &&
+ (do_verbose || gmx_got_usr_signal()) &&
+ !bPMETunePrinting)
+ {
+ if (shellfc)
+ {
+ fprintf(stderr, "\n");
+ }
+ print_time(stderr, walltime_accounting, step, ir, cr);
+ }
+
+ /* Ion/water position swapping.
+ * Not done in last step since trajectory writing happens before this call
+ * in the MD loop and exchanges would be lost anyway.
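+ * (A successful swap sets bNeedRepartition, so that with domain
+ * decomposition the system is repartitioned below using the exchanged
+ * coordinates.)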
*/ + bNeedRepartition = FALSE; + if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && + do_per_step(step, ir->swap->nstswap)) + { + bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle, + as_rvec_array(state->x.data()), + state->box, + MASTER(cr) && mdrunOptions.verbose, + bRerunMD); + + if (bNeedRepartition && DOMAINDECOMP(cr)) + { + dd_collect_state(cr->dd, state, state_global); + } + } + + /* Replica exchange */ + bExchanged = FALSE; + if (bDoReplEx) + { + bExchanged = replica_exchange(fplog, cr, ms, repl_ex, + state_global, enerd, + state, step, t); + } + + if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) ) + { + dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, wcycle, FALSE); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + } + + bFirstStep = FALSE; + bInitStep = FALSE; + startingFromCheckpoint = false; + + /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */ + /* With all integrators, except VV, we need to retain the pressure + * at the current step for coupling at the next step. + */ + if ((state->flags & (1<<estPRES_PREV)) && + (bGStatEveryStep || + (ir->nstpcouple > 0 && step % ir->nstpcouple == 0))) + { + /* Store the pressure in t_state for pressure coupling + * at the next MD step. + */ + copy_mat(pres, state->pres_prev); + } + + /* ####### END SET VARIABLES FOR NEXT ITERATION ###### */ + + if ( (membed != nullptr) && (!bLastStep) ) + { + rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data())); + } + + cycles = wallcycle_stop(wcycle, ewcSTEP); + if (DOMAINDECOMP(cr) && wcycle) + { + dd_cycles_add(cr->dd, cycles, ddCyclStep); + } + + /* increase the MD step number */ + step++; + step_rel++; + + resetHandler->resetCounters( + step, step_rel, mdlog, fplog, cr, (use_GPU(fr->nbv) ? fr->nbv : nullptr), + nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting); + + /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */ + IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle); + + } + /* End of main MD loop */ + + /* Closing TNG files can include compressing data. Therefore it is good to do that + * before stopping the time measurements. */ + mdoutf_tng_close(outf); + + /* Stop measuring walltime */ + walltime_accounting_end_time(walltime_accounting); + + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + if (MASTER(cr)) + { + if (ir->nstcalcenergy > 0) + { + print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t, + eprAVER, mdebin, fcd, groups, &(ir->opts), awh.get()); + } + } + done_mdebin(mdebin); + done_mdoutf(outf); + + if (bPMETune) + { + pme_loadbal_done(pme_loadbal, fplog, mdlog, use_GPU(fr->nbv)); + } + + done_shellfc(fplog, shellfc, step_rel); + + if (useReplicaExchange && MASTER(cr)) + { + print_replica_exchange_statistics(fplog, repl_ex); + } + + // Clean up swapcoords + if (ir->eSwapCoords != eswapNO) + { + finish_swapcoords(ir->swap); + } + + /* IMD cleanup, if bIMD is TRUE. 
*/ + IMD_finalize(ir->bIMD, ir->imd); + + walltime_accounting_set_nsteps_done(walltime_accounting, step_rel); + + destroy_enerdata(enerd); + sfree(enerd); + sfree(top); +} diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/md.cpp.preplumed b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/md.cpp.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..a49a15df53a4246970f4f4a00e542f1c9a7cedbe --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/md.cpp.preplumed @@ -0,0 +1,1531 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! 
\internal \file + * + * \brief Implements the integrator for normal molecular dynamics simulations + * + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include <cinttypes> +#include <cmath> +#include <cstdio> +#include <cstdlib> + +#include <algorithm> +#include <memory> + +#include "gromacs/awh/awh.h" +#include "gromacs/commandline/filenm.h" +#include "gromacs/compat/make_unique.h" +#include "gromacs/domdec/collect.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_network.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/domdec/partition.h" +#include "gromacs/essentialdynamics/edsam.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/ewald/pme-load-balancing.h" +#include "gromacs/fileio/trxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/imd/imd.h" +#include "gromacs/listed-forces/manage-threading.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/utilities.h" +#include "gromacs/math/vec.h" +#include "gromacs/math/vectypes.h" +#include "gromacs/mdlib/checkpointhandler.h" +#include "gromacs/mdlib/compute_io.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/ebin.h" +#include "gromacs/mdlib/expanded.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/force_flags.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdebin.h" +#include "gromacs/mdlib/mdoutf.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/mdsetup.h" +#include "gromacs/mdlib/membed.h" +#include "gromacs/mdlib/nb_verlet.h" +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/resethandler.h" +#include "gromacs/mdlib/shellfc.h" +#include "gromacs/mdlib/sighandler.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/simulationsignal.h" +#include "gromacs/mdlib/stophandler.h" +#include "gromacs/mdlib/tgroup.h" +#include "gromacs/mdlib/trajectory_writing.h" +#include "gromacs/mdlib/update.h" +#include "gromacs/mdlib/vcm.h" +#include "gromacs/mdlib/vsite.h" +#include "gromacs/mdtypes/awh-history.h" +#include "gromacs/mdtypes/awh-params.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/df_history.h" +#include "gromacs/mdtypes/energyhistory.h" +#include "gromacs/mdtypes/fcdata.h" +#include "gromacs/mdtypes/forcerec.h" +#include "gromacs/mdtypes/group.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/interaction_const.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/mdatom.h" +#include "gromacs/mdtypes/observableshistory.h" +#include "gromacs/mdtypes/pullhistory.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/pulling/output.h" +#include "gromacs/pulling/pull.h" +#include "gromacs/swap/swapcoords.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/timing/walltime_accounting.h" +#include "gromacs/topology/atoms.h" +#include "gromacs/topology/idef.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/topology/topology.h" +#include "gromacs/trajectory/trajectoryframe.h" +#include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/real.h" +#include 
"gromacs/utility/smalloc.h" + +#include "integrator.h" +#include "replicaexchange.h" + +#if GMX_FAHCORE +#include "corewrap.h" +#endif + +using gmx::SimulationSignaller; + +void gmx::Integrator::do_md() +{ + // TODO Historically, the EM and MD "integrators" used different + // names for the t_inputrec *parameter, but these must have the + // same name, now that it's a member of a struct. We use this ir + // alias to avoid a large ripple of nearly useless changes. + // t_inputrec is being replaced by IMdpOptionsProvider, so this + // will go away eventually. + t_inputrec *ir = inputrec; + gmx_mdoutf *outf = nullptr; + int64_t step, step_rel; + double t, t0, lam0[efptNR]; + gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner; + gmx_bool bNS, bNStList, bSimAnn, bStopCM, + bFirstStep, bInitStep, bLastStep = FALSE; + gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE; + gmx_bool do_ene, do_log, do_verbose; + gmx_bool bMasterState; + int force_flags, cglo_flags; + tensor force_vir, shake_vir, total_vir, tmp_vir, pres; + int i, m; + rvec mu_tot; + t_vcm *vcm; + matrix parrinellorahmanMu, M; + gmx_repl_ex_t repl_ex = nullptr; + gmx_localtop_t *top; + t_mdebin *mdebin = nullptr; + gmx_enerdata_t *enerd; + PaddedVector<gmx::RVec> f {}; + gmx_global_stat_t gstat; + gmx_update_t *upd = nullptr; + t_graph *graph = nullptr; + gmx_groups_t *groups; + gmx_ekindata_t *ekind; + gmx_shellfc_t *shellfc; + gmx_bool bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition; + gmx_bool bTemp, bPres, bTrotter; + real dvdl_constr; + rvec *cbuf = nullptr; + int cbuf_nalloc = 0; + matrix lastbox; + int lamnew = 0; + /* for FEP */ + int nstfep = 0; + double cycles; + real saved_conserved_quantity = 0; + real last_ekin = 0; + t_extmass MassQ; + int **trotter_seq; + char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE]; + + /* PME load balancing data for GPU kernels */ + pme_load_balancing_t *pme_loadbal = nullptr; + gmx_bool bPMETune = FALSE; + gmx_bool bPMETunePrinting = FALSE; + + /* Interactive MD */ + gmx_bool bIMDstep = FALSE; + + /* Domain decomposition could incorrectly miss a bonded + interaction, but checking for that requires a global + communication stage, which does not otherwise happen in DD + code. So we do that alongside the first global energy reduction + after a new DD is made. These variables handle whether the + check happens, and the result it returns. */ + bool shouldCheckNumberOfBondedInteractions = false; + int totalNumberOfBondedInteractions = -1; + + SimulationSignals signals; + // Most global communnication stages don't propagate mdrun + // signals, and will use this object to achieve that. + SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false); + + if (!mdrunOptions.writeConfout) + { + // This is on by default, and the main known use case for + // turning it off is for convenience in benchmarking, which is + // something that should not show up in the general user + // interface. + GMX_LOG(mdlog.info).asParagraph(). 
+ appendText("The -noconfout functionality is deprecated, and may be removed in a future version."); + } + + /* md-vv uses averaged full step velocities for T-control + md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control) + md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */ + bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir))); + + const bool bRerunMD = false; + int nstglobalcomm = mdrunOptions.globalCommunicationInterval; + + nstglobalcomm = check_nstglobalcomm(mdlog, nstglobalcomm, ir, cr); + bGStatEveryStep = (nstglobalcomm == 1); + + groups = &top_global->groups; + + std::unique_ptr<EssentialDynamics> ed = nullptr; + if (opt2bSet("-ei", nfile, fnm) || observablesHistory->edsamHistory != nullptr) + { + /* Initialize essential dynamics sampling */ + ed = init_edsam(mdlog, + opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), + top_global, + ir, cr, constr, + state_global, observablesHistory, + oenv, mdrunOptions.continuationOptions.appendFiles); + } + + /* Initial values */ + init_md(fplog, cr, outputProvider, ir, oenv, mdrunOptions, + &t, &t0, state_global, lam0, + nrnb, top_global, &upd, deform, + nfile, fnm, &outf, &mdebin, + force_vir, shake_vir, total_vir, pres, mu_tot, &bSimAnn, &vcm, wcycle); + + /* Energy terms and groups */ + snew(enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + enerd); + + /* Kinetic energy data */ + snew(ekind, 1); + init_ekindata(fplog, top_global, &(ir->opts), ekind); + /* Copy the cos acceleration to the groups struct */ + ekind->cosacc.cos_accel = ir->cos_accel; + + gstat = global_stat_init(ir); + + /* Check for polarizable models and flexible constraints */ + shellfc = init_shell_flexcon(fplog, + top_global, constr ? constr->numFlexibleConstraints() : 0, + ir->nstcalcenergy, DOMAINDECOMP(cr)); + + { + double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1); + if ((io > 2000) && MASTER(cr)) + { + fprintf(stderr, + "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", + io); + } + } + + /* Set up interactive MD (IMD) */ + init_IMD(ir, cr, ms, top_global, fplog, ir->nstcalcenergy, + MASTER(cr) ? state_global->x.rvec_array() : nullptr, + nfile, fnm, oenv, mdrunOptions); + + // Local state only becomes valid now. + std::unique_ptr<t_state> stateInstance; + t_state * state; + + if (DOMAINDECOMP(cr)) + { + top = dd_init_local_top(top_global); + + stateInstance = compat::make_unique<t_state>(); + state = stateInstance.get(); + dd_init_local_state(cr->dd, state_global, state); + + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, nullptr, FALSE); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + } + else + { + state_change_natoms(state_global, state_global->natoms); + f.resizeWithPadding(state_global->natoms); + /* Copy the pointer to the global state */ + state = state_global; + + snew(top, 1); + mdAlgorithmsSetupAtomData(cr, ir, top_global, top, fr, + &graph, mdAtoms, constr, vsite, shellfc); + + update_realloc(upd, state->natoms); + } + + auto mdatoms = mdAtoms->mdatoms(); + + // NOTE: The global state is no longer used at this point. 
+ // But state_global is still used as temporary storage space for writing + // the global state to file and potentially for replica exchange. + // (Global topology should persist.) + + update_mdatoms(mdatoms, state->lambda[efptMASS]); + + const ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + bool startingFromCheckpoint = continuationOptions.startedFromCheckpoint; + + if (ir->bExpanded) + { + /* Check nstexpanded here, because the grompp check was broken */ + if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0) + { + gmx_fatal(FARGS, "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy"); + } + init_expanded_ensemble(startingFromCheckpoint, ir, state->dfhist); + } + + if (MASTER(cr)) + { + if (startingFromCheckpoint) + { + /* Update mdebin with energy history if appending to output files */ + if (continuationOptions.appendFiles) + { + /* If no history is available (because a checkpoint is from before + * it was written) make a new one later, otherwise restore it. + */ + if (observablesHistory->energyHistory) + { + restore_energyhistory_from_state(mdebin, observablesHistory->energyHistory.get()); + } + } + else if (observablesHistory->energyHistory) + { + /* We might have read an energy history from checkpoint. + * As we are not appending, we want to restart the statistics. + * Free the allocated memory and reset the counts. + */ + observablesHistory->energyHistory = {}; + /* We might have read a pull history from checkpoint. + * We will still want to keep the statistics, so that the files + * can be joined and still be meaningful. + * This means that observablesHistory->pullHistory + * should not be reset. + */ + } + } + if (!observablesHistory->energyHistory) + { + observablesHistory->energyHistory = compat::make_unique<energyhistory_t>(); + } + if (!observablesHistory->pullHistory) + { + observablesHistory->pullHistory = compat::make_unique<PullHistory>(); + } + /* Set the initial energy history in state by updating once */ + update_energyhistory(observablesHistory->energyHistory.get(), mdebin); + } + + preparePrevStepPullCom(ir, mdatoms, state, state_global, cr, startingFromCheckpoint); + + // TODO: Remove this by converting AWH into a ForceProvider + auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms, startingFromCheckpoint, + shellfc != nullptr, + opt2fn("-awh", nfile, fnm), ir->pull_work); + + const bool useReplicaExchange = (replExParams.exchangeInterval > 0); + if (useReplicaExchange && MASTER(cr)) + { + repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, + replExParams); + } + /* PME tuning is only supported in the Verlet scheme, with PME for + * Coulomb. It is not supported with only LJ PME. 
*/ + bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && + !mdrunOptions.reproducible && ir->cutoff_scheme != ecutsGROUP); + if (bPMETune) + { + pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, + *fr->ic, *fr->nbv->listParams, fr->pmedata, use_GPU(fr->nbv), + &bPMETunePrinting); + } + + if (!ir->bContinuation) + { + if (state->flags & (1 << estV)) + { + auto v = makeArrayRef(state->v); + /* Set the velocities of vsites, shells and frozen atoms to zero */ + for (i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->ptype[i] == eptVSite || + mdatoms->ptype[i] == eptShell) + { + clear_rvec(v[i]); + } + else if (mdatoms->cFREEZE) + { + for (m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m]) + { + v[i][m] = 0; + } + } + } + } + } + + if (constr) + { + /* Constrain the initial coordinates and velocities */ + do_constrain_first(fplog, constr, ir, mdatoms, state); + } + if (vsite) + { + /* Construct the virtual sites for the initial configuration */ + construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state->box); + } + } + + if (ir->efep != efepNO) + { + /* Set free energy calculation frequency as the greatest common + * denominator of nstdhdl and repl_ex_nst. */ + nstfep = ir->fepvals->nstdhdl; + if (ir->bExpanded) + { + nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep); + } + if (useReplicaExchange) + { + nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep); + } + } + + /* Be REALLY careful about what flags you set here. You CANNOT assume + * this is the first step, since we might be restarting from a checkpoint, + * and in that case we should not do any modifications to the state. + */ + bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation); + + if (continuationOptions.haveReadEkin) + { + restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate); + } + + cglo_flags = (CGLO_INITIALIZATION | CGLO_TEMPERATURE | CGLO_GSTAT + | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0) + | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) + | (continuationOptions.haveReadEkin ? CGLO_READEKIN : 0)); + + bSumEkinhOld = FALSE; + /* To minimize communication, compute_globals computes the COM velocity + * and the kinetic energy for the velocities without COM motion removed. + * Thus to get the kinetic energy without the COM contribution, we need + * to call compute_globals twice. + */ + for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++) + { + int cglo_flags_iteration = cglo_flags; + if (bStopCM && cgloIteration == 0) + { + cglo_flags_iteration |= CGLO_STOPCM; + cglo_flags_iteration &= ~CGLO_TEMPERATURE; + } + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags_iteration + | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)); + } + checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + if (ir->eI == eiVVAK) + { + /* a second call to get the half step temperature initialized as well */ + /* we do the same call as above, but turn the pressure off -- internally to + compute_globals, this is recognized as a velocity verlet half-step + kinetic energy calculation. 
This minimized excess variables, but + perhaps loses some logic?*/ + + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, state->box, + nullptr, &bSumEkinhOld, + cglo_flags & ~CGLO_PRESSURE); + } + + /* Calculate the initial half step temperature, and save the ekinh_old */ + if (!continuationOptions.startedFromCheckpoint) + { + for (i = 0; (i < ir->opts.ngtc); i++) + { + copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old); + } + } + + /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter + temperature control */ + trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter); + + if (MASTER(cr)) + { + if (!ir->bContinuation) + { + if (constr && ir->eConstrAlg == econtLINCS) + { + fprintf(fplog, + "RMS relative constraint deviation after constraining: %.2e\n", + constr->rmsd()); + } + if (EI_STATE_VELOCITY(ir->eI)) + { + real temp = enerd->term[F_TEMP]; + if (ir->eI != eiVV) + { + /* Result of Ekin averaged over velocities of -half + * and +half step, while we only have -half step here. + */ + temp *= 2; + } + fprintf(fplog, "Initial temperature: %g K\n", temp); + } + } + + char tbuf[20]; + fprintf(stderr, "starting mdrun '%s'\n", + *(top_global->name)); + if (ir->nsteps >= 0) + { + sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t); + } + else + { + sprintf(tbuf, "%s", "infinite"); + } + if (ir->init_step > 0) + { + fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n", + gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf, + gmx_step_str(ir->init_step, sbuf2), + ir->init_step*ir->delta_t); + } + else + { + fprintf(stderr, "%s steps, %s ps.\n", + gmx_step_str(ir->nsteps, sbuf), tbuf); + } + fprintf(fplog, "\n"); + } + + walltime_accounting_start_time(walltime_accounting); + wallcycle_start(wcycle, ewcRUN); + print_start(fplog, cr, walltime_accounting, "mdrun"); + +#if GMX_FAHCORE + /* safest point to do file checkpointing is here. More general point would be immediately before integrator call */ + int chkpt_ret = fcCheckPointParallel( cr->nodeid, + NULL, 0); + if (chkpt_ret == 0) + { + gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 ); + } +#endif + + /*********************************************************** + * + * Loop over MD steps + * + ************************************************************/ + + bFirstStep = TRUE; + /* Skip the first Nose-Hoover integration when we get the state from tpx */ + bInitStep = !startingFromCheckpoint || EI_VV(ir->eI); + bSumEkinhOld = FALSE; + bExchanged = FALSE; + bNeedRepartition = FALSE; + + bool simulationsShareState = false; + int nstSignalComm = nstglobalcomm; + { + // TODO This implementation of ensemble orientation restraints is nasty because + // a user can't just do multi-sim with single-sim orientation restraints. + bool usingEnsembleRestraints = (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0)); + bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr)); + + // Replica exchange, ensemble restraints and AWH need all + // simulations to remain synchronized, so they need + // checkpoints and stop conditions to act on the same step, so + // the propagation of such signals must take place between + // simulations, not just within simulations. + // TODO: Make algorithm initializers set these flags. 
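+ // Illustrative example (hypothetical values): with nstglobalcomm = 150, the
+ // rounding in the branch below gives
+ //     nstSignalComm = ((200 + 150 - 1)/150)*150 = 300,
+ // i.e. the smallest multiple of nstglobalcomm that is at least 200 steps.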
+ simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim; + + if (simulationsShareState) + { + // Inter-simulation signal communication does not need to happen + // often, so we use a minimum of 200 steps to reduce overhead. + const int c_minimumInterSimulationSignallingInterval = 200; + nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1)/nstglobalcomm)*nstglobalcomm; + } + } + + auto stopHandler = stopHandlerBuilder->getStopHandlerMD( + compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState, + MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm, + mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting); + + auto checkpointHandler = compat::make_unique<CheckpointHandler>( + compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), + simulationsShareState, ir->nstlist == 0, MASTER(cr), + mdrunOptions.writeConfout, mdrunOptions.checkpointOptions.period); + + const bool resetCountersIsLocal = true; + auto resetHandler = compat::make_unique<ResetHandler>( + compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]), !resetCountersIsLocal, + ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway, + mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting); + + DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion = (DOMAINDECOMP(cr) ? DdOpenBalanceRegionBeforeForceComputation::yes : DdOpenBalanceRegionBeforeForceComputation::no); + DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion = (DOMAINDECOMP(cr) ? DdCloseBalanceRegionAfterForceComputation::yes : DdCloseBalanceRegionAfterForceComputation::no); + + step = ir->init_step; + step_rel = 0; + + // TODO extract this to new multi-simulation module + if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange) + { + if (!multisim_int_all_are_equal(ms, ir->nsteps)) + { + GMX_LOG(mdlog.warning).appendText( + "Note: The number of steps is not consistent across multi simulations,\n" + "but we are proceeding anyway!"); + } + if (!multisim_int_all_are_equal(ms, ir->init_step)) + { + GMX_LOG(mdlog.warning).appendText( + "Note: The initial step is not consistent across multi simulations,\n" + "but we are proceeding anyway!"); + } + } + + /* and stop now if we should */ + bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps)); + while (!bLastStep) + { + + /* Determine if this is a neighbor search step */ + bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); + + if (bPMETune && bNStList) + { + /* PME grid + cut-off optimization with GPUs or PME nodes */ + pme_loadbal_do(pme_loadbal, cr, + (mdrunOptions.verbose && MASTER(cr)) ? 
stderr : nullptr, + fplog, mdlog, + *ir, fr, *state, + wcycle, + step, step_rel, + &bPMETunePrinting); + } + + wallcycle_start(wcycle, ewcSTEP); + + bLastStep = (step_rel == ir->nsteps); + t = t0 + step*ir->delta_t; + + // TODO Refactor this, so that nstfep does not need a default value of zero + if (ir->efep != efepNO || ir->bSimTemp) + { + /* find and set the current lambdas */ + setCurrentLambdasLocal(step, ir->fepvals, lam0, state); + + bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl); + bDoFEP = ((ir->efep != efepNO) && do_per_step(step, nstfep)); + bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) + && (ir->bExpanded) && (step > 0) && (!startingFromCheckpoint)); + } + + bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep && + do_per_step(step, replExParams.exchangeInterval)); + + if (bSimAnn) + { + update_annealing_target_temp(ir, t, upd); + } + + /* Stop Center of Mass motion */ + bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm)); + + /* Determine whether or not to do Neighbour Searching */ + bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition); + + bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS); + + /* do_log triggers energy and virial calculation. Because this leads + * to different code paths, forces can be different. Thus for exact + * continuation we should avoid extra log output. + * Note that the || bLastStep can result in non-exact continuation + * beyond the last step. But we don't consider that to be an issue. + */ + do_log = do_per_step(step, ir->nstlog) || (bFirstStep && !startingFromCheckpoint) || bLastStep; + do_verbose = mdrunOptions.verbose && + (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep); + + if (bNS && !(bFirstStep && ir->bContinuation)) + { + bMasterState = FALSE; + /* Correct the new box if it is too skewed */ + if (inputrecDynamicBox(ir)) + { + if (correct_box(fplog, step, state->box, graph)) + { + bMasterState = TRUE; + } + } + if (DOMAINDECOMP(cr) && bMasterState) + { + dd_collect_state(cr->dd, state, state_global); + } + + if (DOMAINDECOMP(cr)) + { + /* Repartition the domain decomposition */ + dd_partition_system(fplog, mdlog, step, cr, + bMasterState, nstglobalcomm, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, wcycle, + do_verbose && !bPMETunePrinting); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + } + } + + if (MASTER(cr) && do_log) + { + print_ebin_header(fplog, step, t); /* can we improve the information printed here? */ + } + + if (ir->efep != efepNO) + { + update_mdatoms(mdatoms, state->lambda[efptMASS]); + } + + if (bExchanged) + { + + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . 
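+ * (After an accepted exchange the local velocities come from a different
+ * replica, so the stored half-step kinetic energy no longer matches and is
+ * recomputed here.)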
*/ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS); + checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + } + clear_mat(force_vir); + + checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep); + + /* Determine the energy and pressure: + * at nstcalcenergy steps and at energy output steps (set below). + */ + if (EI_VV(ir->eI) && (!bInitStep)) + { + bCalcEnerStep = do_per_step(step, ir->nstcalcenergy); + bCalcVir = bCalcEnerStep || + (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple))); + } + else + { + bCalcEnerStep = do_per_step(step, ir->nstcalcenergy); + bCalcVir = bCalcEnerStep || + (ir->epc != epcNO && do_per_step(step, ir->nstpcouple)); + } + bCalcEner = bCalcEnerStep; + + do_ene = (do_per_step(step, ir->nstenergy) || bLastStep); + + if (do_ene || do_log || bDoReplEx) + { + bCalcVir = TRUE; + bCalcEner = TRUE; + } + + /* Do we need global communication ? */ + bGStat = (bCalcVir || bCalcEner || bStopCM || + do_per_step(step, nstglobalcomm) || + (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm))); + + force_flags = (GMX_FORCE_STATECHANGED | + ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0) | + GMX_FORCE_ALLFORCES | + (bCalcVir ? GMX_FORCE_VIRIAL : 0) | + (bCalcEner ? GMX_FORCE_ENERGY : 0) | + (bDoFEP ? GMX_FORCE_DHDL : 0) + ); + + if (shellfc) + { + /* Now is the time to relax the shells */ + relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, + enforcedRotation, step, + ir, bNS, force_flags, top, + constr, enerd, fcd, + state, f.arrayRefWithPadding(), force_vir, mdatoms, + nrnb, wcycle, graph, groups, + shellfc, fr, ppForceWorkload, t, mu_tot, + vsite, + ddOpenBalanceRegion, ddCloseBalanceRegion); + } + else + { + /* The AWH history need to be saved _before_ doing force calculations where the AWH bias is updated + (or the AWH update will be performed twice for one step when continuing). It would be best to + call this update function from do_md_trajectory_writing but that would occur after do_force. + One would have to divide the update_awh function into one function applying the AWH force + and one doing the AWH bias update. The update AWH bias function could then be called after + do_md_trajectory_writing (then containing update_awh_history). + The checkpointing will in the future probably moved to the start of the md loop which will + rid of this issue. */ + if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr)) + { + awh->updateHistory(state_global->awhHistory.get()); + } + + /* The coordinates (x) are shifted (to get whole molecules) + * in do_force. + * This is parallellized as well, and does communication too. + * Check comments in sim_util.c + */ + do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, + step, nrnb, wcycle, top, groups, + state->box, state->x.arrayRefWithPadding(), &state->hist, + f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd, + state->lambda, graph, + fr, ppForceWorkload, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr, + (bNS ? 
GMX_FORCE_NS : 0) | force_flags, + ddOpenBalanceRegion, ddCloseBalanceRegion); + } + + if (EI_VV(ir->eI) && !startingFromCheckpoint) + /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ + { + rvec *vbuf = nullptr; + + wallcycle_start(wcycle, ewcUPDATE); + if (ir->eI == eiVV && bInitStep) + { + /* if using velocity verlet with full time step Ekin, + * take the first half step only to compute the + * virial for the first step. From there, + * revert back to the initial coordinates + * so that the input is actually the initial step. + */ + snew(vbuf, state->natoms); + copy_rvecn(state->v.rvec_array(), vbuf, 0, state->natoms); /* should make this better for parallelizing? */ + } + else + { + /* this is for NHC in the Ekin(t+dt/2) version of vv */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); + } + + update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, + ekind, M, upd, etrtVELOCITY1, + cr, constr); + + wallcycle_stop(wcycle, ewcUPDATE); + constrain_velocities(step, nullptr, + state, + shake_vir, + constr, + bCalcVir, do_log, do_ene); + wallcycle_start(wcycle, ewcUPDATE); + /* if VV, compute the pressure and constraints */ + /* For VV2, we strictly only need this if using pressure + * control, but we really would like to have accurate pressures + * printed out. + * Think about ways around this in the future? + * For now, keep this choice in comments. + */ + /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */ + /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/ + bPres = TRUE; + bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK)); + if (bCalcEner && ir->eI == eiVVAK) + { + bSumEkinhOld = TRUE; + } + /* for vv, the first half of the integration actually corresponds to the previous step. + So we need information from the last step in the first half of the integration */ + if (bGStat || do_per_step(step-1, nstglobalcomm)) + { + wallcycle_stop(wcycle, ewcUPDATE); + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + (bGStat ? CGLO_GSTAT : 0) + | (bCalcEner ? CGLO_ENERGY : 0) + | (bTemp ? CGLO_TEMPERATURE : 0) + | (bPres ? CGLO_PRESSURE : 0) + | (bPres ? CGLO_CONSTRAINT : 0) + | (bStopCM ? CGLO_STOPCM : 0) + | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0) + | CGLO_SCALEEKIN + ); + /* explanation of above: + a) We compute Ekin at the full time step + if 1) we are using the AveVel Ekin, and it's not the + initial step, or 2) if we are using AveEkin, but need the full + time step kinetic energy for the pressure (always true now, since we want accurate statistics). 
+ b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in + EkinAveVel because it's needed for the pressure */ + checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + wallcycle_start(wcycle, ewcUPDATE); + } + /* temperature scaling and pressure scaling to produce the extended variables at t+dt */ + if (!bInitStep) + { + if (bTrotter) + { + m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2); + + /* TODO This is only needed when we're about to write + * a checkpoint, because we use it after the restart + * (in a kludge?). But what should we be doing if + * startingFromCheckpoint or bInitStep are true? */ + if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)) + { + copy_mat(shake_vir, state->svir_prev); + copy_mat(force_vir, state->fvir_prev); + } + if (inputrecNvtTrotter(ir) && ir->eI == eiVV) + { + /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */ + enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE); + enerd->term[F_EKIN] = trace(ekind->ekin); + } + } + else if (bExchanged) + { + wallcycle_stop(wcycle, ewcUPDATE); + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . */ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot, + constr, &nullSignaller, state->box, + nullptr, &bSumEkinhOld, + CGLO_GSTAT | CGLO_TEMPERATURE); + wallcycle_start(wcycle, ewcUPDATE); + } + } + /* if it's the initial step, we performed this first step just to get the constraint virial */ + if (ir->eI == eiVV && bInitStep) + { + copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms); + sfree(vbuf); + } + wallcycle_stop(wcycle, ewcUPDATE); + } + + /* compute the conserved quantity */ + if (EI_VV(ir->eI)) + { + saved_conserved_quantity = NPT_energy(ir, state, &MassQ); + if (ir->eI == eiVV) + { + last_ekin = enerd->term[F_EKIN]; + } + if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) + { + saved_conserved_quantity -= enerd->term[F_DISPCORR]; + } + /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */ + if (ir->efep != efepNO) + { + sum_dhdl(enerd, state->lambda, ir->fepvals); + } + } + + /* ######## END FIRST UPDATE STEP ############## */ + /* ######## If doing VV, we now have v(dt) ###### */ + if (bDoExpanded) + { + /* perform extended ensemble sampling in lambda - we don't + actually move to the new state before outputting + statistics, but if performing simulated tempering, we + do update the velocities and the tau_t. */ + + lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, state->dfhist, step, state->v.rvec_array(), mdatoms); + /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */ + if (MASTER(cr)) + { + copy_df_history(state_global->dfhist, state->dfhist); + } + } + + /* Now we have the energies and forces corresponding to the + * coordinates at time t. We must output all of this before + * the update. 
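+ * (do_md_trajectory_writing below also handles checkpoint writing; that is
+ * why the AWH history was already synchronized before the force calculation
+ * above.)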
+ */ + do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, + ir, state, state_global, observablesHistory, + top_global, fr, + outf, mdebin, ekind, f, + checkpointHandler->isCheckpointingStep(), + bRerunMD, bLastStep, + mdrunOptions.writeConfout, + bSumEkinhOld); + /* Check if IMD step and do IMD communication, if bIMD is TRUE. */ + bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, state->x.rvec_array(), ir, t, wcycle); + + /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */ + if (startingFromCheckpoint && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))) + { + copy_mat(state->svir_prev, shake_vir); + copy_mat(state->fvir_prev, force_vir); + } + + stopHandler->setSignal(); + resetHandler->setSignal(walltime_accounting); + + if (bGStat || !PAR(cr)) + { + /* In parallel we only have to check for checkpointing in steps + * where we do global communication, + * otherwise the other nodes don't know. + */ + checkpointHandler->setSignal(walltime_accounting); + } + + /* ######### START SECOND UPDATE STEP ################# */ + + /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen controlled + in preprocessing */ + + if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */ + { + gmx_bool bIfRandomize; + bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, upd, constr); + /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */ + if (constr && bIfRandomize) + { + constrain_velocities(step, nullptr, + state, + tmp_vir, + constr, + bCalcVir, do_log, do_ene); + } + } + /* Box is changed in update() when we do pressure coupling, + * but we should still use the old box for energy corrections and when + * writing it to the energy file, so it matches the trajectory files for + * the same timestep above. Make a copy in a separate array. + */ + copy_mat(state->box, lastbox); + + dvdl_constr = 0; + + wallcycle_start(wcycle, ewcUPDATE); + /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */ + if (bTrotter) + { + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3); + /* We can only do Berendsen coupling after we have summed + * the kinetic energy or virial. Since the happens + * in global_state after update, we should only do it at + * step % nstlist = 1 with bGStatEveryStep=FALSE. + */ + } + else + { + update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); + update_pcouple_before_coordinates(fplog, step, ir, state, + parrinellorahmanMu, M, + bInitStep); + } + + if (EI_VV(ir->eI)) + { + /* velocity half-step update */ + update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, + ekind, M, upd, etrtVELOCITY2, + cr, constr); + } + + /* Above, initialize just copies ekinh into ekin, + * it doesn't copy position (for VV), + * and entire integrator for MD. 
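+ * (For eiVVAK the positions are therefore buffered in cbuf below, so that
+ * the position update can be repeated once the Trotter scaling is known.)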
+ */ + + if (ir->eI == eiVVAK) + { + /* We probably only need md->homenr, not state->natoms */ + if (state->natoms > cbuf_nalloc) + { + cbuf_nalloc = state->natoms; + srenew(cbuf, cbuf_nalloc); + } + copy_rvecn(as_rvec_array(state->x.data()), cbuf, 0, state->natoms); + } + + update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, + ekind, M, upd, etrtPOSITION, cr, constr); + wallcycle_stop(wcycle, ewcUPDATE); + + constrain_coordinates(step, &dvdl_constr, state, + shake_vir, + upd, constr, + bCalcVir, do_log, do_ene); + update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state, + cr, nrnb, wcycle, upd, constr, do_log, do_ene); + finish_update(ir, mdatoms, + state, graph, + nrnb, wcycle, upd, constr); + + if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM) + { + updatePrevStepPullCom(ir->pull_work, state); + } + + if (ir->eI == eiVVAK) + { + /* erase F_EKIN and F_TEMP here? */ + /* just compute the kinetic energy at the half step to perform a trotter step */ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, lastbox, + nullptr, &bSumEkinhOld, + (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE + ); + wallcycle_start(wcycle, ewcUPDATE); + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4); + /* now we know the scaling, we can compute the positions again again */ + copy_rvecn(cbuf, as_rvec_array(state->x.data()), 0, state->natoms); + + update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, + ekind, M, upd, etrtPOSITION, cr, constr); + wallcycle_stop(wcycle, ewcUPDATE); + + /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */ + /* are the small terms in the shake_vir here due + * to numerical errors, or are they important + * physically? I'm thinking they are just errors, but not completely sure. + * For now, will call without actually constraining, constr=NULL*/ + finish_update(ir, mdatoms, + state, graph, + nrnb, wcycle, upd, nullptr); + } + if (EI_VV(ir->eI)) + { + /* this factor or 2 correction is necessary + because half of the constraint force is removed + in the vv step, so we have to double it. See + the Redmine issue #1255. It is not yet clear + if the factor of 2 is exact, or just a very + good approximation, and this will be + investigated. The next step is to see if this + can be done adding a dhdl contribution from the + rattle step, but this is somewhat more + complicated with the current code. Will be + investigated, hopefully for 4.6.3. However, + this current solution is much better than + having it completely wrong. + */ + enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr; + } + else + { + enerd->term[F_DVDL_CONSTR] += dvdl_constr; + } + + if (vsite != nullptr) + { + wallcycle_start(wcycle, ewcVSITECONSTR); + if (graph != nullptr) + { + shift_self(graph, state->box, state->x.rvec_array()); + } + construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(), + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state->box); + + if (graph != nullptr) + { + unshift_self(graph, state->box, state->x.rvec_array()); + } + wallcycle_stop(wcycle, ewcVSITECONSTR); + } + + /* ############## IF NOT VV, Calculate globals HERE ############ */ + /* With Leap-Frog we can skip compute_globals at + * non-communication steps, but we need to calculate + * the kinetic energy one step before communication. 
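+ * (For example, with nstglobalcomm = 100 the do_per_step(step+1, ...) test
+ * below also triggers at steps 99, 199, ..., so the averaged kinetic energy
+ * is available when the communication steps 100, 200, ... arrive.)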
+ */ + { + // Organize to do inter-simulation signalling on steps if + // and when algorithms require it. + bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm)); + + if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal) + { + // Since we're already communicating at this step, we + // can propagate intra-simulation signals. Note that + // check_nstglobalcomm has the responsibility for + // choosing the value of nstglobalcomm that is one way + // bGStat becomes true, so we can't get into a + // situation where e.g. checkpointing can't be + // signalled. + bool doIntraSimSignal = true; + SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal); + + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &signaller, + lastbox, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + (bGStat ? CGLO_GSTAT : 0) + | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0) + | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0) + | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) + | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) + | CGLO_CONSTRAINT + | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0) + ); + checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + } + } + + /* ############# END CALC EKIN AND PRESSURE ################# */ + + /* Note: this is OK, but there are some numerical precision issues with using the convergence of + the virial that should probably be addressed eventually. state->veta has better properies, + but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could + generate the new shake_vir, but test the veta value for convergence. This will take some thought. */ + + if (ir->efep != efepNO && !EI_VV(ir->eI)) + { + /* Sum up the foreign energy and dhdl terms for md and sd. + Currently done every step so that dhdl is correct in the .edr */ + sum_dhdl(enerd, state->lambda, ir->fepvals); + } + + update_pcouple_after_coordinates(fplog, step, ir, mdatoms, + pres, force_vir, shake_vir, + parrinellorahmanMu, + state, nrnb, upd); + + /* ################# END UPDATE STEP 2 ################# */ + /* #### We now have r(t+dt) and v(t+dt/2) ############# */ + + /* The coordinates (x) were unshifted in update */ + if (!bGStat) + { + /* We will not sum ekinh_old, + * so signal that we still have to do it. + */ + bSumEkinhOld = TRUE; + } + + if (bCalcEner) + { + /* ######### BEGIN PREPARING EDR OUTPUT ########### */ + + /* use the directly determined last velocity, not actually the averaged half steps */ + if (bTrotter && ir->eI == eiVV) + { + enerd->term[F_EKIN] = last_ekin; + } + enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN]; + + if (integratorHasConservedEnergyQuantity(ir)) + { + if (EI_VV(ir->eI)) + { + enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity; + } + else + { + enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ); + } + } + /* ######### END PREPARING EDR OUTPUT ########### */ + } + + /* Output stuff */ + if (MASTER(cr)) + { + if (fplog && do_log && bDoExpanded) + { + /* only needed if doing expanded ensemble */ + PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? 
ir->simtempvals : nullptr, + state_global->dfhist, state->fep_state, ir->nstlog, step); + } + if (bCalcEner) + { + upd_mdebin(mdebin, bDoDHDL, bCalcEnerStep, + t, mdatoms->tmass, enerd, state, + ir->fepvals, ir->expandedvals, lastbox, + shake_vir, force_vir, total_vir, pres, + ekind, mu_tot, constr); + } + else + { + upd_mdebin_step(mdebin); + } + + gmx_bool do_dr = do_per_step(step, ir->nstdisreout); + gmx_bool do_or = do_per_step(step, ir->nstorireout); + + print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : nullptr, + step, t, + eprNORMAL, mdebin, fcd, groups, &(ir->opts), awh.get()); + + if (ir->bPull) + { + pull_print_output(ir->pull_work, step, t); + } + + if (do_per_step(step, ir->nstlog)) + { + if (fflush(fplog) != 0) + { + gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?"); + } + } + } + if (bDoExpanded) + { + /* Have to do this part _after_ outputting the logfile and the edr file */ + /* Gets written into the state at the beginning of next loop*/ + state->fep_state = lamnew; + } + /* Print the remaining wall clock time for the run */ + if (isMasterSimMasterRank(ms, cr) && + (do_verbose || gmx_got_usr_signal()) && + !bPMETunePrinting) + { + if (shellfc) + { + fprintf(stderr, "\n"); + } + print_time(stderr, walltime_accounting, step, ir, cr); + } + + /* Ion/water position swapping. + * Not done in last step since trajectory writing happens before this call + * in the MD loop and exchanges would be lost anyway. */ + bNeedRepartition = FALSE; + if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && + do_per_step(step, ir->swap->nstswap)) + { + bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle, + as_rvec_array(state->x.data()), + state->box, + MASTER(cr) && mdrunOptions.verbose, + bRerunMD); + + if (bNeedRepartition && DOMAINDECOMP(cr)) + { + dd_collect_state(cr->dd, state, state_global); + } + } + + /* Replica exchange */ + bExchanged = FALSE; + if (bDoReplEx) + { + bExchanged = replica_exchange(fplog, cr, ms, repl_ex, + state_global, enerd, + state, step, t); + } + + if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) ) + { + dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, wcycle, FALSE); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + } + + bFirstStep = FALSE; + bInitStep = FALSE; + startingFromCheckpoint = false; + + /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */ + /* With all integrators, except VV, we need to retain the pressure + * at the current step for coupling at the next step. + */ + if ((state->flags & (1<<estPRES_PREV)) && + (bGStatEveryStep || + (ir->nstpcouple > 0 && step % ir->nstpcouple == 0))) + { + /* Store the pressure in t_state for pressure coupling + * at the next MD step. + */ + copy_mat(pres, state->pres_prev); + } + + /* ####### END SET VARIABLES FOR NEXT ITERATION ###### */ + + if ( (membed != nullptr) && (!bLastStep) ) + { + rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data())); + } + + cycles = wallcycle_stop(wcycle, ewcSTEP); + if (DOMAINDECOMP(cr) && wcycle) + { + dd_cycles_add(cr->dd, cycles, ddCyclStep); + } + + /* increase the MD step number */ + step++; + step_rel++; + + resetHandler->resetCounters( + step, step_rel, mdlog, fplog, cr, (use_GPU(fr->nbv) ? 
fr->nbv : nullptr), + nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting); + + /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */ + IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle); + + } + /* End of main MD loop */ + + /* Closing TNG files can include compressing data. Therefore it is good to do that + * before stopping the time measurements. */ + mdoutf_tng_close(outf); + + /* Stop measuring walltime */ + walltime_accounting_end_time(walltime_accounting); + + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + if (MASTER(cr)) + { + if (ir->nstcalcenergy > 0) + { + print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t, + eprAVER, mdebin, fcd, groups, &(ir->opts), awh.get()); + } + } + done_mdebin(mdebin); + done_mdoutf(outf); + + if (bPMETune) + { + pme_loadbal_done(pme_loadbal, fplog, mdlog, use_GPU(fr->nbv)); + } + + done_shellfc(fplog, shellfc, step_rel); + + if (useReplicaExchange && MASTER(cr)) + { + print_replica_exchange_statistics(fplog, repl_ex); + } + + // Clean up swapcoords + if (ir->eSwapCoords != eswapNO) + { + finish_swapcoords(ir->swap); + } + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(ir->bIMD, ir->imd); + + walltime_accounting_set_nsteps_done(walltime_accounting, step_rel); + + destroy_enerdata(enerd); + sfree(enerd); + sfree(top); +} diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/minimize.cpp b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/minimize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fb913acdf7fda216bd50c6a63cb04f973064344 --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/minimize.cpp @@ -0,0 +1,3063 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. 
+ * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief This file defines integrators for energy minimization + * + * \author Berk Hess <hess@kth.se> + * \author Erik Lindahl <erik@kth.se> + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "config.h" + +#include <cmath> +#include <cstring> +#include <ctime> + +#include <algorithm> +#include <vector> + +#include "gromacs/commandline/filenm.h" +#include "gromacs/domdec/collect.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/domdec/partition.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/fileio/confio.h" +#include "gromacs/fileio/mtxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/imd/imd.h" +#include "gromacs/linearalgebra/sparsematrix.h" +#include "gromacs/listed-forces/manage-threading.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdebin.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/mdsetup.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/shellfc.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/tgroup.h" +#include "gromacs/mdlib/trajectory_writing.h" +#include "gromacs/mdlib/update.h" +#include "gromacs/mdlib/vsite.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/timing/walltime_accounting.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/topology/topology.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/smalloc.h" + +#include "integrator.h" + +/* PLUMED */ +#include "../../../Plumed.h" +extern int plumedswitch; +extern plumed plumedmain; +/* END PLUMED */ + +//! Utility structure for manipulating states during EM +typedef struct { + //! Copy of the global state + t_state s; + //! Force array + PaddedVector<gmx::RVec> f; + //! Potential energy + real epot; + //! Norm of the force + real fnorm; + //! Maximum force + real fmax; + //! Direction + int a_fmax; +} em_state_t; + +//! Print the EM starting conditions +static void print_em_start(FILE *fplog, + const t_commrec *cr, + gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle, + const char *name) +{ + walltime_accounting_start_time(walltime_accounting); + wallcycle_start(wcycle, ewcRUN); + print_start(fplog, cr, walltime_accounting, name); +} + +//! Stop counting time for EM +static void em_time_end(gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle) +{ + wallcycle_stop(wcycle, ewcRUN); + + walltime_accounting_end_time(walltime_accounting); +} + +//! 
Printing a log file and console header +static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps) +{ + fprintf(out, "\n"); + fprintf(out, "%s:\n", minimizer); + fprintf(out, " Tolerance (Fmax) = %12.5e\n", ftol); + fprintf(out, " Number of steps = %12d\n", nsteps); +} + +//! Print warning message +static void warn_step(FILE *fp, + real ftol, + real fmax, + gmx_bool bLastStep, + gmx_bool bConstrain) +{ + constexpr bool realIsDouble = GMX_DOUBLE; + char buffer[2048]; + + if (!std::isfinite(fmax)) + { + sprintf(buffer, + "\nEnergy minimization has stopped because the force " + "on at least one atom is not finite. This usually means " + "atoms are overlapping. Modify the input coordinates to " + "remove atom overlap or use soft-core potentials with " + "the free energy code to avoid infinite forces.\n%s", + !realIsDouble ? + "You could also be lucky that switching to double precision " + "is sufficient to obtain finite forces.\n" : + ""); + } + else if (bLastStep) + { + sprintf(buffer, + "\nEnergy minimization reached the maximum number " + "of steps before the forces reached the requested " + "precision Fmax < %g.\n", ftol); + } + else + { + sprintf(buffer, + "\nEnergy minimization has stopped, but the forces have " + "not converged to the requested precision Fmax < %g (which " + "may not be possible for your system). It stopped " + "because the algorithm tried to make a new step whose size " + "was too small, or there was no change in the energy since " + "last step. Either way, we regard the minimization as " + "converged to within the available machine precision, " + "given your starting configuration and EM parameters.\n%s%s", + ftol, + !realIsDouble ? + "\nDouble precision normally gives you higher accuracy, but " + "this is often not needed for preparing to run molecular " + "dynamics.\n" : + "", + bConstrain ? + "You might need to increase your constraint accuracy, or turn\n" + "off constraints altogether (set constraints = none in mdp file)\n" : + ""); + } + + fputs(wrap_lines(buffer, 78, 0, FALSE), stderr); + fputs(wrap_lines(buffer, 78, 0, FALSE), fp); +} + +//! Print message about convergence of the EM +static void print_converged(FILE *fp, const char *alg, real ftol, + int64_t count, gmx_bool bDone, int64_t nsteps, + const em_state_t *ems, double sqrtNumAtoms) +{ + char buf[STEPSTRSIZE]; + + if (bDone) + { + fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", + alg, ftol, gmx_step_str(count, buf)); + } + else if (count < nsteps) + { + fprintf(fp, "\n%s converged to machine precision in %s steps,\n" + "but did not reach the requested Fmax < %g.\n", + alg, gmx_step_str(count, buf), ftol); + } + else + { + fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", + alg, ftol, gmx_step_str(count, buf)); + } + +#if GMX_DOUBLE + fprintf(fp, "Potential Energy = %21.14e\n", ems->epot); + fprintf(fp, "Maximum force = %21.14e on atom %d\n", ems->fmax, ems->a_fmax + 1); + fprintf(fp, "Norm of force = %21.14e\n", ems->fnorm/sqrtNumAtoms); +#else + fprintf(fp, "Potential Energy = %14.7e\n", ems->epot); + fprintf(fp, "Maximum force = %14.7e on atom %d\n", ems->fmax, ems->a_fmax + 1); + fprintf(fp, "Norm of force = %14.7e\n", ems->fnorm/sqrtNumAtoms); +#endif +} + +//! 
Compute the norm and max of the force array in parallel +static void get_f_norm_max(const t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, const rvec *f, + real *fnorm, real *fmax, int *a_fmax) +{ + double fnorm2, *sum; + real fmax2, fam; + int la_max, a_max, start, end, i, m, gf; + + /* This routine finds the largest force and returns it. + * On parallel machines the global max is taken. + */ + fnorm2 = 0; + fmax2 = 0; + la_max = -1; + start = 0; + end = mdatoms->homenr; + if (mdatoms->cFREEZE) + { + for (i = start; i < end; i++) + { + gf = mdatoms->cFREEZE[i]; + fam = 0; + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + fam += gmx::square(f[i][m]); + } + } + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + else + { + for (i = start; i < end; i++) + { + fam = norm2(f[i]); + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + + if (la_max >= 0 && DOMAINDECOMP(cr)) + { + a_max = cr->dd->globalAtomIndices[la_max]; + } + else + { + a_max = la_max; + } + if (PAR(cr)) + { + snew(sum, 2*cr->nnodes+1); + sum[2*cr->nodeid] = fmax2; + sum[2*cr->nodeid+1] = a_max; + sum[2*cr->nnodes] = fnorm2; + gmx_sumd(2*cr->nnodes+1, sum, cr); + fnorm2 = sum[2*cr->nnodes]; + /* Determine the global maximum */ + for (i = 0; i < cr->nnodes; i++) + { + if (sum[2*i] > fmax2) + { + fmax2 = sum[2*i]; + a_max = gmx::roundToInt(sum[2*i+1]); + } + } + sfree(sum); + } + + if (fnorm) + { + *fnorm = sqrt(fnorm2); + } + if (fmax) + { + *fmax = sqrt(fmax2); + } + if (a_fmax) + { + *a_fmax = a_max; + } +} + +//! Compute the norm of the force +static void get_state_f_norm_max(const t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, + em_state_t *ems) +{ + get_f_norm_max(cr, opts, mdatoms, ems->f.rvec_array(), + &ems->fnorm, &ems->fmax, &ems->a_fmax); +} + +//! Initialize the energy minimization +static void init_em(FILE *fplog, + const gmx::MDLogger &mdlog, + const char *title, + const t_commrec *cr, + const gmx_multisim_t *ms, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *ir, + const MdrunOptions &mdrunOptions, + t_state *state_global, gmx_mtop_t *top_global, + em_state_t *ems, gmx_localtop_t **top, + t_nrnb *nrnb, rvec mu_tot, + t_forcerec *fr, gmx_enerdata_t **enerd, + t_graph **graph, gmx::MDAtoms *mdAtoms, gmx_global_stat_t *gstat, + gmx_vsite_t *vsite, gmx::Constraints *constr, gmx_shellfc_t **shellfc, + int nfile, const t_filenm fnm[], + gmx_mdoutf_t *outf, t_mdebin **mdebin, + gmx_wallcycle_t wcycle) +{ + real dvdl_constr; + + if (fplog) + { + fprintf(fplog, "Initiating %s\n", title); + } + + if (MASTER(cr)) + { + state_global->ngtc = 0; + + /* Initialize lambda variables */ + initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, nullptr); + } + + init_nrnb(nrnb); + + /* Interactive molecular dynamics */ + init_IMD(ir, cr, ms, top_global, fplog, 1, + MASTER(cr) ? state_global->x.rvec_array() : nullptr, + nfile, fnm, nullptr, mdrunOptions); + + if (ir->eI == eiNM) + { + GMX_ASSERT(shellfc != nullptr, "With NM we always support shells"); + + *shellfc = init_shell_flexcon(stdout, + top_global, + constr ? constr->numFlexibleConstraints() : 0, + ir->nstcalcenergy, + DOMAINDECOMP(cr)); + } + else + { + GMX_ASSERT(EI_ENERGY_MINIMIZATION(ir->eI), "This else currently only handles energy minimizers, consider if your algorithm needs shell/flexible-constraint support"); + + /* With energy minimization, shells and flexible constraints are + * automatically minimized when treated like normal DOFS. 
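+ * (DOFS = degrees of freedom; no separate shell-relaxation loop is needed,
+ * which is why *shellfc is cleared to nullptr just below.)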
+ */ + if (shellfc != nullptr) + { + *shellfc = nullptr; + } + } + + auto mdatoms = mdAtoms->mdatoms(); + if (DOMAINDECOMP(cr)) + { + *top = dd_init_local_top(top_global); + + dd_init_local_state(cr->dd, state_global, &ems->s); + + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + &ems->s, &ems->f, mdAtoms, *top, + fr, vsite, constr, + nrnb, nullptr, FALSE); + dd_store_state(cr->dd, &ems->s); + + *graph = nullptr; + } + else + { + state_change_natoms(state_global, state_global->natoms); + /* Just copy the state */ + ems->s = *state_global; + state_change_natoms(&ems->s, ems->s.natoms); + ems->f.resizeWithPadding(ems->s.natoms); + + snew(*top, 1); + mdAlgorithmsSetupAtomData(cr, ir, top_global, *top, fr, + graph, mdAtoms, + constr, vsite, shellfc ? *shellfc : nullptr); + + if (vsite) + { + set_vsite_top(vsite, *top, mdatoms); + } + } + + update_mdatoms(mdAtoms->mdatoms(), ems->s.lambda[efptMASS]); + + if (constr) + { + // TODO how should this cross-module support dependency be managed? + if (ir->eConstrAlg == econtSHAKE && + gmx_mtop_ftype_count(top_global, F_CONSTR) > 0) + { + gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n", + econstr_names[econtSHAKE], econstr_names[econtLINCS]); + } + + if (!ir->bContinuation) + { + /* Constrain the starting coordinates */ + dvdl_constr = 0; + constr->apply(TRUE, TRUE, + -1, 0, 1.0, + ems->s.x.rvec_array(), + ems->s.x.rvec_array(), + nullptr, + ems->s.box, + ems->s.lambda[efptFEP], &dvdl_constr, + nullptr, nullptr, gmx::ConstraintVariable::Positions); + } + } + + if (PAR(cr)) + { + *gstat = global_stat_init(ir); + } + else + { + *gstat = nullptr; + } + + *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, top_global, nullptr, wcycle); + + snew(*enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + *enerd); + + if (mdebin != nullptr) + { + /* Init bin for energy stuff */ + *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, nullptr); + } + + clear_rvec(mu_tot); + calc_shifts(ems->s.box, fr->shift_vec); + + /* PLUMED */ + if(plumedswitch){ + if(ms && ms->nsim>1) { + if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&ms->mpi_comm_masters); + if(PAR(cr)){ + if(DOMAINDECOMP(cr)) { + plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all); + }else{ + plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); + } + } + plumed_cmd(plumedmain,"GREX init",NULL); + } + if(PAR(cr)){ + if(DOMAINDECOMP(cr)) { + plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); + }else{ + plumed_cmd(plumedmain,"setMPIComm",&cr->mpi_comm_mysim); + } + } + plumed_cmd(plumedmain,"setNatoms",&top_global->natoms); + plumed_cmd(plumedmain,"setMDEngine","gromacs"); + plumed_cmd(plumedmain,"setLog",fplog); + real real_delta_t; + real_delta_t=ir->delta_t; + plumed_cmd(plumedmain,"setTimestep",&real_delta_t); + plumed_cmd(plumedmain,"init",NULL); + + if(PAR(cr)){ + if(DOMAINDECOMP(cr)) { + int nat_home = dd_numHomeAtoms(*cr->dd); + plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home); + plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data()); + + } + } + } + /* END PLUMED */ +} + +//! 
Finalize the minimization +static void finish_em(const t_commrec *cr, gmx_mdoutf_t outf, + gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle) +{ + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + done_mdoutf(outf); + + em_time_end(walltime_accounting, wcycle); +} + +//! Swap two different EM states during minimization +static void swap_em_state(em_state_t **ems1, em_state_t **ems2) +{ + em_state_t *tmp; + + tmp = *ems1; + *ems1 = *ems2; + *ems2 = tmp; +} + +//! Save the EM trajectory +static void write_em_traj(FILE *fplog, const t_commrec *cr, + gmx_mdoutf_t outf, + gmx_bool bX, gmx_bool bF, const char *confout, + gmx_mtop_t *top_global, + t_inputrec *ir, int64_t step, + em_state_t *state, + t_state *state_global, + ObservablesHistory *observablesHistory) +{ + int mdof_flags = 0; + + if (bX) + { + mdof_flags |= MDOF_X; + } + if (bF) + { + mdof_flags |= MDOF_F; + } + + /* If we want IMD output, set appropriate MDOF flag */ + if (ir->bIMD) + { + mdof_flags |= MDOF_IMD; + } + + mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, + top_global, step, static_cast<double>(step), + &state->s, state_global, observablesHistory, + state->f); + + if (confout != nullptr) + { + if (DOMAINDECOMP(cr)) + { + /* If bX=true, x was collected to state_global in the call above */ + if (!bX) + { + gmx::ArrayRef<gmx::RVec> globalXRef = MASTER(cr) ? makeArrayRef(state_global->x) : gmx::EmptyArrayRef(); + dd_collect_vec(cr->dd, &state->s, makeArrayRef(state->s.x), globalXRef); + } + } + else + { + /* Copy the local state pointer */ + state_global = &state->s; + } + + if (MASTER(cr)) + { + if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr)) + { + /* Make molecules whole only for confout writing */ + do_pbc_mtop(fplog, ir->ePBC, state->s.box, top_global, + state_global->x.rvec_array()); + } + + write_sto_conf_mtop(confout, + *top_global->name, top_global, + state_global->x.rvec_array(), nullptr, ir->ePBC, state->s.box); + } + } +} + +//! 
\brief Do one minimization step +// +// \returns true when the step succeeded, false when a constraint error occurred +static bool do_em_step(const t_commrec *cr, + t_inputrec *ir, t_mdatoms *md, + em_state_t *ems1, real a, const PaddedVector<gmx::RVec> *force, + em_state_t *ems2, + gmx::Constraints *constr, + int64_t count) + +{ + t_state *s1, *s2; + int start, end; + real dvdl_constr; + int nthreads gmx_unused; + + bool validStep = true; + + s1 = &ems1->s; + s2 = &ems2->s; + + if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) + { + gmx_incons("state mismatch in do_em_step"); + } + + s2->flags = s1->flags; + + if (s2->natoms != s1->natoms) + { + state_change_natoms(s2, s1->natoms); + ems2->f.resizeWithPadding(s2->natoms); + } + if (DOMAINDECOMP(cr) && s2->cg_gl.size() != s1->cg_gl.size()) + { + s2->cg_gl.resize(s1->cg_gl.size()); + } + + copy_mat(s1->box, s2->box); + /* Copy free energy state */ + s2->lambda = s1->lambda; + copy_mat(s1->box, s2->box); + + start = 0; + end = md->homenr; + + nthreads = gmx_omp_nthreads_get(emntUpdate); +#pragma omp parallel num_threads(nthreads) + { + const rvec *x1 = s1->x.rvec_array(); + rvec *x2 = s2->x.rvec_array(); + const rvec *f = force->rvec_array(); + + int gf = 0; +#pragma omp for schedule(static) nowait + for (int i = start; i < end; i++) + { + try + { + if (md->cFREEZE) + { + gf = md->cFREEZE[i]; + } + for (int m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[gf][m]) + { + x2[i][m] = x1[i][m]; + } + else + { + x2[i][m] = x1[i][m] + a*f[i][m]; + } + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + + if (s2->flags & (1<<estCGP)) + { + /* Copy the CG p vector */ + const rvec *p1 = s1->cg_p.rvec_array(); + rvec *p2 = s2->cg_p.rvec_array(); +#pragma omp for schedule(static) nowait + for (int i = start; i < end; i++) + { + // Trivial OpenMP block that does not throw + copy_rvec(p1[i], p2[i]); + } + } + + if (DOMAINDECOMP(cr)) + { + s2->ddp_count = s1->ddp_count; + + /* OpenMP does not supported unsigned loop variables */ +#pragma omp for schedule(static) nowait + for (int i = 0; i < static_cast<int>(s2->cg_gl.size()); i++) + { + s2->cg_gl[i] = s1->cg_gl[i]; + } + s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; + } + } + + if (constr) + { + dvdl_constr = 0; + validStep = + constr->apply(TRUE, TRUE, + count, 0, 1.0, + s1->x.rvec_array(), s2->x.rvec_array(), + nullptr, s2->box, + s2->lambda[efptBONDED], &dvdl_constr, + nullptr, nullptr, gmx::ConstraintVariable::Positions); + + if (cr->nnodes > 1) + { + /* This global reduction will affect performance at high + * parallelization, but we can not really avoid it. + * But usually EM is not run at high parallelization. + */ + int reductionBuffer = static_cast<int>(!validStep); + gmx_sumi(1, &reductionBuffer, cr); + validStep = (reductionBuffer == 0); + } + + // We should move this check to the different minimizers + if (!validStep && ir->eI != eiSteep) + { + gmx_fatal(FARGS, "The coordinates could not be constrained. Minimizer '%s' can not handle constraint failures, use minimizer '%s' before using '%s'.", + EI(ir->eI), EI(eiSteep), EI(ir->eI)); + } + } + + return validStep; +} + +//! 
Prepare EM for using domain decomposition parallellization +static void em_dd_partition_system(FILE *fplog, + const gmx::MDLogger &mdlog, + int step, const t_commrec *cr, + gmx_mtop_t *top_global, t_inputrec *ir, + em_state_t *ems, gmx_localtop_t *top, + gmx::MDAtoms *mdAtoms, t_forcerec *fr, + gmx_vsite_t *vsite, gmx::Constraints *constr, + t_nrnb *nrnb, gmx_wallcycle_t wcycle) +{ + /* Repartition the domain decomposition */ + dd_partition_system(fplog, mdlog, step, cr, FALSE, 1, + nullptr, top_global, ir, + &ems->s, &ems->f, + mdAtoms, top, fr, vsite, constr, + nrnb, wcycle, FALSE); + dd_store_state(cr->dd, &ems->s); +} + +namespace +{ + +/*! \brief Class to handle the work of setting and doing an energy evaluation. + * + * This class is a mere aggregate of parameters to pass to evaluate an + * energy, so that future changes to names and types of them consume + * less time when refactoring other code. + * + * Aggregate initialization is used, for which the chief risk is that + * if a member is added at the end and not all initializer lists are + * updated, then the member will be value initialized, which will + * typically mean initialization to zero. + * + * We only want to construct one of these with an initializer list, so + * we explicitly delete the default constructor. */ +class EnergyEvaluator +{ + public: + //! We only intend to construct such objects with an initializer list. +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9) + // Aspects of the C++11 spec changed after GCC 4.8.5, and + // compilation of the initializer list construction in + // runner.cpp fails in GCC 4.8.5. + EnergyEvaluator() = delete; +#endif + /*! \brief Evaluates an energy on the state in \c ems. + * + * \todo In practice, the same objects mu_tot, vir, and pres + * are always passed to this function, so we would rather have + * them as data members. However, their C-array types are + * unsuited for aggregate initialization. When the types + * improve, the call signature of this method can be reduced. + */ + void run(em_state_t *ems, rvec mu_tot, + tensor vir, tensor pres, + int64_t count, gmx_bool bFirst); + //! Handles logging (deprecated). + FILE *fplog; + //! Handles logging. + const gmx::MDLogger &mdlog; + //! Handles communication. + const t_commrec *cr; + //! Coordinates multi-simulations. + const gmx_multisim_t *ms; + //! Holds the simulation topology. + gmx_mtop_t *top_global; + //! Holds the domain topology. + gmx_localtop_t *top; + //! User input options. + t_inputrec *inputrec; + //! Manages flop accounting. + t_nrnb *nrnb; + //! Manages wall cycle accounting. + gmx_wallcycle_t wcycle; + //! Coordinates global reduction. + gmx_global_stat_t gstat; + //! Handles virtual sites. + gmx_vsite_t *vsite; + //! Handles constraints. + gmx::Constraints *constr; + //! Handles strange things. + t_fcdata *fcd; + //! Molecular graph for SHAKE. + t_graph *graph; + //! Per-atom data for this domain. + gmx::MDAtoms *mdAtoms; + //! Handles how to calculate the forces. + t_forcerec *fr; + //! Schedule of force-calculation work each step for this task. + gmx::PpForceWorkload *ppForceWorkload; + //! Stores the computed energies. 
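+ //! (Filled in by do_force() in run() and then adjusted by the dispersion
+ //! and constraint corrections before the force norms are evaluated.)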
+ gmx_enerdata_t *enerd; +}; + +void +EnergyEvaluator::run(em_state_t *ems, rvec mu_tot, + tensor vir, tensor pres, + int64_t count, gmx_bool bFirst) +{ + real t; + gmx_bool bNS; + tensor force_vir, shake_vir, ekin; + real dvdl_constr, prescorr, enercorr, dvdlcorr; + real terminate = 0; + + /* Set the time to the initial time, the time does not change during EM */ + t = inputrec->init_t; + + if (bFirst || + (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) + { + /* This is the first state or an old state used before the last ns */ + bNS = TRUE; + } + else + { + bNS = FALSE; + if (inputrec->nstlist > 0) + { + bNS = TRUE; + } + } + + if (vsite) + { + construct_vsites(vsite, ems->s.x.rvec_array(), 1, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, ems->s.box); + } + + if (DOMAINDECOMP(cr) && bNS) + { + /* Repartition the domain decomposition */ + em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec, + ems, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Calc force & energy on new trial position */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + /* PLUMED */ + int plumedNeedsEnergy=0; + matrix plumed_vir; + if(plumedswitch){ + long int lstep=count; plumed_cmd(plumedmain,"setStepLong",&lstep); + plumed_cmd(plumedmain,"setPositions",&ems->s.x[0][0]); + plumed_cmd(plumedmain,"setMasses",&mdAtoms->mdatoms()->massT[0]); + plumed_cmd(plumedmain,"setCharges",&mdAtoms->mdatoms()->chargeA[0]); + plumed_cmd(plumedmain,"setBox",&ems->s.box[0][0]); + plumed_cmd(plumedmain,"prepareCalc",NULL); + plumed_cmd(plumedmain,"setForces",&ems->f[0][0]); + plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); + clear_mat(plumed_vir); + plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]); + } + /* END PLUMED */ + + do_force(fplog, cr, ms, inputrec, nullptr, nullptr, + count, nrnb, wcycle, top, &top_global->groups, + ems->s.box, ems->s.x.arrayRefWithPadding(), &ems->s.hist, + ems->f.arrayRefWithPadding(), force_vir, mdAtoms->mdatoms(), enerd, fcd, + ems->s.lambda, graph, fr, ppForceWorkload, vsite, mu_tot, t, nullptr, + GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | + GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | + (bNS ? GMX_FORCE_NS : 0), + DOMAINDECOMP(cr) ? + DdOpenBalanceRegionBeforeForceComputation::yes : + DdOpenBalanceRegionBeforeForceComputation::no, + DOMAINDECOMP(cr) ? 
+ DdCloseBalanceRegionAfterForceComputation::yes : + DdCloseBalanceRegionAfterForceComputation::no); + /* PLUMED */ + if(plumedswitch){ + if(plumedNeedsEnergy) { + msmul(force_vir,2.0,plumed_vir); + plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]); + plumed_cmd(plumedmain,"performCalc",NULL); + msmul(plumed_vir,0.5,force_vir); + } else { + msmul(plumed_vir,0.5,plumed_vir); + m_add(force_vir,plumed_vir,force_vir); + } + } + /* END PLUMED */ + + /* Clear the unused shake virial and pressure */ + clear_mat(shake_vir); + clear_mat(pres); + + /* Communicate stuff when parallel */ + if (PAR(cr) && inputrec->eI != eiNM) + { + wallcycle_start(wcycle, ewcMoveE); + + global_stat(gstat, cr, enerd, force_vir, shake_vir, mu_tot, + inputrec, nullptr, nullptr, nullptr, 1, &terminate, + nullptr, FALSE, + CGLO_ENERGY | + CGLO_PRESSURE | + CGLO_CONSTRAINT); + + wallcycle_stop(wcycle, ewcMoveE); + } + + /* Calculate long range corrections to pressure and energy */ + calc_dispcorr(inputrec, fr, ems->s.box, ems->s.lambda[efptVDW], + pres, force_vir, &prescorr, &enercorr, &dvdlcorr); + enerd->term[F_DISPCORR] = enercorr; + enerd->term[F_EPOT] += enercorr; + enerd->term[F_PRES] += prescorr; + enerd->term[F_DVDL] += dvdlcorr; + + ems->epot = enerd->term[F_EPOT]; + + if (constr) + { + /* Project out the constraint components of the force */ + dvdl_constr = 0; + rvec *f_rvec = ems->f.rvec_array(); + constr->apply(FALSE, FALSE, + count, 0, 1.0, + ems->s.x.rvec_array(), f_rvec, f_rvec, + ems->s.box, + ems->s.lambda[efptBONDED], &dvdl_constr, + nullptr, &shake_vir, gmx::ConstraintVariable::ForceDispl); + enerd->term[F_DVDL_CONSTR] += dvdl_constr; + m_add(force_vir, shake_vir, vir); + } + else + { + copy_mat(force_vir, vir); + } + + clear_mat(ekin); + enerd->term[F_PRES] = + calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres); + + sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals); + + if (EI_ENERGY_MINIMIZATION(inputrec->eI)) + { + get_state_f_norm_max(cr, &(inputrec->opts), mdAtoms->mdatoms(), ems); + } +} + +} // namespace + +//! Parallel utility summing energies and forces +static double reorder_partsum(const t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *top_global, + em_state_t *s_min, em_state_t *s_b) +{ + t_block *cgs_gl; + int ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m; + double partsum; + unsigned char *grpnrFREEZE; + + if (debug) + { + fprintf(debug, "Doing reorder_partsum\n"); + } + + const rvec *fm = s_min->f.rvec_array(); + const rvec *fb = s_b->f.rvec_array(); + + cgs_gl = dd_charge_groups_global(cr->dd); + index = cgs_gl->index; + + /* Collect fm in a global vector fmg. + * This conflicts with the spirit of domain decomposition, + * but to fully optimize this a much more complicated algorithm is required. 
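+     *
+     * Editor's note (added): the gather is needed because the Polak-Ribiere
+     * partial sum pairs forces from two states atom by atom,
+     *
+     *   partsum += (fb[i] - fm[i]) . fb[i]   over non-frozen dimensions,
+     *
+     * and after a repartitioning the local atom orders of s_min and s_b can
+     * differ. fm is therefore first placed in global order (fmg, reduced over
+     * ranks with gmx_sum) before the products with fb are accumulated.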
+ */ + rvec *fmg; + snew(fmg, top_global->natoms); + + ncg = s_min->s.cg_gl.size(); + cg_gl = s_min->s.cg_gl.data(); + i = 0; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + copy_rvec(fm[i], fmg[a]); + i++; + } + } + gmx_sum(top_global->natoms*3, fmg[0], cr); + + /* Now we will determine the part of the sum for the cgs in state s_b */ + ncg = s_b->s.cg_gl.size(); + cg_gl = s_b->s.cg_gl.data(); + partsum = 0; + i = 0; + gf = 0; + grpnrFREEZE = top_global->groups.grpnr[egcFREEZE]; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + if (mdatoms->cFREEZE && grpnrFREEZE) + { + gf = grpnrFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + partsum += (fb[i][m] - fmg[a][m])*fb[i][m]; + } + } + i++; + } + } + + sfree(fmg); + + return partsum; +} + +//! Print some stuff, like beta, whatever that means. +static real pr_beta(const t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *top_global, + em_state_t *s_min, em_state_t *s_b) +{ + double sum; + + /* This is just the classical Polak-Ribiere calculation of beta; + * it looks a bit complicated since we take freeze groups into account, + * and might have to sum it in parallel runs. + */ + + if (!DOMAINDECOMP(cr) || + (s_min->s.ddp_count == cr->dd->ddp_count && + s_b->s.ddp_count == cr->dd->ddp_count)) + { + const rvec *fm = s_min->f.rvec_array(); + const rvec *fb = s_b->f.rvec_array(); + sum = 0; + int gf = 0; + /* This part of code can be incorrect with DD, + * since the atom ordering in s_b and s_min might differ. + */ + for (int i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (int m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + sum += (fb[i][m] - fm[i][m])*fb[i][m]; + } + } + } + } + else + { + /* We need to reorder cgs while summing */ + sum = reorder_partsum(cr, opts, mdatoms, top_global, s_min, s_b); + } + if (PAR(cr)) + { + gmx_sumd(1, &sum, cr); + } + + return sum/gmx::square(s_min->fnorm); +} + +namespace gmx +{ + +void +Integrator::do_cg() +{ + const char *CG = "Polak-Ribiere Conjugate Gradients"; + + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + double tmp, minstep; + real stepsize; + real a, b, c, beta = 0.0; + real epot_repl = 0; + real pnorm; + t_mdebin *mdebin; + gmx_bool converged, foundlower; + rvec mu_tot; + gmx_bool do_log = FALSE, do_ene = FALSE, do_x, do_f; + tensor vir, pres; + int number_steps, neval = 0, nstcg = inputrec->nstcgsteep; + gmx_mdoutf_t outf; + int m, step, nminstep; + auto mdatoms = mdAtoms->mdatoms(); + + GMX_LOG(mdlog.info).asParagraph(). + appendText("Note that activating conjugate gradient energy minimization via the " + "integrator .mdp option and the command gmx mdrun may " + "be available in a different form in a future version of GROMACS, " + "e.g. 
gmx minimize and an .mdp option."); + + step = 0; + + if (MASTER(cr)) + { + // In CG, the state is extended with a search direction + state_global->flags |= (1<<estCGP); + + // Ensure the extra per-atom state array gets allocated + state_change_natoms(state_global, state_global->natoms); + + // Initialize the search direction to zero + for (RVec &cg_p : state_global->cg_p) + { + cg_p = { 0, 0, 0 }; + } + } + + /* Create 4 states on the stack and extract pointers that we will swap */ + em_state_t s0 {}, s1 {}, s2 {}, s3 {}; + em_state_t *s_min = &s0; + em_state_t *s_a = &s1; + em_state_t *s_b = &s2; + em_state_t *s_c = &s3; + + /* Init em and store the local state in s_min */ + init_em(fplog, mdlog, CG, cr, ms, outputProvider, inputrec, mdrunOptions, + state_global, top_global, s_min, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, CG); + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + if (MASTER(cr)) + { + sp_header(stderr, CG, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, CG, inputrec->em_tol, number_steps); + } + + EnergyEvaluator energyEvaluator { + fplog, mdlog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, ppForceWorkload, enerd + }; + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + energyEvaluator.run(s_min, mu_tot, vir, pres, -1, TRUE); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + matrix nullBox = {}; + upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step), + mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + print_ebin_header(fplog, step, step); + print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Estimate/guess the initial stepsize */ + stepsize = inputrec->em_stepsize/s_min->fnorm; + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(stderr, " F-Norm = %12.5e\n", + s_min->fnorm/sqrtNumAtoms); + fprintf(stderr, "\n"); + /* and copy to the log file too... */ + fprintf(fplog, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(fplog, " F-Norm = %12.5e\n", + s_min->fnorm/sqrtNumAtoms); + fprintf(fplog, "\n"); + } + /* Start the loop over CG steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + converged = FALSE; + for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++) + { + + /* start taking steps in a new direction + * First time we enter the routine, beta=0, and the direction is + * simply the negative gradient. 
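+         *
+         * Editor's note (added): the loop below forms the standard nonlinear
+         * CG direction p_new = f + beta*p_old (f is the force, i.e. the
+         * negative gradient), with the Polak-Ribiere coefficient computed in
+         * pr_beta() further down:
+         *
+         *   beta_PR = sum_i (f_i - f_i_old) . f_i / |f_old|^2
+         *
+         * gpa accumulates -p.f, the directional derivative of the energy
+         * along the new direction; a negative value means downhill.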
+         */
+
+        /* Calculate the new direction in p, and the gradient in this direction, gpa */
+        rvec       *pm  = s_min->s.cg_p.rvec_array();
+        const rvec *sfm = s_min->f.rvec_array();
+        double      gpa = 0;
+        int         gf  = 0;
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            if (mdatoms->cFREEZE)
+            {
+                gf = mdatoms->cFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!inputrec->opts.nFreeze[gf][m])
+                {
+                    pm[i][m] = sfm[i][m] + beta*pm[i][m];
+                    gpa     -= pm[i][m]*sfm[i][m];
+                    /* f is negative gradient, thus the sign */
+                }
+                else
+                {
+                    pm[i][m] = 0;
+                }
+            }
+        }
+
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpa, cr);
+        }
+
+        /* Calculate the norm of the search vector */
+        get_f_norm_max(cr, &(inputrec->opts), mdatoms, pm, &pnorm, nullptr, nullptr);
+
+        /* Just in case stepsize reaches zero due to numerical precision... */
+        if (stepsize <= 0)
+        {
+            stepsize = inputrec->em_stepsize/pnorm;
+        }
+
+        /*
+         * Double check the value of the derivative in the search direction.
+         * If it is positive it must be due to the old information in the
+         * CG formula, so just remove that and start over with beta=0.
+         * This corresponds to a steepest descent step.
+         */
+        if (gpa > 0)
+        {
+            beta = 0;
+            step--;   /* Don't count this step since we are restarting */
+            continue; /* Go back to the beginning of the big for-loop */
+        }
+
+        /* Calculate minimum allowed stepsize, before the average (norm)
+         * relative change in coordinate is smaller than precision
+         */
+        minstep      = 0;
+        auto s_min_x = makeArrayRef(s_min->s.x);
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                tmp = fabs(s_min_x[i][m]);
+                if (tmp < 1.0)
+                {
+                    tmp = 1.0;
+                }
+                tmp      = pm[i][m]/tmp;
+                minstep += tmp*tmp;
+            }
+        }
+        /* Add up from all CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &minstep, cr);
+        }
+
+        minstep = GMX_REAL_EPS/sqrt(minstep/(3*top_global->natoms));
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        write_em_traj(fplog, cr, outf, do_x, do_f, nullptr,
+                      top_global, inputrec, step,
+                      s_min, state_global, observablesHistory);
+
+        /* Take a step downhill.
+         * In theory, we should minimize the function along this direction.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we don't really need to find the exact minimum -
+         * it is much better to start a new CG step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize but
+         * then continue straight to the next CG step without trying to find any minimum.
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to even accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done.
/ Erik + */ + s_a->epot = s_min->epot; + a = 0.0; + c = a + stepsize; /* reference position along line is zero */ + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) + { + em_dd_partition_system(fplog, mdlog, step, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step (new coords in s_c) */ + do_em_step(cr, inputrec, mdatoms, s_min, c, &s_min->s.cg_p, s_c, + constr, -1); + + neval++; + /* Calculate energy for the trial step */ + energyEvaluator.run(s_c, mu_tot, vir, pres, -1, FALSE); + + /* Calc derivative along line */ + const rvec *pc = s_c->s.cg_p.rvec_array(); + const rvec *sfc = s_c->f.rvec_array(); + double gpc = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpc -= pc[i][m]*sfc[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpc, cr); + } + + /* This is the max amount of increase in energy we tolerate */ + tmp = std::sqrt(GMX_REAL_EPS)*fabs(s_a->epot); + + /* Accept the step if the energy is lower, or if it is not significantly higher + * and the line derivative is still negative. + */ + if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) + { + foundlower = TRUE; + /* Great, we found a better energy. Increase step for next iteration + * if we are still going down, decrease it otherwise + */ + if (gpc < 0) + { + stepsize *= 1.618034; /* The golden section */ + } + else + { + stepsize *= 0.618034; /* 1/golden section */ + } + } + else + { + /* New energy is the same or higher. We will have to do some work + * to find a smaller value in the interval. Take smaller step next time! + */ + foundlower = FALSE; + stepsize *= 0.618034; + } + + + + + /* OK, if we didn't find a lower value we will have to locate one now - there must + * be one in the interval [a=0,c]. + * The same thing is valid here, though: Don't spend dozens of iterations to find + * the line minimum. We try to interpolate based on the derivative at the endpoints, + * and only continue until we find a lower value. In most cases this means 1-2 iterations. + * + * I also have a safeguard for potentially really pathological functions so we never + * take more than 20 steps before we give up ... + * + * If we already found a lower value we just skip this step and continue to the update. + */ + double gpb; + if (!foundlower) + { + nminstep = 0; + + do + { + /* Select a new trial point. + * If the derivatives at points a & c have different sign we interpolate to zero, + * otherwise just do a bisection. + */ + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, mdlog, -1, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step to this new point - new coords in s_b */ + do_em_step(cr, inputrec, mdatoms, s_min, b, &s_min->s.cg_p, s_b, + constr, -1); + + neval++; + /* Calculate energy for the trial step */ + energyEvaluator.run(s_b, mu_tot, vir, pres, -1, FALSE); + + /* p does not change within a step, but since the domain decomposition + * might change, we have to use cg_p of s_b here. 
+ */ + const rvec *pb = s_b->s.cg_p.rvec_array(); + const rvec *sfb = s_b->f.rvec_array(); + gpb = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpb -= pb[i][m]*sfb[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + if (debug) + { + fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", + s_a->epot, s_b->epot, s_c->epot, gpb); + } + + epot_repl = s_b->epot; + + /* Keep one of the intervals based on the value of the derivative at the new point */ + if (gpb > 0) + { + /* Replace c endpoint with b */ + swap_em_state(&s_b, &s_c); + c = b; + gpc = gpb; + } + else + { + /* Replace a endpoint with b */ + swap_em_state(&s_b, &s_a); + a = b; + gpa = gpb; + } + + /* + * Stop search as soon as we find a value smaller than the endpoints. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && + (nminstep < 20)); + + if (std::fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS || + nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. + * If beta==0 this was steepest descent, and then we give up. + * If not, set beta=0 and restart with steepest descent before quitting. + */ + if (beta == 0.0) + { + /* Converged */ + converged = TRUE; + break; + } + else + { + /* Reset memory before giving up */ + beta = 0.0; + continue; + } + } + + /* Select min energy state of A & C, put the best in B. + */ + if (s_c->epot < s_a->epot) + { + if (debug) + { + fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", + s_c->epot, s_a->epot); + } + swap_em_state(&s_b, &s_c); + gpb = gpc; + } + else + { + if (debug) + { + fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", + s_a->epot, s_c->epot); + } + swap_em_state(&s_b, &s_a); + gpb = gpa; + } + + } + else + { + if (debug) + { + fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", + s_c->epot); + } + swap_em_state(&s_b, &s_c); + gpb = gpc; + } + + /* new search direction */ + /* beta = 0 means forget all memory and restart with steepest descents. */ + if (nstcg && ((step % nstcg) == 0)) + { + beta = 0.0; + } + else + { + /* s_min->fnorm cannot be zero, because then we would have converged + * and broken out. + */ + + /* Polak-Ribiere update. + * Change to fnorm2/fnorm2_old for Fletcher-Reeves + */ + beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b); + } + /* Limit beta to prevent oscillations */ + if (fabs(beta) > 5.0) + { + beta = 0.0; + } + + + /* update positions */ + swap_em_state(&s_min, &s_b); + gpa = gpb; + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", + step, s_min->epot, s_min->fnorm/sqrtNumAtoms, + s_min->fmax, s_min->a_fmax+1); + fflush(stderr); + } + /* Store the new (lower) energies */ + matrix nullBox = {}; + upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step), + mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + do_log = do_per_step(step, inputrec->nstlog); + do_ene = do_per_step(step, inputrec->nstenergy); + + /* Prepare IMD energy record, if bIMD is TRUE. 
*/ + IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE); + + if (do_log) + { + print_ebin_header(fplog, step, step); + } + print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, + do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Send energies and positions to the IMD client if bIMD is TRUE. */ + if (MASTER(cr) && do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x.rvec_array(), inputrec, 0, wcycle)) + { + IMD_send_positions(inputrec->imd); + } + + /* Stop when the maximum force lies below tolerance. + * If we have reached machine precision, converged is already set to true. + */ + converged = converged || (s_min->fmax < inputrec->em_tol); + + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + if (converged) + { + step--; /* we never took that last step in this case */ + + } + if (s_min->fmax > inputrec->em_tol) + { + if (MASTER(cr)) + { + warn_step(fplog, inputrec->em_tol, s_min->fmax, + step-1 == number_steps, FALSE); + } + converged = FALSE; + } + + if (MASTER(cr)) + { + /* If we printed energy and/or logfile last step (which was the last step) + * we don't have to do it again, but otherwise print the final values. + */ + if (!do_log) + { + /* Write final value to log since we didn't do anything the last step */ + print_ebin_header(fplog, step, step); + } + if (!do_ene || !do_log) + { + /* Write final energy file entries */ + print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, + !do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + } + + /* Print some stuff... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + + /* IMPORTANT! + * For accurate normal mode calculation it is imperative that we + * store the last conformation into the full precision binary trajectory. + * + * However, we should only do it if we did NOT already write this step + * above (which we did if do_x or do_f was true). + */ + /* Note that with 0 < nstfout != nstxout we can end up with two frames + * in the trajectory with the same step number. 
+ */ + do_x = !do_per_step(step, inputrec->nstxout); + do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout)); + + write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, step, + s_min, state_global, observablesHistory); + + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, + s_min, sqrtNumAtoms); + print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, + s_min, sqrtNumAtoms); + + fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + walltime_accounting_set_nsteps_done(walltime_accounting, step); +} + + +void +Integrator::do_lbfgs() +{ + static const char *LBFGS = "Low-Memory BFGS Minimizer"; + em_state_t ems; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + int ncorr, nmaxcorr, point, cp, neval, nminstep; + double stepsize, step_taken, gpa, gpb, gpc, tmp, minstep; + real *rho, *alpha, *p, *s, **dx, **dg; + real a, b, c, maxdelta, delta; + real diag, Epot0; + real dgdx, dgdg, sq, yr, beta; + t_mdebin *mdebin; + gmx_bool converged; + rvec mu_tot; + gmx_bool do_log, do_ene, do_x, do_f, foundlower, *frozen; + tensor vir, pres; + int start, end, number_steps; + gmx_mdoutf_t outf; + int i, k, m, n, gf, step; + int mdof_flags; + auto mdatoms = mdAtoms->mdatoms(); + + GMX_LOG(mdlog.info).asParagraph(). + appendText("Note that activating L-BFGS energy minimization via the " + "integrator .mdp option and the command gmx mdrun may " + "be available in a different form in a future version of GROMACS, " + "e.g. gmx minimize and an .mdp option."); + + if (PAR(cr)) + { + gmx_fatal(FARGS, "L-BFGS minimization only supports a single rank"); + } + + if (nullptr != constr) + { + gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. 
steepest descent)."); + } + + n = 3*state_global->natoms; + nmaxcorr = inputrec->nbfgscorr; + + snew(frozen, n); + + snew(p, n); + snew(rho, nmaxcorr); + snew(alpha, nmaxcorr); + + snew(dx, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dx[i], n); + } + + snew(dg, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dg[i], n); + } + + step = 0; + neval = 0; + + /* Init em */ + init_em(fplog, mdlog, LBFGS, cr, ms, outputProvider, inputrec, mdrunOptions, + state_global, top_global, &ems, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + start = 0; + end = mdatoms->homenr; + + /* We need 4 working states */ + em_state_t s0 {}, s1 {}, s2 {}, s3 {}; + em_state_t *sa = &s0; + em_state_t *sb = &s1; + em_state_t *sc = &s2; + em_state_t *last = &s3; + /* Initialize by copying the state from ems (we could skip x and f here) */ + *sa = ems; + *sb = ems; + *sc = ems; + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS); + + do_log = do_ene = do_x = do_f = TRUE; + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + /* Create a 3*natoms index to tell whether each degree of freedom is frozen */ + gf = 0; + for (i = start; i < end; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + frozen[3*i+m] = (inputrec->opts.nFreeze[gf][m] != 0); + } + } + if (MASTER(cr)) + { + sp_header(stderr, LBFGS, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, LBFGS, inputrec->em_tol, number_steps); + } + + if (vsite) + { + construct_vsites(vsite, state_global->x.rvec_array(), 1, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state_global->box); + } + + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole + */ + neval++; + EnergyEvaluator energyEvaluator { + fplog, mdlog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, ppForceWorkload, enerd + }; + energyEvaluator.run(&ems, mu_tot, vir, pres, -1, TRUE); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + matrix nullBox = {}; + upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step), + mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + print_ebin_header(fplog, step, step); + print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Set the initial step. + * since it will be multiplied by the non-normalized search direction + * vector (force vector the first time), we scale it by the + * norm of the force. + */ + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(stderr, " F-max = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1); + fprintf(stderr, " F-Norm = %12.5e\n", ems.fnorm/sqrtNumAtoms); + fprintf(stderr, "\n"); + /* and copy to the log file too... 
*/
+        fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr);
+        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1);
+        fprintf(fplog, "   F-Norm            = %12.5e\n", ems.fnorm/sqrtNumAtoms);
+        fprintf(fplog, "\n");
+    }
+
+    // Point is an index to the memory of search directions, where 0 is the first one.
+    point = 0;
+
+    // Set initial search direction to the force (-gradient), or 0 for frozen particles.
+    real *fInit = static_cast<real *>(ems.f.rvec_array()[0]);
+    for (i = 0; i < n; i++)
+    {
+        if (!frozen[i])
+        {
+            dx[point][i] = fInit[i]; /* Initial search direction */
+        }
+        else
+        {
+            dx[point][i] = 0;
+        }
+    }
+
+    // Stepsize will be modified during the search, and actually it is not critical
+    // (the main efficiency in the algorithm comes from changing directions), but
+    // we still need an initial value, so estimate it as the inverse of the norm
+    // so we take small steps where the potential fluctuates a lot.
+    stepsize = 1.0/ems.fnorm;
+
+    /* Start the loop over BFGS steps.
+     * Each successful step is counted, and we continue until
+     * we either converge or reach the max number of steps.
+     */
+
+    ncorr = 0;
+
+    /* Set the gradient from the force */
+    converged = FALSE;
+    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
+    {
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        mdof_flags = 0;
+        if (do_x)
+        {
+            mdof_flags |= MDOF_X;
+        }
+
+        if (do_f)
+        {
+            mdof_flags |= MDOF_F;
+        }
+
+        if (inputrec->bIMD)
+        {
+            mdof_flags |= MDOF_IMD;
+        }
+
+        mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags,
+                                         top_global, step, static_cast<real>(step), &ems.s, state_global, observablesHistory, ems.f);
+
+        /* Do the linesearching in the direction dx[point][0..(n-1)] */
+
+        /* make s a pointer to current search direction - point=0 first time we get here */
+        s = dx[point];
+
+        real *xx = static_cast<real *>(ems.s.x.rvec_array()[0]);
+        real *ff = static_cast<real *>(ems.f.rvec_array()[0]);
+
+        // calculate line gradient in position A
+        for (gpa = 0, i = 0; i < n; i++)
+        {
+            gpa -= s[i]*ff[i];
+        }
+
+        /* Calculate minimum allowed stepsize along the line, before the average (norm)
+         * relative change in coordinate is smaller than precision
+         */
+        for (minstep = 0, i = 0; i < n; i++)
+        {
+            tmp = fabs(xx[i]);
+            if (tmp < 1.0)
+            {
+                tmp = 1.0;
+            }
+            tmp      = s[i]/tmp;
+            minstep += tmp*tmp;
+        }
+        minstep = GMX_REAL_EPS/sqrt(minstep/n);
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        // Before taking any steps along the line, store the old position
+        *last       = ems;
+        real *lastx = static_cast<real *>(last->s.x.data()[0]);
+        real *lastf = static_cast<real *>(last->f.data()[0]);
+        Epot0       = ems.epot;
+
+        *sa = ems;
+
+        /* Take a step downhill.
+         * In theory, we should find the actual minimum of the function in this
+         * direction, somewhere along the line.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we don't really need to find the exact minimum -
+         * it is much better to start a new BFGS step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize but
+         * continue straight to the next BFGS step without trying to find any minimum,
+         * i.e. we change the search direction too.
+         * If the line was smooth, it is likely we are in a smooth region, and
+         * then it makes sense to take longer steps in the modified search
+         * direction too.
+         *
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one. Then we need to start by finding a lower
+         * value before we change search direction. Since the energy was apparently
+         * quite rough, we need to decrease the step size.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
+
+        // State "A" is the first position along the line.
+        // reference position along line is initially zero
+        a = 0.0;
+
+        // Check stepsize first. We do not allow displacements
+        // larger than emstep.
+        //
+        do
+        {
+            // Pick a new position C by adding stepsize to A.
+            c = a + stepsize;
+
+            // Calculate what the largest change in any individual coordinate
+            // would be (translation along line * gradient along line)
+            maxdelta = 0;
+            for (i = 0; i < n; i++)
+            {
+                delta = c*s[i];
+                if (delta > maxdelta)
+                {
+                    maxdelta = delta;
+                }
+            }
+            // If any displacement is larger than the stepsize limit, reduce the step
+            if (maxdelta > inputrec->em_stepsize)
+            {
+                stepsize *= 0.1;
+            }
+        }
+        while (maxdelta > inputrec->em_stepsize);
+
+        // Take a trial step and move the coordinate array xc[] to position C
+        real *xc = static_cast<real *>(sc->s.x.rvec_array()[0]);
+        for (i = 0; i < n; i++)
+        {
+            xc[i] = lastx[i] + c*s[i];
+        }
+
+        neval++;
+        // Calculate energy for the trial step in position C
+        energyEvaluator.run(sc, mu_tot, vir, pres, step, FALSE);
+
+        // Calc line gradient in position C
+        real *fc = static_cast<real *>(sc->f.rvec_array()[0]);
+        for (gpc = 0, i = 0; i < n; i++)
+        {
+            gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        // This is the max amount of increase in energy we tolerate.
+        // By allowing VERY small changes (close to numerical precision) we
+        // frequently find even better (lower) final energies.
+        tmp = std::sqrt(GMX_REAL_EPS)*fabs(sa->epot);
+
+        // Accept the step if the energy is lower in the new position C (compared to A),
+        // or if it is not significantly higher and the line derivative is still negative.
+        foundlower = sc->epot < sa->epot || (gpc < 0 && sc->epot < (sa->epot + tmp));
+        // If true, great, we found a better energy. We no longer try to alter the
+        // stepsize, but simply accept this new better position. Then we select a new
+        // search direction instead, which will be much more efficient than continuing
+        // to take smaller steps along a line. Set fnorm based on the new C position,
+        // which will be used to update the stepsize to 1/fnorm further down.
+
+        // If false, the energy is NOT lower in point C, i.e. it will be the same
+        // or higher than in point A. In this case it is pointless to move to point C,
+        // so we will have to do more iterations along the same line to find a smaller
+        // value in the interval [A=0.0,C].
+        // Here, A is still 0.0, but that will change when we do a search in the interval
+        // [0.0,C] below. That search we will do by interpolation or bisection rather
+        // than with the stepsize, so no need to modify it. For the next search direction
+        // it will be reset to 1/fnorm anyway.
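+
+        // Editor's note (added): a small worked example of the tolerance above.
+        // In a double-precision build GMX_REAL_EPS is about 2.2e-16, so
+        // sqrt(GMX_REAL_EPS) is about 1.5e-8; with sa->epot = -1.0e5 kJ/mol the
+        // trial point is still accepted as long as the slope gpc is negative and
+        // sc->epot < -1.0e5 + 1.5e-3 kJ/mol.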
+ + if (!foundlower) + { + // OK, if we didn't find a lower value we will have to locate one now - there must + // be one in the interval [a,c]. + // The same thing is valid here, though: Don't spend dozens of iterations to find + // the line minimum. We try to interpolate based on the derivative at the endpoints, + // and only continue until we find a lower value. In most cases this means 1-2 iterations. + // I also have a safeguard for potentially really pathological functions so we never + // take more than 20 steps before we give up. + // If we already found a lower value we just skip this step and continue to the update. + real fnorm = 0; + nminstep = 0; + do + { + // Select a new trial point B in the interval [A,C]. + // If the derivatives at points a & c have different sign we interpolate to zero, + // otherwise just do a bisection since there might be multiple minima/maxima + // inside the interval. + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + // Take a trial step to point B + real *xb = static_cast<real *>(sb->s.x.rvec_array()[0]); + for (i = 0; i < n; i++) + { + xb[i] = lastx[i] + b*s[i]; + } + + neval++; + // Calculate energy for the trial step in point B + energyEvaluator.run(sb, mu_tot, vir, pres, step, FALSE); + fnorm = sb->fnorm; + + // Calculate gradient in point B + real *fb = static_cast<real *>(sb->f.rvec_array()[0]); + for (gpb = 0, i = 0; i < n; i++) + { + gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */ + + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative + // at the new point B, and rename the endpoints of this new interval A and C. + if (gpb > 0) + { + /* Replace c endpoint with b */ + c = b; + /* copy state b to c */ + *sc = *sb; + } + else + { + /* Replace a endpoint with b */ + a = b; + /* copy state b to a */ + *sa = *sb; + } + + /* + * Stop search as soon as we find a value smaller than the endpoints, + * or if the tolerance is below machine precision. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((sb->epot > sa->epot || sb->epot > sc->epot) && (nminstep < 20)); + + if (std::fabs(sb->epot - Epot0) < GMX_REAL_EPS || nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. + * If ncorr==0 this was steepest descent, and then we give up. + * If not, reset memory to restart as steepest descent before quitting. 
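+                 *
+                 * Editor's note (added): unlike the CG version above, which
+                 * uses the relative tolerance fabs(s_min->epot)*GMX_REAL_EPS,
+                 * this test compares the energy change against the absolute
+                 * GMX_REAL_EPS.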
+                 */
+                if (ncorr == 0)
+                {
+                    /* Converged */
+                    converged = TRUE;
+                    break;
+                }
+                else
+                {
+                    /* Reset memory */
+                    ncorr = 0;
+                    /* Search in gradient direction */
+                    for (i = 0; i < n; i++)
+                    {
+                        dx[point][i] = ff[i];
+                    }
+                    /* Reset stepsize */
+                    stepsize = 1.0/fnorm;
+                    continue;
+                }
+            }
+
+            /* Select min energy state of A & C, put the best in xx/ff/Epot
+             */
+            if (sc->epot < sa->epot)
+            {
+                /* Use state C */
+                ems        = *sc;
+                step_taken = c;
+            }
+            else
+            {
+                /* Use state A */
+                ems        = *sa;
+                step_taken = a;
+            }
+
+        }
+        else
+        {
+            /* found lower */
+            /* Use state C */
+            ems        = *sc;
+            step_taken = c;
+        }
+
+        /* Update the memory information, and calculate a new
+         * approximation of the inverse hessian
+         */
+
+        /* Have new data in Epot, xx, ff */
+        if (ncorr < nmaxcorr)
+        {
+            ncorr++;
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            dg[point][i]  = lastf[i]-ff[i];
+            dx[point][i] *= step_taken;
+        }
+
+        dgdg = 0;
+        dgdx = 0;
+        for (i = 0; i < n; i++)
+        {
+            dgdg += dg[point][i]*dg[point][i];
+            dgdx += dg[point][i]*dx[point][i];
+        }
+
+        diag = dgdx/dgdg;
+
+        rho[point] = 1.0/dgdx;
+        point++;
+
+        if (point >= nmaxcorr)
+        {
+            point = 0;
+        }
+
+        /* Update */
+        for (i = 0; i < n; i++)
+        {
+            p[i] = ff[i];
+        }
+
+        cp = point;
+
+        /* Recursive update. First go back over the memory points */
+        for (k = 0; k < ncorr; k++)
+        {
+            cp--;
+            if (cp < 0)
+            {
+                cp = ncorr-1;
+            }
+
+            sq = 0;
+            for (i = 0; i < n; i++)
+            {
+                sq += dx[cp][i]*p[i];
+            }
+
+            alpha[cp] = rho[cp]*sq;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] -= alpha[cp]*dg[cp][i];
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            p[i] *= diag;
+        }
+
+        /* And then go forward again */
+        for (k = 0; k < ncorr; k++)
+        {
+            yr = 0;
+            for (i = 0; i < n; i++)
+            {
+                yr += p[i]*dg[cp][i];
+            }
+
+            beta = rho[cp]*yr;
+            beta = alpha[cp]-beta;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] += beta*dx[cp][i];
+            }
+
+            cp++;
+            if (cp >= ncorr)
+            {
+                cp = 0;
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            if (!frozen[i])
+            {
+                dx[point][i] = p[i];
+            }
+            else
+            {
+                dx[point][i] = 0;
+            }
+        }
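+
+        /* Editor's note (added, not part of the original source): the loops
+         * above are the standard L-BFGS two-loop recursion. With s_k = dx
+         * (scaled position differences) and y_k = dg (gradient differences),
+         * it computes p = H*f, an approximation of the inverse Hessian applied
+         * to the force:
+         *
+         *   backward pass: alpha_k = rho_k * (s_k . p);  p -= alpha_k * y_k
+         *   scaling:       p *= (s . y)/(y . y)          (the 'diag' factor)
+         *   forward pass:  beta_k = rho_k * (y_k . p);   p += (alpha_k - beta_k) * s_k
+         *
+         * with rho_k = 1/(y_k . s_k), after which p is masked by the frozen
+         * dimensions and stored as the next search direction dx[point].
+         */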
+
+        /* Print it if necessary */
+        if (MASTER(cr))
+        {
+            if (mdrunOptions.verbose)
+            {
+                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
+                        step, ems.epot, ems.fnorm/sqrtNumAtoms, ems.fmax, ems.a_fmax + 1);
+                fflush(stderr);
+            }
+            /* Store the new (lower) energies */
+            matrix nullBox = {};
+            upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step),
+                       mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox,
+                       nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+            do_log = do_per_step(step, inputrec->nstlog);
+            do_ene = do_per_step(step, inputrec->nstenergy);
+            if (do_log)
+            {
+                print_ebin_header(fplog, step, step);
+            }
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
+                       do_log ? fplog : nullptr, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+        }
+
+        /* Send x and E to IMD client, if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x.rvec_array(), inputrec, 0, wcycle) && MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        // Reset stepsize if we are doing more iterations
+        stepsize = 1.0;
+
+        /* Stop when the maximum force lies below tolerance.
+         * If we have reached machine precision, converged is already set to true.
+         */
+        converged = converged || (ems.fmax < inputrec->em_tol);
+
+    }   /* End of the loop */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (ems.fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(fplog, inputrec->em_tol, ems.fmax,
+                      step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    /* If we printed energy and/or logfile last step (which was the last step)
+     * we don't have to do it again, but otherwise print the final values.
+     */
+    if (!do_log) /* Write final value to log since we didn't do anything last step */
+    {
+        print_ebin_header(fplog, step, step);
+    }
+    if (!do_ene || !do_log) /* Write final energy file entries */
+    {
+        print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                   !do_log ? fplog : nullptr, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+    }
+
+    /* Print some stuff... */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = !do_per_step(step, inputrec->nstfout);
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  &ems, state_global, observablesHistory);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, &ems, sqrtNumAtoms);
+        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, &ems, sqrtNumAtoms);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+}
+
+void
+Integrator::do_steep()
+{
+    const char       *SD = "Steepest Descents";
+    gmx_localtop_t   *top;
+    gmx_enerdata_t   *enerd;
+    gmx_global_stat_t gstat;
+    t_graph          *graph;
+    real              stepsize;
+    real              ustep;
+    gmx_mdoutf_t      outf;
+    t_mdebin         *mdebin;
+    gmx_bool          bDone, bAbort, do_x, do_f;
+    tensor            vir, pres;
+    rvec              mu_tot;
+    int               nsteps;
+    int               count          = 0;
+    int               steps_accepted = 0;
+    auto              mdatoms        = mdAtoms->mdatoms();
+
+    GMX_LOG(mdlog.info).asParagraph().
+        appendText("Note that activating steepest-descent energy minimization via the "
+                   "integrator .mdp option and the command gmx mdrun may "
+                   "be available in a different form in a future version of GROMACS, "
+                   "e.g. gmx minimize and an .mdp option.");
+
+    /* Create 2 states on the stack and extract pointers that we will swap */
+    em_state_t  s0 {}, s1 {};
+    em_state_t *s_min = &s0;
+    em_state_t *s_try = &s1;
+
+    /* Init em and store the local state in s_try */
+    init_em(fplog, mdlog, SD, cr, ms, outputProvider, inputrec, mdrunOptions,
+            state_global, top_global, s_try, &top,
+            nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat,
+            vsite, constr, nullptr,
+            nfile, fnm, &outf, &mdebin, wcycle);
+
+    /* Print to log file */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, SD);
+
+    /* Set variables for stepsize (in nm). This is the largest
+     * step that we are going to make in any direction.
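+     *
+     * Editor's note (added): ustep caps the per-step displacement. With
+     * stepsize = ustep/fmax (set below), the atom feeling the largest force
+     * moves by roughly ustep nm and every other atom by less. Accepted steps
+     * then grow ustep by a factor 1.2, rejected ones halve it, a simple
+     * trust-region-style adaptation.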
+ */ + ustep = inputrec->em_stepsize; + stepsize = 0; + + /* Max number of steps */ + nsteps = inputrec->nsteps; + + if (MASTER(cr)) + { + /* Print to the screen */ + sp_header(stderr, SD, inputrec->em_tol, nsteps); + } + if (fplog) + { + sp_header(fplog, SD, inputrec->em_tol, nsteps); + } + EnergyEvaluator energyEvaluator { + fplog, mdlog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, ppForceWorkload, enerd + }; + + /**** HERE STARTS THE LOOP **** + * count is the counter for the number of steps + * bDone will be TRUE when the minimization has converged + * bAbort will be TRUE when nsteps steps have been performed or when + * the stepsize becomes smaller than is reasonable for machine precision + */ + count = 0; + bDone = FALSE; + bAbort = FALSE; + while (!bDone && !bAbort) + { + bAbort = (nsteps >= 0) && (count == nsteps); + + /* set new coordinates, except for first step */ + bool validStep = true; + if (count > 0) + { + validStep = + do_em_step(cr, inputrec, mdatoms, + s_min, stepsize, &s_min->f, s_try, + constr, count); + } + + if (validStep) + { + energyEvaluator.run(s_try, mu_tot, vir, pres, count, count == 0); + } + else + { + // Signal constraint error during stepping with energy=inf + s_try->epot = std::numeric_limits<real>::infinity(); + } + + if (MASTER(cr)) + { + print_ebin_header(fplog, count, count); + } + + if (count == 0) + { + s_min->epot = s_try->epot; + } + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c", + count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1, + ( (count == 0) || (s_try->epot < s_min->epot) ) ? '\n' : '\r'); + fflush(stderr); + } + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + /* Store the new (lower) energies */ + matrix nullBox = {}; + upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(count), + mdatoms->tmass, enerd, nullptr, nullptr, nullptr, + nullBox, nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + /* Prepare IMD energy record, if bIMD is TRUE. */ + IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE); + + print_ebin(mdoutf_get_fp_ene(outf), TRUE, + do_per_step(steps_accepted, inputrec->nstdisreout), + do_per_step(steps_accepted, inputrec->nstorireout), + fplog, count, count, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + fflush(fplog); + } + } + + /* Now if the new energy is smaller than the previous... + * or if this is the first step! + * or if we did random steps! + */ + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + steps_accepted++; + + /* Test whether the convergence criterion is met... */ + bDone = (s_try->fmax < inputrec->em_tol); + + /* Copy the arrays for force, positions and energy */ + /* The 'Min' array always holds the coords and forces of the minimal + sampled energy */ + swap_em_state(&s_min, &s_try); + if (count > 0) + { + ustep *= 1.2; + } + + /* Write to trn, if necessary */ + do_x = do_per_step(steps_accepted, inputrec->nstxout); + do_f = do_per_step(steps_accepted, inputrec->nstfout); + write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, + top_global, inputrec, count, + s_min, state_global, observablesHistory); + } + else + { + /* If energy is not smaller make the step smaller... 
*/ + ustep *= 0.5; + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + } + + /* Determine new step */ + stepsize = ustep/s_min->fmax; + + /* Check if stepsize is too small, with 1 nm as a characteristic length */ +#if GMX_DOUBLE + if (count == nsteps || ustep < 1e-12) +#else + if (count == nsteps || ustep < 1e-6) +#endif + { + if (MASTER(cr)) + { + warn_step(fplog, inputrec->em_tol, s_min->fmax, + count == nsteps, constr != nullptr); + } + bAbort = TRUE; + } + + /* Send IMD energies and positions, if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box, + MASTER(cr) ? state_global->x.rvec_array() : nullptr, + inputrec, 0, wcycle) && + MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + count++; + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + /* Print some data... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout != 0, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, count, + s_min, state_global, observablesHistory); + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + + print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, + s_min, sqrtNumAtoms); + print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, + s_min, sqrtNumAtoms); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + inputrec->nsteps = count; + + walltime_accounting_set_nsteps_done(walltime_accounting, count); +} + +void +Integrator::do_nm() +{ + const char *NM = "Normal Mode Analysis"; + gmx_mdoutf_t outf; + int nnodes, node; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + tensor vir, pres; + rvec mu_tot; + rvec *dfdx; + gmx_bool bSparse; /* use sparse matrix storage format */ + size_t sz; + gmx_sparsematrix_t * sparse_matrix = nullptr; + real * full_matrix = nullptr; + + /* added with respect to mdrun */ + int row, col; + real der_range = 10.0*std::sqrt(GMX_REAL_EPS); + real x_min; + bool bIsMaster = MASTER(cr); + auto mdatoms = mdAtoms->mdatoms(); + + GMX_LOG(mdlog.info).asParagraph(). + appendText("Note that activating normal-mode analysis via the integrator " + ".mdp option and the command gmx mdrun may " + "be available in a different form in a future version of GROMACS, " + "e.g. 
gmx normal-modes."); + + if (constr != nullptr) + { + gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported"); + } + + gmx_shellfc_t *shellfc; + + em_state_t state_work {}; + + /* Init em and store the local state in state_minimum */ + init_em(fplog, mdlog, NM, cr, ms, outputProvider, inputrec, mdrunOptions, + state_global, top_global, &state_work, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, &shellfc, + nfile, fnm, &outf, nullptr, wcycle); + + std::vector<int> atom_index = get_atom_index(top_global); + std::vector<gmx::RVec> fneg(atom_index.size(), {0, 0, 0}); + snew(dfdx, atom_index.size()); + +#if !GMX_DOUBLE + if (bIsMaster) + { + fprintf(stderr, + "NOTE: This version of GROMACS has been compiled in single precision,\n" + " which MIGHT not be accurate enough for normal mode analysis.\n" + " GROMACS now uses sparse matrix storage, so the memory requirements\n" + " are fairly modest even if you recompile in double precision.\n\n"); + } +#endif + + /* Check if we can/should use sparse storage format. + * + * Sparse format is only useful when the Hessian itself is sparse, which it + * will be when we use a cutoff. + * For small systems (n<1000) it is easier to always use full matrix format, though. + */ + if (EEL_FULL(fr->ic->eeltype) || fr->rlist == 0.0) + { + GMX_LOG(mdlog.warning).appendText("Non-cutoff electrostatics used, forcing full Hessian format."); + bSparse = FALSE; + } + else if (atom_index.size() < 1000) + { + GMX_LOG(mdlog.warning).appendTextFormatted("Small system size (N=%zu), using full Hessian format.", + atom_index.size()); + bSparse = FALSE; + } + else + { + GMX_LOG(mdlog.warning).appendText("Using compressed symmetric sparse Hessian format."); + bSparse = TRUE; + } + + /* Number of dimensions, based on real atoms, that is not vsites or shell */ + sz = DIM*atom_index.size(); + + fprintf(stderr, "Allocating Hessian memory...\n\n"); + + if (bSparse) + { + sparse_matrix = gmx_sparsematrix_init(sz); + sparse_matrix->compressed_symmetric = TRUE; + } + else + { + snew(full_matrix, sz*sz); + } + + init_nrnb(nrnb); + + + /* Write start time and temperature */ + print_em_start(fplog, cr, walltime_accounting, wcycle, NM); + + /* fudge nr of steps to nr of atoms */ + inputrec->nsteps = atom_index.size()*2; + + if (bIsMaster) + { + fprintf(stderr, "starting normal mode calculation '%s'\n%" PRId64 " steps.\n\n", + *(top_global->name), inputrec->nsteps); + } + + nnodes = cr->nnodes; + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + EnergyEvaluator energyEvaluator { + fplog, mdlog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, ppForceWorkload, enerd + }; + energyEvaluator.run(&state_work, mu_tot, vir, pres, -1, TRUE); + cr->nnodes = nnodes; + + /* if forces are not small, warn user */ + get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, &state_work); + + GMX_LOG(mdlog.warning).appendTextFormatted("Maximum force:%12.5e", state_work.fmax); + if (state_work.fmax > 1.0e-3) + { + GMX_LOG(mdlog.warning).appendText( + "The force is probably not small enough to " + "ensure that you are at a minimum.\n" + "Be aware that negative eigenvalues may occur\n" + "when the resulting matrix is diagonalized."); + } + + /*********************************************************** + * + * Loop over all pairs in matrix + * + * do_force called twice. 
Once with positive and + * once with negative displacement + * + ************************************************************/ + + /* Steps are divided one by one over the nodes */ + bool bNS = true; + auto state_work_x = makeArrayRef(state_work.s.x); + auto state_work_f = makeArrayRef(state_work.f); + for (unsigned int aid = cr->nodeid; aid < atom_index.size(); aid += nnodes) + { + size_t atom = atom_index[aid]; + for (size_t d = 0; d < DIM; d++) + { + int64_t step = 0; + int force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES; + double t = 0; + + x_min = state_work_x[atom][d]; + + for (unsigned int dx = 0; (dx < 2); dx++) + { + if (dx == 0) + { + state_work_x[atom][d] = x_min - der_range; + } + else + { + state_work_x[atom][d] = x_min + der_range; + } + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + if (shellfc) + { + /* Now is the time to relax the shells */ + relax_shell_flexcon(fplog, + cr, + ms, + mdrunOptions.verbose, + nullptr, + step, + inputrec, + bNS, + force_flags, + top, + constr, + enerd, + fcd, + &state_work.s, + state_work.f.arrayRefWithPadding(), + vir, + mdatoms, + nrnb, + wcycle, + graph, + &top_global->groups, + shellfc, + fr, + ppForceWorkload, + t, + mu_tot, + vsite, + DdOpenBalanceRegionBeforeForceComputation::no, + DdCloseBalanceRegionAfterForceComputation::no); + bNS = false; + step++; + } + else + { + energyEvaluator.run(&state_work, mu_tot, vir, pres, aid*2+dx, FALSE); + } + + cr->nnodes = nnodes; + + if (dx == 0) + { + std::copy(state_work_f.begin(), state_work_f.begin()+atom_index.size(), fneg.begin()); + } + } + + /* x is restored to original */ + state_work_x[atom][d] = x_min; + + for (size_t j = 0; j < atom_index.size(); j++) + { + for (size_t k = 0; (k < DIM); k++) + { + dfdx[j][k] = + -(state_work_f[atom_index[j]][k] - fneg[j][k])/(2*der_range); + } + } + + if (!bIsMaster) + { +#if GMX_MPI +#define mpi_type GMX_MPI_REAL + MPI_Send(dfdx[0], atom_index.size()*DIM, mpi_type, MASTER(cr), + cr->nodeid, cr->mpi_comm_mygroup); +#endif + } + else + { + for (node = 0; (node < nnodes && aid+node < atom_index.size()); node++) + { + if (node > 0) + { +#if GMX_MPI + MPI_Status stat; + MPI_Recv(dfdx[0], atom_index.size()*DIM, mpi_type, node, node, + cr->mpi_comm_mygroup, &stat); +#undef mpi_type +#endif + } + + row = (aid + node)*DIM + d; + + for (size_t j = 0; j < atom_index.size(); j++) + { + for (size_t k = 0; k < DIM; k++) + { + col = j*DIM + k; + + if (bSparse) + { + if (col >= row && dfdx[j][k] != 0.0) + { + gmx_sparsematrix_increment_value(sparse_matrix, + row, col, dfdx[j][k]); + } + } + else + { + full_matrix[row*sz+col] = dfdx[j][k]; + } + } + } + } + } + + if (mdrunOptions.verbose && fplog) + { + fflush(fplog); + } + } + /* write progress */ + if (bIsMaster && mdrunOptions.verbose) + { + fprintf(stderr, "\rFinished step %d out of %d", + static_cast<int>(std::min(atom+nnodes, atom_index.size())), + static_cast<int>(atom_index.size())); + fflush(stderr); + } + } + + if (bIsMaster) + { + fprintf(stderr, "\n\nWriting Hessian...\n"); + gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size()*2); +} + +} // namespace gmx diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/minimize.cpp.preplumed b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/minimize.cpp.preplumed new file mode 100644 index 
0000000000000000000000000000000000000000..77421a32af2150f87ad728007fa66bd100d3a98c --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/minimize.cpp.preplumed @@ -0,0 +1,2988 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! 
\internal \file + * + * \brief This file defines integrators for energy minimization + * + * \author Berk Hess <hess@kth.se> + * \author Erik Lindahl <erik@kth.se> + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "config.h" + +#include <cmath> +#include <cstring> +#include <ctime> + +#include <algorithm> +#include <vector> + +#include "gromacs/commandline/filenm.h" +#include "gromacs/domdec/collect.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/domdec/partition.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/fileio/confio.h" +#include "gromacs/fileio/mtxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/imd/imd.h" +#include "gromacs/linearalgebra/sparsematrix.h" +#include "gromacs/listed-forces/manage-threading.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdebin.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/mdsetup.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/shellfc.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/tgroup.h" +#include "gromacs/mdlib/trajectory_writing.h" +#include "gromacs/mdlib/update.h" +#include "gromacs/mdlib/vsite.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/timing/walltime_accounting.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/topology/topology.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/smalloc.h" + +#include "integrator.h" + +//! Utility structure for manipulating states during EM +typedef struct { + //! Copy of the global state + t_state s; + //! Force array + PaddedVector<gmx::RVec> f; + //! Potential energy + real epot; + //! Norm of the force + real fnorm; + //! Maximum force + real fmax; + //! Direction + int a_fmax; +} em_state_t; + +//! Print the EM starting conditions +static void print_em_start(FILE *fplog, + const t_commrec *cr, + gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle, + const char *name) +{ + walltime_accounting_start_time(walltime_accounting); + wallcycle_start(wcycle, ewcRUN); + print_start(fplog, cr, walltime_accounting, name); +} + +//! Stop counting time for EM +static void em_time_end(gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle) +{ + wallcycle_stop(wcycle, ewcRUN); + + walltime_accounting_end_time(walltime_accounting); +} + +//! Printing a log file and console header +static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps) +{ + fprintf(out, "\n"); + fprintf(out, "%s:\n", minimizer); + fprintf(out, " Tolerance (Fmax) = %12.5e\n", ftol); + fprintf(out, " Number of steps = %12d\n", nsteps); +} + +//! 
Print warning message +static void warn_step(FILE *fp, + real ftol, + real fmax, + gmx_bool bLastStep, + gmx_bool bConstrain) +{ + constexpr bool realIsDouble = GMX_DOUBLE; + char buffer[2048]; + + if (!std::isfinite(fmax)) + { + sprintf(buffer, + "\nEnergy minimization has stopped because the force " + "on at least one atom is not finite. This usually means " + "atoms are overlapping. Modify the input coordinates to " + "remove atom overlap or use soft-core potentials with " + "the free energy code to avoid infinite forces.\n%s", + !realIsDouble ? + "You could also be lucky that switching to double precision " + "is sufficient to obtain finite forces.\n" : + ""); + } + else if (bLastStep) + { + sprintf(buffer, + "\nEnergy minimization reached the maximum number " + "of steps before the forces reached the requested " + "precision Fmax < %g.\n", ftol); + } + else + { + sprintf(buffer, + "\nEnergy minimization has stopped, but the forces have " + "not converged to the requested precision Fmax < %g (which " + "may not be possible for your system). It stopped " + "because the algorithm tried to make a new step whose size " + "was too small, or there was no change in the energy since " + "last step. Either way, we regard the minimization as " + "converged to within the available machine precision, " + "given your starting configuration and EM parameters.\n%s%s", + ftol, + !realIsDouble ? + "\nDouble precision normally gives you higher accuracy, but " + "this is often not needed for preparing to run molecular " + "dynamics.\n" : + "", + bConstrain ? + "You might need to increase your constraint accuracy, or turn\n" + "off constraints altogether (set constraints = none in mdp file)\n" : + ""); + } + + fputs(wrap_lines(buffer, 78, 0, FALSE), stderr); + fputs(wrap_lines(buffer, 78, 0, FALSE), fp); +} + +//! Print message about convergence of the EM +static void print_converged(FILE *fp, const char *alg, real ftol, + int64_t count, gmx_bool bDone, int64_t nsteps, + const em_state_t *ems, double sqrtNumAtoms) +{ + char buf[STEPSTRSIZE]; + + if (bDone) + { + fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", + alg, ftol, gmx_step_str(count, buf)); + } + else if (count < nsteps) + { + fprintf(fp, "\n%s converged to machine precision in %s steps,\n" + "but did not reach the requested Fmax < %g.\n", + alg, gmx_step_str(count, buf), ftol); + } + else + { + fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", + alg, ftol, gmx_step_str(count, buf)); + } + +#if GMX_DOUBLE + fprintf(fp, "Potential Energy = %21.14e\n", ems->epot); + fprintf(fp, "Maximum force = %21.14e on atom %d\n", ems->fmax, ems->a_fmax + 1); + fprintf(fp, "Norm of force = %21.14e\n", ems->fnorm/sqrtNumAtoms); +#else + fprintf(fp, "Potential Energy = %14.7e\n", ems->epot); + fprintf(fp, "Maximum force = %14.7e on atom %d\n", ems->fmax, ems->a_fmax + 1); + fprintf(fp, "Norm of force = %14.7e\n", ems->fnorm/sqrtNumAtoms); +#endif +} + +//! Compute the norm and max of the force array in parallel +static void get_f_norm_max(const t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, const rvec *f, + real *fnorm, real *fmax, int *a_fmax) +{ + double fnorm2, *sum; + real fmax2, fam; + int la_max, a_max, start, end, i, m, gf; + + /* This routine finds the largest force and returns it. + * On parallel machines the global max is taken. 
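As an aside, the parallel path that follows packs each rank's squared maximum force, the global index of the atom carrying it, and the local squared norm into one buffer, so a single collective sum is enough to combine all three. A minimal sketch of that pack-and-reduce pattern, using std::vector instead of snew() and a hypothetical sumOverRanks() standing in for gmx_sumd():

    #include <cmath>
    #include <vector>

    // Stand-in for gmx_sumd(): on one rank a no-op; with MPI this would be an
    // in-place all-reduce (sum) of 'buf' over all ranks (assumption of this sketch).
    static void sumOverRanks(std::vector<double> &buf) { (void)buf; }

    // Combine per-rank force statistics into global fnorm, fmax and its atom index.
    static void reduceForceStats(int rank, int nranks,
                                 double fmax2Local, int aMaxLocal, double fnorm2Local,
                                 double *fnorm, double *fmax, int *aFmax)
    {
        // Layout: [fmax2(r0), atom(r0), fmax2(r1), atom(r1), ..., fnorm2]
        std::vector<double> sum(2*nranks + 1, 0.0);
        sum[2*rank]     = fmax2Local;
        sum[2*rank + 1] = aMaxLocal;
        sum[2*nranks]   = fnorm2Local;
        sumOverRanks(sum);                  // zeros elsewhere make the sum a gather
        double fmax2 = 0;
        int    aMax  = -1;
        for (int r = 0; r < nranks; r++)    // scan ranks for the global maximum
        {
            if (sum[2*r] > fmax2)
            {
                fmax2 = sum[2*r];
                aMax  = static_cast<int>(sum[2*r + 1] + 0.5);
            }
        }
        *fnorm = std::sqrt(sum[2*nranks]);  // the norm genuinely sums ...
        *fmax  = std::sqrt(fmax2);          // ... while the max is selected
        *aFmax = aMax;
    }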
+ */ + fnorm2 = 0; + fmax2 = 0; + la_max = -1; + start = 0; + end = mdatoms->homenr; + if (mdatoms->cFREEZE) + { + for (i = start; i < end; i++) + { + gf = mdatoms->cFREEZE[i]; + fam = 0; + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + fam += gmx::square(f[i][m]); + } + } + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + else + { + for (i = start; i < end; i++) + { + fam = norm2(f[i]); + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + + if (la_max >= 0 && DOMAINDECOMP(cr)) + { + a_max = cr->dd->globalAtomIndices[la_max]; + } + else + { + a_max = la_max; + } + if (PAR(cr)) + { + snew(sum, 2*cr->nnodes+1); + sum[2*cr->nodeid] = fmax2; + sum[2*cr->nodeid+1] = a_max; + sum[2*cr->nnodes] = fnorm2; + gmx_sumd(2*cr->nnodes+1, sum, cr); + fnorm2 = sum[2*cr->nnodes]; + /* Determine the global maximum */ + for (i = 0; i < cr->nnodes; i++) + { + if (sum[2*i] > fmax2) + { + fmax2 = sum[2*i]; + a_max = gmx::roundToInt(sum[2*i+1]); + } + } + sfree(sum); + } + + if (fnorm) + { + *fnorm = sqrt(fnorm2); + } + if (fmax) + { + *fmax = sqrt(fmax2); + } + if (a_fmax) + { + *a_fmax = a_max; + } +} + +//! Compute the norm of the force +static void get_state_f_norm_max(const t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, + em_state_t *ems) +{ + get_f_norm_max(cr, opts, mdatoms, ems->f.rvec_array(), + &ems->fnorm, &ems->fmax, &ems->a_fmax); +} + +//! Initialize the energy minimization +static void init_em(FILE *fplog, + const gmx::MDLogger &mdlog, + const char *title, + const t_commrec *cr, + const gmx_multisim_t *ms, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *ir, + const MdrunOptions &mdrunOptions, + t_state *state_global, gmx_mtop_t *top_global, + em_state_t *ems, gmx_localtop_t **top, + t_nrnb *nrnb, rvec mu_tot, + t_forcerec *fr, gmx_enerdata_t **enerd, + t_graph **graph, gmx::MDAtoms *mdAtoms, gmx_global_stat_t *gstat, + gmx_vsite_t *vsite, gmx::Constraints *constr, gmx_shellfc_t **shellfc, + int nfile, const t_filenm fnm[], + gmx_mdoutf_t *outf, t_mdebin **mdebin, + gmx_wallcycle_t wcycle) +{ + real dvdl_constr; + + if (fplog) + { + fprintf(fplog, "Initiating %s\n", title); + } + + if (MASTER(cr)) + { + state_global->ngtc = 0; + + /* Initialize lambda variables */ + initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, nullptr); + } + + init_nrnb(nrnb); + + /* Interactive molecular dynamics */ + init_IMD(ir, cr, ms, top_global, fplog, 1, + MASTER(cr) ? state_global->x.rvec_array() : nullptr, + nfile, fnm, nullptr, mdrunOptions); + + if (ir->eI == eiNM) + { + GMX_ASSERT(shellfc != nullptr, "With NM we always support shells"); + + *shellfc = init_shell_flexcon(stdout, + top_global, + constr ? constr->numFlexibleConstraints() : 0, + ir->nstcalcenergy, + DOMAINDECOMP(cr)); + } + else + { + GMX_ASSERT(EI_ENERGY_MINIMIZATION(ir->eI), "This else currently only handles energy minimizers, consider if your algorithm needs shell/flexible-constraint support"); + + /* With energy minimization, shells and flexible constraints are + * automatically minimized when treated like normal DOFS. 
+ */ + if (shellfc != nullptr) + { + *shellfc = nullptr; + } + } + + auto mdatoms = mdAtoms->mdatoms(); + if (DOMAINDECOMP(cr)) + { + *top = dd_init_local_top(top_global); + + dd_init_local_state(cr->dd, state_global, &ems->s); + + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + &ems->s, &ems->f, mdAtoms, *top, + fr, vsite, constr, + nrnb, nullptr, FALSE); + dd_store_state(cr->dd, &ems->s); + + *graph = nullptr; + } + else + { + state_change_natoms(state_global, state_global->natoms); + /* Just copy the state */ + ems->s = *state_global; + state_change_natoms(&ems->s, ems->s.natoms); + ems->f.resizeWithPadding(ems->s.natoms); + + snew(*top, 1); + mdAlgorithmsSetupAtomData(cr, ir, top_global, *top, fr, + graph, mdAtoms, + constr, vsite, shellfc ? *shellfc : nullptr); + + if (vsite) + { + set_vsite_top(vsite, *top, mdatoms); + } + } + + update_mdatoms(mdAtoms->mdatoms(), ems->s.lambda[efptMASS]); + + if (constr) + { + // TODO how should this cross-module support dependency be managed? + if (ir->eConstrAlg == econtSHAKE && + gmx_mtop_ftype_count(top_global, F_CONSTR) > 0) + { + gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n", + econstr_names[econtSHAKE], econstr_names[econtLINCS]); + } + + if (!ir->bContinuation) + { + /* Constrain the starting coordinates */ + dvdl_constr = 0; + constr->apply(TRUE, TRUE, + -1, 0, 1.0, + ems->s.x.rvec_array(), + ems->s.x.rvec_array(), + nullptr, + ems->s.box, + ems->s.lambda[efptFEP], &dvdl_constr, + nullptr, nullptr, gmx::ConstraintVariable::Positions); + } + } + + if (PAR(cr)) + { + *gstat = global_stat_init(ir); + } + else + { + *gstat = nullptr; + } + + *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, top_global, nullptr, wcycle); + + snew(*enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + *enerd); + + if (mdebin != nullptr) + { + /* Init bin for energy stuff */ + *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, nullptr); + } + + clear_rvec(mu_tot); + calc_shifts(ems->s.box, fr->shift_vec); +} + +//! Finalize the minimization +static void finish_em(const t_commrec *cr, gmx_mdoutf_t outf, + gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle) +{ + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + done_mdoutf(outf); + + em_time_end(walltime_accounting, wcycle); +} + +//! Swap two different EM states during minimization +static void swap_em_state(em_state_t **ems1, em_state_t **ems2) +{ + em_state_t *tmp; + + tmp = *ems1; + *ems1 = *ems2; + *ems2 = tmp; +} + +//! 
Save the EM trajectory +static void write_em_traj(FILE *fplog, const t_commrec *cr, + gmx_mdoutf_t outf, + gmx_bool bX, gmx_bool bF, const char *confout, + gmx_mtop_t *top_global, + t_inputrec *ir, int64_t step, + em_state_t *state, + t_state *state_global, + ObservablesHistory *observablesHistory) +{ + int mdof_flags = 0; + + if (bX) + { + mdof_flags |= MDOF_X; + } + if (bF) + { + mdof_flags |= MDOF_F; + } + + /* If we want IMD output, set appropriate MDOF flag */ + if (ir->bIMD) + { + mdof_flags |= MDOF_IMD; + } + + mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, + top_global, step, static_cast<double>(step), + &state->s, state_global, observablesHistory, + state->f); + + if (confout != nullptr) + { + if (DOMAINDECOMP(cr)) + { + /* If bX=true, x was collected to state_global in the call above */ + if (!bX) + { + gmx::ArrayRef<gmx::RVec> globalXRef = MASTER(cr) ? makeArrayRef(state_global->x) : gmx::EmptyArrayRef(); + dd_collect_vec(cr->dd, &state->s, makeArrayRef(state->s.x), globalXRef); + } + } + else + { + /* Copy the local state pointer */ + state_global = &state->s; + } + + if (MASTER(cr)) + { + if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr)) + { + /* Make molecules whole only for confout writing */ + do_pbc_mtop(fplog, ir->ePBC, state->s.box, top_global, + state_global->x.rvec_array()); + } + + write_sto_conf_mtop(confout, + *top_global->name, top_global, + state_global->x.rvec_array(), nullptr, ir->ePBC, state->s.box); + } + } +} + +//! \brief Do one minimization step +// +// \returns true when the step succeeded, false when a constraint error occurred +static bool do_em_step(const t_commrec *cr, + t_inputrec *ir, t_mdatoms *md, + em_state_t *ems1, real a, const PaddedVector<gmx::RVec> *force, + em_state_t *ems2, + gmx::Constraints *constr, + int64_t count) + +{ + t_state *s1, *s2; + int start, end; + real dvdl_constr; + int nthreads gmx_unused; + + bool validStep = true; + + s1 = &ems1->s; + s2 = &ems2->s; + + if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) + { + gmx_incons("state mismatch in do_em_step"); + } + + s2->flags = s1->flags; + + if (s2->natoms != s1->natoms) + { + state_change_natoms(s2, s1->natoms); + ems2->f.resizeWithPadding(s2->natoms); + } + if (DOMAINDECOMP(cr) && s2->cg_gl.size() != s1->cg_gl.size()) + { + s2->cg_gl.resize(s1->cg_gl.size()); + } + + copy_mat(s1->box, s2->box); + /* Copy free energy state */ + s2->lambda = s1->lambda; + copy_mat(s1->box, s2->box); + + start = 0; + end = md->homenr; + + nthreads = gmx_omp_nthreads_get(emntUpdate); +#pragma omp parallel num_threads(nthreads) + { + const rvec *x1 = s1->x.rvec_array(); + rvec *x2 = s2->x.rvec_array(); + const rvec *f = force->rvec_array(); + + int gf = 0; +#pragma omp for schedule(static) nowait + for (int i = start; i < end; i++) + { + try + { + if (md->cFREEZE) + { + gf = md->cFREEZE[i]; + } + for (int m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[gf][m]) + { + x2[i][m] = x1[i][m]; + } + else + { + x2[i][m] = x1[i][m] + a*f[i][m]; + } + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + + if (s2->flags & (1<<estCGP)) + { + /* Copy the CG p vector */ + const rvec *p1 = s1->cg_p.rvec_array(); + rvec *p2 = s2->cg_p.rvec_array(); +#pragma omp for schedule(static) nowait + for (int i = start; i < end; i++) + { + // Trivial OpenMP block that does not throw + copy_rvec(p1[i], p2[i]); + } + } + + if (DOMAINDECOMP(cr)) + { + s2->ddp_count = s1->ddp_count; + + /* OpenMP does not supported unsigned loop variables */ +#pragma omp for schedule(static) nowait 
+ for (int i = 0; i < static_cast<int>(s2->cg_gl.size()); i++) + { + s2->cg_gl[i] = s1->cg_gl[i]; + } + s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; + } + } + + if (constr) + { + dvdl_constr = 0; + validStep = + constr->apply(TRUE, TRUE, + count, 0, 1.0, + s1->x.rvec_array(), s2->x.rvec_array(), + nullptr, s2->box, + s2->lambda[efptBONDED], &dvdl_constr, + nullptr, nullptr, gmx::ConstraintVariable::Positions); + + if (cr->nnodes > 1) + { + /* This global reduction will affect performance at high + * parallelization, but we can not really avoid it. + * But usually EM is not run at high parallelization. + */ + int reductionBuffer = static_cast<int>(!validStep); + gmx_sumi(1, &reductionBuffer, cr); + validStep = (reductionBuffer == 0); + } + + // We should move this check to the different minimizers + if (!validStep && ir->eI != eiSteep) + { + gmx_fatal(FARGS, "The coordinates could not be constrained. Minimizer '%s' can not handle constraint failures, use minimizer '%s' before using '%s'.", + EI(ir->eI), EI(eiSteep), EI(ir->eI)); + } + } + + return validStep; +} + +//! Prepare EM for using domain decomposition parallellization +static void em_dd_partition_system(FILE *fplog, + const gmx::MDLogger &mdlog, + int step, const t_commrec *cr, + gmx_mtop_t *top_global, t_inputrec *ir, + em_state_t *ems, gmx_localtop_t *top, + gmx::MDAtoms *mdAtoms, t_forcerec *fr, + gmx_vsite_t *vsite, gmx::Constraints *constr, + t_nrnb *nrnb, gmx_wallcycle_t wcycle) +{ + /* Repartition the domain decomposition */ + dd_partition_system(fplog, mdlog, step, cr, FALSE, 1, + nullptr, top_global, ir, + &ems->s, &ems->f, + mdAtoms, top, fr, vsite, constr, + nrnb, wcycle, FALSE); + dd_store_state(cr->dd, &ems->s); +} + +namespace +{ + +/*! \brief Class to handle the work of setting and doing an energy evaluation. + * + * This class is a mere aggregate of parameters to pass to evaluate an + * energy, so that future changes to names and types of them consume + * less time when refactoring other code. + * + * Aggregate initialization is used, for which the chief risk is that + * if a member is added at the end and not all initializer lists are + * updated, then the member will be value initialized, which will + * typically mean initialization to zero. + * + * We only want to construct one of these with an initializer list, so + * we explicitly delete the default constructor. */ +class EnergyEvaluator +{ + public: + //! We only intend to construct such objects with an initializer list. +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9) + // Aspects of the C++11 spec changed after GCC 4.8.5, and + // compilation of the initializer list construction in + // runner.cpp fails in GCC 4.8.5. + EnergyEvaluator() = delete; +#endif + /*! \brief Evaluates an energy on the state in \c ems. + * + * \todo In practice, the same objects mu_tot, vir, and pres + * are always passed to this function, so we would rather have + * them as data members. However, their C-array types are + * unsuited for aggregate initialization. When the types + * improve, the call signature of this method can be reduced. + */ + void run(em_state_t *ems, rvec mu_tot, + tensor vir, tensor pres, + int64_t count, gmx_bool bFirst); + //! Handles logging (deprecated). + FILE *fplog; + //! Handles logging. + const gmx::MDLogger &mdlog; + //! Handles communication. + const t_commrec *cr; + //! Coordinates multi-simulations. + const gmx_multisim_t *ms; + //! Holds the simulation topology. + gmx_mtop_t *top_global; + //! Holds the domain topology. 
+ gmx_localtop_t *top; + //! User input options. + t_inputrec *inputrec; + //! Manages flop accounting. + t_nrnb *nrnb; + //! Manages wall cycle accounting. + gmx_wallcycle_t wcycle; + //! Coordinates global reduction. + gmx_global_stat_t gstat; + //! Handles virtual sites. + gmx_vsite_t *vsite; + //! Handles constraints. + gmx::Constraints *constr; + //! Handles strange things. + t_fcdata *fcd; + //! Molecular graph for SHAKE. + t_graph *graph; + //! Per-atom data for this domain. + gmx::MDAtoms *mdAtoms; + //! Handles how to calculate the forces. + t_forcerec *fr; + //! Schedule of force-calculation work each step for this task. + gmx::PpForceWorkload *ppForceWorkload; + //! Stores the computed energies. + gmx_enerdata_t *enerd; +}; + +void +EnergyEvaluator::run(em_state_t *ems, rvec mu_tot, + tensor vir, tensor pres, + int64_t count, gmx_bool bFirst) +{ + real t; + gmx_bool bNS; + tensor force_vir, shake_vir, ekin; + real dvdl_constr, prescorr, enercorr, dvdlcorr; + real terminate = 0; + + /* Set the time to the initial time, the time does not change during EM */ + t = inputrec->init_t; + + if (bFirst || + (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) + { + /* This is the first state or an old state used before the last ns */ + bNS = TRUE; + } + else + { + bNS = FALSE; + if (inputrec->nstlist > 0) + { + bNS = TRUE; + } + } + + if (vsite) + { + construct_vsites(vsite, ems->s.x.rvec_array(), 1, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, ems->s.box); + } + + if (DOMAINDECOMP(cr) && bNS) + { + /* Repartition the domain decomposition */ + em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec, + ems, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Calc force & energy on new trial position */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + do_force(fplog, cr, ms, inputrec, nullptr, nullptr, + count, nrnb, wcycle, top, &top_global->groups, + ems->s.box, ems->s.x.arrayRefWithPadding(), &ems->s.hist, + ems->f.arrayRefWithPadding(), force_vir, mdAtoms->mdatoms(), enerd, fcd, + ems->s.lambda, graph, fr, ppForceWorkload, vsite, mu_tot, t, nullptr, + GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | + GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | + (bNS ? GMX_FORCE_NS : 0), + DOMAINDECOMP(cr) ? + DdOpenBalanceRegionBeforeForceComputation::yes : + DdOpenBalanceRegionBeforeForceComputation::no, + DOMAINDECOMP(cr) ? 
+ DdCloseBalanceRegionAfterForceComputation::yes : + DdCloseBalanceRegionAfterForceComputation::no); + + /* Clear the unused shake virial and pressure */ + clear_mat(shake_vir); + clear_mat(pres); + + /* Communicate stuff when parallel */ + if (PAR(cr) && inputrec->eI != eiNM) + { + wallcycle_start(wcycle, ewcMoveE); + + global_stat(gstat, cr, enerd, force_vir, shake_vir, mu_tot, + inputrec, nullptr, nullptr, nullptr, 1, &terminate, + nullptr, FALSE, + CGLO_ENERGY | + CGLO_PRESSURE | + CGLO_CONSTRAINT); + + wallcycle_stop(wcycle, ewcMoveE); + } + + /* Calculate long range corrections to pressure and energy */ + calc_dispcorr(inputrec, fr, ems->s.box, ems->s.lambda[efptVDW], + pres, force_vir, &prescorr, &enercorr, &dvdlcorr); + enerd->term[F_DISPCORR] = enercorr; + enerd->term[F_EPOT] += enercorr; + enerd->term[F_PRES] += prescorr; + enerd->term[F_DVDL] += dvdlcorr; + + ems->epot = enerd->term[F_EPOT]; + + if (constr) + { + /* Project out the constraint components of the force */ + dvdl_constr = 0; + rvec *f_rvec = ems->f.rvec_array(); + constr->apply(FALSE, FALSE, + count, 0, 1.0, + ems->s.x.rvec_array(), f_rvec, f_rvec, + ems->s.box, + ems->s.lambda[efptBONDED], &dvdl_constr, + nullptr, &shake_vir, gmx::ConstraintVariable::ForceDispl); + enerd->term[F_DVDL_CONSTR] += dvdl_constr; + m_add(force_vir, shake_vir, vir); + } + else + { + copy_mat(force_vir, vir); + } + + clear_mat(ekin); + enerd->term[F_PRES] = + calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres); + + sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals); + + if (EI_ENERGY_MINIMIZATION(inputrec->eI)) + { + get_state_f_norm_max(cr, &(inputrec->opts), mdAtoms->mdatoms(), ems); + } +} + +} // namespace + +//! Parallel utility summing energies and forces +static double reorder_partsum(const t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *top_global, + em_state_t *s_min, em_state_t *s_b) +{ + t_block *cgs_gl; + int ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m; + double partsum; + unsigned char *grpnrFREEZE; + + if (debug) + { + fprintf(debug, "Doing reorder_partsum\n"); + } + + const rvec *fm = s_min->f.rvec_array(); + const rvec *fb = s_b->f.rvec_array(); + + cgs_gl = dd_charge_groups_global(cr->dd); + index = cgs_gl->index; + + /* Collect fm in a global vector fmg. + * This conflicts with the spirit of domain decomposition, + * but to fully optimize this a much more complicated algorithm is required. + */ + rvec *fmg; + snew(fmg, top_global->natoms); + + ncg = s_min->s.cg_gl.size(); + cg_gl = s_min->s.cg_gl.data(); + i = 0; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + copy_rvec(fm[i], fmg[a]); + i++; + } + } + gmx_sum(top_global->natoms*3, fmg[0], cr); + + /* Now we will determine the part of the sum for the cgs in state s_b */ + ncg = s_b->s.cg_gl.size(); + cg_gl = s_b->s.cg_gl.data(); + partsum = 0; + i = 0; + gf = 0; + grpnrFREEZE = top_global->groups.grpnr[egcFREEZE]; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + if (mdatoms->cFREEZE && grpnrFREEZE) + { + gf = grpnrFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + partsum += (fb[i][m] - fmg[a][m])*fb[i][m]; + } + } + i++; + } + } + + sfree(fmg); + + return partsum; +} + +//! Print some stuff, like beta, whatever that means. 
+static real pr_beta(const t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *top_global, + em_state_t *s_min, em_state_t *s_b) +{ + double sum; + + /* This is just the classical Polak-Ribiere calculation of beta; + * it looks a bit complicated since we take freeze groups into account, + * and might have to sum it in parallel runs. + */ + + if (!DOMAINDECOMP(cr) || + (s_min->s.ddp_count == cr->dd->ddp_count && + s_b->s.ddp_count == cr->dd->ddp_count)) + { + const rvec *fm = s_min->f.rvec_array(); + const rvec *fb = s_b->f.rvec_array(); + sum = 0; + int gf = 0; + /* This part of code can be incorrect with DD, + * since the atom ordering in s_b and s_min might differ. + */ + for (int i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (int m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + sum += (fb[i][m] - fm[i][m])*fb[i][m]; + } + } + } + } + else + { + /* We need to reorder cgs while summing */ + sum = reorder_partsum(cr, opts, mdatoms, top_global, s_min, s_b); + } + if (PAR(cr)) + { + gmx_sumd(1, &sum, cr); + } + + return sum/gmx::square(s_min->fnorm); +} + +namespace gmx +{ + +void +Integrator::do_cg() +{ + const char *CG = "Polak-Ribiere Conjugate Gradients"; + + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + double tmp, minstep; + real stepsize; + real a, b, c, beta = 0.0; + real epot_repl = 0; + real pnorm; + t_mdebin *mdebin; + gmx_bool converged, foundlower; + rvec mu_tot; + gmx_bool do_log = FALSE, do_ene = FALSE, do_x, do_f; + tensor vir, pres; + int number_steps, neval = 0, nstcg = inputrec->nstcgsteep; + gmx_mdoutf_t outf; + int m, step, nminstep; + auto mdatoms = mdAtoms->mdatoms(); + + GMX_LOG(mdlog.info).asParagraph(). + appendText("Note that activating conjugate gradient energy minimization via the " + "integrator .mdp option and the command gmx mdrun may " + "be available in a different form in a future version of GROMACS, " + "e.g. gmx minimize and an .mdp option."); + + step = 0; + + if (MASTER(cr)) + { + // In CG, the state is extended with a search direction + state_global->flags |= (1<<estCGP); + + // Ensure the extra per-atom state array gets allocated + state_change_natoms(state_global, state_global->natoms); + + // Initialize the search direction to zero + for (RVec &cg_p : state_global->cg_p) + { + cg_p = { 0, 0, 0 }; + } + } + + /* Create 4 states on the stack and extract pointers that we will swap */ + em_state_t s0 {}, s1 {}, s2 {}, s3 {}; + em_state_t *s_min = &s0; + em_state_t *s_a = &s1; + em_state_t *s_b = &s2; + em_state_t *s_c = &s3; + + /* Init em and store the local state in s_min */ + init_em(fplog, mdlog, CG, cr, ms, outputProvider, inputrec, mdrunOptions, + state_global, top_global, s_min, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, CG); + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + if (MASTER(cr)) + { + sp_header(stderr, CG, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, CG, inputrec->em_tol, number_steps); + } + + EnergyEvaluator energyEvaluator { + fplog, mdlog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, ppForceWorkload, enerd + }; + /* Call the force routine and some auxiliary (neighboursearching etc.) 
*/ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + energyEvaluator.run(s_min, mu_tot, vir, pres, -1, TRUE); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + matrix nullBox = {}; + upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step), + mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + print_ebin_header(fplog, step, step); + print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Estimate/guess the initial stepsize */ + stepsize = inputrec->em_stepsize/s_min->fnorm; + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(stderr, " F-Norm = %12.5e\n", + s_min->fnorm/sqrtNumAtoms); + fprintf(stderr, "\n"); + /* and copy to the log file too... */ + fprintf(fplog, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(fplog, " F-Norm = %12.5e\n", + s_min->fnorm/sqrtNumAtoms); + fprintf(fplog, "\n"); + } + /* Start the loop over CG steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + converged = FALSE; + for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++) + { + + /* start taking steps in a new direction + * First time we enter the routine, beta=0, and the direction is + * simply the negative gradient. + */ + + /* Calculate the new direction in p, and the gradient in this direction, gpa */ + rvec *pm = s_min->s.cg_p.rvec_array(); + const rvec *sfm = s_min->f.rvec_array(); + double gpa = 0; + int gf = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!inputrec->opts.nFreeze[gf][m]) + { + pm[i][m] = sfm[i][m] + beta*pm[i][m]; + gpa -= pm[i][m]*sfm[i][m]; + /* f is negative gradient, thus the sign */ + } + else + { + pm[i][m] = 0; + } + } + } + + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpa, cr); + } + + /* Calculate the norm of the search vector */ + get_f_norm_max(cr, &(inputrec->opts), mdatoms, pm, &pnorm, nullptr, nullptr); + + /* Just in case stepsize reaches zero due to numerical precision... */ + if (stepsize <= 0) + { + stepsize = inputrec->em_stepsize/pnorm; + } + + /* + * Double check the value of the derivative in the search direction. + * If it is positive it must be due to the old information in the + * CG formula, so just remove that and start over with beta=0. + * This corresponds to a steepest descent step. 
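Stripped of the freeze-group bookkeeping, the update above is the standard conjugate-gradient direction recursion; because f stores forces (negative gradients), the line derivative gpa comes out as -p.f. A minimal sketch over flat arrays (hypothetical helper, not part of the patch):

    // One CG direction update over n degrees of freedom; p[] is updated in place.
    // Returns gpa, the derivative of the energy along the new direction p.
    static double updateCgDirection(int n, const double *f, double *p, double beta)
    {
        double gpa = 0;
        for (int i = 0; i < n; i++)
        {
            p[i]  = f[i] + beta*p[i]; // p_k = -g_k + beta_k * p_(k-1)
            gpa  -= p[i]*f[i];        // dE/dalpha = g.p = -f.p
        }
        return gpa; // gpa > 0: not a descent direction, so restart with beta = 0
    }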
+ */ + if (gpa > 0) + { + beta = 0; + step--; /* Don't count this step since we are restarting */ + continue; /* Go back to the beginning of the big for-loop */ + } + + /* Calculate minimum allowed stepsize, before the average (norm) + * relative change in coordinate is smaller than precision + */ + minstep = 0; + auto s_min_x = makeArrayRef(s_min->s.x); + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + tmp = fabs(s_min_x[i][m]); + if (tmp < 1.0) + { + tmp = 1.0; + } + tmp = pm[i][m]/tmp; + minstep += tmp*tmp; + } + } + /* Add up from all CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &minstep, cr); + } + + minstep = GMX_REAL_EPS/sqrt(minstep/(3*top_global->natoms)); + + if (stepsize < minstep) + { + converged = TRUE; + break; + } + + /* Write coordinates if necessary */ + do_x = do_per_step(step, inputrec->nstxout); + do_f = do_per_step(step, inputrec->nstfout); + + write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, + top_global, inputrec, step, + s_min, state_global, observablesHistory); + + /* Take a step downhill. + * In theory, we should minimize the function along this direction. + * That is quite possible, but it turns out to take 5-10 function evaluations + * for each line. However, we don't really need to find the exact minimum - + * it is much better to start a new CG step in a modified direction as soon + * as we are close to it. This will save a lot of energy evaluations. + * + * In practice, we just try to take a single step. + * If it worked (i.e. lowered the energy), we increase the stepsize but + * then continue straight to the next CG step without trying to find any minimum. + * If it didn't work (higher energy), there must be a minimum somewhere between + * the old position and the new one. + * + * Due to the finite numerical accuracy, it turns out that it is a good idea + * to even accept a SMALL increase in energy, if the derivative is still downhill. + * This leads to lower final energies in the tests I've done. / Erik + */ + s_a->epot = s_min->epot; + a = 0.0; + c = a + stepsize; /* reference position along line is zero */ + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) + { + em_dd_partition_system(fplog, mdlog, step, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step (new coords in s_c) */ + do_em_step(cr, inputrec, mdatoms, s_min, c, &s_min->s.cg_p, s_c, + constr, -1); + + neval++; + /* Calculate energy for the trial step */ + energyEvaluator.run(s_c, mu_tot, vir, pres, -1, FALSE); + + /* Calc derivative along line */ + const rvec *pc = s_c->s.cg_p.rvec_array(); + const rvec *sfc = s_c->f.rvec_array(); + double gpc = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpc -= pc[i][m]*sfc[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpc, cr); + } + + /* This is the max amount of increase in energy we tolerate */ + tmp = std::sqrt(GMX_REAL_EPS)*fabs(s_a->epot); + + /* Accept the step if the energy is lower, or if it is not significantly higher + * and the line derivative is still negative. + */ + if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) + { + foundlower = TRUE; + /* Great, we found a better energy.
Increase step for next iteration + * if we are still going down, decrease it otherwise + */ + if (gpc < 0) + { + stepsize *= 1.618034; /* The golden section */ + } + else + { + stepsize *= 0.618034; /* 1/golden section */ + } + } + else + { + /* New energy is the same or higher. We will have to do some work + * to find a smaller value in the interval. Take smaller step next time! + */ + foundlower = FALSE; + stepsize *= 0.618034; + } + + + + + /* OK, if we didn't find a lower value we will have to locate one now - there must + * be one in the interval [a=0,c]. + * The same thing is valid here, though: Don't spend dozens of iterations to find + * the line minimum. We try to interpolate based on the derivative at the endpoints, + * and only continue until we find a lower value. In most cases this means 1-2 iterations. + * + * I also have a safeguard for potentially really pathological functions so we never + * take more than 20 steps before we give up ... + * + * If we already found a lower value we just skip this step and continue to the update. + */ + double gpb; + if (!foundlower) + { + nminstep = 0; + + do + { + /* Select a new trial point. + * If the derivatives at points a & c have different sign we interpolate to zero, + * otherwise just do a bisection. + */ + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, mdlog, -1, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step to this new point - new coords in s_b */ + do_em_step(cr, inputrec, mdatoms, s_min, b, &s_min->s.cg_p, s_b, + constr, -1); + + neval++; + /* Calculate energy for the trial step */ + energyEvaluator.run(s_b, mu_tot, vir, pres, -1, FALSE); + + /* p does not change within a step, but since the domain decomposition + * might change, we have to use cg_p of s_b here. + */ + const rvec *pb = s_b->s.cg_p.rvec_array(); + const rvec *sfb = s_b->f.rvec_array(); + gpb = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpb -= pb[i][m]*sfb[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + if (debug) + { + fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", + s_a->epot, s_b->epot, s_c->epot, gpb); + } + + epot_repl = s_b->epot; + + /* Keep one of the intervals based on the value of the derivative at the new point */ + if (gpb > 0) + { + /* Replace c endpoint with b */ + swap_em_state(&s_b, &s_c); + c = b; + gpc = gpb; + } + else + { + /* Replace a endpoint with b */ + swap_em_state(&s_b, &s_a); + a = b; + gpa = gpb; + } + + /* + * Stop search as soon as we find a value smaller than the endpoints. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && + (nminstep < 20)); + + if (std::fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS || + nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. + * If beta==0 this was steepest descent, and then we give up. + * If not, set beta=0 and restart with steepest descent before quitting. 
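The trial point b chosen inside this loop is a secant step on the line derivative: modelling the derivative linearly between the endpoints, g(x) ~ gpa + (x - a)(gpc - gpa)/(c - a), and solving g(b) = 0 gives exactly b = a + gpa(a - c)/(gpc - gpa), with bisection as fallback and safeguard. The selection rule on its own (hypothetical helper):

    // Pick a trial point in (a, c) from the line derivatives gpa at a and gpc at c.
    static double pickTrialPoint(double a, double c, double gpa, double gpc)
    {
        double b;
        if (gpa < 0 && gpc > 0)
        {
            b = a + gpa*(a - c)/(gpc - gpa); // derivatives bracket a zero: secant step
        }
        else
        {
            b = 0.5*(a + c);                 // no sign change: plain bisection
        }
        if (b <= a || b >= c)                // round-off guard: stay inside the interval
        {
            b = 0.5*(a + c);
        }
        return b;
    }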
+ */ + if (beta == 0.0) + { + /* Converged */ + converged = TRUE; + break; + } + else + { + /* Reset memory before giving up */ + beta = 0.0; + continue; + } + } + + /* Select min energy state of A & C, put the best in B. + */ + if (s_c->epot < s_a->epot) + { + if (debug) + { + fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", + s_c->epot, s_a->epot); + } + swap_em_state(&s_b, &s_c); + gpb = gpc; + } + else + { + if (debug) + { + fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", + s_a->epot, s_c->epot); + } + swap_em_state(&s_b, &s_a); + gpb = gpa; + } + + } + else + { + if (debug) + { + fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", + s_c->epot); + } + swap_em_state(&s_b, &s_c); + gpb = gpc; + } + + /* new search direction */ + /* beta = 0 means forget all memory and restart with steepest descents. */ + if (nstcg && ((step % nstcg) == 0)) + { + beta = 0.0; + } + else + { + /* s_min->fnorm cannot be zero, because then we would have converged + * and broken out. + */ + + /* Polak-Ribiere update. + * Change to fnorm2/fnorm2_old for Fletcher-Reeves + */ + beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b); + } + /* Limit beta to prevent oscillations */ + if (fabs(beta) > 5.0) + { + beta = 0.0; + } + + + /* update positions */ + swap_em_state(&s_min, &s_b); + gpa = gpb; + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", + step, s_min->epot, s_min->fnorm/sqrtNumAtoms, + s_min->fmax, s_min->a_fmax+1); + fflush(stderr); + } + /* Store the new (lower) energies */ + matrix nullBox = {}; + upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step), + mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + do_log = do_per_step(step, inputrec->nstlog); + do_ene = do_per_step(step, inputrec->nstenergy); + + /* Prepare IMD energy record, if bIMD is TRUE. */ + IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE); + + if (do_log) + { + print_ebin_header(fplog, step, step); + } + print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, + do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Send energies and positions to the IMD client if bIMD is TRUE. */ + if (MASTER(cr) && do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x.rvec_array(), inputrec, 0, wcycle)) + { + IMD_send_positions(inputrec->imd); + } + + /* Stop when the maximum force lies below tolerance. + * If we have reached machine precision, converged is already set to true. + */ + converged = converged || (s_min->fmax < inputrec->em_tol); + + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + if (converged) + { + step--; /* we never took that last step in this case */ + + } + if (s_min->fmax > inputrec->em_tol) + { + if (MASTER(cr)) + { + warn_step(fplog, inputrec->em_tol, s_min->fmax, + step-1 == number_steps, FALSE); + } + converged = FALSE; + } + + if (MASTER(cr)) + { + /* If we printed energy and/or logfile last step (which was the last step) + * we don't have to do it again, but otherwise print the final values. 
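For reference, pr_beta() above evaluates the Polak-Ribiere expression beta = sum_i (f_b,i - f_min,i).f_b,i / |f_min|^2, and the caller then discards the CG memory in two situations: every nstcg-th step and whenever |beta| > 5. That reset policy, restated as a compact hypothetical helper:

    #include <cmath>

    // Decide the beta actually used for the next CG direction.
    // betaPR is the raw Polak-Ribiere value; nstcg is inputrec->nstcgsteep.
    static double nextBeta(int step, int nstcg, double betaPR)
    {
        if (nstcg != 0 && (step % nstcg) == 0)
        {
            return 0.0; // periodic restart: take a steepest-descent step
        }
        return (std::fabs(betaPR) > 5.0) ? 0.0 : betaPR; // clamp to damp oscillations
    }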
+ */ + if (!do_log) + { + /* Write final value to log since we didn't do anything the last step */ + print_ebin_header(fplog, step, step); + } + if (!do_ene || !do_log) + { + /* Write final energy file entries */ + print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, + !do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + } + + /* Print some stuff... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + + /* IMPORTANT! + * For accurate normal mode calculation it is imperative that we + * store the last conformation into the full precision binary trajectory. + * + * However, we should only do it if we did NOT already write this step + * above (which we did if do_x or do_f was true). + */ + /* Note that with 0 < nstfout != nstxout we can end up with two frames + * in the trajectory with the same step number. + */ + do_x = !do_per_step(step, inputrec->nstxout); + do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout)); + + write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, step, + s_min, state_global, observablesHistory); + + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, + s_min, sqrtNumAtoms); + print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, + s_min, sqrtNumAtoms); + + fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + walltime_accounting_set_nsteps_done(walltime_accounting, step); +} + + +void +Integrator::do_lbfgs() +{ + static const char *LBFGS = "Low-Memory BFGS Minimizer"; + em_state_t ems; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + int ncorr, nmaxcorr, point, cp, neval, nminstep; + double stepsize, step_taken, gpa, gpb, gpc, tmp, minstep; + real *rho, *alpha, *p, *s, **dx, **dg; + real a, b, c, maxdelta, delta; + real diag, Epot0; + real dgdx, dgdg, sq, yr, beta; + t_mdebin *mdebin; + gmx_bool converged; + rvec mu_tot; + gmx_bool do_log, do_ene, do_x, do_f, foundlower, *frozen; + tensor vir, pres; + int start, end, number_steps; + gmx_mdoutf_t outf; + int i, k, m, n, gf, step; + int mdof_flags; + auto mdatoms = mdAtoms->mdatoms(); + + GMX_LOG(mdlog.info).asParagraph(). + appendText("Note that activating L-BFGS energy minimization via the " + "integrator .mdp option and the command gmx mdrun may " + "be available in a different form in a future version of GROMACS, " + "e.g. gmx minimize and an .mdp option."); + + if (PAR(cr)) + { + gmx_fatal(FARGS, "L-BFGS minimization only supports a single rank"); + } + + if (nullptr != constr) + { + gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. 
steepest descent)."); + } + + n = 3*state_global->natoms; + nmaxcorr = inputrec->nbfgscorr; + + snew(frozen, n); + + snew(p, n); + snew(rho, nmaxcorr); + snew(alpha, nmaxcorr); + + snew(dx, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dx[i], n); + } + + snew(dg, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dg[i], n); + } + + step = 0; + neval = 0; + + /* Init em */ + init_em(fplog, mdlog, LBFGS, cr, ms, outputProvider, inputrec, mdrunOptions, + state_global, top_global, &ems, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + start = 0; + end = mdatoms->homenr; + + /* We need 4 working states */ + em_state_t s0 {}, s1 {}, s2 {}, s3 {}; + em_state_t *sa = &s0; + em_state_t *sb = &s1; + em_state_t *sc = &s2; + em_state_t *last = &s3; + /* Initialize by copying the state from ems (we could skip x and f here) */ + *sa = ems; + *sb = ems; + *sc = ems; + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS); + + do_log = do_ene = do_x = do_f = TRUE; + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + /* Create a 3*natoms index to tell whether each degree of freedom is frozen */ + gf = 0; + for (i = start; i < end; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + frozen[3*i+m] = (inputrec->opts.nFreeze[gf][m] != 0); + } + } + if (MASTER(cr)) + { + sp_header(stderr, LBFGS, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, LBFGS, inputrec->em_tol, number_steps); + } + + if (vsite) + { + construct_vsites(vsite, state_global->x.rvec_array(), 1, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state_global->box); + } + + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole + */ + neval++; + EnergyEvaluator energyEvaluator { + fplog, mdlog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, ppForceWorkload, enerd + }; + energyEvaluator.run(&ems, mu_tot, vir, pres, -1, TRUE); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + matrix nullBox = {}; + upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step), + mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + print_ebin_header(fplog, step, step); + print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Set the initial step. + * since it will be multiplied by the non-normalized search direction + * vector (force vector the first time), we scale it by the + * norm of the force. + */ + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(stderr, " F-max = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1); + fprintf(stderr, " F-Norm = %12.5e\n", ems.fnorm/sqrtNumAtoms); + fprintf(stderr, "\n"); + /* and copy to the log file too... 
*/ + fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(fplog, " F-max = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1); + fprintf(fplog, " F-Norm = %12.5e\n", ems.fnorm/sqrtNumAtoms); + fprintf(fplog, "\n"); + } + + // Point is an index to the memory of search directions, where 0 is the first one. + point = 0; + + // Set initial search direction to the force (-gradient), or 0 for frozen particles. + real *fInit = static_cast<real *>(ems.f.rvec_array()[0]); + for (i = 0; i < n; i++) + { + if (!frozen[i]) + { + dx[point][i] = fInit[i]; /* Initial search direction */ + } + else + { + dx[point][i] = 0; + } + } + + // Stepsize will be modified during the search, and actually it is not critical + // (the main efficiency in the algorithm comes from changing directions), but + // we still need an initial value, so estimate it as the inverse of the norm + // so we take small steps where the potential fluctuates a lot. + stepsize = 1.0/ems.fnorm; + + /* Start the loop over BFGS steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + + ncorr = 0; + + /* Set the gradient from the force */ + converged = FALSE; + for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++) + { + + /* Write coordinates if necessary */ + do_x = do_per_step(step, inputrec->nstxout); + do_f = do_per_step(step, inputrec->nstfout); + + mdof_flags = 0; + if (do_x) + { + mdof_flags |= MDOF_X; + } + + if (do_f) + { + mdof_flags |= MDOF_F; + } + + if (inputrec->bIMD) + { + mdof_flags |= MDOF_IMD; + } + + mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, + top_global, step, static_cast<real>(step), &ems.s, state_global, observablesHistory, ems.f); + + /* Do the linesearching in the direction dx[point][0..(n-1)] */ + + /* make s a pointer to current search direction - point=0 first time we get here */ + s = dx[point]; + + real *xx = static_cast<real *>(ems.s.x.rvec_array()[0]); + real *ff = static_cast<real *>(ems.f.rvec_array()[0]); + + // calculate line gradient in position A + for (gpa = 0, i = 0; i < n; i++) + { + gpa -= s[i]*ff[i]; + } + + /* Calculate minimum allowed stepsize along the line, before the average (norm) + * relative change in coordinate is smaller than precision + */ + for (minstep = 0, i = 0; i < n; i++) + { + tmp = fabs(xx[i]); + if (tmp < 1.0) + { + tmp = 1.0; + } + tmp = s[i]/tmp; + minstep += tmp*tmp; + } + minstep = GMX_REAL_EPS/sqrt(minstep/n); + + if (stepsize < minstep) + { + converged = TRUE; + break; + } + + // Before taking any steps along the line, store the old position + *last = ems; + real *lastx = static_cast<real *>(last->s.x.data()[0]); + real *lastf = static_cast<real *>(last->f.data()[0]); + Epot0 = ems.epot; + + *sa = ems; + + /* Take a step downhill. + * In theory, we should find the actual minimum of the function in this + * direction, somewhere along the line. + * That is quite possible, but it turns out to take 5-10 function evaluations + * for each line. However, we dont really need to find the exact minimum - + * it is much better to start a new BFGS step in a modified direction as soon + * as we are close to it. This will save a lot of energy evaluations. + * + * In practice, we just try to take a single step. + * If it worked (i.e. lowered the energy), we increase the stepsize but + * continue straight to the next BFGS step without trying to find any minimum, + * i.e. we change the search direction too. 
If the line was smooth, it is + likely we are in a smooth region, and then it makes sense to take longer + steps in the modified search direction too. + * + * If it didn't work (higher energy), there must be a minimum somewhere between + * the old position and the new one. Then we need to start by finding a lower + * value before we change search direction. Since the energy was apparently + * quite rough, we need to decrease the step size. + * + * Due to the finite numerical accuracy, it turns out that it is a good idea + * to accept a SMALL increase in energy, if the derivative is still downhill. + * This leads to lower final energies in the tests I've done. / Erik + */ + + // State "A" is the first position along the line. + // reference position along line is initially zero + a = 0.0; + + // Check stepsize first. We do not allow displacements + // larger than emstep. + // + do + { + // Pick a new position C by adding stepsize to A. + c = a + stepsize; + + // Calculate what the largest change in any individual coordinate + // would be (translation along line * gradient along line) + maxdelta = 0; + for (i = 0; i < n; i++) + { + delta = c*s[i]; + if (delta > maxdelta) + { + maxdelta = delta; + } + } + // If any displacement is larger than the stepsize limit, reduce the step + if (maxdelta > inputrec->em_stepsize) + { + stepsize *= 0.1; + } + } + while (maxdelta > inputrec->em_stepsize); + + // Take a trial step and move the coordinate array xc[] to position C + real *xc = static_cast<real *>(sc->s.x.rvec_array()[0]); + for (i = 0; i < n; i++) + { + xc[i] = lastx[i] + c*s[i]; + } + + neval++; + // Calculate energy for the trial step in position C + energyEvaluator.run(sc, mu_tot, vir, pres, step, FALSE); + + // Calc line gradient in position C + real *fc = static_cast<real *>(sc->f.rvec_array()[0]); + for (gpc = 0, i = 0; i < n; i++) + { + gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */ + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpc, cr); + } + + // This is the max amount of increase in energy we tolerate. + // By allowing VERY small changes (close to numerical precision) we + // frequently find even better (lower) final energies. + tmp = std::sqrt(GMX_REAL_EPS)*fabs(sa->epot); + + // Accept the step if the energy is lower in the new position C (compared to A), + // or if it is not significantly higher and the line derivative is still negative. + foundlower = sc->epot < sa->epot || (gpc < 0 && sc->epot < (sa->epot + tmp)); + // If true, great, we found a better energy. We no longer try to alter the + // stepsize, but simply accept this new better position. Then we select a new + // search direction instead, which will be much more efficient than continuing + // to take smaller steps along a line. Set fnorm based on the new C position, + // which will be used to update the stepsize to 1/fnorm further down. + + // If false, the energy is NOT lower in point C, i.e. it will be the same + // or higher than in point A. In this case it is pointless to move to point C, + // so we will have to do more iterations along the same line to find a smaller + // value in the interval [A=0.0,C]. + // Here, A is still 0.0, but that will change when we do a search in the interval + // [0.0,C] below. That search we will do by interpolation or bisection rather + // than with the stepsize, so no need to modify it. For the next search direction + // it will be reset to 1/fnorm anyway.
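The do/while just shown caps the largest single-coordinate displacement at em_stepsize by shrinking the step tenfold until the cap holds. As a standalone sketch over a flat search direction s[] of length n (hypothetical helper):

    // Shrink 'stepsize' until the largest per-coordinate move c*s[i] respects maxStep.
    static double capStepsize(double a, double stepsize, int n,
                              const double *s, double maxStep)
    {
        double maxdelta;
        do
        {
            const double c = a + stepsize;   // candidate position along the line
            maxdelta = 0;
            for (int i = 0; i < n; i++)
            {
                const double delta = c*s[i]; // displacement of coordinate i
                if (delta > maxdelta)
                {
                    maxdelta = delta;
                }
            }
            if (maxdelta > maxStep)
            {
                stepsize *= 0.1;             // too bold: shrink and try again
            }
        }
        while (maxdelta > maxStep);
        return stepsize;
    }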
+ + if (!foundlower) + { + // OK, if we didn't find a lower value we will have to locate one now - there must + // be one in the interval [a,c]. + // The same thing is valid here, though: Don't spend dozens of iterations to find + // the line minimum. We try to interpolate based on the derivative at the endpoints, + // and only continue until we find a lower value. In most cases this means 1-2 iterations. + // I also have a safeguard for potentially really pathological functions so we never + // take more than 20 steps before we give up. + // If we already found a lower value we just skip this step and continue to the update. + real fnorm = 0; + nminstep = 0; + do + { + // Select a new trial point B in the interval [A,C]. + // If the derivatives at points a & c have different sign we interpolate to zero, + // otherwise just do a bisection since there might be multiple minima/maxima + // inside the interval. + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + // Take a trial step to point B + real *xb = static_cast<real *>(sb->s.x.rvec_array()[0]); + for (i = 0; i < n; i++) + { + xb[i] = lastx[i] + b*s[i]; + } + + neval++; + // Calculate energy for the trial step in point B + energyEvaluator.run(sb, mu_tot, vir, pres, step, FALSE); + fnorm = sb->fnorm; + + // Calculate gradient in point B + real *fb = static_cast<real *>(sb->f.rvec_array()[0]); + for (gpb = 0, i = 0; i < n; i++) + { + gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */ + + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative + // at the new point B, and rename the endpoints of this new interval A and C. + if (gpb > 0) + { + /* Replace c endpoint with b */ + c = b; + /* copy state b to c */ + *sc = *sb; + } + else + { + /* Replace a endpoint with b */ + a = b; + /* copy state b to a */ + *sa = *sb; + } + + /* + * Stop search as soon as we find a value smaller than the endpoints, + * or if the tolerance is below machine precision. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((sb->epot > sa->epot || sb->epot > sc->epot) && (nminstep < 20)); + + if (std::fabs(sb->epot - Epot0) < GMX_REAL_EPS || nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. + * If ncorr==0 this was steepest descent, and then we give up. + * If not, reset memory to restart as steepest descent before quitting. 
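+                 * (Resetting ncorr to zero discards the stored curvature pairs,
+                 * so the next search direction is the bare gradient; only if
+                 * that steepest-descent restart also fails to lower the energy
+                 * do we declare convergence.)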
+ */ + if (ncorr == 0) + { + /* Converged */ + converged = TRUE; + break; + } + else + { + /* Reset memory */ + ncorr = 0; + /* Search in gradient direction */ + for (i = 0; i < n; i++) + { + dx[point][i] = ff[i]; + } + /* Reset stepsize */ + stepsize = 1.0/fnorm; + continue; + } + } + + /* Select min energy state of A & C, put the best in xx/ff/Epot + */ + if (sc->epot < sa->epot) + { + /* Use state C */ + ems = *sc; + step_taken = c; + } + else + { + /* Use state A */ + ems = *sa; + step_taken = a; + } + + } + else + { + /* found lower */ + /* Use state C */ + ems = *sc; + step_taken = c; + } + + /* Update the memory information, and calculate a new + * approximation of the inverse hessian + */ + + /* Have new data in Epot, xx, ff */ + if (ncorr < nmaxcorr) + { + ncorr++; + } + + for (i = 0; i < n; i++) + { + dg[point][i] = lastf[i]-ff[i]; + dx[point][i] *= step_taken; + } + + dgdg = 0; + dgdx = 0; + for (i = 0; i < n; i++) + { + dgdg += dg[point][i]*dg[point][i]; + dgdx += dg[point][i]*dx[point][i]; + } + + diag = dgdx/dgdg; + + rho[point] = 1.0/dgdx; + point++; + + if (point >= nmaxcorr) + { + point = 0; + } + + /* Update */ + for (i = 0; i < n; i++) + { + p[i] = ff[i]; + } + + cp = point; + + /* Recursive update. First go back over the memory points */ + for (k = 0; k < ncorr; k++) + { + cp--; + if (cp < 0) + { + cp = ncorr-1; + } + + sq = 0; + for (i = 0; i < n; i++) + { + sq += dx[cp][i]*p[i]; + } + + alpha[cp] = rho[cp]*sq; + + for (i = 0; i < n; i++) + { + p[i] -= alpha[cp]*dg[cp][i]; + } + } + + for (i = 0; i < n; i++) + { + p[i] *= diag; + } + + /* And then go forward again */ + for (k = 0; k < ncorr; k++) + { + yr = 0; + for (i = 0; i < n; i++) + { + yr += p[i]*dg[cp][i]; + } + + beta = rho[cp]*yr; + beta = alpha[cp]-beta; + + for (i = 0; i < n; i++) + { + p[i] += beta*dx[cp][i]; + } + + cp++; + if (cp >= ncorr) + { + cp = 0; + } + } + + for (i = 0; i < n; i++) + { + if (!frozen[i]) + { + dx[point][i] = p[i]; + } + else + { + dx[point][i] = 0; + } + } + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", + step, ems.epot, ems.fnorm/sqrtNumAtoms, ems.fmax, ems.a_fmax + 1); + fflush(stderr); + } + /* Store the new (lower) energies */ + matrix nullBox = {}; + upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step), + mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + do_log = do_per_step(step, inputrec->nstlog); + do_ene = do_per_step(step, inputrec->nstenergy); + if (do_log) + { + print_ebin_header(fplog, step, step); + } + print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, + do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Send x and E to IMD client, if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x.rvec_array(), inputrec, 0, wcycle) && MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + // Reset stepsize in we are doing more iterations + stepsize = 1.0; + + /* Stop when the maximum force lies below tolerance. + * If we have reached machine precision, converged is already set to true. + */ + converged = converged || (ems.fmax < inputrec->em_tol); + + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. 
*/ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + if (converged) + { + step--; /* we never took that last step in this case */ + + } + if (ems.fmax > inputrec->em_tol) + { + if (MASTER(cr)) + { + warn_step(fplog, inputrec->em_tol, ems.fmax, + step-1 == number_steps, FALSE); + } + converged = FALSE; + } + + /* If we printed energy and/or logfile last step (which was the last step) + * we don't have to do it again, but otherwise print the final values. + */ + if (!do_log) /* Write final value to log since we didn't do anythin last step */ + { + print_ebin_header(fplog, step, step); + } + if (!do_ene || !do_log) /* Write final energy file entries */ + { + print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, + !do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Print some stuff... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + + /* IMPORTANT! + * For accurate normal mode calculation it is imperative that we + * store the last conformation into the full precision binary trajectory. + * + * However, we should only do it if we did NOT already write this step + * above (which we did if do_x or do_f was true). + */ + do_x = !do_per_step(step, inputrec->nstxout); + do_f = !do_per_step(step, inputrec->nstfout); + write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, step, + &ems, state_global, observablesHistory); + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + print_converged(stderr, LBFGS, inputrec->em_tol, step, converged, + number_steps, &ems, sqrtNumAtoms); + print_converged(fplog, LBFGS, inputrec->em_tol, step, converged, + number_steps, &ems, sqrtNumAtoms); + + fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + walltime_accounting_set_nsteps_done(walltime_accounting, step); +} + +void +Integrator::do_steep() +{ + const char *SD = "Steepest Descents"; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + real stepsize; + real ustep; + gmx_mdoutf_t outf; + t_mdebin *mdebin; + gmx_bool bDone, bAbort, do_x, do_f; + tensor vir, pres; + rvec mu_tot; + int nsteps; + int count = 0; + int steps_accepted = 0; + auto mdatoms = mdAtoms->mdatoms(); + + GMX_LOG(mdlog.info).asParagraph(). + appendText("Note that activating steepest-descent energy minimization via the " + "integrator .mdp option and the command gmx mdrun may " + "be available in a different form in a future version of GROMACS, " + "e.g. gmx minimize and an .mdp option."); + + /* Create 2 states on the stack and extract pointers that we will swap */ + em_state_t s0 {}, s1 {}; + em_state_t *s_min = &s0; + em_state_t *s_try = &s1; + + /* Init em and store the local state in s_try */ + init_em(fplog, mdlog, SD, cr, ms, outputProvider, inputrec, mdrunOptions, + state_global, top_global, s_try, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, SD); + + /* Set variables for stepsize (in nm). This is the largest + * step that we are going to make in any direction. 
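+     * Below, stepsize = ustep/fmax, so the atom feeling the largest force
+     * is displaced by exactly ustep and every other atom proportionally
+     * less; each accepted step then grows ustep by a factor 1.2.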
+ */ + ustep = inputrec->em_stepsize; + stepsize = 0; + + /* Max number of steps */ + nsteps = inputrec->nsteps; + + if (MASTER(cr)) + { + /* Print to the screen */ + sp_header(stderr, SD, inputrec->em_tol, nsteps); + } + if (fplog) + { + sp_header(fplog, SD, inputrec->em_tol, nsteps); + } + EnergyEvaluator energyEvaluator { + fplog, mdlog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, ppForceWorkload, enerd + }; + + /**** HERE STARTS THE LOOP **** + * count is the counter for the number of steps + * bDone will be TRUE when the minimization has converged + * bAbort will be TRUE when nsteps steps have been performed or when + * the stepsize becomes smaller than is reasonable for machine precision + */ + count = 0; + bDone = FALSE; + bAbort = FALSE; + while (!bDone && !bAbort) + { + bAbort = (nsteps >= 0) && (count == nsteps); + + /* set new coordinates, except for first step */ + bool validStep = true; + if (count > 0) + { + validStep = + do_em_step(cr, inputrec, mdatoms, + s_min, stepsize, &s_min->f, s_try, + constr, count); + } + + if (validStep) + { + energyEvaluator.run(s_try, mu_tot, vir, pres, count, count == 0); + } + else + { + // Signal constraint error during stepping with energy=inf + s_try->epot = std::numeric_limits<real>::infinity(); + } + + if (MASTER(cr)) + { + print_ebin_header(fplog, count, count); + } + + if (count == 0) + { + s_min->epot = s_try->epot; + } + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c", + count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1, + ( (count == 0) || (s_try->epot < s_min->epot) ) ? '\n' : '\r'); + fflush(stderr); + } + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + /* Store the new (lower) energies */ + matrix nullBox = {}; + upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(count), + mdatoms->tmass, enerd, nullptr, nullptr, nullptr, + nullBox, nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + /* Prepare IMD energy record, if bIMD is TRUE. */ + IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE); + + print_ebin(mdoutf_get_fp_ene(outf), TRUE, + do_per_step(steps_accepted, inputrec->nstdisreout), + do_per_step(steps_accepted, inputrec->nstorireout), + fplog, count, count, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + fflush(fplog); + } + } + + /* Now if the new energy is smaller than the previous... + * or if this is the first step! + * or if we did random steps! + */ + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + steps_accepted++; + + /* Test whether the convergence criterion is met... */ + bDone = (s_try->fmax < inputrec->em_tol); + + /* Copy the arrays for force, positions and energy */ + /* The 'Min' array always holds the coords and forces of the minimal + sampled energy */ + swap_em_state(&s_min, &s_try); + if (count > 0) + { + ustep *= 1.2; + } + + /* Write to trn, if necessary */ + do_x = do_per_step(steps_accepted, inputrec->nstxout); + do_f = do_per_step(steps_accepted, inputrec->nstfout); + write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, + top_global, inputrec, count, + s_min, state_global, observablesHistory); + } + else + { + /* If energy is not smaller make the step smaller... 
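+         * (Each rejection halves ustep, so after k consecutive rejections the
+         * trial displacement has shrunk by a factor 2^k; the abort check below
+         * stops the search once ustep drops under 1e-6 nm in single precision,
+         * 1e-12 nm in double.)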
*/ + ustep *= 0.5; + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + } + + /* Determine new step */ + stepsize = ustep/s_min->fmax; + + /* Check if stepsize is too small, with 1 nm as a characteristic length */ +#if GMX_DOUBLE + if (count == nsteps || ustep < 1e-12) +#else + if (count == nsteps || ustep < 1e-6) +#endif + { + if (MASTER(cr)) + { + warn_step(fplog, inputrec->em_tol, s_min->fmax, + count == nsteps, constr != nullptr); + } + bAbort = TRUE; + } + + /* Send IMD energies and positions, if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box, + MASTER(cr) ? state_global->x.rvec_array() : nullptr, + inputrec, 0, wcycle) && + MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + count++; + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + /* Print some data... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout != 0, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, count, + s_min, state_global, observablesHistory); + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + + print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, + s_min, sqrtNumAtoms); + print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, + s_min, sqrtNumAtoms); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + inputrec->nsteps = count; + + walltime_accounting_set_nsteps_done(walltime_accounting, count); +} + +void +Integrator::do_nm() +{ + const char *NM = "Normal Mode Analysis"; + gmx_mdoutf_t outf; + int nnodes, node; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + tensor vir, pres; + rvec mu_tot; + rvec *dfdx; + gmx_bool bSparse; /* use sparse matrix storage format */ + size_t sz; + gmx_sparsematrix_t * sparse_matrix = nullptr; + real * full_matrix = nullptr; + + /* added with respect to mdrun */ + int row, col; + real der_range = 10.0*std::sqrt(GMX_REAL_EPS); + real x_min; + bool bIsMaster = MASTER(cr); + auto mdatoms = mdAtoms->mdatoms(); + + GMX_LOG(mdlog.info).asParagraph(). + appendText("Note that activating normal-mode analysis via the integrator " + ".mdp option and the command gmx mdrun may " + "be available in a different form in a future version of GROMACS, " + "e.g. 
gmx normal-modes."); + + if (constr != nullptr) + { + gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported"); + } + + gmx_shellfc_t *shellfc; + + em_state_t state_work {}; + + /* Init em and store the local state in state_minimum */ + init_em(fplog, mdlog, NM, cr, ms, outputProvider, inputrec, mdrunOptions, + state_global, top_global, &state_work, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, &shellfc, + nfile, fnm, &outf, nullptr, wcycle); + + std::vector<int> atom_index = get_atom_index(top_global); + std::vector<gmx::RVec> fneg(atom_index.size(), {0, 0, 0}); + snew(dfdx, atom_index.size()); + +#if !GMX_DOUBLE + if (bIsMaster) + { + fprintf(stderr, + "NOTE: This version of GROMACS has been compiled in single precision,\n" + " which MIGHT not be accurate enough for normal mode analysis.\n" + " GROMACS now uses sparse matrix storage, so the memory requirements\n" + " are fairly modest even if you recompile in double precision.\n\n"); + } +#endif + + /* Check if we can/should use sparse storage format. + * + * Sparse format is only useful when the Hessian itself is sparse, which it + * will be when we use a cutoff. + * For small systems (n<1000) it is easier to always use full matrix format, though. + */ + if (EEL_FULL(fr->ic->eeltype) || fr->rlist == 0.0) + { + GMX_LOG(mdlog.warning).appendText("Non-cutoff electrostatics used, forcing full Hessian format."); + bSparse = FALSE; + } + else if (atom_index.size() < 1000) + { + GMX_LOG(mdlog.warning).appendTextFormatted("Small system size (N=%zu), using full Hessian format.", + atom_index.size()); + bSparse = FALSE; + } + else + { + GMX_LOG(mdlog.warning).appendText("Using compressed symmetric sparse Hessian format."); + bSparse = TRUE; + } + + /* Number of dimensions, based on real atoms, that is not vsites or shell */ + sz = DIM*atom_index.size(); + + fprintf(stderr, "Allocating Hessian memory...\n\n"); + + if (bSparse) + { + sparse_matrix = gmx_sparsematrix_init(sz); + sparse_matrix->compressed_symmetric = TRUE; + } + else + { + snew(full_matrix, sz*sz); + } + + init_nrnb(nrnb); + + + /* Write start time and temperature */ + print_em_start(fplog, cr, walltime_accounting, wcycle, NM); + + /* fudge nr of steps to nr of atoms */ + inputrec->nsteps = atom_index.size()*2; + + if (bIsMaster) + { + fprintf(stderr, "starting normal mode calculation '%s'\n%" PRId64 " steps.\n\n", + *(top_global->name), inputrec->nsteps); + } + + nnodes = cr->nnodes; + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + EnergyEvaluator energyEvaluator { + fplog, mdlog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, ppForceWorkload, enerd + }; + energyEvaluator.run(&state_work, mu_tot, vir, pres, -1, TRUE); + cr->nnodes = nnodes; + + /* if forces are not small, warn user */ + get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, &state_work); + + GMX_LOG(mdlog.warning).appendTextFormatted("Maximum force:%12.5e", state_work.fmax); + if (state_work.fmax > 1.0e-3) + { + GMX_LOG(mdlog.warning).appendText( + "The force is probably not small enough to " + "ensure that you are at a minimum.\n" + "Be aware that negative eigenvalues may occur\n" + "when the resulting matrix is diagonalized."); + } + + /*********************************************************** + * + * Loop over all pairs in matrix + * + * do_force called twice. 
Once with positive and + * once with negative displacement + * + ************************************************************/ + + /* Steps are divided one by one over the nodes */ + bool bNS = true; + auto state_work_x = makeArrayRef(state_work.s.x); + auto state_work_f = makeArrayRef(state_work.f); + for (unsigned int aid = cr->nodeid; aid < atom_index.size(); aid += nnodes) + { + size_t atom = atom_index[aid]; + for (size_t d = 0; d < DIM; d++) + { + int64_t step = 0; + int force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES; + double t = 0; + + x_min = state_work_x[atom][d]; + + for (unsigned int dx = 0; (dx < 2); dx++) + { + if (dx == 0) + { + state_work_x[atom][d] = x_min - der_range; + } + else + { + state_work_x[atom][d] = x_min + der_range; + } + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + if (shellfc) + { + /* Now is the time to relax the shells */ + relax_shell_flexcon(fplog, + cr, + ms, + mdrunOptions.verbose, + nullptr, + step, + inputrec, + bNS, + force_flags, + top, + constr, + enerd, + fcd, + &state_work.s, + state_work.f.arrayRefWithPadding(), + vir, + mdatoms, + nrnb, + wcycle, + graph, + &top_global->groups, + shellfc, + fr, + ppForceWorkload, + t, + mu_tot, + vsite, + DdOpenBalanceRegionBeforeForceComputation::no, + DdCloseBalanceRegionAfterForceComputation::no); + bNS = false; + step++; + } + else + { + energyEvaluator.run(&state_work, mu_tot, vir, pres, aid*2+dx, FALSE); + } + + cr->nnodes = nnodes; + + if (dx == 0) + { + std::copy(state_work_f.begin(), state_work_f.begin()+atom_index.size(), fneg.begin()); + } + } + + /* x is restored to original */ + state_work_x[atom][d] = x_min; + + for (size_t j = 0; j < atom_index.size(); j++) + { + for (size_t k = 0; (k < DIM); k++) + { + dfdx[j][k] = + -(state_work_f[atom_index[j]][k] - fneg[j][k])/(2*der_range); + } + } + + if (!bIsMaster) + { +#if GMX_MPI +#define mpi_type GMX_MPI_REAL + MPI_Send(dfdx[0], atom_index.size()*DIM, mpi_type, MASTER(cr), + cr->nodeid, cr->mpi_comm_mygroup); +#endif + } + else + { + for (node = 0; (node < nnodes && aid+node < atom_index.size()); node++) + { + if (node > 0) + { +#if GMX_MPI + MPI_Status stat; + MPI_Recv(dfdx[0], atom_index.size()*DIM, mpi_type, node, node, + cr->mpi_comm_mygroup, &stat); +#undef mpi_type +#endif + } + + row = (aid + node)*DIM + d; + + for (size_t j = 0; j < atom_index.size(); j++) + { + for (size_t k = 0; k < DIM; k++) + { + col = j*DIM + k; + + if (bSparse) + { + if (col >= row && dfdx[j][k] != 0.0) + { + gmx_sparsematrix_increment_value(sparse_matrix, + row, col, dfdx[j][k]); + } + } + else + { + full_matrix[row*sz+col] = dfdx[j][k]; + } + } + } + } + } + + if (mdrunOptions.verbose && fplog) + { + fflush(fplog); + } + } + /* write progress */ + if (bIsMaster && mdrunOptions.verbose) + { + fprintf(stderr, "\rFinished step %d out of %d", + static_cast<int>(std::min(atom+nnodes, atom_index.size())), + static_cast<int>(atom_index.size())); + fflush(stderr); + } + } + + if (bIsMaster) + { + fprintf(stderr, "\n\nWriting Hessian...\n"); + gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size()*2); +} + +} // namespace gmx diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.cpp b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..f9d8c88f0a8b653a3cf9d8de97ff46ff00703d54 --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.cpp @@ -0,0 +1,1487 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * + * \brief Implements the replica exchange routines. + * + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \author Mark Abraham <mark.j.abraham@gmail.com> + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "replicaexchange.h" + +#include "config.h" + +#include <cmath> + +#include <random> + +#include "gromacs/domdec/collect.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/math/units.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdrun/multisim.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/enerdata.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/random/threefry.h" +#include "gromacs/random/uniformintdistribution.h" +#include "gromacs/random/uniformrealdistribution.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/pleasecite.h" +#include "gromacs/utility/smalloc.h" + + +/* PLUMED */ +#include "../../../Plumed.h" +extern int plumedswitch; +extern plumed plumedmain; +/* END PLUMED */ + +/* PLUMED HREX */ +int plumed_hrex; +/* END PLUMED HREX */ + +//! Helps cut off probability values. +constexpr int c_probabilityCutoff = 100; + +/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ + +//! Rank in the multisimulation +#define MSRANK(ms, nodeid) (nodeid) + +//! 
Enum for replica exchange flavours +enum { + ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR +}; +/*! \brief Strings describing replica exchange flavours. + * + * end_single_marker merely notes the end of single variable replica + * exchange. All types higher than it are multiple replica exchange + * methods. + * + * Eventually, should add 'pressure', 'temperature and pressure', + * 'lambda_and_pressure', 'temperature_lambda_pressure'?; Let's wait + * until we feel better about the pressure control methods giving + * exact ensembles. Right now, we assume constant pressure */ +static const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"}; + +//! Working data for replica exchange. +struct gmx_repl_ex +{ + //! Replica ID + int repl; + //! Total number of replica + int nrepl; + //! Temperature + real temp; + //! Replica exchange type from ere enum + int type; + //! Quantity, e.g. temperature or lambda; first index is ere, second index is replica ID + real **q; + //! Use constant pressure and temperature + gmx_bool bNPT; + //! Replica pressures + real *pres; + //! Replica indices + int *ind; + //! Used for keeping track of all the replica swaps + int *allswaps; + //! Replica exchange interval (number of steps) + int nst; + //! Number of exchanges per interval + int nex; + //! Random seed + int seed; + //! Number of even and odd replica change attempts + int nattempt[2]; + //! Sum of probabilities + real *prob_sum; + //! Number of moves between replicas i and j + int **nmoves; + //! i-th element of the array is the number of exchanges between replica i-1 and i + int *nexchange; + + /*! \brief Helper arrays for replica exchange; allocated here + * so they don't have to be allocated each time */ + //! \{ + int *destinations; + int **cyclic; + int **order; + int *tmpswap; + gmx_bool *incycle; + gmx_bool *bEx; + //! \} + + //! Helper arrays to hold the quantities that are exchanged. + //! \{ + real *prob; + real *Epot; + real *beta; + real *Vol; + real **de; + //! \} +}; + +// TODO We should add Doxygen here some time. +//! \cond + +static gmx_bool repl_quantity(const gmx_multisim_t *ms, + struct gmx_repl_ex *re, int ere, real q) +{ + real *qall; + gmx_bool bDiff; + int s; + + snew(qall, ms->nsim); + qall[re->repl] = q; + gmx_sum_sim(ms->nsim, qall, ms); + + /* PLUMED */ + //bDiff = FALSE; + //for (s = 1; s < ms->nsim; s++) + //{ + // if (qall[s] != qall[0]) + // { + bDiff = TRUE; + // } + //} + /* END PLUMED */ + + if (bDiff) + { + /* Set the replica exchange type and quantities */ + re->type = ere; + + snew(re->q[ere], re->nrepl); + for (s = 0; s < ms->nsim; s++) + { + re->q[ere][s] = qall[s]; + } + } + sfree(qall); + return bDiff; +} + +gmx_repl_ex_t +init_replica_exchange(FILE *fplog, + const gmx_multisim_t *ms, + int numAtomsInSystem, + const t_inputrec *ir, + const ReplicaExchangeParameters &replExParams) +{ + real pres; + int i, j; + struct gmx_repl_ex *re; + gmx_bool bTemp; + gmx_bool bLambda = FALSE; + + fprintf(fplog, "\nInitializing Replica Exchange\n"); + + if (!isMultiSim(ms) || ms->nsim == 1) + { + gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multidir option of mdrun?"); + } + if (!EI_DYNAMICS(ir->eI)) + { + gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations"); + /* Note that PAR(cr) is defined by cr->nnodes > 1, which is + * distinct from isMultiSim(ms). A multi-simulation only runs + * with real MPI parallelism, but this does not imply PAR(cr) + * is true! 
+     *
+     * Since we are using a dynamical integrator, the only
+     * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are
+     * synonymous. The only way for cr->nnodes > 1 to be true is
+     * if we are using DD. */
+    }
+
+    snew(re, 1);
+
+    re->repl     = ms->sim;
+    re->nrepl    = ms->nsim;
+    snew(re->q, ereENDSINGLE);
+
+    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
+
+    /* We only check that the number of atoms in the systems match.
+     * This, of course, does not guarantee that the systems are the same,
+     * but it does guarantee that we can perform replica exchange.
+     */
+    check_multi_int(fplog, ms, numAtomsInSystem, "the number of atoms", FALSE);
+    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
+    check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE);
+    const int nst = replExParams.exchangeInterval;
+    check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst,
+                      "first exchange step: init_step/-replex", FALSE);
+    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
+    check_multi_int(fplog, ms, ir->opts.ngtc,
+                    "the number of temperature coupling groups", FALSE);
+    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
+    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
+    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
+
+    re->temp = ir->opts.ref_t[0];
+    for (i = 1; (i < ir->opts.ngtc); i++)
+    {
+        if (ir->opts.ref_t[i] != re->temp)
+        {
+            fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+            fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+        }
+    }
+
+    re->type = -1;
+    bTemp    = repl_quantity(ms, re, ereTEMP, re->temp);
+    if (ir->efep != efepNO)
+    {
+        bLambda = repl_quantity(ms, re, ereLAMBDA, static_cast<real>(ir->fepvals->init_fep_state));
+    }
+    if (re->type == -1)  /* nothing was assigned */
+    {
+        gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl);
+    }
+    if (bLambda && bTemp)
+    {
+        re->type = ereTL;
+    }
+
+    if (bTemp)
+    {
+        please_cite(fplog, "Sugita1999a");
+        if (ir->epc != epcNO)
+        {
+            re->bNPT = TRUE;
+            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
+            please_cite(fplog, "Okabe2001a");
+        }
+        if (ir->etc == etcBERENDSEN)
+        {
+            gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
+                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
+        }
+    }
+    if (bLambda)
+    {
+        if (ir->fepvals->delta_lambda != 0)   /* check this? */
+        {
+            gmx_fatal(FARGS, "delta_lambda is not zero");
+        }
+    }
+    if (re->bNPT)
+    {
+        snew(re->pres, re->nrepl);
+        if (ir->epct == epctSURFACETENSION)
+        {
+            pres = ir->ref_p[ZZ][ZZ];
+        }
+        else
+        {
+            pres = 0;
+            j    = 0;
+            for (i = 0; i < DIM; i++)
+            {
+                if (ir->compress[i][i] != 0)
+                {
+                    pres += ir->ref_p[i][i];
+                    j++;
+                }
+            }
+            pres /= j;
+        }
+        re->pres[re->repl] = pres;
+        gmx_sum_sim(re->nrepl, re->pres, ms);
+    }
+
+    /* Make an index for increasing replica order */
+    /* only makes sense if one or the other is varying, not both!
+       if both are varying, we trust the order the person gave. */
+    snew(re->ind, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->ind[i] = i;
+    }
+
+    /* PLUMED */
+    // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD)
+    // in those cases replicas can share the same temperature.
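+    // The ordering check that follows is intentionally disabled (commented
+    // out) in the PLUMED patch: upstream it aborts on replicas whose
+    // temperatures/lambdas are not strictly increasing and distinct, which
+    // bias-exchange setups legitimately violate.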
+ /* + if (re->type < ereENDSINGLE) + { + + for (i = 0; i < re->nrepl; i++) + { + for (j = i+1; j < re->nrepl; j++) + { + if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) + {*/ + /* Unordered replicas are supposed to work, but there + * is still an issues somewhere. + * Note that at this point still re->ind[i]=i. + */ + /* + gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", + i, j, + erename[re->type], + re->q[re->type][i], re->q[re->type][j], + erename[re->type]); + } + else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]]) + { + gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]); + } + } + } + } + */ + /* END PLUMED */ + + /* keep track of all the swaps, starting with the initial placement. */ + snew(re->allswaps, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + re->allswaps[i] = re->ind[i]; + } + + switch (re->type) + { + case ereTEMP: + fprintf(fplog, "\nReplica exchange in temperature\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]); + } + fprintf(fplog, "\n"); + break; + case ereLAMBDA: + fprintf(fplog, "\nReplica exchange in lambda\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %3d", static_cast<int>(re->q[re->type][re->ind[i]])); + } + fprintf(fplog, "\n"); + break; + case ereTL: + fprintf(fplog, "\nReplica exchange in temperature and lambda state\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]); + } + fprintf(fplog, "\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5d", static_cast<int>(re->q[ereLAMBDA][re->ind[i]])); + } + fprintf(fplog, "\n"); + break; + default: + gmx_incons("Unknown replica exchange quantity"); + } + if (re->bNPT) + { + fprintf(fplog, "\nRepl p"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.2f", re->pres[re->ind[i]]); + } + + for (i = 0; i < re->nrepl; i++) + { + if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]])) + { + fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); + fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); + } + } + } + re->nst = nst; + if (replExParams.randomSeed == -1) + { + if (isMasterSim(ms)) + { + re->seed = static_cast<int>(gmx::makeRandomSeed()); + } + else + { + re->seed = 0; + } + gmx_sumi_sim(1, &(re->seed), ms); + } + else + { + re->seed = replExParams.randomSeed; + } + fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst); + fprintf(fplog, "\nReplica random seed: %d\n", re->seed); + + re->nattempt[0] = 0; + re->nattempt[1] = 0; + + snew(re->prob_sum, re->nrepl); + snew(re->nexchange, re->nrepl); + snew(re->nmoves, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->nmoves[i], re->nrepl); + } + fprintf(fplog, "Replica exchange information below: ex and x = exchange, pr = probability\n"); + + /* generate space for the helper functions so we don't have to snew each time */ + + snew(re->destinations, re->nrepl); + snew(re->incycle, re->nrepl); + snew(re->tmpswap, re->nrepl); + snew(re->cyclic, re->nrepl); + snew(re->order, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->cyclic[i], re->nrepl+1); + snew(re->order[i], re->nrepl); + } + /* allocate space for the functions storing the data for the replicas */ + /* not all of these arrays needed in all cases, but they don't take + up much space, since the max size is nrepl**2 */ + snew(re->prob, re->nrepl); + 
snew(re->bEx, re->nrepl); + snew(re->beta, re->nrepl); + snew(re->Vol, re->nrepl); + snew(re->Epot, re->nrepl); + snew(re->de, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->de[i], re->nrepl); + } + re->nex = replExParams.numExchanges; + return re; +} + +static void exchange_reals(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, real *v, int n) +{ + real *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v, n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, + buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + v[i] = buf[i]; + } + sfree(buf); + } +} + + +static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n) +{ + double *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v, n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, + buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + v[i] = buf[i]; + } + sfree(buf); + } +} + +static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n) +{ + rvec *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v[0], n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, + buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + copy_rvec(buf[i], v[i]); + } + sfree(buf); + } +} + +/* PLUMED HREX */ +void exchange_state(const gmx_multisim_t *ms, int b, t_state *state) +/* END PLUMED HREX */ +{ + /* When t_state changes, this code should be updated. 
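+     * Every field exchanged below (box tensors, pressure-coupling history,
+     * Nose-Hoover and barostat integrals, positions and velocities) must
+     * mirror the t_state definition; a field missed here would silently
+     * desynchronise the swapped ensembles.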
*/ + int ngtc, nnhpres; + ngtc = state->ngtc * state->nhchainlength; + nnhpres = state->nnhpres* state->nhchainlength; + exchange_rvecs(ms, b, state->box, DIM); + exchange_rvecs(ms, b, state->box_rel, DIM); + exchange_rvecs(ms, b, state->boxv, DIM); + exchange_reals(ms, b, &(state->veta), 1); + exchange_reals(ms, b, &(state->vol0), 1); + exchange_rvecs(ms, b, state->svir_prev, DIM); + exchange_rvecs(ms, b, state->fvir_prev, DIM); + exchange_rvecs(ms, b, state->pres_prev, DIM); + exchange_doubles(ms, b, state->nosehoover_xi.data(), ngtc); + exchange_doubles(ms, b, state->nosehoover_vxi.data(), ngtc); + exchange_doubles(ms, b, state->nhpres_xi.data(), nnhpres); + exchange_doubles(ms, b, state->nhpres_vxi.data(), nnhpres); + exchange_doubles(ms, b, state->therm_integral.data(), state->ngtc); + exchange_doubles(ms, b, &state->baros_integral, 1); + exchange_rvecs(ms, b, state->x.rvec_array(), state->natoms); + exchange_rvecs(ms, b, state->v.rvec_array(), state->natoms); +} + +/* PLUMED HREX */ +void copy_state_serial(const t_state *src, t_state *dest) +/* END PLUMED HREX */ +{ + if (dest != src) + { + /* Currently the local state is always a pointer to the global + * in serial, so we should never end up here. + * TODO: Implement a (trivial) t_state copy once converted to C++. + */ + GMX_RELEASE_ASSERT(false, "State copying is currently not implemented in replica exchange"); + } +} + +static void scale_velocities(gmx::ArrayRef<gmx::RVec> velocities, real fac) +{ + for (auto &v : velocities) + { + v *= fac; + } +} + +static void print_transition_matrix(FILE *fplog, int n, int **nmoves, const int *nattempt) +{ + int i, j, ntot; + float Tprint; + + ntot = nattempt[0] + nattempt[1]; + fprintf(fplog, "\n"); + fprintf(fplog, "Repl"); + for (i = 0; i < n; i++) + { + fprintf(fplog, " "); /* put the title closer to the center */ + } + fprintf(fplog, "Empirical Transition Matrix\n"); + + fprintf(fplog, "Repl"); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%8d", (i+1)); + } + fprintf(fplog, "\n"); + + for (i = 0; i < n; i++) + { + fprintf(fplog, "Repl"); + for (j = 0; j < n; j++) + { + Tprint = 0.0; + if (nmoves[i][j] > 0) + { + Tprint = nmoves[i][j]/(2.0*ntot); + } + fprintf(fplog, "%8.4f", Tprint); + } + fprintf(fplog, "%3d\n", i); + } +} + +static void print_ind(FILE *fplog, const char *leg, int n, int *ind, const gmx_bool *bEx) +{ + int i; + + fprintf(fplog, "Repl %2s %2d", leg, ind[0]); + for (i = 1; i < n; i++) + { + fprintf(fplog, " %c %2d", (bEx != nullptr && bEx[i]) ? 'x' : ' ', ind[i]); + } + fprintf(fplog, "\n"); +} + +static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap) +{ + int i; + + for (i = 0; i < n; i++) + { + tmpswap[i] = allswaps[i]; + } + for (i = 0; i < n; i++) + { + allswaps[i] = tmpswap[pind[i]]; + } + + fprintf(fplog, "\nAccepted Exchanges: "); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%d ", pind[i]); + } + fprintf(fplog, "\n"); + + /* the "Order After Exchange" is the state label corresponding to the configuration that + started in state listed in order, i.e. 
+ + 3 0 1 2 + + means that the: + configuration starting in simulation 3 is now in simulation 0, + configuration starting in simulation 0 is now in simulation 1, + configuration starting in simulation 1 is now in simulation 2, + configuration starting in simulation 2 is now in simulation 3 + */ + fprintf(fplog, "Order After Exchange: "); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%d ", allswaps[i]); + } + fprintf(fplog, "\n\n"); +} + +static void print_prob(FILE *fplog, const char *leg, int n, real *prob) +{ + int i; + char buf[8]; + + fprintf(fplog, "Repl %2s ", leg); + for (i = 1; i < n; i++) + { + if (prob[i] >= 0) + { + sprintf(buf, "%4.2f", prob[i]); + fprintf(fplog, " %3s", buf[0] == '1' ? "1.0" : buf+1); + } + else + { + fprintf(fplog, " "); + } + } + fprintf(fplog, "\n"); +} + +static void print_count(FILE *fplog, const char *leg, int n, int *count) +{ + int i; + + fprintf(fplog, "Repl %2s ", leg); + for (i = 1; i < n; i++) + { + fprintf(fplog, " %4d", count[i]); + } + fprintf(fplog, "\n"); +} + +static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp) +{ + + real ediff, dpV, delta = 0; + real *Epot = re->Epot; + real *Vol = re->Vol; + real **de = re->de; + real *beta = re->beta; + + /* Two cases; we are permuted and not. In all cases, setting ap = a and bp = b will reduce + to the non permuted case */ + + switch (re->type) + { + case ereTEMP: + /* + * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439 + */ + ediff = Epot[b] - Epot[a]; + delta = -(beta[bp] - beta[ap])*ediff; + break; + case ereLAMBDA: + /* two cases: when we are permuted, and not. */ + /* non-permuted: + ediff = E_new - E_old + = [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)] + = [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)] + = de[b][a] + de[a][b] */ + + /* permuted: + ediff = E_new - E_old + = [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)] + = [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)] + = [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)] + = [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)] + = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]) */ + /* but, in the current code implementation, we flip configurations, not indices . . . + So let's examine that. + = [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)] + = [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_pb)] + = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp] + So, if we exchange b<=> bp and a<=> ap, we return to the same result. + So the simple solution is to flip the + position of perturbed and original indices in the tests. + */ + + ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]); + delta = ediff*beta[a]; /* assume all same temperature in this case */ + break; + case ereTL: + /* not permuted: */ + /* delta = reduced E_new - reduced E_old + = [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)] + = [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)] + = [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] + + [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)] + = [beta_b dH_b(x_a) + [beta_a dH_a(x_b) + + beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b)) + = beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */ + /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */ + /* permuted (big breath!) 
*/ + /* delta = reduced E_new - reduced E_old + = [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)] + = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] + = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] + - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a) + - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b) + = [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] + + [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))] + + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b) + = [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] + + [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))] + + beta_pb (H_a(x_a) - H_b(x_b)) - beta_ap (H_a(x_a) - H_b(x_b)) + = ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b] - beta_bp de[bp][b]) + + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b)) */ + delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]); + break; + default: + gmx_incons("Unknown replica exchange quantity"); + } + if (bPrint) + { + fprintf(fplog, "Repl %d <-> %d dE_term = %10.3e (kT)\n", a, b, delta); + } +/* PLUMED HREX */ +/* this is necessary because with plumed HREX the energy contribution is + already taken into account */ + if(plumed_hrex) delta=0.0; +/* END PLUMED HREX */ + if (re->bNPT) + { + /* revist the calculation for 5.0. Might be some improvements. */ + dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC; + if (bPrint) + { + fprintf(fplog, " dpV = %10.3e d = %10.3e\n", dpV, delta + dpV); + } + delta += dpV; + } + return delta; +} + +static void +test_for_replica_exchange(FILE *fplog, + const gmx_multisim_t *ms, + struct gmx_repl_ex *re, + const gmx_enerdata_t *enerd, + real vol, + int64_t step, + real time) +{ + int m, i, j, a, b, ap, bp, i0, i1, tmp; + real delta = 0; + gmx_bool bPrint, bMultiEx; + gmx_bool *bEx = re->bEx; + real *prob = re->prob; + int *pind = re->destinations; /* permuted index */ + gmx_bool bEpot = FALSE; + gmx_bool bDLambda = FALSE; + gmx_bool bVol = FALSE; + gmx::ThreeFry2x64<64> rng(re->seed, gmx::RandomDomain::ReplicaExchange); + gmx::UniformRealDistribution<real> uniformRealDist; + gmx::UniformIntDistribution<int> uniformNreplDist(0, re->nrepl-1); + + bMultiEx = (re->nex > 1); /* multiple exchanges at each state */ + fprintf(fplog, "Replica exchange at step %" PRId64 " time %.5f\n", step, time); + + if (re->bNPT) + { + for (i = 0; i < re->nrepl; i++) + { + re->Vol[i] = 0; + } + bVol = TRUE; + re->Vol[re->repl] = vol; + } + if ((re->type == ereTEMP || re->type == ereTL)) + { + for (i = 0; i < re->nrepl; i++) + { + re->Epot[i] = 0; + } + bEpot = TRUE; + re->Epot[re->repl] = enerd->term[F_EPOT]; + /* temperatures of different states*/ + for (i = 0; i < re->nrepl; i++) + { + re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ); + } + } + else + { + for (i = 0; i < re->nrepl; i++) + { + re->beta[i] = 1.0/(re->temp*BOLTZ); /* we have a single temperature */ + } + } + if (re->type == ereLAMBDA || re->type == ereTL) + { + bDLambda = TRUE; + /* lambda differences. 
*/
+        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
+           minus the energy of the jth simulation in the jth Hamiltonian */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->de[i][j] = 0;
+            }
+        }
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->de[i][re->repl] = (enerd->enerpart_lambda[static_cast<int>(re->q[ereLAMBDA][i])+1]-enerd->enerpart_lambda[0]);
+        }
+    }
+
+    /* now actually do the communication */
+    if (bVol)
+    {
+        gmx_sum_sim(re->nrepl, re->Vol, ms);
+    }
+    if (bEpot)
+    {
+        gmx_sum_sim(re->nrepl, re->Epot, ms);
+    }
+    if (bDLambda)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            gmx_sum_sim(re->nrepl, re->de[i], ms);
+        }
+    }
+
+    /* make a duplicate set of indices for shuffling */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        pind[i] = re->ind[i];
+    }
+
+    rng.restart( step, 0 );
+
+    /* PLUMED */
+    int plumed_test_exchange_pattern=0;
+    if(plumed_test_exchange_pattern && plumed_hrex) gmx_fatal(FARGS,"hrex not compatible with ad hoc exchange patterns");
+    /* END PLUMED */
+
+    if (bMultiEx)
+    {
+        /* multiple random switch exchange */
+        int nself = 0;
+
+
+        for (i = 0; i < re->nex + nself; i++)
+        {
+            // For now this is superfluous, but just in case we ever add more
+            // calls in different branches it is safer to always reset the distribution.
+            uniformNreplDist.reset();
+
+            /* randomly select a pair  */
+            /* in theory, could reduce this by identifying only which switches had a non-negligible
+               probability of occurring (log p > -100) and only operate on those switches */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work than useful. */
+            i0 = uniformNreplDist(rng);
+            i1 = uniformNreplDist(rng);
+            if (i0 == i1)
+            {
+                nself++;
+                continue;  /* self-exchange, back up and do it again */
+            }
+
+            a  = re->ind[i0]; /* what are the indices of these states? */
+            b  = re->ind[i1];
+            ap = pind[i0];
+            bp = pind[i1];
+
+            bPrint = FALSE; /* too noisy */
+            /* calculate the energy difference */
+            /* if the code changes to flip the STATES, rather than the configurations,
+               use the commented version of the code */
+            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+            delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
+
+            /* we actually only use the first space in the prob and bEx array,
+               since there are actually many switches between pairs. */
+
+            if (delta <= 0)
+            {
+                /* accepted */
+                prob[0] = 1;
+                bEx[0]  = TRUE;
+            }
+            else
+            {
+                if (delta > c_probabilityCutoff)
+                {
+                    prob[0] = 0;
+                }
+                else
+                {
+                    prob[0] = exp(-delta);
+                }
+                // roll a number to determine if accepted. For now it is superfluous to
+                // reset, but just in case we ever add more calls in different branches
+                // it is safer to always reset the distribution.
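+                // Together with the prob[0] assignment above, this is the
+                // standard Metropolis rule: accept with probability
+                // min(1, exp(-delta)). For pure temperature exchange,
+                // delta = (beta_b - beta_a)*(E_a - E_b), cf. calc_delta().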
+ uniformRealDist.reset(); + bEx[0] = uniformRealDist(rng) < prob[0]; + } + re->prob_sum[0] += prob[0]; + + if (bEx[0]) + { + /* swap the states */ + tmp = pind[i0]; + pind[i0] = pind[i1]; + pind[i1] = tmp; + } + } + re->nattempt[0]++; /* keep track of total permutation trials here */ + print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap); + } + else + { + /* standard nearest neighbor replica exchange */ + + m = (step / re->nst) % 2; + /* PLUMED */ + if(plumedswitch){ + int partner=re->repl; + plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern); + if(plumed_test_exchange_pattern>0){ + int *list; + snew(list,re->nrepl); + plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl)); + plumed_cmd(plumedmain,"getExchangesList",list); + for(i=0; i<re->nrepl; i++) re->ind[i]=list[i]; + sfree(list); + } + + for(i=1; i<re->nrepl; i++) { + if (i % 2 != m) continue; + a = re->ind[i-1]; + b = re->ind[i]; + if(re->repl==a) partner=b; + if(re->repl==b) partner=a; + } + plumed_cmd(plumedmain,"GREX setPartner",&partner); + plumed_cmd(plumedmain,"GREX calculate",NULL); + plumed_cmd(plumedmain,"GREX shareAllDeltaBias",NULL); + } + /* END PLUMED */ + for (i = 1; i < re->nrepl; i++) + { + a = re->ind[i-1]; + b = re->ind[i]; + + bPrint = (re->repl == a || re->repl == b); + if (i % 2 == m) + { + delta = calc_delta(fplog, bPrint, re, a, b, a, b); + /* PLUMED */ + if(plumedswitch){ + real adb,bdb,dplumed; + char buf[300]; + sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb); + sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb); + dplumed=adb*re->beta[a]+bdb*re->beta[b]; + delta+=dplumed; + if (bPrint) + fprintf(fplog,"dplumed = %10.3e dE_Term = %10.3e (kT)\n",dplumed,delta); + } + /* END PLUMED */ + if (delta <= 0) + { + /* accepted */ + prob[i] = 1; + bEx[i] = TRUE; + } + else + { + if (delta > c_probabilityCutoff) + { + prob[i] = 0; + } + else + { + prob[i] = exp(-delta); + } + // roll a number to determine if accepted. For now it is superfluous to + // reset, but just in case we ever add more calls in different branches + // it is safer to always reset the distribution. 
+ uniformRealDist.reset(); + bEx[i] = uniformRealDist(rng) < prob[i]; + } + re->prob_sum[i] += prob[i]; + + if (bEx[i]) + { + /* PLUMED */ + if(!plumed_test_exchange_pattern) { + /* standard neighbour swapping */ + /* swap these two */ + tmp = pind[i-1]; + pind[i-1] = pind[i]; + pind[i] = tmp; + re->nexchange[i]++; /* statistics for back compatibility */ + } else { + /* alternative swapping patterns */ + tmp = pind[a]; + pind[a] = pind[b]; + pind[b] = tmp; + re->nexchange[i]++; /* statistics for back compatibility */ + } + /* END PLUMED */ + } + } + else + { + prob[i] = -1; + bEx[i] = FALSE; + } + } + /* print some statistics */ + print_ind(fplog, "ex", re->nrepl, re->ind, bEx); + print_prob(fplog, "pr", re->nrepl, prob); + fprintf(fplog, "\n"); + re->nattempt[m]++; + } + + /* PLUMED */ + if(plumed_test_exchange_pattern>0) { + for (i = 0; i < re->nrepl; i++) + { + re->ind[i] = i; + } + } + /* END PLUMED */ + + /* record which moves were made and accepted */ + for (i = 0; i < re->nrepl; i++) + { + re->nmoves[re->ind[i]][pind[i]] += 1; + re->nmoves[pind[i]][re->ind[i]] += 1; + } + fflush(fplog); /* make sure we can see what the last exchange was */ +} + +static void +cyclic_decomposition(const int *destinations, + int **cyclic, + gmx_bool *incycle, + const int nrepl, + int *nswap) +{ + + int i, j, c, p; + int maxlen = 1; + for (i = 0; i < nrepl; i++) + { + incycle[i] = FALSE; + } + for (i = 0; i < nrepl; i++) /* one cycle for each replica */ + { + if (incycle[i]) + { + cyclic[i][0] = -1; + continue; + } + cyclic[i][0] = i; + incycle[i] = TRUE; + c = 1; + p = i; + for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */ + { + p = destinations[p]; /* start permuting */ + if (p == i) + { + cyclic[i][c] = -1; + if (c > maxlen) + { + maxlen = c; + } + break; /* we've reached the original element, the cycle is complete, and we marked the end. */ + } + else + { + cyclic[i][c] = p; /* each permutation gives a new member of the cycle */ + incycle[p] = TRUE; + c++; + } + } + } + *nswap = maxlen - 1; + + if (debug) + { + for (i = 0; i < nrepl; i++) + { + fprintf(debug, "Cycle %d:", i); + for (j = 0; j < nrepl; j++) + { + if (cyclic[i][j] < 0) + { + break; + } + fprintf(debug, "%2d", cyclic[i][j]); + } + fprintf(debug, "\n"); + } + fflush(debug); + } +} + +static void +compute_exchange_order(int **cyclic, + int **order, + const int nrepl, + const int maxswap) +{ + int i, j; + + for (j = 0; j < maxswap; j++) + { + for (i = 0; i < nrepl; i++) + { + if (cyclic[i][j+1] >= 0) + { + order[cyclic[i][j+1]][j] = cyclic[i][j]; + order[cyclic[i][j]][j] = cyclic[i][j+1]; + } + } + for (i = 0; i < nrepl; i++) + { + if (order[i][j] < 0) + { + order[i][j] = i; /* if it's not exchanging, it should stay this round*/ + } + } + } + + if (debug) + { + fprintf(debug, "Replica Exchange Order\n"); + for (i = 0; i < nrepl; i++) + { + fprintf(debug, "Replica %d:", i); + for (j = 0; j < maxswap; j++) + { + if (order[i][j] < 0) + { + break; + } + fprintf(debug, "%2d", order[i][j]); + } + fprintf(debug, "\n"); + } + fflush(debug); + } +} + +static void +prepare_to_do_exchange(struct gmx_repl_ex *re, + const int replica_id, + int *maxswap, + gmx_bool *bThisReplicaExchanged) +{ + int i, j; + /* Hold the cyclic decomposition of the (multiple) replica + * exchange. 
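+     * For example, destinations {1, 0, 3, 2} decomposes into the 2-cycles
+     * (0 1)(2 3); maxswap is then 1, so a single round of pairwise state
+     * exchanges realises the whole permutation.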
*/ + gmx_bool bAnyReplicaExchanged = FALSE; + *bThisReplicaExchanged = FALSE; + + for (i = 0; i < re->nrepl; i++) + { + if (re->destinations[i] != re->ind[i]) + { + /* only mark as exchanged if the index has been shuffled */ + bAnyReplicaExchanged = TRUE; + break; + } + } + if (bAnyReplicaExchanged) + { + /* reinitialize the placeholder arrays */ + for (i = 0; i < re->nrepl; i++) + { + for (j = 0; j < re->nrepl; j++) + { + re->cyclic[i][j] = -1; + re->order[i][j] = -1; + } + } + + /* Identify the cyclic decomposition of the permutation (very + * fast if neighbor replica exchange). */ + cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap); + + /* Now translate the decomposition into a replica exchange + * order at each step. */ + compute_exchange_order(re->cyclic, re->order, re->nrepl, *maxswap); + + /* Did this replica do any exchange at any point? */ + for (j = 0; j < *maxswap; j++) + { + if (replica_id != re->order[replica_id][j]) + { + *bThisReplicaExchanged = TRUE; + break; + } + } + } +} + +gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, + const gmx_multisim_t *ms, struct gmx_repl_ex *re, + t_state *state, const gmx_enerdata_t *enerd, + t_state *state_local, int64_t step, real time) +{ + int j; + int replica_id = 0; + int exchange_partner; + int maxswap = 0; + /* Number of rounds of exchanges needed to deal with any multiple + * exchanges. */ + /* Where each replica ends up after the exchange attempt(s). */ + /* The order in which multiple exchanges will occur. */ + gmx_bool bThisReplicaExchanged = FALSE; + + /* PLUMED */ + if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",NULL); + /* END PLUMED */ + + if (MASTER(cr)) + { + replica_id = re->repl; + test_for_replica_exchange(fplog, ms, re, enerd, det(state_local->box), step, time); + prepare_to_do_exchange(re, replica_id, &maxswap, &bThisReplicaExchanged); + } + /* Do intra-simulation broadcast so all processors belonging to + * each simulation know whether they need to participate in + * collecting the state. Otherwise, they might as well get on with + * the next thing to do. */ + if (DOMAINDECOMP(cr)) + { +#if GMX_MPI + MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr), + cr->mpi_comm_mygroup); +#endif + } + + if (bThisReplicaExchanged) + { + /* Exchange the states */ + /* Collect the global state on the master node */ + if (DOMAINDECOMP(cr)) + { + dd_collect_state(cr->dd, state_local, state); + } + else + { + copy_state_serial(state_local, state); + } + + if (MASTER(cr)) + { + /* There will be only one swap cycle with standard replica + * exchange, but there may be multiple swap cycles if we + * allow multiple swaps. */ + + for (j = 0; j < maxswap; j++) + { + exchange_partner = re->order[replica_id][j]; + + if (exchange_partner != replica_id) + { + /* Exchange the global states between the master nodes */ + if (debug) + { + fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner); + } + exchange_state(ms, exchange_partner, state); + } + } + /* For temperature-type replica exchange, we need to scale + * the velocities. 
*/ + if (re->type == ereTEMP || re->type == ereTL) + { + scale_velocities(state->v, + std::sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]])); + } + + } + + /* With domain decomposition the global state is distributed later */ + if (!DOMAINDECOMP(cr)) + { + /* Copy the global state to the local state data structure */ + copy_state_serial(state, state_local); + } + } + + return bThisReplicaExchanged; +} + +void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re) +{ + int i; + + fprintf(fplog, "\nReplica exchange statistics\n"); + + if (re->nex == 0) + { + fprintf(fplog, "Repl %d attempts, %d odd, %d even\n", + re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]); + + fprintf(fplog, "Repl average probabilities:\n"); + for (i = 1; i < re->nrepl; i++) + { + if (re->nattempt[i%2] == 0) + { + re->prob[i] = 0; + } + else + { + re->prob[i] = re->prob_sum[i]/re->nattempt[i%2]; + } + } + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_prob(fplog, "", re->nrepl, re->prob); + + fprintf(fplog, "Repl number of exchanges:\n"); + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_count(fplog, "", re->nrepl, re->nexchange); + + fprintf(fplog, "Repl average number of exchanges:\n"); + for (i = 1; i < re->nrepl; i++) + { + if (re->nattempt[i%2] == 0) + { + re->prob[i] = 0; + } + else + { + re->prob[i] = (static_cast<real>(re->nexchange[i]))/re->nattempt[i%2]; + } + } + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_prob(fplog, "", re->nrepl, re->prob); + + fprintf(fplog, "\n"); + } + /* print the transition matrix */ + print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt); +} + +/* PLUMED HREX */ +int replica_exchange_get_repl(const gmx_repl_ex_t re){ + return re->repl; +}; + +int replica_exchange_get_nrepl(const gmx_repl_ex_t re){ + return re->nrepl; +}; +/* END PLUMED HREX */ +//! \endcond diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..a633d688c67c1755effb063557635296875bc876 --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed @@ -0,0 +1,1383 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * + * \brief Implements the replica exchange routines. + * + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \author Mark Abraham <mark.j.abraham@gmail.com> + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "replicaexchange.h" + +#include "config.h" + +#include <cmath> + +#include <random> + +#include "gromacs/domdec/collect.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/math/units.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdrun/multisim.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/enerdata.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/random/threefry.h" +#include "gromacs/random/uniformintdistribution.h" +#include "gromacs/random/uniformrealdistribution.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/pleasecite.h" +#include "gromacs/utility/smalloc.h" + +//! Helps cut off probability values. +constexpr int c_probabilityCutoff = 100; + +/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ + +//! Rank in the multisimulation +#define MSRANK(ms, nodeid) (nodeid) + +//! Enum for replica exchange flavours +enum { + ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR +}; +/*! \brief Strings describing replica exchange flavours. + * + * end_single_marker merely notes the end of single variable replica + * exchange. All types higher than it are multiple replica exchange + * methods. + * + * Eventually, should add 'pressure', 'temperature and pressure', + * 'lambda_and_pressure', 'temperature_lambda_pressure'?; Let's wait + * until we feel better about the pressure control methods giving + * exact ensembles. Right now, we assume constant pressure */ +static const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"}; + +//! Working data for replica exchange. +struct gmx_repl_ex +{ + //! Replica ID + int repl; + //! Total number of replica + int nrepl; + //! Temperature + real temp; + //! Replica exchange type from ere enum + int type; + //! Quantity, e.g. temperature or lambda; first index is ere, second index is replica ID + real **q; + //! Use constant pressure and temperature + gmx_bool bNPT; + //! Replica pressures + real *pres; + //! Replica indices + int *ind; + //! Used for keeping track of all the replica swaps + int *allswaps; + //! Replica exchange interval (number of steps) + int nst; + //! Number of exchanges per interval + int nex; + //! Random seed + int seed; + //! Number of even and odd replica change attempts + int nattempt[2]; + //! 
Sum of probabilities
+    real  *prob_sum;
+    //! Number of moves between replicas i and j
+    int  **nmoves;
+    //! i-th element of the array is the number of exchanges between replica i-1 and i
+    int   *nexchange;
+
+    /*! \brief Helper arrays for replica exchange; allocated here
+     * so they don't have to be allocated each time */
+    //! \{
+    int      *destinations;
+    int     **cyclic;
+    int     **order;
+    int      *tmpswap;
+    gmx_bool *incycle;
+    gmx_bool *bEx;
+    //! \}
+
+    //! Helper arrays to hold the quantities that are exchanged.
+    //! \{
+    real  *prob;
+    real  *Epot;
+    real  *beta;
+    real  *Vol;
+    real **de;
+    //! \}
+};
+
+// TODO We should add Doxygen here some time.
+//! \cond
+
+static gmx_bool repl_quantity(const gmx_multisim_t *ms,
+                              struct gmx_repl_ex *re, int ere, real q)
+{
+    real    *qall;
+    gmx_bool bDiff;
+    int      s;
+
+    snew(qall, ms->nsim);
+    qall[re->repl] = q;
+    gmx_sum_sim(ms->nsim, qall, ms);
+
+    bDiff = FALSE;
+    for (s = 1; s < ms->nsim; s++)
+    {
+        if (qall[s] != qall[0])
+        {
+            bDiff = TRUE;
+        }
+    }
+
+    if (bDiff)
+    {
+        /* Set the replica exchange type and quantities */
+        re->type = ere;
+
+        snew(re->q[ere], re->nrepl);
+        for (s = 0; s < ms->nsim; s++)
+        {
+            re->q[ere][s] = qall[s];
+        }
+    }
+    sfree(qall);
+    return bDiff;
+}
+
+gmx_repl_ex_t
+init_replica_exchange(FILE                            *fplog,
+                      const gmx_multisim_t            *ms,
+                      int                              numAtomsInSystem,
+                      const t_inputrec                *ir,
+                      const ReplicaExchangeParameters &replExParams)
+{
+    real                pres;
+    int                 i, j;
+    struct gmx_repl_ex *re;
+    gmx_bool            bTemp;
+    gmx_bool            bLambda = FALSE;
+
+    fprintf(fplog, "\nInitializing Replica Exchange\n");
+
+    if (!isMultiSim(ms) || ms->nsim == 1)
+    {
+        gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multidir option of mdrun?");
+    }
+    if (!EI_DYNAMICS(ir->eI))
+    {
+        gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations");
+        /* Note that PAR(cr) is defined by cr->nnodes > 1, which is
+         * distinct from isMultiSim(ms). A multi-simulation only runs
+         * with real MPI parallelism, but this does not imply PAR(cr)
+         * is true!
+         *
+         * Since we are using a dynamical integrator, the only
+         * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are
+         * synonymous. The only way for cr->nnodes > 1 to be true is
+         * if we are using DD. */
+    }
+
+    snew(re, 1);
+
+    re->repl  = ms->sim;
+    re->nrepl = ms->nsim;
+    snew(re->q, ereENDSINGLE);
+
+    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
+
+    /* We only check that the number of atoms in the systems match.
+     * This, of course, does not guarantee that the systems are the same,
+     * but it does guarantee that we can perform replica exchange.
+     */
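// The gather-and-compare pattern of repl_quantity() above generalizes to any
// per-replica scalar: each replica deposits its value in a zero-initialized
// array, gmx_sum_sim() all-reduces it across simulations, and the result is
// scanned for differences. A sketch with the MPI reduction replaced by a
// plain vector (quantityDiffersAcrossReplicas is a hypothetical name):
#include <vector>

static bool quantityDiffersAcrossReplicas(const std::vector<double> &qall)
{
    for (size_t s = 1; s < qall.size(); s++)
    {
        if (qall[s] != qall[0])
        {
            return true; // at least two replicas differ: exchange in this quantity
        }
    }
    return false; // identical everywhere, nothing to exchange
}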
+    check_multi_int(fplog, ms, numAtomsInSystem, "the number of atoms", FALSE);
+    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
+    check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE);
+    const int nst = replExParams.exchangeInterval;
+    check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst,
+                      "first exchange step: init_step/-replex", FALSE);
+    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
+    check_multi_int(fplog, ms, ir->opts.ngtc,
+                    "the number of temperature coupling groups", FALSE);
+    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
+    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
+    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
+
+    re->temp = ir->opts.ref_t[0];
+    for (i = 1; (i < ir->opts.ngtc); i++)
+    {
+        if (ir->opts.ref_t[i] != re->temp)
+        {
+            fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+            fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+        }
+    }
+
+    re->type = -1;
+    bTemp    = repl_quantity(ms, re, ereTEMP, re->temp);
+    if (ir->efep != efepNO)
+    {
+        bLambda = repl_quantity(ms, re, ereLAMBDA, static_cast<real>(ir->fepvals->init_fep_state));
+    }
+    if (re->type == -1)  /* nothing was assigned */
+    {
+        gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl);
+    }
+    if (bLambda && bTemp)
+    {
+        re->type = ereTL;
+    }
+
+    if (bTemp)
+    {
+        please_cite(fplog, "Sugita1999a");
+        if (ir->epc != epcNO)
+        {
+            re->bNPT = TRUE;
+            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
+            please_cite(fplog, "Okabe2001a");
+        }
+        if (ir->etc == etcBERENDSEN)
+        {
+            gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
+                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
+        }
+    }
+    if (bLambda)
+    {
+        if (ir->fepvals->delta_lambda != 0)   /* check this? */
+        {
+            gmx_fatal(FARGS, "delta_lambda is not zero");
+        }
+    }
+    if (re->bNPT)
+    {
+        snew(re->pres, re->nrepl);
+        if (ir->epct == epctSURFACETENSION)
+        {
+            pres = ir->ref_p[ZZ][ZZ];
+        }
+        else
+        {
+            pres = 0;
+            j    = 0;
+            for (i = 0; i < DIM; i++)
+            {
+                if (ir->compress[i][i] != 0)
+                {
+                    pres += ir->ref_p[i][i];
+                    j++;
+                }
+            }
+            pres /= j;
+        }
+        re->pres[re->repl] = pres;
+        gmx_sum_sim(re->nrepl, re->pres, ms);
+    }
+
+    /* Make an index for increasing replica order */
+    /* only makes sense if one or the other is varying, not both!
+       if both are varying, we trust the order the person gave. */
+    snew(re->ind, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->ind[i] = i;
+    }
+
+    if (re->type < ereENDSINGLE)
+    {
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = i+1; j < re->nrepl; j++)
+            {
+                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
+                {
+                    /* Unordered replicas are supposed to work, but there
+                     * is still an issue somewhere.
+                     * Note that at this point still re->ind[i]=i.
+                     */
+                    gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s",
+                              i, j,
+                              erename[re->type],
+                              re->q[re->type][i], re->q[re->type][j],
+                              erename[re->type]);
+                }
+                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
+                {
+                    gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
+                }
+            }
+        }
+    }
+
+    /* keep track of all the swaps, starting with the initial placement.
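// The ordering requirement enforced above amounts to a strict monotonicity
// check on the control quantity (temperature or lambda) versus replica
// index. A sketch under that assumption (checkStrictlyIncreasing is a
// hypothetical name):
#include <stdexcept>
#include <vector>

static void checkStrictlyIncreasing(const std::vector<double> &q)
{
    for (size_t i = 1; i < q.size(); i++)
    {
        if (q[i] == q[i - 1])
        {
            throw std::runtime_error("two replicas have identical quantities");
        }
        if (q[i] < q[i - 1])
        {
            throw std::runtime_error("replicas must be ordered by increasing quantity");
        }
    }
}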
*/ + snew(re->allswaps, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + re->allswaps[i] = re->ind[i]; + } + + switch (re->type) + { + case ereTEMP: + fprintf(fplog, "\nReplica exchange in temperature\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]); + } + fprintf(fplog, "\n"); + break; + case ereLAMBDA: + fprintf(fplog, "\nReplica exchange in lambda\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %3d", static_cast<int>(re->q[re->type][re->ind[i]])); + } + fprintf(fplog, "\n"); + break; + case ereTL: + fprintf(fplog, "\nReplica exchange in temperature and lambda state\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]); + } + fprintf(fplog, "\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5d", static_cast<int>(re->q[ereLAMBDA][re->ind[i]])); + } + fprintf(fplog, "\n"); + break; + default: + gmx_incons("Unknown replica exchange quantity"); + } + if (re->bNPT) + { + fprintf(fplog, "\nRepl p"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.2f", re->pres[re->ind[i]]); + } + + for (i = 0; i < re->nrepl; i++) + { + if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]])) + { + fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); + fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); + } + } + } + re->nst = nst; + if (replExParams.randomSeed == -1) + { + if (isMasterSim(ms)) + { + re->seed = static_cast<int>(gmx::makeRandomSeed()); + } + else + { + re->seed = 0; + } + gmx_sumi_sim(1, &(re->seed), ms); + } + else + { + re->seed = replExParams.randomSeed; + } + fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst); + fprintf(fplog, "\nReplica random seed: %d\n", re->seed); + + re->nattempt[0] = 0; + re->nattempt[1] = 0; + + snew(re->prob_sum, re->nrepl); + snew(re->nexchange, re->nrepl); + snew(re->nmoves, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->nmoves[i], re->nrepl); + } + fprintf(fplog, "Replica exchange information below: ex and x = exchange, pr = probability\n"); + + /* generate space for the helper functions so we don't have to snew each time */ + + snew(re->destinations, re->nrepl); + snew(re->incycle, re->nrepl); + snew(re->tmpswap, re->nrepl); + snew(re->cyclic, re->nrepl); + snew(re->order, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->cyclic[i], re->nrepl+1); + snew(re->order[i], re->nrepl); + } + /* allocate space for the functions storing the data for the replicas */ + /* not all of these arrays needed in all cases, but they don't take + up much space, since the max size is nrepl**2 */ + snew(re->prob, re->nrepl); + snew(re->bEx, re->nrepl); + snew(re->beta, re->nrepl); + snew(re->Vol, re->nrepl); + snew(re->Epot, re->nrepl); + snew(re->de, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->de[i], re->nrepl); + } + re->nex = replExParams.numExchanges; + return re; +} + +static void exchange_reals(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, real *v, int n) +{ + real *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v, n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, + buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, 
MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + v[i] = buf[i]; + } + sfree(buf); + } +} + + +static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n) +{ + double *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v, n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, + buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + v[i] = buf[i]; + } + sfree(buf); + } +} + +static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n) +{ + rvec *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v[0], n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, + buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + copy_rvec(buf[i], v[i]); + } + sfree(buf); + } +} + +static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state) +{ + /* When t_state changes, this code should be updated. */ + int ngtc, nnhpres; + ngtc = state->ngtc * state->nhchainlength; + nnhpres = state->nnhpres* state->nhchainlength; + exchange_rvecs(ms, b, state->box, DIM); + exchange_rvecs(ms, b, state->box_rel, DIM); + exchange_rvecs(ms, b, state->boxv, DIM); + exchange_reals(ms, b, &(state->veta), 1); + exchange_reals(ms, b, &(state->vol0), 1); + exchange_rvecs(ms, b, state->svir_prev, DIM); + exchange_rvecs(ms, b, state->fvir_prev, DIM); + exchange_rvecs(ms, b, state->pres_prev, DIM); + exchange_doubles(ms, b, state->nosehoover_xi.data(), ngtc); + exchange_doubles(ms, b, state->nosehoover_vxi.data(), ngtc); + exchange_doubles(ms, b, state->nhpres_xi.data(), nnhpres); + exchange_doubles(ms, b, state->nhpres_vxi.data(), nnhpres); + exchange_doubles(ms, b, state->therm_integral.data(), state->ngtc); + exchange_doubles(ms, b, &state->baros_integral, 1); + exchange_rvecs(ms, b, state->x.rvec_array(), state->natoms); + exchange_rvecs(ms, b, state->v.rvec_array(), state->natoms); +} + +static void copy_state_serial(const t_state *src, t_state *dest) +{ + if (dest != src) + { + /* Currently the local state is always a pointer to the global + * in serial, so we should never end up here. + * TODO: Implement a (trivial) t_state copy once converted to C++. 
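// exchange_reals(), exchange_doubles() and exchange_rvecs() above all use the
// same swap idiom: a non-blocking send of the local buffer, a blocking
// receive of the partner's copy, a wait, and an in-place overwrite (the
// commented-out MPI_Sendrecv would do the same in one call). A sketch
// assuming a plain MPI environment (swapWithPartner is a hypothetical name):
#include <algorithm>
#include <vector>
#include <mpi.h>

static void swapWithPartner(double *v, int n, int partnerRank, MPI_Comm comm)
{
    std::vector<double> buf(n);
    MPI_Request         req;
    MPI_Isend(v, n, MPI_DOUBLE, partnerRank, 0, comm, &req);
    MPI_Recv(buf.data(), n, MPI_DOUBLE, partnerRank, 0, comm, MPI_STATUS_IGNORE);
    MPI_Wait(&req, MPI_STATUS_IGNORE);
    std::copy(buf.begin(), buf.end(), v); // both ranks now hold the other's data
}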
+ */ + GMX_RELEASE_ASSERT(false, "State copying is currently not implemented in replica exchange"); + } +} + +static void scale_velocities(gmx::ArrayRef<gmx::RVec> velocities, real fac) +{ + for (auto &v : velocities) + { + v *= fac; + } +} + +static void print_transition_matrix(FILE *fplog, int n, int **nmoves, const int *nattempt) +{ + int i, j, ntot; + float Tprint; + + ntot = nattempt[0] + nattempt[1]; + fprintf(fplog, "\n"); + fprintf(fplog, "Repl"); + for (i = 0; i < n; i++) + { + fprintf(fplog, " "); /* put the title closer to the center */ + } + fprintf(fplog, "Empirical Transition Matrix\n"); + + fprintf(fplog, "Repl"); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%8d", (i+1)); + } + fprintf(fplog, "\n"); + + for (i = 0; i < n; i++) + { + fprintf(fplog, "Repl"); + for (j = 0; j < n; j++) + { + Tprint = 0.0; + if (nmoves[i][j] > 0) + { + Tprint = nmoves[i][j]/(2.0*ntot); + } + fprintf(fplog, "%8.4f", Tprint); + } + fprintf(fplog, "%3d\n", i); + } +} + +static void print_ind(FILE *fplog, const char *leg, int n, int *ind, const gmx_bool *bEx) +{ + int i; + + fprintf(fplog, "Repl %2s %2d", leg, ind[0]); + for (i = 1; i < n; i++) + { + fprintf(fplog, " %c %2d", (bEx != nullptr && bEx[i]) ? 'x' : ' ', ind[i]); + } + fprintf(fplog, "\n"); +} + +static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap) +{ + int i; + + for (i = 0; i < n; i++) + { + tmpswap[i] = allswaps[i]; + } + for (i = 0; i < n; i++) + { + allswaps[i] = tmpswap[pind[i]]; + } + + fprintf(fplog, "\nAccepted Exchanges: "); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%d ", pind[i]); + } + fprintf(fplog, "\n"); + + /* the "Order After Exchange" is the state label corresponding to the configuration that + started in state listed in order, i.e. + + 3 0 1 2 + + means that the: + configuration starting in simulation 3 is now in simulation 0, + configuration starting in simulation 0 is now in simulation 1, + configuration starting in simulation 1 is now in simulation 2, + configuration starting in simulation 2 is now in simulation 3 + */ + fprintf(fplog, "Order After Exchange: "); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%d ", allswaps[i]); + } + fprintf(fplog, "\n\n"); +} + +static void print_prob(FILE *fplog, const char *leg, int n, real *prob) +{ + int i; + char buf[8]; + + fprintf(fplog, "Repl %2s ", leg); + for (i = 1; i < n; i++) + { + if (prob[i] >= 0) + { + sprintf(buf, "%4.2f", prob[i]); + fprintf(fplog, " %3s", buf[0] == '1' ? "1.0" : buf+1); + } + else + { + fprintf(fplog, " "); + } + } + fprintf(fplog, "\n"); +} + +static void print_count(FILE *fplog, const char *leg, int n, int *count) +{ + int i; + + fprintf(fplog, "Repl %2s ", leg); + for (i = 1; i < n; i++) + { + fprintf(fplog, " %4d", count[i]); + } + fprintf(fplog, "\n"); +} + +static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp) +{ + + real ediff, dpV, delta = 0; + real *Epot = re->Epot; + real *Vol = re->Vol; + real **de = re->de; + real *beta = re->beta; + + /* Two cases; we are permuted and not. In all cases, setting ap = a and bp = b will reduce + to the non permuted case */ + + switch (re->type) + { + case ereTEMP: + /* + * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439 + */ + ediff = Epot[b] - Epot[a]; + delta = -(beta[bp] - beta[ap])*ediff; + break; + case ereLAMBDA: + /* two cases: when we are permuted, and not. 
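// Earlier in this hunk, print_allswitchind() updates the cumulative
// permutation by composing it with the permutation accepted at this step:
// allswaps[i] = tmpswap[pind[i]]. The same update in isolation
// (composeSwaps is a hypothetical name):
#include <vector>

static void composeSwaps(std::vector<int> &allswaps, const std::vector<int> &pind)
{
    std::vector<int> tmp = allswaps; // copy of the old mapping
    for (size_t i = 0; i < allswaps.size(); i++)
    {
        allswaps[i] = tmp[pind[i]]; // apply this step's permutation on top
    }
}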
*/
+            /* non-permuted:
+               ediff =  E_new - E_old
+                     =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
+                     =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
+                     =  de[b][a] + de[a][b] */
+
+            /* permuted:
+               ediff =  E_new - E_old
+                     =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
+                     =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - [H_bp(x_b) - H_b(x_b)]
+                     =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
+            /* but, in the current code implementation, we flip configurations, not indices.
+               So let's examine that.
+                     =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - [H_b(x_bp) - H_b(x_b)]
+                     =  [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_bp)]
+                     =  (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
+               So, if we exchange b <=> bp and a <=> ap, we return to the same result.
+               So the simple solution is to flip the position of perturbed
+               and original indices in the tests.
+             */
+
+            ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
+            delta = ediff*beta[a]; /* assume all same temperature in this case */
+            break;
+        case ereTL:
+            /* not permuted: */
+            /* delta = reduced E_new - reduced E_old
+                     =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
+                     =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
+                     =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
+                        [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) +
+                        beta_b (H_a(x_a) - H_b(x_b)) - beta_a (H_a(x_a) - H_b(x_b))
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a)) */
+            /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]); */
+            /* permuted (big breath!) */
+            /*   delta = reduced E_new - reduced E_old
+                       =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
+                       =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                       =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                          - beta_bp H_a(x_a) + beta_ap H_a(x_a) + beta_bp H_a(x_a) - beta_ap H_a(x_a)
+                          - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                       =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
+                          [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
+                          + beta_bp H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                       =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
+                          [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
+                          + beta_bp (H_a(x_a) - H_b(x_b)) - beta_ap (H_a(x_a) - H_b(x_b))
+                       =  (beta_bp de[bp][a] - beta_ap de[ap][a]) + (beta_ap de[ap][b] - beta_bp de[bp][b])
+                          + (beta_bp - beta_ap)(H_a(x_a) - H_b(x_b)) */
+            delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]);
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (bPrint)
+    {
+        fprintf(fplog, "Repl %d <-> %d  dE_term = %10.3e (kT)\n", a, b, delta);
+    }
+    if (re->bNPT)
+    {
+        /* revisit the calculation for 5.0. Might be some improvements. */
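// For the common case of temperature exchange at constant pressure, the
// reduced quantity assembled by calc_delta() plus the dpV term computed just
// below reduces to (Okabe et al., Chem. Phys. Lett. 335 (2001) 435):
//     delta = (beta_a - beta_b)(E_b - E_a) + (beta_a P_a - beta_b P_b)(V_b - V_a)
// A sketch in consistent kT-reduced units, omitting GROMACS's PRESFAC unit
// conversion (remdDeltaNpt is a hypothetical name):

static double remdDeltaNpt(double betaA, double betaB,
                           double epotA, double epotB,
                           double presA, double presB,
                           double volA, double volB)
{
    double delta = (betaA - betaB) * (epotB - epotA);         // energy term
    delta += (betaA * presA - betaB * presB) * (volB - volA); // pV term
    return delta; // accept if <= 0, otherwise with probability exp(-delta)
}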
+        dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
+        if (bPrint)
+        {
+            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
+        }
+        delta += dpV;
+    }
+    return delta;
+}
+
+static void
+test_for_replica_exchange(FILE                 *fplog,
+                          const gmx_multisim_t *ms,
+                          struct gmx_repl_ex   *re,
+                          const gmx_enerdata_t *enerd,
+                          real                  vol,
+                          int64_t               step,
+                          real                  time)
+{
+    int                                m, i, j, a, b, ap, bp, i0, i1, tmp;
+    real                               delta = 0;
+    gmx_bool                           bPrint, bMultiEx;
+    gmx_bool                          *bEx      = re->bEx;
+    real                              *prob     = re->prob;
+    int                               *pind     = re->destinations; /* permuted index */
+    gmx_bool                           bEpot    = FALSE;
+    gmx_bool                           bDLambda = FALSE;
+    gmx_bool                           bVol     = FALSE;
+    gmx::ThreeFry2x64<64>              rng(re->seed, gmx::RandomDomain::ReplicaExchange);
+    gmx::UniformRealDistribution<real> uniformRealDist;
+    gmx::UniformIntDistribution<int>   uniformNreplDist(0, re->nrepl-1);
+
+    bMultiEx = (re->nex > 1);  /* multiple exchanges at each state */
+    fprintf(fplog, "Replica exchange at step %" PRId64 " time %.5f\n", step, time);
+
+    if (re->bNPT)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Vol[i] = 0;
+        }
+        bVol              = TRUE;
+        re->Vol[re->repl] = vol;
+    }
+    if ((re->type == ereTEMP || re->type == ereTL))
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Epot[i] = 0;
+        }
+        bEpot              = TRUE;
+        re->Epot[re->repl] = enerd->term[F_EPOT];
+        /* temperatures of different states*/
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ);
+        }
+    }
+    else
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->temp*BOLTZ);  /* we have a single temperature */
+        }
+    }
+    if (re->type == ereLAMBDA || re->type == ereTL)
+    {
+        bDLambda = TRUE;
+        /* lambda differences. */
+        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
+           minus the energy of the jth simulation in the jth Hamiltonian */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->de[i][j] = 0;
+            }
+        }
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->de[i][re->repl] = (enerd->enerpart_lambda[static_cast<int>(re->q[ereLAMBDA][i])+1]-enerd->enerpart_lambda[0]);
+        }
+    }
+
+    /* now actually do the communication */
+    if (bVol)
+    {
+        gmx_sum_sim(re->nrepl, re->Vol, ms);
+    }
+    if (bEpot)
+    {
+        gmx_sum_sim(re->nrepl, re->Epot, ms);
+    }
+    if (bDLambda)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            gmx_sum_sim(re->nrepl, re->de[i], ms);
+        }
+    }
+
+    /* make a duplicate set of indices for shuffling */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        pind[i] = re->ind[i];
+    }
+
+    rng.restart( step, 0 );
+
+    if (bMultiEx)
+    {
+        /* multiple random switch exchange */
+        int nself = 0;
+
+
+        for (i = 0; i < re->nex + nself; i++)
+        {
+            // For now this is superfluous, but just in case we ever add more
+            // calls in different branches it is safer to always reset the distribution.
+            uniformNreplDist.reset();
+
+            /* randomly select a pair  */
+            /* in theory, could reduce this by identifying only which switches had a non-negligible
+               probability of occurring (log p > -100) and only operate on those switches */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work than useful. */
+            i0 = uniformNreplDist(rng);
+            i1 = uniformNreplDist(rng);
+            if (i0 == i1)
+            {
+                nself++;
+                continue;  /* self-exchange, back up and do it again */
+            }
+
+            a  = re->ind[i0]; /* what are the indices of these states?
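// The pair selection above draws two replica slots uniformly and rejects
// self-pairs by extending the loop, which keeps the number of productive
// attempts equal to re->nex. Redrawing only the second slot, as sketched
// here, yields the same uniform distribution over distinct ordered pairs
// (drawDistinctPair is a hypothetical name):
#include <random>
#include <utility>

static std::pair<int, int> drawDistinctPair(int nrepl, std::mt19937 &rng)
{
    std::uniform_int_distribution<int> dist(0, nrepl - 1);
    int i0 = dist(rng);
    int i1 = dist(rng);
    while (i1 == i0) // self-exchange: draw the second slot again
    {
        i1 = dist(rng);
    }
    return std::make_pair(i0, i1);
}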
*/ + b = re->ind[i1]; + ap = pind[i0]; + bp = pind[i1]; + + bPrint = FALSE; /* too noisy */ + /* calculate the energy difference */ + /* if the code changes to flip the STATES, rather than the configurations, + use the commented version of the code */ + /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */ + delta = calc_delta(fplog, bPrint, re, ap, bp, a, b); + + /* we actually only use the first space in the prob and bEx array, + since there are actually many switches between pairs. */ + + if (delta <= 0) + { + /* accepted */ + prob[0] = 1; + bEx[0] = TRUE; + } + else + { + if (delta > c_probabilityCutoff) + { + prob[0] = 0; + } + else + { + prob[0] = exp(-delta); + } + // roll a number to determine if accepted. For now it is superfluous to + // reset, but just in case we ever add more calls in different branches + // it is safer to always reset the distribution. + uniformRealDist.reset(); + bEx[0] = uniformRealDist(rng) < prob[0]; + } + re->prob_sum[0] += prob[0]; + + if (bEx[0]) + { + /* swap the states */ + tmp = pind[i0]; + pind[i0] = pind[i1]; + pind[i1] = tmp; + } + } + re->nattempt[0]++; /* keep track of total permutation trials here */ + print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap); + } + else + { + /* standard nearest neighbor replica exchange */ + + m = (step / re->nst) % 2; + for (i = 1; i < re->nrepl; i++) + { + a = re->ind[i-1]; + b = re->ind[i]; + + bPrint = (re->repl == a || re->repl == b); + if (i % 2 == m) + { + delta = calc_delta(fplog, bPrint, re, a, b, a, b); + if (delta <= 0) + { + /* accepted */ + prob[i] = 1; + bEx[i] = TRUE; + } + else + { + if (delta > c_probabilityCutoff) + { + prob[i] = 0; + } + else + { + prob[i] = exp(-delta); + } + // roll a number to determine if accepted. For now it is superfluous to + // reset, but just in case we ever add more calls in different branches + // it is safer to always reset the distribution. + uniformRealDist.reset(); + bEx[i] = uniformRealDist(rng) < prob[i]; + } + re->prob_sum[i] += prob[i]; + + if (bEx[i]) + { + /* swap these two */ + tmp = pind[i-1]; + pind[i-1] = pind[i]; + pind[i] = tmp; + re->nexchange[i]++; /* statistics for back compatibility */ + } + } + else + { + prob[i] = -1; + bEx[i] = FALSE; + } + } + /* print some statistics */ + print_ind(fplog, "ex", re->nrepl, re->ind, bEx); + print_prob(fplog, "pr", re->nrepl, prob); + fprintf(fplog, "\n"); + re->nattempt[m]++; + } + + /* record which moves were made and accepted */ + for (i = 0; i < re->nrepl; i++) + { + re->nmoves[re->ind[i]][pind[i]] += 1; + re->nmoves[pind[i]][re->ind[i]] += 1; + } + fflush(fplog); /* make sure we can see what the last exchange was */ +} + +static void +cyclic_decomposition(const int *destinations, + int **cyclic, + gmx_bool *incycle, + const int nrepl, + int *nswap) +{ + + int i, j, c, p; + int maxlen = 1; + for (i = 0; i < nrepl; i++) + { + incycle[i] = FALSE; + } + for (i = 0; i < nrepl; i++) /* one cycle for each replica */ + { + if (incycle[i]) + { + cyclic[i][0] = -1; + continue; + } + cyclic[i][0] = i; + incycle[i] = TRUE; + c = 1; + p = i; + for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */ + { + p = destinations[p]; /* start permuting */ + if (p == i) + { + cyclic[i][c] = -1; + if (c > maxlen) + { + maxlen = c; + } + break; /* we've reached the original element, the cycle is complete, and we marked the end. 
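// cyclic_decomposition() here implements the textbook decomposition of a
// permutation into disjoint cycles; the longest cycle length minus one
// becomes nswap, the number of pairwise-swap rounds needed afterwards.
// A compact sketch (decomposeCycles is a hypothetical name):
#include <vector>

static std::vector<std::vector<int>> decomposeCycles(const std::vector<int> &destinations)
{
    std::vector<bool>             seen(destinations.size(), false);
    std::vector<std::vector<int>> cycles;
    for (size_t i = 0; i < destinations.size(); i++)
    {
        if (seen[i])
        {
            continue; // already emitted as part of an earlier cycle
        }
        std::vector<int> cycle;
        int              p = static_cast<int>(i);
        do
        {
            cycle.push_back(p);
            seen[p] = true;
            p       = destinations[p]; // follow the permutation
        }
        while (p != static_cast<int>(i)); // back at the start: cycle closed
        cycles.push_back(cycle);
    }
    return cycles;
}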
*/ + } + else + { + cyclic[i][c] = p; /* each permutation gives a new member of the cycle */ + incycle[p] = TRUE; + c++; + } + } + } + *nswap = maxlen - 1; + + if (debug) + { + for (i = 0; i < nrepl; i++) + { + fprintf(debug, "Cycle %d:", i); + for (j = 0; j < nrepl; j++) + { + if (cyclic[i][j] < 0) + { + break; + } + fprintf(debug, "%2d", cyclic[i][j]); + } + fprintf(debug, "\n"); + } + fflush(debug); + } +} + +static void +compute_exchange_order(int **cyclic, + int **order, + const int nrepl, + const int maxswap) +{ + int i, j; + + for (j = 0; j < maxswap; j++) + { + for (i = 0; i < nrepl; i++) + { + if (cyclic[i][j+1] >= 0) + { + order[cyclic[i][j+1]][j] = cyclic[i][j]; + order[cyclic[i][j]][j] = cyclic[i][j+1]; + } + } + for (i = 0; i < nrepl; i++) + { + if (order[i][j] < 0) + { + order[i][j] = i; /* if it's not exchanging, it should stay this round*/ + } + } + } + + if (debug) + { + fprintf(debug, "Replica Exchange Order\n"); + for (i = 0; i < nrepl; i++) + { + fprintf(debug, "Replica %d:", i); + for (j = 0; j < maxswap; j++) + { + if (order[i][j] < 0) + { + break; + } + fprintf(debug, "%2d", order[i][j]); + } + fprintf(debug, "\n"); + } + fflush(debug); + } +} + +static void +prepare_to_do_exchange(struct gmx_repl_ex *re, + const int replica_id, + int *maxswap, + gmx_bool *bThisReplicaExchanged) +{ + int i, j; + /* Hold the cyclic decomposition of the (multiple) replica + * exchange. */ + gmx_bool bAnyReplicaExchanged = FALSE; + *bThisReplicaExchanged = FALSE; + + for (i = 0; i < re->nrepl; i++) + { + if (re->destinations[i] != re->ind[i]) + { + /* only mark as exchanged if the index has been shuffled */ + bAnyReplicaExchanged = TRUE; + break; + } + } + if (bAnyReplicaExchanged) + { + /* reinitialize the placeholder arrays */ + for (i = 0; i < re->nrepl; i++) + { + for (j = 0; j < re->nrepl; j++) + { + re->cyclic[i][j] = -1; + re->order[i][j] = -1; + } + } + + /* Identify the cyclic decomposition of the permutation (very + * fast if neighbor replica exchange). */ + cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap); + + /* Now translate the decomposition into a replica exchange + * order at each step. */ + compute_exchange_order(re->cyclic, re->order, re->nrepl, *maxswap); + + /* Did this replica do any exchange at any point? */ + for (j = 0; j < *maxswap; j++) + { + if (replica_id != re->order[replica_id][j]) + { + *bThisReplicaExchanged = TRUE; + break; + } + } + } +} + +gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, + const gmx_multisim_t *ms, struct gmx_repl_ex *re, + t_state *state, const gmx_enerdata_t *enerd, + t_state *state_local, int64_t step, real time) +{ + int j; + int replica_id = 0; + int exchange_partner; + int maxswap = 0; + /* Number of rounds of exchanges needed to deal with any multiple + * exchanges. */ + /* Where each replica ends up after the exchange attempt(s). */ + /* The order in which multiple exchanges will occur. */ + gmx_bool bThisReplicaExchanged = FALSE; + + if (MASTER(cr)) + { + replica_id = re->repl; + test_for_replica_exchange(fplog, ms, re, enerd, det(state_local->box), step, time); + prepare_to_do_exchange(re, replica_id, &maxswap, &bThisReplicaExchanged); + } + /* Do intra-simulation broadcast so all processors belonging to + * each simulation know whether they need to participate in + * collecting the state. Otherwise, they might as well get on with + * the next thing to do. 
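// compute_exchange_order() above turns those cycles into rounds of disjoint
// pairwise swaps: in round j, consecutive members cyclic[i][j] and
// cyclic[i][j+1] of each cycle are paired, and idle replicas keep their own
// index. A sketch operating on the cycle list from the previous example
// (buildSwapRounds is a hypothetical name):
#include <vector>

static std::vector<std::vector<int>>
buildSwapRounds(const std::vector<std::vector<int>> &cycles, int nrepl, int maxswap)
{
    // order[r][j] = partner of replica r in round j (r itself when idle)
    std::vector<std::vector<int>> order(nrepl, std::vector<int>(maxswap));
    for (int r = 0; r < nrepl; r++)
    {
        for (int j = 0; j < maxswap; j++)
        {
            order[r][j] = r;
        }
    }
    for (const std::vector<int> &cycle : cycles)
    {
        for (int j = 0; j + 1 < static_cast<int>(cycle.size()); j++)
        {
            order[cycle[j]][j]     = cycle[j + 1]; // pair consecutive cycle
            order[cycle[j + 1]][j] = cycle[j];     // members in round j
        }
    }
    return order;
}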
*/ + if (DOMAINDECOMP(cr)) + { +#if GMX_MPI + MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr), + cr->mpi_comm_mygroup); +#endif + } + + if (bThisReplicaExchanged) + { + /* Exchange the states */ + /* Collect the global state on the master node */ + if (DOMAINDECOMP(cr)) + { + dd_collect_state(cr->dd, state_local, state); + } + else + { + copy_state_serial(state_local, state); + } + + if (MASTER(cr)) + { + /* There will be only one swap cycle with standard replica + * exchange, but there may be multiple swap cycles if we + * allow multiple swaps. */ + + for (j = 0; j < maxswap; j++) + { + exchange_partner = re->order[replica_id][j]; + + if (exchange_partner != replica_id) + { + /* Exchange the global states between the master nodes */ + if (debug) + { + fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner); + } + exchange_state(ms, exchange_partner, state); + } + } + /* For temperature-type replica exchange, we need to scale + * the velocities. */ + if (re->type == ereTEMP || re->type == ereTL) + { + scale_velocities(state->v, + std::sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]])); + } + + } + + /* With domain decomposition the global state is distributed later */ + if (!DOMAINDECOMP(cr)) + { + /* Copy the global state to the local state data structure */ + copy_state_serial(state, state_local); + } + } + + return bThisReplicaExchanged; +} + +void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re) +{ + int i; + + fprintf(fplog, "\nReplica exchange statistics\n"); + + if (re->nex == 0) + { + fprintf(fplog, "Repl %d attempts, %d odd, %d even\n", + re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]); + + fprintf(fplog, "Repl average probabilities:\n"); + for (i = 1; i < re->nrepl; i++) + { + if (re->nattempt[i%2] == 0) + { + re->prob[i] = 0; + } + else + { + re->prob[i] = re->prob_sum[i]/re->nattempt[i%2]; + } + } + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_prob(fplog, "", re->nrepl, re->prob); + + fprintf(fplog, "Repl number of exchanges:\n"); + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_count(fplog, "", re->nrepl, re->nexchange); + + fprintf(fplog, "Repl average number of exchanges:\n"); + for (i = 1; i < re->nrepl; i++) + { + if (re->nattempt[i%2] == 0) + { + re->prob[i] = 0; + } + else + { + re->prob[i] = (static_cast<real>(re->nexchange[i]))/re->nattempt[i%2]; + } + } + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_prob(fplog, "", re->nrepl, re->prob); + + fprintf(fplog, "\n"); + } + /* print the transition matrix */ + print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt); +} + +//! \endcond diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.h b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.h new file mode 100644 index 0000000000000000000000000000000000000000..bff7c22ff9ac47719ec2a283253792f5dfec4e1d --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.h @@ -0,0 +1,116 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. 
+ * Copyright (c) 2011,2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * + * \brief Declares the routines for replica exchange. + * + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \author Mark Abraham <mark.j.abraham@gmail.com> + * + * \ingroup module_mdrun + */ +#ifndef GMX_MDRUN_REPLICAEXCHANGE_H +#define GMX_MDRUN_REPLICAEXCHANGE_H + +#include <cstdio> + +#include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/real.h" + +struct gmx_enerdata_t; +struct gmx_multisim_t; +struct t_commrec; +struct t_inputrec; +class t_state; + +/*! \libinternal + * \brief The parameters for the replica exchange algorithm. */ +struct ReplicaExchangeParameters +{ + //! Interval in steps at which to attempt exchanges, 0 means no replica exchange. + int exchangeInterval = 0; + //! The number of exchanges to attempt at an exchange step. + int numExchanges = 0; + //! The random seed, -1 means generate a seed. + int randomSeed = -1; +}; + +//! Abstract type for replica exchange +typedef struct gmx_repl_ex *gmx_repl_ex_t; + +/*! \brief Setup function. + * + * Should only be called on the master ranks */ +gmx_repl_ex_t +init_replica_exchange(FILE *fplog, + const gmx_multisim_t *ms, + int numAtomsInSystem, + const t_inputrec *ir, + const ReplicaExchangeParameters &replExParams); + +/*! \brief Attempts replica exchange. + * + * Should be called on all ranks. When running each replica in + * parallel, this routine collects the state on the master rank before + * exchange. With domain decomposition, the global state after + * exchange is stored in state and still needs to be redistributed + * over the ranks. + * + * \returns TRUE if the state has been exchanged. + */ +gmx_bool replica_exchange(FILE *fplog, + const t_commrec *cr, + const gmx_multisim_t *ms, + gmx_repl_ex_t re, + t_state *state, const gmx_enerdata_t *enerd, + t_state *state_local, + int64_t step, real time); + +/*! 
\brief Prints replica exchange statistics to the log file. + * + * Should only be called on the master ranks */ +void print_replica_exchange_statistics(FILE *fplog, gmx_repl_ex_t re); + +/* PLUMED HREX */ +extern int replica_exchange_get_repl(const gmx_repl_ex_t re); +extern int replica_exchange_get_nrepl(const gmx_repl_ex_t re); +extern void pd_collect_state(const t_commrec *cr, t_state *state); +extern void exchange_state(const gmx_multisim_t *ms, int b, t_state *state); +extern void copy_state_serial(const t_state *src, t_state *dest); +/* END PLUMED HREX */ + +#endif diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.h.preplumed b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.h.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..8f4211febe707fc6d474d2a5f840c2a2c79ca0de --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/replicaexchange.h.preplumed @@ -0,0 +1,108 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * + * \brief Declares the routines for replica exchange. + * + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \author Mark Abraham <mark.j.abraham@gmail.com> + * + * \ingroup module_mdrun + */ +#ifndef GMX_MDRUN_REPLICAEXCHANGE_H +#define GMX_MDRUN_REPLICAEXCHANGE_H + +#include <cstdio> + +#include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/real.h" + +struct gmx_enerdata_t; +struct gmx_multisim_t; +struct t_commrec; +struct t_inputrec; +class t_state; + +/*! \libinternal + * \brief The parameters for the replica exchange algorithm. */ +struct ReplicaExchangeParameters +{ + //! 
Interval in steps at which to attempt exchanges, 0 means no replica exchange. + int exchangeInterval = 0; + //! The number of exchanges to attempt at an exchange step. + int numExchanges = 0; + //! The random seed, -1 means generate a seed. + int randomSeed = -1; +}; + +//! Abstract type for replica exchange +typedef struct gmx_repl_ex *gmx_repl_ex_t; + +/*! \brief Setup function. + * + * Should only be called on the master ranks */ +gmx_repl_ex_t +init_replica_exchange(FILE *fplog, + const gmx_multisim_t *ms, + int numAtomsInSystem, + const t_inputrec *ir, + const ReplicaExchangeParameters &replExParams); + +/*! \brief Attempts replica exchange. + * + * Should be called on all ranks. When running each replica in + * parallel, this routine collects the state on the master rank before + * exchange. With domain decomposition, the global state after + * exchange is stored in state and still needs to be redistributed + * over the ranks. + * + * \returns TRUE if the state has been exchanged. + */ +gmx_bool replica_exchange(FILE *fplog, + const t_commrec *cr, + const gmx_multisim_t *ms, + gmx_repl_ex_t re, + t_state *state, const gmx_enerdata_t *enerd, + t_state *state_local, + int64_t step, real time); + +/*! \brief Prints replica exchange statistics to the log file. + * + * Should only be called on the master ranks */ +void print_replica_exchange_statistics(FILE *fplog, gmx_repl_ex_t re); + +#endif diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/runner.cpp b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/runner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ac29a3926d13b7b311cda18900a94aa5f3ffadba --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/runner.cpp @@ -0,0 +1,1960 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. 
+ * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements the MD runner routine calling all integrators. + * + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "runner.h" + +#include "config.h" + +#include <cassert> +#include <cinttypes> +#include <csignal> +#include <cstdlib> +#include <cstring> + +#include <algorithm> + +#include "gromacs/commandline/filenm.h" +#include "gromacs/compat/make_unique.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/domdec/localatomsetmanager.h" +#include "gromacs/ewald/ewald-utils.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/ewald/pme-gpu-program.h" +#include "gromacs/fileio/checkpoint.h" +#include "gromacs/fileio/gmxfio.h" +#include "gromacs/fileio/oenv.h" +#include "gromacs/fileio/tpxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gpu_utils/clfftinitializer.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/cpuinfo.h" +#include "gromacs/hardware/detecthardware.h" +#include "gromacs/hardware/printhardware.h" +#include "gromacs/listed-forces/disre.h" +#include "gromacs/listed-forces/gpubonded.h" +#include "gromacs/listed-forces/orires.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/utilities.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/boxdeformation.h" +#include "gromacs/mdlib/calc_verletbuf.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/makeconstraints.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/membed.h" +#include "gromacs/mdlib/nb_verlet.h" +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" +#include "gromacs/mdlib/nbnxn_search.h" +#include "gromacs/mdlib/nbnxn_tuning.h" +#include "gromacs/mdlib/ppforceworkload.h" +#include "gromacs/mdlib/qmmm.h" +#include "gromacs/mdlib/sighandler.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/stophandler.h" +#include "gromacs/mdrun/legacymdrunoptions.h" +#include "gromacs/mdrun/logging.h" +#include "gromacs/mdrun/multisim.h" +#include "gromacs/mdrun/simulationcontext.h" +#include "gromacs/mdrunutility/mdmodules.h" +#include "gromacs/mdrunutility/threadaffinity.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/fcdata.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/observableshistory.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/pulling/output.h" +#include "gromacs/pulling/pull.h" +#include "gromacs/pulling/pull_rotation.h" +#include "gromacs/restraint/manager.h" +#include "gromacs/restraint/restraintmdmodule.h" +#include "gromacs/restraint/restraintpotential.h" +#include "gromacs/swap/swapcoords.h" +#include "gromacs/taskassignment/decidegpuusage.h" +#include "gromacs/taskassignment/resourcedivision.h" +#include "gromacs/taskassignment/taskassignment.h" +#include "gromacs/taskassignment/usergpuids.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/trajectory/trajectoryframe.h" +#include "gromacs/utility/basenetwork.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include 
"gromacs/utility/fatalerror.h" +#include "gromacs/utility/filestream.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/gmxmpi.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/loggerbuilder.h" +#include "gromacs/utility/physicalnodecommunicator.h" +#include "gromacs/utility/pleasecite.h" +#include "gromacs/utility/programcontext.h" +#include "gromacs/utility/smalloc.h" +#include "gromacs/utility/stringutil.h" + +#include "integrator.h" +#include "replicaexchange.h" + +#if GMX_FAHCORE +#include "corewrap.h" +#endif + +namespace gmx +{ + +/*! \brief Barrier for safe simultaneous thread access to mdrunner data + * + * Used to ensure that the master thread does not modify mdrunner during copy + * on the spawned threads. */ +static void threadMpiMdrunnerAccessBarrier() +{ +#if GMX_THREAD_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif +} + +Mdrunner Mdrunner::cloneOnSpawnedThread() const +{ + auto newRunner = Mdrunner(); + + // All runners in the same process share a restraint manager resource because it is + // part of the interface to the client code, which is associated only with the + // original thread. Handles to the same resources can be obtained by copy. + { + newRunner.restraintManager_ = compat::make_unique<RestraintManager>(*restraintManager_); + } + + // Copy original cr pointer before master thread can pass the thread barrier + newRunner.cr = reinitialize_commrec_for_this_thread(cr); + + // Copy members of master runner. + // \todo Replace with builder when Simulation context and/or runner phases are better defined. + // Ref https://redmine.gromacs.org/issues/2587 and https://redmine.gromacs.org/issues/2375 + newRunner.hw_opt = hw_opt; + newRunner.filenames = filenames; + + newRunner.oenv = oenv; + newRunner.mdrunOptions = mdrunOptions; + newRunner.domdecOptions = domdecOptions; + newRunner.nbpu_opt = nbpu_opt; + newRunner.pme_opt = pme_opt; + newRunner.pme_fft_opt = pme_fft_opt; + newRunner.bonded_opt = bonded_opt; + newRunner.nstlist_cmdline = nstlist_cmdline; + newRunner.replExParams = replExParams; + newRunner.pforce = pforce; + newRunner.ms = ms; + newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_); + + threadMpiMdrunnerAccessBarrier(); + + GMX_RELEASE_ASSERT(!MASTER(newRunner.cr), "cloneOnSpawnedThread should only be called on spawned threads"); + + return newRunner; +} + +/*! \brief The callback used for running on spawned threads. + * + * Obtains the pointer to the master mdrunner object from the one + * argument permitted to the thread-launch API call, copies it to make + * a new runner for this thread, reinitializes necessary data, and + * proceeds to the simulation. */ +static void mdrunner_start_fn(const void *arg) +{ + try + { + auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner *>(arg); + /* copy the arg list to make sure that it's thread-local. This + doesn't copy pointed-to items, of course; fnm, cr and fplog + are reset in the call below, all others should be const. */ + gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread(); + mdrunner.mdrunner(); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; +} + + +/*! \brief Start thread-MPI threads. + * + * Called by mdrunner() to start a specific number of threads + * (including the main thread) for thread-parallel runs. This in turn + * calls mdrunner() for each thread. All options are the same as for + * mdrunner(). 
*/ +t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) const +{ + + /* first check whether we even need to start tMPI */ + if (numThreadsToLaunch < 2) + { + return cr; + } + +#if GMX_THREAD_MPI + /* now spawn new threads that start mdrunner_start_fn(), while + the main thread returns, we set thread affinity later */ + if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, + mdrunner_start_fn, static_cast<const void*>(this)) != TMPI_SUCCESS) + { + GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads")); + } + + threadMpiMdrunnerAccessBarrier(); +#else + GMX_UNUSED_VALUE(mdrunner_start_fn); +#endif + + return reinitialize_commrec_for_this_thread(cr); +} + +} // namespace gmx + +/*! \brief Initialize variables for Verlet scheme simulation */ +static void prepare_verlet_scheme(FILE *fplog, + t_commrec *cr, + t_inputrec *ir, + int nstlist_cmdline, + const gmx_mtop_t *mtop, + const matrix box, + bool makeGpuPairList, + const gmx::CpuInfo &cpuinfo) +{ + /* For NVE simulations, we will retain the initial list buffer */ + if (EI_DYNAMICS(ir->eI) && + ir->verletbuf_tol > 0 && + !(EI_MD(ir->eI) && ir->etc == etcNO)) + { + /* Update the Verlet buffer size for the current run setup */ + + /* Here we assume SIMD-enabled kernels are being used. But as currently + * calc_verlet_buffer_size gives the same results for 4x8 and 4x4 + * and 4x2 gives a larger buffer than 4x4, this is ok. + */ + ListSetupType listType = (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported); + VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType); + + real rlist_new; + calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &listSetup, nullptr, &rlist_new); + + if (rlist_new != ir->rlist) + { + if (fplog != nullptr) + { + fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n", + ir->rlist, rlist_new, + listSetup.cluster_size_i, listSetup.cluster_size_j); + } + ir->rlist = rlist_new; + } + } + + if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0)) + { + gmx_fatal(FARGS, "Can not set nstlist without %s", + !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance"); + } + + if (EI_DYNAMICS(ir->eI)) + { + /* Set or try nstlist values */ + increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo); + } +} + +/*! \brief Override the nslist value in inputrec + * + * with value passed on the command line (if any) + */ +static void override_nsteps_cmdline(const gmx::MDLogger &mdlog, + int64_t nsteps_cmdline, + t_inputrec *ir) +{ + assert(ir); + + /* override with anything else than the default -2 */ + if (nsteps_cmdline > -2) + { + char sbuf_steps[STEPSTRSIZE]; + char sbuf_msg[STRLEN]; + + ir->nsteps = nsteps_cmdline; + if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1) + { + sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps", + gmx_step_str(nsteps_cmdline, sbuf_steps), + fabs(nsteps_cmdline*ir->delta_t)); + } + else + { + sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps", + gmx_step_str(nsteps_cmdline, sbuf_steps)); + } + + GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg); + } + else if (nsteps_cmdline < -2) + { + gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64, + nsteps_cmdline); + } + /* Do nothing if nsteps_cmdline == -2 */ +} + +namespace gmx +{ + +/*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings. 
+ * + * If not, and if a warning may be issued, logs a warning about + * falling back to CPU code. With thread-MPI, only the first + * call to this function should have \c issueWarning true. */ +static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger &mdlog, + const t_inputrec *ir, + bool issueWarning) +{ + if (ir->opts.ngener - ir->nwall > 1) + { + /* The GPU code does not support more than one energy group. + * If the user requested GPUs explicitly, a fatal error is given later. + */ + if (issueWarning) + { + GMX_LOG(mdlog.warning).asParagraph() + .appendText("Multiple energy groups is not implemented for GPUs, falling back to the CPU. " + "For better performance, run on the GPU without energy groups and then do " + "gmx mdrun -rerun option on the trajectory with an energy group .tpr file."); + } + return false; + } + return true; +} + +//! Initializes the logger for mdrun. +static gmx::LoggerOwner buildLogger(FILE *fplog, const t_commrec *cr) +{ + gmx::LoggerBuilder builder; + if (fplog != nullptr) + { + builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog); + } + if (cr == nullptr || SIMMASTER(cr)) + { + builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, + &gmx::TextOutputFile::standardError()); + } + return builder.build(); +} + +//! Make a TaskTarget from an mdrun argument string. +static TaskTarget findTaskTarget(const char *optionString) +{ + TaskTarget returnValue = TaskTarget::Auto; + + if (strncmp(optionString, "auto", 3) == 0) + { + returnValue = TaskTarget::Auto; + } + else if (strncmp(optionString, "cpu", 3) == 0) + { + returnValue = TaskTarget::Cpu; + } + else if (strncmp(optionString, "gpu", 3) == 0) + { + returnValue = TaskTarget::Gpu; + } + else + { + GMX_ASSERT(false, "Option string should have been checked for sanity already"); + } + + return returnValue; +} + +int Mdrunner::mdrunner() +{ + matrix box; + t_nrnb *nrnb; + t_forcerec *fr = nullptr; + t_fcdata *fcd = nullptr; + real ewaldcoeff_q = 0; + real ewaldcoeff_lj = 0; + int nChargePerturbed = -1, nTypePerturbed = 0; + gmx_wallcycle_t wcycle; + gmx_walltime_accounting_t walltime_accounting = nullptr; + int rc; + int64_t reset_counters; + int nthreads_pme = 1; + gmx_membed_t * membed = nullptr; + gmx_hw_info_t *hwinfo = nullptr; + + /* CAUTION: threads may be started later on in this function, so + cr doesn't reflect the final parallel state right now */ + std::unique_ptr<gmx::MDModules> mdModules(new gmx::MDModules); + t_inputrec inputrecInstance; + t_inputrec *inputrec = &inputrecInstance; + gmx_mtop_t mtop; + + bool doMembed = opt2bSet("-membed", filenames.size(), filenames.data()); + bool doRerun = mdrunOptions.rerun; + + // Handle task-assignment related user options. + EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ? + EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No); + std::vector<int> gpuIdsAvailable; + try + { + gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable); + // TODO We could put the GPU IDs into a std::map to find + // duplicates, but for the small numbers of IDs involved, this + // code is simple and fast. 
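/* [Editor's illustrative sketch — not part of the GROMACS source or the PLUMED
 * patch.] The loop that follows is the O(n^2) pairwise duplicate scan that the
 * TODO above mentions. The std::set alternative it alludes to would look
 * roughly like this (hypothetical helper name, standard library only):
 */
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

// Throws on the first repeated device ID; O(n log n) rather than O(n^2).
static void throwOnDuplicateGpuIds(const std::vector<int> &ids)
{
    std::set<int> seen;
    for (int id : ids)
    {
        // std::set::insert() returns {iterator, wasInserted}.
        if (!seen.insert(id).second)
        {
            throw std::invalid_argument("duplicate GPU device ID " + std::to_string(id));
        }
    }
}
// For the handful of IDs mdrun sees in practice, the quadratic scan is just as
// fast, which is why the original code keeps it simple.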
+ for (size_t i = 0; i != gpuIdsAvailable.size(); ++i) + { + for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j) + { + if (gpuIdsAvailable[i] == gpuIdsAvailable[j]) + { + GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str()))); + } + } + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + std::vector<int> userGpuTaskAssignment; + try + { + userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + auto nonbondedTarget = findTaskTarget(nbpu_opt); + auto pmeTarget = findTaskTarget(pme_opt); + auto pmeFftTarget = findTaskTarget(pme_fft_opt); + auto bondedTarget = findTaskTarget(bonded_opt); + PmeRunMode pmeRunMode = PmeRunMode::None; + + // Here we assume that SIMMASTER(cr) does not change even after the + // threads are started. + + FILE *fplog = nullptr; + // If we are appending, we don't write log output because we need + // to check that the old log file matches what the checkpoint file + // expects. Otherwise, we should start to write log output now if + // there is a file ready for it. + if (logFileHandle != nullptr && !mdrunOptions.continuationOptions.appendFiles) + { + fplog = gmx_fio_getfp(logFileHandle); + } + gmx::LoggerOwner logOwner(buildLogger(fplog, cr)); + gmx::MDLogger mdlog(logOwner.logger()); + + // TODO The thread-MPI master rank makes a working + // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks + // after the threads have been launched. This works because no use + // is made of that communicator until after the execution paths + // have rejoined. But it is likely that we can improve the way + // this is expressed, e.g. by expressly running detection only the + // master rank for thread-MPI, rather than relying on the mutex + // and reference count. + PhysicalNodeCommunicator physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash()); + hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm); + + gmx_print_detected_hardware(fplog, cr, ms, mdlog, hwinfo); + + std::vector<int> gpuIdsToUse; + auto compatibleGpus = getCompatibleGpus(hwinfo->gpu_info); + if (gpuIdsAvailable.empty()) + { + gpuIdsToUse = compatibleGpus; + } + else + { + for (const auto &availableGpuId : gpuIdsAvailable) + { + bool availableGpuIsCompatible = false; + for (const auto &compatibleGpuId : compatibleGpus) + { + if (availableGpuId == compatibleGpuId) + { + availableGpuIsCompatible = true; + break; + } + } + if (!availableGpuIsCompatible) + { + gmx_fatal(FARGS, "You limited the set of compatible GPUs to a set that included ID #%d, but that ID is not for a compatible GPU. 
List only compatible GPUs.", availableGpuId); + } + gpuIdsToUse.push_back(availableGpuId); + } + } + + if (fplog != nullptr) + { + /* Print references after all software/hardware printing */ + please_cite(fplog, "Abraham2015"); + please_cite(fplog, "Pall2015"); + please_cite(fplog, "Pronk2013"); + please_cite(fplog, "Hess2008b"); + please_cite(fplog, "Spoel2005a"); + please_cite(fplog, "Lindahl2001a"); + please_cite(fplog, "Berendsen95a"); + writeSourceDoi(fplog); + } + + std::unique_ptr<t_state> globalState; + + if (SIMMASTER(cr)) + { + /* Only the master rank has the global state */ + globalState = compat::make_unique<t_state>(); + + /* Read (nearly) all data required for the simulation */ + read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()), inputrec, globalState.get(), &mtop); + + /* In rerun, set velocities to zero if present */ + if (doRerun && ((globalState->flags & (1 << estV)) != 0)) + { + // rerun does not use velocities + GMX_LOG(mdlog.info).asParagraph().appendText( + "Rerun trajectory contains velocities. Rerun does only evaluate " + "potential energy and forces. The velocities will be ignored."); + for (int i = 0; i < globalState->natoms; i++) + { + clear_rvec(globalState->v[i]); + } + globalState->flags &= ~(1 << estV); + } + + if (inputrec->cutoff_scheme != ecutsVERLET) + { + if (nstlist_cmdline > 0) + { + gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme"); + } + + if (!compatibleGpus.empty()) + { + GMX_LOG(mdlog.warning).asParagraph().appendText( + "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" + " To use a GPU, set the mdp option: cutoff-scheme = Verlet"); + } + } + } + + /* Check and update the hardware options for internal consistency */ + check_and_update_hw_opt_1(mdlog, &hw_opt, cr, domdecOptions.numPmeRanks); + + /* Early check for externally set process affinity. */ + gmx_check_thread_affinity_set(mdlog, cr, + &hw_opt, hwinfo->nthreads_hw_avail, FALSE); + + if (GMX_THREAD_MPI && SIMMASTER(cr)) + { + if (domdecOptions.numPmeRanks > 0 && hw_opt.nthreads_tmpi <= 0) + { + gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks"); + } + + /* Since the master knows the cut-off scheme, update hw_opt for this. + * This is done later for normal MPI and also once more with tMPI + * for all tMPI ranks. + */ + check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme); + + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + try + { + // If the user specified the number of ranks, then we must + // respect that, but in default mode, we need to allow for + // the number of GPUs to choose the number of ranks. + auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr); + useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi + (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded, + canUseGpuForNonbonded, + inputrec->cutoff_scheme == ecutsVERLET, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, GMX_THREAD_MPI), + hw_opt.nthreads_tmpi); + useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi + (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, + *hwinfo, *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks); + + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + /* Determine how many thread-MPI ranks to start. + * + * TODO Over-writing the user-supplied value here does + * prevent any possible subsequent checks from working + * correctly. 
*/ + hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo, + &hw_opt, + gpuIdsToUse, + useGpuForNonbonded, + useGpuForPme, + inputrec, &mtop, + mdlog, + doMembed); + + // Now start the threads for thread MPI. + cr = spawnThreads(hw_opt.nthreads_tmpi); + /* The main thread continues here with a new cr. We don't deallocate + the old cr because other threads may still be reading it. */ + // TODO Both master and spawned threads call dup_tfn and + // reinitialize_commrec_for_this_thread. Find a way to express + // this better. + physicalNodeComm = PhysicalNodeCommunicator(MPI_COMM_WORLD, gmx_physicalnode_id_hash()); + } + // END OF CAUTION: cr and physicalNodeComm are now reliable + + if (PAR(cr)) + { + /* now broadcast everything to the non-master nodes/threads: */ + init_parallel(cr, inputrec, &mtop); + } + + // Now each rank knows the inputrec that SIMMASTER read and used, + // and (if applicable) cr->nnodes has been assigned the number of + // thread-MPI ranks that have been chosen. The ranks can now all + // run the task-deciding functions and will agree on the result + // without needing to communicate. + // + // TODO Should we do the communication in debug mode to support + // having an assertion? + // + // Note that these variables describe only their own node. + // + // Note that when bonded interactions run on a GPU they always run + // alongside a nonbonded task, so do not influence task assignment + // even though they affect the force calculation workload. + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + bool useGpuForBonded = false; + try + { + // It's possible that there are different numbers of GPUs on + // different nodes, which is the user's responsibility to + // handle. If unsuitable, we will notice that during task + // assignment. + bool gpusWereDetected = hwinfo->ngpu_compatible_tot > 0; + bool usingVerletScheme = inputrec->cutoff_scheme == ecutsVERLET; + auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr); + useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment, + emulateGpuNonbonded, + canUseGpuForNonbonded, + usingVerletScheme, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, !GMX_THREAD_MPI), + gpusWereDetected); + useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, + *hwinfo, *inputrec, mtop, + cr->nnodes, domdecOptions.numPmeRanks, + gpusWereDetected); + auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr); + useGpuForBonded = + decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme, usingVerletScheme, + bondedTarget, canUseGpuForBonded, + EVDW_PME(inputrec->vdwtype), + EEL_PME_EWALD(inputrec->coulombtype), + domdecOptions.numPmeRanks, gpusWereDetected); + + pmeRunMode = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU); + if (pmeRunMode == PmeRunMode::GPU) + { + if (pmeFftTarget == TaskTarget::Cpu) + { + pmeRunMode = PmeRunMode::Mixed; + } + } + else if (pmeFftTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME on CPU you should not be using -pmefft."); + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; +
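/* [Editor's illustrative sketch — not part of the GROMACS source or the PLUMED
 * patch.] The PME run-mode selection just above reduces to a small decision
 * table: PME on GPU runs fully on the GPU unless the FFTs are pinned to the
 * CPU ("mixed" mode), and requesting GPU FFTs with CPU PME is an input error.
 * A standalone restatement with simplified enums (hypothetical names):
 */
#include <stdexcept>

enum class Target { Auto, Cpu, Gpu };
enum class RunMode { Cpu, Gpu, Mixed };

static RunMode choosePmeRunMode(bool useGpuForPme, Target pmeFftTarget)
{
    if (!useGpuForPme)
    {
        if (pmeFftTarget == Target::Gpu)
        {
            // Mirrors the gmx_fatal above: GPU FFTs require PME itself on a GPU.
            throw std::invalid_argument("-pmefft gpu requires -pme gpu");
        }
        return RunMode::Cpu;
    }
    // PME offloaded; FFTs stay on the CPU only when explicitly requested.
    return (pmeFftTarget == Target::Cpu) ? RunMode::Mixed : RunMode::Gpu;
}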
+ // Build restraints. + // TODO: hide restraint implementation details from Mdrunner. + // There is nothing unique about restraints at this point as far as the + // Mdrunner is concerned. The Mdrunner should just be getting a sequence of + // factory functions from the SimulationContext on which to call mdModules->add(). + // TODO: capture all restraints into a single RestraintModule, passed to the runner builder. + for (auto && restraint : restraintManager_->getRestraints()) + { + auto module = RestraintMDModule::create(restraint, + restraint->sites()); + mdModules->add(std::move(module)); + } + + // TODO: Error handling + mdModules->assignOptionsToModules(*inputrec->params, nullptr); + + if (fplog != nullptr) + { + pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE); + fprintf(fplog, "\n"); + } + + if (SIMMASTER(cr)) + { + /* now make sure the state is initialized and propagated */ + set_state_entries(globalState.get(), inputrec); + } + + /* NM and TPI parallelize over force/energy calculations, not atoms, + * so we need to initialize and broadcast the global state. + */ + if (inputrec->eI == eiNM || inputrec->eI == eiTPI) + { + if (!MASTER(cr)) + { + globalState = compat::make_unique<t_state>(); + } + broadcastStateWithoutDynamics(cr, globalState.get()); + } + + /* A parallel command line option consistency check that we can + only do after any threads have started. */ + if (!PAR(cr) && (domdecOptions.numCells[XX] > 1 || + domdecOptions.numCells[YY] > 1 || + domdecOptions.numCells[ZZ] > 1 || + domdecOptions.numPmeRanks > 0)) + { + gmx_fatal(FARGS, + "The -dd or -npme options request a parallel simulation, " +#if !GMX_MPI + "but %s was compiled without threads or MPI enabled", output_env_get_program_display_name(oenv)); +#else +#if GMX_THREAD_MPI + "but the number of MPI-threads (option -ntmpi) is not set or is 1"); +#else + "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec", output_env_get_program_display_name(oenv)); +#endif +#endif + } + + if (doRerun && + (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI)) + { + gmx_fatal(FARGS, "The .mdp file specified an energy minimization or normal mode algorithm, and these are not compatible with mdrun -rerun"); + } + + if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr)) + { + gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank"); + } + + if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))) + { + if (domdecOptions.numPmeRanks > 0) + { + gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr), + "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ"); + } + + domdecOptions.numPmeRanks = 0; + } + + if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0) + { + /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can + * improve performance with many threads per GPU, since our OpenMP + * scaling is bad, but it's difficult to automate the setup. + */ + domdecOptions.numPmeRanks = 0; + } + if (useGpuForPme) + { + if (domdecOptions.numPmeRanks < 0) + { + domdecOptions.numPmeRanks = 0; + // TODO possibly print a note that one can opt-in for a separate PME GPU rank? + } + else + { + GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported"); + } + } + +#if GMX_FAHCORE + if (MASTER(cr)) + { + fcRegisterSteps(inputrec->nsteps, inputrec->init_step); + } +#endif +
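/* [Editor's illustrative sketch — not part of the GROMACS source or the PLUMED
 * patch.] The numPmeRanks adjustments above follow one rule: an explicit user
 * choice is respected (and validated), while the -1 "auto" sentinel collapses
 * to zero separate PME ranks whenever GPUs take the work. Restated standalone
 * (hypothetical helper name):
 */
#include <stdexcept>

// userNumPmeRanks: -1 means "decide for me", >= 0 is an explicit request.
static int resolveNumPmeRanks(int userNumPmeRanks, bool useGpuForNonbonded, bool useGpuForPme)
{
    if (userNumPmeRanks < 0)
    {
        // Automatic PME-only ranks bring no benefit once GPUs do the work.
        return (useGpuForNonbonded || useGpuForPme) ? 0 : userNumPmeRanks;
    }
    if (useGpuForPme && userNumPmeRanks > 1)
    {
        throw std::invalid_argument("PME GPU decomposition is not supported");
    }
    return userNumPmeRanks;
}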
+ /* NMR restraints must be initialized before load_checkpoint, + * since with time averaging the history is added to t_state. + * For a proper consistency check we therefore need to extend + * t_state here. + * So the PME-only nodes (if present) will also initialize + * the distance restraints. + */ + snew(fcd, 1); + + /* This needs to be called before read_checkpoint to extend the state */ + init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0); + + init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires)); + + auto deform = prepareBoxDeformation(globalState->box, cr, *inputrec); + + ObservablesHistory observablesHistory = {}; + + ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + + if (continuationOptions.startedFromCheckpoint) + { + /* Check if checkpoint file exists before doing continuation. + * This way we can use identical input options for the first and subsequent runs... + */ + gmx_bool bReadEkin; + + if (mdrunOptions.numStepsCommandline > -2) + { + /* Temporarily set the number of steps to unlimited to avoid + * triggering the nsteps check in load_checkpoint(). + * This hack will go away soon when the -nsteps option is removed. + */ + inputrec->nsteps = -1; + } + + load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr), + logFileHandle, + cr, domdecOptions.numCells, + inputrec, globalState.get(), + &bReadEkin, &observablesHistory, + continuationOptions.appendFiles, + continuationOptions.appendFilesOptionSet, + mdrunOptions.reproducible); + + if (bReadEkin) + { + continuationOptions.haveReadEkin = true; + } + + if (continuationOptions.appendFiles && logFileHandle) + { + // Now we can start normal logging to the truncated log file. + fplog = gmx_fio_getfp(logFileHandle); + prepareLogAppending(fplog); + logOwner = buildLogger(fplog, cr); + mdlog = logOwner.logger(); + } + } + + if (mdrunOptions.numStepsCommandline > -2) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -nsteps functionality is deprecated, and may be removed in a future version. " + "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field."); + } + /* override nsteps with value set on the command line */ + override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec); + + if (SIMMASTER(cr)) + { + copy_mat(globalState->box, box); + } + + if (PAR(cr)) + { + gmx_bcast(sizeof(box), box, cr); + } + + /* Update rlist and nstlist. */ + if (inputrec->cutoff_scheme == ecutsVERLET) + { + prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box, + useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo); + } + + LocalAtomSetManager atomSets; + + if (PAR(cr) && !(EI_TPI(inputrec->eI) || + inputrec->eI == eiNM)) + { + cr->dd = init_domain_decomposition(mdlog, cr, domdecOptions, mdrunOptions, + &mtop, inputrec, + box, positionsFromStatePointer(globalState.get()), + &atomSets); + // Note that local state still does not exist yet. + } + else + { + /* PME, if used, is done on all nodes with 1D decomposition */ + cr->npmenodes = 0; + cr->duty = (DUTY_PP | DUTY_PME); + + if (inputrec->ePBC == epbcSCREW) + { + gmx_fatal(FARGS, + "pbc=screw is only implemented with domain decomposition"); + } + } + + if (PAR(cr)) + { + /* After possible communicator splitting in make_dd_communicators, + * we can set up the intra/inter node communication. + */ + gmx_setup_nodecomm(fplog, cr); + } + +#if GMX_MPI + if (isMultiSim(ms)) + { + GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( + "This is simulation %d out of %d running as a composite GROMACS\n" + "multi-simulation job. 
Setup for this simulation:\n", + ms->sim, ms->nsim); + } + GMX_LOG(mdlog.warning).appendTextFormatted( + "Using %d MPI %s\n", + cr->nnodes, +#if GMX_THREAD_MPI + cr->nnodes == 1 ? "thread" : "threads" +#else + cr->nnodes == 1 ? "process" : "processes" +#endif + ); + fflush(stderr); +#endif + + /* Check and update hw_opt for the cut-off scheme */ + check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme); + + /* Check and update the number of OpenMP threads requested */ + checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_, + pmeRunMode, mtop); + + gmx_omp_nthreads_init(mdlog, cr, + hwinfo->nthreads_hw_avail, + physicalNodeComm.size_, + hw_opt.nthreads_omp, + hw_opt.nthreads_omp_pme, + !thisRankHasDuty(cr, DUTY_PP), + inputrec->cutoff_scheme == ecutsVERLET); + + // Enable FP exception detection for the Verlet scheme, but not in + // Release mode and not for compilers with known buggy FP + // exception support (clang with any optimization) or suspected + // buggy FP exception support (gcc 7.* with optimization). +#if !defined NDEBUG && \ + !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \ + && defined __OPTIMIZE__) + const bool bEnableFPE = inputrec->cutoff_scheme == ecutsVERLET; +#else + const bool bEnableFPE = false; +#endif + //FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run() + if (bEnableFPE) + { + gmx_feenableexcept(); + } + + // Build a data structure that expresses which kinds of non-bonded + // task are handled by this rank. + // + // TODO Later, this might become a loop over all registered modules + // relevant to the mdp inputs, to find those that have such tasks. + // + // TODO This could move before init_domain_decomposition() as part + // of refactoring that separates the responsibility for duty + // assignment from setup for communication between tasks, and + // setup for tasks handled with a domain (ie including short-ranged + // tasks, bonded tasks, etc.). + // + // Note that in general useGpuForNonbonded, etc. can have a value + // that is inconsistent with the presence of actual GPUs on any + // rank, and that is not known to be a problem until the + // duty of the ranks on a node become known. + // + // TODO Later we might need the concept of computeTasksOnThisRank, + // from which we construct gpuTasksOnThisRank. + // + // Currently the DD code assigns duty to ranks that can + // include PP work that currently can be executed on a single + // GPU, if present and compatible. This has to be coordinated + // across PP ranks on a node, with possible multiple devices + // or sharing devices on a node, either from the user + // selection, or automatically. + auto haveGpus = !gpuIdsToUse.empty(); + std::vector<GpuTask> gpuTasksOnThisRank; + if (thisRankHasDuty(cr, DUTY_PP)) + { + if (useGpuForNonbonded) + { + // Note that any bonded tasks on a GPU always accompany a + // non-bonded task. + if (haveGpus) + { + gpuTasksOnThisRank.push_back(GpuTask::Nonbonded); + } + else if (nonbondedTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because there is none detected."); + } + else if (bondedTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run bonded interactions on a GPU because there is none detected."); + } + } + } + // TODO cr->duty & DUTY_PME should imply that a PME algorithm is active, but currently does not. 
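/* [Editor's illustrative sketch — not part of the GROMACS source or the PLUMED
 * patch.] gpuTasksOnThisRank is being assembled exactly as sketched below: a
 * PP duty can contribute a Nonbonded GPU task (bonded GPU work rides along
 * with it), and, in the code that follows, a PME duty can contribute a Pme
 * task. Simplified standalone form (hypothetical names):
 */
#include <vector>

enum class Task { Nonbonded, Pme };

static std::vector<Task> listGpuTasksOnRank(bool hasPpDuty, bool hasPmeDuty,
                                            bool useGpuForNonbonded, bool useGpuForPme,
                                            bool haveGpus)
{
    std::vector<Task> tasks;
    if (hasPpDuty && useGpuForNonbonded && haveGpus)
    {
        tasks.push_back(Task::Nonbonded);
    }
    if (hasPmeDuty && useGpuForPme && haveGpus)
    {
        tasks.push_back(Task::Pme);
    }
    return tasks;
}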
+ if (EEL_PME(inputrec->coulombtype) && (thisRankHasDuty(cr, DUTY_PME))) + { + if (useGpuForPme) + { + if (haveGpus) + { + gpuTasksOnThisRank.push_back(GpuTask::Pme); + } + else if (pmeTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run PME on a GPU because there is none detected."); + } + } + } + + GpuTaskAssignment gpuTaskAssignment; + try + { + // Produce the task assignment for this rank. + gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo, + mdlog, cr, ms, physicalNodeComm, gpuTasksOnThisRank, + useGpuForBonded, pmeRunMode); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + /* Prevent other ranks from continuing after an issue was found + * and reported as a fatal error. + * + * TODO This function implements a barrier so that MPI runtimes + * can organize an orderly shutdown if one of the ranks has had to + * issue a fatal error in various code already run. When we have + * MPI-aware error handling and reporting, this should be + * improved. */ +#if GMX_MPI + if (PAR(cr)) + { + MPI_Barrier(cr->mpi_comm_mysim); + } + if (isMultiSim(ms)) + { + if (SIMMASTER(cr)) + { + MPI_Barrier(ms->mpi_comm_masters); + } + /* We need another barrier to prevent non-master ranks from continuing + * when an error occurred in a different simulation. + */ + MPI_Barrier(cr->mpi_comm_mysim); + } +#endif + + /* Now that we know the setup is consistent, check for efficiency */ + check_resource_division_efficiency(hwinfo, !gpuTaskAssignment.empty(), mdrunOptions.ntompOptionIsSet, + cr, mdlog); + + gmx_device_info_t *nonbondedDeviceInfo = nullptr; + + if (thisRankHasDuty(cr, DUTY_PP)) + { + // This works because only one task of each type is currently permitted. + auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), + hasTaskType<GpuTask::Nonbonded>); + if (nbGpuTaskMapping != gpuTaskAssignment.end()) + { + int nonbondedDeviceId = nbGpuTaskMapping->deviceId_; + nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId); + init_gpu(nonbondedDeviceInfo); + + if (DOMAINDECOMP(cr)) + { + /* When we share GPUs over ranks, we need to know this for the DLB */ + dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId); + } + + } + } + + std::unique_ptr<ClfftInitializer> initializedClfftLibrary; + + gmx_device_info_t *pmeDeviceInfo = nullptr; + // This program could contain kernels that might later be + // re-used as auto-tuning progresses, or as subsequent simulations + // are invoked. + PmeGpuProgramStorage pmeGpuProgram; + // This works because only one task of each type is currently permitted. + auto pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>); + const bool thisRankHasPmeGpuTask = (pmeGpuTaskMapping != gpuTaskAssignment.end()); + if (thisRankHasPmeGpuTask) + { + pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_); + init_gpu(pmeDeviceInfo); + pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo); + // TODO It would be nice to move this logic into the factory + // function. See Redmine #2535. 
+ bool isMasterThread = !GMX_THREAD_MPI || MASTER(cr); + if (pmeRunMode == PmeRunMode::GPU && !initializedClfftLibrary && isMasterThread) + { + initializedClfftLibrary = initializeClfftLibrary(); + } + } + + /* getting number of PP/PME threads + PME: env variable should be read only on one node to make sure it is + identical everywhere; + */ + nthreads_pme = gmx_omp_nthreads_get(emntPME); + + int numThreadsOnThisRank; + /* threads on this MPI process or TMPI thread */ + if (thisRankHasDuty(cr, DUTY_PP)) + { + numThreadsOnThisRank = gmx_omp_nthreads_get(emntNonbonded); + } + else + { + numThreadsOnThisRank = nthreads_pme; + } + + checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, + *hwinfo->hardwareTopology, + physicalNodeComm, mdlog); + + if (hw_opt.thread_affinity != threadaffOFF) + { + /* Before setting affinity, check whether the affinity has changed + * - which indicates that probably the OpenMP library has changed it + * since we first checked). + */ + gmx_check_thread_affinity_set(mdlog, cr, + &hw_opt, hwinfo->nthreads_hw_avail, TRUE); + + int numThreadsOnThisNode, intraNodeThreadOffset; + analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode, + &intraNodeThreadOffset); + + /* Set the CPU affinity */ + gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology, + numThreadsOnThisRank, numThreadsOnThisNode, + intraNodeThreadOffset, nullptr); + } + + if (mdrunOptions.timingOptions.resetStep > -1) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -resetstep functionality is deprecated, and may be removed in a future version."); + } + wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr); + + if (PAR(cr)) + { + /* Master synchronizes its value of reset_counters with all nodes + * including PME only nodes */ + reset_counters = wcycle_get_reset_counters(wcycle); + gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr); + wcycle_set_reset_counters(wcycle, reset_counters); + } + + // Membrane embedding must be initialized before we call init_forcerec() + if (doMembed) + { + if (MASTER(cr)) + { + fprintf(stderr, "Initializing membed"); + } + /* Note that membed cannot work in parallel because mtop is + * changed here. Fix this if we ever want to make it run with + * multiple ranks. */ + membed = init_membed(fplog, filenames.size(), filenames.data(), &mtop, inputrec, globalState.get(), cr, + &mdrunOptions + .checkpointOptions.period); + } + + std::unique_ptr<MDAtoms> mdAtoms; + std::unique_ptr<gmx_vsite_t> vsite; + + snew(nrnb, 1); + if (thisRankHasDuty(cr, DUTY_PP)) + { + /* Initiate forcerecord */ + fr = mk_forcerec(); + fr->forceProviders = mdModules->initForceProviders(); + init_forcerec(fplog, mdlog, fr, fcd, + inputrec, &mtop, cr, box, + opt2fn("-table", filenames.size(), filenames.data()), + opt2fn("-tablep", filenames.size(), filenames.data()), + opt2fns("-tableb", filenames.size(), filenames.data()), + *hwinfo, nonbondedDeviceInfo, + useGpuForBonded, + FALSE, + pforce); + + /* Initialize the mdAtoms structure. + * mdAtoms is not filled with atom data, + * as this can not be done now with domain decomposition. + */ + mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask); + if (globalState && thisRankHasPmeGpuTask) + { + // The pinning of coordinates in the global state object works, because we only use + // PME on GPU without DD or on a separate PME rank, and because the local state pointer + // points to the global state object without DD. 
+ // FIXME: MD and EM separately set up the local state - this should happen in the same function, + // which should also perform the pinning. + changePinningPolicy(&globalState->x, pme_get_pinning_policy()); + } + + /* Initialize the virtual site communication */ + vsite = initVsite(mtop, cr); + + calc_shifts(box, fr->shift_vec); + + /* With periodic molecules the charge groups should be whole at start up + * and the virtual sites should not be far from their proper positions. + */ + if (!inputrec->bContinuation && MASTER(cr) && + !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols)) + { + /* Make molecules whole at start of run */ + if (fr->ePBC != epbcNONE) + { + do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array()); + } + if (vsite) + { + /* Correct initial vsite positions are required + * for the initial distribution in the domain decomposition + * and for the initial shell prediction. + */ + constructVsitesGlobal(mtop, globalState->x); + } + } + + if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + ewaldcoeff_q = fr->ic->ewaldcoeff_q; + ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; + } + } + else + { + /* This is a PME only node */ + + GMX_ASSERT(globalState == nullptr, "We don't need the state on a PME only rank and expect it to be uninitialized"); + + ewaldcoeff_q = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol); + ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj); + } + + gmx_pme_t *sepPmeData = nullptr; + // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks + GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr), "Double-checking that only PME-only ranks have no forcerec"); + gmx_pme_t * &pmedata = fr ? fr->pmedata : sepPmeData; + + /* Initiate PME if necessary, + * either on all nodes or on dedicated PME nodes only. */ + if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)) + { + if (mdAtoms && mdAtoms->mdatoms()) + { + nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed; + if (EVDW_PME(inputrec->vdwtype)) + { + nTypePerturbed = mdAtoms->mdatoms()->nTypePerturbed; + } + } + if (cr->npmenodes > 0) + { + /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/ + gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr); + gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr); + } + + if (thisRankHasDuty(cr, DUTY_PME)) + { + try + { + pmedata = gmx_pme_init(cr, + getNumPmeDomains(cr->dd), + inputrec, + mtop.natoms, nChargePerturbed != 0, nTypePerturbed != 0, + mdrunOptions.reproducible, + ewaldcoeff_q, ewaldcoeff_lj, + nthreads_pme, + pmeRunMode, nullptr, + pmeDeviceInfo, pmeGpuProgram.get(), mdlog); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + } + + + if (EI_DYNAMICS(inputrec->eI)) + { + /* Turn on signal handling on all nodes */ + /* + * A user signal from the PME nodes (if any) + * is communicated to the PP nodes. 
+ */ + signal_handler_install(); + } + + if (thisRankHasDuty(cr, DUTY_PP)) + { + /* Assumes uniform use of the number of OpenMP threads */ + walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault)); + + if (inputrec->bPull) + { + /* Initialize pull code */ + inputrec->pull_work = + init_pull(fplog, inputrec->pull, inputrec, + &mtop, cr, &atomSets, inputrec->fepvals->init_lambda); + if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage) + { + initPullHistory(inputrec->pull_work, &observablesHistory); + } + if (EI_DYNAMICS(inputrec->eI) && MASTER(cr)) + { + init_pull_output_files(inputrec->pull_work, + filenames.size(), filenames.data(), oenv, + continuationOptions); + } + } + + std::unique_ptr<EnforcedRotation> enforcedRotation; + if (inputrec->bRot) + { + /* Initialize enforced rotation code */ + enforcedRotation = init_rot(fplog, + inputrec, + filenames.size(), + filenames.data(), + cr, + &atomSets, + globalState.get(), + &mtop, + oenv, + mdrunOptions); + } + + if (inputrec->eSwapCoords != eswapNO) + { + /* Initialize ion swapping code */ + init_swapcoords(fplog, inputrec, opt2fn_master("-swap", filenames.size(), filenames.data(), cr), + &mtop, globalState.get(), &observablesHistory, + cr, &atomSets, oenv, mdrunOptions); + } + + /* Let makeConstraints know whether we have essential dynamics constraints. + * TODO: inputrec should tell us whether we use an algorithm, not a file option or the checkpoint + */ + bool doEssentialDynamics = (opt2fn_null("-ei", filenames.size(), filenames.data()) != nullptr + || observablesHistory.edsamHistory); + auto constr = makeConstraints(mtop, *inputrec, doEssentialDynamics, + fplog, *mdAtoms->mdatoms(), + cr, ms, nrnb, wcycle, fr->bMolPBC); + + if (DOMAINDECOMP(cr)) + { + GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP"); + /* This call is not included in init_domain_decomposition mainly + * because fr->cginfo_mb is set later. + */ + dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec, + domdecOptions.checkBondedInteractions, + fr->cginfo_mb); + } + + // TODO This is not the right place to manage the lifetime of + // this data structure, but currently it's the easiest way to + // make it work. Later, it should probably be made/updated + // after the workload for the lifetime of a PP domain is + // understood. + PpForceWorkload ppForceWorkload; + + GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to integrator."); + + /* PLUMED */ + if(plumedswitch){ + /* detect plumed API version */ + int pversion=0; + plumed_cmd(plumedmain,"getApiVersion",&pversion); + if(pversion>5) { + int nth = gmx_omp_nthreads_get(emntDefault); + if(pversion>5) plumed_cmd(plumedmain,"setNumOMPthreads",&nth); + } + } + /* END PLUMED */ + + /* Now do whatever the user wants us to do (how flexible...) */ + Integrator integrator { + fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(), + oenv, + mdrunOptions, + vsite.get(), constr.get(), + enforcedRotation ? 
enforcedRotation->getLegacyEnfrot() : nullptr, + deform.get(), + mdModules->outputProvider(), + inputrec, &mtop, + fcd, + globalState.get(), + &observablesHistory, + mdAtoms.get(), nrnb, wcycle, fr, + &ppForceWorkload, + replExParams, + membed, + walltime_accounting, + std::move(stopHandlerBuilder_) + }; + integrator.run(inputrec->eI, doRerun); + + if (inputrec->bPull) + { + finish_pull(inputrec->pull_work); + } + + } + else + { + GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP"); + /* do PME only */ + walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME)); + gmx_pmeonly(pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode); + } + + wallcycle_stop(wcycle, ewcRUN); + + /* Finish up, write some stuff + * if rerunMD, don't write last frame again + */ + finish_run(fplog, mdlog, cr, + inputrec, nrnb, wcycle, walltime_accounting, + fr ? fr->nbv : nullptr, + pmedata, + EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms)); + + // Free PME data + if (pmedata) + { + gmx_pme_destroy(pmedata); + pmedata = nullptr; + } + + // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x, + // before we destroy the GPU context(s) in free_gpu_resources(). + // Pinned buffers are associated with contexts in CUDA. + // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go. + mdAtoms.reset(nullptr); + globalState.reset(nullptr); + mdModules.reset(nullptr); // destruct force providers here as they might also use the GPU + + /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */ + free_gpu_resources(fr, physicalNodeComm); + free_gpu(nonbondedDeviceInfo); + free_gpu(pmeDeviceInfo); + done_forcerec(fr, mtop.molblock.size(), mtop.groups.grps[egcENER].nr); + sfree(fcd); + + if (doMembed) + { + free_membed(membed); + } + + gmx_hardware_info_free(); + + /* Does what it says */ + print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime()); + walltime_accounting_destroy(walltime_accounting); + sfree(nrnb); + + /* PLUMED */ + if(plumedswitch){ + plumed_finalize(plumedmain); + } + /* END PLUMED */ + + // Ensure log file content is written + if (logFileHandle) + { + gmx_fio_flush(logFileHandle); + } + + /* Reset FPEs (important for unit tests) by disabling them. Assumes no + * exceptions were enabled before function was called. */ + if (bEnableFPE) + { + gmx_fedisableexcept(); + } + + rc = static_cast<int>(gmx_get_stop_condition()); + +#if GMX_THREAD_MPI + /* we need to join all threads. The sub-threads join when they + exit this function, but the master thread needs to be told to + wait for that. */ + if (PAR(cr) && MASTER(cr)) + { + done_commrec(cr); + tMPI_Finalize(); + } +#endif + + return rc; +} + +Mdrunner::~Mdrunner() +{ + // Clean up of the Manager. + // This will end up getting called on every thread-MPI rank, which is unnecessary, + // but okay as long as threads synchronize some time before adding or accessing + // a new set of restraints. 
+ if (restraintManager_) + { + restraintManager_->clear(); + GMX_ASSERT(restraintManager_->countRestraints() == 0, + "restraints added during runner lifetime should be cleared at runner destruction."); + } +}; + +void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller, + std::string name) +{ + GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager."); + // Not sure if this should be logged through the md logger or something else, + // but it is helpful to have some sort of INFO level message sent somewhere. + // std::cout << "Registering restraint named " << name << std::endl; + + // When multiple restraints are used, it may be wasteful to register them separately. + // Maybe instead register an entire Restraint Manager as a force provider. + restraintManager_->addToSpec(std::move(puller), + std::move(name)); +} + +Mdrunner::Mdrunner(Mdrunner &&) noexcept = default; + +//NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265 +Mdrunner &Mdrunner::operator=(Mdrunner && /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default; + +class Mdrunner::BuilderImplementation +{ + public: + BuilderImplementation() = delete; + explicit BuilderImplementation(SimulationContext* context); + ~BuilderImplementation(); + + BuilderImplementation &setExtraMdrunOptions(const MdrunOptions &options, + real forceWarningThreshold); + + void addDomdec(const DomdecOptions &options); + + void addVerletList(int nstlist); + + void addReplicaExchange(const ReplicaExchangeParameters &params); + + void addMultiSim(gmx_multisim_t* multisim); + + void addNonBonded(const char* nbpu_opt); + + void addPME(const char* pme_opt_, const char* pme_fft_opt_); + + void addBondedTaskAssignment(const char* bonded_opt); + + void addHardwareOptions(const gmx_hw_opt_t &hardwareOptions); + + void addFilenames(ArrayRef<const t_filenm> filenames); + + void addOutputEnvironment(gmx_output_env_t* outputEnvironment); + + void addLogFile(t_fileio *logFileHandle); + + void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder); + + Mdrunner build(); + + private: + // Default parameters copied from runner.h + // \todo Clarify source(s) of default parameters. + + const char* nbpu_opt_ = nullptr; + const char* pme_opt_ = nullptr; + const char* pme_fft_opt_ = nullptr; + const char *bonded_opt_ = nullptr; + + MdrunOptions mdrunOptions_; + + DomdecOptions domdecOptions_; + + ReplicaExchangeParameters replicaExchangeParameters_; + + //! Command-line override for the duration of a neighbor list with the Verlet scheme. + int nstlist_ = 0; + + //! Non-owning multisim communicator handle. + std::unique_ptr<gmx_multisim_t*> multisim_ = nullptr; + + //! Print a warning if any force is larger than this (in kJ/mol nm). + real forceWarningThreshold_ = -1; + + /*! \brief Non-owning pointer to SimulationContext (owned and managed by client) + * + * \internal + * \todo Establish robust protocol to make sure resources remain valid. + * SimulationContext will likely be separated into multiple layers for + * different levels of access and different phases of execution. Ref + * https://redmine.gromacs.org/issues/2375 + * https://redmine.gromacs.org/issues/2587 + */ + SimulationContext* context_ = nullptr; + + //! \brief Parallelism information. + gmx_hw_opt_t hardwareOptions_; + + //! filename options for simulation. + ArrayRef<const t_filenm> filenames_; + + /*! \brief Handle to output environment. + * + * \todo gmx_output_env_t needs lifetime management. 
+ */ + gmx_output_env_t* outputEnvironment_ = nullptr; + + /*! \brief Non-owning handle to MD log file. + * + * \todo Context should own output facilities for client. + * \todo Improve log file handle management. + * \internal + * Code managing the FILE* relies on the ability to set it to + * nullptr to check whether the filehandle is valid. + */ + t_fileio* logFileHandle_ = nullptr; + + /*! + * \brief Builder for simulation stop signal handler. + */ + std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr; +}; + +Mdrunner::BuilderImplementation::BuilderImplementation(SimulationContext* context) : + context_(context) +{ + GMX_ASSERT(context_, "Bug found. It should not be possible to construct builder without a valid context."); +} + +Mdrunner::BuilderImplementation::~BuilderImplementation() = default; + +Mdrunner::BuilderImplementation & +Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions &options, + real forceWarningThreshold) +{ + mdrunOptions_ = options; + forceWarningThreshold_ = forceWarningThreshold; + return *this; +} + +void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions &options) +{ + domdecOptions_ = options; +} + +void Mdrunner::BuilderImplementation::addVerletList(int nstlist) +{ + nstlist_ = nstlist; +} + +void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters &params) +{ + replicaExchangeParameters_ = params; +} + +void Mdrunner::BuilderImplementation::addMultiSim(gmx_multisim_t* multisim) +{ + multisim_ = compat::make_unique<gmx_multisim_t*>(multisim); +} + +Mdrunner Mdrunner::BuilderImplementation::build() +{ + auto newRunner = Mdrunner(); + + GMX_ASSERT(context_, "Bug found. It should not be possible to call build() without a valid context."); + + newRunner.mdrunOptions = mdrunOptions_; + newRunner.domdecOptions = domdecOptions_; + + // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid + newRunner.hw_opt = hardwareOptions_; + + // No invariant to check. This parameter exists to optionally override other behavior. + newRunner.nstlist_cmdline = nstlist_; + + newRunner.replExParams = replicaExchangeParameters_; + + newRunner.filenames = filenames_; + + GMX_ASSERT(context_->communicationRecord_, "SimulationContext communications not initialized."); + newRunner.cr = context_->communicationRecord_; + + if (multisim_) + { + // nullptr is a valid value for the multisim handle, so we don't check the pointed-to pointer. + newRunner.ms = *multisim_; + } + else + { + GMX_THROW(gmx::APIError("MdrunnerBuilder::addMultiSim() is required before build()")); + } +
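/* [Editor's illustrative sketch — not part of the GROMACS source or the PLUMED
 * patch.] build() enforces that every required add*() call happened, throwing
 * APIError otherwise; the checks that follow all share this shape. The same
 * contract in a stripped-down form (std::runtime_error standing in for
 * gmx::APIError, hypothetical class names):
 */
#include <stdexcept>
#include <string>

struct Runner { std::string nbpu; };

class RunnerBuilder
{
    public:
        RunnerBuilder &addNonBonded(const char *opt) { nbpu_ = opt; return *this; }
        Runner build()
        {
            if (nbpu_ == nullptr)
            {
                // A required parameter was never supplied: fail loudly at build time.
                throw std::runtime_error("addNonBonded() is required before build()");
            }
            return Runner{nbpu_};
        }
    private:
        const char *nbpu_ = nullptr;
};
// Usage: Runner r = RunnerBuilder().addNonBonded("auto").build();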
+ // \todo Clarify ownership and lifetime management for gmx_output_env_t + // \todo Update sanity checking when output environment has clearly specified invariants. + // Initialization and default values for oenv are not well specified in the current version. + if (outputEnvironment_) + { + newRunner.oenv = outputEnvironment_; + } + else + { + GMX_THROW(gmx::APIError("MdrunnerBuilder::addOutputEnvironment() is required before build()")); + } + + newRunner.logFileHandle = logFileHandle_; + + if (nbpu_opt_) + { + newRunner.nbpu_opt = nbpu_opt_; + } + else + { + GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()")); + } + + if (pme_opt_ && pme_fft_opt_) + { + newRunner.pme_opt = pme_opt_; + newRunner.pme_fft_opt = pme_fft_opt_; + } + else + { + GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()")); + } + + if (bonded_opt_) + { + newRunner.bonded_opt = bonded_opt_; + } + else + { + GMX_THROW(gmx::APIError("MdrunnerBuilder::addBondedTaskAssignment() is required before build()")); + } + + newRunner.restraintManager_ = compat::make_unique<gmx::RestraintManager>(); + + if (stopHandlerBuilder_) + { + newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_); + } + else + { + newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>(); + } + + return newRunner; +} + +void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt) +{ + nbpu_opt_ = nbpu_opt; +} + +void Mdrunner::BuilderImplementation::addPME(const char* pme_opt, + const char* pme_fft_opt) +{ + pme_opt_ = pme_opt; + pme_fft_opt_ = pme_fft_opt; +} + +void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt) +{ + bonded_opt_ = bonded_opt; +} + +void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions) +{ + hardwareOptions_ = hardwareOptions; +} + +void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames) +{ + filenames_ = filenames; +} + +void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment) +{ + outputEnvironment_ = outputEnvironment; +} + +void Mdrunner::BuilderImplementation::addLogFile(t_fileio *logFileHandle) +{ + logFileHandle_ = logFileHandle; +} + +void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder) +{ + stopHandlerBuilder_ = std::move(builder); +} + +MdrunnerBuilder::MdrunnerBuilder(compat::not_null<SimulationContext*> context) : + impl_ {gmx::compat::make_unique<Mdrunner::BuilderImplementation>(context)} +{ +} + +MdrunnerBuilder::~MdrunnerBuilder() = default; + +MdrunnerBuilder &MdrunnerBuilder::addSimulationMethod(const MdrunOptions &options, + real forceWarningThreshold) +{ + impl_->setExtraMdrunOptions(options, forceWarningThreshold); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addDomainDecomposition(const DomdecOptions &options) +{ + impl_->addDomdec(options); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addNeighborList(int nstlist) +{ + impl_->addVerletList(nstlist); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters &params) +{ + impl_->addReplicaExchange(params); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addMultiSim(gmx_multisim_t* multisim) +{ + impl_->addMultiSim(multisim); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addNonBonded(const char* nbpu_opt) +{ + impl_->addNonBonded(nbpu_opt); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addElectrostatics(const char* pme_opt, + const char* pme_fft_opt) +{ + // The builder method may become more general in the future, but in this version, + // parameters for PME electrostatics are both required and the only parameters + // available. 
+ if (pme_opt && pme_fft_opt) + { + impl_->addPME(pme_opt, pme_fft_opt); + } + else + { + GMX_THROW(gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers.")); + } + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt) +{ + impl_->addBondedTaskAssignment(bonded_opt); + return *this; +} + +Mdrunner MdrunnerBuilder::build() +{ + return impl_->build(); +} + +MdrunnerBuilder &MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions) +{ + impl_->addHardwareOptions(hardwareOptions); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames) +{ + impl_->addFilenames(filenames); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment) +{ + impl_->addOutputEnvironment(outputEnvironment); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addLogFile(t_fileio *logFileHandle) +{ + impl_->addLogFile(logFileHandle); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder) +{ + impl_->addStopHandlerBuilder(std::move(builder)); + return *this; +} + +MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder &&) noexcept = default; + +MdrunnerBuilder &MdrunnerBuilder::operator=(MdrunnerBuilder &&) noexcept = default; + +} // namespace gmx diff --git a/patches/gromacs-2019.2.diff/src/gromacs/mdrun/runner.cpp.preplumed b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/runner.cpp.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..370cff95f94c657027b79afd16368f194ac41918 --- /dev/null +++ b/patches/gromacs-2019.2.diff/src/gromacs/mdrun/runner.cpp.preplumed @@ -0,0 +1,1941 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. 
+ * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements the MD runner routine calling all integrators. + * + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "runner.h" + +#include "config.h" + +#include <cassert> +#include <cinttypes> +#include <csignal> +#include <cstdlib> +#include <cstring> + +#include <algorithm> + +#include "gromacs/commandline/filenm.h" +#include "gromacs/compat/make_unique.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/domdec/localatomsetmanager.h" +#include "gromacs/ewald/ewald-utils.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/ewald/pme-gpu-program.h" +#include "gromacs/fileio/checkpoint.h" +#include "gromacs/fileio/gmxfio.h" +#include "gromacs/fileio/oenv.h" +#include "gromacs/fileio/tpxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gpu_utils/clfftinitializer.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/cpuinfo.h" +#include "gromacs/hardware/detecthardware.h" +#include "gromacs/hardware/printhardware.h" +#include "gromacs/listed-forces/disre.h" +#include "gromacs/listed-forces/gpubonded.h" +#include "gromacs/listed-forces/orires.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/utilities.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/boxdeformation.h" +#include "gromacs/mdlib/calc_verletbuf.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/makeconstraints.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/membed.h" +#include "gromacs/mdlib/nb_verlet.h" +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" +#include "gromacs/mdlib/nbnxn_search.h" +#include "gromacs/mdlib/nbnxn_tuning.h" +#include "gromacs/mdlib/ppforceworkload.h" +#include "gromacs/mdlib/qmmm.h" +#include "gromacs/mdlib/sighandler.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/stophandler.h" +#include "gromacs/mdrun/legacymdrunoptions.h" +#include "gromacs/mdrun/logging.h" +#include "gromacs/mdrun/multisim.h" +#include "gromacs/mdrun/simulationcontext.h" +#include "gromacs/mdrunutility/mdmodules.h" +#include "gromacs/mdrunutility/threadaffinity.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/fcdata.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/observableshistory.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/pulling/output.h" +#include "gromacs/pulling/pull.h" +#include "gromacs/pulling/pull_rotation.h" +#include "gromacs/restraint/manager.h" +#include "gromacs/restraint/restraintmdmodule.h" +#include "gromacs/restraint/restraintpotential.h" +#include "gromacs/swap/swapcoords.h" +#include "gromacs/taskassignment/decidegpuusage.h" +#include "gromacs/taskassignment/resourcedivision.h" +#include "gromacs/taskassignment/taskassignment.h" +#include "gromacs/taskassignment/usergpuids.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/trajectory/trajectoryframe.h" +#include "gromacs/utility/basenetwork.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include 
"gromacs/utility/fatalerror.h" +#include "gromacs/utility/filestream.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/gmxmpi.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/loggerbuilder.h" +#include "gromacs/utility/physicalnodecommunicator.h" +#include "gromacs/utility/pleasecite.h" +#include "gromacs/utility/programcontext.h" +#include "gromacs/utility/smalloc.h" +#include "gromacs/utility/stringutil.h" + +#include "integrator.h" +#include "replicaexchange.h" + +#if GMX_FAHCORE +#include "corewrap.h" +#endif + +namespace gmx +{ + +/*! \brief Barrier for safe simultaneous thread access to mdrunner data + * + * Used to ensure that the master thread does not modify mdrunner during copy + * on the spawned threads. */ +static void threadMpiMdrunnerAccessBarrier() +{ +#if GMX_THREAD_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif +} + +Mdrunner Mdrunner::cloneOnSpawnedThread() const +{ + auto newRunner = Mdrunner(); + + // All runners in the same process share a restraint manager resource because it is + // part of the interface to the client code, which is associated only with the + // original thread. Handles to the same resources can be obtained by copy. + { + newRunner.restraintManager_ = compat::make_unique<RestraintManager>(*restraintManager_); + } + + // Copy original cr pointer before master thread can pass the thread barrier + newRunner.cr = reinitialize_commrec_for_this_thread(cr); + + // Copy members of master runner. + // \todo Replace with builder when Simulation context and/or runner phases are better defined. + // Ref https://redmine.gromacs.org/issues/2587 and https://redmine.gromacs.org/issues/2375 + newRunner.hw_opt = hw_opt; + newRunner.filenames = filenames; + + newRunner.oenv = oenv; + newRunner.mdrunOptions = mdrunOptions; + newRunner.domdecOptions = domdecOptions; + newRunner.nbpu_opt = nbpu_opt; + newRunner.pme_opt = pme_opt; + newRunner.pme_fft_opt = pme_fft_opt; + newRunner.bonded_opt = bonded_opt; + newRunner.nstlist_cmdline = nstlist_cmdline; + newRunner.replExParams = replExParams; + newRunner.pforce = pforce; + newRunner.ms = ms; + newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_); + + threadMpiMdrunnerAccessBarrier(); + + GMX_RELEASE_ASSERT(!MASTER(newRunner.cr), "cloneOnSpawnedThread should only be called on spawned threads"); + + return newRunner; +} + +/*! \brief The callback used for running on spawned threads. + * + * Obtains the pointer to the master mdrunner object from the one + * argument permitted to the thread-launch API call, copies it to make + * a new runner for this thread, reinitializes necessary data, and + * proceeds to the simulation. */ +static void mdrunner_start_fn(const void *arg) +{ + try + { + auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner *>(arg); + /* copy the arg list to make sure that it's thread-local. This + doesn't copy pointed-to items, of course; fnm, cr and fplog + are reset in the call below, all others should be const. */ + gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread(); + mdrunner.mdrunner(); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; +} + + +/*! \brief Start thread-MPI threads. + * + * Called by mdrunner() to start a specific number of threads + * (including the main thread) for thread-parallel runs. This in turn + * calls mdrunner() for each thread. All options are the same as for + * mdrunner(). 
*/ +t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) const +{ + + /* first check whether we even need to start tMPI */ + if (numThreadsToLaunch < 2) + { + return cr; + } + +#if GMX_THREAD_MPI + /* now spawn new threads that start mdrunner_start_fn(), while + the main thread returns, we set thread affinity later */ + if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, + mdrunner_start_fn, static_cast<const void*>(this)) != TMPI_SUCCESS) + { + GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads")); + } + + threadMpiMdrunnerAccessBarrier(); +#else + GMX_UNUSED_VALUE(mdrunner_start_fn); +#endif + + return reinitialize_commrec_for_this_thread(cr); +} + +} // namespace gmx + +/*! \brief Initialize variables for Verlet scheme simulation */ +static void prepare_verlet_scheme(FILE *fplog, + t_commrec *cr, + t_inputrec *ir, + int nstlist_cmdline, + const gmx_mtop_t *mtop, + const matrix box, + bool makeGpuPairList, + const gmx::CpuInfo &cpuinfo) +{ + /* For NVE simulations, we will retain the initial list buffer */ + if (EI_DYNAMICS(ir->eI) && + ir->verletbuf_tol > 0 && + !(EI_MD(ir->eI) && ir->etc == etcNO)) + { + /* Update the Verlet buffer size for the current run setup */ + + /* Here we assume SIMD-enabled kernels are being used. But as currently + * calc_verlet_buffer_size gives the same results for 4x8 and 4x4 + * and 4x2 gives a larger buffer than 4x4, this is ok. + */ + ListSetupType listType = (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported); + VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType); + + real rlist_new; + calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &listSetup, nullptr, &rlist_new); + + if (rlist_new != ir->rlist) + { + if (fplog != nullptr) + { + fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n", + ir->rlist, rlist_new, + listSetup.cluster_size_i, listSetup.cluster_size_j); + } + ir->rlist = rlist_new; + } + } + + if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0)) + { + gmx_fatal(FARGS, "Can not set nstlist without %s", + !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance"); + } + + if (EI_DYNAMICS(ir->eI)) + { + /* Set or try nstlist values */ + increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo); + } +} + +/*! \brief Override the nslist value in inputrec + * + * with value passed on the command line (if any) + */ +static void override_nsteps_cmdline(const gmx::MDLogger &mdlog, + int64_t nsteps_cmdline, + t_inputrec *ir) +{ + assert(ir); + + /* override with anything else than the default -2 */ + if (nsteps_cmdline > -2) + { + char sbuf_steps[STEPSTRSIZE]; + char sbuf_msg[STRLEN]; + + ir->nsteps = nsteps_cmdline; + if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1) + { + sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps", + gmx_step_str(nsteps_cmdline, sbuf_steps), + fabs(nsteps_cmdline*ir->delta_t)); + } + else + { + sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps", + gmx_step_str(nsteps_cmdline, sbuf_steps)); + } + + GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg); + } + else if (nsteps_cmdline < -2) + { + gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64, + nsteps_cmdline); + } + /* Do nothing if nsteps_cmdline == -2 */ +} + +namespace gmx +{ + +/*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings. 
+ * + * If not, and if a warning may be issued, logs a warning about + * falling back to CPU code. With thread-MPI, only the first + * call to this function should have \c issueWarning true. */ +static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger &mdlog, + const t_inputrec *ir, + bool issueWarning) +{ + if (ir->opts.ngener - ir->nwall > 1) + { + /* The GPU code does not support more than one energy group. + * If the user requested GPUs explicitly, a fatal error is given later. + */ + if (issueWarning) + { + GMX_LOG(mdlog.warning).asParagraph() + .appendText("Multiple energy groups is not implemented for GPUs, falling back to the CPU. " + "For better performance, run on the GPU without energy groups and then do " + "gmx mdrun -rerun option on the trajectory with an energy group .tpr file."); + } + return false; + } + return true; +} + +//! Initializes the logger for mdrun. +static gmx::LoggerOwner buildLogger(FILE *fplog, const t_commrec *cr) +{ + gmx::LoggerBuilder builder; + if (fplog != nullptr) + { + builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog); + } + if (cr == nullptr || SIMMASTER(cr)) + { + builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, + &gmx::TextOutputFile::standardError()); + } + return builder.build(); +} + +//! Make a TaskTarget from an mdrun argument string. +static TaskTarget findTaskTarget(const char *optionString) +{ + TaskTarget returnValue = TaskTarget::Auto; + + if (strncmp(optionString, "auto", 3) == 0) + { + returnValue = TaskTarget::Auto; + } + else if (strncmp(optionString, "cpu", 3) == 0) + { + returnValue = TaskTarget::Cpu; + } + else if (strncmp(optionString, "gpu", 3) == 0) + { + returnValue = TaskTarget::Gpu; + } + else + { + GMX_ASSERT(false, "Option string should have been checked for sanity already"); + } + + return returnValue; +} + +int Mdrunner::mdrunner() +{ + matrix box; + t_nrnb *nrnb; + t_forcerec *fr = nullptr; + t_fcdata *fcd = nullptr; + real ewaldcoeff_q = 0; + real ewaldcoeff_lj = 0; + int nChargePerturbed = -1, nTypePerturbed = 0; + gmx_wallcycle_t wcycle; + gmx_walltime_accounting_t walltime_accounting = nullptr; + int rc; + int64_t reset_counters; + int nthreads_pme = 1; + gmx_membed_t * membed = nullptr; + gmx_hw_info_t *hwinfo = nullptr; + + /* CAUTION: threads may be started later on in this function, so + cr doesn't reflect the final parallel state right now */ + std::unique_ptr<gmx::MDModules> mdModules(new gmx::MDModules); + t_inputrec inputrecInstance; + t_inputrec *inputrec = &inputrecInstance; + gmx_mtop_t mtop; + + bool doMembed = opt2bSet("-membed", filenames.size(), filenames.data()); + bool doRerun = mdrunOptions.rerun; + + // Handle task-assignment related user options. + EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ? + EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No); + std::vector<int> gpuIdsAvailable; + try + { + gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable); + // TODO We could put the GPU IDs into a std::map to find + // duplicates, but for the small numbers of IDs involved, this + // code is simple and fast. 
+ for (size_t i = 0; i != gpuIdsAvailable.size(); ++i) + { + for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j) + { + if (gpuIdsAvailable[i] == gpuIdsAvailable[j]) + { + GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str()))); + } + } + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + std::vector<int> userGpuTaskAssignment; + try + { + userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + auto nonbondedTarget = findTaskTarget(nbpu_opt); + auto pmeTarget = findTaskTarget(pme_opt); + auto pmeFftTarget = findTaskTarget(pme_fft_opt); + auto bondedTarget = findTaskTarget(bonded_opt); + PmeRunMode pmeRunMode = PmeRunMode::None; + + // Here we assume that SIMMASTER(cr) does not change even after the + // threads are started. + + FILE *fplog = nullptr; + // If we are appending, we don't write log output because we need + // to check that the old log file matches what the checkpoint file + // expects. Otherwise, we should start to write log output now if + // there is a file ready for it. + if (logFileHandle != nullptr && !mdrunOptions.continuationOptions.appendFiles) + { + fplog = gmx_fio_getfp(logFileHandle); + } + gmx::LoggerOwner logOwner(buildLogger(fplog, cr)); + gmx::MDLogger mdlog(logOwner.logger()); + + // TODO The thread-MPI master rank makes a working + // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks + // after the threads have been launched. This works because no use + // is made of that communicator until after the execution paths + // have rejoined. But it is likely that we can improve the way + // this is expressed, e.g. by expressly running detection only the + // master rank for thread-MPI, rather than relying on the mutex + // and reference count. + PhysicalNodeCommunicator physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash()); + hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm); + + gmx_print_detected_hardware(fplog, cr, ms, mdlog, hwinfo); + + std::vector<int> gpuIdsToUse; + auto compatibleGpus = getCompatibleGpus(hwinfo->gpu_info); + if (gpuIdsAvailable.empty()) + { + gpuIdsToUse = compatibleGpus; + } + else + { + for (const auto &availableGpuId : gpuIdsAvailable) + { + bool availableGpuIsCompatible = false; + for (const auto &compatibleGpuId : compatibleGpus) + { + if (availableGpuId == compatibleGpuId) + { + availableGpuIsCompatible = true; + break; + } + } + if (!availableGpuIsCompatible) + { + gmx_fatal(FARGS, "You limited the set of compatible GPUs to a set that included ID #%d, but that ID is not for a compatible GPU. 
List only compatible GPUs.", availableGpuId); + } + gpuIdsToUse.push_back(availableGpuId); + } + } + + if (fplog != nullptr) + { + /* Print references after all software/hardware printing */ + please_cite(fplog, "Abraham2015"); + please_cite(fplog, "Pall2015"); + please_cite(fplog, "Pronk2013"); + please_cite(fplog, "Hess2008b"); + please_cite(fplog, "Spoel2005a"); + please_cite(fplog, "Lindahl2001a"); + please_cite(fplog, "Berendsen95a"); + writeSourceDoi(fplog); + } + + std::unique_ptr<t_state> globalState; + + if (SIMMASTER(cr)) + { + /* Only the master rank has the global state */ + globalState = compat::make_unique<t_state>(); + + /* Read (nearly) all data required for the simulation */ + read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()), inputrec, globalState.get(), &mtop); + + /* In rerun, set velocities to zero if present */ + if (doRerun && ((globalState->flags & (1 << estV)) != 0)) + { + // rerun does not use velocities + GMX_LOG(mdlog.info).asParagraph().appendText( + "Rerun trajectory contains velocities. Rerun does only evaluate " + "potential energy and forces. The velocities will be ignored."); + for (int i = 0; i < globalState->natoms; i++) + { + clear_rvec(globalState->v[i]); + } + globalState->flags &= ~(1 << estV); + } + + if (inputrec->cutoff_scheme != ecutsVERLET) + { + if (nstlist_cmdline > 0) + { + gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme"); + } + + if (!compatibleGpus.empty()) + { + GMX_LOG(mdlog.warning).asParagraph().appendText( + "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" + " To use a GPU, set the mdp option: cutoff-scheme = Verlet"); + } + } + } + + /* Check and update the hardware options for internal consistency */ + check_and_update_hw_opt_1(mdlog, &hw_opt, cr, domdecOptions.numPmeRanks); + + /* Early check for externally set process affinity. */ + gmx_check_thread_affinity_set(mdlog, cr, + &hw_opt, hwinfo->nthreads_hw_avail, FALSE); + + if (GMX_THREAD_MPI && SIMMASTER(cr)) + { + if (domdecOptions.numPmeRanks > 0 && hw_opt.nthreads_tmpi <= 0) + { + gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks"); + } + + /* Since the master knows the cut-off scheme, update hw_opt for this. + * This is done later for normal MPI and also once more with tMPI + * for all tMPI ranks. + */ + check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme); + + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + try + { + // If the user specified the number of ranks, then we must + // respect that, but in default mode, we need to allow for + // the number of GPUs to choose the number of ranks. + auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr); + useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi + (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded, + canUseGpuForNonbonded, + inputrec->cutoff_scheme == ecutsVERLET, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, GMX_THREAD_MPI), + hw_opt.nthreads_tmpi); + useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi + (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, + *hwinfo, *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks); + + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + /* Determine how many thread-MPI ranks to start. + * + * TODO Over-writing the user-supplied value here does + * prevent any possible subsequent checks from working + * correctly. 
*/ + hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo, + &hw_opt, + gpuIdsToUse, + useGpuForNonbonded, + useGpuForPme, + inputrec, &mtop, + mdlog, + doMembed); + + // Now start the threads for thread MPI. + cr = spawnThreads(hw_opt.nthreads_tmpi); + /* The main thread continues here with a new cr. We don't deallocate + the old cr because other threads may still be reading it. */ + // TODO Both master and spawned threads call dup_tfn and + // reinitialize_commrec_for_this_thread. Find a way to express + // this better. + physicalNodeComm = PhysicalNodeCommunicator(MPI_COMM_WORLD, gmx_physicalnode_id_hash()); + } + // END OF CAUTION: cr and physicalNodeComm are now reliable + + if (PAR(cr)) + { + /* now broadcast everything to the non-master nodes/threads: */ + init_parallel(cr, inputrec, &mtop); + } + + // Now each rank knows the inputrec that SIMMASTER read and used, + // and (if applicable) cr->nnodes has been assigned the number of + // thread-MPI ranks that have been chosen. The ranks can now all + // run the task-deciding functions and will agree on the result + // without needing to communicate. + // + // TODO Should we do the communication in debug mode to support + // having an assertion? + // + // Note that these variables describe only their own node. + // + // Note that when bonded interactions run on a GPU they always run + // alongside a nonbonded task, so do not influence task assignment + // even though they affect the force calculation workload. + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + bool useGpuForBonded = false; + try + { + // It's possible that there are different numbers of GPUs on + // different nodes, which is the user's responsibilty to + // handle. If unsuitable, we will notice that during task + // assignment. + bool gpusWereDetected = hwinfo->ngpu_compatible_tot > 0; + bool usingVerletScheme = inputrec->cutoff_scheme == ecutsVERLET; + auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr); + useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment, + emulateGpuNonbonded, + canUseGpuForNonbonded, + usingVerletScheme, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, !GMX_THREAD_MPI), + gpusWereDetected); + useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, + *hwinfo, *inputrec, mtop, + cr->nnodes, domdecOptions.numPmeRanks, + gpusWereDetected); + auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr); + useGpuForBonded = + decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme, usingVerletScheme, + bondedTarget, canUseGpuForBonded, + EVDW_PME(inputrec->vdwtype), + EEL_PME_EWALD(inputrec->coulombtype), + domdecOptions.numPmeRanks, gpusWereDetected); + + pmeRunMode = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU); + if (pmeRunMode == PmeRunMode::GPU) + { + if (pmeFftTarget == TaskTarget::Cpu) + { + pmeRunMode = PmeRunMode::Mixed; + } + } + else if (pmeFftTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME on CPU you should not be using -pmefft."); + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + // Build restraints. + // TODO: hide restraint implementation details from Mdrunner. + // There is nothing unique about restraints at this point as far as the + // Mdrunner is concerned. 
The Mdrunner should just be getting a sequence of
+    // factory functions from the SimulationContext on which to call mdModules->add().
+    // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
+    for (auto && restraint : restraintManager_->getRestraints())
+    {
+        auto module = RestraintMDModule::create(restraint,
+                                                restraint->sites());
+        mdModules->add(std::move(module));
+    }
+
+    // TODO: Error handling
+    mdModules->assignOptionsToModules(*inputrec->params, nullptr);
+
+    if (fplog != nullptr)
+    {
+        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+        fprintf(fplog, "\n");
+    }
+
+    if (SIMMASTER(cr))
+    {
+        /* now make sure the state is initialized and propagated */
+        set_state_entries(globalState.get(), inputrec);
+    }
+
+    /* NM and TPI parallelize over force/energy calculations, not atoms,
+     * so we need to initialize and broadcast the global state.
+     */
+    if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
+    {
+        if (!MASTER(cr))
+        {
+            globalState = compat::make_unique<t_state>();
+        }
+        broadcastStateWithoutDynamics(cr, globalState.get());
+    }
+
+    /* A parallel command line option consistency check that we can
+       only do after any threads have started. */
+    if (!PAR(cr) && (domdecOptions.numCells[XX] > 1 ||
+                     domdecOptions.numCells[YY] > 1 ||
+                     domdecOptions.numCells[ZZ] > 1 ||
+                     domdecOptions.numPmeRanks > 0))
+    {
+        gmx_fatal(FARGS,
+                  "The -dd or -npme options request a parallel simulation, "
+#if !GMX_MPI
+                  "but %s was compiled without threads or MPI enabled", output_env_get_program_display_name(oenv));
+#else
+#if GMX_THREAD_MPI
+                  "but the number of MPI-threads (option -ntmpi) is not set or is 1");
+#else
+                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec", output_env_get_program_display_name(oenv));
+#endif
+#endif
+    }
+
+    if (doRerun &&
+        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+    {
+        gmx_fatal(FARGS, "The .mdp file specified an energy minimization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+    }
+
+    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
+    }
+
+    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
+    {
+        if (domdecOptions.numPmeRanks > 0)
+        {
+            gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
+                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
+        }
+
+        domdecOptions.numPmeRanks = 0;
+    }
+
+    if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
+    {
+        /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
+         * improve performance with many threads per GPU, since our OpenMP
+         * scaling is bad, but it's difficult to automate the setup.
+         */
+        domdecOptions.numPmeRanks = 0;
+    }
+    if (useGpuForPme)
+    {
+        if (domdecOptions.numPmeRanks < 0)
+        {
+            domdecOptions.numPmeRanks = 0;
+            // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
+        }
+        else
+        {
+            GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported");
+        }
+    }
+
+#if GMX_FAHCORE
+    if (MASTER(cr))
+    {
+        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+    }
+#endif
+
+    /* NMR restraints must be initialized before load_checkpoint,
+     * since with time averaging the history is added to t_state.
+     * For proper consistency check we therefore need to extend
+     * t_state here.
+     * So the PME-only nodes (if present) will also initialize
+     * the distance restraints.
+     */
+    snew(fcd, 1);
+
+    /* This needs to be called before read_checkpoint to extend the state */
+    init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0);
+
+    init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires));
+
+    auto deform = prepareBoxDeformation(globalState->box, cr, *inputrec);
+
+    ObservablesHistory observablesHistory = {};
+
+    ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions;
+
+    if (continuationOptions.startedFromCheckpoint)
+    {
+        /* Check if checkpoint file exists before doing continuation.
+         * This way we can use identical input options for the first and subsequent runs...
+         */
+        gmx_bool bReadEkin;
+
+        if (mdrunOptions.numStepsCommandline > -2)
+        {
+            /* Temporarily set the number of steps to unlimited to avoid
+             * triggering the nsteps check in load_checkpoint().
+             * This hack will go away soon when the -nsteps option is removed.
+             */
+            inputrec->nsteps = -1;
+        }
+
+        load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
+                        logFileHandle,
+                        cr, domdecOptions.numCells,
+                        inputrec, globalState.get(),
+                        &bReadEkin, &observablesHistory,
+                        continuationOptions.appendFiles,
+                        continuationOptions.appendFilesOptionSet,
+                        mdrunOptions.reproducible);
+
+        if (bReadEkin)
+        {
+            continuationOptions.haveReadEkin = true;
+        }
+
+        if (continuationOptions.appendFiles && logFileHandle)
+        {
+            // Now we can start normal logging to the truncated log file.
+            fplog    = gmx_fio_getfp(logFileHandle);
+            prepareLogAppending(fplog);
+            logOwner = buildLogger(fplog, cr);
+            mdlog    = logOwner.logger();
+        }
+    }
+
+    if (mdrunOptions.numStepsCommandline > -2)
+    {
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -nsteps functionality is deprecated, and may be removed in a future version. "
+                       "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field.");
+    }
+    /* override nsteps with value set on the command line */
+    override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec);
+
+    if (SIMMASTER(cr))
+    {
+        copy_mat(globalState->box, box);
+    }
+
+    if (PAR(cr))
+    {
+        gmx_bcast(sizeof(box), box, cr);
+    }
+
+    /* Update rlist and nstlist. */
+    if (inputrec->cutoff_scheme == ecutsVERLET)
+    {
+        prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
+                              useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo);
+    }
+
+    LocalAtomSetManager atomSets;
+
+    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
+                     inputrec->eI == eiNM))
+    {
+        cr->dd = init_domain_decomposition(mdlog, cr, domdecOptions, mdrunOptions,
+                                           &mtop, inputrec,
+                                           box, positionsFromStatePointer(globalState.get()),
+                                           &atomSets);
+        // Note that local state still does not exist yet.
+    }
+    else
+    {
+        /* PME, if used, is done on all nodes with 1D decomposition */
+        cr->npmenodes = 0;
+        cr->duty      = (DUTY_PP | DUTY_PME);
+
+        if (inputrec->ePBC == epbcSCREW)
+        {
+            gmx_fatal(FARGS,
+                      "pbc=screw is only implemented with domain decomposition");
+        }
+    }
+
+    if (PAR(cr))
+    {
+        /* After possible communicator splitting in make_dd_communicators,
+         * we can set up the intra/inter node communication.
+         */
+        gmx_setup_nodecomm(fplog, cr);
+    }
+
+#if GMX_MPI
+    if (isMultiSim(ms))
+    {
+        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                "This is simulation %d out of %d running as a composite GROMACS\n"
+                "multi-simulation job.
Setup for this simulation:\n", + ms->sim, ms->nsim); + } + GMX_LOG(mdlog.warning).appendTextFormatted( + "Using %d MPI %s\n", + cr->nnodes, +#if GMX_THREAD_MPI + cr->nnodes == 1 ? "thread" : "threads" +#else + cr->nnodes == 1 ? "process" : "processes" +#endif + ); + fflush(stderr); +#endif + + /* Check and update hw_opt for the cut-off scheme */ + check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme); + + /* Check and update the number of OpenMP threads requested */ + checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_, + pmeRunMode, mtop); + + gmx_omp_nthreads_init(mdlog, cr, + hwinfo->nthreads_hw_avail, + physicalNodeComm.size_, + hw_opt.nthreads_omp, + hw_opt.nthreads_omp_pme, + !thisRankHasDuty(cr, DUTY_PP), + inputrec->cutoff_scheme == ecutsVERLET); + + // Enable FP exception detection for the Verlet scheme, but not in + // Release mode and not for compilers with known buggy FP + // exception support (clang with any optimization) or suspected + // buggy FP exception support (gcc 7.* with optimization). +#if !defined NDEBUG && \ + !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \ + && defined __OPTIMIZE__) + const bool bEnableFPE = inputrec->cutoff_scheme == ecutsVERLET; +#else + const bool bEnableFPE = false; +#endif + //FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run() + if (bEnableFPE) + { + gmx_feenableexcept(); + } + + // Build a data structure that expresses which kinds of non-bonded + // task are handled by this rank. + // + // TODO Later, this might become a loop over all registered modules + // relevant to the mdp inputs, to find those that have such tasks. + // + // TODO This could move before init_domain_decomposition() as part + // of refactoring that separates the responsibility for duty + // assignment from setup for communication between tasks, and + // setup for tasks handled with a domain (ie including short-ranged + // tasks, bonded tasks, etc.). + // + // Note that in general useGpuForNonbonded, etc. can have a value + // that is inconsistent with the presence of actual GPUs on any + // rank, and that is not known to be a problem until the + // duty of the ranks on a node become known. + // + // TODO Later we might need the concept of computeTasksOnThisRank, + // from which we construct gpuTasksOnThisRank. + // + // Currently the DD code assigns duty to ranks that can + // include PP work that currently can be executed on a single + // GPU, if present and compatible. This has to be coordinated + // across PP ranks on a node, with possible multiple devices + // or sharing devices on a node, either from the user + // selection, or automatically. + auto haveGpus = !gpuIdsToUse.empty(); + std::vector<GpuTask> gpuTasksOnThisRank; + if (thisRankHasDuty(cr, DUTY_PP)) + { + if (useGpuForNonbonded) + { + // Note that any bonded tasks on a GPU always accompany a + // non-bonded task. + if (haveGpus) + { + gpuTasksOnThisRank.push_back(GpuTask::Nonbonded); + } + else if (nonbondedTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because there is none detected."); + } + else if (bondedTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run bonded interactions on a GPU because there is none detected."); + } + } + } + // TODO cr->duty & DUTY_PME should imply that a PME algorithm is active, but currently does not. 
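+    // Because the duty flag alone is not yet a reliable indicator (see the
+    // TODO above), the coulomb interaction type from the inputrec is checked
+    // together with the rank duty before a PME GPU task is registered below.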
+    if (EEL_PME(inputrec->coulombtype) && (thisRankHasDuty(cr, DUTY_PME)))
+    {
+        if (useGpuForPme)
+        {
+            if (haveGpus)
+            {
+                gpuTasksOnThisRank.push_back(GpuTask::Pme);
+            }
+            else if (pmeTarget == TaskTarget::Gpu)
+            {
+                gmx_fatal(FARGS, "Cannot run PME on a GPU because there is none detected.");
+            }
+        }
+    }
+
+    GpuTaskAssignment gpuTaskAssignment;
+    try
+    {
+        // Produce the task assignment for this rank.
+        gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
+                                              mdlog, cr, ms, physicalNodeComm, gpuTasksOnThisRank,
+                                              useGpuForBonded, pmeRunMode);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+    /* Prevent other ranks from continuing after an issue was found
+     * and reported as a fatal error.
+     *
+     * TODO This function implements a barrier so that MPI runtimes
+     * can organize an orderly shutdown if one of the ranks has had to
+     * issue a fatal error in various code already run. When we have
+     * MPI-aware error handling and reporting, this should be
+     * improved. */
+#if GMX_MPI
+    if (PAR(cr))
+    {
+        MPI_Barrier(cr->mpi_comm_mysim);
+    }
+    if (isMultiSim(ms))
+    {
+        if (SIMMASTER(cr))
+        {
+            MPI_Barrier(ms->mpi_comm_masters);
+        }
+        /* We need another barrier to prevent non-master ranks from continuing
+         * when an error occurred in a different simulation.
+         */
+        MPI_Barrier(cr->mpi_comm_mysim);
+    }
+#endif
+
+    /* Now that we know the setup is consistent, check for efficiency */
+    check_resource_division_efficiency(hwinfo, !gpuTaskAssignment.empty(), mdrunOptions.ntompOptionIsSet,
+                                       cr, mdlog);
+
+    gmx_device_info_t *nonbondedDeviceInfo = nullptr;
+
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        // This works because only one task of each type is currently permitted.
+        auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
+                                             hasTaskType<GpuTask::Nonbonded>);
+        if (nbGpuTaskMapping != gpuTaskAssignment.end())
+        {
+            int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
+            nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
+            init_gpu(nonbondedDeviceInfo);
+
+            if (DOMAINDECOMP(cr))
+            {
+                /* When we share GPUs over ranks, we need to know this for the DLB */
+                dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+            }
+
+        }
+    }
+
+    std::unique_ptr<ClfftInitializer> initializedClfftLibrary;
+
+    gmx_device_info_t *pmeDeviceInfo = nullptr;
+    // Later, this program could contain kernels that might be
+    // re-used as auto-tuning progresses, or subsequent simulations
+    // are invoked.
+    PmeGpuProgramStorage pmeGpuProgram;
+    // This works because only one task of each type is currently permitted.
+    auto pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
+    const bool thisRankHasPmeGpuTask = (pmeGpuTaskMapping != gpuTaskAssignment.end());
+    if (thisRankHasPmeGpuTask)
+    {
+        pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
+        init_gpu(pmeDeviceInfo);
+        pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
+        // TODO It would be nice to move this logic into the factory
+        // function. See Redmine #2535.
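+        // The ClfftInitializer declared above manages the clFFT library
+        // used by the OpenCL PME kernels; the guards below ensure it is
+        // initialized at most once per process, and only by the master
+        // thread.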
+ bool isMasterThread = !GMX_THREAD_MPI || MASTER(cr); + if (pmeRunMode == PmeRunMode::GPU && !initializedClfftLibrary && isMasterThread) + { + initializedClfftLibrary = initializeClfftLibrary(); + } + } + + /* getting number of PP/PME threads + PME: env variable should be read only on one node to make sure it is + identical everywhere; + */ + nthreads_pme = gmx_omp_nthreads_get(emntPME); + + int numThreadsOnThisRank; + /* threads on this MPI process or TMPI thread */ + if (thisRankHasDuty(cr, DUTY_PP)) + { + numThreadsOnThisRank = gmx_omp_nthreads_get(emntNonbonded); + } + else + { + numThreadsOnThisRank = nthreads_pme; + } + + checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, + *hwinfo->hardwareTopology, + physicalNodeComm, mdlog); + + if (hw_opt.thread_affinity != threadaffOFF) + { + /* Before setting affinity, check whether the affinity has changed + * - which indicates that probably the OpenMP library has changed it + * since we first checked). + */ + gmx_check_thread_affinity_set(mdlog, cr, + &hw_opt, hwinfo->nthreads_hw_avail, TRUE); + + int numThreadsOnThisNode, intraNodeThreadOffset; + analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode, + &intraNodeThreadOffset); + + /* Set the CPU affinity */ + gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology, + numThreadsOnThisRank, numThreadsOnThisNode, + intraNodeThreadOffset, nullptr); + } + + if (mdrunOptions.timingOptions.resetStep > -1) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -resetstep functionality is deprecated, and may be removed in a future version."); + } + wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr); + + if (PAR(cr)) + { + /* Master synchronizes its value of reset_counters with all nodes + * including PME only nodes */ + reset_counters = wcycle_get_reset_counters(wcycle); + gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr); + wcycle_set_reset_counters(wcycle, reset_counters); + } + + // Membrane embedding must be initialized before we call init_forcerec() + if (doMembed) + { + if (MASTER(cr)) + { + fprintf(stderr, "Initializing membed"); + } + /* Note that membed cannot work in parallel because mtop is + * changed here. Fix this if we ever want to make it run with + * multiple ranks. */ + membed = init_membed(fplog, filenames.size(), filenames.data(), &mtop, inputrec, globalState.get(), cr, + &mdrunOptions + .checkpointOptions.period); + } + + std::unique_ptr<MDAtoms> mdAtoms; + std::unique_ptr<gmx_vsite_t> vsite; + + snew(nrnb, 1); + if (thisRankHasDuty(cr, DUTY_PP)) + { + /* Initiate forcerecord */ + fr = mk_forcerec(); + fr->forceProviders = mdModules->initForceProviders(); + init_forcerec(fplog, mdlog, fr, fcd, + inputrec, &mtop, cr, box, + opt2fn("-table", filenames.size(), filenames.data()), + opt2fn("-tablep", filenames.size(), filenames.data()), + opt2fns("-tableb", filenames.size(), filenames.data()), + *hwinfo, nonbondedDeviceInfo, + useGpuForBonded, + FALSE, + pforce); + + /* Initialize the mdAtoms structure. + * mdAtoms is not filled with atom data, + * as this can not be done now with domain decomposition. + */ + mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask); + if (globalState && thisRankHasPmeGpuTask) + { + // The pinning of coordinates in the global state object works, because we only use + // PME on GPU without DD or on a separate PME rank, and because the local state pointer + // points to the global state object without DD. 
+ // FIXME: MD and EM separately set up the local state - this should happen in the same function, + // which should also perform the pinning. + changePinningPolicy(&globalState->x, pme_get_pinning_policy()); + } + + /* Initialize the virtual site communication */ + vsite = initVsite(mtop, cr); + + calc_shifts(box, fr->shift_vec); + + /* With periodic molecules the charge groups should be whole at start up + * and the virtual sites should not be far from their proper positions. + */ + if (!inputrec->bContinuation && MASTER(cr) && + !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols)) + { + /* Make molecules whole at start of run */ + if (fr->ePBC != epbcNONE) + { + do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array()); + } + if (vsite) + { + /* Correct initial vsite positions are required + * for the initial distribution in the domain decomposition + * and for the initial shell prediction. + */ + constructVsitesGlobal(mtop, globalState->x); + } + } + + if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + ewaldcoeff_q = fr->ic->ewaldcoeff_q; + ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; + } + } + else + { + /* This is a PME only node */ + + GMX_ASSERT(globalState == nullptr, "We don't need the state on a PME only rank and expect it to be unitialized"); + + ewaldcoeff_q = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol); + ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj); + } + + gmx_pme_t *sepPmeData = nullptr; + // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks + GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr), "Double-checking that only PME-only ranks have no forcerec"); + gmx_pme_t * &pmedata = fr ? fr->pmedata : sepPmeData; + + /* Initiate PME if necessary, + * either on all nodes or on dedicated PME nodes only. */ + if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)) + { + if (mdAtoms && mdAtoms->mdatoms()) + { + nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed; + if (EVDW_PME(inputrec->vdwtype)) + { + nTypePerturbed = mdAtoms->mdatoms()->nTypePerturbed; + } + } + if (cr->npmenodes > 0) + { + /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/ + gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr); + gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr); + } + + if (thisRankHasDuty(cr, DUTY_PME)) + { + try + { + pmedata = gmx_pme_init(cr, + getNumPmeDomains(cr->dd), + inputrec, + mtop.natoms, nChargePerturbed != 0, nTypePerturbed != 0, + mdrunOptions.reproducible, + ewaldcoeff_q, ewaldcoeff_lj, + nthreads_pme, + pmeRunMode, nullptr, + pmeDeviceInfo, pmeGpuProgram.get(), mdlog); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + } + + + if (EI_DYNAMICS(inputrec->eI)) + { + /* Turn on signal handling on all nodes */ + /* + * (A user signal from the PME nodes (if any) + * is communicated to the PP nodes. 
+ */ + signal_handler_install(); + } + + if (thisRankHasDuty(cr, DUTY_PP)) + { + /* Assumes uniform use of the number of OpenMP threads */ + walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault)); + + if (inputrec->bPull) + { + /* Initialize pull code */ + inputrec->pull_work = + init_pull(fplog, inputrec->pull, inputrec, + &mtop, cr, &atomSets, inputrec->fepvals->init_lambda); + if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage) + { + initPullHistory(inputrec->pull_work, &observablesHistory); + } + if (EI_DYNAMICS(inputrec->eI) && MASTER(cr)) + { + init_pull_output_files(inputrec->pull_work, + filenames.size(), filenames.data(), oenv, + continuationOptions); + } + } + + std::unique_ptr<EnforcedRotation> enforcedRotation; + if (inputrec->bRot) + { + /* Initialize enforced rotation code */ + enforcedRotation = init_rot(fplog, + inputrec, + filenames.size(), + filenames.data(), + cr, + &atomSets, + globalState.get(), + &mtop, + oenv, + mdrunOptions); + } + + if (inputrec->eSwapCoords != eswapNO) + { + /* Initialize ion swapping code */ + init_swapcoords(fplog, inputrec, opt2fn_master("-swap", filenames.size(), filenames.data(), cr), + &mtop, globalState.get(), &observablesHistory, + cr, &atomSets, oenv, mdrunOptions); + } + + /* Let makeConstraints know whether we have essential dynamics constraints. + * TODO: inputrec should tell us whether we use an algorithm, not a file option or the checkpoint + */ + bool doEssentialDynamics = (opt2fn_null("-ei", filenames.size(), filenames.data()) != nullptr + || observablesHistory.edsamHistory); + auto constr = makeConstraints(mtop, *inputrec, doEssentialDynamics, + fplog, *mdAtoms->mdatoms(), + cr, ms, nrnb, wcycle, fr->bMolPBC); + + if (DOMAINDECOMP(cr)) + { + GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP"); + /* This call is not included in init_domain_decomposition mainly + * because fr->cginfo_mb is set later. + */ + dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec, + domdecOptions.checkBondedInteractions, + fr->cginfo_mb); + } + + // TODO This is not the right place to manage the lifetime of + // this data structure, but currently it's the easiest way to + // make it work. Later, it should probably be made/updated + // after the workload for the lifetime of a PP domain is + // understood. + PpForceWorkload ppForceWorkload; + + GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to integrator."); + /* Now do whatever the user wants us to do (how flexible...) */ + Integrator integrator { + fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(), + oenv, + mdrunOptions, + vsite.get(), constr.get(), + enforcedRotation ? 
enforcedRotation->getLegacyEnfrot() : nullptr, + deform.get(), + mdModules->outputProvider(), + inputrec, &mtop, + fcd, + globalState.get(), + &observablesHistory, + mdAtoms.get(), nrnb, wcycle, fr, + &ppForceWorkload, + replExParams, + membed, + walltime_accounting, + std::move(stopHandlerBuilder_) + }; + integrator.run(inputrec->eI, doRerun); + + if (inputrec->bPull) + { + finish_pull(inputrec->pull_work); + } + + } + else + { + GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP"); + /* do PME only */ + walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME)); + gmx_pmeonly(pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode); + } + + wallcycle_stop(wcycle, ewcRUN); + + /* Finish up, write some stuff + * if rerunMD, don't write last frame again + */ + finish_run(fplog, mdlog, cr, + inputrec, nrnb, wcycle, walltime_accounting, + fr ? fr->nbv : nullptr, + pmedata, + EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms)); + + // Free PME data + if (pmedata) + { + gmx_pme_destroy(pmedata); + pmedata = nullptr; + } + + // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x, + // before we destroy the GPU context(s) in free_gpu_resources(). + // Pinned buffers are associated with contexts in CUDA. + // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go. + mdAtoms.reset(nullptr); + globalState.reset(nullptr); + mdModules.reset(nullptr); // destruct force providers here as they might also use the GPU + + /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */ + free_gpu_resources(fr, physicalNodeComm); + free_gpu(nonbondedDeviceInfo); + free_gpu(pmeDeviceInfo); + done_forcerec(fr, mtop.molblock.size(), mtop.groups.grps[egcENER].nr); + sfree(fcd); + + if (doMembed) + { + free_membed(membed); + } + + gmx_hardware_info_free(); + + /* Does what it says */ + print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime()); + walltime_accounting_destroy(walltime_accounting); + sfree(nrnb); + + // Ensure log file content is written + if (logFileHandle) + { + gmx_fio_flush(logFileHandle); + } + + /* Reset FPEs (important for unit tests) by disabling them. Assumes no + * exceptions were enabled before function was called. */ + if (bEnableFPE) + { + gmx_fedisableexcept(); + } + + rc = static_cast<int>(gmx_get_stop_condition()); + +#if GMX_THREAD_MPI + /* we need to join all threads. The sub-threads join when they + exit this function, but the master thread needs to be told to + wait for that. */ + if (PAR(cr) && MASTER(cr)) + { + done_commrec(cr); + tMPI_Finalize(); + } +#endif + + return rc; +} + +Mdrunner::~Mdrunner() +{ + // Clean up of the Manager. + // This will end up getting called on every thread-MPI rank, which is unnecessary, + // but okay as long as threads synchronize some time before adding or accessing + // a new set of restraints. + if (restraintManager_) + { + restraintManager_->clear(); + GMX_ASSERT(restraintManager_->countRestraints() == 0, + "restraints added during runner life time should be cleared at runner destruction."); + } +}; + +void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller, + std::string name) +{ + GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager."); + // Not sure if this should be logged through the md logger or something else, + // but it is helpful to have some sort of INFO level message sent somewhere. 
+ // std::cout << "Registering restraint named " << name << std::endl; + + // When multiple restraints are used, it may be wasteful to register them separately. + // Maybe instead register an entire Restraint Manager as a force provider. + restraintManager_->addToSpec(std::move(puller), + std::move(name)); +} + +Mdrunner::Mdrunner(Mdrunner &&) noexcept = default; + +//NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265 +Mdrunner &Mdrunner::operator=(Mdrunner && /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default; + +class Mdrunner::BuilderImplementation +{ + public: + BuilderImplementation() = delete; + explicit BuilderImplementation(SimulationContext* context); + ~BuilderImplementation(); + + BuilderImplementation &setExtraMdrunOptions(const MdrunOptions &options, + real forceWarningThreshold); + + void addDomdec(const DomdecOptions &options); + + void addVerletList(int nstlist); + + void addReplicaExchange(const ReplicaExchangeParameters ¶ms); + + void addMultiSim(gmx_multisim_t* multisim); + + void addNonBonded(const char* nbpu_opt); + + void addPME(const char* pme_opt_, const char* pme_fft_opt_); + + void addBondedTaskAssignment(const char* bonded_opt); + + void addHardwareOptions(const gmx_hw_opt_t &hardwareOptions); + + void addFilenames(ArrayRef <const t_filenm> filenames); + + void addOutputEnvironment(gmx_output_env_t* outputEnvironment); + + void addLogFile(t_fileio *logFileHandle); + + void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder); + + Mdrunner build(); + + private: + // Default parameters copied from runner.h + // \todo Clarify source(s) of default parameters. + + const char* nbpu_opt_ = nullptr; + const char* pme_opt_ = nullptr; + const char* pme_fft_opt_ = nullptr; + const char *bonded_opt_ = nullptr; + + MdrunOptions mdrunOptions_; + + DomdecOptions domdecOptions_; + + ReplicaExchangeParameters replicaExchangeParameters_; + + //! Command-line override for the duration of a neighbor list with the Verlet scheme. + int nstlist_ = 0; + + //! Non-owning multisim communicator handle. + std::unique_ptr<gmx_multisim_t*> multisim_ = nullptr; + + //! Print a warning if any force is larger than this (in kJ/mol nm). + real forceWarningThreshold_ = -1; + + /*! \brief Non-owning pointer to SimulationContext (owned and managed by client) + * + * \internal + * \todo Establish robust protocol to make sure resources remain valid. + * SimulationContext will likely be separated into multiple layers for + * different levels of access and different phases of execution. Ref + * https://redmine.gromacs.org/issues/2375 + * https://redmine.gromacs.org/issues/2587 + */ + SimulationContext* context_ = nullptr; + + //! \brief Parallelism information. + gmx_hw_opt_t hardwareOptions_; + + //! filename options for simulation. + ArrayRef<const t_filenm> filenames_; + + /*! \brief Handle to output environment. + * + * \todo gmx_output_env_t needs lifetime management. + */ + gmx_output_env_t* outputEnvironment_ = nullptr; + + /*! \brief Non-owning handle to MD log file. + * + * \todo Context should own output facilities for client. + * \todo Improve log file handle management. + * \internal + * Code managing the FILE* relies on the ability to set it to + * nullptr to check whether the filehandle is valid. + */ + t_fileio* logFileHandle_ = nullptr; + + /*! + * \brief Builder for simulation stop signal handler. 
+ */ + std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr; +}; + +Mdrunner::BuilderImplementation::BuilderImplementation(SimulationContext* context) : + context_(context) +{ + GMX_ASSERT(context_, "Bug found. It should not be possible to construct builder without a valid context."); +} + +Mdrunner::BuilderImplementation::~BuilderImplementation() = default; + +Mdrunner::BuilderImplementation & +Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions &options, + real forceWarningThreshold) +{ + mdrunOptions_ = options; + forceWarningThreshold_ = forceWarningThreshold; + return *this; +} + +void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions &options) +{ + domdecOptions_ = options; +} + +void Mdrunner::BuilderImplementation::addVerletList(int nstlist) +{ + nstlist_ = nstlist; +} + +void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters ¶ms) +{ + replicaExchangeParameters_ = params; +} + +void Mdrunner::BuilderImplementation::addMultiSim(gmx_multisim_t* multisim) +{ + multisim_ = compat::make_unique<gmx_multisim_t*>(multisim); +} + +Mdrunner Mdrunner::BuilderImplementation::build() +{ + auto newRunner = Mdrunner(); + + GMX_ASSERT(context_, "Bug found. It should not be possible to call build() without a valid context."); + + newRunner.mdrunOptions = mdrunOptions_; + newRunner.domdecOptions = domdecOptions_; + + // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid + newRunner.hw_opt = hardwareOptions_; + + // No invariant to check. This parameter exists to optionally override other behavior. + newRunner.nstlist_cmdline = nstlist_; + + newRunner.replExParams = replicaExchangeParameters_; + + newRunner.filenames = filenames_; + + GMX_ASSERT(context_->communicationRecord_, "SimulationContext communications not initialized."); + newRunner.cr = context_->communicationRecord_; + + if (multisim_) + { + // nullptr is a valid value for the multisim handle, so we don't check the pointed-to pointer. + newRunner.ms = *multisim_; + } + else + { + GMX_THROW(gmx::APIError("MdrunnerBuilder::addMultiSim() is required before build()")); + } + + // \todo Clarify ownership and lifetime management for gmx_output_env_t + // \todo Update sanity checking when output environment has clearly specified invariants. + // Initialization and default values for oenv are not well specified in the current version. 
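+    // Until such invariants exist, a missing output environment is treated
+    // below as a client usage error rather than being silently defaulted.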
+ if (outputEnvironment_) + { + newRunner.oenv = outputEnvironment_; + } + else + { + GMX_THROW(gmx::APIError("MdrunnerBuilder::addOutputEnvironment() is required before build()")); + } + + newRunner.logFileHandle = logFileHandle_; + + if (nbpu_opt_) + { + newRunner.nbpu_opt = nbpu_opt_; + } + else + { + GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()")); + } + + if (pme_opt_ && pme_fft_opt_) + { + newRunner.pme_opt = pme_opt_; + newRunner.pme_fft_opt = pme_fft_opt_; + } + else + { + GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()")); + } + + if (bonded_opt_) + { + newRunner.bonded_opt = bonded_opt_; + } + else + { + GMX_THROW(gmx::APIError("MdrunnerBuilder::addBondedTaskAssignment() is required before build()")); + } + + newRunner.restraintManager_ = compat::make_unique<gmx::RestraintManager>(); + + if (stopHandlerBuilder_) + { + newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_); + } + else + { + newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>(); + } + + return newRunner; +} + +void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt) +{ + nbpu_opt_ = nbpu_opt; +} + +void Mdrunner::BuilderImplementation::addPME(const char* pme_opt, + const char* pme_fft_opt) +{ + pme_opt_ = pme_opt; + pme_fft_opt_ = pme_fft_opt; +} + +void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt) +{ + bonded_opt_ = bonded_opt; +} + +void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions) +{ + hardwareOptions_ = hardwareOptions; +} + +void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames) +{ + filenames_ = filenames; +} + +void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment) +{ + outputEnvironment_ = outputEnvironment; +} + +void Mdrunner::BuilderImplementation::addLogFile(t_fileio *logFileHandle) +{ + logFileHandle_ = logFileHandle; +} + +void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder) +{ + stopHandlerBuilder_ = std::move(builder); +} + +MdrunnerBuilder::MdrunnerBuilder(compat::not_null<SimulationContext*> context) : + impl_ {gmx::compat::make_unique<Mdrunner::BuilderImplementation>(context)} +{ +} + +MdrunnerBuilder::~MdrunnerBuilder() = default; + +MdrunnerBuilder &MdrunnerBuilder::addSimulationMethod(const MdrunOptions &options, + real forceWarningThreshold) +{ + impl_->setExtraMdrunOptions(options, forceWarningThreshold); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addDomainDecomposition(const DomdecOptions &options) +{ + impl_->addDomdec(options); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addNeighborList(int nstlist) +{ + impl_->addVerletList(nstlist); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters ¶ms) +{ + impl_->addReplicaExchange(params); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addMultiSim(gmx_multisim_t* multisim) +{ + impl_->addMultiSim(multisim); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addNonBonded(const char* nbpu_opt) +{ + impl_->addNonBonded(nbpu_opt); + return *this; +} + +MdrunnerBuilder &MdrunnerBuilder::addElectrostatics(const char* pme_opt, + const char* pme_fft_opt) +{ + // The builder method may become more general in the future, but in this version, + // parameters for PME electrostatics are both required and the only parameters + // available. 
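+    // Rejecting null pointers here, at registration time, lets build()
+    // interpret a still-unset option as "addElectrostatics() was never
+    // called" rather than as an invalid value.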
+    if (pme_opt && pme_fft_opt)
+    {
+        impl_->addPME(pme_opt, pme_fft_opt);
+    }
+    else
+    {
+        GMX_THROW(gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers."));
+    }
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt)
+{
+    impl_->addBondedTaskAssignment(bonded_opt);
+    return *this;
+}
+
+Mdrunner MdrunnerBuilder::build()
+{
+    return impl_->build();
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
+{
+    impl_->addHardwareOptions(hardwareOptions);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames)
+{
+    impl_->addFilenames(filenames);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
+{
+    impl_->addOutputEnvironment(outputEnvironment);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addLogFile(t_fileio *logFileHandle)
+{
+    impl_->addLogFile(logFileHandle);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
+{
+    impl_->addStopHandlerBuilder(std::move(builder));
+    return *this;
+}
+
+MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder &&) noexcept = default;
+
+MdrunnerBuilder &MdrunnerBuilder::operator=(MdrunnerBuilder &&) noexcept = default;
+
+} // namespace gmx
diff --git a/python/README.rst b/python/README.rst
index 30a0244b3a4bcfe30c3405e9004dd6e6b95008f9..0c47c75d2a0aed12ef3d5e406c2233f1e40cd332 100644
--- a/python/README.rst
+++ b/python/README.rst
@@ -1,18 +1,34 @@
 Python wrappers for plumed
 ==========================
 
-Install using the following commands::
+Install using the following command::
 
-    # install dependencies
-    python -m pip install numpy
-    # install plumed
     python -m pip install plumed
 
-WARNING: You will need to also build and install the plumed library (see http://www.plumed.org) and set the environment variable
-`PLUMED_KERNEL` to point to the file `libplumedKernel.so` (or `libplumedKernel.dylib`). Something like this should work::
+WARNING: You will also need to build and install the plumed library (see http://www.plumed.org) and make sure the file `libplumedKernel.so` (or `libplumedKernel.dylib`) is available on your system.
+
+You should then make sure the library is found by setting the environment variable `PLUMED_KERNEL`::
 
     export PLUMED_KERNEL=/path/to/libplumedKernel.so
     python
     >>> import plumed
    >>> p=plumed.Plumed()
 
+If you manage multiple plumed versions on your system using tcl environment modules, this should be taken care of automatically
+by the plumed module.
+
+Alternatively, a pure python solution is::
+
+    >>> import os
+    >>> import plumed
+    >>> os.environ["PLUMED_KERNEL"]="/path/to/libplumedKernel.so"
+    >>> p=plumed.Plumed()
+
+Finally, notice that you can set the path to the plumed library directly when declaring a Plumed object::
+
+    >>> import plumed
+    >>> p=plumed.Plumed(kernel="/path/to/libplumedKernel.so")
+
+This will allow you to mix different plumed versions in the same python script.
+
+CHANGES: See the PLUMED documentation.
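With the plumed.pyx change below, numpy also becomes optional: buffers created with the standard `array` module (typecode `'d'`) can be passed to `cmd()` directly. A minimal sketch, assuming the kernel is discoverable as described above; the complete call sequence (box, masses, charges, forces, virial) is exercised in `python/test/test_array.py` later in this patch::

    import array
    import plumed

    p = plumed.Plumed()   # requires PLUMED_KERNEL to be set
    p.cmd("setNatoms", 2)
    p.cmd("init")
    p.cmd("readInputLine", "d: DISTANCE ATOMS=1,2")
    # Pass a plain double-precision buffer instead of a numpy ndarray
    positions = array.array('d', [0.0, 0.0, 0.0, 1.0, 2.0, 3.0])
    p.cmd("setPositions", positions)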
diff --git a/python/plumed.pyx b/python/plumed.pyx index 1a897863a3871a35949bfcb05a6784714e533e39..e717616360586feea4f87be3a6ca5a67ab7e6a67 100644 --- a/python/plumed.pyx +++ b/python/plumed.pyx @@ -25,8 +25,15 @@ # cimport cplumed # This imports information from pxd file - including contents of this file here causes name clashes -import numpy as np -cimport numpy as np + +from cpython cimport array +import array + +try: + import numpy as np + HAS_NUMPY=True +except ImportError: + HAS_NUMPY=False cdef class Plumed: cdef cplumed.Plumed c_plumed @@ -59,8 +66,7 @@ cdef class Plumed: cdef bytes py_bytes = key.encode() cdef char* ckey = py_bytes cdef char* cval - cdef np.int_t[:] ibuffer - cdef np.float64_t[:] dbuffer + cdef array.array ar if val is None : self.c_plumed.cmd( ckey, NULL ) elif isinstance(val, (int,long) ): @@ -71,13 +77,22 @@ cdef class Plumed: if key=="getBias" : raise ValueError("when using cmd with getBias option value must be a size one ndarray") self.cmd_float(ckey, val) - elif isinstance(val, np.ndarray) : + elif HAS_NUMPY and isinstance(val, np.ndarray) : if( val.dtype=="float64" ): self.cmd_ndarray_real(ckey, val) elif( val.dtype=="int64" ) : self.cmd_ndarray_int(ckey, val) else : raise ValueError("ndarrys should be float64 or int64") + elif isinstance(val, array.array) : + if( (val.typecode=="d" or val.typecode=="f") and val.itemsize==8): + ar = val + self.c_plumed.cmd( ckey, <void*> ar.data.as_voidptr) + elif( (val.typecode=="i" or val.typecode=="I") ) : + ar = val + self.c_plumed.cmd( ckey, <void*> ar.data.as_voidptr) + else : + raise ValueError("ndarrays should be double (size=8) or int") elif isinstance(val, basestring ) : py_bytes = val.encode() cval = py_bytes diff --git a/python/setup.py b/python/setup.py index 3e9df6f9ab0a8ccb5a1504d47ac577d11538c075..45a824491fc1e2e5a3f34a74d8aed1d1998e373e 100644 --- a/python/setup.py +++ b/python/setup.py @@ -76,18 +76,11 @@ def readme(): with open('README.rst') as f: return f.read() -try: - import numpy -except: - print('Error: building ' + plumedname + ' requires numpy. Please install it first with pip install numpy') - sys.exit(-1) - -include_dirs=[numpy.get_include()] try: - include_dirs.append(os.environ["plumed_include_dir"]) -except: - include_dirs.append(".") + include_dirs=[os.environ["plumed_include_dir"]] +except KeyError: + include_dirs=["."] # allow one to force using cython with env var plumed_force_cython=yes USE_CYTHON = False @@ -95,7 +88,7 @@ try: if(os.environ["plumed_force_cython"]=="yes"): print('plumed_force_cython=yes') USE_CYTHON = True -except: +except KeyError: pass # if plumed.cpp is available, do not need cython @@ -110,7 +103,7 @@ if USE_CYTHON: print('importing cython') from Cython.Build import cythonize extension="pyx" - except: + except ImportError: print('Error: building ' + plumedname + ' requires cython. 
Please install it first with pip install cython')
     sys.exit(-1)
 else:
diff --git a/python/test/test_array.py b/python/test/test_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..39db39bd0dc4097242039026aafdebecb7c53d70
--- /dev/null
+++ b/python/test/test_array.py
@@ -0,0 +1,47 @@
+# simple test using raw arrays instead of numpy
+import plumed
+import os
+import array
+
+def test():
+
+    p = plumed.Plumed()
+    p.cmd("setNatoms",2)
+    p.cmd("setLogFile","test.log")
+    p.cmd("init")
+    p.cmd("readInputLine","d: DISTANCE ATOMS=1,2")
+    p.cmd("readInputLine","RESTRAINT ARG=d AT=0 KAPPA=1")
+
+    box=array.array('d',[10,0,0,0,10,0,0,0,10])
+    virial=array.array('d',[0,0,0,0,0,0,0,0,0])
+    masses=array.array('d',[1,1])
+    charges=array.array('d',[0,0])
+    forces=array.array('d',[0,0,0,0,0,0])
+    positions=array.array('d',[0,0,0,1,2,3])
+
+    p.cmd("setStep",0)
+    p.cmd("setBox",box )
+    p.cmd("setMasses", masses )
+    p.cmd("setCharges", charges )
+    p.cmd("setPositions", positions )
+    p.cmd("setForces", forces )
+    p.cmd("setVirial", virial )
+    p.cmd("calc")
+
+    bias=array.array('d',[0])
+
+    p.cmd("getBias", bias )
+
+    assert (bias[0] - 7.0)**2<1e-8
+    assert (forces[0] - 1.0)**2<1e-8
+    assert (forces[1] - 2.0)**2<1e-8
+    assert (forces[2] - 3.0)**2<1e-8
+    assert (forces[3] + 1.0)**2<1e-8
+    assert (forces[4] + 2.0)**2<1e-8
+    assert (forces[5] + 3.0)**2<1e-8
+
+
+
+if __name__ == "__main__":
+    test()
+
diff --git a/regtest/basic/rt-close-structure/plumed.dat b/regtest/basic/rt-close-structure/plumed.dat
index 381ce05df6b631030bdd0e8ba9ad5c7d5154cc11..eaa25efa87d4cbb1d56b5e64b5622b6d759b9d5e 100644
--- a/regtest/basic/rt-close-structure/plumed.dat
+++ b/regtest/basic/rt-close-structure/plumed.dat
@@ -1,4 +1,5 @@
-p2: PROPERTYMAP REFERENCE=allv.pdb PROPERTY=X,Y LAMBDA=69087 EPSILON=0.001 LOG-CLOSE=1 DEBUG-CLOSE=1
+# LOG_CLOSE and DEBUG_CLOSE are synonyms of LOG-CLOSE and DEBUG-CLOSE
+p2: PROPERTYMAP REFERENCE=allv.pdb PROPERTY=X,Y LAMBDA=69087 EPSILON=0.001 LOG_CLOSE=1 DEBUG_CLOSE=1
 p3: PROPERTYMAP REFERENCE=allv.pdb PROPERTY=X,Y LAMBDA=69087 NEIGH_SIZE=8 NEIGH_STRIDE=5 EPSILON=0.001 LOG-CLOSE=1 DEBUG-CLOSE=1
 PRINT ARG=p2.X,p2.Y,p2.zzz,p3.X,p3.Y,p3.zzz STRIDE=1 FILE=colvar FMT=%8.2f
 DUMPDERIVATIVES ARG=p2.X,p2.Y,p2.zzz,p3.X,p3.Y,p3.zzz STRIDE=1 FILE=deriv FMT=%8.2f
diff --git a/regtest/basic/rt-multirmsd/plumed.dat b/regtest/basic/rt-multirmsd/plumed.dat
index 57af8f2251587bba3c9db701b6b885683c54fbd2..ecd2962c4cdfadfb5b748f8eaa2cfa66bcf48b46 100644
--- a/regtest/basic/rt-multirmsd/plumed.dat
+++ b/regtest/basic/rt-multirmsd/plumed.dat
@@ -1,3 +1,4 @@
+# MULTI_RMSD is a synonym of MULTI-RMSD
 rmsd0: MULTI-RMSD TYPE=MULTI-OPTIMAL NOPBC REFERENCE=test0.pdb
 rmsds1: MULTI-RMSD TYPE=MULTI-SIMPLE NOPBC REFERENCE=test1.pdb
 drmsd1: MULTI-RMSD TYPE=MULTI-DRMSD REFERENCE=test2.pdb
diff --git a/regtest/basic/rt66b/COLVAR.reference b/regtest/basic/rt66b/COLVAR.reference
index 8321445a4d2c8076a65c9ce24759d0ce6fb69457..f22e84821df25ecec278d16f5b9c5ca998e9044e 100644
--- a/regtest/basic/rt66b/COLVAR.reference
+++ b/regtest/basic/rt66b/COLVAR.reference
@@ -1,4 +1,4 @@
-#! FIELDS time t2 t3 t4 t4d t5 t6 t7 t8 t9 t10 t12 t13 t14
+#! FIELDS time t2 t3 t4 t4d t5 t6 t7 t8 t9 t10 t12 t13 t13b t13c t13d t13e t14
 #! SET min_t2 -pi
 #! SET max_t2 pi
 #! SET min_t3 -pi
@@ -23,6 +23,14 @@
 #! SET max_t12 pi
 #! SET min_t13 -pi
 #! SET max_t13 pi
+#! SET min_t13b -pi
+#! SET max_t13b pi
+#! SET min_t13c -pi
+#! SET max_t13c pi
+#! SET min_t13d -pi
+#! SET max_t13d pi
+#! SET min_t13e -pi
+#! SET max_t13e pi
 #! SET min_t14 -pi
 #!
SET max_t14 pi - 0.000000 -1.140 3.000 1.144 1.436 -2.773 -2.785 -1.264 0.044 -0.426 0.621 0.361 -0.867 -2.888 + 0.000000 -1.140 3.000 1.144 1.436 -2.773 -2.785 -1.264 0.044 -0.426 0.621 0.361 -0.867 2.257 -0.479 0.621 3.123 -2.888 diff --git a/regtest/basic/rt66b/plumed.dat b/regtest/basic/rt66b/plumed.dat index 8ec8b93ac6564b32aa0728a6c3dce2ed00d810df..685e697137a6a4e1f5acb9bb0604f7b84e825dae 100644 --- a/regtest/basic/rt66b/plumed.dat +++ b/regtest/basic/rt66b/plumed.dat @@ -14,11 +14,15 @@ t10: TORSION ATOMS=@v2-B_3 t11: TORSION ATOMS=@v3-B_3 t12: TORSION ATOMS=@v4-B_3 t13: TORSION ATOMS=@phi-2 +t13b: TORSION ATOMS=@psi-2 +t13c: TORSION ATOMS=@chi1-2 +t13d: TORSION ATOMS=@chi2-2 +t13e: TORSION ATOMS=@omega-2 # test chain_residue syntax where chain is a number t14: TORSION ATOMS=@chi-5_8 c: CENTER ATOMS=@back-B4 d: CENTER ATOMS=@base-B4 -PRINT ARG=t2,t3,t4,t4d,t5,t6,t7,t8,t9,t10,t12,t13,t14 FILE=COLVAR FMT=%6.3f +PRINT ARG=t2,t3,t4,t4d,t5,t6,t7,t8,t9,t10,t12,t13,t13b,t13c,t13d,t13e,t14 FILE=COLVAR FMT=%6.3f diff --git a/regtest/isdb/rt-noe-mpi/METAI.0.reference b/regtest/isdb/rt-noe-mpi/METAI.0.reference new file mode 100644 index 0000000000000000000000000000000000000000..4a755579a967360dbd305d5e603f23e25ed5f739 --- /dev/null +++ b/regtest/isdb/rt-noe-mpi/METAI.0.reference @@ -0,0 +1,13 @@ +#! FIELDS time nost.weight b.bias + 0.000000 0.503976 1008.170680 + 0.050000 0.503472 2571.457687 + 0.100000 0.495150 3861.176722 + 0.150000 0.497849 3825.190450 + 0.200000 0.494064 513.396120 + 0.250000 0.504276 2469.083398 + 0.300000 0.491647 1310.994455 + 0.350000 0.504603 1182.076834 + 0.400000 0.510265 136.076666 + 0.450000 0.493639 179.273856 + 0.500000 0.513547 1254.024670 + 0.550000 0.487511 1583.720852 diff --git a/regtest/isdb/rt-noe-mpi/METAI.1.reference b/regtest/isdb/rt-noe-mpi/METAI.1.reference new file mode 100644 index 0000000000000000000000000000000000000000..5a8e856f161687e4ec5865913c6741ffa909f976 --- /dev/null +++ b/regtest/isdb/rt-noe-mpi/METAI.1.reference @@ -0,0 +1,13 @@ +#! FIELDS time nost.weight b.bias + 0.000000 0.496024 1008.170680 + 0.050000 0.496528 2571.457687 + 0.100000 0.504850 3861.176722 + 0.150000 0.502151 3825.190450 + 0.200000 0.505936 513.396120 + 0.250000 0.495724 2469.083398 + 0.300000 0.508353 1310.994455 + 0.350000 0.495397 1182.076834 + 0.400000 0.489735 136.076666 + 0.450000 0.506361 179.273856 + 0.500000 0.486453 1254.024670 + 0.550000 0.512489 1583.720852 diff --git a/regtest/isdb/rt-noe-mpi/plumed.dat b/regtest/isdb/rt-noe-mpi/plumed.dat index 774cd98f2085ef9fcbae1d9a1483f2e37cbc2def..03f030806e08696dbca7c6bb271910b700c92662 100644 --- a/regtest/isdb/rt-noe-mpi/plumed.dat +++ b/regtest/isdb/rt-noe-mpi/plumed.dat @@ -28,3 +28,23 @@ DUMPFORCES ARG=(prova\.noe_.*),(prova2\.noe_.*) STRIDE=1 FILE=forces PRINT ARG=inver1,inver2,test,(prova\.exp_.*),st1.*,st2.* FILE=COLVAR STRIDE=1 + +d1: DISTANCE ATOMS=1,10 +d2: DISTANCE ATOMS=2,20 +cc: COMBINE ARG=d1,d2 PERIODIC=NO + +NOE ... +GROUPA1=1,3 GROUPB1=20,20 NOEDIST1=10 +GROUPA2=3 GROUPB2=17 NOEDIST2=50 +DOSCORE +SIGMA_MEAN0=1 +SIGMA0=0 DSIGMA=0 +TEMP=300 +ARG=cc +REWEIGHT +LABEL=nost +... 
NOE + +b: BIASVALUE ARG=nost.score STRIDE=2 + +PRINT ARG=nost.weight,b.bias STRIDE=2 FILE=METAI diff --git a/regtest/multicolvar/rt-coordination-powers/derivatives.reference b/regtest/multicolvar/rt-coordination-powers/derivatives.reference index 5c4c73cea81fa7347ec6155c6957d0c3f1685312..b11b9df8d68e66c188553105d0846adeb6e75270 100644 --- a/regtest/multicolvar/rt-coordination-powers/derivatives.reference +++ b/regtest/multicolvar/rt-coordination-powers/derivatives.reference @@ -1,196 +1,196 @@ #! FIELDS time parameter c1.mean c1num.mean - 0.000000 0 0.3371 0.3371 - 0.000000 1 0.2309 0.2309 - 0.000000 2 0.1160 0.1160 - 0.000000 3 -0.3510 -0.3510 - 0.000000 4 0.3400 0.3400 - 0.000000 5 0.0247 0.0247 - 0.000000 6 -0.2278 -0.2278 - 0.000000 7 -0.3331 -0.3331 - 0.000000 8 0.1718 0.1718 - 0.000000 9 0.3076 0.3076 - 0.000000 10 -0.3590 -0.3590 - 0.000000 11 0.0586 0.0586 - 0.000000 12 0.3427 0.3427 - 0.000000 13 0.3101 0.3101 - 0.000000 14 -0.0220 -0.0220 - 0.000000 15 -0.3300 -0.3300 - 0.000000 16 0.2127 0.2127 - 0.000000 17 -0.1563 -0.1563 - 0.000000 18 -0.3289 -0.3289 - 0.000000 19 -0.3330 -0.3330 - 0.000000 20 -0.0762 -0.0762 - 0.000000 21 0.2298 0.2298 - 0.000000 22 -0.3172 -0.3171 - 0.000000 23 -0.1589 -0.1589 - 0.000000 24 0.2379 0.2379 - 0.000000 25 0.1246 0.1246 - 0.000000 26 -0.0916 -0.0916 - 0.000000 27 -0.2175 -0.2175 - 0.000000 28 0.1238 0.1238 - 0.000000 29 0.1339 0.1339 - 0.000000 30 1.2686 1.2686 - 0.000000 31 0.0008 0.0008 - 0.000000 32 -0.0777 -0.0777 - 0.000000 33 0.0008 0.0008 - 0.000000 34 1.1233 1.1233 - 0.000000 35 0.0031 0.0031 - 0.000000 36 -0.0777 -0.0777 - 0.000000 37 0.0031 0.0031 - 0.000000 38 2.5446 2.5446 - 0.005000 0 0.3287 0.3287 - 0.005000 1 0.2323 0.2323 - 0.005000 2 0.1100 0.1100 - 0.005000 3 -0.3587 -0.3587 - 0.005000 4 0.3367 0.3367 - 0.005000 5 0.0181 0.0181 - 0.005000 6 -0.2232 -0.2232 - 0.005000 7 -0.3296 -0.3296 - 0.005000 8 0.1699 0.1699 - 0.005000 9 0.2908 0.2908 - 0.005000 10 -0.3585 -0.3585 - 0.005000 11 0.0690 0.0690 - 0.005000 12 0.3444 0.3444 - 0.005000 13 0.2895 0.2895 - 0.005000 14 -0.0237 -0.0237 - 0.005000 15 -0.3163 -0.3163 - 0.005000 16 0.1948 0.1948 - 0.005000 17 -0.1722 -0.1722 - 0.005000 18 -0.3261 -0.3261 - 0.005000 19 -0.3198 -0.3198 - 0.005000 20 -0.0941 -0.0941 - 0.005000 21 0.2273 0.2273 - 0.005000 22 -0.2878 -0.2878 - 0.005000 23 -0.1523 -0.1523 - 0.005000 24 0.2355 0.2355 - 0.005000 25 0.1149 0.1149 - 0.005000 26 -0.0652 -0.0652 - 0.005000 27 -0.2023 -0.2023 - 0.005000 28 0.1276 0.1276 - 0.005000 29 0.1406 0.1406 - 0.005000 30 1.2983 1.2983 - 0.005000 31 0.0074 0.0074 - 0.005000 32 -0.0442 -0.0442 - 0.005000 33 0.0074 0.0074 - 0.005000 34 1.0616 1.0616 - 0.005000 35 0.0049 0.0049 - 0.005000 36 -0.0442 -0.0442 - 0.005000 37 0.0049 0.0049 - 0.005000 38 2.4949 2.4949 - 0.010000 0 0.3057 0.3057 - 0.010000 1 0.2219 0.2219 - 0.010000 2 0.1123 0.1123 - 0.010000 3 -0.3610 -0.3610 - 0.010000 4 0.3207 0.3207 - 0.010000 5 0.0249 0.0249 - 0.010000 6 -0.2431 -0.2431 - 0.010000 7 -0.3227 -0.3227 - 0.010000 8 0.1861 0.1861 - 0.010000 9 0.3110 0.3110 - 0.010000 10 -0.3230 -0.3230 - 0.010000 11 0.0536 0.0536 - 0.010000 12 0.3321 0.3321 - 0.010000 13 0.2460 0.2460 - 0.010000 14 -0.0204 -0.0204 - 0.010000 15 -0.3035 -0.3035 - 0.010000 16 0.1750 0.1750 - 0.010000 17 -0.1895 -0.1895 - 0.010000 18 -0.3215 -0.3215 - 0.010000 19 -0.3011 -0.3011 - 0.010000 20 -0.0991 -0.0991 - 0.010000 21 0.2342 0.2342 - 0.010000 22 -0.2677 -0.2677 - 0.010000 23 -0.1512 -0.1512 - 0.010000 24 0.2325 0.2325 - 0.010000 25 0.1200 0.1200 - 0.010000 26 -0.0458 -0.0458 - 0.010000 
27 -0.1864 -0.1864 - 0.010000 28 0.1309 0.1309 - 0.010000 29 0.1290 0.1290 - 0.010000 30 1.3546 1.3546 - 0.010000 31 -0.0010 -0.0010 - 0.010000 32 -0.0324 -0.0324 - 0.010000 33 -0.0010 -0.0010 - 0.010000 34 0.9469 0.9469 - 0.010000 35 0.0118 0.0118 - 0.010000 36 -0.0324 -0.0324 - 0.010000 37 0.0118 0.0118 - 0.010000 38 2.4485 2.4485 - 0.015000 0 0.2784 0.2784 - 0.015000 1 0.2040 0.2040 - 0.015000 2 0.0909 0.0909 - 0.015000 3 -0.3455 -0.3455 - 0.015000 4 0.2867 0.2867 - 0.015000 5 0.0230 0.0230 - 0.015000 6 -0.2596 -0.2596 - 0.015000 7 -0.3044 -0.3044 - 0.015000 8 0.2036 0.2036 - 0.015000 9 0.3216 0.3216 - 0.015000 10 -0.2752 -0.2752 - 0.015000 11 0.0411 0.0411 - 0.015000 12 0.3142 0.3142 - 0.015000 13 0.1924 0.1924 - 0.015000 14 -0.0163 -0.0163 - 0.015000 15 -0.2939 -0.2939 - 0.015000 16 0.1585 0.1585 - 0.015000 17 -0.2068 -0.2068 - 0.015000 18 -0.3090 -0.3090 - 0.015000 19 -0.2674 -0.2674 - 0.015000 20 -0.0777 -0.0777 - 0.015000 21 0.2317 0.2317 - 0.015000 22 -0.2560 -0.2560 - 0.015000 23 -0.1327 -0.1327 - 0.015000 24 0.2310 0.2310 - 0.015000 25 0.1252 0.1252 - 0.015000 26 -0.0350 -0.0350 - 0.015000 27 -0.1689 -0.1689 - 0.015000 28 0.1363 0.1363 - 0.015000 29 0.1099 0.1099 - 0.015000 30 1.3937 1.3937 - 0.015000 31 -0.0040 -0.0040 - 0.015000 32 -0.0365 -0.0365 - 0.015000 33 -0.0040 -0.0040 - 0.015000 34 0.8269 0.8269 - 0.015000 35 -0.0103 -0.0103 - 0.015000 36 -0.0365 -0.0365 - 0.015000 37 -0.0103 -0.0103 - 0.015000 38 2.3510 2.3510 - 0.020000 0 0.2666 0.2666 - 0.020000 1 0.1869 0.1869 - 0.020000 2 0.0703 0.0703 - 0.020000 3 -0.3255 -0.3255 - 0.020000 4 0.2692 0.2692 - 0.020000 5 0.0102 0.0102 - 0.020000 6 -0.2645 -0.2645 - 0.020000 7 -0.2835 -0.2835 - 0.020000 8 0.2141 0.2141 - 0.020000 9 0.3166 0.3166 - 0.020000 10 -0.2635 -0.2635 - 0.020000 11 0.0653 0.0653 - 0.020000 12 0.3024 0.3024 - 0.020000 13 0.1976 0.1976 - 0.020000 14 -0.0355 -0.0355 - 0.020000 15 -0.2931 -0.2931 - 0.020000 16 0.1405 0.1405 - 0.020000 17 -0.2155 -0.2155 - 0.020000 18 -0.2977 -0.2977 - 0.020000 19 -0.2589 -0.2589 - 0.020000 20 -0.0579 -0.0579 - 0.020000 21 0.2223 0.2223 - 0.020000 22 -0.2595 -0.2595 - 0.020000 23 -0.1251 -0.1251 - 0.020000 24 0.2263 0.2263 - 0.020000 25 0.1348 0.1348 - 0.020000 26 -0.0242 -0.0242 - 0.020000 27 -0.1535 -0.1535 - 0.020000 28 0.1365 0.1365 - 0.020000 29 0.0982 0.0982 - 0.020000 30 1.3942 1.3942 - 0.020000 31 0.0116 0.0116 - 0.020000 32 -0.0319 -0.0319 - 0.020000 33 0.0116 0.0116 - 0.020000 34 0.8066 0.8066 - 0.020000 35 -0.0388 -0.0388 - 0.020000 36 -0.0319 -0.0319 - 0.020000 37 -0.0388 -0.0388 - 0.020000 38 2.2840 2.2840 + 0.000000 0 0.2576 0.2576 + 0.000000 1 0.1784 0.1784 + 0.000000 2 0.1148 0.1148 + 0.000000 3 -0.2848 -0.2848 + 0.000000 4 0.2573 0.2573 + 0.000000 5 0.0311 0.0311 + 0.000000 6 -0.1602 -0.1602 + 0.000000 7 -0.2576 -0.2576 + 0.000000 8 0.1443 0.1443 + 0.000000 9 0.2195 0.2195 + 0.000000 10 -0.2827 -0.2827 + 0.000000 11 0.0801 0.0801 + 0.000000 12 0.2542 0.2542 + 0.000000 13 0.2204 0.2204 + 0.000000 14 -0.0389 -0.0389 + 0.000000 15 -0.2562 -0.2562 + 0.000000 16 0.1441 0.1441 + 0.000000 17 -0.1263 -0.1263 + 0.000000 18 -0.2496 -0.2496 + 0.000000 19 -0.2380 -0.2380 + 0.000000 20 -0.0968 -0.0968 + 0.000000 21 0.1800 0.1800 + 0.000000 22 -0.2248 -0.2248 + 0.000000 23 -0.1502 -0.1502 + 0.000000 24 0.2051 0.2051 + 0.000000 25 0.0951 0.0951 + 0.000000 26 -0.0616 -0.0616 + 0.000000 27 -0.1656 -0.1656 + 0.000000 28 0.1078 0.1078 + 0.000000 29 0.1035 0.1035 + 0.000000 30 0.9771 0.9771 + 0.000000 31 -0.0025 -0.0025 + 0.000000 32 -0.0434 -0.0434 + 0.000000 33 -0.0025 -0.0025 + 
0.000000 34 0.8416 0.8416 + 0.000000 35 0.0170 0.0170 + 0.000000 36 -0.0434 -0.0434 + 0.000000 37 0.0170 0.0170 + 0.000000 38 2.5110 2.5110 + 0.005000 0 0.2556 0.2556 + 0.005000 1 0.1841 0.1841 + 0.005000 2 0.1225 0.1225 + 0.005000 3 -0.3096 -0.3096 + 0.005000 4 0.2536 0.2536 + 0.005000 5 0.0116 0.0116 + 0.005000 6 -0.1518 -0.1518 + 0.005000 7 -0.2632 -0.2632 + 0.005000 8 0.1297 0.1297 + 0.005000 9 0.2044 0.2044 + 0.005000 10 -0.2823 -0.2823 + 0.005000 11 0.0947 0.0947 + 0.005000 12 0.2620 0.2620 + 0.005000 13 0.1952 0.1952 + 0.005000 14 -0.0426 -0.0426 + 0.005000 15 -0.2497 -0.2497 + 0.005000 16 0.1184 0.1184 + 0.005000 17 -0.1320 -0.1320 + 0.005000 18 -0.2566 -0.2566 + 0.005000 19 -0.2169 -0.2169 + 0.005000 20 -0.1139 -0.1139 + 0.005000 21 0.1797 0.1797 + 0.005000 22 -0.1888 -0.1888 + 0.005000 23 -0.1530 -0.1530 + 0.005000 24 0.2132 0.2132 + 0.005000 25 0.0817 0.0817 + 0.005000 26 -0.0227 -0.0227 + 0.005000 27 -0.1472 -0.1472 + 0.005000 28 0.1180 0.1180 + 0.005000 29 0.1056 0.1056 + 0.005000 30 1.0253 1.0253 + 0.005000 31 0.0065 0.0065 + 0.005000 32 0.0105 0.0105 + 0.005000 33 0.0065 0.0065 + 0.005000 34 0.7862 0.7862 + 0.005000 35 0.0302 0.0302 + 0.005000 36 0.0105 0.0105 + 0.005000 37 0.0302 0.0302 + 0.005000 38 2.4682 2.4682 + 0.010000 0 0.2377 0.2377 + 0.010000 1 0.1684 0.1684 + 0.010000 2 0.1429 0.1429 + 0.010000 3 -0.3273 -0.3273 + 0.010000 4 0.2339 0.2339 + 0.010000 5 0.0073 0.0073 + 0.010000 6 -0.1809 -0.1809 + 0.010000 7 -0.2617 -0.2617 + 0.010000 8 0.1499 0.1499 + 0.010000 9 0.2401 0.2401 + 0.010000 10 -0.2325 -0.2325 + 0.010000 11 0.0678 0.0678 + 0.010000 12 0.2520 0.2520 + 0.010000 13 0.1286 0.1286 + 0.010000 14 -0.0378 -0.0378 + 0.010000 15 -0.2463 -0.2463 + 0.010000 16 0.0987 0.0987 + 0.010000 17 -0.1384 -0.1384 + 0.010000 18 -0.2558 -0.2558 + 0.010000 19 -0.1817 -0.1817 + 0.010000 20 -0.1166 -0.1166 + 0.010000 21 0.1873 0.1873 + 0.010000 22 -0.1756 -0.1756 + 0.010000 23 -0.1491 -0.1491 + 0.010000 24 0.2139 0.2139 + 0.010000 25 0.0968 0.0968 + 0.010000 26 -0.0203 -0.0203 + 0.010000 27 -0.1205 -0.1205 + 0.010000 28 0.1250 0.1250 + 0.010000 29 0.0945 0.0945 + 0.010000 30 1.1005 1.1005 + 0.010000 31 -0.0026 -0.0026 + 0.010000 32 0.0195 0.0195 + 0.010000 33 -0.0026 -0.0026 + 0.010000 34 0.6764 0.6764 + 0.010000 35 0.0390 0.0390 + 0.010000 36 0.0195 0.0195 + 0.010000 37 0.0390 0.0390 + 0.010000 38 2.4056 2.4056 + 0.015000 0 0.2201 0.2201 + 0.015000 1 0.1468 0.1468 + 0.015000 2 0.1232 0.1232 + 0.015000 3 -0.3255 -0.3255 + 0.015000 4 0.1931 0.1931 + 0.015000 5 -0.0015 -0.0015 + 0.015000 6 -0.2162 -0.2162 + 0.015000 7 -0.2551 -0.2551 + 0.015000 8 0.1767 0.1767 + 0.015000 9 0.2721 0.2721 + 0.015000 10 -0.1773 -0.1773 + 0.015000 11 0.0444 0.0444 + 0.015000 12 0.2404 0.2404 + 0.015000 13 0.0659 0.0659 + 0.015000 14 -0.0348 -0.0348 + 0.015000 15 -0.2492 -0.2492 + 0.015000 16 0.0911 0.0911 + 0.015000 17 -0.1519 -0.1519 + 0.015000 18 -0.2516 -0.2516 + 0.015000 19 -0.1356 -0.1356 + 0.015000 20 -0.0847 -0.0847 + 0.015000 21 0.1922 0.1922 + 0.015000 22 -0.1733 -0.1733 + 0.015000 23 -0.1152 -0.1152 + 0.015000 24 0.2141 0.2141 + 0.015000 25 0.1112 0.1112 + 0.015000 26 -0.0339 -0.0339 + 0.015000 27 -0.0963 -0.0963 + 0.015000 28 0.1333 0.1333 + 0.015000 29 0.0776 0.0776 + 0.015000 30 1.1799 1.1799 + 0.015000 31 -0.0010 -0.0010 + 0.015000 32 -0.0029 -0.0029 + 0.015000 33 -0.0010 -0.0010 + 0.015000 34 0.5770 0.5770 + 0.015000 35 -0.0037 -0.0037 + 0.015000 36 -0.0029 -0.0029 + 0.015000 37 -0.0037 -0.0037 + 0.015000 38 2.2992 2.2992 + 0.020000 0 0.2204 0.2204 + 0.020000 1 0.1243 0.1243 + 0.020000 2 
0.0996 0.0996 + 0.020000 3 -0.3153 -0.3153 + 0.020000 4 0.1757 0.1757 + 0.020000 5 -0.0146 -0.0146 + 0.020000 6 -0.2366 -0.2366 + 0.020000 7 -0.2451 -0.2451 + 0.020000 8 0.1915 0.1915 + 0.020000 9 0.2832 0.2832 + 0.020000 10 -0.1666 -0.1666 + 0.020000 11 0.0758 0.0758 + 0.020000 12 0.2393 0.2393 + 0.020000 13 0.0789 0.0789 + 0.020000 14 -0.0617 -0.0617 + 0.020000 15 -0.2540 -0.2540 + 0.020000 16 0.0820 0.0820 + 0.020000 17 -0.1645 -0.1645 + 0.020000 18 -0.2546 -0.2546 + 0.020000 19 -0.1308 -0.1308 + 0.020000 20 -0.0623 -0.0623 + 0.020000 21 0.1886 0.1886 + 0.020000 22 -0.1836 -0.1836 + 0.020000 23 -0.1014 -0.1014 + 0.020000 24 0.2077 0.2077 + 0.020000 25 0.1270 0.1270 + 0.020000 26 -0.0371 -0.0371 + 0.020000 27 -0.0787 -0.0787 + 0.020000 28 0.1383 0.1383 + 0.020000 29 0.0747 0.0747 + 0.020000 30 1.2233 1.2233 + 0.020000 31 0.0214 0.0214 + 0.020000 32 -0.0092 -0.0092 + 0.020000 33 0.0214 0.0214 + 0.020000 34 0.5735 0.5735 + 0.020000 35 -0.0503 -0.0503 + 0.020000 36 -0.0092 -0.0092 + 0.020000 37 -0.0503 -0.0503 + 0.020000 38 2.2551 2.2551 diff --git a/regtest/multicolvar/rt-coordination-powers/plumed.dat b/regtest/multicolvar/rt-coordination-powers/plumed.dat index ec3e4821ca5de2b98285b66e9ea0dad56959f3c9..74f3664294456c84bafa9c9fff4c13cba25a89f8 100644 --- a/regtest/multicolvar/rt-coordination-powers/plumed.dat +++ b/regtest/multicolvar/rt-coordination-powers/plumed.dat @@ -1,3 +1,3 @@ -COORDINATIONNUMBER SPECIES=1-10 R_0=1 R_POWER=2 LABEL=c1 MEAN -COORDINATIONNUMBER SPECIES=1-10 R_0=1 NUMERICAL_DERIVATIVES R_POWER=2 LABEL=c1num MEAN +COORDINATIONNUMBER SPECIES=1-10 R_0=1 R_POWER=3 LABEL=c1 MEAN +COORDINATIONNUMBER SPECIES=1-10 R_0=1 NUMERICAL_DERIVATIVES R_POWER=3 LABEL=c1num MEAN DUMPDERIVATIVES ARG=c1.*,c1num.* STRIDE=1 FILE=derivatives FMT=%8.4f diff --git a/src/adjmat/MatrixColumnSums.cpp b/src/adjmat/MatrixColumnSums.cpp index a294cad9c89a2730176ada62c1a11dcfe68be507..4a7d62dc5bc1ce8fb568f1ec8dc0e73b7722ae19 100644 --- a/src/adjmat/MatrixColumnSums.cpp +++ b/src/adjmat/MatrixColumnSums.cpp @@ -76,7 +76,7 @@ PLUMED_REGISTER_ACTION(MatrixColumnSums,"COLUMNSUMS") void MatrixColumnSums::registerKeywords( Keywords& keys ) { ActionWithInputMatrix::registerKeywords( keys ); - keys.use("ALT_MIN"); keys.use("LOWEST"); keys.use("HIGHEST"); keys.use("MEAN"); + keys.use("ALT_MIN"); keys.use("LOWEST"); keys.use("HIGHEST"); keys.use("MEAN"); keys.use("MIN"); keys.use("MAX"); keys.use("LESS_THAN"); keys.use("MORE_THAN"); keys.use("BETWEEN"); keys.use("HISTOGRAM"); keys.use("MOMENTS"); } diff --git a/src/adjmat/MatrixRowSums.cpp b/src/adjmat/MatrixRowSums.cpp index a85f36f2f245d76d3a0c75f9657c025893904b2f..fb60b8b31e25124e906fa25381d41e4412298963 100644 --- a/src/adjmat/MatrixRowSums.cpp +++ b/src/adjmat/MatrixRowSums.cpp @@ -76,7 +76,7 @@ PLUMED_REGISTER_ACTION(MatrixRowSums,"ROWSUMS") void MatrixRowSums::registerKeywords( Keywords& keys ) { ActionWithInputMatrix::registerKeywords( keys ); - keys.use("ALT_MIN"); keys.use("LOWEST"); keys.use("HIGHEST"); keys.use("MEAN"); + keys.use("ALT_MIN"); keys.use("LOWEST"); keys.use("HIGHEST"); keys.use("MEAN"); keys.use("MIN"); keys.use("MAX"); keys.use("LESS_THAN"); keys.use("MORE_THAN"); keys.use("BETWEEN"); keys.use("HISTOGRAM"); keys.use("MOMENTS"); } diff --git a/src/analysis/ReadAnalysisFrames.cpp b/src/analysis/ReadAnalysisFrames.cpp index 46eb1382d6fd96abe3227cdf38f6668ffcbced48..778fd1ebc101f04035d4bd39523a5cbac53b31e2 100644 --- a/src/analysis/ReadAnalysisFrames.cpp +++ b/src/analysis/ReadAnalysisFrames.cpp @@ -55,7 +55,7 @@ 
ReadAnalysisFrames::ReadAnalysisFrames( const ActionOptions& ao ):
   weights_calculated(false)
 {
   parse("CLEAR",clearstride);
-  if( clearstride!=0 ) log.printf("  clearing stored data every %d steps\n");
+  if( clearstride!=0 ) log.printf("  clearing stored data every %u steps\n",clearstride);
   // Get the names of the argumes
   argument_names.resize( getNumberOfArguments() );
   for(unsigned i=0; i<getNumberOfArguments(); ++i) argument_names[i]=getPntrToArgument(i)->getName();
diff --git a/src/bias/MetaD.cpp b/src/bias/MetaD.cpp
index 42e66fbd3f8b741d22b8cec8c8b94e3ffee927d2..493744f377f266147b4a7bece45ab89e0b5b0039 100644
--- a/src/bias/MetaD.cpp
+++ b/src/bias/MetaD.cpp
@@ -1020,7 +1020,7 @@ MetaD::MetaD(const ActionOptions& ao):
     log.printf("  calculation on the fly of the transition bias V*(t)\n");
     addComponent("transbias");
     componentIsNotPeriodic("transbias");
-    log.printf("  Number of transition wells %d\n", transitionwells_.size());
+    log<<"  Number of transition wells "<<transitionwells_.size()<<"\n";
     if (transitionwells_.size() == 0) error("Calculating the transition bias on the fly requires definition of at least one transition well");
     // Check that a grid is in use.
     if (!grid_) error(" transition barrier finding requires a grid for the bias");
diff --git a/src/cltools/Driver.cpp b/src/cltools/Driver.cpp
index 468d2c21da67c459d3930ec5af47001ba9a83b55..be1d69a66c04b75c8c459db66191b141ada57417 100644
--- a/src/cltools/Driver.cpp
+++ b/src/cltools/Driver.cpp
@@ -221,6 +221,7 @@ void Driver<real>::registerKeywords( Keywords& keys ) {
   );
   keys.add("compulsory","--multi","0","set number of replicas for multi environment (needs MPI)");
   keys.addFlag("--noatoms",false,"don't read in a trajectory. Just use colvar files as specified in plumed.dat");
+  keys.addFlag("--parse-only",false,"read the plumed input file and stop");
   keys.add("atoms","--ixyz","the trajectory in xyz format");
   keys.add("atoms","--igro","the trajectory in gro format");
 #ifdef __PLUMED_HAS_XDRFILE
@@ -252,7 +253,6 @@ void Driver<real>::registerKeywords( Keywords& keys ) {
   for(unsigned i=0; i<plugins.size(); i++) {
     string kk="--mf_"+string(plugins[i]->name);
     string mm=" molfile: the trajectory in "+string(plugins[i]->name)+" format " ;
-    //cerr<<"REGISTERING "<<kk<<mm<<endl;
     keys.add("atoms",kk,mm);
   }
 #endif
@@ -285,6 +285,7 @@ int Driver<real>::main(FILE* in,FILE*out,Communicator& pc) {
   }
 // Are we reading trajectory data
   bool noatoms; parseFlag("--noatoms",noatoms);
+  bool parseOnly; parseFlag("--parse-only",parseOnly);

   std::string fakein;
   bool debug_float=false;
@@ -447,7 +448,7 @@
     trajectoryFile=traj_trr;
     trajectory_fmt="xdr-trr";
   }
-  if(trajectoryFile.length()==0) {
+  if(trajectoryFile.length()==0&&!parseOnly) {
     fprintf(stderr,"ERROR: missing trajectory data\n");
     if(grex_log)fclose(grex_log);
     return 1;
@@ -518,11 +519,17 @@

   int natoms;

+  if(parseOnly) {
+    if(command_line_natoms<0) error("--parse-only requires setting the number of atoms with --natoms");
+    natoms=command_line_natoms;
+  }
+
   FILE* fp=NULL; FILE* fp_forces=NULL; OFile fp_dforces;
 #ifdef __PLUMED_HAS_XDRFILE
   XDRFILE* xd=NULL;
 #endif
-  if(!noatoms) {
+  if(!noatoms&&!parseOnly) {
     if (trajectoryFile=="-")
       fp=in;
     else {
@@ -605,16 +612,12 @@
   Random rnd;
   while(true) {
-    if(!noatoms) {
+    if(!noatoms&&!parseOnly) {
       if(use_molfile==true) {
 #ifdef __PLUMED_HAS_MOLFILE_PLUGINS
         int rc;
         rc =
api->read_next_timestep(h_in, natoms, &ts_in); - //if(rc==MOLFILE_SUCCESS){ - // printf(" read this one :success \n"); - //} if(rc==MOLFILE_EOF) { - //printf(" read this one :eof or error \n"); break; } #endif @@ -624,7 +627,7 @@ int Driver<real>::main(FILE* in,FILE*out,Communicator& pc) { } bool first_step=false; - if(!noatoms) { + if(!noatoms&&!parseOnly) { if(use_molfile==false && (trajectory_fmt=="xyz" || trajectory_fmt=="gro")) { if(trajectory_fmt=="gro") if(!Tools::getline(fp,line)) error("premature end of trajectory file"); sscanf(line.c_str(),"%100d",&natoms); @@ -665,6 +668,7 @@ int Driver<real>::main(FILE* in,FILE*out,Communicator& pc) { checknatoms=natoms; p.cmd("setNatoms",&natoms); p.cmd("init"); + if(parseOnly) break; } if(checknatoms!=natoms) { std::string stepstr; Tools::convert(step,stepstr); diff --git a/src/cltools/SimpleMD.cpp b/src/cltools/SimpleMD.cpp index 5978c54eff02c1673426d8c196e640020ba556af..c3e49685e76c214cacfc9207ff925c19b30ed29e 100644 --- a/src/cltools/SimpleMD.cpp +++ b/src/cltools/SimpleMD.cpp @@ -603,6 +603,9 @@ private: } +// call final plumed jobs + plumed->cmd("runFinalJobs"); + // write final positions write_final_positions(outputfile,natoms,positions,cell,wrapatoms); diff --git a/src/colvar/MultiRMSD.cpp b/src/colvar/MultiRMSD.cpp index 9ef3d7e68d724e749f994597f4263d8a07101ae1..8984c85014e0fd0ab81d9d990f1ba2598f92d815 100644 --- a/src/colvar/MultiRMSD.cpp +++ b/src/colvar/MultiRMSD.cpp @@ -149,6 +149,28 @@ END PLUMED_REGISTER_ACTION(MultiRMSD,"MULTI-RMSD") +//+PLUMEDOC DCOLVAR MULTI_RMSD +/* +An alias to the \ref MULTI-RMSD function. + +\par Examples + +Just replace \ref MULTI-RMSD with \ref MULTI_RMSD + +\plumedfile +MULTI_RMSD REFERENCE=file.pdb TYPE=MULTI-DRMSD +\endplumedfile + +*/ +//+ENDPLUMEDOC + +class Multi_RMSD : + public MultiRMSD { +}; + +PLUMED_REGISTER_ACTION(MultiRMSD,"MULTI_RMSD") + + void MultiRMSD::registerKeywords(Keywords& keys) { Colvar::registerKeywords(keys); keys.add("compulsory","REFERENCE","a file in pdb format containing the reference structure and the atoms involved in the CV."); diff --git a/src/colvar/PCARMSD.cpp b/src/colvar/PCARMSD.cpp index 7a7d46453615497dadb2cd25c7eec0bb94ccd81e..13b5e7671956476b7812893e12115656da2c00e0 100644 --- a/src/colvar/PCARMSD.cpp +++ b/src/colvar/PCARMSD.cpp @@ -77,6 +77,7 @@ void PCARMSD::registerKeywords(Keywords& keys) { keys.addOutputComponent("eig","default","the projections on each eigenvalue are stored on values labeled eig-1, eig-2, ..."); keys.addOutputComponent("residual","default","the distance of the present configuration from the configuration supplied as AVERAGE in terms of mean squared displacement after optimal alignment "); keys.addFlag("SQUARED-ROOT",false," This should be set if you want RMSD instead of mean squared displacement "); + keys.addFlag("SQUARED_ROOT",false," Same as SQUARED-ROOT"); } PCARMSD::PCARMSD(const ActionOptions&ao): @@ -91,6 +92,7 @@ PCARMSD::PCARMSD(const ActionOptions&ao): string f_eigenvectors; parse("EIGENVECTORS",f_eigenvectors); bool sq; parseFlag("SQUARED-ROOT",sq); + if(!sq) parseFlag("SQUARED_ROOT",sq); if (sq) { squared=false; } parseFlag("NOPBC",nopbc); checkRead(); diff --git a/src/colvar/PathMSDBase.cpp b/src/colvar/PathMSDBase.cpp index 7aa186c57c41f3c80fc35759600b5f9d1485b2ae..ccd9fb4c8a0953a3397d70b5f5dbabf7ac1ffce1 100644 --- a/src/colvar/PathMSDBase.cpp +++ b/src/colvar/PathMSDBase.cpp @@ -43,6 +43,8 @@ void PathMSDBase::registerKeywords(Keywords& keys) { keys.add("optional", "EPSILON", "(default=-1) the maximum distance between the 
close and the current structure, the positive value turn on the close structure method"); keys.add("optional", "LOG-CLOSE", "(default=0) value 1 enables logging regarding the close structure"); keys.add("optional", "DEBUG-CLOSE", "(default=0) value 1 enables extensive debugging info regarding the close structure, the simulation will run much slower"); + keys.add("optional", "LOG_CLOSE", "same as LOG-CLOSE"); + keys.add("optional", "DEBUG_CLOSE", "same as DEBUG-CLOSE"); } PathMSDBase::PathMSDBase(const ActionOptions&ao): @@ -62,10 +64,11 @@ PathMSDBase::PathMSDBase(const ActionOptions&ao): parse("REFERENCE",reference); parse("EPSILON", epsilonClose); parse("LOG-CLOSE", logClose); + if(!logClose) parse("LOG_CLOSE", logClose); parse("DEBUG-CLOSE", debugClose); + if(!debugClose) parse("DEBUG_CLOSE",debugClose); parseFlag("NOPBC",nopbc); - // open the file FILE* fp=fopen(reference.c_str(),"r"); std::vector<AtomNumber> aaa; @@ -220,7 +223,8 @@ void PathMSDBase::calculate() { vector<Vector> ders; double withoutclose = opt.calculate(getPositions(), ders, true); float difference = fabs(withoutclose-withclose); - log.printf("PLUMED-CLOSE: difference original %lf - with close %lf = %lf, step %d, i %d imgVec[i].index %d \n", withoutclose, withclose, difference, getStep(), i, imgVec[i].index); + log<<"PLUMED-CLOSE: difference original "<<withoutclose; + log<<" - with close "<<withclose<<" = "<<difference<<", step "<<getStep()<<", i "<<i<<" imgVec[i].index "<<imgVec[i].index<<"\n"; } } } diff --git a/src/core/ActionAtomistic.cpp b/src/core/ActionAtomistic.cpp index f3e27a7cbc74dfb29cd2fcb1a503bf51a8f1e810..163fb85f9556bd0ab1e3bf3829eb535298567f65 100644 --- a/src/core/ActionAtomistic.cpp +++ b/src/core/ActionAtomistic.cpp @@ -58,7 +58,7 @@ void ActionAtomistic::registerKeywords( Keywords& keys ) { } -void ActionAtomistic::requestAtoms(const vector<AtomNumber> & a) { +void ActionAtomistic::requestAtoms(const vector<AtomNumber> & a, const bool clearDep) { plumed_massert(!lockRequestAtoms,"requested atom list can only be changed in the prepare() method"); int nat=a.size(); indexes=a; @@ -67,7 +67,7 @@ void ActionAtomistic::requestAtoms(const vector<AtomNumber> & a) { masses.resize(nat); charges.resize(nat); int n=atoms.positions.size(); - clearDependencies(); + if(clearDep) clearDependencies(); unique.clear(); for(unsigned i=0; i<indexes.size(); i++) { if(indexes[i].index()>=n) error("atom out of range"); diff --git a/src/core/ActionAtomistic.h b/src/core/ActionAtomistic.h index 17066cf4e6b94aaef263db34fc8cd2c621c4544c..a7c28928252bfccf1c34c28e9f7e076552b3a9cb 100644 --- a/src/core/ActionAtomistic.h +++ b/src/core/ActionAtomistic.h @@ -80,7 +80,7 @@ public: /// during the simulation, atoms will be available at the next step /// MAYBE WE HAVE TO FIND SOMETHING MORE CLEAR FOR DYNAMIC /// LISTS OF ATOMS - void requestAtoms(const std::vector<AtomNumber> & a); + void requestAtoms(const std::vector<AtomNumber> & a, const bool clearDep=true); /// Get position of i-th atom (access by relative index) const Vector & getPosition(int)const; /// Get position of i-th atom (access by absolute AtomNumber). 
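The `--parse-only` flag added to Driver.cpp above makes it possible to syntax-check an input file without supplying a trajectory. A minimal sketch of scripting that check, assuming a `plumed` executable on the PATH (the helper name and file names are illustrative, not part of the patch)::

    # hypothetical helper: validate a PLUMED input without running a trajectory
    import subprocess

    def plumed_input_is_valid(plumed_dat, natoms):
        # the check added in Driver.cpp requires --natoms together with --parse-only
        result = subprocess.run(
            ["plumed", "driver", "--parse-only", "--natoms", str(natoms),
             "--plumed", plumed_dat],
            capture_output=True, text=True)
        return result.returncode == 0

    print(plumed_input_is_valid("plumed.dat", 2))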
diff --git a/src/core/PlumedMain.cpp b/src/core/PlumedMain.cpp
index d5e15c80da9931ead7be0726ffc5cc936348e815..773ab070ecb88fc0a53b99430b8a3356e4278029 100644
--- a/src/core/PlumedMain.cpp
+++ b/src/core/PlumedMain.cpp
@@ -457,7 +457,7 @@ void PlumedMain::cmd(const std::string & word,void*val) {
 /* ADDED WITH API==6 */
     case cmd_setNumOMPthreads:
       CHECK_NOTNULL(val,word);
-      OpenMP::setNumThreads(*static_cast<unsigned*>(val));
+      OpenMP::setNumThreads(*static_cast<int*>(val)>0?*static_cast<int*>(val):1);
       break;
 /* ADDED WITH API==6 */
 /* only used for testing */
diff --git a/src/crystallization/Q3.cpp b/src/crystallization/Q3.cpp
index 3d40f6173d3e91af36364807be0de9dc2ac87624..0e182fb6d20aff55473fa19f90042e7cdd8856b9 100644
--- a/src/crystallization/Q3.cpp
+++ b/src/crystallization/Q3.cpp
@@ -88,8 +88,7 @@ PRINT ARG=q3.mean FILE=colvar
 //+PLUMEDOC MCOLVARF LOCAL_Q3
 /*
-Calculate the local degree of order around an atoms by taking the average dot product between the \f$q_3\f$ vector on the central atom and the \f$q_3\f$ vector
-on the atoms in the first coordination sphere.
+Calculate the local degree of order around an atom by taking the average dot product between the \f$q_3\f$ vector on the central atom and the \f$q_3\f$ vector on the atoms in the first coordination sphere.

 The \ref Q3 command allows one to calculate one complex vectors for each of the atoms in your system that describe the degree of order in the coordination sphere around a particular atom.  The
 difficulty with these vectors comes when combining the order parameters from all of the individual atoms/molecules so as to get a
diff --git a/src/crystallization/Q4.cpp b/src/crystallization/Q4.cpp
index 8f5213e342af0d8dcd5e7fe5600d8700b6ef2902..79565db2024d46bc61b3aefe86c67dfd48eb6283 100644
--- a/src/crystallization/Q4.cpp
+++ b/src/crystallization/Q4.cpp
@@ -88,8 +88,7 @@ PRINT ARG=q4.mean FILE=colvar
 //+PLUMEDOC MCOLVARF LOCAL_Q4
 /*
-Calculate the local degree of order around an atoms by taking the average dot product between the \f$q_4\f$ vector on the central atom and the \f$q_4\f$ vector
-on the atoms in the first coordination sphere.
+Calculate the local degree of order around an atom by taking the average dot product between the \f$q_4\f$ vector on the central atom and the \f$q_4\f$ vector on the atoms in the first coordination sphere.

 The \ref Q4 command allows one to calculate one complex vectors for each of the atoms in your system that describe the degree of order in the coordination sphere around a particular atom.  The
 difficulty with these vectors comes when combining the order parameters from all of the individual atoms/molecules so as to get a
diff --git a/src/crystallization/Q6.cpp b/src/crystallization/Q6.cpp
index 23882bd4497bfddcca11df84acafe8771509be47..a784607b1162306bdb0837db8c55dbdd29ffeacc 100644
--- a/src/crystallization/Q6.cpp
+++ b/src/crystallization/Q6.cpp
@@ -88,8 +88,7 @@ PRINT ARG=q6.mean FILE=colvar
 //+PLUMEDOC MCOLVARF LOCAL_Q6
 /*
-Calculate the local degree of order around an atoms by taking the average dot product between the \f$q_6\f$ vector on the central atom and the \f$q_6\f$ vector
-on the atoms in the first coordination sphere.
+Calculate the local degree of order around an atom by taking the average dot product between the \f$q_6\f$ vector on the central atom and the \f$q_6\f$ vector on the atoms in the first coordination sphere.
The \ref Q6 command allows one to calculate one complex vectors for each of the atoms in your system that describe the degree of order in the coordination sphere around a particular atom. The difficulty with these vectors comes when combining the order parameters from all of the individual atoms/molecules so as to get a diff --git a/src/crystallization/SimpleCubic.cpp b/src/crystallization/SimpleCubic.cpp index 484c00e7be4704d48a3a1fc76fc0e07362fa101e..ef3253e343e3b87c71db1b401b56eb93f61eb669 100644 --- a/src/crystallization/SimpleCubic.cpp +++ b/src/crystallization/SimpleCubic.cpp @@ -32,8 +32,7 @@ namespace crystallization { //+PLUMEDOC MCOLVAR SIMPLECUBIC /* -Calculate whether or not the coordination spheres of atoms are arranged as they would be in a simple -cubic structure. +Calculate whether or not the coordination spheres of atoms are arranged as they would be in a simple cubic structure. We can measure how similar the environment around atom \f$i\f$ is to a simple cubic structure is by evaluating the following quantity: diff --git a/src/drr/DynamicReferenceRestraining.cpp b/src/drr/DynamicReferenceRestraining.cpp index 3ba641e3669b38e05907708eae20b77194694e44..7d732933cbe4dbb3738292212e83eef5524239b0 100644 --- a/src/drr/DynamicReferenceRestraining.cpp +++ b/src/drr/DynamicReferenceRestraining.cpp @@ -46,8 +46,8 @@ namespace drr { //+PLUMEDOC EABFMOD_BIAS DRR /* Used to performed extended-system adaptive biasing force(eABF) \cite Lelievre2007 method -on one or more collective variables. This method is also -called dynamic reference restraining(DRR) \cite Zheng2012 . + on one or more collective variables. This method is also + called dynamic reference restraining(DRR) \cite Zheng2012 . For each collective variable \f$\xi_i\f$, a fictitious variable \f$\lambda_i\f$ is attached through a spring. The fictitious variable \f$\lambda_i\f$ undergoes diff --git a/src/function/Custom.cpp b/src/function/Custom.cpp index 21cd196b5a068d57a1bbcf7bbacda85099397f0d..03fc9936caec2a4c722b000744a34a07608be6f5 100644 --- a/src/function/Custom.cpp +++ b/src/function/Custom.cpp @@ -171,6 +171,8 @@ class Custom : string func; vector<double> values; vector<char*> names; + vector<double*> lepton_ref; + vector<double*> lepton_ref_deriv; public: explicit Custom(const ActionOptions&); void calculate(); @@ -221,7 +223,9 @@ Custom::Custom(const ActionOptions&ao): Function(ao), expression_deriv(getNumberOfArguments()), values(getNumberOfArguments()), - names(getNumberOfArguments()) + names(getNumberOfArguments()), + lepton_ref(getNumberOfArguments(),nullptr), + lepton_ref_deriv(getNumberOfArguments()*getNumberOfArguments(),nullptr) { parseVector("VAR",var); if(var.size()==0) { @@ -257,27 +261,36 @@ Custom::Custom(const ActionOptions&ao): log<<" "<<pe<<"\n"; expression_deriv[i]=pe.createCompiledExpression(); } -} -void Custom::calculate() { for(unsigned i=0; i<getNumberOfArguments(); i++) { try { - expression.getVariableReference(var[i])=getArgument(i); + lepton_ref[i]=&expression.getVariableReference(var[i]); } catch(const PLMD::lepton::Exception& exc) { // this is necessary since in some cases lepton things a variable is not present even though it is present // e.g. 
func=0*x } } - setValue(expression.evaluate()); for(unsigned i=0; i<getNumberOfArguments(); i++) { for(unsigned j=0; j<getNumberOfArguments(); j++) { try { - expression_deriv[i].getVariableReference(var[j])=getArgument(j); + lepton_ref_deriv[i*getNumberOfArguments()+j]=&expression_deriv[i].getVariableReference(var[j]); } catch(const PLMD::lepton::Exception& exc) { // this is necessary since in some cases lepton things a variable is not present even though it is present // e.g. func=0*x } } + } +} + +void Custom::calculate() { + for(unsigned i=0; i<getNumberOfArguments(); i++) { + if(lepton_ref[i]) *lepton_ref[i]=getArgument(i); + } + setValue(expression.evaluate()); + for(unsigned i=0; i<getNumberOfArguments(); i++) { + for(unsigned j=0; j<getNumberOfArguments(); j++) { + if(lepton_ref_deriv[i*getNumberOfArguments()+j]) *lepton_ref_deriv[i*getNumberOfArguments()+j]=getArgument(j); + } setDerivative(i,expression_deriv[i].evaluate()); } } diff --git a/src/function/FuncSumHills.cpp b/src/function/FuncSumHills.cpp index eefe498a41016d9ef50c35ff7d55f35968a2d786..ce5d2ce4b82e127c36c0c5bbc5e3e6decca8b60b 100644 --- a/src/function/FuncSumHills.cpp +++ b/src/function/FuncSumHills.cpp @@ -40,10 +40,7 @@ namespace function { //+PLUMEDOC FUNCTION FUNCSUMHILLS /* -This function is intended to be called by the command line tool sum_hills -and it is meant to integrate a HILLS file or an HILLS file interpreted as -a histogram i a variety of ways. Therefore it is not expected that you use this -during your dynamics (it will crash!) +This function is intended to be called by the command line tool sum_hills. It is meant to integrate a HILLS file or an HILLS file interpreted as a histogram in a variety of ways. It is, therefore, not expected that you use this during your dynamics (it will crash!) In the future one could implement periodic integration during the metadynamics or straightforward MD as a tool to check convergence diff --git a/src/function/Piecewise.cpp b/src/function/Piecewise.cpp index b5ee68b96a4708aac22044c81693b0db0a4d1e26..3615f47c0efcb69822a945b4bf37bc9c1575d512 100644 --- a/src/function/Piecewise.cpp +++ b/src/function/Piecewise.cpp @@ -31,8 +31,7 @@ namespace function { //+PLUMEDOC FUNCTION PIECEWISE /* -Compute a piece wise straight line through its arguments that passes through -a set of ordered control points. +Compute a piece wise straight line through its arguments that passes through a set of ordered control points. For variables less than the first (greater than the last) point, the value of the first (last) point is used. diff --git a/src/function/Stats.cpp b/src/function/Stats.cpp index 26101a302bd248d02261317cd434568543d1d1d5..0834226f2589fff829ef39addf7ecf043fae427e 100644 --- a/src/function/Stats.cpp +++ b/src/function/Stats.cpp @@ -30,7 +30,8 @@ namespace function { //+PLUMEDOC FUNCTION STATS /* Calculates statistical properties of a set of collective variables with respect to a set of reference values. -In particular it calculates and store as components the sum of the squared deviations, the correlation, the + +In particular it calculates and stores as components the sum of the squared deviations, the correlation, the slope and the intercept of a linear fit. 
The reference values can be either provided as values using PARAMETERS or using value without derivatives diff --git a/src/generic/Debug.cpp b/src/generic/Debug.cpp index 066b2c50c6d269ec58daa9016e317ded99419a00..6edd2ec8deec872200258434ecea7266e2316027 100644 --- a/src/generic/Debug.cpp +++ b/src/generic/Debug.cpp @@ -115,7 +115,7 @@ void Debug::apply() { if(p->isActive()) a++; }; if(a>0) { - ofile.printf("activity at step %i: ",getStep()); + ofile<<"activity at step "<<getStep()<<": "; for(const auto & p : actionSet) { if(dynamic_cast<Debug*>(p.get()))continue; if(p->isActive()) ofile.printf("+"); @@ -125,7 +125,7 @@ void Debug::apply() { }; }; if(logRequestedAtoms) { - ofile.printf("requested atoms at step %i: ",getStep()); + ofile<<"requested atoms at step "<<getStep()<<": "; int* l; int n; plumed.cmd("createFullList",&n); diff --git a/src/generic/Group.cpp b/src/generic/Group.cpp index 69e9d3cd48667835b1e8584326931ba43f204a50..936f6709dc979b12fff699cbdcf312d63716bc18 100644 --- a/src/generic/Group.cpp +++ b/src/generic/Group.cpp @@ -36,8 +36,7 @@ namespace generic { //+PLUMEDOC GENERIC GROUP /* -Define a group of atoms so that a particular list of atoms can be referenced with a single label -in definitions of CVs or virtual atoms. +Define a group of atoms so that a particular list of atoms can be referenced with a single label in definitions of CVs or virtual atoms. Atoms can be listed as comma separated numbers (i.e. `1,2,3,10,45,7,9`) , simple positive ranges (i.e. `20-40`), ranges with a stride either positive or negative (i.e. `20-40:2` or `80-50:-2`) or as diff --git a/src/generic/WholeMolecules.cpp b/src/generic/WholeMolecules.cpp index c224e9a721842225fecb2ae833fc04d1aff877e3..1ead3d79189be79a0662a50f2b5d5c055b5af2a9 100644 --- a/src/generic/WholeMolecules.cpp +++ b/src/generic/WholeMolecules.cpp @@ -40,8 +40,7 @@ namespace generic { //+PLUMEDOC GENERIC WHOLEMOLECULES /* -This action is used to rebuild molecules that can become split by the periodic -boundary conditions. +This action is used to rebuild molecules that can become split by the periodic boundary conditions. It is similar to the ALIGN_ATOMS keyword of plumed1, and is needed since some MD dynamics code (e.g. GROMACS) can break molecules during the calculation. 
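For reference, the four quantities listed in the STATS description above can be spelled out in plain python. The normalization and the direction of the fit below are assumptions made for illustration, not taken from the STATS source::

    # illustrative only: squared deviation, correlation, slope and intercept
    import math

    calc = [1.1, 1.9, 3.2, 4.1]   # instantaneous values of the arguments
    ref  = [1.0, 2.0, 3.0, 4.0]   # reference values (e.g. PARAMETERS)

    n = len(calc)
    sqdev = sum((c - r) ** 2 for c, r in zip(calc, ref))

    mc, mr = sum(calc) / n, sum(ref) / n
    cov   = sum((c - mc) * (r - mr) for c, r in zip(calc, ref)) / n
    var_c = sum((c - mc) ** 2 for c in calc) / n
    var_r = sum((r - mr) ** 2 for r in ref) / n

    corr      = cov / math.sqrt(var_c * var_r)
    slope     = cov / var_r            # fit: calc ~ slope * ref + intercept
    intercept = mc - slope * mr

    print(sqdev, corr, slope, intercept)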
diff --git a/src/gridtools/GridVessel.cpp b/src/gridtools/GridVessel.cpp index 6f0d00cf2bfc675b43c783fa8f859fe47a8c636c..67bfc582b8dab55f246f0b3582e7a1d4d8e31fb6 100644 --- a/src/gridtools/GridVessel.cpp +++ b/src/gridtools/GridVessel.cpp @@ -200,7 +200,11 @@ void GridVessel::getIndices( const std::vector<double>& point, std::vector<unsig for(unsigned i=0; i<dimension; ++i) { indices[i]=std::floor( (point[i] - min[i])/dx[i] ); if( pbc[i] ) indices[i]=indices[i]%nbin[i]; - else if( indices[i]>nbin[i] ) plumed_merror("point is outside grid range"); + else if( indices[i]>nbin[i] ) { + std::string pp, num; Tools::convert( point[0], pp ); + for(unsigned j=1; j<point.size(); ++j) { Tools::convert( point[j], num ); pp += ", " + num; } + plumed_merror("point (" + pp + ") is outside grid range"); + } } } diff --git a/src/isdb/CS2Backbone.cpp b/src/isdb/CS2Backbone.cpp index 99a90ee7a3230c1c01a77a912196990f86efb334..90c42a4c833cfea4617102f0d4fc8836abbc3da9 100644 --- a/src/isdb/CS2Backbone.cpp +++ b/src/isdb/CS2Backbone.cpp @@ -631,7 +631,7 @@ CS2Backbone::CS2Backbone(const ActionOptions&ao): } } - requestAtoms(used_atoms); + requestAtoms(used_atoms, false); setDerivatives(); checkRead(); } diff --git a/src/isdb/Caliber.cpp b/src/isdb/Caliber.cpp index 72d0d2dd8907275a93b29d8d295d6526f4e91663..5076e9d3e8e631ccf52fda93c1c32365e041b12a 100644 --- a/src/isdb/Caliber.cpp +++ b/src/isdb/Caliber.cpp @@ -33,6 +33,7 @@ namespace isdb { //+PLUMEDOC ISDB_BIAS CALIBER /* Add a time-dependent, harmonic restraint on one or more variables. + This allows implementing a maximum caliber restraint on one or more experimental time series by replica-averaged restrained simulations. See \cite Capelli:2018jt . diff --git a/src/isdb/Jcoupling.cpp b/src/isdb/Jcoupling.cpp index 38ce2b03241dfc7e890238c69bc08779c6b9b115..46a02f155c7b65affd6cc324a21527267c4058b7 100644 --- a/src/isdb/Jcoupling.cpp +++ b/src/isdb/Jcoupling.cpp @@ -289,7 +289,7 @@ JCoupling::JCoupling(const ActionOptions&ao): } } - requestAtoms(atoms); + requestAtoms(atoms, false); if(getDoScore()) { setParameters(coupl); Initialise(ncoupl_); diff --git a/src/isdb/Metainference.cpp b/src/isdb/Metainference.cpp index 27e2c6c00501c8c5d35f4865a565cdf71f322657..5fd033b84d648d648d79c247c62c558130ca0c0f 100644 --- a/src/isdb/Metainference.cpp +++ b/src/isdb/Metainference.cpp @@ -81,7 +81,7 @@ SIGMA_BIAS is an uncertainty parameter, sampled by a MC algorithm in the bounded defined by SIGMA_MIN and SIGMA_MAX. The initial value is set at SIGMA0. The MC move is a random displacement of maximum value equal to DSIGMA. If the number of data point is too large and the acceptance rate drops it is possible to make the MC move over mutually -exclusive, random subset of size MC_CHUNKSIZE and run more than one move setting MC_STRIDE +exclusive, random subset of size MC_CHUNKSIZE and run more than one move setting MC_STEPS in such a way that MC_CHUNKSIZE*MC_STEPS will cover all the data points. 
Calculated and experimental data can be compared modulo a scaling factor and/or an offset @@ -314,8 +314,8 @@ void Metainference::registerKeywords(Keywords& keys) { keys.addOutputComponent("acceptScale", "SCALEDATA", "MC acceptance"); keys.addOutputComponent("weight", "REWEIGHT", "weights of the weighted average"); keys.addOutputComponent("biasDer", "REWEIGHT", "derivatives with respect to the bias"); - keys.addOutputComponent("scale", "default", "scale parameter"); - keys.addOutputComponent("offset", "default", "offset parameter"); + keys.addOutputComponent("scale", "SCALEDATA", "scale parameter"); + keys.addOutputComponent("offset", "ADDOFFSET", "offset parameter"); keys.addOutputComponent("ftilde", "GENERIC", "ensemble average estimator"); } @@ -526,28 +526,43 @@ Metainference::Metainference(const ActionOptions&ao): vector<double> readsigma; parseVector("SIGMA0",readsigma); if((noise_type_!=MGAUSS&&noise_type_!=MOUTLIERS&&noise_type_!=GENERIC)&&readsigma.size()>1) - error("If you want to use more than one SIGMA you should use NOISETYPE=MGAUSS|MOUTLIERS"); + error("If you want to use more than one SIGMA you should use NOISETYPE=MGAUSS|MOUTLIERS|GENERIC"); if(noise_type_==MGAUSS||noise_type_==MOUTLIERS||noise_type_==GENERIC) { - if(readsigma.size()==narg) { - sigma_.resize(narg); - sigma_=readsigma; - } else if(readsigma.size()==1) { - sigma_.resize(narg,readsigma[0]); - } else { - error("SIGMA0 can accept either one single value or as many values as the number of arguments (with NOISETYPE=MGAUSS|MOUTLIERS)"); - } + sigma_.resize(readsigma.size()); + sigma_=readsigma; } else sigma_.resize(1, readsigma[0]); - double read_smin_; - parse("SIGMA_MIN",read_smin_); - sigma_min_.resize(sigma_.size(),read_smin_); - double read_smax_; - parse("SIGMA_MAX",read_smax_); - sigma_max_.resize(sigma_.size(),read_smax_); - double read_dsigma_=-1.; - parse("DSIGMA",read_dsigma_); - if(read_dsigma_<0) read_dsigma_ = 0.05*(read_smax_ - read_smin_); - Dsigma_.resize(sigma_.size(),read_dsigma_); + vector<double> readsigma_min; + parseVector("SIGMA_MIN",readsigma_min); + if((noise_type_!=MGAUSS&&noise_type_!=MOUTLIERS&&noise_type_!=GENERIC)&&readsigma_min.size()>1) + error("If you want to use more than one SIGMA you should use NOISETYPE=MGAUSS|MOUTLIERS|GENERIC"); + if(noise_type_==MGAUSS||noise_type_==MOUTLIERS||noise_type_==GENERIC) { + sigma_min_.resize(readsigma_min.size()); + sigma_min_=readsigma_min; + } else sigma_min_.resize(1, readsigma_min[0]); + + vector<double> readsigma_max; + parseVector("SIGMA_MAX",readsigma_max); + if((noise_type_!=MGAUSS&&noise_type_!=MOUTLIERS&&noise_type_!=GENERIC)&&readsigma_max.size()>1) + error("If you want to use more than one SIGMA you should use NOISETYPE=MGAUSS|MOUTLIERS|GENERIC"); + if(noise_type_==MGAUSS||noise_type_==MOUTLIERS||noise_type_==GENERIC) { + sigma_max_.resize(readsigma_max.size()); + sigma_max_=readsigma_max; + } else sigma_max_.resize(1, readsigma_max[0]); + + if(sigma_max_.size()!=sigma_min_.size()) error("The number of values for SIGMA_MIN and SIGMA_MAX must be the same"); + + vector<double> read_dsigma; + parseVector("DSIGMA",read_dsigma); + if((noise_type_!=MGAUSS&&noise_type_!=MOUTLIERS&&noise_type_!=GENERIC)&&readsigma_max.size()>1) + error("If you want to use more than one SIGMA you should use NOISETYPE=MGAUSS|MOUTLIERS|GENERIC"); + if(read_dsigma.size()>0) { + Dsigma_.resize(read_dsigma.size()); + Dsigma_=read_dsigma; + } else { + Dsigma_.resize(sigma_max_.size()); + for(unsigned i=0; i<sigma_max_.size(); i++) Dsigma_[i] = 0.05*(sigma_max_[i] - 
sigma_min_[i]); + } // monte carlo stuff parse("MC_STEPS",MCsteps_); @@ -564,6 +579,34 @@ Metainference::Metainference(const ActionOptions&ao): checkRead(); + // set sigma_bias + if(noise_type_==MGAUSS||noise_type_==MOUTLIERS||noise_type_==GENERIC) { + if(sigma_.size()==1) { + double tmp = sigma_[0]; + sigma_.resize(narg, tmp); + } else if(sigma_.size()>1&&sigma_.size()!=narg) { + error("SIGMA0 can accept either one single value or as many values as the number of arguments (with NOISETYPE=MGAUSS|MOUTLIERS|GENERIC)"); + } + if(sigma_min_.size()==1) { + double tmp = sigma_min_[0]; + sigma_min_.resize(narg, tmp); + } else if(sigma_min_.size()>1&&sigma_min_.size()!=narg) { + error("SIGMA_MIN can accept either one single value or as many values as the number of arguments (with NOISETYPE=MGAUSS|MOUTLIERS|GENERIC)"); + } + if(sigma_max_.size()==1) { + double tmp = sigma_max_[0]; + sigma_max_.resize(narg, tmp); + } else if(sigma_max_.size()>1&&sigma_max_.size()!=narg) { + error("SIGMA_MAX can accept either one single value or as many values as the number of arguments (with NOISETYPE=MGAUSS|MOUTLIERS|GENERIC)"); + } + if(Dsigma_.size()==1) { + double tmp = Dsigma_[0]; + Dsigma_.resize(narg, tmp); + } else if(Dsigma_.size()>1&&Dsigma_.size()!=narg) { + error("DSIGMA can accept either one single value or as many values as the number of arguments (with NOISETYPE=MGAUSS|MOUTLIERS|GENERIC)"); + } + } + IFile restart_sfile; restart_sfile.link(*this); if(getRestart()&&restart_sfile.FileExist(status_file_name_)) { @@ -683,9 +726,15 @@ Metainference::Metainference(const ActionOptions&ao): log.printf(" initial data uncertainties"); for(unsigned i=0; i<sigma_.size(); ++i) log.printf(" %f", sigma_[i]); log.printf("\n"); - log.printf(" minimum data uncertainty %f\n",read_smin_); - log.printf(" maximum data uncertainty %f\n",read_smax_); - log.printf(" maximum MC move of data uncertainty %f\n",read_dsigma_); + log.printf(" minimum data uncertainties"); + for(unsigned i=0; i<sigma_.size(); ++i) log.printf(" %f",sigma_min_[i]); + log.printf("\n"); + log.printf(" maximum data uncertainties"); + for(unsigned i=0; i<sigma_.size(); ++i) log.printf(" %f",sigma_max_[i]); + log.printf("\n"); + log.printf(" maximum MC move of data uncertainties"); + for(unsigned i=0; i<sigma_.size(); ++i) log.printf(" %f",Dsigma_[i]); + log.printf("\n"); log.printf(" temperature of the system %f\n",kbt_); log.printf(" MC steps %u\n",MCsteps_); log.printf(" MC stride %u\n",MCstride_); diff --git a/src/isdb/MetainferenceBase.h b/src/isdb/MetainferenceBase.h index c1b6d20e839df68bd6eed8c9c337a0d72c3c1d50..de13f38453611b080a7ca15408d3572b8ff4da87 100644 --- a/src/isdb/MetainferenceBase.h +++ b/src/isdb/MetainferenceBase.h @@ -312,7 +312,7 @@ void MetainferenceBase::unlockRequests() { } inline -void MetainferenceBase::calculateNumericalDerivatives( ActionWithValue* a ) { +void MetainferenceBase::calculateNumericalDerivatives( ActionWithValue* a=NULL ) { if( getNumberOfArguments()>0 ) { ActionWithArguments::calculateNumericalDerivatives( a ); } diff --git a/src/isdb/NOE.cpp b/src/isdb/NOE.cpp index 7d9102bd9b33666645eac75888329a9cb5d9b3d0..901353954d3da0460681bbacd5e2c7f68931a1d1 100644 --- a/src/isdb/NOE.cpp +++ b/src/isdb/NOE.cpp @@ -196,7 +196,7 @@ NOE::NOE(const ActionOptions&ao): } } - requestAtoms(nl->getFullAtomList()); + requestAtoms(nl->getFullAtomList(), false); if(getDoScore()) { setParameters(noedist); Initialise(nga.size()); diff --git a/src/isdb/PRE.cpp b/src/isdb/PRE.cpp index 
692be2b857e0d9d2b5ac272f1fcbab488bd68784..15457557df3bf3cdbe48dcb3b205606bdf5fc0c0 100644 --- a/src/isdb/PRE.cpp +++ b/src/isdb/PRE.cpp @@ -236,7 +236,7 @@ PRE::PRE(const ActionOptions&ao): } } - requestAtoms(nl->getFullAtomList()); + requestAtoms(nl->getFullAtomList(), false); if(getDoScore()) { setParameters(exppre); Initialise(nga.size()); diff --git a/src/isdb/RDC.cpp b/src/isdb/RDC.cpp index 9090472602217dfd551f69c80b151dc5347b0385..2e83e0534b3d5961ae465b24f6591e0fe7887715 100644 --- a/src/isdb/RDC.cpp +++ b/src/isdb/RDC.cpp @@ -328,7 +328,7 @@ RDC::RDC(const ActionOptions&ao): addComponent("Syz"); componentIsNotPeriodic("Syz"); } - requestAtoms(atoms); + requestAtoms(atoms, false); if(getDoScore()) { setParameters(coupl); Initialise(coupl.size()); diff --git a/src/isdb/Rescale.cpp b/src/isdb/Rescale.cpp index aaa39ef969d1c9b554be5955656e20ce2e0b0af1..4965ac873654b4e111be79969e1e066d00f55246 100644 --- a/src/isdb/Rescale.cpp +++ b/src/isdb/Rescale.cpp @@ -278,7 +278,7 @@ Rescale::Rescale(const ActionOptions&ao): log.printf(" name of the SELECTOR use for this action %s\n",selector_.c_str()); log.printf(" number of bins in grid %u\n",nbin); log.printf(" number of arguments that will not be scaled %u\n",nores_); - if(nrep_>1) log.printf(" number of arguments that will not be summed across replicas %u\n",not_shared.size()); + if(nrep_>1) log<<" number of arguments that will not be summed across replicas "<<not_shared.size()<<"\n"; log.printf(" biasfactor %f\n",biasf_); log.printf(" initial hills height %f\n",w0_); log.printf(" stride to write bias to file %u\n",Biasstride_); diff --git a/src/isdb/SAXS.cpp b/src/isdb/SAXS.cpp index ba7c3bbe63b00d8f2cfb59f6530e86617a3259d4..2c3ae440692bf9eda7b8d2eeae39355eaf3462f8 100644 --- a/src/isdb/SAXS.cpp +++ b/src/isdb/SAXS.cpp @@ -113,20 +113,18 @@ class SAXS : public MetainferenceBase { private: - bool pbc; - bool serial; - bool bessel; - bool force_bessel; - bool gpu; - int deviceid; - vector<double> q_list; - vector<double> FF_rank; - vector<vector<double> > FF_value; -#ifdef __PLUMED_HAS_ARRAYFIRE - af::array AFF_value; -#endif - vector<double> avals; - vector<double> bvals; + bool pbc; + bool serial; + bool bessel; + bool force_bessel; + bool gpu; + int deviceid; + vector<double> q_list; + vector<double> FF_rank; + vector<vector<double> > FF_value; + vector<vector<float> > FFf_value; + vector<double> avals; + vector<double> bvals; void calculate_gpu(vector<Vector> &deriv); void calculate_cpu(vector<Vector> &deriv); @@ -312,17 +310,12 @@ SAXS::SAXS(const ActionOptions&ao): } } } else { - vector<float> FF_new; - FF_new.resize(numq*size); + FFf_value.resize(numq,vector<float>(size)); for(unsigned k=0; k<numq; ++k) { for(unsigned i=0; i<size; i++) { - FF_new[k+i*numq] = static_cast<float>(FF_tmp[k][i]/sqrt(scale_int)); + FFf_value[k][i] = static_cast<float>(FF_tmp[k][i])/sqrt(scale_int); } } -#ifdef __PLUMED_HAS_ARRAYFIRE - af::array allFFa = af::array(numq, size, &FF_new.front()); - AFF_value = allFFa; -#endif } if(!getDoScore()) { @@ -363,7 +356,7 @@ SAXS::SAXS(const ActionOptions&ao): log<<" Bibliography "; if(martini) { log<<plumed.cite("Niebling, Björling, Westenhoff, J Appl Crystallogr 47, 1190–1198 (2014)."); - log<<plumed.cite("Paissoni, Jussupow, Camilloni, bioRxiv 498147."); + log<<plumed.cite("Paissoni, Jussupow, Camilloni, J Appl Crystallogr 52, 394-402 (2019)."); } if(atomistic) { log<<plumed.cite("Fraser, MacRae, Suzuki, J. Appl. 
Crystallogr., 11, 693–694 (1978)."); @@ -373,7 +366,7 @@ SAXS::SAXS(const ActionOptions&ao): log<< plumed.cite("Bonomi, Camilloni, Bioinformatics, 33, 3999 (2017)"); log<<"\n"; - requestAtoms(atoms); + requestAtoms(atoms, false); if(getDoScore()) { setParameters(expint); Initialise(numq); @@ -436,8 +429,10 @@ void SAXS::calculate_gpu(vector<Vector> &deriv) for (unsigned k=0; k<numq; k++) { // calculate FF matrix + // size,1,1,1 + af::array AFF_value(size, &FFf_value[k].front()); // size,size,1,1 - af::array FFdist_mod = af::tile(af::moddims(AFF_value(k, af::span), size, 1), 1, size)*af::tile(AFF_value(k, af::span), size, 1); + af::array FFdist_mod = af::tile(AFF_value(af::span), 1, size)*af::transpose(af::tile(AFF_value(af::span), 1, size)); // get q const float qvalue = static_cast<float>(q_list[k]); @@ -761,7 +756,8 @@ void SAXS::setup_midl(vector<double> &r_polar, vector<Vector2d> &qRnm, int &algo if(algorithm==-1) log.printf("BESSEL is suboptimal for this system and is being disabled, unless FORCE_BESSEL is used\n"); if(force_bessel) algorithm=numq-1; - qRnm.resize(p2*size); + unsigned qRnm_size = p2*size; + qRnm.resize(qRnm_size); //as the legndre polynomials and the exponential term in the basis set expansion are not function of the scattering wavenumber, they can be precomputed for(unsigned i=rank; i<size; i+=stride) { for(int n=0; n<truncation; n++) { diff --git a/src/lib/Makefile b/src/lib/Makefile index 291942f18de8fa7a6e94488bf3d1d6aefee3b56a..e6df6c5c96e5ca6062ede14989669091449eab4c 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -313,7 +313,7 @@ ifdef python_bin cp -pr ../../python install cd install/python && rm -fr *.so plumed.cpp build && \ unset CXX && unset CC && unset CFLAGS && unset CXXFLAGS && unset LDSHARED && \ - plumed_program_name="$(program_name)" \ + plumed_program_name=plumed \ plumed_version=$(VERSION) \ plumed_include_dir=../../../wrapper \ plumed_default_kernel="$(libdir)/lib$(program_name)Kernel.$(SOEXT)" \ diff --git a/src/logmfd/LogMFD.cpp b/src/logmfd/LogMFD.cpp index cff9631ad61717128e29e59435e0f213c2053bb1..433eb739b9b68b6c9ade7a3aaa3acfa5de0b4720 100644 --- a/src/logmfd/LogMFD.cpp +++ b/src/logmfd/LogMFD.cpp @@ -30,7 +30,7 @@ Consider a physical system of \f$N_q\f$ particles, for which the Hamiltonian is {H_{\rm MD}}\left( {\bf{\Gamma}} \right) = \sum\limits_{j = 1}^{{N_q}} {\frac{{{\bf{p}}_j^2}}{{2{m_j}}}} + \Phi \left( {\bf{q}} \right) \f] -where \f${\bf q}_j\f$, \f${\bf p}_j\f$ (\f$\bf{\Gamma}={\bf q},{\bf p}\f$), and \f$m_j\f$ are the position, momentum, and mass of the \f$j\f$th particle, respectively, +where \f${\bf q}_j\f$, \f${\bf p}_j\f$ (\f$\bf{\Gamma}={\bf q},{\bf p}\f$), and \f$m_j\f$ are the position, momentum, and mass of particle \f$j\f$ respectively, and \f$\Phi\f$ is the potential energy function for \f${\bf q}\f$. 
The free energy \f$F({\bf X})\f$ as a function of a set of \f$N\f$ collective variables (CVs) is given as
@@ -46,7 +46,7 @@ and \f$K_i\f$ is the spring constant which is large enough to invoke
\delta \left( x \right) = \lim_{k \to \infty } \sqrt {\beta k/2\pi} \exp \left( -\beta kx^2/2 \right)
\f]
-In mean-force dynamics (MFD), \f${\bf X}\f$ are treated as fictitious dynamical variables, which are associated with the following Hamiltonian,
+In mean-force dynamics, \f${\bf X}\f$ are treated as fictitious dynamical variables, which are associated with the following Hamiltonian,
\f[
{H_{\rm log}} = \sum\limits_{i = 1}^N {\frac{{P_{{X_i}}^2}}{{2{M_i}}} + \Psi \left( {{\bf X}} \right)}
@@ -64,7 +64,7 @@ which corresponds to TAMD/d-AFED \cite AbramsJ2008, \cite Maragliano2006 (or the
where \f$\alpha\f$ (ALPHA) and \f$\gamma\f$ (GAMMA) are positive parameters. The logarithmic form of \f$\Psi_{\rm log}\f$ ensures the dynamics of \f${\bf X}\f$ on a much smoother energy surface [i.e., \f$\Psi_{\rm log}({\bf X})\f$] than \f$F({\bf X})\f$, thus enhancing the sampling in the \f${\bf X}\f$-space. The parameters \f$\alpha\f$ and \f$\gamma\f$ determine the degree of flatness of \f$\Psi_{\rm log}\f$, but adjusting only \f$\alpha\f$ is normally sufficient to have a relatively flat surface (with keeping the relation \f$\gamma=1/\alpha\f$).
-The equation of motion (EOM) for \f$X_i\f$ in LogMFD (no thermostat) is
+The equation of motion for \f$X_i\f$ in LogMFD (no thermostat) is
\f[
{M_i}{\ddot X_i} = - \left( {\frac{{\alpha \gamma }}{{\alpha F + 1}}} \right)\frac{{\partial F}}{{\partial {X_i}}}
@@ -83,7 +83,7 @@ where
Z = \int {\exp \left[ { - \beta \left\{ {{H_{\rm MD}} + \sum\limits_i^N {\frac{{{K_i}}}{2}{{\left( {{s_i}\left( {{\bf q}} \right) - {X_i}} \right)}^2}} } \right\}} \right]} d{\bf{\Gamma }}
\f]
-The mean-force (MF) is practically evaluated by performing a shot-time canonical MD run each time \f${\bf X}\f$ is updated according to the EOM for \f${\bf X}\f$.
+The mean-force (MF) is practically evaluated by performing a short-time canonical MD run each time \f${\bf X}\f$ is updated according to the equation of motion for \f${\bf X}\f$.
If the canonical average for the MF is effectively converged, the dynamical variables \f${\bf q}\f$ and \f${\bf X}\f$ are decoupled and they evolve adiabatically, which can be exploited for the on-the-fly evaluation of \f$F({\bf X})\f$. I.e., \f$H_{\rm log}\f$ should be a constant of motion in this case, thus \f$F({\bf X})\f$ can be evaluated each time \f${\bf X}\f$ is updated as
@@ -94,7 +94,7 @@ If the canonical average for the MF is effectively converged, the dynamical vari
\f]
-This means that \f$F({\bf X})\f$ can be constructed without postprocessing (on-the-fly free energy reconstruction). Note that the on-the-fly free energy reconstruction is also possible in TAMD/d-AFED if the Hamiltonian-like conserved quantity is available (e.g., the Nose-Hoover type dynamics).
+This means that \f$F({\bf X})\f$ can be constructed without post-processing (on-the-fly free energy reconstruction). Note that the on-the-fly free energy reconstruction is also possible in TAMD/d-AFED if the Hamiltonian-like conserved quantity is available (e.g., the Nose-Hoover type dynamics).
@@ -104,7 +104,7 @@ This means that \f$F({\bf X})\f$ can be constructed without postprocessing (on-t
The accuracy in the MF is critical to the on-the-fly free energy reconstruction.
To improve the evaluation of the MF, parallel-dynamics (PD) is incorporated into LogMFD, leading to logarithmic parallel-dynamics (LogPD) \cite MorishitaLogPD.
-In PD, the MF is evaluated by a nonequilibrium path-ensemble based on the Crooks-Jarzynski nonequilibrium work relation. To this end, multiple replicas of the MD system which run in parallel are introduced. The CVs [\f${\bf s}({\bf q})\f$] in each replica is restrained to the same value of \f${\bf X}(t)\f$. A canonical MD run with \f$N_m\f$ steps is performed in each replica, then the MF on \f$X_i\f$ is evaluated using the MD trajectories from all replicas.
+In PD, the MF is evaluated by a non-equilibrium path-ensemble based on the Crooks-Jarzynski non-equilibrium work relation. To this end, multiple replicas of the MD system which run in parallel are introduced. The CVs [\f${\bf s}({\bf q})\f$] in each replica are restrained to the same value of \f${\bf X}(t)\f$. A canonical MD run with \f$N_m\f$ steps is performed in each replica, then the MF on \f$X_i\f$ is evaluated using the MD trajectories from all replicas.
The MF is practically calculated as
@@ -132,9 +132,9 @@ where
H_{\rm MD}^k\left( {{\bf{\Gamma }},{{\bf X}}} \right) = {H_{\rm MD}}\left( {{{\bf{\Gamma }}^k}} \right) + \sum\limits_{i = 1}^N {\frac{{{K_i}}}{2}{{\left( {s_i^k - {X_i}} \right)}^2}}
\f]
-and \f$s^k_i\f$ is the \f$i\f$ th CV in the \f$k\f$th replica.
+and \f$s^k_i\f$ is the \f$i\f$th CV in the \f$k\f$th replica.
-\f$W_k\f$ comes from the Crooks-Jarzynski nonequilibrium work relation by which we can evaluate an equilibrium ensemble average from a set of nonequilibrium trajectories. Note that, to avoid possible numerical errors in the exponential function, the following form of \f$W_k\f$ is instead used in PLUMED,
+\f$W_k\f$ comes from the Crooks-Jarzynski non-equilibrium work relation by which we can evaluate an equilibrium ensemble average from a set of non-equilibrium trajectories. Note that, to avoid possible numerical errors in the exponential function, the following form of \f$W_k\f$ is instead used in PLUMED,
\f[
{W_k}\left( t \right) = \frac{{\exp \left[ { - \beta \left\{ {{w_k}\left( t \right) - {w_{\min }}\left( t \right)} \right\}} \right]}}{{\sum\nolimits_k {\exp \left[ { - \beta \left\{ {{w_k}\left( t \right) - {w_{\min }}\left( t \right)} \right\}} \right]} }}
@@ -149,21 +149,21 @@ where
With the MF evaluated using the PD approach, reconstructing free energy profiles can be performed more efficiently (requiring less elapsed computing time) in LogPD than with a single MD system in LogMFD. In the case that there exists more than one stable state separated by high energy barriers in the hidden subspace orthogonal to the CV-subspace, LogPD is particularly of use to incorporate all the contributions from such hidden states with appropriate weights (in the limit \f$N_r\to\infty\f$ ).
-Note that LogPD calculations should always be initiated with an equilibrium \f${\bf q}\f$-configuration in each replica, because the Crooks-Jarzynski nonequilibrium work relation is invoked. Also note that LogPD is currently available only with Gromacs, while LogMFD can be performed with Lammps, Gromacs, and NAMD.
+Note that LogPD calculations should always be initiated with an equilibrium \f${\bf q}\f$-configuration in each replica, because the Crooks-Jarzynski non-equilibrium work relation is invoked. Also note that LogPD is currently available only with Gromacs, while LogMFD can be performed with LAMMPS, Gromacs, and NAMD.
-\section Thermostatted Thermostatted LogMFD/PD
+\section Thermostat Using LogMFD/PD with a thermostat
-Thermostatting of \f${\bf X}\f$ is often recommended in LogMFD/PD to maintain the adiabatic decoupling between \f${\bf q}\f$ and \f${\bf X}\f$. In the framework of the LogMFD approach, the Nose-Hoover type thermostat and the Gaussian isokinetic (velocity scaling) thermostat can be used to control the kinetic energy of \f${\bf X}\f$.
+Introducing a thermostat on \f${\bf X}\f$ is often recommended in LogMFD/PD to maintain the adiabatic decoupling between \f${\bf q}\f$ and \f${\bf X}\f$. In the framework of the LogMFD approach, the Nose-Hoover type thermostat and the Gaussian isokinetic (velocity scaling) thermostat can be used to control the kinetic energy of \f${\bf X}\f$.
\subsection Nose-Hoover Nose-Hoover LogMFD/PD
-The EOM for \f$X_i\f$ coupled to a Nose-Hoover thermostat variable \f$\eta\f$ (single heat bath) is
+The equation of motion for \f$X_i\f$ coupled to a Nose-Hoover thermostat variable \f$\eta\f$ (single heat bath) is
\f[
{M_i}{\ddot X_i} = - \left( {\frac{{\alpha \gamma }}{{\alpha F + 1}}} \right)\frac{{\partial F}}{{\partial {X_i}}} - {M_i}{\dot X_i}\dot \eta
\f]
-The EOM for \f$\eta\f$ is
+The equation of motion for \f$\eta\f$ is
\f[
Q\ddot \eta = \sum\limits_{i = 1}^N {\frac{{P_{{X_i}}^2}}{{{M_i}}} - N{k_B}T}
@@ -439,7 +439,7 @@ void LogMFD::registerKeywords(Keywords& keys) {
  keys.add("compulsory","THERMOSTAT",
           "Type of thermostat for the fictitious dynamical variables. NVE, NVT, VS are available." );
  keys.add("optional","TEMP",
-          "Temperature of the fictitious dynamical variables in thermostatted LogMFD/PD. "
+          "Temperature of the fictitious dynamical variables when a thermostat is used in LogMFD/PD. "
           "If not provided or provided as 0, it will be taken from the temperature of the MD system." );
  keys.add("optional","TAMD",
@@ -457,9 +457,9 @@ void LogMFD::registerKeywords(Keywords& keys) {
           "Spring constant of the harmonic restraining potential for the fictitious dynamical variables." );
  keys.add("compulsory","FICT_MAX",
-          "Maximam values reachable for the fictitious dynamical variables. The variables will elastically bounce back at the boundary (mirror boundary)." );
+          "Maximum values reachable for the fictitious dynamical variables. The variables will elastically bounce back at the boundary (mirror boundary)." );
  keys.add("compulsory","FICT_MIN",
-          "Minimam values reachable for the fictitious dynamical variables. The variables will elastically bounce back at the boundary (mirror boundary)." );
+          "Minimum values reachable for the fictitious dynamical variables. The variables will elastically bounce back at the boundary (mirror boundary)." );
  keys.add("optional","FICT",
           "The initial values of the fictitious dynamical variables. "
@@ -480,7 +480,7 @@ void LogMFD::registerKeywords(Keywords& keys) {
           "If not provided, it will be taken as 0." );
  keys.add("optional","META",
           "Mass of eta variable. "
-          "If not provided, it will be taken as N*kb*T*100*100." );
+          "If not provided, it will be taken as \\f$N*kb*T*100*100\\f$." );
  keys.add("compulsory","FLOG",
           "The initial free energy value in the LogMFD/PD run."
diff --git a/src/mapping/AdaptivePath.cpp b/src/mapping/AdaptivePath.cpp index 546d75e57815ba24d3a87479589aa3e7f51102ee..f140c414f52b82f8e9d5e780bcc542e183ddb9ff 100644 --- a/src/mapping/AdaptivePath.cpp +++ b/src/mapping/AdaptivePath.cpp @@ -230,7 +230,8 @@ void AdaptivePath::update() { myspacings.reparameterize( fixedn[0], fixedn[1], tolerance ); } if( (getStep()>0) && (getStep()%wstride==0) ) { - pathfile.printf("# PATH AT STEP %d TIME %f \n", getStep(), getTime() ); + pathfile<<"# PATH AT STEP "<<getStep(); + pathfile.printf(" TIME %f \n",getTime()); std::vector<std::unique_ptr<ReferenceConfiguration>>& myconfs=getAllReferenceConfigurations(); std::vector<SetupMolInfo*> moldat=plumed.getActionSet().select<SetupMolInfo*>(); if( moldat.size()>1 ) error("you should only have one MOLINFO action in your input file"); diff --git a/src/multicolvar/CoordinationNumbers.cpp b/src/multicolvar/CoordinationNumbers.cpp index abbf9e6f2c5a2328d080705ffe97b7171b131b0f..973ecb497c71c8630f2c87d72a38ddb62a5e6502 100644 --- a/src/multicolvar/CoordinationNumbers.cpp +++ b/src/multicolvar/CoordinationNumbers.cpp @@ -170,8 +170,8 @@ double CoordinationNumbers::compute( const unsigned& tindex, AtomValuePack& myat if(r_power > 0) { d = sqrt(d2); raised = pow( d, r_power - 1 ); accumulateSymmetryFunction( 1, i, sw * raised * d, - (dfunc * d * raised + sw * r_power) * distance, - (-dfunc * d * raised - sw * r_power) * Tensor(distance, distance), + (dfunc * d * raised + sw * r_power * raised / d) * distance, + (-dfunc * d * raised - sw * r_power * raised / d) * Tensor(distance, distance), myatoms ); } else { accumulateSymmetryFunction( 1, i, sw, (dfunc)*distance, (-dfunc)*Tensor(distance,distance), myatoms ); diff --git a/src/multicolvar/MultiColvarCombine.cpp b/src/multicolvar/MultiColvarCombine.cpp index 12456c46d6afefbc8e7eb7c5c331d5298074736f..1ea5e0afcf4e4b0d9d1f1ff4c421c67b58561a91 100644 --- a/src/multicolvar/MultiColvarCombine.cpp +++ b/src/multicolvar/MultiColvarCombine.cpp @@ -56,7 +56,7 @@ void MultiColvarCombine::registerKeywords( Keywords& keys ) { MultiColvarBase::registerKeywords( keys ); keys.add("compulsory","DATA","the multicolvars you are calculating linear combinations for"); keys.add("compulsory","COEFFICIENTS","1.0","the coefficients to use for the various multicolvars"); - keys.use("MEAN"); keys.use("MORE_THAN"); keys.use("SUM"); keys.use("LESS_THAN"); keys.use("HISTOGRAM"); keys.use("HISTOGRAM"); + keys.use("MEAN"); keys.use("MORE_THAN"); keys.use("SUM"); keys.use("LESS_THAN"); keys.use("HISTOGRAM"); keys.use("MIN"); keys.use("MAX"); keys.use("LOWEST"); keys.use("HIGHEST"); keys.use("ALT_MIN"); keys.use("BETWEEN"); keys.use("MOMENTS"); } diff --git a/src/multicolvar/MultiColvarProduct.cpp b/src/multicolvar/MultiColvarProduct.cpp index 18a0c1822dcfb8ad831fe083a71d22a5418fcb16..32748fed6bb5c7db33f92a7c7856826161f2896e 100644 --- a/src/multicolvar/MultiColvarProduct.cpp +++ b/src/multicolvar/MultiColvarProduct.cpp @@ -54,7 +54,7 @@ PLUMED_REGISTER_ACTION(MultiColvarProduct,"MCOLV_PRODUCT") void MultiColvarProduct::registerKeywords( Keywords& keys ) { MultiColvarBase::registerKeywords( keys ); keys.add("compulsory","DATA","the multicolvars you are calculating the product of"); - keys.use("MEAN"); keys.use("MORE_THAN"); keys.use("SUM"); keys.use("LESS_THAN"); keys.use("HISTOGRAM"); keys.use("HISTOGRAM"); + keys.use("MEAN"); keys.use("MORE_THAN"); keys.use("SUM"); keys.use("LESS_THAN"); keys.use("HISTOGRAM"); keys.use("MIN"); keys.use("MAX"); keys.use("LOWEST"); keys.use("HIGHEST"); 
keys.use("ALT_MIN"); keys.use("BETWEEN"); keys.use("MOMENTS"); } diff --git a/src/multicolvar/VolumeAround.cpp b/src/multicolvar/VolumeAround.cpp index 06bd7632f5258c345e492276ce9afd7a529619f9..bddef4b2e464c9e6500d2ff1d4eb4b5ed337d75d 100644 --- a/src/multicolvar/VolumeAround.cpp +++ b/src/multicolvar/VolumeAround.cpp @@ -25,8 +25,7 @@ //+PLUMEDOC VOLUMES AROUND /* -This quantity can be used to calculate functions of the distribution of collective -variables for the atoms that lie in a particular, user-specified part of of the cell. +This quantity can be used to calculate functions of the distribution of collective variables for the atoms that lie in a particular, user-specified part of of the cell. Each of the base quantities calculated by a multicolvar can can be assigned to a particular point in three dimensional space. For example, if we have the coordination numbers for all the atoms in the diff --git a/src/multicolvar/VolumeBetweenContours.cpp b/src/multicolvar/VolumeBetweenContours.cpp index ed00be25a07aec7a7d6ebe326784a4abdac17b09..edd3d3ef5c6290aa237909ec16d472a2892dc534 100644 --- a/src/multicolvar/VolumeBetweenContours.cpp +++ b/src/multicolvar/VolumeBetweenContours.cpp @@ -28,8 +28,7 @@ //+PLUMEDOC VOLUMES INENVELOPE /* -This quantity can be used to calculate functions of the distribution of collective -variables for the atoms that lie in a region where the density of a certain type of atom is high. +This quantity can be used to calculate functions of the distribution of collective variables for the atoms that lie in a region where the density of a certain type of atom is high. This collective variable can be used to determine whether colvars are within region where the density of a particular atom is high. This is achieved by calculating the following function at the point where diff --git a/src/multicolvar/VolumeCavity.cpp b/src/multicolvar/VolumeCavity.cpp index f745e2bc372614e03b8fdcf640007f094b41ea07..65aaed7eca7bd575e82c6a7d06145fc33baa3037 100644 --- a/src/multicolvar/VolumeCavity.cpp +++ b/src/multicolvar/VolumeCavity.cpp @@ -28,8 +28,7 @@ //+PLUMEDOC VOLUMES CAVITY /* -This quantity can be used to calculate functions of the distribution of collective -variables for the atoms that lie in a box defined by the positions of four atoms. +This quantity can be used to calculate functions of the distribution of collective variables for the atoms that lie in a box defined by the positions of four atoms. Each of the base quantities calculated by a multicolvar can can be assigned to a particular point in three dimensional space. For example, if we have the coordination numbers for all the atoms in the diff --git a/src/multicolvar/VolumeInCylinder.cpp b/src/multicolvar/VolumeInCylinder.cpp index d56e61002fc5f0568c2f5fa53a50d71b4f191a0b..3c18c238f0c2e38cfdd14abc1f2e37b753370213 100644 --- a/src/multicolvar/VolumeInCylinder.cpp +++ b/src/multicolvar/VolumeInCylinder.cpp @@ -26,8 +26,7 @@ //+PLUMEDOC VOLUMES INCYLINDER /* -This quantity can be used to calculate functions of the distribution of collective -variables for the atoms that lie in a particular, user-specified part of of the cell. +This quantity can be used to calculate functions of the distribution of collective variables for the atoms that lie in a particular, user-specified part of of the cell. Each of the base quantities calculated by a multicolvar can can be assigned to a particular point in three dimensional space. 
For example, if we have the coordination numbers for all the atoms in the
diff --git a/src/multicolvar/VolumeInSphere.cpp b/src/multicolvar/VolumeInSphere.cpp
index db6b1266711e0f1fe22762039cbcd301427bb664..4f47e8ccd590c8d3158adc6c582d99da38ef1532 100644
--- a/src/multicolvar/VolumeInSphere.cpp
+++ b/src/multicolvar/VolumeInSphere.cpp
@@ -26,8 +26,7 @@
//+PLUMEDOC VOLUMES INSPHERE
/*
-This quantity can be used to calculate functions of the distribution of collective
-variables for the atoms that lie in a particular, user-specified part of of the cell.
+This quantity can be used to calculate functions of the distribution of collective variables for the atoms that lie in a particular, user-specified part of the cell.
Each of the base quantities calculated by a multicolvar can can be assigned to a particular point in three dimensional space. For example, if we have the coordination numbers for all the atoms in the
diff --git a/src/multicolvar/VolumeTetrapore.cpp b/src/multicolvar/VolumeTetrapore.cpp
index 3346188ec5e5af9a18841582d0a4042329cfb19a..70793a58391a7af7c4ba4b3574cb3b50e74eb544 100644
--- a/src/multicolvar/VolumeTetrapore.cpp
+++ b/src/multicolvar/VolumeTetrapore.cpp
@@ -28,8 +28,7 @@
//+PLUMEDOC VOLUMES TETRAHEDRALPORE
/*
-This quantity can be used to calculate functions of the distribution of collective variables
-for the atoms lie that lie in a box defined by the positions of four atoms at the corners of a tetrahedron.
+This quantity can be used to calculate functions of the distribution of collective variables for the atoms that lie in a box defined by the positions of four atoms at the corners of a tetrahedron.
Each of the base quantities calculated by a multicolvar can can be assigned to a particular point in three dimensional space. For example, if we have the coordination numbers for all the atoms in the
diff --git a/src/multicolvar/XDistances.cpp b/src/multicolvar/XDistances.cpp
index 24e8da3609cdbc92f9e746312b775526f147427c..73c22269e8c88a8477218b3efe801684c9d41ca5 100644
--- a/src/multicolvar/XDistances.cpp
+++ b/src/multicolvar/XDistances.cpp
@@ -34,8 +34,7 @@ namespace multicolvar {
//+PLUMEDOC MCOLVAR XDISTANCES
/*
Calculate the x components of the vectors connecting one or many pairs of atoms.
-You can then calculate functions of the distribution of
-values such as the minimum, the number less than a certain quantity and so on.
+You can then calculate functions of the distribution of values such as the minimum, the number less than a certain quantity and so on.
\par Examples
@@ -81,8 +80,7 @@ PRINT ARG=d1.gt0.1
//+PLUMEDOC MCOLVAR YDISTANCES
/*
Calculate the y components of the vectors connecting one or many pairs of atoms.
-You can then calculate functions of the distribution of
-values such as the minimum, the number less than a certain quantity and so on.
+You can then calculate functions of the distribution of values such as the minimum, the number less than a certain quantity and so on.
\par Examples
@@ -129,8 +127,7 @@ PRINT ARG=d1.gt0.1
//+PLUMEDOC MCOLVAR ZDISTANCES
/*
Calculate the z components of the vectors connecting one or many pairs of atoms.
-You can then calculate functions of the distribution of
-values such as the minimum, the number less than a certain quantity and so on.
+You can then calculate functions of the distribution of values such as the minimum, the number less than a certain quantity and so on.
\par Examples
diff --git a/src/multicolvar/XYDistances.cpp b/src/multicolvar/XYDistances.cpp
index ec3800c6e196845d10d73f024d2c0a6cfe076851..f41f4c4db576d233e1d2a5e52a22202acd783619 100644
--- a/src/multicolvar/XYDistances.cpp
+++ b/src/multicolvar/XYDistances.cpp
@@ -34,8 +34,8 @@ namespace multicolvar {
//+PLUMEDOC MCOLVAR XYDISTANCES
/*
Calculate distance between a pair of atoms neglecting the z-component.
-You can then calculate functions of the distribution of
- values such as the minimum, the number less than a certain quantity and so on.
+
+You can then calculate functions of the distribution of values such as the minimum, the number less than a certain quantity and so on.
\par Examples
@@ -55,6 +55,7 @@ PRINT ARG=d1.min
//+PLUMEDOC MCOLVAR XZDISTANCES
/*
Calculate distance between a pair of atoms neglecting the y-component.
+
You can then calculate functions of the distribution of values such as the minimum, the number less than a certain quantity and so on.
@@ -76,6 +77,7 @@ PRINT ARG=d1.min
//+PLUMEDOC MCOLVAR YZDISTANCES
/*
Calculate distance between a pair of atoms neglecting the x-component.
+
You can then calculate functions of the distribution of values such as the minimum, the number less than a certain quantity and so on.
diff --git a/src/pamm/PAMM.cpp b/src/pamm/PAMM.cpp
index 2a6e727b281cdc50f9d95b4c95802390ab340e0b..40703b088821dca9a81d11e2305f3ff7939fb5c4 100644
--- a/src/pamm/PAMM.cpp
+++ b/src/pamm/PAMM.cpp
@@ -133,7 +133,7 @@ void PAMM::registerKeywords( Keywords& keys ) {
  keys.add("compulsory","DATA","the multicolvars from which the pamm coordinates are calculated");
  keys.add("compulsory","CLUSTERS","the name of the file that contains the definitions of all the clusters");
  keys.add("compulsory","REGULARISE","0.001","don't allow the denominator to be smaller then this value");
-  keys.use("MEAN"); keys.use("MORE_THAN"); keys.use("SUM"); keys.use("LESS_THAN"); keys.use("HISTOGRAM"); keys.use("HISTOGRAM");
+  keys.use("MEAN"); keys.use("MORE_THAN"); keys.use("SUM"); keys.use("LESS_THAN"); keys.use("HISTOGRAM"); keys.use("MIN"); keys.use("MAX"); keys.use("LOWEST"); keys.use("HIGHEST");
  keys.use("ALT_MIN"); keys.use("BETWEEN"); keys.use("MOMENTS");
  keys.setComponentsIntroduction("When the label of this action is used as the input for a second you are not referring to a scalar quantity as you are in "
                                 "regular collective variables. The label is used to reference the full set of quantities calculated by "
diff --git a/src/pamm/PammObject.h b/src/pamm/PammObject.h
index aa719b80367f03aef465ab52da742d07b73cf622..fe212e90f5d6d35d3c8b8234da933203f3eaf99e 100644
--- a/src/pamm/PammObject.h
+++ b/src/pamm/PammObject.h
@@ -43,7 +43,9 @@ public:
// Explicit definitions for constructor, copy constructor and destructor
  PammObject();
  PammObject( const PammObject& );
-  PammObject operator=(const PammObject& po) { plumed_error(); regulariser=po.regulariser; return PammObject(); }
+/// GB: I fixed this (it should return PammObject&; it was returning PammObject by value).
+// However I am not sure the implementation makes sense.
+ PammObject& operator=(const PammObject& po) { plumed_error(); regulariser=po.regulariser; return *this; } /// Setup the Pamm object void setup( const std::string& filename, const double& reg, const std::vector<std::string>& valnames, const std::vector<bool>& pbcin, const std::vector<std::string>& imin, const std::vector<std::string>& imax, diff --git a/src/piv/PIV.cpp b/src/piv/PIV.cpp index 732f61efee3fdcd5346f42f70ce6f0e50660d009..87054d75c91eadc517e9d37df0fd7af7e2016a94 100644 --- a/src/piv/PIV.cpp +++ b/src/piv/PIV.cpp @@ -701,7 +701,7 @@ PIV::PIV(const ActionOptions&ao): lmt1+=1; } } - log.printf(" |%10i|%15i|%15i|%15i|\n", j, rPIV[j].size(), lmt0, lmt1); + log.printf(" |%10i|%15zu|%15i|%15i|\n", j, rPIV[j].size(), lmt0, lmt1); } } @@ -808,7 +808,7 @@ void PIV::calculate() lmt1+=1; } } - log.printf(" |%10i|%15i|%15i|%15i|\n", j, rPIV[j].size(), lmt0, lmt1); + log.printf(" |%10i|%15zu|%15i|%15i|\n", j, rPIV[j].size(), lmt0, lmt1); } log << "\n"; } diff --git a/src/setup/MolInfo.cpp b/src/setup/MolInfo.cpp index 0d0191df1c6de36d659995a56cb55b75d03e8c59..bcc7ad1ef1b539caa40d86d5819349d618005b53 100644 --- a/src/setup/MolInfo.cpp +++ b/src/setup/MolInfo.cpp @@ -94,6 +94,10 @@ For protein residues, the following groups are available: @psi-# @omega-# @chi1-# +@chi2-# +@chi3-# +@chi4-# +@chi5-# \endverbatim that select the appropriate atoms that define each dihedral angle for residue #. diff --git a/src/setup/Units.cpp b/src/setup/Units.cpp index d0f92226fb0fb1a5d4fb6916b1b2d3008c463cb2..110ee19e59f35196a6b555376a4d40e682ba34af 100644 --- a/src/setup/Units.cpp +++ b/src/setup/Units.cpp @@ -32,7 +32,9 @@ namespace setup { //+PLUMEDOC GENERIC UNITS /* -This command sets the internal units for the code. A new unit can be set by either +This command sets the internal units for the code. + +A new unit can be set by either specifying a conversion factor from the plumed default unit or by using a string corresponding to one of the defined units given below. This directive MUST appear at the BEGINNING of the plumed.dat file. 
The same units must be used diff --git a/src/tools/MolDataClass.cpp b/src/tools/MolDataClass.cpp index 11fa1ecf7714aae4dabb087c17d9fec020a97470..76ebd5e81763d8c977f39cf32b66e7b5cbb42c56 100644 --- a/src/tools/MolDataClass.cpp +++ b/src/tools/MolDataClass.cpp @@ -324,21 +324,81 @@ void MolDataClass::specialSymbol( const std::string& type, const std::string& sy numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("N",resnum+1,chainid)); numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CA",resnum+1,chainid)); } else if( name=="chi1" && !isTerminalGroup("protein",resname) ) { - if ( resname=="GLY" || resname=="ALA" || resname=="SFO" ) plumed_merror("chi-1 is not defined for Alanine, Glycine and SFO"); + if ( resname=="GLY" || resname=="ALA" || resname=="SFO" ) plumed_merror("chi-1 is not defined for ALA, GLY and SFO"); numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("N",resnum,chainid)); numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CA",resnum,chainid)); numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CB",resnum,chainid)); - if(resname=="ILE"||resname=="VAL") + if(resname=="ILE"||resname=="VAL") { numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CG1",resnum,chainid)); - else if(resname=="CYS") + } else if(resname=="CYS") { numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("SG",resnum,chainid)); - else if(resname=="THR") + } else if(resname=="THR") { numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("OG1",resnum,chainid)); - else if(resname=="SER") + } else if(resname=="SER") { numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("OG",resnum,chainid)); - else + } else { numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CG",resnum,chainid)); + } + } else if( name=="chi2" && !isTerminalGroup("protein",resname) ) { + if ( resname=="GLY" || resname=="ALA" || resname=="SFO" || resname=="CYS" || resname=="SER" || + resname=="THR" || resname=="VAL" ) plumed_merror("chi-2 is not defined for ALA, GLY, CYS, SER, THR, VAL and SFO"); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CA",resnum,chainid)); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CB",resnum,chainid)); + + if(resname=="ILE") { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CG1",resnum,chainid)); + } else { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CG",resnum,chainid)); + } + if(resname=="ASN" || resname=="ASP") { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("OD1",resnum,chainid)); + } else if(resname=="HIS") { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("ND1",resnum,chainid)); + } else if(resname=="LEU" || resname=="PHE" || resname=="TRP" || resname=="TYR") { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CD1",resnum,chainid)); + } else if(resname=="MET") { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("SD",resnum,chainid)); + } else { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CD",resnum,chainid)); + } + } else if( name=="chi3" && !isTerminalGroup("protein",resname) ) { + if (!( resname=="ARG" || resname=="GLN" || resname=="GLU" || resname=="LYS" || + resname=="MET" )) plumed_merror("chi-3 is defined only for ARG, GLN, GLU, LYS and MET"); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CB",resnum,chainid)); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CG",resnum,chainid)); + + if(resname=="MET") { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("SD",resnum,chainid)); + } else { + 
numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CD",resnum,chainid)); + } + if(resname=="GLN" || resname=="GLU") { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("OE1",resnum,chainid)); + } else if(resname=="LYS" || resname=="MET") { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CE",resnum,chainid)); + } else { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("NE",resnum,chainid)); + } + } else if( name=="chi4" && !isTerminalGroup("protein",resname) ) { + if (!( resname=="ARG" || resname=="LYS" )) plumed_merror("chi-4 is defined only for ARG and LYS"); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CG",resnum,chainid)); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CD",resnum,chainid)); + + if(resname=="ARG") { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("NE",resnum,chainid)); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CZ",resnum,chainid)); + } else { + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CE",resnum,chainid)); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("NZ",resnum,chainid)); + } + } else if( name=="chi5" && !isTerminalGroup("protein",resname) ) { + if (!( resname=="ARG" )) plumed_merror("chi-5 is defined only for ARG"); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CD",resnum,chainid)); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("NE",resnum,chainid)); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("CZ",resnum,chainid)); + numbers.push_back(mypdb.getNamedAtomFromResidueAndChain("NH1",resnum,chainid)); } else numbers.push_back(mypdb.getNamedAtomFromResidueAndChain(name,resnum,chainid)); + } else if( allowedResidue("rna",resname) || allowedResidue("dna",resname)) { std::string basetype; if(resname.find_first_of("A")!=std::string::npos) basetype+="A"; diff --git a/src/vatom/Ghost.cpp b/src/vatom/Ghost.cpp index cf7b86651ddba7f5d45ede598d7e0265179ffc70..27454a09e60e4349418ee6e5b01ccf9bbf07f2c8 100644 --- a/src/vatom/Ghost.cpp +++ b/src/vatom/Ghost.cpp @@ -31,8 +31,7 @@ namespace vatom { //+PLUMEDOC VATOM GHOST /* -Calculate the absolute position of a ghost atom with fixed coordinates -in the local reference frame formed by three atoms. +Calculate the absolute position of a ghost atom with fixed coordinates in the local reference frame formed by three atoms. The computed ghost atom is stored as a virtual atom that can be accessed in an atom list through the the label for the GHOST action that creates it. 
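+
+For instance (a minimal sketch in which the atom numbers and coordinates are arbitrary), the following input creates a ghost atom at position (1.0,1.0,1.0) in the reference frame formed by atoms 1, 5 and 10 and then uses it to compute a distance:
+
+\plumedfile
+# atoms 1, 5 and 10 define the local frame; the coordinates are given in that frame
+c1: GHOST ATOMS=1,5,10 COORDINATES=1.0,1.0,1.0
+# the ghost atom can then be used like any other atom, e.g. in a distance
+d1: DISTANCE ATOMS=c1,15
+PRINT ARG=d1
+\endplumedfile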
diff --git a/src/ves/BF_Custom.cpp b/src/ves/BF_Custom.cpp index 19edeebf570a17bc1bf3be8d5956777d7de907fa..23420f6925fc7db17a2cee0983120c6c10ce4b4f 100644 --- a/src/ves/BF_Custom.cpp +++ b/src/ves/BF_Custom.cpp @@ -106,8 +106,12 @@ class BF_Custom : public BasisFunctions { private: lepton::CompiledExpression transf_value_expression_; lepton::CompiledExpression transf_deriv_expression_; + double* transf_value_lepton_ref_; + double* transf_deriv_lepton_ref_; std::vector<lepton::CompiledExpression> bf_values_expressions_; std::vector<lepton::CompiledExpression> bf_derivs_expressions_; + std::vector<double*> bf_values_lepton_ref_; + std::vector<double*> bf_derivs_lepton_ref_; std::string variable_str_; std::string transf_variable_str_; bool do_transf_; @@ -134,8 +138,12 @@ void BF_Custom::registerKeywords(Keywords& keys) { BF_Custom::BF_Custom(const ActionOptions&ao): PLUMED_VES_BASISFUNCTIONS_INIT(ao), + transf_value_lepton_ref_(nullptr), + transf_deriv_lepton_ref_(nullptr), bf_values_expressions_(0), bf_derivs_expressions_(0), + bf_values_lepton_ref_(0,nullptr), + bf_derivs_lepton_ref_(0,nullptr), variable_str_("x"), transf_variable_str_("t"), do_transf_(false), @@ -172,6 +180,8 @@ BF_Custom::BF_Custom(const ActionOptions&ao): // bf_values_expressions_.resize(getNumberOfBasisFunctions()); bf_derivs_expressions_.resize(getNumberOfBasisFunctions()); + bf_values_lepton_ref_.resize(getNumberOfBasisFunctions()); + bf_derivs_lepton_ref_.resize(getNumberOfBasisFunctions()); // for(unsigned int i=1; i<getNumberOfBasisFunctions(); i++) { std::string is; Tools::convert(i,is); @@ -206,6 +216,14 @@ BF_Custom::BF_Custom(const ActionOptions&ao): plumed_merror("There was some problem in parsing the derivative of the function "+bf_str[i]+" given in FUNC"+is + " with lepton"); } + try { + bf_values_lepton_ref_[i] = &bf_values_expressions_[i].getVariableReference(variable_str_); + } catch(PLMD::lepton::Exception& exc) {} + + try { + bf_derivs_lepton_ref_[i] = &bf_derivs_expressions_[i].getVariableReference(variable_str_); + } catch(PLMD::lepton::Exception& exc) {} + } std::string transf_value_parsed; @@ -254,6 +272,15 @@ BF_Custom::BF_Custom(const ActionOptions&ao): catch(PLMD::lepton::Exception& exc) { plumed_merror("There was some problem in parsing the derivative of the function "+transf_str+" given in TRANSFORM with lepton"); } + + try { + transf_value_lepton_ref_ = &transf_value_expression_.getVariableReference(transf_variable_str_); + } catch(PLMD::lepton::Exception& exc) {} + + try { + transf_deriv_lepton_ref_ = &transf_deriv_expression_.getVariableReference(transf_variable_str_); + } catch(PLMD::lepton::Exception& exc) {} + } // log.printf(" Using the following functions [lepton parsed function and derivative]:\n"); @@ -290,19 +317,12 @@ void BF_Custom::getAllValues(const double arg, double& argT, bool& inside_range, double transf_derivf=1.0; // if(do_transf_) { - // has to copy as the function is const - lepton::CompiledExpression ce_value = transf_value_expression_; - try { - ce_value.getVariableReference(transf_variable_str_) = argT; - } catch(PLMD::lepton::Exception& exc) {} - lepton::CompiledExpression ce_deriv = transf_deriv_expression_; - try { - ce_deriv.getVariableReference(transf_variable_str_) = argT; - } catch(PLMD::lepton::Exception& exc) {} + if(transf_value_lepton_ref_) {*transf_value_lepton_ref_ = argT;} + if(transf_deriv_lepton_ref_) {*transf_deriv_lepton_ref_ = argT;} - argT = ce_value.evaluate(); - transf_derivf = ce_deriv.evaluate(); + argT = transf_value_expression_.evaluate(); + 
transf_derivf = transf_deriv_expression_.evaluate(); if(check_nan_inf_ && (std::isnan(argT) || std::isinf(argT)) ) { std::string vs; Tools::convert(argT,vs); @@ -318,17 +338,13 @@ void BF_Custom::getAllValues(const double arg, double& argT, bool& inside_range, values[0]=1.0; derivs[0]=0.0; for(unsigned int i=1; i < getNumberOfBasisFunctions(); i++) { - lepton::CompiledExpression ce_value = bf_values_expressions_[i]; - try { - ce_value.getVariableReference(variable_str_) = argT; - } catch(PLMD::lepton::Exception& exc) {} - values[i] = ce_value.evaluate(); - lepton::CompiledExpression ce_deriv = bf_derivs_expressions_[i]; - try { - ce_deriv.getVariableReference(variable_str_) = argT; - } catch(PLMD::lepton::Exception& exc) {} - derivs[i] = ce_deriv.evaluate(); + if(bf_values_lepton_ref_[i]) {*bf_values_lepton_ref_[i] = argT;} + if(bf_derivs_lepton_ref_[i]) {*bf_derivs_lepton_ref_[i] = argT;} + + values[i] = bf_values_expressions_[i].evaluate(); + derivs[i] = bf_derivs_expressions_[i].evaluate(); + if(do_transf_) {derivs[i]*=transf_derivf;} // NaN checks if(check_nan_inf_ && (std::isnan(values[i]) || std::isinf(values[i])) ) { diff --git a/src/ves/TD_Custom.cpp b/src/ves/TD_Custom.cpp index e40c6e02a35c5c956b3709d01f6783928e8b7873..949e6d8f0dc0bc9331f6a1773fafc7f35cc2e9f8 100644 --- a/src/ves/TD_Custom.cpp +++ b/src/ves/TD_Custom.cpp @@ -119,6 +119,11 @@ private: // lepton::CompiledExpression expression; // + std::vector<double*> cv_var_lepton_refs_; + double* kbt_var_lepton_ref_; + double* beta_var_lepton_ref_; + double* fes_var_lepton_ref_; + // std::vector<unsigned int> cv_var_idx_; std::vector<std::string> cv_var_str_; // @@ -151,6 +156,11 @@ void TD_Custom::registerKeywords(Keywords& keys) { TD_Custom::TD_Custom(const ActionOptions& ao): PLUMED_VES_TARGETDISTRIBUTION_INIT(ao), +// + cv_var_lepton_refs_(0,nullptr), + kbt_var_lepton_ref_(nullptr), + beta_var_lepton_ref_(nullptr), + fes_var_lepton_ref_(nullptr), // cv_var_idx_(0), cv_var_str_(0), @@ -201,10 +211,31 @@ TD_Custom::TD_Custom(const ActionOptions& ao): // std::sort(cv_var_idx_.begin(),cv_var_idx_.end()); cv_var_str_.resize(cv_var_idx_.size()); + cv_var_lepton_refs_.resize(cv_var_str_.size()); for(unsigned int j=0; j<cv_var_idx_.size(); j++) { std::string str1; Tools::convert(cv_var_idx_[j]+1,str1); cv_var_str_[j] = cv_var_prefix_str_+str1; + try { + cv_var_lepton_refs_[j] = &expression.getVariableReference(cv_var_str_[j]); + } catch(PLMD::lepton::Exception& exc) {} + } + + if(use_kbt_) { + try { + kbt_var_lepton_ref_ = &expression.getVariableReference(kbt_var_str_); + } catch(PLMD::lepton::Exception& exc) {} + } + if(use_beta_) { + try { + beta_var_lepton_ref_ = &expression.getVariableReference(beta_var_str_); + } catch(PLMD::lepton::Exception& exc) {} + } + if(use_fes_) { + try { + fes_var_lepton_ref_ = &expression.getVariableReference(fes_var_str_); + } catch(PLMD::lepton::Exception& exc) {} } + } @@ -226,14 +257,10 @@ void TD_Custom::updateGrid() { plumed_massert(getFesGridPntr()!=NULL,"the FES grid has to be linked to the free energy in the target distribution"); } if(use_kbt_) { - try { - expression.getVariableReference(kbt_var_str_) = 1.0/getBeta(); - } catch(PLMD::lepton::Exception& exc) {} + if(kbt_var_lepton_ref_) {*kbt_var_lepton_ref_= 1.0/getBeta();} } if(use_beta_) { - try { - expression.getVariableReference(beta_var_str_) = getBeta(); - } catch(PLMD::lepton::Exception& exc) {} + if(beta_var_lepton_ref_) {*beta_var_lepton_ref_= getBeta();} } // std::vector<double> integration_weights = 
GridIntegrationWeights::getIntegrationWeights(getTargetDistGridPntr()); @@ -242,14 +269,10 @@ void TD_Custom::updateGrid() { for(Grid::index_t l=0; l<targetDistGrid().getSize(); l++) { std::vector<double> point = targetDistGrid().getPoint(l); for(unsigned int k=0; k<cv_var_str_.size() ; k++) { - try { - expression.getVariableReference(cv_var_str_[k]) = point[cv_var_idx_[k]]; - } catch(PLMD::lepton::Exception& exc) {} + if(cv_var_lepton_refs_[k]) {*cv_var_lepton_refs_[k] = point[cv_var_idx_[k]];} } if(use_fes_) { - try { - expression.getVariableReference(fes_var_str_) = getFesGridPntr()->getValue(l); - } catch(PLMD::lepton::Exception& exc) {} + if(fes_var_lepton_ref_) {*fes_var_lepton_ref_ = getFesGridPntr()->getValue(l);} } double value = expression.evaluate(); diff --git a/src/ves/TD_ProductDistribution.cpp b/src/ves/TD_ProductDistribution.cpp index 48785831ea070048c2909446510b44dc12782e3b..254b581739a5cfed2b0227a97c66628f53600d82 100644 --- a/src/ves/TD_ProductDistribution.cpp +++ b/src/ves/TD_ProductDistribution.cpp @@ -33,8 +33,7 @@ namespace ves { //+PLUMEDOC VES_TARGETDIST TD_PRODUCT_DISTRIBUTION /* -Target distribution given by a separable product -of one-dimensional distributions (static or dynamic). +Target distribution given by a separable product of one-dimensional distributions (static or dynamic). Employ a target distribution that is a separable product of one-dimensional distributions, defined as diff --git a/user-doc/Installation.md b/user-doc/Installation.md index 8497f60bc36c123ddcbac176d30762ef60db161d..0b31786b20a75caf9c837d32ec0c90b1f1f42803 100644 --- a/user-doc/Installation.md +++ b/user-doc/Installation.md @@ -690,7 +690,7 @@ There are two ways to install Python wrappers. \subsection installingpython-inside Installing Python wrappers within PLUMED -If `./configure` finds a `python` executable that also has the modules `numpy` and `cython` available, Python wrappers will be installed within `/prefix/lib/plumed/python`. In order to access them, you should add this directory to the environment variable `PYTHONPATH`. Notice that if your python interpreter has a different name you might have to pass it to `./configure` with `PYTHON_BIN=python3.6`. The whole thing would then be: +If `./configure` finds a `python` executable that also has the `cython` module available, Python wrappers will be installed within `/prefix/lib/plumed/python`. In order to access them, you should add this directory to the environment variable `PYTHONPATH`. Notice that if your python interpreter has a different name you might have to pass it to `./configure` with `PYTHON_BIN=python3.6`. The whole thing would then be: ```` ./configure PYTHON_BIN=python3.6 --prefix=$HOME/opt @@ -707,7 +707,6 @@ Notice that in this manner you will have to commit to a specific python version If you use multiple python versions, you might find it easier to install the Python wrappers separately from PLUMED. The simplest way is to do it with `pip`: ```` -pip3.6 install --user numpy pip3.6 install --user plumed ```` @@ -724,8 +723,7 @@ If you want to install using pip the development version of the wrappers you sho the following commands: ```` -pip3.6 install --user cython # also cython is required in this case -pip3.6 install --user numpy +pip3.6 install --user cython # cython is required in this case cd plumed2/python make pip pip3.6 install --user . 
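+
+As a quick sanity check (assuming the wrappers were installed as above and, for an in-tree install, that `PYTHONPATH` points at `/prefix/lib/plumed/python`), the module should then be importable:
+
+````
+python3.6 -c "import plumed"
+````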
diff --git a/user-doc/LOGMFDMOD.md b/user-doc/LOGMFDMOD.md index 0f0934676b46e295012f7b7a9abf211a1823734c..08eaefc77c618a4c4c6af69eecbe3b80a3c120cd 100644 --- a/user-doc/LOGMFDMOD.md +++ b/user-doc/LOGMFDMOD.md @@ -8,7 +8,7 @@ reference: \cite MorishitaLogMFD \cite MorishitaLogPD \cite MorishitaVsLogMFD ## Overview -The LOGMFD module contains the LogMFD/LogPD method for enhanced sampling in a CV space and for on-the-fly free energy reconstruction along the CVs. This module implements the multiple-replica algorithm (LogPD \cite MorishitaLogPD) as well as the single-replica algorithm (LogMFD \cite MorishitaLogMFD), the former invoking the Crooks-Jarzynski nonequilibrium work relation. In addition, TAMD/d-AFED \cite AbramsJ2008 can also be implemented by this module. +The LOGMFD module contains the LogMFD/LogPD method for enhanced sampling in a CV space and for on-the-fly free energy reconstruction along the CVs. This module implements the multiple-replica algorithm (LogPD \cite MorishitaLogPD) as well as the single-replica algorithm (LogMFD \cite MorishitaLogMFD), the former invoking the Crooks-Jarzynski non-equilibrium work relation. In addition, TAMD/d-AFED \cite AbramsJ2008 can also be implemented by this module. ## Installation This module is not installed by default. Add '\-\-enable-modules=logmfd' to your './configure' command when building PLUMED to enable these features. diff --git a/user-doc/Performances.md b/user-doc/Performances.md index 7dac6c28695ae78c8ef57f01aa51de9fd496e9ee..20c0fb984e83094b599ebe3b7e932c1dda095605 100644 --- a/user-doc/Performances.md +++ b/user-doc/Performances.md @@ -278,20 +278,20 @@ function. For instance, with an input like this one: \plumedfile ... c: COORDINATION GROUPA=1-108 GROUPB=1-108 R_0=1 -dfast: COORDINATION GROUPA=1-108 GROUPB=1-108 SWITCH={CUSTOM FUNC=1/(1+x2^3) R_0=1} +d_fast: COORDINATION GROUPA=1-108 GROUPB=1-108 SWITCH={CUSTOM FUNC=1/(1+x2^3) R_0=1} ... \endplumedfile I (GB) obtained the following timings (on a Macbook laptop): \verbatim ... PLUMED: 4A 1 c 108 0.126592 0.001172 0.000701 0.002532 -PLUMED: 4A 2 dfast 108 0.135210 0.001252 0.000755 0.002623 +PLUMED: 4A 2 d_fast 108 0.135210 0.001252 0.000755 0.002623 ... \endverbatim Notice the usage of `x2` as a variable for the switching function (see \ref switchingfunction), which avoids an unnecessary square root calculation (this is done automatically by the hard-coded switching functions -when you use only even powers). The asmjit calculation (`dfast`) takes less than 10% more than the hard-coded +when you use only even powers). The asmjit calculation (`d_fast`) takes less than 10% more than the hard-coded one (`c`). 
\page Time Time your Input diff --git a/user-doc/bibliography.bib b/user-doc/bibliography.bib index bd32c86b056751538c68632b78216175ff4ce401..7e965a2b17c4dba9b947a799cba97627a0b2af96 100644 --- a/user-doc/bibliography.bib +++ b/user-doc/bibliography.bib @@ -2870,4 +2870,15 @@ pages = {D357--63}, month = jan } +@article{Paissoni:2019ee, +author = {Paissoni, Cristina and Jussupow, Alexander and Camilloni, Carlo}, +title = {{Martini bead form factors for nucleic acids and their application in the refinement of protein{\textendash}nucleic acid complexes against SAXS data}}, +journal = {J Appl Crystallogr}, +year = {2019}, +volume = {52}, +number = {2}, +pages = {394--402}, +month = apr +} + @Comment{jabref-meta: databaseType:bibtex;} diff --git a/user-doc/figs/lyon-histograms-highT.png b/user-doc/figs/lyon-histograms-highT.png new file mode 100644 index 0000000000000000000000000000000000000000..717fd91301c9a129520e1789757c42f544bef207 Binary files /dev/null and b/user-doc/figs/lyon-histograms-highT.png differ diff --git a/user-doc/figs/lyon-histograms-lowT.png b/user-doc/figs/lyon-histograms-lowT.png new file mode 100644 index 0000000000000000000000000000000000000000..fc02cd67090660eb5a32d35c06d21244360d690a Binary files /dev/null and b/user-doc/figs/lyon-histograms-lowT.png differ diff --git a/user-doc/figs/lyon-lj7-minima.png b/user-doc/figs/lyon-lj7-minima.png new file mode 100644 index 0000000000000000000000000000000000000000..dc88b5270625d94be3a39b7c2f01a14b76e4774b Binary files /dev/null and b/user-doc/figs/lyon-lj7-minima.png differ diff --git a/user-doc/figs/lyon-time-series.png b/user-doc/figs/lyon-time-series.png new file mode 100644 index 0000000000000000000000000000000000000000..c2ee52ed6b3d543226e44e2b7f838c00b9feafb3 Binary files /dev/null and b/user-doc/figs/lyon-time-series.png differ diff --git a/user-doc/spelling_words.dict b/user-doc/spelling_words.dict index ceaa584fdd598a33c33f7ca2146069a4c0bb0628..496ca1c66f21c0fcfd22ebf232e40edc8436b701 100644 --- a/user-doc/spelling_words.dict +++ b/user-doc/spelling_words.dict @@ -893,3 +893,36 @@ fullerene fpt ECDF CDF +Tetsuya +Morishita +Naoki +Watanabe +LogMFD +AFED +LogPD +LOGMFDMOD +PIVMOD +logmfd +MFD +adiabatically +isokinetic +MFD +NHILLS +Fiskissimo +Simoes +Iglesias +Javi +Silvio +Pipolo +Fabio +ArrayFire +ESDW +CECAM +lyon +Wallclock +Ferrarotti +cunha +Paissoni +samplextc +SASDAB +SASDB diff --git a/user-doc/tutorials/lyon.txt b/user-doc/tutorials/lyon.txt new file mode 100644 index 0000000000000000000000000000000000000000..8df3e819d1455c27d623188983f19377054b4a82 --- /dev/null +++ b/user-doc/tutorials/lyon.txt @@ -0,0 +1,351 @@ +/** +\page lyon ESDW Workshop Lyon 2019: A very simple introduction to PLUMED + +\section lyon-aims Aims + +The aim of this tutorial is to give you a quick introduction to the PLUMED syntax and to some of the types of calculations that can be done with PLUMED. +It was prepared for a two and a half hour session on PLUMED in the following CECAM extended software development workshop on topics in classical MD: + +https://www.cecam.org/workshop-1802.html + +This tutorial is very straightforward and not at all exhaustive. In fact it skims over many of the subtleties associated with performing these types of +calculations using PLUMED. We have thus suggested additional topics that you might like to look into throughout the tutorial in order to take this further. 
+If you are interested in using the techniques described in the tutorial in your own work a good starting point might be to read the chapter we wrote about using
+metadynamics from within PLUMED:
+
+https://arxiv.org/abs/1812.08213
+
+\section lyon-objectives Objectives
+
+Once this tutorial is completed students will be able to:
+
+- Run simulations using the internal PLUMED MD engine simplemd.
+- Use PLUMED to add additional forces in order to prevent clusters from dissociating.
+- Use PLUMED to calculate and \ref PRINT the value of a collective variable.
+- Use the PLUMED \ref HISTOGRAM and \ref CONVERT_TO_FES actions to calculate an estimate of the free energy as a function of some collective variables.
+- Run metadynamics using the action \ref METAD in order to enhance sampling along collective variables of interest.
+
+\section lyon-resources Resources
+
+The \tarball{lyon} for this project contains the following files:
+
+- in : The input file for simplemd that contains the parameters for the MD simulation.
+- input.xyz : An initial configuration for the cluster that we are studying in this tutorial.
+- plumed.dat : An empty input file for PLUMED
+
+This tutorial has been tested on v2.5 but it should also work with other versions of PLUMED.
+
+Also notice that the `.solutions` directory of the tarball contains correct input files for the exercises.
+Please only look at these files once you have tried to solve the problems yourself. Similarly, the tutorial
+below contains questions for you to answer that are shown in bold. You can reveal the answers to these questions
+by clicking on links within the tutorial but you should obviously try to answer things yourself before reading these
+answers.
+
+\section lyon-intro Introduction
+
+In this tutorial we are going to study a very simple physical system; namely, seven Lennard Jones atoms in a two dimensional space. This simple system has been
+extensively studied and has often been used to benchmark new simulation techniques. In addition, the potential energy landscape has been fully characterized and it is
+known that only the four structurally-distinct minima shown below exist:
+
+\anchor lyon-lj7-minima
+\image html lyon-lj7-minima.png "The four energetic minima in the energy landscape for two-dimensional Lennard Jones 7."
+
+In the exercises that follow we are going to learn how to use PLUMED to determine the relative free energies of these four structures by running molecular dynamics simulations.
+
+\section lyon-exercises Exercises
+
+Before completing the following exercises you will need to download and install PLUMED. In addition, you will need to ensure that the PLUMED executable can be found within your path.
+You can download PLUMED from:
+
+http://www.plumed.org/get-it or https://github.com/plumed/plumed2
+
+If you are running PLUMED on a Mac you can install the executable directly using macports and the command:
+
+\verbatim
+sudo port install plumed
+\endverbatim
+
+If you download the source code you can find instructions on compilation in the PLUMED page on \ref Installation. If you run make install, or install PLUMED with macports, PLUMED will be within your path.
+If, however, you do not wish to do a make install you can simply make PLUMED. You can then add the location of the PLUMED executable to your path by issuing the command:
+
+\verbatim
+source sourceme.sh
+\endverbatim
+
+from within the plumed2 directory.
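+
+As a quick check that the executable really is within your path you can, for instance, run the following command, which should simply print the PLUMED version number:
+
+\verbatim
+plumed info --version
+\endverbatim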
+
+\subsection lyon-simplemd Using PLUMED as an MD code
+
+PLUMED is primarily used as a plugin to other larger molecular dynamics codes. Mainly for software testing and educational purposes, however, we do have an MD code with a rather
+limited set of functionalities within PLUMED. You can run this code by issuing the command:
+
+\verbatim
+plumed simplemd < in
+\endverbatim
+
+where `in` is the input file from the tarball for this tutorial, which is shown below:
+
+\verbatim
+inputfile input.xyz
+outputfile output.xyz
+temperature 0.5
+tstep 0.005
+friction 0.1
+forcecutoff 2.5
+listcutoff 3.0
+ndim 2
+nstep 200000
+nconfig 1000 trajectory.xyz
+nstat 1000 energies.dat
+\endverbatim
+
+This input instructs PLUMED to perform 200000 steps of MD at a temperature of \f$k_B T = 0.5 \epsilon\f$ starting from the configuration in input.xyz. The timestep in this simulation
+is 0.005 \f$\sqrt{m\sigma^2/\epsilon}\f$ and the temperature is kept fixed using a Langevin thermostat with a relaxation time of \f$0.1 \sqrt{m\sigma^2/\epsilon}\f$. Trajectory frames
+are output every 1000 MD steps to a file called trajectory.xyz. Notice also that in order to run the calculation above you need to provide an empty file called plumed.dat. This file
+is the input file to the PLUMED plugin itself, which, because this file is empty, is doing nothing when we run the calculation above.
+
+<b> Run a calculation using simplemd and the input above and visualize the trajectory that is output. Describe what happens during this calculation and explain why this is happening. </b>
+
+\hidden{What happens}
+You can visualize what occurs during the trajectory by using a visualization package such as VMD (https://www.ks.uiuc.edu/Research/vmd/). If you are using VMD you can see the MD trajectory
+by using the command:
+
+\verbatim
+vmd trajectory.xyz
+\endverbatim
+
+You should observe that all the atoms fly apart early on in the simulation and that the cluster evaporates. The cluster evaporates because at a temperature of \f$k_B T = 0.5 \epsilon\f$ the gas
+state has a lower free energy than the cluster state.
+\endhidden
+
+<b> Change the parameters in the input file for simplemd so as to prevent the cluster from evaporating. </b>
+
+\hidden{No evaporation}
+To prevent the cluster from evaporating you need to lower the temperature in the file `in`. The cluster will not evaporate if the temperature is set equal to \f$k_B T = 0.2 \epsilon\f$.
+\endhidden
+
+\subsection lyon-force Using PLUMED to specify a force field
+
+If we lower the temperature of the simulation, very little will happen. Yes, the cluster will no longer evaporate, but at the same time we will not see any transitions between the various basins in this
+energy landscape. In this section we are thus going to see how we can use PLUMED to add a bias potential that can prevent the cluster from exploring gaseous configurations that do not interest us. In other
+words, we are going to see how we can use PLUMED to add restraints that will prevent the cluster from evaporating. Before getting on to that, however, we first need to understand a little about how the PLUMED
+input file works. As explained in \ref trieste-1-structure and \ref Syntax, when you write a PLUMED input file you are writing a script that calculates functions of the atomic positions.
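+
+For instance, as an illustrative sketch that is separate from the exercise below (the atom indices are arbitrary examples), the following two-line input computes the distance between atoms 1 and 2 and prints it to a file every 100 steps:
+
+\plumedfile
+# calculate the distance between atoms 1 and 2 and give it the label d
+d: DISTANCE ATOMS=1,2
+# print the value of d to the file colvar every 100 steps
+PRINT ARG=d FILE=colvar STRIDE=100
+\endplumedfile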
+
+<b> Read the description of the PLUMED input syntax in the pages linked above and then attempt the exercise that follows. </b>
+
+We are going to prevent the cluster from evaporating by adding a restraint on all the atoms so as to prevent them from moving more than \f$2\sigma\f$ from the center of mass of the cluster. As the masses
+of all the atoms in the cluster are the same, we can compute the position of the center of mass using:
+\f[
+\mathbf{x}_\textrm{com} = \frac{1}{N} \sum_{i=1}^N \mathbf{x}_i
+\f]
+where \f$\mathbf{x}_i\f$ is the position of the \f$i\f$th atom. The distance between the \f$i\f$th atom and the position of this center of mass, \f$d_i\f$, can be computed using Pythagoras' theorem. These distances
+are then restrained by using the following potential:
+\f[
+V(d_i) = \begin{cases}
+ 100 (d_i-2.0)^2 & \textrm{if} \quad d_i > 2 \\
+ 0 & \textrm{otherwise}
+\end{cases}
+\f]
+As you can see, this potential has no effect on the dynamics when these distances are less than \f$2\sigma\f$. If any atom is more than \f$2\sigma\f$ from the center of mass, however, this potential will drive it back
+towards the center of mass. The following skeleton input file instructs PLUMED to calculate and apply this bias potential.
+
+\plumedfile
+# this optional command tells VIM that this is a PLUMED file and to color the text accordingly
+# vim: ft=plumed
+
+# This tells PLUMED we are using Lennard Jones units
+UNITS NATURAL
+
+# Calculate the position of the center of mass. We can then refer to this position later in the input using the label com.
+com: COM __FILL__
+
+# Add the restraint on the distance between com and the first atom
+d1: DISTANCE __FILL__
+UPPER_WALLS ARG=d1 __FILL__
+
+# Add the restraint on the distance between com and the second atom
+d2: DISTANCE __FILL__
+UPPER_WALLS __FILL__
+
+# Add the restraint on the distance between com and the third atom
+d3: DISTANCE __FILL__
+UPPER_WALLS __FILL__
+
+# Add the restraint on the distance between com and the fourth atom
+d4: DISTANCE __FILL__
+UPPER_WALLS __FILL__
+
+# Add the restraint on the distance between com and the fifth atom
+d5: DISTANCE __FILL__
+UPPER_WALLS __FILL__
+
+# Add the restraint on the distance between com and the sixth atom
+d6: DISTANCE __FILL__
+UPPER_WALLS __FILL__
+
+# Add the restraint on the distance between com and the seventh atom
+d7: DISTANCE __FILL__
+UPPER_WALLS __FILL__
+\endplumedfile
+
+<b> Copy and paste the content above into the file plumed.dat and then fill in the blanks by looking up the documentation for these actions online and by reading the description of the calculation that you are to run above.
+Once you have a working plumed.dat file, run a calculation using simplemd again at a temperature of \f$k_B T = 0.2 \epsilon\f$ and check to see if the bias potential is indeed preventing the cluster from evaporating. </b>
+
+\subsubsection lyon-ex0-ext Extensions
+
+Try the following exercises if you have time and if you wish to take this exercise further:
+
+- See if you can rewrite the input above in a more compact way by making use of \ref DISTANCES and \ref UWALLS (a sketch of one possible solution is given after this list).
+- Check out the hack-the-tree branch of PLUMED. This version of PLUMED allows you to pass vector quantities between actions as well as scalars. See if you can rewrite the above input using this version of PLUMED by making use of
+  the \ref MATHEVAL and \ref BIASVALUE actions.
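+
+The first of these extensions is worth sketching. The \ref DISTANCES action can calculate all seven distances from the
+center of mass in a single line and \ref UWALLS can then apply a wall on each of these distances. The input below is a
+minimal sketch of this idea; we have assumed here that the DATA keyword of \ref UWALLS is used to point at the
+\ref DISTANCES action, so please check the manual pages for these two actions before relying on it:
+
+\plumedfile
+UNITS NATURAL
+# the position of the center of mass of the cluster
+com: COM ATOMS=1-7
+# the seven distances between the atoms and the center of mass
+dd: DISTANCES GROUPA=com GROUPB=1-7
+# a harmonic wall that acts on each of these distances (assumed syntax)
+uw: UWALLS DATA=dd AT=2.0 KAPPA=100.
+\endplumedfile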
+
+\subsection lyon-ex1 Calculating collective variables
+
+We can easily determine if there are transitions between the four possible minima in the Lennard-Jones-seven potential energy landscape by monitoring the second and third central moments of the distribution of coordination numbers.
+The \f$n\f$th central moment of a set of numbers, \f$\{X_i\}\f$, can be calculated using:
+\f[
+\mu^n = \frac{1}{N} \sum_{i=1}^N ( X_i - \langle X \rangle )^n \qquad \textrm{where} \qquad \langle X \rangle = \frac{1}{N} \sum_{i=1}^N X_i
+\f]
+Furthermore, we can compute the coordination number of our Lennard Jones atoms using:
+\f[
+c_i = \sum_{j \ne i} \frac{1 - \left(\frac{r_{ij}}{1.5}\right)^8}{1 - \left(\frac{r_{ij}}{1.5}\right)^{16} }
+\f]
+where \f$r_{ij}\f$ is the distance between atom \f$i\f$ and atom \f$j\f$. The following skeleton input file instructs PLUMED to calculate these two quantities and to print their values every 100 time steps
+to a file called colvar.
+
+\plumedfile
+c1: COORDINATIONNUMBER SPECIES=__FILL__ MOMENTS=__FILL__ SWITCH={RATIONAL __FILL__ }
+PRINT ARG=__FILL__ FILE=colvar STRIDE=100
+\endplumedfile
+
+<b> Copy and paste the content above into your plumed.dat file after the lines that you added in the previous exercise to restrain the cluster and stop it from evaporating. Fill in the blanks in the input above and,
+once you have a working plumed.dat file, run a calculation using simplemd at a temperature of \f$k_B T = 0.2 \epsilon\f$. Once you have generated the trajectory, plot the time series of second and third moment values using
+gnuplot or some other plotting package. </b>
+
+\hidden{Example time series}
+
+\anchor lyon-time-series
+\image html lyon-time-series.png "Time series for the second and third moments of the distribution of coordination numbers for two trajectories of Lennard Jones 7. The time series on the left was computed from a trajectory that was performed at \f$k_B T = 0.2 \epsilon\f$, while the time series on the right was computed from a trajectory at a temperature of \f$k_B T = 0.1 \epsilon\f$. You can see that at the higher temperature there are jumps between distinct minima in the energy landscape. At the lower temperature, however, the system remains trapped in a single basin in the energy landscape."
+
+\endhidden
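+
+If you have not used gnuplot before, the two time series can be plotted with commands along the lines of the sketch below.
+We have assumed here that, as in the input above, time is in the first column of the colvar file and the second and third
+moments are in the second and third columns:
+
+\verbatim
+gnuplot> plot 'colvar' using 1:2 with lines title 'second moment', \
+              'colvar' using 1:3 with lines title 'third moment'
+\endverbatim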
+
+\subsubsection lyon-ex1-ext Extensions
+
+Try the following exercises if you have time and if you wish to take this exercise further:
+
+- Investigate how \ref SPRINT coordinates are defined and calculated using PLUMED and try to compute them for this particular system.
+- Read about other collective variables that can be calculated using PLUMED. A topic that is particularly important to understand is how PLUMED deals with periodic boundary conditions.
+
+\subsection lyon-ex2 Estimating the free energy surface
+
+When you plotted the time series for the collective variables in the previous exercise you should have seen that the values of these collective variables fluctuate around a number of different constant values for long
+periods of time. There are then infrequent jumps, after which the value of the collective variable settles down and begins to fluctuate around a different constant value. These jumps occur
+whenever the system moves from one of the minima illustrated above to another. What we would ideally like to extract from our simulation are the relative free energies of these various minima. From elementary statistical
+mechanics we know that the free energy is given by:
+\f[
+F(s) = -k_B T \ln P(s)
+\f]
+Consequently, if we can extract an estimate of the probability, \f$P(s)\f$, that the CVs take particular values, \f$s\f$, then we can get an estimate of the free energy. Extracting an estimate of the probability that the collective
+variables take particular values is easy: we just need to construct a histogram from the CV values that the system took during the trajectory. The following skeleton input file instructs PLUMED to
+estimate this histogram, to convert it to a free energy and to output the result to a file called fes.dat.
+
+\plumedfile
+hh: HISTOGRAM __FILL__ __FILL__=-1.0,-1.0 __FILL__=1.5,1.5 GRID_BIN=300,300 __FILL__=DISCRETE STRIDE=100
+fes: CONVERT_TO_FES GRID=__FILL__ __FILL__=0.2
+DUMPGRID GRID=__FILL__ FILE=fes.dat
+\endplumedfile
+
+<b> Copy and paste the content above into your plumed.dat file after the lines that you added in the previous two exercises. Fill in the blanks in the input above and, once you have a working plumed.dat file, run a calculation using simplemd
+at a temperature of \f$k_B T = 0.2 \epsilon\f$. Once you have generated the trajectory, create a contour plot of the free energy surface that is contained in fes.dat using gnuplot or some other plotting package. </b>
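+
+If you are using gnuplot, a quick way of producing such a plot is sketched below. We have assumed here that \ref DUMPGRID
+has written the two CV values and the free energy in the first three columns of fes.dat and that the rows of the grid are
+separated by blank lines, which is the layout that gnuplot's splot command expects:
+
+\verbatim
+gnuplot> set pm3d map
+gnuplot> splot 'fes.dat' using 1:2:3
+\endverbatim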
+
+\hidden{What your free energy surface should look like}
+The free energy surface that you get from this calculation should look something like the figure shown on the left below. Please note that in order to fully sample configuration space and in order to get a reasonably converged estimate of the free energy
+you are going to have to use many more time steps than you did in previous exercises. The figure shown on the right is the free energy surface obtained if you run at a temperature of \f$k_B T = 0.1 \epsilon\f$. As you can see, a much smaller volume of
+configuration space is explored at this lower temperature.
+
+\anchor lyon-lj7-fes
+\image html lyon-histograms-highT.png "The free energy for Lennard Jones seven calculated by computing a histogram based on an MD trajectory at \f$k_B T = 0.2 \epsilon\f$ (left) and \f$k_B T = 0.1 \epsilon\f$ (right)."
+
+\endhidden
+
+<b> Rerun the calculation of the free energy surface at a lower temperature. What differences do you observe at the lower temperature? Why might these differences be problematic? </b>
+
+\hidden{Running at a lower temperature}
+When you run the simulation at a lower temperature you should see that the system no longer explores all the basins in the free energy landscape. If the temperature is sufficiently low the system
+will remain trapped in the original configuration throughout the whole simulation run. At very low temperatures this may be the correct behavior and the free energy estimate you obtain may prove
+to be a reasonable one. At the same time, however, lowering the temperature makes barrier crossing a rarer event. It thus becomes less likely that you will observe a crossing between basins
+in a lower temperature simulation. A histogram that shows the probability of observing these configurations to be zero may thus simply be a consequence of the fact that the simulation
+is not converged. The free energy will thus look like the one shown below instead of looking like the one in the previous hidden section:
+
+\anchor lyon-lj7-fes2
+\image html lyon-histograms-lowT.png "The free energy for Lennard Jones seven calculated by computing a histogram based on an MD trajectory at \f$k_B T = 0.2 \epsilon\f$ (left) and \f$k_B T = 0.1 \epsilon\f$ (right)."
+
+\endhidden
+
+\subsubsection lyon-ex2-ext Extensions
+
+Try the following exercises if you have time and if you wish to take this exercise further:
+
+- Try to recalculate the free energy surface but use kernel density estimation when calculating the \ref HISTOGRAM.
+- Work through \ref trieste-2 and try to apply the ideas this tutorial covers to calculate appropriate error bars on the estimate of the free energy surface that you extracted in the previous exercises.
+
+\subsection lyon-ex3 Performing metadynamics
+
+We can use an enhanced sampling algorithm to overcome the problems that we encountered at the end of the last exercise and to more fully explore configuration space even at low temperature. In this
+final exercise we are thus going to perform a metadynamics simulation on our Lennard Jones cluster. In metadynamics the system is forced to explore more of configuration space by a history-dependent bias that has the following
+functional form:
+\f[
+V(s,t) = \sum_{\tau=0}^t w(\tau) \exp\left( -\frac{( s - s(\tau))^2 }{ 2\sigma^2} \right)
+\f]
+This bias potential consists of a sum of Gaussian kernels with bandwidth \f$\sigma\f$ that are centered at the values of the CV that the system took at previous times. The weights of the kernels in this
+sum are time dependent because we are going to use the well-tempered variant of metadynamics, which introduces one further parameter \f$\gamma = \frac{T + \Delta T}{T}\f$. In this variant the weight of each
+new kernel is scaled down according to the bias that has already been accumulated, \f$w(\tau) = w_0 \exp\left( -\frac{V(s(\tau),\tau)}{k_B \Delta T} \right)\f$, where \f$w_0\f$ is the initial Gaussian height.
+
+To run metadynamics using PLUMED on the Lennard Jones cluster that we have been studying in this exercise, you would delete the lines that you used to compute the histogram from the plumed input file
+that we have been using thus far and you would add a line like the one shown below.
+
+\plumedfile
+METAD ARG=__FILL__ HEIGHT=__FILL__ PACE=__FILL__ SIGMA=__FILL__ GRID_MIN=-1.5,-1.5 GRID_MAX=2.5,2.5 GRID_BIN=500,500 BIASFACTOR=5
+\endplumedfile
+
+<b> This input should be modified to instruct PLUMED to add Gaussian kernels with a bandwidth of 0.1 in both the second and third moment of the distribution of coordination numbers and a height of 0.05 \f$\epsilon\f$ every 50 MD
+steps. The metadynamics calculation should then be run using simplemd at a temperature of \f$k_B T = 0.1 \epsilon\f$. </b>
+
+A nice feature of the metadynamics method is that, if the simulation is converged, the final bias potential is related to the underlying free energy surface by:
+\f[
+F(s) = - \frac{T + \Delta T}{\Delta T} V(s,t)
+\f]
+This formula is implemented in the tool sum_hills that is part of the PLUMED package. To run sum_hills on the output of your metadynamics simulation you can run the command:
+
+\verbatim
+plumed sum_hills --hills HILLS
+\endverbatim
+
+This command should output a file called fes.dat that contains the final estimate of the free energy surface that was calculated using metadynamics.
+
+<b> Try to run the sum_hills command on the HILLS file output by your metadynamics simulation. Visualize the fes.dat file that is output using gnuplot or some similar plotting package. </b>
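+
+The sum_hills tool accepts a number of further command line options. For instance, assuming that the default file names and
+grid settings are appropriate, a command along the lines of the one below should write a separate estimate of the free
+energy surface for every 1000 Gaussian kernels that were deposited, which gives you a simple way of monitoring how the
+estimate changes as the simulation progresses:
+
+\verbatim
+plumed sum_hills --hills HILLS --stride 1000
+\endverbatim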
+
+\subsubsection lyon-ex3-ext Extensions
+
+Try the following exercises if you have time and if you wish to take this exercise further:
+
+- Work through \ref trieste-2 and try to apply the ideas this tutorial covers to calculate an estimate of the free energy surface using metadynamics and reweighting.
+- Try to estimate the errors on your estimate of the free energy surface.
+
+\section lyon-conclusions Conclusions
+
+This exercise has hopefully given you a sense of the sorts of calculations that can be performed using PLUMED. You should now have an understanding of how PLUMED input
+files are structured so that variables can be passed between different actions. Furthermore, you should have seen how PLUMED can be used both to analyze trajectories and to
+add forces that drive the system to undergo transitions that it would not otherwise undergo.
+
+Obviously, given the short time available to us in this tutorial, we have only scratched the surface in terms of what PLUMED can be used to do and in terms of how metadynamics
+calculations are run in practice. If you want to understand more about how to run PLUMED, we would recommend that you start by reading the chapter that we provided a link to
+in the introduction to this tutorial. For now, though, well done on completing the exercise and good luck with your future PLUMED-related endeavors.
+
+*/
+
+link: @subpage lyon
+
+description: This tutorial gives a brief introduction to PLUMED by showing how it can be used to study a cluster of 7 Lennard Jones atoms.
+
+additional-files: lyon
diff --git a/user-doc/tutorials/lyon/.solutions/plumed_histograms.dat b/user-doc/tutorials/lyon/.solutions/plumed_histograms.dat
new file mode 100755
index 0000000000000000000000000000000000000000..5564db57bd991e1f2f19612104b0bee4bf710283
--- /dev/null
+++ b/user-doc/tutorials/lyon/.solutions/plumed_histograms.dat
@@ -0,0 +1,22 @@
+UNITS NATURAL
+COM ATOMS=1-7 LABEL=com
+DISTANCE ATOMS=1,com LABEL=d1
+UPPER_WALLS ARG=d1 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=2,com LABEL=d2
+UPPER_WALLS ARG=d2 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=3,com LABEL=d3
+UPPER_WALLS ARG=d3 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=4,com LABEL=d4
+UPPER_WALLS ARG=d4 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=5,com LABEL=d5
+UPPER_WALLS ARG=d5 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=6,com LABEL=d6
+UPPER_WALLS ARG=d6 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=7,com LABEL=d7
+UPPER_WALLS ARG=d7 AT=2.0 KAPPA=100.
+
+c1: COORDINATIONNUMBER SPECIES=1-7 MOMENTS=2-3 SWITCH={RATIONAL R_0=1.5 NN=8 MM=16}
+
+hh: HISTOGRAM ARG=c1.* GRID_MIN=-1.0,-1.0 GRID_MAX=1.5,1.5 GRID_BIN=300,300 KERNEL=DISCRETE STRIDE=100
+fes: CONVERT_TO_FES GRID=hh TEMP=0.2
+DUMPGRID GRID=fes FILE=fes.dat
diff --git a/user-doc/tutorials/lyon/.solutions/plumed_just_walls.dat b/user-doc/tutorials/lyon/.solutions/plumed_just_walls.dat
new file mode 100755
index 0000000000000000000000000000000000000000..2bf44ea5df4838481c7b6048544a3ba10ae551c9
--- /dev/null
+++ b/user-doc/tutorials/lyon/.solutions/plumed_just_walls.dat
@@ -0,0 +1,16 @@
+UNITS NATURAL
+COM ATOMS=1-7 LABEL=com
+DISTANCE ATOMS=1,com LABEL=d1
+UPPER_WALLS ARG=d1 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=2,com LABEL=d2
+UPPER_WALLS ARG=d2 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=3,com LABEL=d3
+UPPER_WALLS ARG=d3 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=4,com LABEL=d4
+UPPER_WALLS ARG=d4 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=5,com LABEL=d5
+UPPER_WALLS ARG=d5 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=6,com LABEL=d6
+UPPER_WALLS ARG=d6 AT=2.0 KAPPA=100.
+DISTANCE ATOMS=7,com LABEL=d7 +UPPER_WALLS ARG=d7 AT=2.0 KAPPA=100. diff --git a/user-doc/tutorials/lyon/.solutions/plumed_metad.dat b/user-doc/tutorials/lyon/.solutions/plumed_metad.dat new file mode 100755 index 0000000000000000000000000000000000000000..9810c83cea4dc78da227addd8856957647d7f447 --- /dev/null +++ b/user-doc/tutorials/lyon/.solutions/plumed_metad.dat @@ -0,0 +1,23 @@ +UNITS NATURAL +COM ATOMS=1-7 LABEL=com +DISTANCE ATOMS=1,com LABEL=d1 +UPPER_WALLS ARG=d1 AT=2.0 KAPPA=100. +DISTANCE ATOMS=2,com LABEL=d2 +UPPER_WALLS ARG=d2 AT=2.0 KAPPA=100. +DISTANCE ATOMS=3,com LABEL=d3 +UPPER_WALLS ARG=d3 AT=2.0 KAPPA=100. +DISTANCE ATOMS=4,com LABEL=d4 +UPPER_WALLS ARG=d4 AT=2.0 KAPPA=100. +DISTANCE ATOMS=5,com LABEL=d5 +UPPER_WALLS ARG=d5 AT=2.0 KAPPA=100. +DISTANCE ATOMS=6,com LABEL=d6 +UPPER_WALLS ARG=d6 AT=2.0 KAPPA=100. +DISTANCE ATOMS=7,com LABEL=d7 +UPPER_WALLS ARG=d7 AT=2.0 KAPPA=100. + +c1: COORDINATIONNUMBER SPECIES=1-7 MOMENTS=2-3 SWITCH={RATIONAL R_0=1.5 NN=8 MM=16} + +METAD ARG=c1.* HEIGHT=0.05 PACE=50 SIGMA=0.1,0.1 GRID_MIN=-1.5,-1.5 GRID_MAX=2.5,2.5 GRID_BIN=500,500 BIASFACTOR=5 + +# PRINT ARG=c1.* FILE=colvar STRIDE=100 + diff --git a/user-doc/tutorials/lyon/.solutions/plumed_timeseries.dat b/user-doc/tutorials/lyon/.solutions/plumed_timeseries.dat new file mode 100755 index 0000000000000000000000000000000000000000..ad05eb95a2d921729bf690c60baa3ed471ab522d --- /dev/null +++ b/user-doc/tutorials/lyon/.solutions/plumed_timeseries.dat @@ -0,0 +1,20 @@ +UNITS NATURAL +COM ATOMS=1-7 LABEL=com +DISTANCE ATOMS=1,com LABEL=d1 +UPPER_WALLS ARG=d1 AT=2.0 KAPPA=100. +DISTANCE ATOMS=2,com LABEL=d2 +UPPER_WALLS ARG=d2 AT=2.0 KAPPA=100. +DISTANCE ATOMS=3,com LABEL=d3 +UPPER_WALLS ARG=d3 AT=2.0 KAPPA=100. +DISTANCE ATOMS=4,com LABEL=d4 +UPPER_WALLS ARG=d4 AT=2.0 KAPPA=100. +DISTANCE ATOMS=5,com LABEL=d5 +UPPER_WALLS ARG=d5 AT=2.0 KAPPA=100. +DISTANCE ATOMS=6,com LABEL=d6 +UPPER_WALLS ARG=d6 AT=2.0 KAPPA=100. +DISTANCE ATOMS=7,com LABEL=d7 +UPPER_WALLS ARG=d7 AT=2.0 KAPPA=100. + +c1: COORDINATIONNUMBER SPECIES=1-7 MOMENTS=2-3 SWITCH={RATIONAL R_0=1.5 NN=8 MM=16} +PRINT ARG=c1.* FILE=colvar STRIDE=100 + diff --git a/user-doc/tutorials/lyon/in b/user-doc/tutorials/lyon/in new file mode 100644 index 0000000000000000000000000000000000000000..7c3405ed90a73758a33b5fa1a5f23d3c5eb2977a --- /dev/null +++ b/user-doc/tutorials/lyon/in @@ -0,0 +1,11 @@ +inputfile input.xyz +outputfile output.xyz +temperature 0.5 +tstep 0.005 +friction 1 +forcecutoff 2.5 +listcutoff 3.0 +ndim 2 +nstep 200000 +nconfig 1000 trajectory.xyz +nstat 1000 energies.dat diff --git a/user-doc/tutorials/lyon/input.xyz b/user-doc/tutorials/lyon/input.xyz new file mode 100644 index 0000000000000000000000000000000000000000..9e1e59174c1ce7a8d31fa8220d64ea05a9edbc14 --- /dev/null +++ b/user-doc/tutorials/lyon/input.xyz @@ -0,0 +1,9 @@ +7 +100. 100. 100. 
+Ar 7.3933470660 -2.6986483924 0.0000000000 +Ar 7.8226765198 -0.7390907295 0.0000000000 +Ar 7.1014969839 -1.6164766614 0.0000000000 +Ar 8.2357184242 -1.7097824975 0.0000000000 +Ar 6.7372520842 -0.5111536183 0.0000000000 +Ar 6.3777119489 -2.4640437401 0.0000000000 +Ar 5.9900631495 -1.3385375043 0.0000000000 diff --git a/user-doc/tutorials/lyon/plumed.dat b/user-doc/tutorials/lyon/plumed.dat new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/user-doc/tutorials/others/isdb-2.txt b/user-doc/tutorials/others/isdb-2.txt index aadf9f76e36838974557752a44d4ac562ee16957..d470387d3caf55800b31cd6dc09a10b79aa293c7 100644 --- a/user-doc/tutorials/others/isdb-2.txt +++ b/user-doc/tutorials/others/isdb-2.txt @@ -1,11 +1,11 @@ /** -\page isdb-2 ISDB: setting up a SAXS posprocessing and refinment calculation using MARTINI form factors +\page isdb-2 ISDB: setting up a SAXS post processing and refinement calculation using MARTINI form factors \authors Cristina Paissoni \section isdb-2-aims Aims -This tutorial is thought to illustrate how it is possible to compute SAXS intensities from single PDB files or trajectories using PLUMED. In particular, we will show how to compute scattering intensities with the hybrid coarse-grained/atomistic approach that is described in \cite paissoni:19 . +This tutorial is thought to illustrate how it is possible to compute SAXS intensities from single PDB files or trajectories using PLUMED. In particular, we will show how to compute scattering intensities with the hybrid coarse-grained/atomistic approach that is described in \cite Paissoni:2019ee . The tutorial will provide basic instructions to prepare files for the back-calculation of SAXS intensities using the hybrid approach (this process is simplified by the possibility to use an ad-hoc python script). Further, it is explained how to adjust the plumed input file for specific purposes, e.g. to use SAXS data as restraints in MD simulations or to compare SAXS intensities computed with both the atomistic and the hybrid approach. \section isdb-2-objectives Objectives diff --git a/user-doc/tutorials/others/isdb-2/TUTORIAL_SAXS-AACG/martiniFormFactor_p2.py b/user-doc/tutorials/others/isdb-2/TUTORIAL_SAXS-AACG/martiniFormFactor_p2.py index 71ffd26f050f86ad748e37b0c1e0a8a775e6d580..7c1884b08f837f3be61e5f978c292341e5d7a62f 100644 --- a/user-doc/tutorials/others/isdb-2/TUTORIAL_SAXS-AACG/martiniFormFactor_p2.py +++ b/user-doc/tutorials/others/isdb-2/TUTORIAL_SAXS-AACG/martiniFormFactor_p2.py @@ -7,8 +7,6 @@ from __future__ import division import sys,logging logging.basicConfig(format='%(levelname)-7s %(message)s',level=9) -import os.path -import re # This is a simple and versatily option class that allows easy @@ -32,7 +30,6 @@ class Option: self.value = [ self.func(i) for i in v ] options = [ - ("-f", Option(str,1,None, "PDB Input file.\n\t\tDifferent chains (or chain breaks) should be separated by TER or named with different chain-ID.\n\t\tOnly one model is expected. ENDMDL is accepted only at the end of the file.")), ("-dat", Option(str,1,None,"Experimental Data Input file.\n\t\tEach line should contain momentum transfer (in inverse Ang) and non-zero intensity, separated by blanks or comma.\n\t\tIf momentum transfer is in inverse nm, use the option \"-unit nm\"")), ("-unit", Option(str,1,"Ang","Unit of the momentum transfer [Ang/nm]. 
Default: Ang")), @@ -73,8 +70,6 @@ def help(): sys.exit() - - ################################################# ## 2 # HELPER FUNCTIONS, CLASSES AND SHORTCUTS ## -> @FUNC <- ################################################# @@ -286,7 +281,6 @@ def aver(b): # [ name, resname, resid, chain, x, y, z ] def mapCG(r): p = CoarseGrained.mapping[r[0][1]] # Mapping for this residue - flat_p = [item for sublist in p for item in sublist] # Get the atom_name, mass, coordinates (x,y,z), atom id for all atoms in the residue a = [(i[0],CoarseGrained.mass.get(i[0][0],0),i[4:7],i[7]) for i in r] # Store weight, coordinate and index for atoms that match a bead @@ -301,7 +295,7 @@ def mapCG(r): ####################### ## 4 # STRUCTURE I/O ## -> @IO <- ####################### -import logging,math,random,sys +import math,sys #----+---------+ ## A | PDB I/O | @@ -534,8 +528,7 @@ class Residue(list): for i in self: if i[0] == tag: return i - else: - return + return if tag[1]: return [i for i in self if tag[0] in i[0]] # Return partial matches else: @@ -637,6 +630,10 @@ class Chain: def __eq__(self,other): return (self.seq == other.seq and self.breaks == other.breaks ) + + def __ne__(self, other): + return not self.__eq__(other) + # Extract a residue by number or the list of residues of a given type # This facilitates selecting residues for links, like chain["CYS"] @@ -652,30 +649,27 @@ class Chain: for i in self.cg(): if other == i[:4]: return i - else: - for i in self.atoms(): - if other[:3] == i[:3]: - return i - else: - return [] + for i in self.atoms(): + if other[:3] == i[:3]: + return i + return [] + elif type(other) == slice: + # This implements the old __getslice__ method + i, j = other.start, other.stop + newchain = Chain(self.options, name=self.id) + # Extract the slices from all lists + for attr in self._attributes: + setattr(newchain, attr, getattr(self, attr)[i:j]) + # Breaks that fall within the start and end of this chain need to be passed on. + # Residue numbering is increased by 20 bits!! + ch_sta, ch_end = newchain.residues[0][0][2], newchain.residues[-1][0][2] + newchain.breaks = [crack for crack in self.breaks if ch_sta < (crack << 20) < ch_end] + newchain.natoms = len(newchain.atoms()) + # newchain.type() + # Return the chain slice + return newchain return self.sequence[other] - # Extract a piece of a chain as a new chain - def __getslice__(self,i,j): - newchain = Chain(self.options,name=self.id) - # Extract the slices from all lists - for attr in self._attributes: - setattr(newchain, attr, getattr(self,attr)[i:j]) - # Breaks that fall within the start and end of this chain need to be passed on. - # Residue numbering is increased by 20 bits!! - # XXX I don't know if this works. - ch_sta,ch_end = newchain.residues[0][0][2],newchain.residues[-1][0][2] - newchain.breaks = [crack for crack in self.breaks if ch_sta < (crack<<20) < ch_end] - newchain.natoms = len(newchain.atoms()) - newchain.type() - # Return the chain slice - return newchain - def _contains(self,atomlist,atom): atnm,resn,resi,chn = atom @@ -826,7 +820,7 @@ class Chain: ############# ## 8 # MAIN # -> @MAIN <- ############# -import sys,logging,random,math,os,re +import sys,math def main(options): # Check whether to read from a gro/pdb file or from stdin @@ -1049,7 +1043,7 @@ def main(options): if __name__ == '__main__': - import sys,logging,time + import sys,time start = time.time() stloc = time.localtime(start) @@ -1060,10 +1054,8 @@ if __name__ == '__main__': logging.error("NO INPUT! 
Try to use the option -h for help.") sys.exit(1) - options = options options = option_parser(args,options) - if options["-f"].value is None: logging.error("No input PDB file. Try to use the option -h for help.") sys.exit(1) @@ -1072,11 +1064,8 @@ if __name__ == '__main__': logging.error("No input data file. Try to use the option -h for help.") sys.exit(1) - - main(options) - stop = time.time() stoploc = time.localtime(stop) logging.info("\n\nEnded at time: {}:{}:{} of {}/{}/{}.".format(stoploc[3],stoploc[4],stoploc[5],stoploc[2],stoploc[1],stoploc[0])) diff --git a/user-doc/tutorials/others/isdb-2/TUTORIAL_SAXS-AACG/martiniFormFactor_p3.py b/user-doc/tutorials/others/isdb-2/TUTORIAL_SAXS-AACG/martiniFormFactor_p3.py index 5830224dd12e2441ae8764e6f79bd1dcb3a1be09..bb363def63f5e82f1189400e5e54b3985a6a6852 100644 --- a/user-doc/tutorials/others/isdb-2/TUTORIAL_SAXS-AACG/martiniFormFactor_p3.py +++ b/user-doc/tutorials/others/isdb-2/TUTORIAL_SAXS-AACG/martiniFormFactor_p3.py @@ -4,11 +4,8 @@ ################################### ## 1 # OPTIONS AND DOCUMENTATION ## -> @DOC <- ################################### - import sys,logging logging.basicConfig(format='%(levelname)-7s %(message)s',level=9) -import os.path -import re # This is a simple and versatily option class that allows easy @@ -283,7 +280,6 @@ def aver(b): # [ name, resname, resid, chain, x, y, z ] def mapCG(r): p = CoarseGrained.mapping[r[0][1]] # Mapping for this residue - flat_p = [item for sublist in p for item in sublist] # Get the atom_name, mass, coordinates (x,y,z), atom id for all atoms in the residue a = [(i[0],CoarseGrained.mass.get(i[0][0],0),i[4:7],i[7]) for i in r] # Store weight, coordinate and index for atoms that match a bead @@ -298,7 +294,7 @@ def mapCG(r): ####################### ## 4 # STRUCTURE I/O ## -> @IO <- ####################### -import logging,math,random,sys +import math,sys #----+---------+ ## A | PDB I/O | @@ -531,8 +527,7 @@ class Residue(list): for i in self: if i[0] == tag: return i - else: - return + return if tag[1]: return [i for i in self if tag[0] in i[0]] # Return partial matches else: @@ -634,6 +629,10 @@ class Chain: def __eq__(self,other): return (self.seq == other.seq and self.breaks == other.breaks ) + + def __ne__(self, other): + return not self.__eq__(other) + # Extract a residue by number or the list of residues of a given type # This facilitates selecting residues for links, like chain["CYS"] @@ -649,12 +648,10 @@ class Chain: for i in self.cg(): if other == i[:4]: return i - else: - for i in self.atoms(): - if other[:3] == i[:3]: - return i - else: - return [] + for i in self.atoms(): + if other[:3] == i[:3]: + return i + return [] elif type(other) == slice: # This implements the old __getslice__ method i, j = other.start, other.stop @@ -671,23 +668,6 @@ class Chain: # Return the chain slice return newchain return self.sequence[other] - return self.sequence[other] - - # Extract a piece of a chain as a new chain - def __getslice__(self,i,j): - newchain = Chain(self.options,name=self.id) - # Extract the slices from all lists - for attr in self._attributes: - setattr(newchain, attr, getattr(self,attr)[i:j]) - # Breaks that fall within the start and end of this chain need to be passed on. - # Residue numbering is increased by 20 bits!! - # XXX I don't know if this works. 
- ch_sta,ch_end = newchain.residues[0][0][2],newchain.residues[-1][0][2] - newchain.breaks = [crack for crack in self.breaks if ch_sta < (crack<<20) < ch_end] - newchain.natoms = len(newchain.atoms()) - newchain.type() - # Return the chain slice - return newchain def _contains(self,atomlist,atom): atnm,resn,resi,chn = atom @@ -839,7 +819,7 @@ class Chain: ############# ## 8 # MAIN # -> @MAIN <- ############# -import sys,logging,random,math,os,re +import sys,math def main(options): # Check whether to read from a gro/pdb file or from stdin @@ -1062,7 +1042,7 @@ def main(options): if __name__ == '__main__': - import sys,logging,time + import sys,time start = time.time() stloc = time.localtime(start) @@ -1073,7 +1053,6 @@ if __name__ == '__main__': logging.error("NO INPUT! Try to use the option -h for help.") sys.exit(1) - options = options options = option_parser(args,options) if options["-f"].value is None: @@ -1084,7 +1063,7 @@ if __name__ == '__main__': logging.error("No input data file. Try to use the option -h for help.") sys.exit(1) - main(options) + main(options) stop = time.time() stoploc = time.localtime(stop) diff --git a/user-doc/tutorials/performance-optimization.txt b/user-doc/tutorials/performance-optimization.txt index 1b088838b27463e981cebf456b78dc661a22f8f4..c6d42929bbb7d70d7b0d01e5e634188ebd4d6922 100644 --- a/user-doc/tutorials/performance-optimization.txt +++ b/user-doc/tutorials/performance-optimization.txt @@ -4,7 +4,7 @@ \authors Giovanni Bussi \date February 19, 2019 -This document describes the PLUMED tutorial held at CINECA, Feburary 2019. +This document describes the PLUMED tutorial held at CINECA, February 2019. The aim of this tutorial is to learn how to optimize simulations performed using PLUMED. Although the presented input files are correct, the users are invited to **refer to the literature to understand how the @@ -105,7 +105,7 @@ Notice that running with DETAILED_TIMERS might slow down a bit more your simulat Each line tells you how expensive was the calculation of each collective variable. Find which is the most expensive! We will focus on it in the next points -\subsection performance-optimization-2d Optmizing coordination numbers using neighbor lists +\subsection performance-optimization-2d Optimizing coordination numbers using neighbor lists The \ref COORDINATION of multiple atoms can be very expensive for a number of reasons: - It might require the calculation of a large number of distances @@ -164,7 +164,7 @@ You will probably see that using neighbor lists is usually inconvenient unless y So far we have used PLUMED to analyze a CV on the fly. In principle, when you analyze CVs you typically only print them with a given stride (say, every 100 steps). This of course moderates significantly their cost. However, when you bias a CV you typically need to compute it at every step. -Let's say that you want to use a restraint to make sure that the Mg ion is always hexacoordinated with water. +Let's say that you want to use a restraint to make sure that the Mg ion is always six coordinated with water. This can be obtained using a \ref RESTRAINT located AT=6. 
Remove the \ref PRINT action from your input file and replace it with the following actions: \plumedfile diff --git a/user-doc/tutorials/trieste-3/do_block_histo.py b/user-doc/tutorials/trieste-3/do_block_histo.py index c17607e88b81619da208055fac2802f9fa7da10f..15da93bb99ce823f07114d6c15170f7f924cdcef 100755 --- a/user-doc/tutorials/trieste-3/do_block_histo.py +++ b/user-doc/tutorials/trieste-3/do_block_histo.py @@ -1,4 +1,3 @@ -import math import glob import numpy as np # Here are some numbers you will need to change if you run this script on grids generated in different contexts