diff --git a/patches/gromacs-2019.1.config b/patches/gromacs-2019.1.config
new file mode 100644
index 0000000000000000000000000000000000000000..4348d93fd2f9c9db2567ee332f7ba123c58b7abc
--- /dev/null
+++ b/patches/gromacs-2019.1.config
@@ -0,0 +1,33 @@
+
+
+function plumed_preliminary_test(){
+# check that the README contains the word GROMACS
+  grep -q GROMACS README 2>/dev/null
+}
+
+function plumed_patch_info(){
+cat << EOF
+PLUMED can be incorporated into gromacs using the standard patching procedure.
+Patching must be done in the gromacs root directory _before_ the cmake command is invoked.
+
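+For example, assuming the plumed executable is on your PATH (the exact
+steps may depend on your setup):
+
+cd gromacs-2019.1
+plumed patch -p -e gromacs-2019.1
+mkdir build
+cd build
+cmake ..
+make
+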
+On clusters you may want to patch gromacs using the static version of plumed; in this case,
+building gromacs can result in multiple errors. One possible solution is to configure gromacs
+with these additional options:
+
+cmake -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON
+
+To enable PLUMED in a gromacs simulation one should use
+mdrun with an extra -plumed flag. The flag can be used to
+specify the name of the PLUMED input file, e.g.:
+
+gmx mdrun -plumed plumed.dat
+
+For more information on gromacs you should visit http://www.gromacs.org
+
+EOF
+}
+
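+# hook called by the patching script before the patch is applied;
+# here it only prints the information above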
+function plumed_before_patch(){
+  plumed_patch_info
+}
+
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/CMakeLists.txt b/patches/gromacs-2019.1.diff/src/gromacs/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..80f7d30c13f328603ca75f3b37b4d1b0e97eb22c
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/CMakeLists.txt
@@ -0,0 +1,446 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+set(LIBGROMACS_SOURCES)
+
+if (GMX_CLANG_CUDA)
+    include(gmxClangCudaUtils)
+endif()
+
+set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+set_property(GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES)
+set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
+set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE)
+
+add_library(libgromacs_external OBJECT "")
+if(CMAKE_COMPILER_IS_GNUCXX)
+    # Keep quiet about e.g. linearalgebra module
+    target_compile_options(libgromacs_external PRIVATE ${CXXFLAGS_NO_STRINGOP_TRUNCATION})
+endif()
+
+add_library(libgromacs_generated OBJECT "")
+if (BUILD_SHARED_LIBS)
+    set_target_properties(libgromacs_external PROPERTIES POSITION_INDEPENDENT_CODE true)
+    set_target_properties(libgromacs_generated PROPERTIES POSITION_INDEPENDENT_CODE true)
+endif()
+
+function (_gmx_add_files_to_property PROPERTY)
+    foreach (_file ${ARGN})
+        if (IS_ABSOLUTE "${_file}")
+            set_property(GLOBAL APPEND PROPERTY ${PROPERTY} ${_file})
+        else()
+            set_property(GLOBAL APPEND PROPERTY ${PROPERTY}
+                         ${CMAKE_CURRENT_LIST_DIR}/${_file})
+        endif()
+    endforeach()
+endfunction ()
+
+function (gmx_add_libgromacs_sources)
+    _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN})
+endfunction ()
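+# Usage sketch (hypothetical file names): a module's CMakeLists.txt calls
+#   gmx_add_libgromacs_sources(foo.cpp bar.cpp)
+# to append those files (resolved to absolute paths) to the global
+# GMX_LIBGROMACS_SOURCES property collected above.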
+
+# TODO Reconsider this, as the CUDA driver API is probably a simpler
+# approach, at least for the build system. See Redmine #2530
+function (gmx_compile_cpp_as_cuda)
+    _gmx_add_files_to_property(GMX_LIBGROMACS_GPU_IMPL_SOURCES ${ARGN})
+endfunction ()
+
+function (gmx_install_headers)
+    if (NOT GMX_BUILD_MDRUN_ONLY)
+        file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR})
+        install(FILES       ${ARGN}
+                DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${_dest}"
+                COMPONENT   development)
+    endif()
+    _gmx_add_files_to_property(GMX_INSTALLED_HEADERS ${ARGN})
+endfunction ()
+
+function (gmx_write_installed_header_list)
+    get_property(_list GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
+    string(REPLACE ";" "\n" _list "${_list}")
+    # TODO: Make this only update the file timestamp if the contents actually change.
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/installed-headers.txt "${_list}")
+endfunction()
+
+add_subdirectory(gmxlib)
+add_subdirectory(mdlib)
+add_subdirectory(applied-forces)
+add_subdirectory(listed-forces)
+add_subdirectory(commandline)
+add_subdirectory(domdec)
+add_subdirectory(ewald)
+add_subdirectory(fft)
+add_subdirectory(gpu_utils)
+add_subdirectory(hardware)
+add_subdirectory(linearalgebra)
+add_subdirectory(math)
+add_subdirectory(mdrun)
+add_subdirectory(mdrunutility)
+add_subdirectory(mdtypes)
+add_subdirectory(onlinehelp)
+add_subdirectory(options)
+add_subdirectory(pbcutil)
+add_subdirectory(random)
+add_subdirectory(restraint)
+add_subdirectory(tables)
+add_subdirectory(taskassignment)
+add_subdirectory(timing)
+add_subdirectory(topology)
+add_subdirectory(trajectory)
+add_subdirectory(utility)
+add_subdirectory(fileio)
+add_subdirectory(swap)
+add_subdirectory(essentialdynamics)
+add_subdirectory(pulling)
+add_subdirectory(awh)
+add_subdirectory(simd)
+add_subdirectory(imd)
+add_subdirectory(compat)
+add_subdirectory(mimic)
+if (NOT GMX_BUILD_MDRUN_ONLY)
+    add_subdirectory(gmxana)
+    add_subdirectory(gmxpreprocess)
+    add_subdirectory(correlationfunctions)
+    add_subdirectory(statistics)
+    add_subdirectory(analysisdata)
+    add_subdirectory(selection)
+    add_subdirectory(trajectoryanalysis)
+    add_subdirectory(energyanalysis)
+    add_subdirectory(tools)
+endif()
+
+get_property(PROPERTY_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES} ${PROPERTY_SOURCES})
+
+# This would be the standard way to include thread_mpi, but
+# we want libgromacs to link the functions directly
+#if(GMX_THREAD_MPI)
+#    add_subdirectory(thread_mpi)
+#endif()
+#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB})
+
+tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src)
+target_sources(libgromacs_external PRIVATE ${THREAD_MPI_SOURCES})
+
+configure_file(version.h.cmakein version.h)
+gmx_install_headers(
+    analysisdata.h
+    commandline.h
+    options.h
+    random.h
+    selection.h
+    trajectoryanalysis.h
+    utility.h
+    ${CMAKE_CURRENT_BINARY_DIR}/version.h
+    )
+
+# This code is here instead of utility/CMakeLists.txt, because CMake
+# custom commands and source file properties can only be set in the directory
+# that contains the target that uses them.
+# TODO: Generate a header instead that can be included from baseversion.c.
+# That probably simplifies things somewhat.
+set(GENERATED_VERSION_FILE utility/baseversion-gen.cpp)
+gmx_configure_version_file(
+    utility/baseversion-gen.cpp.cmakein ${GENERATED_VERSION_FILE}
+    REMOTE_HASH
+    EXTRA_VARS
+        GMX_SOURCE_DOI
+    )
+list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE}
+     $<TARGET_OBJECTS:libgromacs_external>
+     $<TARGET_OBJECTS:libgromacs_generated>)
+
+# Mark some shared GPU implementation files to compile with CUDA if needed
+if (GMX_USE_CUDA)
+    get_property(LIBGROMACS_GPU_IMPL_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES)
+    set_source_files_properties(${LIBGROMACS_GPU_IMPL_SOURCES} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+endif()
+
+# set up CUDA compilation with clang
+if (GMX_CLANG_CUDA)
+    foreach (_file ${LIBGROMACS_SOURCES})
+        get_filename_component(_ext ${_file} EXT)
+        get_source_file_property(_cuda_source_format ${_file} CUDA_SOURCE_PROPERTY_FORMAT)
+        if ("${_ext}" STREQUAL ".cu" OR _cuda_source_format)
+            gmx_compile_cuda_file_with_clang(${_file})
+        endif()
+    endforeach()
+endif()
+
+if (GMX_USE_CUDA)
+    # Work around FindCUDA, which otherwise prevents using
+    # target_link_libraries() with keywords.
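+    # Prepending PRIVATE means the keyword-less target_link_libraries()
+    # calls inside FindCUDA effectively become keyworded ones (an
+    # assumption about FindCUDA internals, consistent with the comment above).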
+    set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
+    if (NOT GMX_CLANG_CUDA)
+        cuda_add_library(libgromacs ${LIBGROMACS_SOURCES})
+    else()
+        add_library(libgromacs ${LIBGROMACS_SOURCES})
+    endif()
+    target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES})
+else()
+    add_library(libgromacs ${LIBGROMACS_SOURCES})
+endif()
+
+if (GMX_USE_OPENCL)
+    option(GMX_EXTERNAL_CLFFT "True if an external clFFT is required to be used" FALSE)
+    mark_as_advanced(GMX_EXTERNAL_CLFFT)
+
+    # Default to using the clFFT found on the system;
+    # switch to quiet mode on the second CMake run.
+    if (DEFINED clFFT_LIBRARY)
+        set (clFFT_FIND_QUIETLY TRUE)
+    endif()
+    find_package(clFFT)
+    if (NOT clFFT_FOUND)
+        if (GMX_EXTERNAL_CLFFT)
+            message(FATAL_ERROR "Did not find required external clFFT library, consider setting clFFT_ROOT_DIR")
+        endif()
+
+        if(MSVC)
+            message(FATAL_ERROR
+"An OpenCL build was requested with Visual Studio compiler, but GROMACS
+requires clFFT, which was not found on your system. GROMACS does bundle
+clFFT to help with building for OpenCL, but that clFFT has not yet been
+ported to the more recent versions of that compiler that GROMACS itself
+requires. Thus for now, OpenCL is not available with MSVC and the internal
+build of clFFT in GROMACS 2019. Either change compiler, try installing
+a clFFT package, or use the latest GROMACS 2018 point release.")
+        endif()
+
+        # Fall back on the internal version
+        set (_clFFT_dir ../external/clFFT/src)
+        add_subdirectory(${_clFFT_dir} clFFT-build)
+        target_sources(libgromacs PRIVATE
+            $<TARGET_OBJECTS:clFFT>
+        )
+        target_include_directories(libgromacs SYSTEM PRIVATE ${_clFFT_dir}/include)
+        # Use the magic variable that says how to link any library needed
+        # for dlopen, etc., which is -ldl where needed and empty otherwise
+        # (e.g. Windows, BSD, Mac).
+        target_link_libraries(libgromacs PRIVATE "${CMAKE_DL_LIBS}")
+    else()
+        target_link_libraries(libgromacs PRIVATE clFFT)
+    endif()
+endif()
+
+# Recent versions of gcc and clang give warnings on scanner.cpp, which
+# is a generated source file. These are awkward to suppress inline, so
+# we do it in the compilation command (after testing that the compiler
+# supports the suppressions). The same issue exists for the nonbonded
+# kernels, so we suppress the warnings for all generated files.
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-Wno-unused -Wno-unused-parameter" HAS_NO_UNUSED)
+check_cxx_compiler_flag(-Wno-missing-declarations HAS_NO_MISSING_DECL)
+check_cxx_compiler_flag(-Wno-missing-prototypes HAS_NO_MISSING_PROTO)
+check_cxx_compiler_flag(/wd4101 HAS_NO_MSVC_UNUSED)
+if (NOT MSVC)
+    check_cxx_compiler_flag(-wd1419 HAS_DECL_IN_SOURCE)
+endif()
+if (HAS_NO_UNUSED)
+    target_compile_options(libgromacs_generated PRIVATE "-Wno-unused;-Wno-unused-parameter")
+endif()
+if (HAS_NO_MISSING_DECL)
+    target_compile_options(libgromacs_generated PRIVATE "-Wno-missing-declarations")
+endif()
+# TODO The group scheme kernels don't use proper function prototype
+# declarations, and clang warns about such use, which we suppress
+# rather than fix. We would prefer to use no suppressions. However
+# other compilers do not support such a warning suppression for C++
+# source files, and issue warnings about that. Remove the use of
+# -Wno-missing-prototypes here and above when the group scheme is
+# removed.
+if (HAS_NO_MISSING_PROTO AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+    target_compile_options(libgromacs_generated PRIVATE "-Wno-missing-prototypes")
+endif()
+if (HAS_NO_MSVC_UNUSED)
+    target_compile_options(libgromacs_generated PRIVATE "/wd4101")
+endif()
+if (HAS_DECL_IN_SOURCE)
+    target_compile_options(libgromacs_generated PRIVATE "-wd1419")
+endif()
+
+if(SIMD_AVX_512_CXX_SUPPORTED AND NOT ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512_KNL"))
+    # Since we might be overriding -march=core-avx2, add a flag so we don't warn for this specific file.
+    # On KNL this could cause an illegal instruction, because the compiler might use non-KNL AVX
+    # instructions with the SIMD_AVX_512_CXX_FLAGS flags.
+    set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS} ${CXX_NO_UNUSED_OPTION_WARNING_FLAGS}")
+endif()
+
+gmx_setup_tng_for_libgromacs()
+
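+# PLUMED: ${PLUMED_LOAD} below adds the PLUMED library (or its runtime
+# loader) to the public link interface; it is defined by the PLUMED
+# patching procedure (typically via a generated Plumed.cmake/Plumed.inc).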
+target_link_libraries(libgromacs
+                      PRIVATE
+                      ${EXTRAE_LIBRARIES}
+                      ${GMX_EXTRA_LIBRARIES}
+                      ${GMX_COMMON_LIBRARIES}
+                      ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
+                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}
+                      ${OpenCL_LIBRARIES}
+                      ${GMX_STDLIB_LIBRARIES}
+                      PUBLIC
+                      ${GMX_PUBLIC_LIBRARIES}
+                      ${PLUMED_LOAD}
+                      )
+set_target_properties(libgromacs PROPERTIES
+                      OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
+                      SOVERSION ${LIBRARY_SOVERSION_MAJOR}
+                      VERSION ${LIBRARY_VERSION}
+                      COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+
+gmx_manage_lmfit()
+target_link_libraries(libgromacs PRIVATE lmfit)
+
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^6\.0")
+   target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Weverything ${IGNORED_CLANG_ALL_WARNINGS}>)
+endif()
+if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+   target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/analyze /analyze:stacksize 70000
+     #Control flow warnings are disabled because the command line output is insufficient. There is no tool
+     #to convert the xml report to e.g. HTML, and even in Visual Studio the viewer doesn't work with cmake support.
+     /wd6001  #uninitialized memory
+     /wd6011  #dereferencing NULL
+     /wd6053  #prior call does not zero-terminate
+     /wd6054  #might not be zero-terminated
+     /wd6385  #reading invalid data
+     /wd6386  #buffer overrun
+     /wd6387  #could be '0'
+     /wd28199 #uninitialized memory
+     # For compile-time constants (e.g. templates) the following warnings have false positives
+     /wd6239  #(<non-zero> && <expr>)
+     /wd6240  #(<expr> && <non-zero>)
+     /wd6294  #Ill-defined for-loop
+     /wd6326  #comparison of constant with other constant
+     /wd28020 #expression involving parameter is not true
+     # Misc
+     /wd6330  #incorrect type to function (warns for char (instead of unsigned) for isspace/isalpha/isdigit/..))
+     /wd6993  #OpenMP ignored
+     #TODO
+     /wd6031  #return value ignored (important - mostly warnings about sscanf)
+     /wd6244  #hides declaration (known issue - we ignore similar warnings for other compilers)
+     /wd6246  #hides declaration
+     >
+   )
+endif()
+
+if (GMX_CLANG_TIDY)
+   set_target_properties(libgromacs PROPERTIES CXX_CLANG_TIDY
+       "${CLANG_TIDY_EXE};-warnings-as-errors=*")
+endif()
+
+gmx_write_installed_header_list()
+
+# Only install the library in mdrun-only mode if it is actually necessary
+# for the binary
+if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS)
+    install(TARGETS libgromacs
+            EXPORT libgromacs
+            LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+            RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+            ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+            COMPONENT libraries)
+endif()
+
+if (NOT GMX_BUILD_MDRUN_ONLY)
+    include(InstallLibInfo.cmake)
+endif()
+
+# Technically, the user could want to do this for an OpenCL build
+# using the CUDA runtime, but currently there's no reason to want to
+# do that.
+if (INSTALL_CUDART_LIB) #can be set manually by the user
+    if (GMX_USE_CUDA)
+        foreach(CUDA_LIB ${CUDA_LIBRARIES})
+            string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
+            if(IS_CUDART) #libcuda should not be installed
+                #also install name-links (the linker uses those)
+                file(GLOB CUDA_LIBS ${CUDA_LIB}*)
+                install(FILES ${CUDA_LIBS} DESTINATION
+                    ${CMAKE_INSTALL_LIBDIR} COMPONENT libraries)
+            endif()
+        endforeach()
+    else()
+        message(WARNING "INSTALL_CUDART_LIB only makes sense when configuring for CUDA support")
+    endif()
+endif()
+
+if(GMX_USE_OPENCL)
+    # Install the utility headers
+    file(GLOB OPENCL_INSTALLED_FILES
+        gpu_utils/vectype_ops.clh
+        gpu_utils/device_utils.clh
+        )
+    install(FILES ${OPENCL_INSTALLED_FILES}
+        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/gpu_utils
+        COMPONENT libraries)
+    file(GLOB OPENCL_INSTALLED_FILES
+        pbcutil/ishift.h
+        )
+    install(FILES ${OPENCL_INSTALLED_FILES}
+        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/pbcutil
+        COMPONENT libraries)
+
+    # Install the NB source and headers
+    file(GLOB OPENCL_INSTALLED_FILES
+        mdlib/nbnxn_consts.h
+        )
+    install(FILES ${OPENCL_INSTALLED_FILES}
+        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib
+        COMPONENT libraries)
+    file(GLOB OPENCL_INSTALLED_FILES
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_consts.h
+        )
+    install(FILES ${OPENCL_INSTALLED_FILES}
+        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib/nbnxn_ocl
+        COMPONENT libraries)
+
+    # Install the PME source and headers
+    file(GLOB OPENCL_INSTALLED_FILES
+        ewald/pme-spread.clh
+        ewald/pme-solve.clh
+        ewald/pme-gather.clh
+        ewald/pme-gpu-utils.clh
+        ewald/pme-program.cl
+        ewald/pme-gpu-types.h
+        )
+    install(FILES ${OPENCL_INSTALLED_FILES}
+        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/ewald
+        COMPONENT libraries)
+endif()
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/CMakeLists.txt.preplumed b/patches/gromacs-2019.1.diff/src/gromacs/CMakeLists.txt.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..f94af553a91047274f879856836782856d62abcb
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/CMakeLists.txt.preplumed
@@ -0,0 +1,445 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+set(LIBGROMACS_SOURCES)
+
+if (GMX_CLANG_CUDA)
+    include(gmxClangCudaUtils)
+endif()
+
+set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+set_property(GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES)
+set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
+set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE)
+
+add_library(libgromacs_external OBJECT "")
+if(CMAKE_COMPILER_IS_GNUCXX)
+    # Keep quiet about e.g. linearalgebra module
+    target_compile_options(libgromacs_external PRIVATE ${CXXFLAGS_NO_STRINGOP_TRUNCATION})
+endif()
+
+add_library(libgromacs_generated OBJECT "")
+if (BUILD_SHARED_LIBS)
+    set_target_properties(libgromacs_external PROPERTIES POSITION_INDEPENDENT_CODE true)
+    set_target_properties(libgromacs_generated PROPERTIES POSITION_INDEPENDENT_CODE true)
+endif()
+
+function (_gmx_add_files_to_property PROPERTY)
+    foreach (_file ${ARGN})
+        if (IS_ABSOLUTE "${_file}")
+            set_property(GLOBAL APPEND PROPERTY ${PROPERTY} ${_file})
+        else()
+            set_property(GLOBAL APPEND PROPERTY ${PROPERTY}
+                         ${CMAKE_CURRENT_LIST_DIR}/${_file})
+        endif()
+    endforeach()
+endfunction ()
+
+function (gmx_add_libgromacs_sources)
+    _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN})
+endfunction ()
+
+# TODO Reconsider this, as the CUDA driver API is probably a simpler
+# approach, at least for the build system. See Redmine #2530
+function (gmx_compile_cpp_as_cuda)
+    _gmx_add_files_to_property(GMX_LIBGROMACS_GPU_IMPL_SOURCES ${ARGN})
+endfunction ()
+
+function (gmx_install_headers)
+    if (NOT GMX_BUILD_MDRUN_ONLY)
+        file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR})
+        install(FILES       ${ARGN}
+                DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${_dest}"
+                COMPONENT   development)
+    endif()
+    _gmx_add_files_to_property(GMX_INSTALLED_HEADERS ${ARGN})
+endfunction ()
+
+function (gmx_write_installed_header_list)
+    get_property(_list GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
+    string(REPLACE ";" "\n" _list "${_list}")
+    # TODO: Make this only update the file timestamp if the contents actually change.
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/installed-headers.txt "${_list}")
+endfunction()
+
+add_subdirectory(gmxlib)
+add_subdirectory(mdlib)
+add_subdirectory(applied-forces)
+add_subdirectory(listed-forces)
+add_subdirectory(commandline)
+add_subdirectory(domdec)
+add_subdirectory(ewald)
+add_subdirectory(fft)
+add_subdirectory(gpu_utils)
+add_subdirectory(hardware)
+add_subdirectory(linearalgebra)
+add_subdirectory(math)
+add_subdirectory(mdrun)
+add_subdirectory(mdrunutility)
+add_subdirectory(mdtypes)
+add_subdirectory(onlinehelp)
+add_subdirectory(options)
+add_subdirectory(pbcutil)
+add_subdirectory(random)
+add_subdirectory(restraint)
+add_subdirectory(tables)
+add_subdirectory(taskassignment)
+add_subdirectory(timing)
+add_subdirectory(topology)
+add_subdirectory(trajectory)
+add_subdirectory(utility)
+add_subdirectory(fileio)
+add_subdirectory(swap)
+add_subdirectory(essentialdynamics)
+add_subdirectory(pulling)
+add_subdirectory(awh)
+add_subdirectory(simd)
+add_subdirectory(imd)
+add_subdirectory(compat)
+add_subdirectory(mimic)
+if (NOT GMX_BUILD_MDRUN_ONLY)
+    add_subdirectory(gmxana)
+    add_subdirectory(gmxpreprocess)
+    add_subdirectory(correlationfunctions)
+    add_subdirectory(statistics)
+    add_subdirectory(analysisdata)
+    add_subdirectory(selection)
+    add_subdirectory(trajectoryanalysis)
+    add_subdirectory(energyanalysis)
+    add_subdirectory(tools)
+endif()
+
+get_property(PROPERTY_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES} ${PROPERTY_SOURCES})
+
+# This would be the standard way to include thread_mpi, but
+# we want libgromacs to link the functions directly
+#if(GMX_THREAD_MPI)
+#    add_subdirectory(thread_mpi)
+#endif()
+#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB})
+
+tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src)
+target_sources(libgromacs_external PRIVATE ${THREAD_MPI_SOURCES})
+
+configure_file(version.h.cmakein version.h)
+gmx_install_headers(
+    analysisdata.h
+    commandline.h
+    options.h
+    random.h
+    selection.h
+    trajectoryanalysis.h
+    utility.h
+    ${CMAKE_CURRENT_BINARY_DIR}/version.h
+    )
+
+# This code is here instead of utility/CMakeLists.txt, because CMake
+# custom commands and source file properties can only be set in the directory
+# that contains the target that uses them.
+# TODO: Generate a header instead that can be included from baseversion.c.
+# That probably simplifies things somewhat.
+set(GENERATED_VERSION_FILE utility/baseversion-gen.cpp)
+gmx_configure_version_file(
+    utility/baseversion-gen.cpp.cmakein ${GENERATED_VERSION_FILE}
+    REMOTE_HASH
+    EXTRA_VARS
+        GMX_SOURCE_DOI
+    )
+list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE}
+     $<TARGET_OBJECTS:libgromacs_external>
+     $<TARGET_OBJECTS:libgromacs_generated>)
+
+# Mark some shared GPU implementation files to compile with CUDA if needed
+if (GMX_USE_CUDA)
+    get_property(LIBGROMACS_GPU_IMPL_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES)
+    set_source_files_properties(${LIBGROMACS_GPU_IMPL_SOURCES} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+endif()
+
+# set up CUDA compilation with clang
+if (GMX_CLANG_CUDA)
+    foreach (_file ${LIBGROMACS_SOURCES})
+        get_filename_component(_ext ${_file} EXT)
+        get_source_file_property(_cuda_source_format ${_file} CUDA_SOURCE_PROPERTY_FORMAT)
+        if ("${_ext}" STREQUAL ".cu" OR _cuda_source_format)
+            gmx_compile_cuda_file_with_clang(${_file})
+        endif()
+    endforeach()
+endif()
+
+if (GMX_USE_CUDA)
+    # Work around FindCUDA, which otherwise prevents using
+    # target_link_libraries() with keywords.
+    set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
+    if (NOT GMX_CLANG_CUDA)
+        cuda_add_library(libgromacs ${LIBGROMACS_SOURCES})
+    else()
+        add_library(libgromacs ${LIBGROMACS_SOURCES})
+    endif()
+    target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES})
+else()
+    add_library(libgromacs ${LIBGROMACS_SOURCES})
+endif()
+
+if (GMX_USE_OPENCL)
+    option(GMX_EXTERNAL_CLFFT "True if an external clFFT is required to be used" FALSE)
+    mark_as_advanced(GMX_EXTERNAL_CLFFT)
+
+    # Default to using the clFFT found on the system;
+    # switch to quiet mode on the second CMake run.
+    if (DEFINED clFFT_LIBRARY)
+        set (clFFT_FIND_QUIETLY TRUE)
+    endif()
+    find_package(clFFT)
+    if (NOT clFFT_FOUND)
+        if (GMX_EXTERNAL_CLFFT)
+            message(FATAL_ERROR "Did not find required external clFFT library, consider setting clFFT_ROOT_DIR")
+        endif()
+
+        if(MSVC)
+            message(FATAL_ERROR
+"An OpenCL build was requested with Visual Studio compiler, but GROMACS
+requires clFFT, which was not found on your system. GROMACS does bundle
+clFFT to help with building for OpenCL, but that clFFT has not yet been
+ported to the more recent versions of that compiler that GROMACS itself
+requires. Thus for now, OpenCL is not available with MSVC and the internal
+build of clFFT in GROMACS 2019. Either change compiler, try installing
+a clFFT package, or use the latest GROMACS 2018 point release.")
+        endif()
+
+        # Fall back on the internal version
+        set (_clFFT_dir ../external/clFFT/src)
+        add_subdirectory(${_clFFT_dir} clFFT-build)
+        target_sources(libgromacs PRIVATE
+            $<TARGET_OBJECTS:clFFT>
+        )
+        target_include_directories(libgromacs SYSTEM PRIVATE ${_clFFT_dir}/include)
+        # Use the magic variable that says how to link any library needed
+        # for dlopen, etc., which is -ldl where needed and empty otherwise
+        # (e.g. Windows, BSD, Mac).
+        target_link_libraries(libgromacs PRIVATE "${CMAKE_DL_LIBS}")
+    else()
+        target_link_libraries(libgromacs PRIVATE clFFT)
+    endif()
+endif()
+
+# Recent versions of gcc and clang give warnings on scanner.cpp, which
+# is a generated source file. These are awkward to suppress inline, so
+# we do it in the compilation command (after testing that the compiler
+# supports the suppressions). The same issue exists for the nonbonded
+# kernels, so we suppress the warnings for all generated files.
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-Wno-unused -Wno-unused-parameter" HAS_NO_UNUSED)
+check_cxx_compiler_flag(-Wno-missing-declarations HAS_NO_MISSING_DECL)
+check_cxx_compiler_flag(-Wno-missing-prototypes HAS_NO_MISSING_PROTO)
+check_cxx_compiler_flag(/wd4101 HAS_NO_MSVC_UNUSED)
+if (NOT MSVC)
+    check_cxx_compiler_flag(-wd1419 HAS_DECL_IN_SOURCE)
+endif()
+if (HAS_NO_UNUSED)
+    target_compile_options(libgromacs_generated PRIVATE "-Wno-unused;-Wno-unused-parameter")
+endif()
+if (HAS_NO_MISSING_DECL)
+    target_compile_options(libgromacs_generated PRIVATE "-Wno-missing-declarations")
+endif()
+# TODO The group scheme kernels don't use proper function prototype
+# declarations, and clang warns about such use, which we suppress
+# rather than fix. We would prefer to use no suppressions. However
+# other compilers do not support such a warning suppression for C++
+# source files, and issue warnings about that. Remove the use of
+# -Wno-missing-prototypes here and above when the group scheme is
+# removed.
+if (HAS_NO_MISSING_PROTO AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+    target_compile_options(libgromacs_generated PRIVATE "-Wno-missing-prototypes")
+endif()
+if (HAS_NO_MSVC_UNUSED)
+    target_compile_options(libgromacs_generated PRIVATE "/wd4101")
+endif()
+if (HAS_DECL_IN_SOURCE)
+    target_compile_options(libgromacs_generated PRIVATE "-wd1419")
+endif()
+
+if(SIMD_AVX_512_CXX_SUPPORTED AND NOT ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512_KNL"))
+    # Since we might be overriding -march=core-avx2, add a flag so we don't warn for this specific file.
+    # On KNL this could cause an illegal instruction, because the compiler might use non-KNL AVX
+    # instructions with the SIMD_AVX_512_CXX_FLAGS flags.
+    set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS} ${CXX_NO_UNUSED_OPTION_WARNING_FLAGS}")
+endif()
+
+gmx_setup_tng_for_libgromacs()
+
+target_link_libraries(libgromacs
+                      PRIVATE
+                      ${EXTRAE_LIBRARIES}
+                      ${GMX_EXTRA_LIBRARIES}
+                      ${GMX_COMMON_LIBRARIES}
+                      ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
+                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}
+                      ${OpenCL_LIBRARIES}
+                      ${GMX_STDLIB_LIBRARIES}
+                      PUBLIC
+                      ${GMX_PUBLIC_LIBRARIES}
+                      )
+set_target_properties(libgromacs PROPERTIES
+                      OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
+                      SOVERSION ${LIBRARY_SOVERSION_MAJOR}
+                      VERSION ${LIBRARY_VERSION}
+                      COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+
+gmx_manage_lmfit()
+target_link_libraries(libgromacs PRIVATE lmfit)
+
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^6\.0")
+   target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Weverything ${IGNORED_CLANG_ALL_WARNINGS}>)
+endif()
+if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+   target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/analyze /analyze:stacksize 70000
+     #Control flow warnings are disabled because the command line output is insufficient. There is no tool
+     #to convert the xml report to e.g. HTML, and even in Visual Studio the viewer doesn't work with cmake support.
+     /wd6001  #uninitialized memory
+     /wd6011  #dereferencing NULL
+     /wd6053  #prior call does not zero-terminate
+     /wd6054  #might not be zero-terminated
+     /wd6385  #reading invalid data
+     /wd6386  #buffer overrun
+     /wd6387  #could be '0'
+     /wd28199 #uninitialized memory
+     # For compile-time constants (e.g. templates) the following warnings have false positives
+     /wd6239  #(<non-zero> && <expr>)
+     /wd6240  #(<expr> && <non-zero>)
+     /wd6294  #Ill-defined for-loop
+     /wd6326  #comparison of constant with other constant
+     /wd28020 #expression involving parameter is not true
+     # Misc
+     /wd6330  #incorrect type to function (warns for char (instead of unsigned) for isspace/isalpha/isdigit/..))
+     /wd6993  #OpenMP ignored
+     #TODO
+     /wd6031  #return value ignored (important - mostly warnings about sscanf)
+     /wd6244  #hides declaration (known issue - we ignore similar warnings for other compilers)
+     /wd6246  #hides declaration
+     >
+   )
+endif()
+
+if (GMX_CLANG_TIDY)
+   set_target_properties(libgromacs PROPERTIES CXX_CLANG_TIDY
+       "${CLANG_TIDY_EXE};-warnings-as-errors=*")
+endif()
+
+gmx_write_installed_header_list()
+
+# Only install the library in mdrun-only mode if it is actually necessary
+# for the binary
+if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS)
+    install(TARGETS libgromacs
+            EXPORT libgromacs
+            LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+            RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+            ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+            COMPONENT libraries)
+endif()
+
+if (NOT GMX_BUILD_MDRUN_ONLY)
+    include(InstallLibInfo.cmake)
+endif()
+
+# Technically, the user could want to do this for an OpenCL build
+# using the CUDA runtime, but currently there's no reason to want to
+# do that.
+if (INSTALL_CUDART_LIB) #can be set manually by the user
+    if (GMX_USE_CUDA)
+        foreach(CUDA_LIB ${CUDA_LIBRARIES})
+            string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
+            if(IS_CUDART) #libcuda should not be installed
+                #also install name-links (the linker uses those)
+                file(GLOB CUDA_LIBS ${CUDA_LIB}*)
+                install(FILES ${CUDA_LIBS} DESTINATION
+                    ${CMAKE_INSTALL_LIBDIR} COMPONENT libraries)
+            endif()
+        endforeach()
+    else()
+        message(WARNING "INSTALL_CUDART_LIB only makes sense when configuring for CUDA support")
+    endif()
+endif()
+
+if(GMX_USE_OPENCL)
+    # Install the utility headers
+    file(GLOB OPENCL_INSTALLED_FILES
+        gpu_utils/vectype_ops.clh
+        gpu_utils/device_utils.clh
+        )
+    install(FILES ${OPENCL_INSTALLED_FILES}
+        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/gpu_utils
+        COMPONENT libraries)
+    file(GLOB OPENCL_INSTALLED_FILES
+        pbcutil/ishift.h
+        )
+    install(FILES ${OPENCL_INSTALLED_FILES}
+        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/pbcutil
+        COMPONENT libraries)
+
+    # Install the NB source and headers
+    file(GLOB OPENCL_INSTALLED_FILES
+        mdlib/nbnxn_consts.h
+        )
+    install(FILES ${OPENCL_INSTALLED_FILES}
+        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib
+        COMPONENT libraries)
+    file(GLOB OPENCL_INSTALLED_FILES
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
+        mdlib/nbnxn_ocl/nbnxn_ocl_consts.h
+        )
+    install(FILES ${OPENCL_INSTALLED_FILES}
+        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib/nbnxn_ocl
+        COMPONENT libraries)
+
+    # Install the PME source and headers
+    file(GLOB OPENCL_INSTALLED_FILES
+        ewald/pme-spread.clh
+        ewald/pme-solve.clh
+        ewald/pme-gather.clh
+        ewald/pme-gpu-utils.clh
+        ewald/pme-program.cl
+        ewald/pme-gpu-types.h
+        )
+    install(FILES ${OPENCL_INSTALLED_FILES}
+        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/ewald
+        COMPONENT libraries)
+endif()
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdlib/force.cpp b/patches/gromacs-2019.1.diff/src/gromacs/mdlib/force.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..32bbd0cc08ab73c71ffa8cbfba8f0cdb32d8224e
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdlib/force.cpp
@@ -0,0 +1,879 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "force.h"
+
+#include "config.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstring>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/ewald/ewald.h"
+#include "gromacs/ewald/long-range-correction.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gmxlib/nonbonded/nonbonded.h"
+#include "gromacs/listed-forces/listed-forces.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vecdump.h"
+#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/forcerec-threading.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/qmmm.h"
+#include "gromacs/mdlib/rf_util.h"
+#include "gromacs/mdlib/wall.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/enerdata.h"
+#include "gromacs/mdtypes/forceoutput.h"
+#include "gromacs/mdtypes/forcerec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+/* PLUMED */
+#include "../../../Plumed.h"
+int    plumedswitch=0;
+plumed plumedmain;
+void(*plumedcmd)(plumed,const char*,const void*)=NULL;
+/* END PLUMED */
+
+void ns(FILE               *fp,
+        t_forcerec         *fr,
+        matrix              box,
+        const gmx_groups_t *groups,
+        gmx_localtop_t     *top,
+        const t_mdatoms    *md,
+        const t_commrec    *cr,
+        t_nrnb             *nrnb,
+        gmx_bool            bFillGrid)
+{
+    int     nsearch;
+
+
+    if (!fr->ns->nblist_initialized)
+    {
+        init_neighbor_list(fp, fr, md->homenr);
+    }
+
+    nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md,
+                                bFillGrid);
+    if (debug)
+    {
+        fprintf(debug, "nsearch = %d\n", nsearch);
+    }
+
+    /* Check whether we have to do dynamic load balancing */
+    /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0))
+       count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr,
+       &(top->idef),opts->ngener);
+     */
+    if (fr->ns->dump_nl > 0)
+    {
+        dump_nblist(fp, cr, fr, fr->ns->dump_nl);
+    }
+}
+
+static void clearEwaldThreadOutput(ewald_corr_thread_t *ewc_t)
+{
+    ewc_t->Vcorr_q        = 0;
+    ewc_t->Vcorr_lj       = 0;
+    ewc_t->dvdl[efptCOUL] = 0;
+    ewc_t->dvdl[efptVDW]  = 0;
+    clear_mat(ewc_t->vir_q);
+    clear_mat(ewc_t->vir_lj);
+}
+
+static void reduceEwaldThreadOuput(int nthreads, ewald_corr_thread_t *ewc_t)
+{
+    ewald_corr_thread_t &dest = ewc_t[0];
+
+    for (int t = 1; t < nthreads; t++)
+    {
+        dest.Vcorr_q        += ewc_t[t].Vcorr_q;
+        dest.Vcorr_lj       += ewc_t[t].Vcorr_lj;
+        dest.dvdl[efptCOUL] += ewc_t[t].dvdl[efptCOUL];
+        dest.dvdl[efptVDW]  += ewc_t[t].dvdl[efptVDW];
+        m_add(dest.vir_q,  ewc_t[t].vir_q,  dest.vir_q);
+        m_add(dest.vir_lj, ewc_t[t].vir_lj, dest.vir_lj);
+    }
+}
+
+void do_force_lowlevel(t_forcerec           *fr,
+                       const t_inputrec     *ir,
+                       const t_idef         *idef,
+                       const t_commrec      *cr,
+                       const gmx_multisim_t *ms,
+                       t_nrnb               *nrnb,
+                       gmx_wallcycle_t       wcycle,
+                       const t_mdatoms      *md,
+                       rvec                  x[],
+                       history_t            *hist,
+                       rvec                 *forceForUseWithShiftForces,
+                       gmx::ForceWithVirial *forceWithVirial,
+                       gmx_enerdata_t       *enerd,
+                       t_fcdata             *fcd,
+                       matrix                box,
+                       t_lambda             *fepvals,
+                       real                 *lambda,
+                       const t_graph        *graph,
+                       const t_blocka       *excl,
+                       rvec                  mu_tot[],
+                       int                   flags,
+                       float                *cycles_pme)
+{
+    int         i, j;
+    int         donb_flags;
+    int         pme_flags;
+    t_pbc       pbc;
+    real        dvdl_dum[efptNR], dvdl_nb[efptNR];
+
+#if GMX_MPI
+    double  t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */
+#endif
+
+    set_pbc(&pbc, fr->ePBC, box);
+
+    /* reset free energy components */
+    for (i = 0; i < efptNR; i++)
+    {
+        dvdl_nb[i]  = 0;
+        dvdl_dum[i] = 0;
+    }
+
+    /* do QMMM first if requested */
+    if (fr->bQMMM)
+    {
+        enerd->term[F_EQM] = calculate_QMMM(cr, forceForUseWithShiftForces, fr);
+    }
+
+    /* Call the short range functions all in one go. */
+
+#if GMX_MPI
+    /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/
+#define TAKETIME FALSE
+    if (TAKETIME)
+    {
+        MPI_Barrier(cr->mpi_comm_mygroup);
+        t0 = MPI_Wtime();
+    }
+#endif
+
+    if (ir->nwall)
+    {
+        /* foreign lambda component for walls */
+        real dvdl_walls = do_walls(*ir, *fr, box, *md, x,
+                                   forceWithVirial, lambda[efptVDW],
+                                   enerd->grpp.ener[egLJSR], nrnb);
+        enerd->dvdl_lin[efptVDW] += dvdl_walls;
+    }
+
+    /* We only do the non-bonded calculation with the group scheme here;
+     * the Verlet calls are done from do_force_cutsVERLET(). */
+    if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED))
+    {
+        donb_flags = 0;
+        /* Add short-range interactions */
+        donb_flags |= GMX_NONBONDED_DO_SR;
+
+        /* Currently all group scheme kernels always calculate (shift-)forces */
+        if (flags & GMX_FORCE_FORCES)
+        {
+            donb_flags |= GMX_NONBONDED_DO_FORCE;
+        }
+        if (flags & GMX_FORCE_VIRIAL)
+        {
+            donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
+        }
+        if (flags & GMX_FORCE_ENERGY)
+        {
+            donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
+        }
+
+        wallcycle_sub_start(wcycle, ewcsNONBONDED);
+        do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl,
+                     &enerd->grpp, nrnb,
+                     lambda, dvdl_nb, -1, -1, donb_flags);
+
+        /* If we do foreign lambdas and have soft-core interactions,
+         * we have to recalculate the (non-linear) energy contributions.
+         */
+        if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
+        {
+            for (i = 0; i < enerd->n_lambda; i++)
+            {
+                real lam_i[efptNR];
+
+                for (j = 0; j < efptNR; j++)
+                {
+                    lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
+                }
+                reset_foreign_enerdata(enerd);
+                do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl,
+                             &(enerd->foreign_grpp), nrnb,
+                             lam_i, dvdl_dum, -1, -1,
+                             (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA);
+                sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
+                enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
+            }
+        }
+        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+    }
+
+#if GMX_MPI
+    if (TAKETIME)
+    {
+        t1          = MPI_Wtime();
+        fr->t_fnbf += t1-t0;
+    }
+#endif
+
+    if (fepvals->sc_alpha != 0)
+    {
+        enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
+    }
+    else
+    {
+        enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
+    }
+
+    if (fepvals->sc_alpha != 0)
+
+    /* Even though the Coulomb part is linear, we have already added it,
+       because we need to go through the vdw calculation anyway */
+    {
+        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+    else
+    {
+        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+
+    if (debug)
+    {
+        pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS);
+    }
+
+    /* Shift the coordinates. Must be done before listed forces and PPPM,
+     * but is also necessary for SHAKE and update, therefore it can NOT
+     * go when no listed forces have to be evaluated.
+     *
+     * The shifting and PBC code is deliberately not timed, since with
+     * the Verlet scheme it only takes non-zero time with triclinic
+     * boxes, and even then the time is around a factor of 100 less
+     * than the next smallest counter.
+     */
+
+
+    /* Here sometimes we would not need to shift with NBFonly,
+     * but we do so anyhow for consistency of the returned coordinates.
+     */
+    if (graph)
+    {
+        shift_self(graph, box, x);
+        if (TRICLINIC(box))
+        {
+            inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
+        }
+        else
+        {
+            inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
+        }
+    }
+    /* Check whether we need to do listed interactions or correct for exclusions */
+    if (fr->bMolPBC &&
+        ((flags & GMX_FORCE_LISTED)
+         || EEL_RF(fr->ic->eeltype) || EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)))
+    {
+        /* TODO There are no electrostatics methods that require this
+           transformation, when using the Verlet scheme, so update the
+           above conditional. */
+        /* Since all atoms are in the rectangular or triclinic unit-cell,
+         * only single box vector shifts (2 in x) are required.
+         */
+        set_pbc_dd(&pbc, fr->ePBC, DOMAINDECOMP(cr) ? cr->dd->nc : nullptr,
+                   TRUE, box);
+    }
+
+    do_force_listed(wcycle, box, ir->fepvals, cr, ms,
+                    idef, x, hist,
+                    forceForUseWithShiftForces, forceWithVirial,
+                    fr, &pbc, graph, enerd, nrnb, lambda, md, fcd,
+                    DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr,
+                    flags);
+
+
+    *cycles_pme = 0;
+
+    /* Do long-range electrostatics and/or LJ-PME, including related short-range
+     * corrections.
+     */
+    if (EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
+    {
+        int  status            = 0;
+        real Vlr_q             = 0, Vlr_lj = 0;
+
+        /* We reduce all virial, dV/dlambda and energy contributions, except
+         * for the reciprocal energies (Vlr_q, Vlr_lj) into the same struct.
+         */
+        ewald_corr_thread_t &ewaldOutput = fr->ewc_t[0];
+        clearEwaldThreadOutput(&ewaldOutput);
+
+        if (EEL_PME_EWALD(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
+        {
+            /* With the Verlet scheme exclusion forces are calculated
+             * in the non-bonded kernel.
+             */
+            /* The TPI molecule does not have exclusions with the rest
+             * of the system and no intra-molecular PME grid
+             * contributions will be calculated in
+             * gmx_pme_calc_energy.
+             */
+            if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) ||
+                ir->ewald_geometry != eewg3D ||
+                ir->epsilon_surface != 0)
+            {
+                int nthreads, t;
+
+                wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION);
+
+                if (fr->n_tpi > 0)
+                {
+                    gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions");
+                }
+
+                nthreads = fr->nthread_ewc;
+#pragma omp parallel for num_threads(nthreads) schedule(static)
+                for (t = 0; t < nthreads; t++)
+                {
+                    try
+                    {
+                        ewald_corr_thread_t &ewc_t = fr->ewc_t[t];
+                        if (t > 0)
+                        {
+                            clearEwaldThreadOutput(&ewc_t);
+                        }
+
+                        /* Threading is only supported with the Verlet cut-off
+                         * scheme and then only single particle forces (no
+                         * exclusion forces) are calculated, so we can store
+                         * the forces in the normal, single forceWithVirial->force_ array.
+                         */
+                        ewald_LRcorrection(md->homenr, cr, nthreads, t, fr, ir,
+                                           md->chargeA, md->chargeB,
+                                           md->sqrt_c6A, md->sqrt_c6B,
+                                           md->sigmaA, md->sigmaB,
+                                           md->sigma3A, md->sigma3B,
+                                           (md->nChargePerturbed != 0) || (md->nTypePerturbed != 0),
+                                           ir->cutoff_scheme != ecutsVERLET,
+                                           excl, x, box, mu_tot,
+                                           ir->ewald_geometry,
+                                           ir->epsilon_surface,
+                                           as_rvec_array(forceWithVirial->force_.data()),
+                                           ewc_t.vir_q, ewc_t.vir_lj,
+                                           &ewc_t.Vcorr_q, &ewc_t.Vcorr_lj,
+                                           lambda[efptCOUL], lambda[efptVDW],
+                                           &ewc_t.dvdl[efptCOUL], &ewc_t.dvdl[efptVDW]);
+                    }
+                    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+                }
+                if (nthreads > 1)
+                {
+                    reduceEwaldThreadOuput(nthreads, fr->ewc_t);
+                }
+                wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
+            }
+
+            if (EEL_PME_EWALD(fr->ic->eeltype) && fr->n_tpi == 0)
+            {
+                /* This is not in a subcounter because it takes a
+                   negligible and constant-sized amount of time */
+                ewaldOutput.Vcorr_q +=
+                    ewald_charge_correction(cr, fr, lambda[efptCOUL], box,
+                                            &ewaldOutput.dvdl[efptCOUL],
+                                            ewaldOutput.vir_q);
+            }
+
+            if ((EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) &&
+                thisRankHasDuty(cr, DUTY_PME) && (pme_run_mode(fr->pmedata) == PmeRunMode::CPU))
+            {
+                /* Do reciprocal PME for Coulomb and/or LJ. */
+                assert(fr->n_tpi >= 0);
+                if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED))
+                {
+                    pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE;
+
+                    if (flags & GMX_FORCE_FORCES)
+                    {
+                        pme_flags |= GMX_PME_CALC_F;
+                    }
+                    if (flags & GMX_FORCE_VIRIAL)
+                    {
+                        pme_flags |= GMX_PME_CALC_ENER_VIR;
+                    }
+                    if (fr->n_tpi > 0)
+                    {
+                        /* We don't calculate f, but we do want the potential */
+                        pme_flags |= GMX_PME_CALC_POT;
+                    }
+
+                    /* With domain decomposition we close the CPU side load
+                     * balancing region here, because PME does global
+                     * communication that acts as a global barrier.
+                     */
+                    if (DOMAINDECOMP(cr))
+                    {
+                        ddCloseBalanceRegionCpu(cr->dd);
+                    }
+
+                    wallcycle_start(wcycle, ewcPMEMESH);
+                    status = gmx_pme_do(fr->pmedata,
+                                        0, md->homenr - fr->n_tpi,
+                                        x,
+                                        as_rvec_array(forceWithVirial->force_.data()),
+                                        md->chargeA, md->chargeB,
+                                        md->sqrt_c6A, md->sqrt_c6B,
+                                        md->sigmaA, md->sigmaB,
+                                        box, cr,
+                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0,
+                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0,
+                                        nrnb, wcycle,
+                                        ewaldOutput.vir_q, ewaldOutput.vir_lj,
+                                        &Vlr_q, &Vlr_lj,
+                                        lambda[efptCOUL], lambda[efptVDW],
+                                        &ewaldOutput.dvdl[efptCOUL],
+                                        &ewaldOutput.dvdl[efptVDW],
+                                        pme_flags);
+                    *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH);
+                    if (status != 0)
+                    {
+                        gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status);
+                    }
+
+                    /* We should try to do as little computation after
+                     * this as possible, because parallel PME synchronizes
+                     * the nodes, so we want all load imbalance of the
+                     * rest of the force calculation to be before the PME
+                     * call.  DD load balancing is done on the whole time
+                     * of the force call (without PME).
+                     */
+                }
+                if (fr->n_tpi > 0)
+                {
+                    if (EVDW_PME(ir->vdwtype))
+                    {
+                        gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME");
+                    }
+                    /* Determine the PME grid energy of the test molecule
+                     * with the PME grid potential of the other charges.
+                     */
+                    gmx_pme_calc_energy(fr->pmedata, fr->n_tpi,
+                                        x + md->homenr - fr->n_tpi,
+                                        md->chargeA + md->homenr - fr->n_tpi,
+                                        &Vlr_q);
+                }
+            }
+        }
+
+        if (!EEL_PME(fr->ic->eeltype) && EEL_PME_EWALD(fr->ic->eeltype))
+        {
+            Vlr_q = do_ewald(ir, x, as_rvec_array(forceWithVirial->force_.data()),
+                             md->chargeA, md->chargeB,
+                             box, cr, md->homenr,
+                             ewaldOutput.vir_q, fr->ic->ewaldcoeff_q,
+                             lambda[efptCOUL], &ewaldOutput.dvdl[efptCOUL],
+                             fr->ewald_table);
+        }
+
+        /* Note that with separate PME nodes we get the real energies later */
+        // TODO it would be simpler if we just accumulated a single
+        // long-range virial contribution.
+        forceWithVirial->addVirialContribution(ewaldOutput.vir_q);
+        forceWithVirial->addVirialContribution(ewaldOutput.vir_lj);
+        enerd->dvdl_lin[efptCOUL] += ewaldOutput.dvdl[efptCOUL];
+        enerd->dvdl_lin[efptVDW]  += ewaldOutput.dvdl[efptVDW];
+        enerd->term[F_COUL_RECIP]  = Vlr_q + ewaldOutput.Vcorr_q;
+        enerd->term[F_LJ_RECIP]    = Vlr_lj + ewaldOutput.Vcorr_lj;
+
+        if (debug)
+        {
+            fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n",
+                    Vlr_q, ewaldOutput.Vcorr_q, enerd->term[F_COUL_RECIP]);
+            pr_rvecs(debug, 0, "vir_el_recip after corr", ewaldOutput.vir_q, DIM);
+            pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS);
+            fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n",
+                    Vlr_lj, ewaldOutput.Vcorr_lj, enerd->term[F_LJ_RECIP]);
+            pr_rvecs(debug, 0, "vir_lj_recip after corr", ewaldOutput.vir_lj, DIM);
+        }
+    }
+    else
+    {
+        /* Is there a reaction-field exclusion correction needed?
+         * With the Verlet scheme, exclusion forces are calculated
+         * in the non-bonded kernel.
+         */
+        if (ir->cutoff_scheme != ecutsVERLET && EEL_RF(fr->ic->eeltype))
+        {
+            real dvdl_rf_excl      = 0;
+            enerd->term[F_RF_EXCL] =
+                RF_excl_correction(fr, graph, md, excl, DOMAINDECOMP(cr),
+                                   x, forceForUseWithShiftForces,
+                                   fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl);
+
+            enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl;
+        }
+    }
+
+    if (debug)
+    {
+        print_nrnb(debug, nrnb);
+    }
+
+#if GMX_MPI
+    if (TAKETIME)
+    {
+        t2 = MPI_Wtime();
+        MPI_Barrier(cr->mpi_comm_mygroup);
+        t3          = MPI_Wtime();
+        fr->t_wait += t3-t2;
+        if (fr->timesteps == 11)
+        {
+            char buf[22];
+            fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n",
+                    cr->nodeid, gmx_step_str(fr->timesteps, buf),
+                    100*fr->t_wait/(fr->t_wait+fr->t_fnbf),
+                    (fr->t_fnbf+fr->t_wait)/fr->t_fnbf);
+        }
+        fr->timesteps++;
+    }
+#endif
+
+    if (debug)
+    {
+        pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS);
+    }
+
+    /* PLUMED */
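+    /* Ask PLUMED whether the bias for this step needs the total potential
+     * energy. If it does not, the PLUMED calculation can be triggered right
+     * away; otherwise the patched caller defers it until the energy terms
+     * have been summed. */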
+    if(plumedswitch){
+      int plumedNeedsEnergy;
+      (*plumedcmd)(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
+      if(!plumedNeedsEnergy) (*plumedcmd)(plumedmain,"performCalc",NULL);
+    }
+    /* END PLUMED */
+}
+
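+/* Allocate and zero a gmx_enerdata_t: the per-term energy accumulators,
+ * the linear and non-linear dV/dlambda components, an ngener*ngener
+ * group-pair energy matrix, and, when n_lambda > 0, one foreign-energy
+ * slot per foreign lambda point plus one for the current state. */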
+void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd)
+{
+    int i, n2;
+
+    for (i = 0; i < F_NRE; i++)
+    {
+        enerd->term[i]         = 0;
+        enerd->foreign_term[i] = 0;
+    }
+
+
+    for (i = 0; i < efptNR; i++)
+    {
+        enerd->dvdl_lin[i]     = 0;
+        enerd->dvdl_nonlin[i]  = 0;
+    }
+
+    n2 = ngener*ngener;
+    if (debug)
+    {
+        fprintf(debug, "Creating %d sized group matrix for energies\n", n2);
+    }
+    enerd->grpp.nener         = n2;
+    enerd->foreign_grpp.nener = n2;
+    for (i = 0; (i < egNR); i++)
+    {
+        snew(enerd->grpp.ener[i], n2);
+        snew(enerd->foreign_grpp.ener[i], n2);
+    }
+
+    if (n_lambda)
+    {
+        enerd->n_lambda = 1 + n_lambda;
+        snew(enerd->enerpart_lambda, enerd->n_lambda);
+    }
+    else
+    {
+        enerd->n_lambda = 0;
+    }
+}
+
+void destroy_enerdata(gmx_enerdata_t *enerd)
+{
+    int i;
+
+    for (i = 0; (i < egNR); i++)
+    {
+        sfree(enerd->grpp.ener[i]);
+    }
+
+    for (i = 0; (i < egNR); i++)
+    {
+        sfree(enerd->foreign_grpp.ener[i]);
+    }
+
+    if (enerd->n_lambda)
+    {
+        sfree(enerd->enerpart_lambda);
+    }
+}
+
+static real sum_v(int n, const real v[])
+{
+    real t;
+    int  i;
+
+    t = 0.0;
+    for (i = 0; (i < n); i++)
+    {
+        t = t + v[i];
+    }
+
+    return t;
+}
+
+void sum_epot(gmx_grppairener_t *grpp, real *epot)
+{
+    int i;
+
+    /* Accumulate energies */
+    epot[F_COUL_SR]  = sum_v(grpp->nener, grpp->ener[egCOULSR]);
+    epot[F_LJ]       = sum_v(grpp->nener, grpp->ener[egLJSR]);
+    epot[F_LJ14]     = sum_v(grpp->nener, grpp->ener[egLJ14]);
+    epot[F_COUL14]   = sum_v(grpp->nener, grpp->ener[egCOUL14]);
+
+    /* The lattice part of the long-range interaction doesn't belong to
+     * any group and has been added earlier.
+     */
+    epot[F_BHAM]     = sum_v(grpp->nener, grpp->ener[egBHAMSR]);
+
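+    /* Sum all potential-energy terms below F_EPOT into the total, skipping
+     * the distance- and orientation-restraint diagnostic terms, which are
+     * not energies. */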
+    epot[F_EPOT] = 0;
+    for (i = 0; (i < F_EPOT); i++)
+    {
+        if (i != F_DISRESVIOL && i != F_ORIRESDEV)
+        {
+            epot[F_EPOT] += epot[i];
+        }
+    }
+}
+
+void sum_dhdl(gmx_enerdata_t *enerd, gmx::ArrayRef<const real> lambda, t_lambda *fepvals)
+{
+    int    index;
+
+    enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW];  /* include dispersion correction */
+    enerd->term[F_DVDL]       = 0.0;
+    for (int i = 0; i < efptNR; i++)
+    {
+        if (fepvals->separate_dvdl[i])
+        {
+            /* could this be done more readably/compactly? */
+            switch (i)
+            {
+                case (efptMASS):
+                    index = F_DKDL;
+                    break;
+                case (efptCOUL):
+                    index = F_DVDL_COUL;
+                    break;
+                case (efptVDW):
+                    index = F_DVDL_VDW;
+                    break;
+                case (efptBONDED):
+                    index = F_DVDL_BONDED;
+                    break;
+                case (efptRESTRAINT):
+                    index = F_DVDL_RESTRAINT;
+                    break;
+                default:
+                    index = F_DVDL;
+                    break;
+            }
+            enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
+            if (debug)
+            {
+                fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n",
+                        efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]);
+            }
+        }
+        else
+        {
+            enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
+            if (debug)
+            {
+                fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n",
+                        efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]);
+            }
+        }
+    }
+
+    if (fepvals->separate_dvdl[efptBONDED])
+    {
+        enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR];
+    }
+    else
+    {
+        enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR];
+    }
+
+    for (int i = 0; i < fepvals->n_lambda; i++)
+    {
+        /* note we are iterating over fepvals here!
+           For the current lam, dlam = 0 automatically,
+           so we don't need to add anything to the
+           enerd->enerpart_lambda[0] */
+
+        /* we don't need to worry about dvdl_lin contributions to dE at
+           current lambda, because the contributions to the current
+           lambda are automatically zeroed */
+
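+        /* What follows accumulates, for each foreign lambda point i, the
+         * first-order estimate
+         *   E(lambda_i) - E(lambda) += sum_j (lambda_i[j] - lambda[j]) * dV/dlambda_j
+         * from the linear dV/dlambda components collected during the force
+         * calculation. */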
+        double &enerpart_lambda = enerd->enerpart_lambda[i + 1];
+
+        for (gmx::index j = 0; j < lambda.size(); j++)
+        {
+            /* Note that this loop is over all dhdl components, not just the separated ones */
+            const double dlam  = fepvals->all_lambda[j][i] - lambda[j];
+
+            enerpart_lambda   += dlam*enerd->dvdl_lin[j];
+
+            /* Constraints can not be evaluated at foreign lambdas, so we add
+             * a linear extrapolation. This is an approximation, but usually
+             * quite accurate since constraints change little between lambdas.
+             */
+            if ((j == efptBONDED && fepvals->separate_dvdl[efptBONDED]) ||
+                (j == efptFEP && !fepvals->separate_dvdl[efptBONDED]))
+            {
+                enerpart_lambda += dlam*enerd->term[F_DVDL_CONSTR];
+            }
+
+            if (j == efptMASS && !fepvals->separate_dvdl[j])
+            {
+                enerpart_lambda += dlam*enerd->term[F_DKDL];
+            }
+
+            if (debug)
+            {
+                fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n",
+                        fepvals->all_lambda[j][i], efpt_names[j],
+                        enerpart_lambda - enerd->enerpart_lambda[0],
+                        dlam, enerd->dvdl_lin[j]);
+            }
+        }
+    }
+
+    /* The constraint contribution is now included in other terms, so clear it */
+    enerd->term[F_DVDL_CONSTR] = 0;
+}
+
+
+void reset_foreign_enerdata(gmx_enerdata_t *enerd)
+{
+    int  i, j;
+
+    /* First reset all foreign energy components. Foreign energies are
+       always evaluated on neighbor-search steps. */
+    for (i = 0; (i < egNR); i++)
+    {
+        for (j = 0; (j < enerd->grpp.nener); j++)
+        {
+            enerd->foreign_grpp.ener[i][j] = 0.0;
+        }
+    }
+
+    /* potential energy components */
+    for (i = 0; (i <= F_EPOT); i++)
+    {
+        enerd->foreign_term[i] = 0.0;
+    }
+}
+
+void reset_enerdata(gmx_enerdata_t *enerd)
+{
+    int      i, j;
+
+    /* First reset all energy components. */
+    for (i = 0; (i < egNR); i++)
+    {
+        for (j = 0; (j < enerd->grpp.nener); j++)
+        {
+            enerd->grpp.ener[i][j] = 0.0;
+        }
+    }
+    for (i = 0; i < efptNR; i++)
+    {
+        enerd->dvdl_lin[i]    = 0.0;
+        enerd->dvdl_nonlin[i] = 0.0;
+    }
+
+    /* Normal potential energy components */
+    for (i = 0; (i <= F_EPOT); i++)
+    {
+        enerd->term[i] = 0.0;
+    }
+    enerd->term[F_DVDL]            = 0.0;
+    enerd->term[F_DVDL_COUL]       = 0.0;
+    enerd->term[F_DVDL_VDW]        = 0.0;
+    enerd->term[F_DVDL_BONDED]     = 0.0;
+    enerd->term[F_DVDL_RESTRAINT]  = 0.0;
+    enerd->term[F_DKDL]            = 0.0;
+    if (enerd->n_lambda > 0)
+    {
+        for (i = 0; i < enerd->n_lambda; i++)
+        {
+            enerd->enerpart_lambda[i] = 0.0;
+        }
+    }
+    /* reset foreign energy data - separate function since we also call it elsewhere */
+    reset_foreign_enerdata(enerd);
+}
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdlib/force.cpp.preplumed b/patches/gromacs-2019.1.diff/src/gromacs/mdlib/force.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..8a7615365140ca43108e1a445b360f36696a93b3
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdlib/force.cpp.preplumed
@@ -0,0 +1,866 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "force.h"
+
+#include "config.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstring>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/ewald/ewald.h"
+#include "gromacs/ewald/long-range-correction.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gmxlib/nonbonded/nonbonded.h"
+#include "gromacs/listed-forces/listed-forces.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vecdump.h"
+#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/forcerec-threading.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/qmmm.h"
+#include "gromacs/mdlib/rf_util.h"
+#include "gromacs/mdlib/wall.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/enerdata.h"
+#include "gromacs/mdtypes/forceoutput.h"
+#include "gromacs/mdtypes/forcerec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+void ns(FILE               *fp,
+        t_forcerec         *fr,
+        matrix              box,
+        const gmx_groups_t *groups,
+        gmx_localtop_t     *top,
+        const t_mdatoms    *md,
+        const t_commrec    *cr,
+        t_nrnb             *nrnb,
+        gmx_bool            bFillGrid)
+{
+    int     nsearch;
+
+
+    if (!fr->ns->nblist_initialized)
+    {
+        init_neighbor_list(fp, fr, md->homenr);
+    }
+
+    nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md,
+                                bFillGrid);
+    if (debug)
+    {
+        fprintf(debug, "nsearch = %d\n", nsearch);
+    }
+
+    /* Check whether we have to do dynamic load balancing */
+    /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0))
+       count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr,
+       &(top->idef),opts->ngener);
+     */
+    if (fr->ns->dump_nl > 0)
+    {
+        dump_nblist(fp, cr, fr, fr->ns->dump_nl);
+    }
+}
+
+static void clearEwaldThreadOutput(ewald_corr_thread_t *ewc_t)
+{
+    ewc_t->Vcorr_q        = 0;
+    ewc_t->Vcorr_lj       = 0;
+    ewc_t->dvdl[efptCOUL] = 0;
+    ewc_t->dvdl[efptVDW]  = 0;
+    clear_mat(ewc_t->vir_q);
+    clear_mat(ewc_t->vir_lj);
+}
+
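+/* Reduce the per-thread Ewald correction outputs into ewc_t[0]:
+ * correction energies and dV/dlambda components add as scalars,
+ * the Coulomb and LJ virials as 3x3 matrices. */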
+static void reduceEwaldThreadOuput(int nthreads, ewald_corr_thread_t *ewc_t)
+{
+    ewald_corr_thread_t &dest = ewc_t[0];
+
+    for (int t = 1; t < nthreads; t++)
+    {
+        dest.Vcorr_q        += ewc_t[t].Vcorr_q;
+        dest.Vcorr_lj       += ewc_t[t].Vcorr_lj;
+        dest.dvdl[efptCOUL] += ewc_t[t].dvdl[efptCOUL];
+        dest.dvdl[efptVDW]  += ewc_t[t].dvdl[efptVDW];
+        m_add(dest.vir_q,  ewc_t[t].vir_q,  dest.vir_q);
+        m_add(dest.vir_lj, ewc_t[t].vir_lj, dest.vir_lj);
+    }
+}
+
+void do_force_lowlevel(t_forcerec           *fr,
+                       const t_inputrec     *ir,
+                       const t_idef         *idef,
+                       const t_commrec      *cr,
+                       const gmx_multisim_t *ms,
+                       t_nrnb               *nrnb,
+                       gmx_wallcycle_t       wcycle,
+                       const t_mdatoms      *md,
+                       rvec                  x[],
+                       history_t            *hist,
+                       rvec                 *forceForUseWithShiftForces,
+                       gmx::ForceWithVirial *forceWithVirial,
+                       gmx_enerdata_t       *enerd,
+                       t_fcdata             *fcd,
+                       matrix                box,
+                       t_lambda             *fepvals,
+                       real                 *lambda,
+                       const t_graph        *graph,
+                       const t_blocka       *excl,
+                       rvec                  mu_tot[],
+                       int                   flags,
+                       float                *cycles_pme)
+{
+    int         i, j;
+    int         donb_flags;
+    int         pme_flags;
+    t_pbc       pbc;
+    real        dvdl_dum[efptNR], dvdl_nb[efptNR];
+
+#if GMX_MPI
+    double  t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */
+#endif
+
+    set_pbc(&pbc, fr->ePBC, box);
+
+    /* reset free energy components */
+    for (i = 0; i < efptNR; i++)
+    {
+        dvdl_nb[i]  = 0;
+        dvdl_dum[i] = 0;
+    }
+
+    /* do QMMM first if requested */
+    if (fr->bQMMM)
+    {
+        enerd->term[F_EQM] = calculate_QMMM(cr, forceForUseWithShiftForces, fr);
+    }
+
+    /* Call the short range functions all in one go. */
+
+#if GMX_MPI
+    /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/
+#define TAKETIME FALSE
+    if (TAKETIME)
+    {
+        MPI_Barrier(cr->mpi_comm_mygroup);
+        t0 = MPI_Wtime();
+    }
+#endif
+
+    if (ir->nwall)
+    {
+        /* foreign lambda component for walls */
+        real dvdl_walls = do_walls(*ir, *fr, box, *md, x,
+                                   forceWithVirial, lambda[efptVDW],
+                                   enerd->grpp.ener[egLJSR], nrnb);
+        enerd->dvdl_lin[efptVDW] += dvdl_walls;
+    }
+
+    /* We only do non-bonded calculation with group scheme here, the verlet
+     * calls are done from do_force_cutsVERLET(). */
+    if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED))
+    {
+        donb_flags = 0;
+        /* Add short-range interactions */
+        donb_flags |= GMX_NONBONDED_DO_SR;
+
+        /* Currently all group scheme kernels always calculate (shift-)forces */
+        if (flags & GMX_FORCE_FORCES)
+        {
+            donb_flags |= GMX_NONBONDED_DO_FORCE;
+        }
+        if (flags & GMX_FORCE_VIRIAL)
+        {
+            donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
+        }
+        if (flags & GMX_FORCE_ENERGY)
+        {
+            donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
+        }
+
+        wallcycle_sub_start(wcycle, ewcsNONBONDED);
+        do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl,
+                     &enerd->grpp, nrnb,
+                     lambda, dvdl_nb, -1, -1, donb_flags);
+
+        /* If we do foreign lambda and we have soft-core interactions
+         * we have to recalculate the (non-linear) energies contributions.
+         */
+        if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
+        {
+            for (i = 0; i < enerd->n_lambda; i++)
+            {
+                real lam_i[efptNR];
+
+                for (j = 0; j < efptNR; j++)
+                {
+                    lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
+                }
+                reset_foreign_enerdata(enerd);
+                do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl,
+                             &(enerd->foreign_grpp), nrnb,
+                             lam_i, dvdl_dum, -1, -1,
+                             (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA);
+                sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
+                enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
+            }
+        }
+        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+    }
+
+#if GMX_MPI
+    if (TAKETIME)
+    {
+        t1          = MPI_Wtime();
+        fr->t_fnbf += t1-t0;
+    }
+#endif
+
+    if (fepvals->sc_alpha != 0)
+    {
+        enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
+    }
+    else
+    {
+        enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
+    }
+
+    if (fepvals->sc_alpha != 0)
+    {
+        /* Even though the Coulomb part is linear, we already added it,
+           because we need to go through the vdw calculation anyway. */
+        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+    else
+    {
+        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+
+    if (debug)
+    {
+        pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS);
+    }
+
+    /* Shift the coordinates. Must be done before listed forces and PPPM,
+     * but is also necessary for SHAKE and update, therefore it can NOT
+     * go when no listed forces have to be evaluated.
+     *
+     * The shifting and PBC code is deliberately not timed, since with
+     * the Verlet scheme it only takes non-zero time with triclinic
+     * boxes, and even then the time is around a factor of 100 less
+     * than the next smallest counter.
+     */
+
+
+    /* Here sometimes we would not need to shift with NBFonly,
+     * but we do so anyhow for consistency of the returned coordinates.
+     */
+    if (graph)
+    {
+        shift_self(graph, box, x);
+        if (TRICLINIC(box))
+        {
+            inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
+        }
+        else
+        {
+            inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
+        }
+    }
+    /* Check whether we need to do listed interactions or correct for exclusions */
+    if (fr->bMolPBC &&
+        ((flags & GMX_FORCE_LISTED)
+         || EEL_RF(fr->ic->eeltype) || EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)))
+    {
+        /* TODO There are no electrostatics methods that require this
+           transformation, when using the Verlet scheme, so update the
+           above conditional. */
+        /* Since all atoms are in the rectangular or triclinic unit-cell,
+         * only single box vector shifts (2 in x) are required.
+         */
+        set_pbc_dd(&pbc, fr->ePBC, DOMAINDECOMP(cr) ? cr->dd->nc : nullptr,
+                   TRUE, box);
+    }
+
+    do_force_listed(wcycle, box, ir->fepvals, cr, ms,
+                    idef, x, hist,
+                    forceForUseWithShiftForces, forceWithVirial,
+                    fr, &pbc, graph, enerd, nrnb, lambda, md, fcd,
+                    DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr,
+                    flags);
+
+
+    *cycles_pme = 0;
+
+    /* Do long-range electrostatics and/or LJ-PME, including related short-range
+     * corrections.
+     */
+    if (EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
+    {
+        int  status            = 0;
+        real Vlr_q             = 0, Vlr_lj = 0;
+
+        /* We reduce all virial, dV/dlambda and energy contributions, except
+         * for the reciprocal energies (Vlr_q, Vlr_lj) into the same struct.
+         */
+        ewald_corr_thread_t &ewaldOutput = fr->ewc_t[0];
+        clearEwaldThreadOutput(&ewaldOutput);
+
+        if (EEL_PME_EWALD(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
+        {
+            /* With the Verlet scheme exclusion forces are calculated
+             * in the non-bonded kernel.
+             */
+            /* The TPI molecule does not have exclusions with the rest
+             * of the system and no intra-molecular PME grid
+             * contributions will be calculated in
+             * gmx_pme_calc_energy.
+             */
+            if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) ||
+                ir->ewald_geometry != eewg3D ||
+                ir->epsilon_surface != 0)
+            {
+                int nthreads, t;
+
+                wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION);
+
+                if (fr->n_tpi > 0)
+                {
+                    gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions");
+                }
+
+                nthreads = fr->nthread_ewc;
+#pragma omp parallel for num_threads(nthreads) schedule(static)
+                for (t = 0; t < nthreads; t++)
+                {
+                    try
+                    {
+                        ewald_corr_thread_t &ewc_t = fr->ewc_t[t];
+                        if (t > 0)
+                        {
+                            clearEwaldThreadOutput(&ewc_t);
+                        }
+
+                        /* Threading is only supported with the Verlet cut-off
+                         * scheme and then only single particle forces (no
+                         * exclusion forces) are calculated, so we can store
+                         * the forces in the normal, single forceWithVirial->force_ array.
+                         */
+                        ewald_LRcorrection(md->homenr, cr, nthreads, t, fr, ir,
+                                           md->chargeA, md->chargeB,
+                                           md->sqrt_c6A, md->sqrt_c6B,
+                                           md->sigmaA, md->sigmaB,
+                                           md->sigma3A, md->sigma3B,
+                                           (md->nChargePerturbed != 0) || (md->nTypePerturbed != 0),
+                                           ir->cutoff_scheme != ecutsVERLET,
+                                           excl, x, box, mu_tot,
+                                           ir->ewald_geometry,
+                                           ir->epsilon_surface,
+                                           as_rvec_array(forceWithVirial->force_.data()),
+                                           ewc_t.vir_q, ewc_t.vir_lj,
+                                           &ewc_t.Vcorr_q, &ewc_t.Vcorr_lj,
+                                           lambda[efptCOUL], lambda[efptVDW],
+                                           &ewc_t.dvdl[efptCOUL], &ewc_t.dvdl[efptVDW]);
+                    }
+                    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+                }
+                if (nthreads > 1)
+                {
+                    reduceEwaldThreadOuput(nthreads, fr->ewc_t);
+                }
+                wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
+            }
+
+            if (EEL_PME_EWALD(fr->ic->eeltype) && fr->n_tpi == 0)
+            {
+                /* This is not in a subcounter because it takes a
+                   negligible and constant-sized amount of time */
+                ewaldOutput.Vcorr_q +=
+                    ewald_charge_correction(cr, fr, lambda[efptCOUL], box,
+                                            &ewaldOutput.dvdl[efptCOUL],
+                                            ewaldOutput.vir_q);
+            }
+
+            if ((EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) &&
+                thisRankHasDuty(cr, DUTY_PME) && (pme_run_mode(fr->pmedata) == PmeRunMode::CPU))
+            {
+                /* Do reciprocal PME for Coulomb and/or LJ. */
+                assert(fr->n_tpi >= 0);
+                if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED))
+                {
+                    pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE;
+
+                    if (flags & GMX_FORCE_FORCES)
+                    {
+                        pme_flags |= GMX_PME_CALC_F;
+                    }
+                    if (flags & GMX_FORCE_VIRIAL)
+                    {
+                        pme_flags |= GMX_PME_CALC_ENER_VIR;
+                    }
+                    if (fr->n_tpi > 0)
+                    {
+                        /* We don't calculate f, but we do want the potential */
+                        pme_flags |= GMX_PME_CALC_POT;
+                    }
+
+                    /* With domain decomposition we close the CPU side load
+                     * balancing region here, because PME does global
+                     * communication that acts as a global barrier.
+                     */
+                    if (DOMAINDECOMP(cr))
+                    {
+                        ddCloseBalanceRegionCpu(cr->dd);
+                    }
+
+                    wallcycle_start(wcycle, ewcPMEMESH);
+                    status = gmx_pme_do(fr->pmedata,
+                                        0, md->homenr - fr->n_tpi,
+                                        x,
+                                        as_rvec_array(forceWithVirial->force_.data()),
+                                        md->chargeA, md->chargeB,
+                                        md->sqrt_c6A, md->sqrt_c6B,
+                                        md->sigmaA, md->sigmaB,
+                                        box, cr,
+                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0,
+                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0,
+                                        nrnb, wcycle,
+                                        ewaldOutput.vir_q, ewaldOutput.vir_lj,
+                                        &Vlr_q, &Vlr_lj,
+                                        lambda[efptCOUL], lambda[efptVDW],
+                                        &ewaldOutput.dvdl[efptCOUL],
+                                        &ewaldOutput.dvdl[efptVDW],
+                                        pme_flags);
+                    *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH);
+                    if (status != 0)
+                    {
+                        gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status);
+                    }
+
+                    /* We should try to do as little computation after
+                     * this as possible, because parallel PME synchronizes
+                     * the nodes, so we want all load imbalance of the
+                     * rest of the force calculation to be before the PME
+                     * call.  DD load balancing is done on the whole time
+                     * of the force call (without PME).
+                     */
+                }
+                if (fr->n_tpi > 0)
+                {
+                    if (EVDW_PME(ir->vdwtype))
+                    {
+                        gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME");
+                    }
+                    /* Determine the PME grid energy of the test molecule
+                     * with the PME grid potential of the other charges.
+                     */
+                    gmx_pme_calc_energy(fr->pmedata, fr->n_tpi,
+                                        x + md->homenr - fr->n_tpi,
+                                        md->chargeA + md->homenr - fr->n_tpi,
+                                        &Vlr_q);
+                }
+            }
+        }
+
+        if (!EEL_PME(fr->ic->eeltype) && EEL_PME_EWALD(fr->ic->eeltype))
+        {
+            Vlr_q = do_ewald(ir, x, as_rvec_array(forceWithVirial->force_.data()),
+                             md->chargeA, md->chargeB,
+                             box, cr, md->homenr,
+                             ewaldOutput.vir_q, fr->ic->ewaldcoeff_q,
+                             lambda[efptCOUL], &ewaldOutput.dvdl[efptCOUL],
+                             fr->ewald_table);
+        }
+
+        /* Note that with separate PME nodes we get the real energies later */
+        // TODO it would be simpler if we just accumulated a single
+        // long-range virial contribution.
+        forceWithVirial->addVirialContribution(ewaldOutput.vir_q);
+        forceWithVirial->addVirialContribution(ewaldOutput.vir_lj);
+        enerd->dvdl_lin[efptCOUL] += ewaldOutput.dvdl[efptCOUL];
+        enerd->dvdl_lin[efptVDW]  += ewaldOutput.dvdl[efptVDW];
+        enerd->term[F_COUL_RECIP]  = Vlr_q + ewaldOutput.Vcorr_q;
+        enerd->term[F_LJ_RECIP]    = Vlr_lj + ewaldOutput.Vcorr_lj;
+
+        if (debug)
+        {
+            fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n",
+                    Vlr_q, ewaldOutput.Vcorr_q, enerd->term[F_COUL_RECIP]);
+            pr_rvecs(debug, 0, "vir_el_recip after corr", ewaldOutput.vir_q, DIM);
+            pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS);
+            fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n",
+                    Vlr_lj, ewaldOutput.Vcorr_lj, enerd->term[F_LJ_RECIP]);
+            pr_rvecs(debug, 0, "vir_lj_recip after corr", ewaldOutput.vir_lj, DIM);
+        }
+    }
+    else
+    {
+        /* Is there a reaction-field exclusion correction needed?
+         * With the Verlet scheme, exclusion forces are calculated
+         * in the non-bonded kernel.
+         */
+        if (ir->cutoff_scheme != ecutsVERLET && EEL_RF(fr->ic->eeltype))
+        {
+            real dvdl_rf_excl      = 0;
+            enerd->term[F_RF_EXCL] =
+                RF_excl_correction(fr, graph, md, excl, DOMAINDECOMP(cr),
+                                   x, forceForUseWithShiftForces,
+                                   fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl);
+
+            enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl;
+        }
+    }
+
+    if (debug)
+    {
+        print_nrnb(debug, nrnb);
+    }
+
+#if GMX_MPI
+    if (TAKETIME)
+    {
+        t2 = MPI_Wtime();
+        MPI_Barrier(cr->mpi_comm_mygroup);
+        t3          = MPI_Wtime();
+        fr->t_wait += t3-t2;
+        if (fr->timesteps == 11)
+        {
+            char buf[22];
+            fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n",
+                    cr->nodeid, gmx_step_str(fr->timesteps, buf),
+                    100*fr->t_wait/(fr->t_wait+fr->t_fnbf),
+                    (fr->t_fnbf+fr->t_wait)/fr->t_fnbf);
+        }
+        fr->timesteps++;
+    }
+#endif
+
+    if (debug)
+    {
+        pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS);
+    }
+
+}
+
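+/* Allocate and zero a gmx_enerdata_t: the per-term energy accumulators,
+ * the linear and non-linear dV/dlambda components, an ngener*ngener
+ * group-pair energy matrix, and, when n_lambda > 0, one foreign-energy
+ * slot per foreign lambda point plus one for the current state. */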
+void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd)
+{
+    int i, n2;
+
+    for (i = 0; i < F_NRE; i++)
+    {
+        enerd->term[i]         = 0;
+        enerd->foreign_term[i] = 0;
+    }
+
+
+    for (i = 0; i < efptNR; i++)
+    {
+        enerd->dvdl_lin[i]     = 0;
+        enerd->dvdl_nonlin[i]  = 0;
+    }
+
+    n2 = ngener*ngener;
+    if (debug)
+    {
+        fprintf(debug, "Creating %d sized group matrix for energies\n", n2);
+    }
+    enerd->grpp.nener         = n2;
+    enerd->foreign_grpp.nener = n2;
+    for (i = 0; (i < egNR); i++)
+    {
+        snew(enerd->grpp.ener[i], n2);
+        snew(enerd->foreign_grpp.ener[i], n2);
+    }
+
+    if (n_lambda)
+    {
+        enerd->n_lambda = 1 + n_lambda;
+        snew(enerd->enerpart_lambda, enerd->n_lambda);
+    }
+    else
+    {
+        enerd->n_lambda = 0;
+    }
+}
+
+void destroy_enerdata(gmx_enerdata_t *enerd)
+{
+    int i;
+
+    for (i = 0; (i < egNR); i++)
+    {
+        sfree(enerd->grpp.ener[i]);
+    }
+
+    for (i = 0; (i < egNR); i++)
+    {
+        sfree(enerd->foreign_grpp.ener[i]);
+    }
+
+    if (enerd->n_lambda)
+    {
+        sfree(enerd->enerpart_lambda);
+    }
+}
+
+static real sum_v(int n, const real v[])
+{
+    real t;
+    int  i;
+
+    t = 0.0;
+    for (i = 0; (i < n); i++)
+    {
+        t = t + v[i];
+    }
+
+    return t;
+}
+
+void sum_epot(gmx_grppairener_t *grpp, real *epot)
+{
+    int i;
+
+    /* Accumulate energies */
+    epot[F_COUL_SR]  = sum_v(grpp->nener, grpp->ener[egCOULSR]);
+    epot[F_LJ]       = sum_v(grpp->nener, grpp->ener[egLJSR]);
+    epot[F_LJ14]     = sum_v(grpp->nener, grpp->ener[egLJ14]);
+    epot[F_COUL14]   = sum_v(grpp->nener, grpp->ener[egCOUL14]);
+
+    /* The lattice part of the long-range interaction doesn't belong to
+     * any group and has been added earlier.
+     */
+    epot[F_BHAM]     = sum_v(grpp->nener, grpp->ener[egBHAMSR]);
+
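+    /* Sum all potential-energy terms below F_EPOT into the total, skipping
+     * the distance- and orientation-restraint diagnostic terms, which are
+     * not energies. */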
+    epot[F_EPOT] = 0;
+    for (i = 0; (i < F_EPOT); i++)
+    {
+        if (i != F_DISRESVIOL && i != F_ORIRESDEV)
+        {
+            epot[F_EPOT] += epot[i];
+        }
+    }
+}
+
+void sum_dhdl(gmx_enerdata_t *enerd, gmx::ArrayRef<const real> lambda, t_lambda *fepvals)
+{
+    int    index;
+
+    enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW];  /* include dispersion correction */
+    enerd->term[F_DVDL]       = 0.0;
+    for (int i = 0; i < efptNR; i++)
+    {
+        if (fepvals->separate_dvdl[i])
+        {
+            /* could this be done more readably/compactly? */
+            switch (i)
+            {
+                case (efptMASS):
+                    index = F_DKDL;
+                    break;
+                case (efptCOUL):
+                    index = F_DVDL_COUL;
+                    break;
+                case (efptVDW):
+                    index = F_DVDL_VDW;
+                    break;
+                case (efptBONDED):
+                    index = F_DVDL_BONDED;
+                    break;
+                case (efptRESTRAINT):
+                    index = F_DVDL_RESTRAINT;
+                    break;
+                default:
+                    index = F_DVDL;
+                    break;
+            }
+            enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
+            if (debug)
+            {
+                fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n",
+                        efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]);
+            }
+        }
+        else
+        {
+            enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
+            if (debug)
+            {
+                fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n",
+                        efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]);
+            }
+        }
+    }
+
+    if (fepvals->separate_dvdl[efptBONDED])
+    {
+        enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR];
+    }
+    else
+    {
+        enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR];
+    }
+
+    for (int i = 0; i < fepvals->n_lambda; i++)
+    {
+        /* note we are iterating over fepvals here!
+           For the current lam, dlam = 0 automatically,
+           so we don't need to add anything to the
+           enerd->enerpart_lambda[0] */
+
+        /* we don't need to worry about dvdl_lin contributions to dE at
+           current lambda, because the contributions to the current
+           lambda are automatically zeroed */
+
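+        /* What follows accumulates, for each foreign lambda point i, the
+         * first-order estimate
+         *   E(lambda_i) - E(lambda) += sum_j (lambda_i[j] - lambda[j]) * dV/dlambda_j
+         * from the linear dV/dlambda components collected during the force
+         * calculation. */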
+        double &enerpart_lambda = enerd->enerpart_lambda[i + 1];
+
+        for (gmx::index j = 0; j < lambda.size(); j++)
+        {
+            /* Note that this loop is over all dhdl components, not just the separated ones */
+            const double dlam  = fepvals->all_lambda[j][i] - lambda[j];
+
+            enerpart_lambda   += dlam*enerd->dvdl_lin[j];
+
+            /* Constraints can not be evaluated at foreign lambdas, so we add
+             * a linear extrapolation. This is an approximation, but usually
+             * quite accurate since constraints change little between lambdas.
+             */
+            if ((j == efptBONDED && fepvals->separate_dvdl[efptBONDED]) ||
+                (j == efptFEP && !fepvals->separate_dvdl[efptBONDED]))
+            {
+                enerpart_lambda += dlam*enerd->term[F_DVDL_CONSTR];
+            }
+
+            if (j == efptMASS && !fepvals->separate_dvdl[j])
+            {
+                enerpart_lambda += dlam*enerd->term[F_DKDL];
+            }
+
+            if (debug)
+            {
+                fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n",
+                        fepvals->all_lambda[j][i], efpt_names[j],
+                        enerpart_lambda - enerd->enerpart_lambda[0],
+                        dlam, enerd->dvdl_lin[j]);
+            }
+        }
+    }
+
+    /* The constraint contribution is now included in other terms, so clear it */
+    enerd->term[F_DVDL_CONSTR] = 0;
+}
+
+
+void reset_foreign_enerdata(gmx_enerdata_t *enerd)
+{
+    int  i, j;
+
+    /* First reset all foreign energy components. Foreign energies are
+       always evaluated on neighbor-search steps. */
+    for (i = 0; (i < egNR); i++)
+    {
+        for (j = 0; (j < enerd->grpp.nener); j++)
+        {
+            enerd->foreign_grpp.ener[i][j] = 0.0;
+        }
+    }
+
+    /* potential energy components */
+    for (i = 0; (i <= F_EPOT); i++)
+    {
+        enerd->foreign_term[i] = 0.0;
+    }
+}
+
+void reset_enerdata(gmx_enerdata_t *enerd)
+{
+    int      i, j;
+
+    /* First reset all energy components. */
+    for (i = 0; (i < egNR); i++)
+    {
+        for (j = 0; (j < enerd->grpp.nener); j++)
+        {
+            enerd->grpp.ener[i][j] = 0.0;
+        }
+    }
+    for (i = 0; i < efptNR; i++)
+    {
+        enerd->dvdl_lin[i]    = 0.0;
+        enerd->dvdl_nonlin[i] = 0.0;
+    }
+
+    /* Normal potential energy components */
+    for (i = 0; (i <= F_EPOT); i++)
+    {
+        enerd->term[i] = 0.0;
+    }
+    enerd->term[F_DVDL]            = 0.0;
+    enerd->term[F_DVDL_COUL]       = 0.0;
+    enerd->term[F_DVDL_VDW]        = 0.0;
+    enerd->term[F_DVDL_BONDED]     = 0.0;
+    enerd->term[F_DVDL_RESTRAINT]  = 0.0;
+    enerd->term[F_DKDL]            = 0.0;
+    if (enerd->n_lambda > 0)
+    {
+        for (i = 0; i < enerd->n_lambda; i++)
+        {
+            enerd->enerpart_lambda[i] = 0.0;
+        }
+    }
+    /* reset foreign energy data - separate function since we also call it elsewhere */
+    reset_foreign_enerdata(enerd);
+}
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.cpp b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d9c29a1d5ad78296794ffa6fd2f8a6599aa560c
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.cpp
@@ -0,0 +1,263 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file declares helper functionality for legacy option handling for mdrun
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "legacymdrunoptions.h"
+
+#include "config.h"
+
+#include <cstring>
+
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/mdrun/multisim.h"
+#include "gromacs/mdrunutility/handlerestart.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/utility/arraysize.h"
+#include "gromacs/utility/fatalerror.h"
+
+/* PLUMED */
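+/* Global PLUMED state shared across the patched sources: the switch set
+ * from the -plumed option, the handle to the PLUMED object, and a function
+ * pointer used to send commands to it through the interface declared in
+ * Plumed.h. */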
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain; 
+extern void(*plumedcmd)(plumed,const char*,const void*);
+/* END PLUMED */
+
+namespace gmx
+{
+
+/*! \brief Return whether the command-line parameter that
+ *  will trigger a multi-simulation is set */
+static bool is_multisim_option_set(int argc, const char *const argv[])
+{
+    for (int i = 0; i < argc; ++i)
+    {
+        if (strcmp(argv[i], "-multidir") == 0)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+int LegacyMdrunOptions::updateFromCommandLine(int argc, char **argv, ArrayRef<const char *> desc)
+{
+    unsigned long     PCA_Flags = PCA_CAN_SET_DEFFNM;
+    // With -multidir, the working directory still needs to be
+    // changed, so we can't check for the existence of files during
+    // parsing.  It isn't useful to do any completion based on file
+    // system contents, either.
+    if (is_multisim_option_set(argc, argv))
+    {
+        PCA_Flags |= PCA_DISABLE_INPUT_FILE_CHECKING;
+    }
+
+    if (!parse_common_args(&argc, argv, PCA_Flags,
+                           static_cast<int>(filenames.size()), filenames.data(), asize(pa), pa,
+                           static_cast<int>(desc.size()), desc.data(), 0, nullptr, &oenv))
+    {
+        return 0;
+    }
+
+    // Handle the options that permit the user to either declare
+    // which compatible GPUs are available for use, or to select a GPU
+    // task assignment. Either could be in an environment variable (so
+    // that there is a way to customize it, when using MPI in
+    // heterogeneous contexts).
+    {
+        // TODO Argument parsing can't handle std::string. We should
+        // fix that by changing the parsing, once more of the roles of
+        // handling, validating and implementing defaults for user
+        // command-line options have been separated.
+        hw_opt.gpuIdsAvailable       = gpuIdsAvailable;
+        hw_opt.userGpuTaskAssignment = userGpuTaskAssignment;
+
+        const char *env = getenv("GMX_GPU_ID");
+        if (env != nullptr)
+        {
+            if (!hw_opt.gpuIdsAvailable.empty())
+            {
+                gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
+            }
+            hw_opt.gpuIdsAvailable = env;
+        }
+
+        env = getenv("GMX_GPUTASKS");
+        if (env != nullptr)
+        {
+            if (!hw_opt.userGpuTaskAssignment.empty())
+            {
+                gmx_fatal(FARGS, "GMX_GPUTASKS and -gputasks can not be used at the same time");
+            }
+            hw_opt.userGpuTaskAssignment = env;
+        }
+
+        if (!hw_opt.gpuIdsAvailable.empty() && !hw_opt.userGpuTaskAssignment.empty())
+        {
+            gmx_fatal(FARGS, "-gpu_id and -gputasks cannot be used at the same time");
+        }
+    }
+
+    hw_opt.thread_affinity = nenum(thread_aff_opt_choices);
+
+    // now check for a multi-simulation
+    ArrayRef<const std::string> multidir = opt2fnsIfOptionSet("-multidir",
+                                                              static_cast<int>(filenames.size()),
+                                                              filenames.data());
+
+    if (replExParams.exchangeInterval != 0 && multidir.size() < 2)
+    {
+        gmx_fatal(FARGS, "Need at least two replicas for replica exchange (use option -multidir)");
+    }
+
+    if (replExParams.numExchanges < 0)
+    {
+        gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
+    }
+
+    ms = init_multisystem(MPI_COMM_WORLD, multidir);
+
+    /* Prepare the intra-simulation communication */
+    // TODO consolidate this with init_commrec, after changing the
+    // relative ordering of init_commrec and init_multisystem
+#if GMX_MPI
+    if (ms != nullptr)
+    {
+        cr->nnodes = cr->nnodes / ms->nsim;
+        MPI_Comm_split(MPI_COMM_WORLD, ms->sim, cr->sim_nodeid, &cr->mpi_comm_mysim);
+        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
+        MPI_Comm_rank(cr->mpi_comm_mysim, &cr->sim_nodeid);
+        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
+    }
+#endif
+
+    if (!opt2bSet("-cpi",
+                  static_cast<int>(filenames.size()), filenames.data()))
+    {
+        // If we are not starting from a checkpoint we never allow files to be appended
+        // to, since that has caused a ton of strange behaviour and bugs in the past.
+        if (opt2parg_bSet("-append", asize(pa), pa))
+        {
+            // If the user explicitly used the -append option, explain that it is not possible.
+            gmx_fatal(FARGS, "GROMACS can only append to files when restarting from a checkpoint.");
+        }
+        else
+        {
+            // If the user did not say anything explicit, just disable appending.
+            bTryToAppendFiles = FALSE;
+        }
+    }
+
+    ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions;
+
+    continuationOptions.appendFilesOptionSet = opt2parg_bSet("-append", asize(pa), pa);
+
+    handleRestart(cr, ms, bTryToAppendFiles,
+                  static_cast<int>(filenames.size()),
+                  filenames.data(),
+                  &continuationOptions.appendFiles,
+                  &continuationOptions.startedFromCheckpoint);
+
+    mdrunOptions.rerun            = opt2bSet("-rerun",
+                                             static_cast<int>(filenames.size()),
+                                             filenames.data());
+    mdrunOptions.ntompOptionIsSet = opt2parg_bSet("-ntomp", asize(pa), pa);
+
+    domdecOptions.rankOrder    = static_cast<DdRankOrder>(nenum(ddrank_opt_choices));
+    domdecOptions.dlbOption    = static_cast<DlbOption>(nenum(dddlb_opt_choices));
+    domdecOptions.numCells[XX] = roundToInt(realddxyz[XX]);
+    domdecOptions.numCells[YY] = roundToInt(realddxyz[YY]);
+    domdecOptions.numCells[ZZ] = roundToInt(realddxyz[ZZ]);
+
+    /* PLUMED */
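+    /* Detect the -plumed option and, when it is present, create the PLUMED
+     * object and pass it the real precision of this build, the unit
+     * conversion factors (all 1.0, since GROMACS already uses PLUMED's
+     * default units of kJ/mol, nm and ps) and the PLUMED input file name. */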
+    plumedswitch=0;
+    if (opt2bSet("-plumed", static_cast<int>(filenames.size()), filenames.data())) plumedswitch=1;
+    if(plumedswitch){
+      plumedcmd=plumed_cmd;
+      int real_precision=sizeof(real);
+      real energyUnits=1.0;
+      real lengthUnits=1.0;
+      real timeUnits=1.0;
+  
+      if(!plumed_installed()){
+        gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable.");
+      }
+      plumedmain=plumed_create();
+      plumed_cmd(plumedmain,"setRealPrecision",&real_precision);
+      // this is not necessary for gromacs units:
+      plumed_cmd(plumedmain,"setMDEnergyUnits",&energyUnits);
+      plumed_cmd(plumedmain,"setMDLengthUnits",&lengthUnits);
+      plumed_cmd(plumedmain,"setMDTimeUnits",&timeUnits);
+      //
+      plumed_cmd(plumedmain,"setPlumedDat",ftp2fn(efDAT,static_cast<int>(filenames.size()), filenames.data()));
+      plumedswitch=1;
+    }
+    /* PLUMED HREX */
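+    /* Hamiltonian replica exchange driven by PLUMED, enabled through the
+     * PLUMED_HREX environment variable; the checks below enforce that it
+     * is combined with -plumed and an active replica-exchange run, and
+     * that it is not used together with -nex. */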
+    if(getenv("PLUMED_HREX")) plumed_hrex=1;
+    if(plumed_hrex){
+      if(!plumedswitch) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) requires -plumed");
+      if(replExParams.exchangeInterval==0) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) requires replica exchange (option -replex)");
+      if(replExParams.numExchanges!=0) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) not compatible with -nex");
+    }
+    /* END PLUMED HREX */
+
+    /* END PLUMED */
+
+    return 1;
+}
+
+LegacyMdrunOptions::~LegacyMdrunOptions()
+{
+    if (GMX_LIB_MPI)
+    {
+        done_commrec(cr);
+    }
+    done_multisim(ms);
+}
+
+} // namespace gmx
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..de685e90348bfff377e9fe6cf20efa9fc2d516d6
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed
@@ -0,0 +1,222 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file declares helper functionality for legacy option handling for mdrun
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "legacymdrunoptions.h"
+
+#include "config.h"
+
+#include <cstring>
+
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/mdrun/multisim.h"
+#include "gromacs/mdrunutility/handlerestart.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/utility/arraysize.h"
+#include "gromacs/utility/fatalerror.h"
+
+namespace gmx
+{
+
+/*! \brief Return whether the command-line parameter that
+ *  will trigger a multi-simulation is set */
+static bool is_multisim_option_set(int argc, const char *const argv[])
+{
+    for (int i = 0; i < argc; ++i)
+    {
+        if (strcmp(argv[i], "-multidir") == 0)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+int LegacyMdrunOptions::updateFromCommandLine(int argc, char **argv, ArrayRef<const char *> desc)
+{
+    unsigned long     PCA_Flags = PCA_CAN_SET_DEFFNM;
+    // With -multidir, the working directory still needs to be
+    // changed, so we can't check for the existence of files during
+    // parsing.  It isn't useful to do any completion based on file
+    // system contents, either.
+    if (is_multisim_option_set(argc, argv))
+    {
+        PCA_Flags |= PCA_DISABLE_INPUT_FILE_CHECKING;
+    }
+
+    if (!parse_common_args(&argc, argv, PCA_Flags,
+                           static_cast<int>(filenames.size()), filenames.data(), asize(pa), pa,
+                           static_cast<int>(desc.size()), desc.data(), 0, nullptr, &oenv))
+    {
+        return 0;
+    }
+
+    // Handle the options that permit the user either to declare
+    // which compatible GPUs are available for use, or to select a GPU
+    // task assignment. Either could be in an environment variable (so
+    // that there is a way to customize it, when using MPI in
+    // heterogeneous contexts).
+    {
+        // TODO Argument parsing can't handle std::string. We should
+        // fix that by changing the parsing, once more of the roles of
+        // handling, validating and implementing defaults for user
+        // command-line options have been separated.
+        hw_opt.gpuIdsAvailable       = gpuIdsAvailable;
+        hw_opt.userGpuTaskAssignment = userGpuTaskAssignment;
+
+        const char *env = getenv("GMX_GPU_ID");
+        if (env != nullptr)
+        {
+            if (!hw_opt.gpuIdsAvailable.empty())
+            {
+                gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
+            }
+            hw_opt.gpuIdsAvailable = env;
+        }
+
+        env = getenv("GMX_GPUTASKS");
+        if (env != nullptr)
+        {
+            if (!hw_opt.userGpuTaskAssignment.empty())
+            {
+                gmx_fatal(FARGS, "GMX_GPUTASKS and -gputasks can not be used at the same time");
+            }
+            hw_opt.userGpuTaskAssignment = env;
+        }
+
+        if (!hw_opt.gpuIdsAvailable.empty() && !hw_opt.userGpuTaskAssignment.empty())
+        {
+            gmx_fatal(FARGS, "-gpu_id and -gputasks cannot be used at the same time");
+        }
+    }
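+    /* For instance, setting GMX_GPU_ID=0,1 in the environment behaves
+     * like passing -gpu_id 0,1 on the command line; the environment
+     * route is useful when ranks of a heterogeneous MPI job need
+     * different values (the accepted ID syntax is that of -gpu_id). */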
+
+    hw_opt.thread_affinity = nenum(thread_aff_opt_choices);
+
+    // now check for a multi-simulation
+    ArrayRef<const std::string> multidir = opt2fnsIfOptionSet("-multidir",
+                                                              static_cast<int>(filenames.size()),
+                                                              filenames.data());
+
+    if (replExParams.exchangeInterval != 0 && multidir.size() < 2)
+    {
+        gmx_fatal(FARGS, "Need at least two replicas for replica exchange (use option -multidir)");
+    }
+
+    if (replExParams.numExchanges < 0)
+    {
+        gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
+    }
+
+    ms = init_multisystem(MPI_COMM_WORLD, multidir);
+
+    /* Prepare the intra-simulation communication */
+    // TODO consolidate this with init_commrec, after changing the
+    // relative ordering of init_commrec and init_multisystem
+#if GMX_MPI
+    if (ms != nullptr)
+    {
+        cr->nnodes = cr->nnodes / ms->nsim;
+        MPI_Comm_split(MPI_COMM_WORLD, ms->sim, cr->sim_nodeid, &cr->mpi_comm_mysim);
+        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
+        MPI_Comm_rank(cr->mpi_comm_mysim, &cr->sim_nodeid);
+        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
+    }
+#endif
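+    /* In the split above, ms->sim is the color and cr->sim_nodeid the
+     * key: ranks that share a simulation index end up in the same
+     * mpi_comm_mysim communicator of nnodes/nsim ranks, ordered by
+     * their per-simulation rank. */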
+
+    if (!opt2bSet("-cpi",
+                  static_cast<int>(filenames.size()), filenames.data()))
+    {
+        // If we are not starting from a checkpoint we never allow files to be appended
+        // to, since that has caused a ton of strange behaviour and bugs in the past.
+        if (opt2parg_bSet("-append", asize(pa), pa))
+        {
+            // If the user explicitly used the -append option, explain that it is not possible.
+            gmx_fatal(FARGS, "GROMACS can only append to files when restarting from a checkpoint.");
+        }
+        else
+        {
+            // If the user did not say anything explicit, just disable appending.
+            bTryToAppendFiles = FALSE;
+        }
+    }
+
+    ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions;
+
+    continuationOptions.appendFilesOptionSet = opt2parg_bSet("-append", asize(pa), pa);
+
+    handleRestart(cr, ms, bTryToAppendFiles,
+                  static_cast<int>(filenames.size()),
+                  filenames.data(),
+                  &continuationOptions.appendFiles,
+                  &continuationOptions.startedFromCheckpoint);
+
+    mdrunOptions.rerun            = opt2bSet("-rerun",
+                                             static_cast<int>(filenames.size()),
+                                             filenames.data());
+    mdrunOptions.ntompOptionIsSet = opt2parg_bSet("-ntomp", asize(pa), pa);
+
+    domdecOptions.rankOrder    = static_cast<DdRankOrder>(nenum(ddrank_opt_choices));
+    domdecOptions.dlbOption    = static_cast<DlbOption>(nenum(dddlb_opt_choices));
+    domdecOptions.numCells[XX] = roundToInt(realddxyz[XX]);
+    domdecOptions.numCells[YY] = roundToInt(realddxyz[YY]);
+    domdecOptions.numCells[ZZ] = roundToInt(realddxyz[ZZ]);
+
+    return 1;
+}
+
+LegacyMdrunOptions::~LegacyMdrunOptions()
+{
+    if (GMX_LIB_MPI)
+    {
+        done_commrec(cr);
+    }
+    done_multisim(ms);
+}
+
+} // namespace gmx
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.h b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..88520e58be55f24710cedde0e53bd016d7301c35
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.h
@@ -0,0 +1,301 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief This file declares helper functionality for legacy option handling for mdrun
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_mdrun
+ * \inlibraryapi
+ */
+#ifndef GMX_MDRUN_LEGACYMDRUNOPTIONS_H
+#define GMX_MDRUN_LEGACYMDRUNOPTIONS_H
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/commandline/pargs.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdrun/logging.h"
+
+#include "replicaexchange.h"
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain; 
+extern void(*plumedcmd)(plumed,const char*,const void*);
+/* END PLUMED */
+
+/* PLUMED HREX */
+int plumed_hrex;
+/* END PLUMED HREX */
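+/* plumedcmd stores the address of plumed_cmd() so that translation units
+ * that only see the extern declarations above can still issue PLUMED
+ * commands through one shared entry point; it is assigned in
+ * legacymdrunoptions.cpp once -plumed is given. Note that plumed_hrex is
+ * defined (not merely declared) here, so this header is expected to be
+ * included by exactly one translation unit. */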
+
+struct gmx_multisim_t;
+
+namespace gmx
+{
+
+/*! \libinternal
+ * \brief This class provides the same command-line option
+ * functionality to both CLI and API sessions.
+ *
+ * This class should not exist, but is necessary now to introduce
+ * support for the CLI and API without duplicating code. It should be
+ * eliminated following the TODOs below.
+ *
+ * \todo Modules in mdrun should acquire proper option handling so
+ * that all of these declarations and defaults are local to the
+ * modules.
+ *
+ * \todo Contextual aspects, such as working directory, MPI
+ * environment, and environment variable handling are more properly
+ * the role of SimulationContext, and should be moved there */
+class LegacyMdrunOptions
+{
+    public:
+        //! Ongoing collection of mdrun options
+        MdrunOptions                     mdrunOptions;
+        //! Options for the domain decomposition.
+        DomdecOptions                    domdecOptions;
+        //! Parallelism-related user options.
+        gmx_hw_opt_t                     hw_opt;
+        //! Command-line override for the duration of a neighbor list with the Verlet scheme.
+        int                              nstlist_cmdline = 0;
+        //! Parameters for replica-exchange simulations.
+        ReplicaExchangeParameters        replExParams;
+
+        //! Filename options to fill from command-line argument values.
+        std::vector<t_filenm> filenames =
+        {{{ efTPR, nullptr,     nullptr,     ffREAD },
+          { efTRN, "-o",        nullptr,     ffWRITE },
+          { efCOMPRESSED, "-x", nullptr,     ffOPTWR },
+          { efCPT, "-cpi",      nullptr,     ffOPTRD | ffALLOW_MISSING },
+          { efCPT, "-cpo",      nullptr,     ffOPTWR },
+          { efSTO, "-c",        "confout",   ffWRITE },
+          { efEDR, "-e",        "ener",      ffWRITE },
+          { efLOG, "-g",        "md",        ffWRITE },
+          { efXVG, "-dhdl",     "dhdl",      ffOPTWR },
+          { efXVG, "-field",    "field",     ffOPTWR },
+          { efXVG, "-table",    "table",     ffOPTRD },
+          { efXVG, "-tablep",   "tablep",    ffOPTRD },
+          { efXVG, "-tableb",   "table",     ffOPTRDMULT },
+          { efTRX, "-rerun",    "rerun",     ffOPTRD },
+          { efXVG, "-tpi",      "tpi",       ffOPTWR },
+          { efXVG, "-tpid",     "tpidist",   ffOPTWR },
+          { efEDI, "-ei",       "sam",       ffOPTRD },
+          { efXVG, "-eo",       "edsam",     ffOPTWR },
+          { efXVG, "-devout",   "deviatie",  ffOPTWR },
+          { efXVG, "-runav",    "runaver",   ffOPTWR },
+          { efXVG, "-px",       "pullx",     ffOPTWR },
+          { efXVG, "-pf",       "pullf",     ffOPTWR },
+          { efXVG, "-ro",       "rotation",  ffOPTWR },
+          { efLOG, "-ra",       "rotangles", ffOPTWR },
+          { efLOG, "-rs",       "rotslabs",  ffOPTWR },
+          { efLOG, "-rt",       "rottorque", ffOPTWR },
+          { efMTX, "-mtx",      "nm",        ffOPTWR },
+          { efRND, "-multidir", nullptr,     ffOPTRDMULT},
+          { efXVG, "-awh",      "awhinit",   ffOPTRD },
+          { efDAT, "-plumed",   "plumed",    ffOPTRD },   /* PLUMED */
+          { efDAT, "-membed",   "membed",    ffOPTRD },
+          { efTOP, "-mp",       "membed",    ffOPTRD },
+          { efNDX, "-mn",       "membed",    ffOPTRD },
+          { efXVG, "-if",       "imdforces", ffOPTWR },
+          { efXVG, "-swap",     "swapions",  ffOPTWR }}};
+
+        //! Print a warning if any force is larger than this (in kJ/mol nm).
+        real                             pforce = -1;
+
+        /*! \brief Output context for writing text files
+         *
+         * \todo Clarify initialization, ownership, and lifetime. */
+        gmx_output_env_t                *oenv = nullptr;
+
+        //! Handle to file used for logging.
+        LogFilePtr logFileGuard = nullptr;
+
+        /*! \brief Command line options, defaults, docs and storage for them to fill. */
+        /*! \{ */
+        rvec              realddxyz                                               = {0, 0, 0};
+        const char       *ddrank_opt_choices[static_cast<int>(DdRankOrder::nr)+1] =
+        { nullptr, "interleave", "pp_pme", "cartesian", nullptr };
+        const char       *dddlb_opt_choices[static_cast<int>(DlbOption::nr)+1] =
+        { nullptr, "auto", "no", "yes", nullptr };
+        const char       *thread_aff_opt_choices[threadaffNR+1] =
+        { nullptr, "auto", "on", "off", nullptr };
+        const char       *nbpu_opt_choices[5] =
+        { nullptr, "auto", "cpu", "gpu", nullptr };
+        const char       *pme_opt_choices[5] =
+        { nullptr, "auto", "cpu", "gpu", nullptr };
+        const char       *pme_fft_opt_choices[5] =
+        { nullptr, "auto", "cpu", "gpu", nullptr };
+        const char       *bonded_opt_choices[5] =
+        { nullptr, "auto", "cpu", "gpu", nullptr };
+        gmx_bool          bTryToAppendFiles     = TRUE;
+        const char       *gpuIdsAvailable       = "";
+        const char       *userGpuTaskAssignment = "";
+
+        ImdOptions       &imdOptions = mdrunOptions.imdOptions;
+
+        t_pargs           pa[49] = {
+
+            { "-dd",      FALSE, etRVEC, {&realddxyz},
+              "Domain decomposition grid, 0 is optimize" },
+            { "-ddorder", FALSE, etENUM, {ddrank_opt_choices},
+              "DD rank order" },
+            { "-npme",    FALSE, etINT, {&domdecOptions.numPmeRanks},
+              "Number of separate ranks to be used for PME, -1 is guess" },
+            { "-nt",      FALSE, etINT, {&hw_opt.nthreads_tot},
+              "Total number of threads to start (0 is guess)" },
+            { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
+              "Number of thread-MPI ranks to start (0 is guess)" },
+            { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
+              "Number of OpenMP threads per MPI rank to start (0 is guess)" },
+            { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
+              "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" },
+            { "-pin",     FALSE, etENUM, {thread_aff_opt_choices},
+              "Whether mdrun should try to set thread affinities" },
+            { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset},
+              "The lowest logical core number to which mdrun should pin the first thread" },
+            { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
+              "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
+            { "-gpu_id",  FALSE, etSTR, {&gpuIdsAvailable},
+              "List of unique GPU device IDs available to use" },
+            { "-gputasks",  FALSE, etSTR, {&userGpuTaskAssignment},
+              "List of GPU device IDs, mapping each PP task on each node to a device" },
+            { "-ddcheck", FALSE, etBOOL, {&domdecOptions.checkBondedInteractions},
+              "Check for all bonded interactions with DD" },
+            { "-ddbondcomm", FALSE, etBOOL, {&domdecOptions.useBondedCommunication},
+              "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
+            { "-rdd",     FALSE, etREAL, {&domdecOptions.minimumCommunicationRange},
+              "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" },
+            { "-rcon",    FALSE, etREAL, {&domdecOptions.constraintCommunicationRange},
+              "Maximum distance for P-LINCS (nm), 0 is estimate" },
+            { "-dlb",     FALSE, etENUM, {dddlb_opt_choices},
+              "Dynamic load balancing (with DD)" },
+            { "-dds",     FALSE, etREAL, {&domdecOptions.dlbScaling},
+              "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to "
+              "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." },
+            { "-ddcsx",   FALSE, etSTR, {&domdecOptions.cellSizeX},
+              "HIDDENA string containing a vector of the relative sizes in the x "
+              "direction of the corresponding DD cells. Only effective with static "
+              "load balancing." },
+            { "-ddcsy",   FALSE, etSTR, {&domdecOptions.cellSizeY},
+              "HIDDENA string containing a vector of the relative sizes in the y "
+              "direction of the corresponding DD cells. Only effective with static "
+              "load balancing." },
+            { "-ddcsz",   FALSE, etSTR, {&domdecOptions.cellSizeZ},
+              "HIDDENA string containing a vector of the relative sizes in the z "
+              "direction of the corresponding DD cells. Only effective with static "
+              "load balancing." },
+            { "-gcom",    FALSE, etINT, {&mdrunOptions.globalCommunicationInterval},
+              "Global communication frequency" },
+            { "-nb",      FALSE, etENUM, {nbpu_opt_choices},
+              "Calculate non-bonded interactions on" },
+            { "-nstlist", FALSE, etINT, {&nstlist_cmdline},
+              "Set nstlist when using a Verlet buffer tolerance (0 is guess)" },
+            { "-tunepme", FALSE, etBOOL, {&mdrunOptions.tunePme},
+              "Optimize PME load between PP/PME ranks or GPU/CPU (only with the Verlet cut-off scheme)" },
+            { "-pme",     FALSE, etENUM, {pme_opt_choices},
+              "Perform PME calculations on" },
+            { "-pmefft", FALSE, etENUM, {pme_fft_opt_choices},
+              "Perform PME FFT calculations on" },
+            { "-bonded",     FALSE, etENUM, {bonded_opt_choices},
+              "Perform bonded calculations on" },
+            { "-v",       FALSE, etBOOL, {&mdrunOptions.verbose},
+              "Be loud and noisy" },
+            { "-pforce",  FALSE, etREAL, {&pforce},
+              "Print all forces larger than this (kJ/mol nm)" },
+            { "-reprod",  FALSE, etBOOL, {&mdrunOptions.reproducible},
+              "Try to avoid optimizations that affect binary reproducibility" },
+            { "-cpt",     FALSE, etREAL, {&mdrunOptions.checkpointOptions.period},
+              "Checkpoint interval (minutes)" },
+            { "-cpnum",   FALSE, etBOOL, {&mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles},
+              "Keep and number checkpoint files" },
+            { "-append",  FALSE, etBOOL, {&bTryToAppendFiles},
+              "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
+            { "-nsteps",  FALSE, etINT64, {&mdrunOptions.numStepsCommandline},
+              "Run this number of steps, overrides .mdp file option (-1 means infinite, -2 means use mdp option, smaller is invalid)" },
+            { "-maxh",   FALSE, etREAL, {&mdrunOptions.maximumHoursToRun},
+              "Terminate after 0.99 times this time (hours)" },
+            { "-replex",  FALSE, etINT, {&replExParams.exchangeInterval},
+              "Attempt replica exchange periodically with this period (steps)" },
+            { "-nex",  FALSE, etINT, {&replExParams.numExchanges},
+              "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion).  -nex zero or not specified gives neighbor replica exchange." },
+            { "-reseed",  FALSE, etINT, {&replExParams.randomSeed},
+              "Seed for replica exchange, -1 is generate a seed" },
+            { "-hrex",  FALSE, etBOOL, {&plumed_hrex}, /* PLUMED HREX */
+              "Enable hamiltonian replica exchange" },
+            { "-imdport",    FALSE, etINT, {&imdOptions.port},
+              "HIDDENIMD listening port" },
+            { "-imdwait",  FALSE, etBOOL, {&imdOptions.wait},
+              "HIDDENPause the simulation while no IMD client is connected" },
+            { "-imdterm",  FALSE, etBOOL, {&imdOptions.terminatable},
+              "HIDDENAllow termination of the simulation from IMD client" },
+            { "-imdpull",  FALSE, etBOOL, {&imdOptions.pull},
+              "HIDDENAllow pulling in the simulation from IMD client" },
+            { "-rerunvsite", FALSE, etBOOL, {&mdrunOptions.rerunConstructVsites},
+              "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
+            { "-confout", FALSE, etBOOL, {&mdrunOptions.writeConfout},
+              "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" },
+            { "-stepout", FALSE, etINT, {&mdrunOptions.verboseStepPrintInterval},
+              "HIDDENFrequency of writing the remaining wall clock time for the run" },
+            { "-resetstep", FALSE, etINT, {&mdrunOptions.timingOptions.resetStep},
+              "HIDDENReset cycle counters after these many time steps" },
+            { "-resethway", FALSE, etBOOL, {&mdrunOptions.timingOptions.resetHalfway},
+              "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" }
+        };
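+        /* The array above holds 49 entries rather than the stock 48:
+         * the PLUMED patch adds the -hrex flag to the standard mdrun
+         * options (compare legacymdrunoptions.h.preplumed). */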
+        /*! \} */
+
+        //! Handle to communication object.
+        t_commrec        *cr = nullptr;
+        //! Multi-simulation object.
+        gmx_multisim_t   *ms = nullptr;
+
+        //! Parses the command-line input and prepares to start mdrun.
+        int updateFromCommandLine(int argc, char **argv, ArrayRef<const char *> desc);
+
+        ~LegacyMdrunOptions();
+};
+
+} // end namespace gmx
+
+#endif
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..637cd2a6b93dabe9fd632d92e30794922c3a8fec
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed
@@ -0,0 +1,287 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief This file declares helper functionality for legacy option handling for mdrun
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_mdrun
+ * \inlibraryapi
+ */
+#ifndef GMX_MDRUN_LEGACYMDRUNOPTIONS_H
+#define GMX_MDRUN_LEGACYMDRUNOPTIONS_H
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/commandline/pargs.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdrun/logging.h"
+
+#include "replicaexchange.h"
+
+struct gmx_multisim_t;
+
+namespace gmx
+{
+
+/*! \libinternal
+ * \brief This class provides the same command-line option
+ * functionality to both CLI and API sessions.
+ *
+ * This class should not exist, but is necessary now to introduce
+ * support for the CLI and API without duplicating code. It should be
+ * eliminated following the TODOs below.
+ *
+ * \todo Modules in mdrun should acquire proper option handling so
+ * that all of these declarations and defaults are local to the
+ * modules.
+ *
+ * \todo Contextual aspects, such as working directory, MPI
+ * environment, and environment variable handling are more properly
+ * the role of SimulationContext, and should be moved there */
+class LegacyMdrunOptions
+{
+    public:
+        //! Ongoing collection of mdrun options
+        MdrunOptions                     mdrunOptions;
+        //! Options for the domain decomposition.
+        DomdecOptions                    domdecOptions;
+        //! Parallelism-related user options.
+        gmx_hw_opt_t                     hw_opt;
+        //! Command-line override for the duration of a neighbor list with the Verlet scheme.
+        int                              nstlist_cmdline = 0;
+        //! Parameters for replica-exchange simulations.
+        ReplicaExchangeParameters        replExParams;
+
+        //! Filename options to fill from command-line argument values.
+        std::vector<t_filenm> filenames =
+        {{{ efTPR, nullptr,     nullptr,     ffREAD },
+          { efTRN, "-o",        nullptr,     ffWRITE },
+          { efCOMPRESSED, "-x", nullptr,     ffOPTWR },
+          { efCPT, "-cpi",      nullptr,     ffOPTRD | ffALLOW_MISSING },
+          { efCPT, "-cpo",      nullptr,     ffOPTWR },
+          { efSTO, "-c",        "confout",   ffWRITE },
+          { efEDR, "-e",        "ener",      ffWRITE },
+          { efLOG, "-g",        "md",        ffWRITE },
+          { efXVG, "-dhdl",     "dhdl",      ffOPTWR },
+          { efXVG, "-field",    "field",     ffOPTWR },
+          { efXVG, "-table",    "table",     ffOPTRD },
+          { efXVG, "-tablep",   "tablep",    ffOPTRD },
+          { efXVG, "-tableb",   "table",     ffOPTRDMULT },
+          { efTRX, "-rerun",    "rerun",     ffOPTRD },
+          { efXVG, "-tpi",      "tpi",       ffOPTWR },
+          { efXVG, "-tpid",     "tpidist",   ffOPTWR },
+          { efEDI, "-ei",       "sam",       ffOPTRD },
+          { efXVG, "-eo",       "edsam",     ffOPTWR },
+          { efXVG, "-devout",   "deviatie",  ffOPTWR },
+          { efXVG, "-runav",    "runaver",   ffOPTWR },
+          { efXVG, "-px",       "pullx",     ffOPTWR },
+          { efXVG, "-pf",       "pullf",     ffOPTWR },
+          { efXVG, "-ro",       "rotation",  ffOPTWR },
+          { efLOG, "-ra",       "rotangles", ffOPTWR },
+          { efLOG, "-rs",       "rotslabs",  ffOPTWR },
+          { efLOG, "-rt",       "rottorque", ffOPTWR },
+          { efMTX, "-mtx",      "nm",        ffOPTWR },
+          { efRND, "-multidir", nullptr,     ffOPTRDMULT},
+          { efXVG, "-awh",      "awhinit",   ffOPTRD },
+          { efDAT, "-membed",   "membed",    ffOPTRD },
+          { efTOP, "-mp",       "membed",    ffOPTRD },
+          { efNDX, "-mn",       "membed",    ffOPTRD },
+          { efXVG, "-if",       "imdforces", ffOPTWR },
+          { efXVG, "-swap",     "swapions",  ffOPTWR }}};
+
+        //! Print a warning if any force is larger than this (in kJ/mol nm).
+        real                             pforce = -1;
+
+        /*! \brief Output context for writing text files
+         *
+         * \todo Clarify initialization, ownership, and lifetime. */
+        gmx_output_env_t                *oenv = nullptr;
+
+        //! Handle to file used for logging.
+        LogFilePtr logFileGuard = nullptr;
+
+        /*! \brief Command line options, defaults, docs and storage for them to fill. */
+        /*! \{ */
+        rvec              realddxyz                                               = {0, 0, 0};
+        const char       *ddrank_opt_choices[static_cast<int>(DdRankOrder::nr)+1] =
+        { nullptr, "interleave", "pp_pme", "cartesian", nullptr };
+        const char       *dddlb_opt_choices[static_cast<int>(DlbOption::nr)+1] =
+        { nullptr, "auto", "no", "yes", nullptr };
+        const char       *thread_aff_opt_choices[threadaffNR+1] =
+        { nullptr, "auto", "on", "off", nullptr };
+        const char       *nbpu_opt_choices[5] =
+        { nullptr, "auto", "cpu", "gpu", nullptr };
+        const char       *pme_opt_choices[5] =
+        { nullptr, "auto", "cpu", "gpu", nullptr };
+        const char       *pme_fft_opt_choices[5] =
+        { nullptr, "auto", "cpu", "gpu", nullptr };
+        const char       *bonded_opt_choices[5] =
+        { nullptr, "auto", "cpu", "gpu", nullptr };
+        gmx_bool          bTryToAppendFiles     = TRUE;
+        const char       *gpuIdsAvailable       = "";
+        const char       *userGpuTaskAssignment = "";
+
+        ImdOptions       &imdOptions = mdrunOptions.imdOptions;
+
+        t_pargs           pa[48] = {
+
+            { "-dd",      FALSE, etRVEC, {&realddxyz},
+              "Domain decomposition grid, 0 is optimize" },
+            { "-ddorder", FALSE, etENUM, {ddrank_opt_choices},
+              "DD rank order" },
+            { "-npme",    FALSE, etINT, {&domdecOptions.numPmeRanks},
+              "Number of separate ranks to be used for PME, -1 is guess" },
+            { "-nt",      FALSE, etINT, {&hw_opt.nthreads_tot},
+              "Total number of threads to start (0 is guess)" },
+            { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
+              "Number of thread-MPI ranks to start (0 is guess)" },
+            { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
+              "Number of OpenMP threads per MPI rank to start (0 is guess)" },
+            { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
+              "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" },
+            { "-pin",     FALSE, etENUM, {thread_aff_opt_choices},
+              "Whether mdrun should try to set thread affinities" },
+            { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset},
+              "The lowest logical core number to which mdrun should pin the first thread" },
+            { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
+              "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
+            { "-gpu_id",  FALSE, etSTR, {&gpuIdsAvailable},
+              "List of unique GPU device IDs available to use" },
+            { "-gputasks",  FALSE, etSTR, {&userGpuTaskAssignment},
+              "List of GPU device IDs, mapping each PP task on each node to a device" },
+            { "-ddcheck", FALSE, etBOOL, {&domdecOptions.checkBondedInteractions},
+              "Check for all bonded interactions with DD" },
+            { "-ddbondcomm", FALSE, etBOOL, {&domdecOptions.useBondedCommunication},
+              "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
+            { "-rdd",     FALSE, etREAL, {&domdecOptions.minimumCommunicationRange},
+              "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" },
+            { "-rcon",    FALSE, etREAL, {&domdecOptions.constraintCommunicationRange},
+              "Maximum distance for P-LINCS (nm), 0 is estimate" },
+            { "-dlb",     FALSE, etENUM, {dddlb_opt_choices},
+              "Dynamic load balancing (with DD)" },
+            { "-dds",     FALSE, etREAL, {&domdecOptions.dlbScaling},
+              "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to "
+              "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." },
+            { "-ddcsx",   FALSE, etSTR, {&domdecOptions.cellSizeX},
+              "HIDDENA string containing a vector of the relative sizes in the x "
+              "direction of the corresponding DD cells. Only effective with static "
+              "load balancing." },
+            { "-ddcsy",   FALSE, etSTR, {&domdecOptions.cellSizeY},
+              "HIDDENA string containing a vector of the relative sizes in the y "
+              "direction of the corresponding DD cells. Only effective with static "
+              "load balancing." },
+            { "-ddcsz",   FALSE, etSTR, {&domdecOptions.cellSizeZ},
+              "HIDDENA string containing a vector of the relative sizes in the z "
+              "direction of the corresponding DD cells. Only effective with static "
+              "load balancing." },
+            { "-gcom",    FALSE, etINT, {&mdrunOptions.globalCommunicationInterval},
+              "Global communication frequency" },
+            { "-nb",      FALSE, etENUM, {nbpu_opt_choices},
+              "Calculate non-bonded interactions on" },
+            { "-nstlist", FALSE, etINT, {&nstlist_cmdline},
+              "Set nstlist when using a Verlet buffer tolerance (0 is guess)" },
+            { "-tunepme", FALSE, etBOOL, {&mdrunOptions.tunePme},
+              "Optimize PME load between PP/PME ranks or GPU/CPU (only with the Verlet cut-off scheme)" },
+            { "-pme",     FALSE, etENUM, {pme_opt_choices},
+              "Perform PME calculations on" },
+            { "-pmefft", FALSE, etENUM, {pme_fft_opt_choices},
+              "Perform PME FFT calculations on" },
+            { "-bonded",     FALSE, etENUM, {bonded_opt_choices},
+              "Perform bonded calculations on" },
+            { "-v",       FALSE, etBOOL, {&mdrunOptions.verbose},
+              "Be loud and noisy" },
+            { "-pforce",  FALSE, etREAL, {&pforce},
+              "Print all forces larger than this (kJ/mol nm)" },
+            { "-reprod",  FALSE, etBOOL, {&mdrunOptions.reproducible},
+              "Try to avoid optimizations that affect binary reproducibility" },
+            { "-cpt",     FALSE, etREAL, {&mdrunOptions.checkpointOptions.period},
+              "Checkpoint interval (minutes)" },
+            { "-cpnum",   FALSE, etBOOL, {&mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles},
+              "Keep and number checkpoint files" },
+            { "-append",  FALSE, etBOOL, {&bTryToAppendFiles},
+              "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
+            { "-nsteps",  FALSE, etINT64, {&mdrunOptions.numStepsCommandline},
+              "Run this number of steps, overrides .mdp file option (-1 means infinite, -2 means use mdp option, smaller is invalid)" },
+            { "-maxh",   FALSE, etREAL, {&mdrunOptions.maximumHoursToRun},
+              "Terminate after 0.99 times this time (hours)" },
+            { "-replex",  FALSE, etINT, {&replExParams.exchangeInterval},
+              "Attempt replica exchange periodically with this period (steps)" },
+            { "-nex",  FALSE, etINT, {&replExParams.numExchanges},
+              "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion).  -nex zero or not specified gives neighbor replica exchange." },
+            { "-reseed",  FALSE, etINT, {&replExParams.randomSeed},
+              "Seed for replica exchange, -1 is generate a seed" },
+            { "-imdport",    FALSE, etINT, {&imdOptions.port},
+              "HIDDENIMD listening port" },
+            { "-imdwait",  FALSE, etBOOL, {&imdOptions.wait},
+              "HIDDENPause the simulation while no IMD client is connected" },
+            { "-imdterm",  FALSE, etBOOL, {&imdOptions.terminatable},
+              "HIDDENAllow termination of the simulation from IMD client" },
+            { "-imdpull",  FALSE, etBOOL, {&imdOptions.pull},
+              "HIDDENAllow pulling in the simulation from IMD client" },
+            { "-rerunvsite", FALSE, etBOOL, {&mdrunOptions.rerunConstructVsites},
+              "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
+            { "-confout", FALSE, etBOOL, {&mdrunOptions.writeConfout},
+              "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" },
+            { "-stepout", FALSE, etINT, {&mdrunOptions.verboseStepPrintInterval},
+              "HIDDENFrequency of writing the remaining wall clock time for the run" },
+            { "-resetstep", FALSE, etINT, {&mdrunOptions.timingOptions.resetStep},
+              "HIDDENReset cycle counters after these many time steps" },
+            { "-resethway", FALSE, etBOOL, {&mdrunOptions.timingOptions.resetHalfway},
+              "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" }
+        };
+        /*! \} */
+
+        //! Handle to communication object.
+        t_commrec        *cr = nullptr;
+        //! Multi-simulation object.
+        gmx_multisim_t   *ms = nullptr;
+
+        //! Parses the command-line input and prepares to start mdrun.
+        int updateFromCommandLine(int argc, char **argv, ArrayRef<const char *> desc);
+
+        ~LegacyMdrunOptions();
+};
+
+} // end namespace gmx
+
+#endif
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/md.cpp b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/md.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..887526a8d1dff7bdb05905ed784c1e500086501c
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/md.cpp
@@ -0,0 +1,1728 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the integrator for normal molecular dynamics simulations
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+
+#include <algorithm>
+#include <memory>
+
+#include "gromacs/awh/awh.h"
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/compat/make_unique.h"
+#include "gromacs/domdec/collect.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_network.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/partition.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-load-balancing.h"
+#include "gromacs/fileio/trxio.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdlib/checkpointhandler.h"
+#include "gromacs/mdlib/compute_io.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/ebin.h"
+#include "gromacs/mdlib/expanded.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdebin.h"
+#include "gromacs/mdlib/mdoutf.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/mdsetup.h"
+#include "gromacs/mdlib/membed.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/resethandler.h"
+#include "gromacs/mdlib/shellfc.h"
+#include "gromacs/mdlib/sighandler.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/simulationsignal.h"
+#include "gromacs/mdlib/stophandler.h"
+#include "gromacs/mdlib/tgroup.h"
+#include "gromacs/mdlib/trajectory_writing.h"
+#include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vcm.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdtypes/awh-history.h"
+#include "gromacs/mdtypes/awh-params.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/df_history.h"
+#include "gromacs/mdtypes/energyhistory.h"
+#include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/forcerec.h"
+#include "gromacs/mdtypes/group.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/mdtypes/observableshistory.h"
+#include "gromacs/mdtypes/pullhistory.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/output.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/atoms.h"
+#include "gromacs/topology/idef.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/logger.h"
+#include "gromacs/utility/real.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "integrator.h"
+#include "replicaexchange.h"
+
+#if GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+/* END PLUMED */
+
+/* PLUMED HREX */
+extern int plumed_hrex;
+/* END PLUMED HREX */
+
+using gmx::SimulationSignaller;
+
+void gmx::Integrator::do_md()
+{
+    // TODO Historically, the EM and MD "integrators" used different
+    // names for the t_inputrec *parameter, but these must have the
+    // same name, now that it's a member of a struct. We use this ir
+    // alias to avoid a large ripple of nearly useless changes.
+    // t_inputrec is being replaced by IMdpOptionsProvider, so this
+    // will go away eventually.
+    t_inputrec             *ir   = inputrec;
+    gmx_mdoutf             *outf = nullptr;
+    int64_t                 step, step_rel;
+    double                  t, t0, lam0[efptNR];
+    gmx_bool                bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
+    gmx_bool                bNS, bNStList, bSimAnn, bStopCM,
+                            bFirstStep, bInitStep, bLastStep = FALSE;
+    gmx_bool                bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
+    gmx_bool                do_ene, do_log, do_verbose;
+    gmx_bool                bMasterState;
+    int                     force_flags, cglo_flags;
+    tensor                  force_vir, shake_vir, total_vir, tmp_vir, pres;
+    int                     i, m;
+    rvec                    mu_tot;
+    t_vcm                  *vcm;
+    matrix                  parrinellorahmanMu, M;
+    gmx_repl_ex_t           repl_ex = nullptr;
+    gmx_localtop_t         *top;
+    t_mdebin               *mdebin   = nullptr;
+    gmx_enerdata_t         *enerd;
+    PaddedVector<gmx::RVec> f {};
+    gmx_global_stat_t       gstat;
+    gmx_update_t           *upd   = nullptr;
+    t_graph                *graph = nullptr;
+    gmx_groups_t           *groups;
+    gmx_ekindata_t         *ekind;
+    gmx_shellfc_t          *shellfc;
+    gmx_bool                bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
+    gmx_bool                bTemp, bPres, bTrotter;
+    real                    dvdl_constr;
+    rvec                   *cbuf        = nullptr;
+    int                     cbuf_nalloc = 0;
+    matrix                  lastbox;
+    int                     lamnew  = 0;
+    /* for FEP */
+    int                     nstfep = 0;
+    double                  cycles;
+    real                    saved_conserved_quantity = 0;
+    real                    last_ekin                = 0;
+    t_extmass               MassQ;
+    int                   **trotter_seq;
+    char                    sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
+
+    /* PME load balancing data for GPU kernels */
+    pme_load_balancing_t *pme_loadbal      = nullptr;
+    gmx_bool              bPMETune         = FALSE;
+    gmx_bool              bPMETunePrinting = FALSE;
+
+    /* Interactive MD */
+    gmx_bool          bIMDstep = FALSE;
+
+    /* PLUMED */
+    int plumedNeedsEnergy=0;
+    int plumedWantsToStop=0;
+    matrix plumed_vir;
+    /* END PLUMED */
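+    /* These are filled in the main loop below (a sketch of the intent;
+     * the calls appear further down in this file): plumedNeedsEnergy
+     * records whether the PLUMED input requires the potential energy this
+     * step, plumedWantsToStop is registered with PLUMED so it can request
+     * that the run end, and plumed_vir accumulates PLUMED's virial
+     * contribution. */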
+
+    /* Domain decomposition could incorrectly miss a bonded
+       interaction, but checking for that requires a global
+       communication stage, which does not otherwise happen in DD
+       code. So we do that alongside the first global energy reduction
+       after a new DD is made. These variables handle whether the
+       check happens, and the result it returns. */
+    bool              shouldCheckNumberOfBondedInteractions = false;
+    int               totalNumberOfBondedInteractions       = -1;
+
+    SimulationSignals signals;
+    // Most global communication stages don't propagate mdrun
+    // signals, and will use this object to achieve that.
+    SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
+
+    if (!mdrunOptions.writeConfout)
+    {
+        // This is on by default, and the main known use case for
+        // turning it off is for convenience in benchmarking, which is
+        // something that should not show up in the general user
+        // interface.
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -noconfout functionality is deprecated, and may be removed in a future version.");
+    }
+
+    /* md-vv uses averaged full step velocities for T-control
+       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+    bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
+
+    const bool bRerunMD      = false;
+    int        nstglobalcomm = mdrunOptions.globalCommunicationInterval;
+
+    nstglobalcomm   = check_nstglobalcomm(mdlog, nstglobalcomm, ir, cr);
+    bGStatEveryStep = (nstglobalcomm == 1);
+
+    groups = &top_global->groups;
+
+    std::unique_ptr<EssentialDynamics> ed = nullptr;
+    if (opt2bSet("-ei", nfile, fnm) || observablesHistory->edsamHistory != nullptr)
+    {
+        /* Initialize essential dynamics sampling */
+        ed = init_edsam(mdlog,
+                        opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm),
+                        top_global,
+                        ir, cr, constr,
+                        state_global, observablesHistory,
+                        oenv, mdrunOptions.continuationOptions.appendFiles);
+    }
+
+    /* Initial values */
+    init_md(fplog, cr, outputProvider, ir, oenv, mdrunOptions,
+            &t, &t0, state_global, lam0,
+            nrnb, top_global, &upd, deform,
+            nfile, fnm, &outf, &mdebin,
+            force_vir, shake_vir, total_vir, pres, mu_tot, &bSimAnn, &vcm, wcycle);
+
+    /* Energy terms and groups */
+    snew(enerd, 1);
+    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+                  enerd);
+
+    /* Kinetic energy data */
+    snew(ekind, 1);
+    init_ekindata(fplog, top_global, &(ir->opts), ekind);
+    /* Copy the cos acceleration to the groups struct */
+    ekind->cosacc.cos_accel = ir->cos_accel;
+
+    gstat = global_stat_init(ir);
+
+    /* Check for polarizable models and flexible constraints */
+    shellfc = init_shell_flexcon(fplog,
+                                 top_global, constr ? constr->numFlexibleConstraints() : 0,
+                                 ir->nstcalcenergy, DOMAINDECOMP(cr));
+
+    {
+        double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+        if ((io > 2000) && MASTER(cr))
+        {
+            fprintf(stderr,
+                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+                    io);
+        }
+    }
+
+    /* Set up interactive MD (IMD) */
+    init_IMD(ir, cr, ms, top_global, fplog, ir->nstcalcenergy,
+             MASTER(cr) ? state_global->x.rvec_array() : nullptr,
+             nfile, fnm, oenv, mdrunOptions);
+
+    // Local state only becomes valid now.
+    std::unique_ptr<t_state> stateInstance;
+    t_state *                state;
+
+    if (DOMAINDECOMP(cr))
+    {
+        top = dd_init_local_top(top_global);
+
+        stateInstance = compat::make_unique<t_state>();
+        state         = stateInstance.get();
+        dd_init_local_state(cr->dd, state_global, state);
+
+        /* Distribute the charge groups over the nodes from the master node */
+        dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1,
+                            state_global, top_global, ir,
+                            state, &f, mdAtoms, top, fr,
+                            vsite, constr,
+                            nrnb, nullptr, FALSE);
+        shouldCheckNumberOfBondedInteractions = true;
+        update_realloc(upd, state->natoms);
+    }
+    else
+    {
+        state_change_natoms(state_global, state_global->natoms);
+        f.resizeWithPadding(state_global->natoms);
+        /* Copy the pointer to the global state */
+        state = state_global;
+
+        snew(top, 1);
+        mdAlgorithmsSetupAtomData(cr, ir, top_global, top, fr,
+                                  &graph, mdAtoms, constr, vsite, shellfc);
+
+        update_realloc(upd, state->natoms);
+    }
+
+    auto mdatoms = mdAtoms->mdatoms();
+
+    // NOTE: The global state is no longer used at this point.
+    // But state_global is still used as temporary storage space for writing
+    // the global state to file and potentially for replica exchange.
+    // (Global topology should persist.)
+
+    update_mdatoms(mdatoms, state->lambda[efptMASS]);
+
+    const ContinuationOptions &continuationOptions    = mdrunOptions.continuationOptions;
+    bool                       startingFromCheckpoint = continuationOptions.startedFromCheckpoint;
+
+    if (ir->bExpanded)
+    {
+        /* Check nstexpanded here, because the grompp check was broken */
+        if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
+        {
+            gmx_fatal(FARGS, "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
+        }
+        init_expanded_ensemble(startingFromCheckpoint, ir, state->dfhist);
+    }
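+    /* For example, nstexpanded = 100 with nstcalcenergy = 20 passes the
+     * check above (100 % 20 == 0), whereas nstcalcenergy = 30 would
+     * trigger the fatal error. */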
+
+    if (MASTER(cr))
+    {
+        if (startingFromCheckpoint)
+        {
+            /* Update mdebin with energy history if appending to output files */
+            if (continuationOptions.appendFiles)
+            {
+                /* If no history is available (because a checkpoint is from before
+                 * it was written) make a new one later, otherwise restore it.
+                 */
+                if (observablesHistory->energyHistory)
+                {
+                    restore_energyhistory_from_state(mdebin, observablesHistory->energyHistory.get());
+                }
+            }
+            else if (observablesHistory->energyHistory)
+            {
+                /* We might have read an energy history from checkpoint.
+                 * As we are not appending, we want to restart the statistics.
+                 * Free the allocated memory and reset the counts.
+                 */
+                observablesHistory->energyHistory = {};
+                /* We might have read a pull history from checkpoint.
+                 * We will still want to keep the statistics, so that the files
+                 * can be joined and still be meaningful.
+                 * This means that observablesHistory->pullHistory
+                 * should not be reset.
+                 */
+            }
+        }
+        if (!observablesHistory->energyHistory)
+        {
+            observablesHistory->energyHistory = compat::make_unique<energyhistory_t>();
+        }
+        if (!observablesHistory->pullHistory)
+        {
+            observablesHistory->pullHistory = compat::make_unique<PullHistory>();
+        }
+        /* Set the initial energy history in state by updating once */
+        update_energyhistory(observablesHistory->energyHistory.get(), mdebin);
+    }
+
+    preparePrevStepPullCom(ir, mdatoms, state, state_global, cr, startingFromCheckpoint);
+
+    // TODO: Remove this by converting AWH into a ForceProvider
+    auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms, startingFromCheckpoint,
+                                shellfc != nullptr,
+                                opt2fn("-awh", nfile, fnm), ir->pull_work);
+
+    const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
+    if (useReplicaExchange && MASTER(cr))
+    {
+        repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir,
+                                        replExParams);
+    }
+    /* PME tuning is only supported in the Verlet scheme, with PME for
+     * Coulomb. It is not supported with only LJ PME. */
+    bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) &&
+                !mdrunOptions.reproducible && ir->cutoff_scheme != ecutsGROUP);
+    if (bPMETune)
+    {
+        pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box,
+                         *fr->ic, *fr->nbv->listParams, fr->pmedata, use_GPU(fr->nbv),
+                         &bPMETunePrinting);
+    }
+
+    if (!ir->bContinuation)
+    {
+        if (state->flags & (1 << estV))
+        {
+            auto v = makeArrayRef(state->v);
+            /* Set the velocities of vsites, shells and frozen atoms to zero */
+            for (i = 0; i < mdatoms->homenr; i++)
+            {
+                if (mdatoms->ptype[i] == eptVSite ||
+                    mdatoms->ptype[i] == eptShell)
+                {
+                    clear_rvec(v[i]);
+                }
+                else if (mdatoms->cFREEZE)
+                {
+                    for (m = 0; m < DIM; m++)
+                    {
+                        if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+                        {
+                            v[i][m] = 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (constr)
+        {
+            /* Constrain the initial coordinates and velocities */
+            do_constrain_first(fplog, constr, ir, mdatoms, state);
+        }
+        if (vsite)
+        {
+            /* Construct the virtual sites for the initial configuration */
+            construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
+        }
+    }
+
+    if (ir->efep != efepNO)
+    {
+        /* Set the free energy calculation frequency as the greatest common
+         * divisor of nstdhdl, nstexpanded and the replica exchange interval. */
+        nstfep = ir->fepvals->nstdhdl;
+        if (ir->bExpanded)
+        {
+            nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
+        }
+        if (useReplicaExchange)
+        {
+            nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep);
+        }
+    }
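+    /* Example: with nstdhdl = 100 and a replica exchange interval of 40,
+     * nstfep = gcd(100, 40) = 20, so foreign energies are evaluated on every
+     * step where either consumer needs them. */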
+
+    /* Be REALLY careful about what flags you set here. You CANNOT assume
+     * this is the first step, since we might be restarting from a checkpoint,
+     * and in that case we should not do any modifications to the state.
+     */
+    bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
+
+    if (continuationOptions.haveReadEkin)
+    {
+        restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
+    }
+
+    cglo_flags = (CGLO_INITIALIZATION | CGLO_TEMPERATURE | CGLO_GSTAT
+                  | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
+                  | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0)
+                  | (continuationOptions.haveReadEkin ? CGLO_READEKIN : 0));
+
+    bSumEkinhOld = FALSE;
+    /* To minimize communication, compute_globals computes the COM velocity
+     * and the kinetic energy for the velocities without COM motion removed.
+     * Thus to get the kinetic energy without the COM contribution, we need
+     * to call compute_globals twice.
+     */
+    for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
+    {
+        int cglo_flags_iteration = cglo_flags;
+        if (bStopCM && cgloIteration == 0)
+        {
+            cglo_flags_iteration |= CGLO_STOPCM;
+            cglo_flags_iteration &= ~CGLO_TEMPERATURE;
+        }
+        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                        nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                        constr, &nullSignaller, state->box,
+                        &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags_iteration
+                        | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0));
+    }
+    checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
+                                    top_global, top, state,
+                                    &shouldCheckNumberOfBondedInteractions);
+    if (ir->eI == eiVVAK)
+    {
+        /* a second call to get the half-step temperature initialized as well */
+        /* we do the same call as above, but turn the pressure off -- internally to
+           compute_globals, this is recognized as a velocity-verlet half-step
+           kinetic energy calculation. This minimizes excess variables, but
+           perhaps loses some logic? */
+
+        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                        nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                        constr, &nullSignaller, state->box,
+                        nullptr, &bSumEkinhOld,
+                        cglo_flags & ~CGLO_PRESSURE);
+    }
+
+    /* Calculate the initial half step temperature, and save the ekinh_old */
+    if (!continuationOptions.startedFromCheckpoint)
+    {
+        for (i = 0; (i < ir->opts.ngtc); i++)
+        {
+            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+        }
+    }
+
+    /* We need to make an initialization call to set up the Trotter variables, as well as
+       other constants for non-Trotter temperature control */
+    trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
+
+    if (MASTER(cr))
+    {
+        if (!ir->bContinuation)
+        {
+            if (constr && ir->eConstrAlg == econtLINCS)
+            {
+                fprintf(fplog,
+                        "RMS relative constraint deviation after constraining: %.2e\n",
+                        constr->rmsd());
+            }
+            if (EI_STATE_VELOCITY(ir->eI))
+            {
+                real temp = enerd->term[F_TEMP];
+                if (ir->eI != eiVV)
+                {
+                    /* Result of Ekin averaged over velocities of -half
+                     * and +half step, while we only have -half step here.
+                     */
+                    temp *= 2;
+                }
+                fprintf(fplog, "Initial temperature: %g K\n", temp);
+            }
+        }
+
+        char tbuf[20];
+        fprintf(stderr, "starting mdrun '%s'\n",
+                *(top_global->name));
+        if (ir->nsteps >= 0)
+        {
+            sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
+        }
+        else
+        {
+            sprintf(tbuf, "%s", "infinite");
+        }
+        if (ir->init_step > 0)
+        {
+            fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+                    gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
+                    gmx_step_str(ir->init_step, sbuf2),
+                    ir->init_step*ir->delta_t);
+        }
+        else
+        {
+            fprintf(stderr, "%s steps, %s ps.\n",
+                    gmx_step_str(ir->nsteps, sbuf), tbuf);
+        }
+        fprintf(fplog, "\n");
+    }
+
+    /* PLUMED */
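+    /* The block below performs the one-time PLUMED setup for this run: query
+     * the API version, pass kbT and the restart flag, wire up the MPI
+     * communicators (including the inter-replica communicators used by GREX
+     * in multi-simulation runs), and hand over the atom count, engine name,
+     * log file and timestep before issuing "init". plumed_cmd() passes raw
+     * pointers, so the pointed-to values must stay alive for each call. */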
+    if(plumedswitch){
+      /* detect plumed API version */
+      int pversion=0;
+      plumed_cmd(plumedmain,"getApiVersion",&pversion);
+      /* setting kbT is only implemented with api > 1 */
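+      /* kbT is taken from the reference temperature of the first
+         temperature-coupling group, in GROMACS energy units (kJ/mol) */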
+      real kbT=ir->opts.ref_t[0]*BOLTZ;
+      if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT);
+      if(pversion>2){
+        int res=1;
+        if(continuationOptions.startedFromCheckpoint) plumed_cmd(plumedmain,"setRestart",&res);
+      }
+
+      if(ms && ms->nsim>1) {
+        if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&ms->mpi_comm_masters);
+        if(PAR(cr)){
+          if(DOMAINDECOMP(cr)) {
+            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all);
+          }else{
+            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim);
+          }
+        }
+        plumed_cmd(plumedmain,"GREX init",NULL);
+      }
+      if(PAR(cr)){
+        if(DOMAINDECOMP(cr)) {
+          plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
+        }
+      }
+      plumed_cmd(plumedmain,"setNatoms",&top_global->natoms);
+      plumed_cmd(plumedmain,"setMDEngine","gromacs");
+      plumed_cmd(plumedmain,"setLog",fplog);
+      real real_delta_t=ir->delta_t;
+      plumed_cmd(plumedmain,"setTimestep",&real_delta_t);
+      plumed_cmd(plumedmain,"init",NULL);
+
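+      /* With domain decomposition, PLUMED also needs the number of atoms that
+       * are local to this rank and their global indices; these are refreshed
+       * after every repartitioning in the MD loop below. */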
+      if(PAR(cr)){
+        if(DOMAINDECOMP(cr)) {
+          int nat_home = dd_numHomeAtoms(*cr->dd);
+          plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
+          plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
+        }
+      }
+    }
+    /* END PLUMED */
+
+    walltime_accounting_start_time(walltime_accounting);
+    wallcycle_start(wcycle, ewcRUN);
+    print_start(fplog, cr, walltime_accounting, "mdrun");
+
+#if GMX_FAHCORE
+    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
+    int chkpt_ret = fcCheckPointParallel( cr->nodeid,
+                                          NULL, 0);
+    if (chkpt_ret == 0)
+    {
+        gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
+    }
+#endif
+
+    /***********************************************************
+     *
+     *             Loop over MD steps
+     *
+     ************************************************************/
+
+    bFirstStep       = TRUE;
+    /* Skip the first Nose-Hoover integration when we get the state from tpx */
+    bInitStep        = !startingFromCheckpoint || EI_VV(ir->eI);
+    bSumEkinhOld     = FALSE;
+    bExchanged       = FALSE;
+    bNeedRepartition = FALSE;
+
+    bool simulationsShareState = false;
+    int  nstSignalComm         = nstglobalcomm;
+    {
+        // TODO This implementation of ensemble orientation restraints is nasty because
+        // a user can't just do multi-sim with single-sim orientation restraints.
+        bool usingEnsembleRestraints = (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
+        bool awhUsesMultiSim         = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
+
+        // Replica exchange, ensemble restraints and AWH need all
+        // simulations to remain synchronized, so they need
+        // checkpoints and stop conditions to act on the same step, so
+        // the propagation of such signals must take place between
+        // simulations, not just within simulations.
+        // TODO: Make algorithm initializers set these flags.
+        simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim || (plumedswitch && ms); // PLUMED hack: with multiple simulations and PLUMED we usually want the replicas to stay in sync
+
+        if (simulationsShareState)
+        {
+            // Inter-simulation signal communication does not need to happen
+            // often, so we use a minimum of 200 steps to reduce overhead.
+            const int c_minimumInterSimulationSignallingInterval = 200;
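+            // Round the minimum interval up to the nearest multiple of
+            // nstglobalcomm, so that signalling always coincides with a
+            // global communication step.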
+            nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1)/nstglobalcomm)*nstglobalcomm;
+        }
+    }
+
+    auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
+                compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
+                MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
+                mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
+
+    auto checkpointHandler = compat::make_unique<CheckpointHandler>(
+                compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
+                simulationsShareState, ir->nstlist == 0, MASTER(cr),
+                mdrunOptions.writeConfout, mdrunOptions.checkpointOptions.period);
+
+    const bool resetCountersIsLocal = true;
+    auto       resetHandler         = compat::make_unique<ResetHandler>(
+                compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]), !resetCountersIsLocal,
+                ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
+                mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
+
+    DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion   = (DOMAINDECOMP(cr) ? DdOpenBalanceRegionBeforeForceComputation::yes : DdOpenBalanceRegionBeforeForceComputation::no);
+    DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion  = (DOMAINDECOMP(cr) ? DdCloseBalanceRegionAfterForceComputation::yes : DdCloseBalanceRegionAfterForceComputation::no);
+
+    step     = ir->init_step;
+    step_rel = 0;
+
+    // TODO extract this to new multi-simulation module
+    if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
+    {
+        if (!multisim_int_all_are_equal(ms, ir->nsteps))
+        {
+            GMX_LOG(mdlog.warning).appendText(
+                    "Note: The number of steps is not consistent across multi simulations,\n"
+                    "but we are proceeding anyway!");
+        }
+        if (!multisim_int_all_are_equal(ms, ir->init_step))
+        {
+            GMX_LOG(mdlog.warning).appendText(
+                    "Note: The initial step is not consistent across multi simulations,\n"
+                    "but we are proceeding anyway!");
+        }
+    }
+
+    /* and stop now if we should */
+    bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
+    while (!bLastStep)
+    {
+
+        /* Determine if this is a neighbor search step */
+        bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+
+        if (bPMETune && bNStList)
+        {
+            /* PME grid + cut-off optimization with GPUs or PME nodes */
+            pme_loadbal_do(pme_loadbal, cr,
+                           (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
+                           fplog, mdlog,
+                           *ir, fr, *state,
+                           wcycle,
+                           step, step_rel,
+                           &bPMETunePrinting);
+        }
+
+        wallcycle_start(wcycle, ewcSTEP);
+
+        bLastStep = (step_rel == ir->nsteps);
+        t         = t0 + step*ir->delta_t;
+
+        // TODO Refactor this, so that nstfep does not need a default value of zero
+        if (ir->efep != efepNO || ir->bSimTemp)
+        {
+            /* find and set the current lambdas */
+            setCurrentLambdasLocal(step, ir->fepvals, lam0, state);
+
+            bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
+            bDoFEP       = ((ir->efep != efepNO) && do_per_step(step, nstfep));
+            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
+                            && (ir->bExpanded) && (step > 0) && (!startingFromCheckpoint));
+        }
+
+        bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep &&
+                     do_per_step(step, replExParams.exchangeInterval));
+
+        if (bSimAnn)
+        {
+            update_annealing_target_temp(ir, t, upd);
+        }
+
+        /* Stop Center of Mass motion */
+        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
+
+        /* Determine whether or not to do Neighbour Searching */
+        bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
+
+        bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
+
+        /* do_log triggers energy and virial calculation. Because this leads
+         * to different code paths, forces can be different. Thus for exact
+         * continuation we should avoid extra log output.
+         * Note that the || bLastStep can result in non-exact continuation
+         * beyond the last step. But we don't consider that to be an issue.
+         */
+        do_log     = do_per_step(step, ir->nstlog) || (bFirstStep && !startingFromCheckpoint) || bLastStep;
+        do_verbose = mdrunOptions.verbose &&
+            (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
+
+        if (bNS && !(bFirstStep && ir->bContinuation))
+        {
+            bMasterState = FALSE;
+            /* Correct the new box if it is too skewed */
+            if (inputrecDynamicBox(ir))
+            {
+                if (correct_box(fplog, step, state->box, graph))
+                {
+                    bMasterState = TRUE;
+                }
+            }
+            if (DOMAINDECOMP(cr) && bMasterState)
+            {
+                dd_collect_state(cr->dd, state, state_global);
+            }
+
+            if (DOMAINDECOMP(cr))
+            {
+                /* Repartition the domain decomposition */
+                dd_partition_system(fplog, mdlog, step, cr,
+                                    bMasterState, nstglobalcomm,
+                                    state_global, top_global, ir,
+                                    state, &f, mdAtoms, top, fr,
+                                    vsite, constr,
+                                    nrnb, wcycle,
+                                    do_verbose && !bPMETunePrinting);
+                shouldCheckNumberOfBondedInteractions = true;
+                update_realloc(upd, state->natoms);
+
+                /* PLUMED */
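+                /* After repartitioning, the set of home atoms on this rank
+                 * has changed, so refresh PLUMED's local atom count and
+                 * global-index map. */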
+                if(plumedswitch){
+                  int nat_home = dd_numHomeAtoms(*cr->dd);
+                  plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
+                  plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
+                }
+                /* END PLUMED */
+            }
+        }
+
+        if (MASTER(cr) && do_log)
+        {
+            print_ebin_header(fplog, step, t); /* can we improve the information printed here? */
+        }
+
+        if (ir->efep != efepNO)
+        {
+            update_mdatoms(mdatoms, state->lambda[efptMASS]);
+        }
+
+        if (bExchanged)
+        {
+
+            /* We need the kinetic energy at minus the half step for determining
+             * the full step kinetic energy and possibly for T-coupling.*/
+            /* This may not be quite working correctly yet . . . . */
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                            wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
+                            constr, &nullSignaller, state->box,
+                            &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                            CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
+            checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
+                                            top_global, top, state,
+                                            &shouldCheckNumberOfBondedInteractions);
+        }
+        clear_mat(force_vir);
+
+        /* PLUMED HREX */
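+        /* PLUMED's Hamiltonian replica exchange (HREX): on exchange attempts,
+         * each replica needs the potential energy of its partner's coordinates
+         * evaluated with its own Hamiltonian. The block below swaps the full
+         * state with the exchange partner, recomputes the energy into a
+         * scratch gmx_enerdata_t, caches it in PLUMED (GREX cacheLocalUSwap),
+         * then swaps the states back, repartitioning the domain decomposition
+         * around each swap. */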
+        gmx_bool bHREX = bDoReplEx && plumed_hrex;
+
+        if (plumedswitch && bHREX) {
+          gmx_enerdata_t *hrex_enerd;
+          snew(hrex_enerd, 1);
+          init_enerdata(top_global->groups.grps[egcENER].nr,ir->fepvals->n_lambda,hrex_enerd);
+          int repl  = -1;
+          int nrepl = -1;
+          if (MASTER(cr)){
+            repl  = replica_exchange_get_repl(repl_ex);
+            nrepl = replica_exchange_get_nrepl(repl_ex);
+          }
+
+          if (DOMAINDECOMP(cr)) {
+            dd_collect_state(cr->dd,state,state_global);
+          } else {
+            copy_state_serial(state, state_global);
+          }
+
+          if(MASTER(cr)){
+            if(repl%2==step/replExParams.exchangeInterval%2){
+              if(repl-1>=0) exchange_state(ms,repl-1,state_global);
+            }else{
+              if(repl+1<nrepl) exchange_state(ms,repl+1,state_global);
+            }
+          }
+          if (!DOMAINDECOMP(cr)) {
+            copy_state_serial(state_global, state);
+          }
+          if(PAR(cr)){
+            if (DOMAINDECOMP(cr)) {
+              dd_partition_system(fplog,mdlog,step,cr,TRUE,1,
+                                  state_global,top_global,ir,
+                                  state,&f,mdAtoms,top,fr,vsite,constr,
+                                  nrnb,wcycle,FALSE);
+            }
+          }
+          do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation,
+                   step, nrnb, wcycle, top, groups,
+                   state->box, state->x.arrayRefWithPadding(), &state->hist,
+                   f.arrayRefWithPadding(), force_vir, mdatoms, hrex_enerd, fcd,
+                   state->lambda, graph,
+                   fr, ppForceWorkload, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
+                   GMX_FORCE_STATECHANGED |
+                   GMX_FORCE_DYNAMICBOX |
+                   GMX_FORCE_ALLFORCES |
+                   GMX_FORCE_VIRIAL |
+                   GMX_FORCE_ENERGY |
+                   GMX_FORCE_DHDL |
+                   GMX_FORCE_NS,
+                   ddOpenBalanceRegion, ddCloseBalanceRegion);
+
+          plumed_cmd(plumedmain,"GREX cacheLocalUSwap",&hrex_enerd->term[F_EPOT]);
+          sfree(hrex_enerd);
+
+          /* exchange back */
+          if (DOMAINDECOMP(cr)) {
+            dd_collect_state(cr->dd,state,state_global);
+          } else {
+            copy_state_serial(state, state_global);
+          }
+
+          if(MASTER(cr)){
+            if(repl%2==step/replExParams.exchangeInterval%2){
+              if(repl-1>=0) exchange_state(ms,repl-1,state_global);
+            }else{
+              if(repl+1<nrepl) exchange_state(ms,repl+1,state_global);
+            }
+          }
+
+          if (!DOMAINDECOMP(cr)) {
+            copy_state_serial(state_global, state);
+          }
+          if(PAR(cr)){
+            if (DOMAINDECOMP(cr)) {
+              dd_partition_system(fplog,mdlog,step,cr,TRUE,1,
+                                  state_global,top_global,ir,
+                                  state,&f,mdAtoms,top,fr,vsite,constr,
+                                  nrnb,wcycle,FALSE);
+              int nat_home = dd_numHomeAtoms(*cr->dd);
+              plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
+              plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
+            }
+          }
+        }
+        /* END PLUMED HREX */
+
+        checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
+
+        /* Determine the energy and pressure:
+         * at nstcalcenergy steps and at energy output steps (set below).
+         */
+        if (EI_VV(ir->eI) && (!bInitStep))
+        {
+            bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
+            bCalcVir      = bCalcEnerStep ||
+                (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+        }
+        else
+        {
+            bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
+            bCalcVir      = bCalcEnerStep ||
+                (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+        }
+        bCalcEner = bCalcEnerStep;
+
+        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
+
+        if (do_ene || do_log || bDoReplEx)
+        {
+            bCalcVir  = TRUE;
+            bCalcEner = TRUE;
+        }
+
+        /* Do we need global communication ? */
+        bGStat = (bCalcVir || bCalcEner || bStopCM ||
+                  do_per_step(step, nstglobalcomm) ||
+                  (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm)));
+
+        force_flags = (GMX_FORCE_STATECHANGED |
+                       ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0) |
+                       GMX_FORCE_ALLFORCES |
+                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+                       (bDoFEP ? GMX_FORCE_DHDL : 0)
+                       );
+
+        if (shellfc)
+        {
+            /* Now is the time to relax the shells */
+            relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose,
+                                enforcedRotation, step,
+                                ir, bNS, force_flags, top,
+                                constr, enerd, fcd,
+                                state, f.arrayRefWithPadding(), force_vir, mdatoms,
+                                nrnb, wcycle, graph, groups,
+                                shellfc, fr, ppForceWorkload, t, mu_tot,
+                                vsite,
+                                ddOpenBalanceRegion, ddCloseBalanceRegion);
+        }
+        else
+        {
+            /* The AWH history needs to be saved _before_ doing force calculations where the AWH bias is updated
+               (or the AWH update will be performed twice for one step when continuing). It would be best to
+               call this update function from do_md_trajectory_writing, but that would occur after do_force.
+               One would have to divide the update_awh function into one function applying the AWH force
+               and one doing the AWH bias update. The update AWH bias function could then be called after
+               do_md_trajectory_writing (then containing update_awh_history).
+               The checkpointing will in the future probably be moved to the start of the md loop,
+               which will get rid of this issue. */
+            if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
+            {
+                awh->updateHistory(state_global->awhHistory.get());
+            }
+
+            /* The coordinates (x) are shifted (to get whole molecules)
+             * in do_force.
+             * This is parallelized as well, and does communication too.
+             * Check the comments in sim_util.c
+             */
+
+            /* PLUMED */
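+            /* Per-step PLUMED hand-off: pass the step number, positions,
+             * masses, charges and box, let PLUMED prepare its calculation,
+             * register the stop flag and the force/virial buffers, and ask
+             * whether PLUMED needs the potential energy this step (if so,
+             * energy and virial computation is forced on via force_flags). */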
+            plumedNeedsEnergy=0;
+            if(plumedswitch){
+              int pversion=0;
+              plumed_cmd(plumedmain,"getApiVersion",&pversion);
+              long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep);
+              plumed_cmd(plumedmain,"setPositions",&state->x[0][0]);
+              plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[0]);
+              plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[0]);
+              plumed_cmd(plumedmain,"setBox",&state->box[0][0]);
+              plumed_cmd(plumedmain,"prepareCalc",NULL);
+              plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop);
+              int checkp=0; if(checkpointHandler->isCheckpointingStep()) checkp=1;
+              if(pversion>3) plumed_cmd(plumedmain,"doCheckPoint",&checkp);
+              plumed_cmd(plumedmain,"setForces",&f[0][0]);
+              plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
+              if(plumedNeedsEnergy) force_flags |= GMX_FORCE_ENERGY | GMX_FORCE_VIRIAL;
+              clear_mat(plumed_vir);
+              plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]);
+            }
+            /* END PLUMED */
+            do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation,
+                     step, nrnb, wcycle, top, groups,
+                     state->box, state->x.arrayRefWithPadding(), &state->hist,
+                     f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd,
+                     state->lambda, graph,
+                     fr, ppForceWorkload, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
+                     (bNS ? GMX_FORCE_NS : 0) | force_flags,
+                     ddOpenBalanceRegion, ddCloseBalanceRegion);
+            /* PLUMED */
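+            /* Fold PLUMED's virial contribution into force_vir. GROMACS stores
+             * the virial with a factor 1/2 relative to the plain -sum r (x) F
+             * form, hence the 0.5 scaling of plumed_vir; in the energy branch,
+             * temporarily doubling force_vir into the registered plumed_vir
+             * buffer and halving afterwards is just a way of adding half of
+             * PLUMED's virial in place. When PLUMED needs the energy, its
+             * calculation is deferred to performCalc here, after do_force has
+             * produced enerd->term[F_EPOT]. */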
+            if(plumedswitch){
+              if(plumedNeedsEnergy){
+                msmul(force_vir,2.0,plumed_vir);
+                plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]);
+                plumed_cmd(plumedmain,"performCalc",NULL);
+                msmul(plumed_vir,0.5,force_vir);
+              } else {
+                msmul(plumed_vir,0.5,plumed_vir);
+                m_add(force_vir,plumed_vir,force_vir);
+              }
+              if(bDoReplEx) plumed_cmd(plumedmain,"GREX savePositions",NULL);
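+              /* If PLUMED raised the stop flag registered via setStopFlag above,
+                 shorten the run so that the MD loop exits cleanly after this step */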
+              if(plumedWantsToStop) ir->nsteps=step_rel+1;
+              if(bHREX) plumed_cmd(plumedmain,"GREX cacheLocalUNow",&enerd->term[F_EPOT]);
+            }
+            /* END PLUMED */
+        }
+
+        if (EI_VV(ir->eI) && !startingFromCheckpoint)
+        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
+        {
+            rvec *vbuf = nullptr;
+
+            wallcycle_start(wcycle, ewcUPDATE);
+            if (ir->eI == eiVV && bInitStep)
+            {
+                /* if using velocity verlet with full time step Ekin,
+                 * take the first half step only to compute the
+                 * virial for the first step. From there,
+                 * revert back to the initial coordinates
+                 * so that the input is actually the initial step.
+                 */
+                snew(vbuf, state->natoms);
+                copy_rvecn(state->v.rvec_array(), vbuf, 0, state->natoms); /* should make this better for parallelizing? */
+            }
+            else
+            {
+                /* this is for NHC in the Ekin(t+dt/2) version of vv */
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
+            }
+
+            update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+                          ekind, M, upd, etrtVELOCITY1,
+                          cr, constr);
+
+            wallcycle_stop(wcycle, ewcUPDATE);
+            constrain_velocities(step, nullptr,
+                                 state,
+                                 shake_vir,
+                                 constr,
+                                 bCalcVir, do_log, do_ene);
+            wallcycle_start(wcycle, ewcUPDATE);
+            /* if VV, compute the pressure and constraints */
+            /* For VV2, we strictly only need this if using pressure
+             * control, but we really would like to have accurate pressures
+             * printed out.
+             * Think about ways around this in the future?
+             * For now, keep this choice in comments.
+             */
+            /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
+            /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
+            bPres = TRUE;
+            bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+            if (bCalcEner && ir->eI == eiVVAK)
+            {
+                bSumEkinhOld = TRUE;
+            }
+            /* for vv, the first half of the integration actually corresponds to the previous step.
+               So we need information from the last step in the first half of the integration */
+            if (bGStat || do_per_step(step-1, nstglobalcomm))
+            {
+                wallcycle_stop(wcycle, ewcUPDATE);
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, &nullSignaller, state->box,
+                                &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                                (bGStat ? CGLO_GSTAT : 0)
+                                | (bCalcEner ? CGLO_ENERGY : 0)
+                                | (bTemp ? CGLO_TEMPERATURE : 0)
+                                | (bPres ? CGLO_PRESSURE : 0)
+                                | (bPres ? CGLO_CONSTRAINT : 0)
+                                | (bStopCM ? CGLO_STOPCM : 0)
+                                | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
+                                | CGLO_SCALEEKIN
+                                );
+                /* explanation of above:
+                   a) We compute Ekin at the full time step
+                   if 1) we are using the AveVel Ekin, and it's not the
+                   initial step, or 2) if we are using AveEkin, but need the full
+                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+                   EkinAveVel because it's needed for the pressure */
+                checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
+                                                top_global, top, state,
+                                                &shouldCheckNumberOfBondedInteractions);
+                wallcycle_start(wcycle, ewcUPDATE);
+            }
+            /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+            if (!bInitStep)
+            {
+                if (bTrotter)
+                {
+                    m_add(force_vir, shake_vir, total_vir);     /* we need the un-dispersion corrected total vir here */
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
+
+                    /* TODO This is only needed when we're about to write
+                     * a checkpoint, because we use it after the restart
+                     * (in a kludge?). But what should we be doing if
+                     * startingFromCheckpoint or bInitStep are true? */
+                    if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
+                    {
+                        copy_mat(shake_vir, state->svir_prev);
+                        copy_mat(force_vir, state->fvir_prev);
+                    }
+                    if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
+                    {
+                        /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+                        enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
+                        enerd->term[F_EKIN] = trace(ekind->ekin);
+                    }
+                }
+                else if (bExchanged)
+                {
+                    wallcycle_stop(wcycle, ewcUPDATE);
+                    /* We need the kinetic energy at minus the half step for determining
+                     * the full step kinetic energy and possibly for T-coupling.*/
+                    /* This may not be quite working correctly yet . . . . */
+                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                    wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
+                                    constr, &nullSignaller, state->box,
+                                    nullptr, &bSumEkinhOld,
+                                    CGLO_GSTAT | CGLO_TEMPERATURE);
+                    wallcycle_start(wcycle, ewcUPDATE);
+                }
+            }
+            /* if it's the initial step, we performed this first step just to get the constraint virial */
+            if (ir->eI == eiVV && bInitStep)
+            {
+                copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
+                sfree(vbuf);
+            }
+            wallcycle_stop(wcycle, ewcUPDATE);
+        }
+
+        /* compute the conserved quantity */
+        if (EI_VV(ir->eI))
+        {
+            saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
+            if (ir->eI == eiVV)
+            {
+                last_ekin = enerd->term[F_EKIN];
+            }
+            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+            {
+                saved_conserved_quantity -= enerd->term[F_DISPCORR];
+            }
+            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
+            if (ir->efep != efepNO)
+            {
+                sum_dhdl(enerd, state->lambda, ir->fepvals);
+            }
+        }
+
+        /* ########  END FIRST UPDATE STEP  ############## */
+        /* ########  If doing VV, we now have v(dt) ###### */
+        if (bDoExpanded)
+        {
+            /* perform extended ensemble sampling in lambda - we don't
+               actually move to the new state before outputting
+               statistics, but if performing simulated tempering, we
+               do update the velocities and the tau_t. */
+
+            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, state->dfhist, step, state->v.rvec_array(), mdatoms);
+            /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
+            if (MASTER(cr))
+            {
+                copy_df_history(state_global->dfhist, state->dfhist);
+            }
+        }
+
+        /* Now we have the energies and forces corresponding to the
+         * coordinates at time t. We must output all of this before
+         * the update.
+         */
+        do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
+                                 ir, state, state_global, observablesHistory,
+                                 top_global, fr,
+                                 outf, mdebin, ekind, f,
+                                 checkpointHandler->isCheckpointingStep(),
+                                 bRerunMD, bLastStep,
+                                 mdrunOptions.writeConfout,
+                                 bSumEkinhOld);
+        /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
+        bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, state->x.rvec_array(), ir, t, wcycle);
+
+        /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
+        if (startingFromCheckpoint && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
+        {
+            copy_mat(state->svir_prev, shake_vir);
+            copy_mat(state->fvir_prev, force_vir);
+        }
+
+        stopHandler->setSignal();
+        resetHandler->setSignal(walltime_accounting);
+
+        if (bGStat || !PAR(cr))
+        {
+            /* In parallel we only have to check for checkpointing in steps
+             * where we do global communication,
+             *  otherwise the other nodes don't know.
+             */
+            checkpointHandler->setSignal(walltime_accounting);
+        }
+
+        /* #########   START SECOND UPDATE STEP ################# */
+
+        /* At the start of the step, randomize or scale the velocities (if vv); the
+           restriction on Andersen coupling is controlled in preprocessing */
+
+        if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+        {
+            gmx_bool bIfRandomize;
+            bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, upd, constr);
+            /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+            if (constr && bIfRandomize)
+            {
+                constrain_velocities(step, nullptr,
+                                     state,
+                                     tmp_vir,
+                                     constr,
+                                     bCalcVir, do_log, do_ene);
+            }
+        }
+        /* Box is changed in update() when we do pressure coupling,
+         * but we should still use the old box for energy corrections and when
+         * writing it to the energy file, so it matches the trajectory files for
+         * the same timestep above. Make a copy in a separate array.
+         */
+        copy_mat(state->box, lastbox);
+
+        dvdl_constr = 0;
+
+        wallcycle_start(wcycle, ewcUPDATE);
+        /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+        if (bTrotter)
+        {
+            trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+            /* We can only do Berendsen coupling after we have summed
+             * the kinetic energy or virial. Since this happens
+             * in global_state after update, we should only do it at
+             * step % nstlist = 1 with bGStatEveryStep=FALSE.
+             */
+        }
+        else
+        {
+            update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
+            update_pcouple_before_coordinates(fplog, step, ir, state,
+                                              parrinellorahmanMu, M,
+                                              bInitStep);
+        }
+
+        if (EI_VV(ir->eI))
+        {
+            /* velocity half-step update */
+            update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+                          ekind, M, upd, etrtVELOCITY2,
+                          cr, constr);
+        }
+
+        /* Above, the initialization just copies ekinh into ekin;
+         * it doesn't copy the positions (for VV),
+         * nor the entire integrator state (for MD).
+         */
+
+        if (ir->eI == eiVVAK)
+        {
+            /* We probably only need md->homenr, not state->natoms */
+            if (state->natoms > cbuf_nalloc)
+            {
+                cbuf_nalloc = state->natoms;
+                srenew(cbuf, cbuf_nalloc);
+            }
+            copy_rvecn(as_rvec_array(state->x.data()), cbuf, 0, state->natoms);
+        }
+
+        update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+                      ekind, M, upd, etrtPOSITION, cr, constr);
+        wallcycle_stop(wcycle, ewcUPDATE);
+
+        constrain_coordinates(step, &dvdl_constr, state,
+                              shake_vir,
+                              upd, constr,
+                              bCalcVir, do_log, do_ene);
+        update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state,
+                              cr, nrnb, wcycle, upd, constr, do_log, do_ene);
+        finish_update(ir, mdatoms,
+                      state, graph,
+                      nrnb, wcycle, upd, constr);
+
+        if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
+        {
+            updatePrevStepPullCom(ir->pull_work, state);
+        }
+
+        if (ir->eI == eiVVAK)
+        {
+            /* erase F_EKIN and F_TEMP here? */
+            /* just compute the kinetic energy at the half step to perform a trotter step */
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                            wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                            constr, &nullSignaller, lastbox,
+                            nullptr, &bSumEkinhOld,
+                            (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE
+                            );
+            wallcycle_start(wcycle, ewcUPDATE);
+            trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+            /* now that we know the scaling, we can compute the positions again */
+            copy_rvecn(cbuf, as_rvec_array(state->x.data()), 0, state->natoms);
+
+            update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+                          ekind, M, upd, etrtPOSITION, cr, constr);
+            wallcycle_stop(wcycle, ewcUPDATE);
+
+            /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
+            /* are the small terms in the shake_vir here due
+             * to numerical errors, or are they important
+             * physically? I'm thinking they are just errors, but not completely sure.
+             * For now, will call without actually constraining, constr=NULL*/
+            finish_update(ir, mdatoms,
+                          state, graph,
+                          nrnb, wcycle, upd, nullptr);
+        }
+        if (EI_VV(ir->eI))
+        {
+            /* this factor of 2 correction is necessary
+               because half of the constraint force is removed
+               in the vv step, so we have to double it.  See
+               the Redmine issue #1255.  It is not yet clear
+               if the factor of 2 is exact, or just a very
+               good approximation, and this will be
+               investigated.  The next step is to see if this
+               can be done adding a dhdl contribution from the
+               rattle step, but this is somewhat more
+               complicated with the current code. Will be
+               investigated, hopefully for 4.6.3. However,
+               this current solution is much better than
+               having it completely wrong.
+             */
+            enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+        }
+        else
+        {
+            enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+        }
+
+        if (vsite != nullptr)
+        {
+            wallcycle_start(wcycle, ewcVSITECONSTR);
+            if (graph != nullptr)
+            {
+                shift_self(graph, state->box, state->x.rvec_array());
+            }
+            construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(),
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
+
+            if (graph != nullptr)
+            {
+                unshift_self(graph, state->box, state->x.rvec_array());
+            }
+            wallcycle_stop(wcycle, ewcVSITECONSTR);
+        }
+
+        /* ############## IF NOT VV, Calculate globals HERE  ############ */
+        /* With Leap-Frog we can skip compute_globals at
+         * non-communication steps, but we need to calculate
+         * the kinetic energy one step before communication.
+         */
+        {
+            // Organize to do inter-simulation signalling on steps if
+            // and when algorithms require it.
+            bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
+
+            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal)
+            {
+                // Since we're already communicating at this step, we
+                // can propagate intra-simulation signals. Note that
+                // check_nstglobalcomm is responsible for choosing a
+                // value of nstglobalcomm that makes bGStat true on
+                // such steps, so we can't get into a situation where
+                // e.g. checkpointing can't be signalled.
+                bool                doIntraSimSignal = true;
+                SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
+
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, &signaller,
+                                lastbox,
+                                &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                                (bGStat ? CGLO_GSTAT : 0)
+                                | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
+                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+                                | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
+                                | CGLO_CONSTRAINT
+                                | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
+                                );
+                checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
+                                                top_global, top, state,
+                                                &shouldCheckNumberOfBondedInteractions);
+            }
+        }
+
+        /* #############  END CALC EKIN AND PRESSURE ################# */
+
+        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+           the virial that should probably be addressed eventually. state->veta has better properties,
+           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+
+        if (ir->efep != efepNO && !EI_VV(ir->eI))
+        {
+            /* Sum up the foreign energy and dhdl terms for md and sd.
+               Currently done every step so that dhdl is correct in the .edr */
+            sum_dhdl(enerd, state->lambda, ir->fepvals);
+        }
+
+        update_pcouple_after_coordinates(fplog, step, ir, mdatoms,
+                                         pres, force_vir, shake_vir,
+                                         parrinellorahmanMu,
+                                         state, nrnb, upd);
+
+        /* ################# END UPDATE STEP 2 ################# */
+        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
+
+        /* The coordinates (x) were unshifted in update */
+        if (!bGStat)
+        {
+            /* We will not sum ekinh_old,
+             * so signal that we still have to do it.
+             */
+            bSumEkinhOld = TRUE;
+        }
+
+        if (bCalcEner)
+        {
+            /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
+
+            /* use the directly determined last velocity, not actually the averaged half steps */
+            if (bTrotter && ir->eI == eiVV)
+            {
+                enerd->term[F_EKIN] = last_ekin;
+            }
+            enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+            if (integratorHasConservedEnergyQuantity(ir))
+            {
+                if (EI_VV(ir->eI))
+                {
+                    enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+                }
+                else
+                {
+                    enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
+                }
+            }
+            /* #########  END PREPARING EDR OUTPUT  ###########  */
+        }
+
+        /* Output stuff */
+        if (MASTER(cr))
+        {
+            if (fplog && do_log && bDoExpanded)
+            {
+                /* only needed if doing expanded ensemble */
+                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : nullptr,
+                                          state_global->dfhist, state->fep_state, ir->nstlog, step);
+            }
+            if (bCalcEner)
+            {
+                upd_mdebin(mdebin, bDoDHDL, bCalcEnerStep,
+                           t, mdatoms->tmass, enerd, state,
+                           ir->fepvals, ir->expandedvals, lastbox,
+                           shake_vir, force_vir, total_vir, pres,
+                           ekind, mu_tot, constr);
+            }
+            else
+            {
+                upd_mdebin_step(mdebin);
+            }
+
+            gmx_bool do_dr  = do_per_step(step, ir->nstdisreout);
+            gmx_bool do_or  = do_per_step(step, ir->nstorireout);
+
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : nullptr,
+                       step, t,
+                       eprNORMAL, mdebin, fcd, groups, &(ir->opts), awh.get());
+
+            if (ir->bPull)
+            {
+                pull_print_output(ir->pull_work, step, t);
+            }
+
+            if (do_per_step(step, ir->nstlog))
+            {
+                if (fflush(fplog) != 0)
+                {
+                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+                }
+            }
+        }
+        if (bDoExpanded)
+        {
+            /* Have to do this part _after_ outputting the logfile and the edr file */
+            /* Gets written into the state at the beginning of the next loop */
+            state->fep_state = lamnew;
+        }
+        /* Print the remaining wall clock time for the run */
+        if (isMasterSimMasterRank(ms, cr) &&
+            (do_verbose || gmx_got_usr_signal()) &&
+            !bPMETunePrinting)
+        {
+            if (shellfc)
+            {
+                fprintf(stderr, "\n");
+            }
+            print_time(stderr, walltime_accounting, step, ir, cr);
+        }
+
+        /* Ion/water position swapping.
+         * Not done in last step since trajectory writing happens before this call
+         * in the MD loop and exchanges would be lost anyway. */
+        bNeedRepartition = FALSE;
+        if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep &&
+            do_per_step(step, ir->swap->nstswap))
+        {
+            bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle,
+                                             as_rvec_array(state->x.data()),
+                                             state->box,
+                                             MASTER(cr) && mdrunOptions.verbose,
+                                             bRerunMD);
+
+            if (bNeedRepartition && DOMAINDECOMP(cr))
+            {
+                dd_collect_state(cr->dd, state, state_global);
+            }
+        }
+
+        /* Replica exchange */
+        bExchanged = FALSE;
+        if (bDoReplEx)
+        {
+            bExchanged = replica_exchange(fplog, cr, ms, repl_ex,
+                                          state_global, enerd,
+                                          state, step, t);
+        }
+
+        if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) )
+        {
+            dd_partition_system(fplog, mdlog, step, cr, TRUE, 1,
+                                state_global, top_global, ir,
+                                state, &f, mdAtoms, top, fr,
+                                vsite, constr,
+                                nrnb, wcycle, FALSE);
+            shouldCheckNumberOfBondedInteractions = true;
+            update_realloc(upd, state->natoms);
+        }
+
+        bFirstStep             = FALSE;
+        bInitStep              = FALSE;
+        startingFromCheckpoint = false;
+
+        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+        /* With all integrators, except VV, we need to retain the pressure
+         * at the current step for coupling at the next step.
+         */
+        if ((state->flags & (1<<estPRES_PREV)) &&
+            (bGStatEveryStep ||
+             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+        {
+            /* Store the pressure in t_state for pressure coupling
+             * at the next MD step.
+             */
+            copy_mat(pres, state->pres_prev);
+        }
+
+        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+
+        if ( (membed != nullptr) && (!bLastStep) )
+        {
+            rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
+        }
+
+        cycles = wallcycle_stop(wcycle, ewcSTEP);
+        if (DOMAINDECOMP(cr) && wcycle)
+        {
+            dd_cycles_add(cr->dd, cycles, ddCyclStep);
+        }
+
+        /* increase the MD step number */
+        step++;
+        step_rel++;
+
+        resetHandler->resetCounters(
+                step, step_rel, mdlog, fplog, cr, (use_GPU(fr->nbv) ? fr->nbv : nullptr),
+                nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
+
+        /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to the VMD client */
+        IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle);
+
+    }
+    /* End of main MD loop */
+
+    /* Closing TNG files can include compressing data. Therefore it is good to do that
+     * before stopping the time measurements. */
+    mdoutf_tng_close(outf);
+
+    /* Stop measuring walltime */
+    walltime_accounting_end_time(walltime_accounting);
+
+    if (!thisRankHasDuty(cr, DUTY_PME))
+    {
+        /* Tell the PME-only node to finish */
+        gmx_pme_send_finish(cr);
+    }
+
+    if (MASTER(cr))
+    {
+        if (ir->nstcalcenergy > 0)
+        {
+            print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t,
+                       eprAVER, mdebin, fcd, groups, &(ir->opts), awh.get());
+        }
+    }
+    done_mdebin(mdebin);
+    done_mdoutf(outf);
+
+    if (bPMETune)
+    {
+        pme_loadbal_done(pme_loadbal, fplog, mdlog, use_GPU(fr->nbv));
+    }
+
+    done_shellfc(fplog, shellfc, step_rel);
+
+    if (useReplicaExchange && MASTER(cr))
+    {
+        print_replica_exchange_statistics(fplog, repl_ex);
+    }
+
+    // Clean up swapcoords
+    if (ir->eSwapCoords != eswapNO)
+    {
+        finish_swapcoords(ir->swap);
+    }
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(ir->bIMD, ir->imd);
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
+
+    destroy_enerdata(enerd);
+    sfree(enerd);
+    sfree(top);
+}
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/md.cpp.preplumed b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/md.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..a49a15df53a4246970f4f4a00e542f1c9a7cedbe
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/md.cpp.preplumed
@@ -0,0 +1,1531 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the integrator for normal molecular dynamics simulations
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+
+#include <algorithm>
+#include <memory>
+
+#include "gromacs/awh/awh.h"
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/compat/make_unique.h"
+#include "gromacs/domdec/collect.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_network.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/partition.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-load-balancing.h"
+#include "gromacs/fileio/trxio.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdlib/checkpointhandler.h"
+#include "gromacs/mdlib/compute_io.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/ebin.h"
+#include "gromacs/mdlib/expanded.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdebin.h"
+#include "gromacs/mdlib/mdoutf.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/mdsetup.h"
+#include "gromacs/mdlib/membed.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/resethandler.h"
+#include "gromacs/mdlib/shellfc.h"
+#include "gromacs/mdlib/sighandler.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/simulationsignal.h"
+#include "gromacs/mdlib/stophandler.h"
+#include "gromacs/mdlib/tgroup.h"
+#include "gromacs/mdlib/trajectory_writing.h"
+#include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vcm.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdtypes/awh-history.h"
+#include "gromacs/mdtypes/awh-params.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/df_history.h"
+#include "gromacs/mdtypes/energyhistory.h"
+#include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/forcerec.h"
+#include "gromacs/mdtypes/group.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/mdtypes/observableshistory.h"
+#include "gromacs/mdtypes/pullhistory.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/output.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/atoms.h"
+#include "gromacs/topology/idef.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/logger.h"
+#include "gromacs/utility/real.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "integrator.h"
+#include "replicaexchange.h"
+
+#if GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+using gmx::SimulationSignaller;
+
+void gmx::Integrator::do_md()
+{
+    // TODO Historically, the EM and MD "integrators" used different
+    // names for the t_inputrec *parameter, but these must have the
+    // same name, now that it's a member of a struct. We use this ir
+    // alias to avoid a large ripple of nearly useless changes.
+    // t_inputrec is being replaced by IMdpOptionsProvider, so this
+    // will go away eventually.
+    t_inputrec             *ir   = inputrec;
+    gmx_mdoutf             *outf = nullptr;
+    int64_t                 step, step_rel;
+    double                  t, t0, lam0[efptNR];
+    gmx_bool                bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
+    gmx_bool                bNS, bNStList, bSimAnn, bStopCM,
+                            bFirstStep, bInitStep, bLastStep = FALSE;
+    gmx_bool                bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
+    gmx_bool                do_ene, do_log, do_verbose;
+    gmx_bool                bMasterState;
+    int                     force_flags, cglo_flags;
+    tensor                  force_vir, shake_vir, total_vir, tmp_vir, pres;
+    int                     i, m;
+    rvec                    mu_tot;
+    t_vcm                  *vcm;
+    matrix                  parrinellorahmanMu, M;
+    gmx_repl_ex_t           repl_ex = nullptr;
+    gmx_localtop_t         *top;
+    t_mdebin               *mdebin   = nullptr;
+    gmx_enerdata_t         *enerd;
+    PaddedVector<gmx::RVec> f {};
+    gmx_global_stat_t       gstat;
+    gmx_update_t           *upd   = nullptr;
+    t_graph                *graph = nullptr;
+    gmx_groups_t           *groups;
+    gmx_ekindata_t         *ekind;
+    gmx_shellfc_t          *shellfc;
+    gmx_bool                bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
+    gmx_bool                bTemp, bPres, bTrotter;
+    real                    dvdl_constr;
+    rvec                   *cbuf        = nullptr;
+    int                     cbuf_nalloc = 0;
+    matrix                  lastbox;
+    int                     lamnew  = 0;
+    /* for FEP */
+    int                     nstfep = 0;
+    double                  cycles;
+    real                    saved_conserved_quantity = 0;
+    real                    last_ekin                = 0;
+    t_extmass               MassQ;
+    int                   **trotter_seq;
+    char                    sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
+
+    /* PME load balancing data for GPU kernels */
+    pme_load_balancing_t *pme_loadbal      = nullptr;
+    gmx_bool              bPMETune         = FALSE;
+    gmx_bool              bPMETunePrinting = FALSE;
+
+    /* Interactive MD */
+    gmx_bool          bIMDstep = FALSE;
+
+    /* Domain decomposition could incorrectly miss a bonded
+       interaction, but checking for that requires a global
+       communication stage, which does not otherwise happen in DD
+       code. So we do that alongside the first global energy reduction
+       after a new DD is made. These variables handle whether the
+       check happens, and the result it returns. */
+    bool              shouldCheckNumberOfBondedInteractions = false;
+    int               totalNumberOfBondedInteractions       = -1;
+
+    SimulationSignals signals;
+    // Most global communication stages don't propagate mdrun
+    // signals, and will use this object to achieve that.
+    SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
+
+    if (!mdrunOptions.writeConfout)
+    {
+        // This is on by default, and the main known use case for
+        // turning it off is for convenience in benchmarking, which is
+        // something that should not show up in the general user
+        // interface.
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -noconfout functionality is deprecated, and may be removed in a future version.");
+    }
+
+    /* md-vv uses averaged full step velocities for T-control
+       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+    bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
+
+    const bool bRerunMD      = false;
+    int        nstglobalcomm = mdrunOptions.globalCommunicationInterval;
+
+    nstglobalcomm   = check_nstglobalcomm(mdlog, nstglobalcomm, ir, cr);
+    bGStatEveryStep = (nstglobalcomm == 1);
+
+    groups = &top_global->groups;
+
+    std::unique_ptr<EssentialDynamics> ed = nullptr;
+    if (opt2bSet("-ei", nfile, fnm) || observablesHistory->edsamHistory != nullptr)
+    {
+        /* Initialize essential dynamics sampling */
+        ed = init_edsam(mdlog,
+                        opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm),
+                        top_global,
+                        ir, cr, constr,
+                        state_global, observablesHistory,
+                        oenv, mdrunOptions.continuationOptions.appendFiles);
+    }
+
+    /* Initial values */
+    init_md(fplog, cr, outputProvider, ir, oenv, mdrunOptions,
+            &t, &t0, state_global, lam0,
+            nrnb, top_global, &upd, deform,
+            nfile, fnm, &outf, &mdebin,
+            force_vir, shake_vir, total_vir, pres, mu_tot, &bSimAnn, &vcm, wcycle);
+
+    /* Energy terms and groups */
+    snew(enerd, 1);
+    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+                  enerd);
+
+    /* Kinetic energy data */
+    snew(ekind, 1);
+    init_ekindata(fplog, top_global, &(ir->opts), ekind);
+    /* Copy the cos acceleration to the groups struct */
+    ekind->cosacc.cos_accel = ir->cos_accel;
+
+    gstat = global_stat_init(ir);
+
+    /* Check for polarizable models and flexible constraints */
+    shellfc = init_shell_flexcon(fplog,
+                                 top_global, constr ? constr->numFlexibleConstraints() : 0,
+                                 ir->nstcalcenergy, DOMAINDECOMP(cr));
+
+    {
+        double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+        if ((io > 2000) && MASTER(cr))
+        {
+            fprintf(stderr,
+                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+                    io);
+        }
+    }
+
+    /* Set up interactive MD (IMD) */
+    init_IMD(ir, cr, ms, top_global, fplog, ir->nstcalcenergy,
+             MASTER(cr) ? state_global->x.rvec_array() : nullptr,
+             nfile, fnm, oenv, mdrunOptions);
+
+    // Local state only becomes valid now.
+    std::unique_ptr<t_state> stateInstance;
+    t_state *                state;
+
+    if (DOMAINDECOMP(cr))
+    {
+        top = dd_init_local_top(top_global);
+
+        stateInstance = compat::make_unique<t_state>();
+        state         = stateInstance.get();
+        dd_init_local_state(cr->dd, state_global, state);
+
+        /* Distribute the charge groups over the nodes from the master node */
+        dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1,
+                            state_global, top_global, ir,
+                            state, &f, mdAtoms, top, fr,
+                            vsite, constr,
+                            nrnb, nullptr, FALSE);
+        shouldCheckNumberOfBondedInteractions = true;
+        update_realloc(upd, state->natoms);
+    }
+    else
+    {
+        state_change_natoms(state_global, state_global->natoms);
+        f.resizeWithPadding(state_global->natoms);
+        /* Copy the pointer to the global state */
+        state = state_global;
+
+        snew(top, 1);
+        mdAlgorithmsSetupAtomData(cr, ir, top_global, top, fr,
+                                  &graph, mdAtoms, constr, vsite, shellfc);
+
+        update_realloc(upd, state->natoms);
+    }
+
+    auto mdatoms = mdAtoms->mdatoms();
+
+    // NOTE: The global state is no longer used at this point.
+    // But state_global is still used as temporary storage space for writing
+    // the global state to file and potentially for replica exchange.
+    // (Global topology should persist.)
+
+    update_mdatoms(mdatoms, state->lambda[efptMASS]);
+
+    const ContinuationOptions &continuationOptions    = mdrunOptions.continuationOptions;
+    bool                       startingFromCheckpoint = continuationOptions.startedFromCheckpoint;
+
+    if (ir->bExpanded)
+    {
+        /* Check nstexpanded here, because the grompp check was broken */
+        if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
+        {
+            gmx_fatal(FARGS, "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
+        }
+        init_expanded_ensemble(startingFromCheckpoint, ir, state->dfhist);
+    }
+
+    if (MASTER(cr))
+    {
+        if (startingFromCheckpoint)
+        {
+            /* Update mdebin with energy history if appending to output files */
+            if (continuationOptions.appendFiles)
+            {
+                /* If no history is available (because a checkpoint is from before
+                 * it was written) make a new one later, otherwise restore it.
+                 */
+                if (observablesHistory->energyHistory)
+                {
+                    restore_energyhistory_from_state(mdebin, observablesHistory->energyHistory.get());
+                }
+            }
+            else if (observablesHistory->energyHistory)
+            {
+                /* We might have read an energy history from checkpoint.
+                 * As we are not appending, we want to restart the statistics.
+                 * Free the allocated memory and reset the counts.
+                 */
+                observablesHistory->energyHistory = {};
+                /* We might have read a pull history from checkpoint.
+                 * We will still want to keep the statistics, so that the files
+                 * can be joined and still be meaningful.
+                 * This means that observablesHistory->pullHistory
+                 * should not be reset.
+                 */
+            }
+        }
+        if (!observablesHistory->energyHistory)
+        {
+            observablesHistory->energyHistory = compat::make_unique<energyhistory_t>();
+        }
+        if (!observablesHistory->pullHistory)
+        {
+            observablesHistory->pullHistory = compat::make_unique<PullHistory>();
+        }
+        /* Set the initial energy history in state by updating once */
+        update_energyhistory(observablesHistory->energyHistory.get(), mdebin);
+    }
+
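+    /* Set up the center of mass of the pull groups at the previous step,
+     * reading it from the checkpoint when restarting. */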
+    preparePrevStepPullCom(ir, mdatoms, state, state_global, cr, startingFromCheckpoint);
+
+    // TODO: Remove this by converting AWH into a ForceProvider
+    auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms, startingFromCheckpoint,
+                                shellfc != nullptr,
+                                opt2fn("-awh", nfile, fnm), ir->pull_work);
+
+    const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
+    if (useReplicaExchange && MASTER(cr))
+    {
+        repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir,
+                                        replExParams);
+    }
+    /* PME tuning is only supported in the Verlet scheme, with PME for
+     * Coulomb. It is not supported with only LJ PME. */
+    bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) &&
+                !mdrunOptions.reproducible && ir->cutoff_scheme != ecutsGROUP);
+    if (bPMETune)
+    {
+        pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box,
+                         *fr->ic, *fr->nbv->listParams, fr->pmedata, use_GPU(fr->nbv),
+                         &bPMETunePrinting);
+    }
+
+    if (!ir->bContinuation)
+    {
+        if (state->flags & (1 << estV))
+        {
+            auto v = makeArrayRef(state->v);
+            /* Set the velocities of vsites, shells and frozen atoms to zero */
+            for (i = 0; i < mdatoms->homenr; i++)
+            {
+                if (mdatoms->ptype[i] == eptVSite ||
+                    mdatoms->ptype[i] == eptShell)
+                {
+                    clear_rvec(v[i]);
+                }
+                else if (mdatoms->cFREEZE)
+                {
+                    for (m = 0; m < DIM; m++)
+                    {
+                        if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+                        {
+                            v[i][m] = 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (constr)
+        {
+            /* Constrain the initial coordinates and velocities */
+            do_constrain_first(fplog, constr, ir, mdatoms, state);
+        }
+        if (vsite)
+        {
+            /* Construct the virtual sites for the initial configuration */
+            construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
+        }
+    }
+
+    if (ir->efep != efepNO)
+    {
+        /* Set the free energy calculation frequency as the greatest common
+         * divisor of nstdhdl, nstexpanded and repl_ex_nst. */
+        nstfep = ir->fepvals->nstdhdl;
+        if (ir->bExpanded)
+        {
+            nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
+        }
+        if (useReplicaExchange)
+        {
+            nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep);
+        }
+    }
+
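+    /* For example (hypothetical values): nstdhdl = 50 with nstexpanded = 20
+     * gives nstfep = gcd(50, 20) = 10, so dH/dl is computed on every step
+     * on which either algorithm needs it. */
+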
+    /* Be REALLY careful about what flags you set here. You CANNOT assume
+     * this is the first step, since we might be restarting from a checkpoint,
+     * and in that case we should not do any modifications to the state.
+     */
+    bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
+
+    if (continuationOptions.haveReadEkin)
+    {
+        restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
+    }
+
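+    /* Flags for the initial compute_globals call(s): always reduce the
+     * temperature; with velocity Verlet also the pressure and the constraint
+     * virial; reuse the kinetic energy read from a checkpoint when present. */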
+    cglo_flags = (CGLO_INITIALIZATION | CGLO_TEMPERATURE | CGLO_GSTAT
+                  | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
+                  | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0)
+                  | (continuationOptions.haveReadEkin ? CGLO_READEKIN : 0));
+
+    bSumEkinhOld = FALSE;
+    /* To minimize communication, compute_globals computes the COM velocity
+     * and the kinetic energy for the velocities without COM motion removed.
+     * Thus to get the kinetic energy without the COM contribution, we need
+     * to call compute_globals twice.
+     */
+    for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
+    {
+        int cglo_flags_iteration = cglo_flags;
+        if (bStopCM && cgloIteration == 0)
+        {
+            cglo_flags_iteration |= CGLO_STOPCM;
+            cglo_flags_iteration &= ~CGLO_TEMPERATURE;
+        }
+        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                        nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                        constr, &nullSignaller, state->box,
+                        &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags_iteration
+                        | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0));
+    }
+    checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
+                                    top_global, top, state,
+                                    &shouldCheckNumberOfBondedInteractions);
+    if (ir->eI == eiVVAK)
+    {
+        /* a second call to get the half step temperature initialized as well */
+        /* we do the same call as above, but turn the pressure off -- internally to
+           compute_globals, this is recognized as a velocity verlet half-step
+           kinetic energy calculation.  This minimizes excess variables, but
+           perhaps loses some logic?*/
+
+        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                        nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                        constr, &nullSignaller, state->box,
+                        nullptr, &bSumEkinhOld,
+                        cglo_flags & ~CGLO_PRESSURE);
+    }
+
+    /* Calculate the initial half step temperature, and save the ekinh_old */
+    if (!continuationOptions.startedFromCheckpoint)
+    {
+        for (i = 0; (i < ir->opts.ngtc); i++)
+        {
+            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+        }
+    }
+
+    /* We need to make an initialization call to get the Trotter variables set, as well as other constants
+       for non-Trotter temperature control */
+    trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
+
+    if (MASTER(cr))
+    {
+        if (!ir->bContinuation)
+        {
+            if (constr && ir->eConstrAlg == econtLINCS)
+            {
+                fprintf(fplog,
+                        "RMS relative constraint deviation after constraining: %.2e\n",
+                        constr->rmsd());
+            }
+            if (EI_STATE_VELOCITY(ir->eI))
+            {
+                real temp = enerd->term[F_TEMP];
+                if (ir->eI != eiVV)
+                {
+                    /* Result of Ekin averaged over velocities of -half
+                     * and +half step, while we only have -half step here.
+                     */
+                    temp *= 2;
+                }
+                fprintf(fplog, "Initial temperature: %g K\n", temp);
+            }
+        }
+
+        char tbuf[20];
+        fprintf(stderr, "starting mdrun '%s'\n",
+                *(top_global->name));
+        if (ir->nsteps >= 0)
+        {
+            sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
+        }
+        else
+        {
+            sprintf(tbuf, "%s", "infinite");
+        }
+        if (ir->init_step > 0)
+        {
+            fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+                    gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
+                    gmx_step_str(ir->init_step, sbuf2),
+                    ir->init_step*ir->delta_t);
+        }
+        else
+        {
+            fprintf(stderr, "%s steps, %s ps.\n",
+                    gmx_step_str(ir->nsteps, sbuf), tbuf);
+        }
+        fprintf(fplog, "\n");
+    }
+
+    walltime_accounting_start_time(walltime_accounting);
+    wallcycle_start(wcycle, ewcRUN);
+    print_start(fplog, cr, walltime_accounting, "mdrun");
+
+#if GMX_FAHCORE
+    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
+    int chkpt_ret = fcCheckPointParallel( cr->nodeid,
+                                          NULL, 0);
+    if (chkpt_ret == 0)
+    {
+        gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
+    }
+#endif
+
+    /***********************************************************
+     *
+     *             Loop over MD steps
+     *
+     ************************************************************/
+
+    bFirstStep       = TRUE;
+    /* Skip the first Nose-Hoover integration when we get the state from tpx */
+    bInitStep        = !startingFromCheckpoint || EI_VV(ir->eI);
+    bSumEkinhOld     = FALSE;
+    bExchanged       = FALSE;
+    bNeedRepartition = FALSE;
+
+    bool simulationsShareState = false;
+    int  nstSignalComm         = nstglobalcomm;
+    {
+        // TODO This implementation of ensemble orientation restraints is nasty because
+        // a user can't just do multi-sim with single-sim orientation restraints.
+        bool usingEnsembleRestraints = (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
+        bool awhUsesMultiSim         = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
+
+        // Replica exchange, ensemble restraints and AWH need all
+        // simulations to remain synchronized, so they need
+        // checkpoints and stop conditions to act on the same step, so
+        // the propagation of such signals must take place between
+        // simulations, not just within simulations.
+        // TODO: Make algorithm initializers set these flags.
+        simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
+
+        if (simulationsShareState)
+        {
+            // Inter-simulation signal communication does not need to happen
+            // often, so we use a minimum of 200 steps to reduce overhead.
+            const int c_minimumInterSimulationSignallingInterval = 200;
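+            // Round up to a multiple of nstglobalcomm so that signalling
+            // coincides with a global communication step; e.g. (hypothetical
+            // values) nstglobalcomm = 25 gives nstSignalComm = 200, while
+            // nstglobalcomm = 30 gives nstSignalComm = 210.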
+            nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1)/nstglobalcomm)*nstglobalcomm;
+        }
+    }
+
+    auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
+                compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
+                MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
+                mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
+
+    auto checkpointHandler = compat::make_unique<CheckpointHandler>(
+                compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
+                simulationsShareState, ir->nstlist == 0, MASTER(cr),
+                mdrunOptions.writeConfout, mdrunOptions.checkpointOptions.period);
+
+    const bool resetCountersIsLocal = true;
+    auto       resetHandler         = compat::make_unique<ResetHandler>(
+                compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]), !resetCountersIsLocal,
+                ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
+                mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
+
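+    /* With domain decomposition the force computation is wrapped in a
+     * balance region, so that its cost can be measured and used for
+     * dynamic load balancing. */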
+    DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion   = (DOMAINDECOMP(cr) ? DdOpenBalanceRegionBeforeForceComputation::yes : DdOpenBalanceRegionBeforeForceComputation::no);
+    DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion  = (DOMAINDECOMP(cr) ? DdCloseBalanceRegionAfterForceComputation::yes : DdCloseBalanceRegionAfterForceComputation::no);
+
+    step     = ir->init_step;
+    step_rel = 0;
+
+    // TODO extract this to new multi-simulation module
+    if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
+    {
+        if (!multisim_int_all_are_equal(ms, ir->nsteps))
+        {
+            GMX_LOG(mdlog.warning).appendText(
+                    "Note: The number of steps is not consistent across multi simulations,\n"
+                    "but we are proceeding anyway!");
+        }
+        if (!multisim_int_all_are_equal(ms, ir->init_step))
+        {
+            GMX_LOG(mdlog.warning).appendText(
+                    "Note: The initial step is not consistent across multi simulations,\n"
+                    "but we are proceeding anyway!");
+        }
+    }
+
+    /* and stop now if we should */
+    bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
+    while (!bLastStep)
+    {
+
+        /* Determine if this is a neighbor search step */
+        bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+
+        if (bPMETune && bNStList)
+        {
+            /* PME grid + cut-off optimization with GPUs or PME nodes */
+            pme_loadbal_do(pme_loadbal, cr,
+                           (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
+                           fplog, mdlog,
+                           *ir, fr, *state,
+                           wcycle,
+                           step, step_rel,
+                           &bPMETunePrinting);
+        }
+
+        wallcycle_start(wcycle, ewcSTEP);
+
+        bLastStep = (step_rel == ir->nsteps);
+        t         = t0 + step*ir->delta_t;
+
+        // TODO Refactor this, so that nstfep does not need a default value of zero
+        if (ir->efep != efepNO || ir->bSimTemp)
+        {
+            /* find and set the current lambdas */
+            setCurrentLambdasLocal(step, ir->fepvals, lam0, state);
+
+            bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
+            bDoFEP       = ((ir->efep != efepNO) && do_per_step(step, nstfep));
+            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
+                            && (ir->bExpanded) && (step > 0) && (!startingFromCheckpoint));
+        }
+
+        bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep &&
+                     do_per_step(step, replExParams.exchangeInterval));
+
+        if (bSimAnn)
+        {
+            update_annealing_target_temp(ir, t, upd);
+        }
+
+        /* Stop Center of Mass motion */
+        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
+
+        /* Determine whether or not to do Neighbour Searching */
+        bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
+
+        bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
+
+        /* do_log triggers energy and virial calculation. Because this leads
+         * to different code paths, forces can be different. Thus for exact
+         * continuation we should avoid extra log output.
+         * Note that the || bLastStep can result in non-exact continuation
+         * beyond the last step. But we don't consider that to be an issue.
+         */
+        do_log     = do_per_step(step, ir->nstlog) || (bFirstStep && !startingFromCheckpoint) || bLastStep;
+        do_verbose = mdrunOptions.verbose &&
+            (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
+
+        if (bNS && !(bFirstStep && ir->bContinuation))
+        {
+            bMasterState = FALSE;
+            /* Correct the new box if it is too skewed */
+            if (inputrecDynamicBox(ir))
+            {
+                if (correct_box(fplog, step, state->box, graph))
+                {
+                    bMasterState = TRUE;
+                }
+            }
+            if (DOMAINDECOMP(cr) && bMasterState)
+            {
+                dd_collect_state(cr->dd, state, state_global);
+            }
+
+            if (DOMAINDECOMP(cr))
+            {
+                /* Repartition the domain decomposition */
+                dd_partition_system(fplog, mdlog, step, cr,
+                                    bMasterState, nstglobalcomm,
+                                    state_global, top_global, ir,
+                                    state, &f, mdAtoms, top, fr,
+                                    vsite, constr,
+                                    nrnb, wcycle,
+                                    do_verbose && !bPMETunePrinting);
+                shouldCheckNumberOfBondedInteractions = true;
+                update_realloc(upd, state->natoms);
+            }
+        }
+
+        if (MASTER(cr) && do_log)
+        {
+            print_ebin_header(fplog, step, t); /* can we improve the information printed here? */
+        }
+
+        if (ir->efep != efepNO)
+        {
+            update_mdatoms(mdatoms, state->lambda[efptMASS]);
+        }
+
+        if (bExchanged)
+        {
+
+            /* We need the kinetic energy at minus the half step for determining
+             * the full step kinetic energy and possibly for T-coupling.*/
+            /* This may not be quite working correctly yet . . . . */
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                            wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
+                            constr, &nullSignaller, state->box,
+                            &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                            CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
+            checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
+                                            top_global, top, state,
+                                            &shouldCheckNumberOfBondedInteractions);
+        }
+        clear_mat(force_vir);
+
+        checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
+
+        /* Determine the energy and pressure:
+         * at nstcalcenergy steps and at energy output steps (set below).
+         */
+        if (EI_VV(ir->eI) && (!bInitStep))
+        {
+            bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
+            bCalcVir      = bCalcEnerStep ||
+                (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+        }
+        else
+        {
+            bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
+            bCalcVir      = bCalcEnerStep ||
+                (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+        }
+        bCalcEner = bCalcEnerStep;
+
+        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
+
+        if (do_ene || do_log || bDoReplEx)
+        {
+            bCalcVir  = TRUE;
+            bCalcEner = TRUE;
+        }
+
+        /* Do we need global communication ? */
+        bGStat = (bCalcVir || bCalcEner || bStopCM ||
+                  do_per_step(step, nstglobalcomm) ||
+                  (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm)));
+
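+        /* Assemble the flags for this step's force calculation: the state has
+         * changed and all forces are needed every step, while the virial,
+         * energy and dH/dl terms are only requested on the steps that use them. */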
+        force_flags = (GMX_FORCE_STATECHANGED |
+                       ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0) |
+                       GMX_FORCE_ALLFORCES |
+                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+                       (bDoFEP ? GMX_FORCE_DHDL : 0)
+                       );
+
+        if (shellfc)
+        {
+            /* Now is the time to relax the shells */
+            relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose,
+                                enforcedRotation, step,
+                                ir, bNS, force_flags, top,
+                                constr, enerd, fcd,
+                                state, f.arrayRefWithPadding(), force_vir, mdatoms,
+                                nrnb, wcycle, graph, groups,
+                                shellfc, fr, ppForceWorkload, t, mu_tot,
+                                vsite,
+                                ddOpenBalanceRegion, ddCloseBalanceRegion);
+        }
+        else
+        {
+            /* The AWH history needs to be saved _before_ doing force calculations where the AWH bias is updated
+               (or the AWH update will be performed twice for one step when continuing). It would be best to
+               call this update function from do_md_trajectory_writing but that would occur after do_force.
+               One would have to divide the update_awh function into one function applying the AWH force
+               and one doing the AWH bias update. The update AWH bias function could then be called after
+               do_md_trajectory_writing (then containing update_awh_history).
+               The checkpointing will probably be moved to the start of the md loop in the future,
+               which will get rid of this issue. */
+            if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
+            {
+                awh->updateHistory(state_global->awhHistory.get());
+            }
+
+            /* The coordinates (x) are shifted (to get whole molecules)
+             * in do_force.
+             * This is parallelized as well, and does communication too.
+             * Check comments in sim_util.c
+             */
+            do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation,
+                     step, nrnb, wcycle, top, groups,
+                     state->box, state->x.arrayRefWithPadding(), &state->hist,
+                     f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd,
+                     state->lambda, graph,
+                     fr, ppForceWorkload, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
+                     (bNS ? GMX_FORCE_NS : 0) | force_flags,
+                     ddOpenBalanceRegion, ddCloseBalanceRegion);
+        }
+
+        if (EI_VV(ir->eI) && !startingFromCheckpoint)
+        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS ############### */
+        {
+            rvec *vbuf = nullptr;
+
+            wallcycle_start(wcycle, ewcUPDATE);
+            if (ir->eI == eiVV && bInitStep)
+            {
+                /* if using velocity verlet with full time step Ekin,
+                 * take the first half step only to compute the
+                 * virial for the first step. From there,
+                 * revert to the initial coordinates
+                 * so that the input is actually the initial step.
+                 */
+                snew(vbuf, state->natoms);
+                copy_rvecn(state->v.rvec_array(), vbuf, 0, state->natoms); /* should make this better for parallelizing? */
+            }
+            else
+            {
+                /* this is for NHC in the Ekin(t+dt/2) version of vv */
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
+            }
+
+            update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+                          ekind, M, upd, etrtVELOCITY1,
+                          cr, constr);
+
+            wallcycle_stop(wcycle, ewcUPDATE);
+            constrain_velocities(step, nullptr,
+                                 state,
+                                 shake_vir,
+                                 constr,
+                                 bCalcVir, do_log, do_ene);
+            wallcycle_start(wcycle, ewcUPDATE);
+            /* if VV, compute the pressure and constraints */
+            /* For VV2, we strictly only need this if using pressure
+             * control, but we really would like to have accurate pressures
+             * printed out.
+             * Think about ways around this in the future?
+             * For now, keep this choice in comments.
+             */
+            /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
+            /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
+            bPres = TRUE;
+            bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+            if (bCalcEner && ir->eI == eiVVAK)
+            {
+                bSumEkinhOld = TRUE;
+            }
+            /* for vv, the first half of the integration actually corresponds to the previous step.
+               So we need information from the last step in the first half of the integration */
+            if (bGStat || do_per_step(step-1, nstglobalcomm))
+            {
+                wallcycle_stop(wcycle, ewcUPDATE);
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, &nullSignaller, state->box,
+                                &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                                (bGStat ? CGLO_GSTAT : 0)
+                                | (bCalcEner ? CGLO_ENERGY : 0)
+                                | (bTemp ? CGLO_TEMPERATURE : 0)
+                                | (bPres ? CGLO_PRESSURE : 0)
+                                | (bPres ? CGLO_CONSTRAINT : 0)
+                                | (bStopCM ? CGLO_STOPCM : 0)
+                                | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
+                                | CGLO_SCALEEKIN
+                                );
+                /* explanation of above:
+                   a) We compute Ekin at the full time step
+                   if 1) we are using the AveVel Ekin, and it's not the
+                   initial step, or 2) if we are using AveEkin, but need the full
+                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+                   EkinAveVel because it's needed for the pressure */
+                checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
+                                                top_global, top, state,
+                                                &shouldCheckNumberOfBondedInteractions);
+                wallcycle_start(wcycle, ewcUPDATE);
+            }
+            /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+            if (!bInitStep)
+            {
+                if (bTrotter)
+                {
+                    m_add(force_vir, shake_vir, total_vir);     /* we need the un-dispersion corrected total vir here */
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
+
+                    /* TODO This is only needed when we're about to write
+                     * a checkpoint, because we use it after the restart
+                     * (in a kludge?). But what should we be doing if
+                     * startingFromCheckpoint or bInitStep are true? */
+                    if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
+                    {
+                        copy_mat(shake_vir, state->svir_prev);
+                        copy_mat(force_vir, state->fvir_prev);
+                    }
+                    if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
+                    {
+                        /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+                        enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
+                        enerd->term[F_EKIN] = trace(ekind->ekin);
+                    }
+                }
+                else if (bExchanged)
+                {
+                    wallcycle_stop(wcycle, ewcUPDATE);
+                    /* We need the kinetic energy at minus the half step for determining
+                     * the full step kinetic energy and possibly for T-coupling.*/
+                    /* This may not be quite working correctly yet . . . . */
+                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                    wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
+                                    constr, &nullSignaller, state->box,
+                                    nullptr, &bSumEkinhOld,
+                                    CGLO_GSTAT | CGLO_TEMPERATURE);
+                    wallcycle_start(wcycle, ewcUPDATE);
+                }
+            }
+            /* if it's the initial step, we performed this first step just to get the constraint virial */
+            if (ir->eI == eiVV && bInitStep)
+            {
+                copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
+                sfree(vbuf);
+            }
+            wallcycle_stop(wcycle, ewcUPDATE);
+        }
+
+        /* compute the conserved quantity */
+        if (EI_VV(ir->eI))
+        {
+            saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
+            if (ir->eI == eiVV)
+            {
+                last_ekin = enerd->term[F_EKIN];
+            }
+            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+            {
+                saved_conserved_quantity -= enerd->term[F_DISPCORR];
+            }
+            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
+            if (ir->efep != efepNO)
+            {
+                sum_dhdl(enerd, state->lambda, ir->fepvals);
+            }
+        }
+
+        /* ########  END FIRST UPDATE STEP  ############## */
+        /* ########  If doing VV, we now have v(dt) ###### */
+        if (bDoExpanded)
+        {
+            /* perform extended ensemble sampling in lambda - we don't
+               actually move to the new state before outputting
+               statistics, but if performing simulated tempering, we
+               do update the velocities and the tau_t. */
+
+            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, state->dfhist, step, state->v.rvec_array(), mdatoms);
+            /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
+            if (MASTER(cr))
+            {
+                copy_df_history(state_global->dfhist, state->dfhist);
+            }
+        }
+
+        /* Now we have the energies and forces corresponding to the
+         * coordinates at time t. We must output all of this before
+         * the update.
+         */
+        do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
+                                 ir, state, state_global, observablesHistory,
+                                 top_global, fr,
+                                 outf, mdebin, ekind, f,
+                                 checkpointHandler->isCheckpointingStep(),
+                                 bRerunMD, bLastStep,
+                                 mdrunOptions.writeConfout,
+                                 bSumEkinhOld);
+        /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
+        bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, state->x.rvec_array(), ir, t, wcycle);
+
+        /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
+        if (startingFromCheckpoint && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
+        {
+            copy_mat(state->svir_prev, shake_vir);
+            copy_mat(state->fvir_prev, force_vir);
+        }
+
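+        /* Decide whether the stop and cycle-counter-reset signals should be
+         * raised on this step; they reach the other ranks at the next global
+         * communication step. */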
+        stopHandler->setSignal();
+        resetHandler->setSignal(walltime_accounting);
+
+        if (bGStat || !PAR(cr))
+        {
+            /* In parallel we only have to check for checkpointing in steps
+             * where we do global communication,
+             *  otherwise the other nodes don't know.
+             */
+            checkpointHandler->setSignal(walltime_accounting);
+        }
+
+        /* #########   START SECOND UPDATE STEP ################# */
+
+        /* At the start of the step, randomize or scale the velocities (if doing vv);
+           the restrictions on Andersen coupling are checked in preprocessing */
+
+        if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+        {
+            gmx_bool bIfRandomize;
+            bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, upd, constr);
+            /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+            if (constr && bIfRandomize)
+            {
+                constrain_velocities(step, nullptr,
+                                     state,
+                                     tmp_vir,
+                                     constr,
+                                     bCalcVir, do_log, do_ene);
+            }
+        }
+        /* Box is changed in update() when we do pressure coupling,
+         * but we should still use the old box for energy corrections and when
+         * writing it to the energy file, so it matches the trajectory files for
+         * the same timestep above. Make a copy in a separate array.
+         */
+        copy_mat(state->box, lastbox);
+
+        dvdl_constr = 0;
+
+        wallcycle_start(wcycle, ewcUPDATE);
+        /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+        if (bTrotter)
+        {
+            trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+            /* We can only do Berendsen coupling after we have summed
+             * the kinetic energy or virial. Since this happens
+             * in global_state after update, we should only do it at
+             * step % nstlist = 1 with bGStatEveryStep=FALSE.
+             */
+        }
+        else
+        {
+            update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
+            update_pcouple_before_coordinates(fplog, step, ir, state,
+                                              parrinellorahmanMu, M,
+                                              bInitStep);
+        }
+
+        if (EI_VV(ir->eI))
+        {
+            /* velocity half-step update */
+            update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+                          ekind, M, upd, etrtVELOCITY2,
+                          cr, constr);
+        }
+
+        /* Above, initialization just copies ekinh into ekin;
+         * it doesn't copy the positions (for VV),
+         * nor the entire integrator state for MD.
+         */
+
+        if (ir->eI == eiVVAK)
+        {
+            /* We probably only need md->homenr, not state->natoms */
+            if (state->natoms > cbuf_nalloc)
+            {
+                cbuf_nalloc = state->natoms;
+                srenew(cbuf, cbuf_nalloc);
+            }
+            copy_rvecn(as_rvec_array(state->x.data()), cbuf, 0, state->natoms);
+        }
+
+        update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+                      ekind, M, upd, etrtPOSITION, cr, constr);
+        wallcycle_stop(wcycle, ewcUPDATE);
+
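+        /* Constrain the new coordinates, do the second half of the update for
+         * the SD integrator where applicable, and finalize the update by
+         * moving the new coordinates into the state. */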
+        constrain_coordinates(step, &dvdl_constr, state,
+                              shake_vir,
+                              upd, constr,
+                              bCalcVir, do_log, do_ene);
+        update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state,
+                              cr, nrnb, wcycle, upd, constr, do_log, do_ene);
+        finish_update(ir, mdatoms,
+                      state, graph,
+                      nrnb, wcycle, upd, constr);
+
+        if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
+        {
+            updatePrevStepPullCom(ir->pull_work, state);
+        }
+
+        if (ir->eI == eiVVAK)
+        {
+            /* erase F_EKIN and F_TEMP here? */
+            /* just compute the kinetic energy at the half step to perform a trotter step */
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                            wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                            constr, &nullSignaller, lastbox,
+                            nullptr, &bSumEkinhOld,
+                            (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE
+                            );
+            wallcycle_start(wcycle, ewcUPDATE);
+            trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+            /* now we know the scaling, we can compute the positions again */
+            copy_rvecn(cbuf, as_rvec_array(state->x.data()), 0, state->natoms);
+
+            update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
+                          ekind, M, upd, etrtPOSITION, cr, constr);
+            wallcycle_stop(wcycle, ewcUPDATE);
+
+            /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
+            /* are the small terms in the shake_vir here due
+             * to numerical errors, or are they important
+             * physically? I'm thinking they are just errors, but not completely sure.
+             * For now, will call without actually constraining, constr=nullptr */
+            finish_update(ir, mdatoms,
+                          state, graph,
+                          nrnb, wcycle, upd, nullptr);
+        }
+        if (EI_VV(ir->eI))
+        {
+            /* this factor of 2 correction is necessary
+               because half of the constraint force is removed
+               in the vv step, so we have to double it.  See
+               the Redmine issue #1255.  It is not yet clear
+               if the factor of 2 is exact, or just a very
+               good approximation, and this will be
+               investigated.  The next step is to see if this
+               can be done adding a dhdl contribution from the
+               rattle step, but this is somewhat more
+               complicated with the current code. Will be
+               investigated, hopefully for 4.6.3. However,
+               this current solution is much better than
+               having it completely wrong.
+             */
+            enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+        }
+        else
+        {
+            enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+        }
+
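+        /* Reconstruct the virtual sites from the updated coordinates; with a
+         * charge-group graph the molecules are first made whole and are
+         * shifted back again afterwards. */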
+        if (vsite != nullptr)
+        {
+            wallcycle_start(wcycle, ewcVSITECONSTR);
+            if (graph != nullptr)
+            {
+                shift_self(graph, state->box, state->x.rvec_array());
+            }
+            construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(),
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
+
+            if (graph != nullptr)
+            {
+                unshift_self(graph, state->box, state->x.rvec_array());
+            }
+            wallcycle_stop(wcycle, ewcVSITECONSTR);
+        }
+
+        /* ############## IF NOT VV, Calculate globals HERE  ############ */
+        /* With Leap-Frog we can skip compute_globals at
+         * non-communication steps, but we need to calculate
+         * the kinetic energy one step before communication.
+         */
+        {
+            // Organize to do inter-simulation signalling on steps if
+            // and when algorithms require it.
+            bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
+
+            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal)
+            {
+                // Since we're already communicating at this step, we
+                // can propagate intra-simulation signals. Note that
+                // check_nstglobalcomm is responsible for choosing the
+                // value of nstglobalcomm, which is one way bGStat
+                // becomes true, so we can't get into a situation where
+                // e.g. checkpointing can't be signalled.
+                bool                doIntraSimSignal = true;
+                SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
+
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, &signaller,
+                                lastbox,
+                                &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                                (bGStat ? CGLO_GSTAT : 0)
+                                | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
+                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+                                | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
+                                | CGLO_CONSTRAINT
+                                | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
+                                );
+                checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
+                                                top_global, top, state,
+                                                &shouldCheckNumberOfBondedInteractions);
+            }
+        }
+
+        /* #############  END CALC EKIN AND PRESSURE ################# */
+
+        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+           the virial that should probably be addressed eventually. state->veta has better properties,
+           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+
+        if (ir->efep != efepNO && !EI_VV(ir->eI))
+        {
+            /* Sum up the foreign energy and dhdl terms for md and sd.
+               Currently done every step so that dhdl is correct in the .edr */
+            sum_dhdl(enerd, state->lambda, ir->fepvals);
+        }
+
+        update_pcouple_after_coordinates(fplog, step, ir, mdatoms,
+                                         pres, force_vir, shake_vir,
+                                         parrinellorahmanMu,
+                                         state, nrnb, upd);
+
+        /* ################# END UPDATE STEP 2 ################# */
+        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
+
+        /* The coordinates (x) were unshifted in update */
+        if (!bGStat)
+        {
+            /* We will not sum ekinh_old,
+             * so signal that we still have to do it.
+             */
+            bSumEkinhOld = TRUE;
+        }
+
+        if (bCalcEner)
+        {
+            /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
+
+            /* use the kinetic energy determined directly from the last velocities, not the average of the half steps */
+            if (bTrotter && ir->eI == eiVV)
+            {
+                enerd->term[F_EKIN] = last_ekin;
+            }
+            enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+            if (integratorHasConservedEnergyQuantity(ir))
+            {
+                if (EI_VV(ir->eI))
+                {
+                    enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+                }
+                else
+                {
+                    enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
+                }
+            }
+            /* #########  END PREPARING EDR OUTPUT  ###########  */
+        }
+
+        /* Output stuff */
+        if (MASTER(cr))
+        {
+            if (fplog && do_log && bDoExpanded)
+            {
+                /* only needed if doing expanded ensemble */
+                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : nullptr,
+                                          state_global->dfhist, state->fep_state, ir->nstlog, step);
+            }
+            if (bCalcEner)
+            {
+                upd_mdebin(mdebin, bDoDHDL, bCalcEnerStep,
+                           t, mdatoms->tmass, enerd, state,
+                           ir->fepvals, ir->expandedvals, lastbox,
+                           shake_vir, force_vir, total_vir, pres,
+                           ekind, mu_tot, constr);
+            }
+            else
+            {
+                upd_mdebin_step(mdebin);
+            }
+
+            gmx_bool do_dr  = do_per_step(step, ir->nstdisreout);
+            gmx_bool do_or  = do_per_step(step, ir->nstorireout);
+
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : nullptr,
+                       step, t,
+                       eprNORMAL, mdebin, fcd, groups, &(ir->opts), awh.get());
+
+            if (ir->bPull)
+            {
+                pull_print_output(ir->pull_work, step, t);
+            }
+
+            if (do_per_step(step, ir->nstlog))
+            {
+                if (fflush(fplog) != 0)
+                {
+                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+                }
+            }
+        }
+        if (bDoExpanded)
+        {
+            /* Have to do this part _after_ outputting the logfile and the edr file */
+            /* Gets written into the state at the beginning of the next loop */
+            state->fep_state = lamnew;
+        }
+        /* Print the remaining wall clock time for the run */
+        if (isMasterSimMasterRank(ms, cr) &&
+            (do_verbose || gmx_got_usr_signal()) &&
+            !bPMETunePrinting)
+        {
+            if (shellfc)
+            {
+                fprintf(stderr, "\n");
+            }
+            print_time(stderr, walltime_accounting, step, ir, cr);
+        }
+
+        /* Ion/water position swapping.
+         * Not done in last step since trajectory writing happens before this call
+         * in the MD loop and exchanges would be lost anyway. */
+        bNeedRepartition = FALSE;
+        if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep &&
+            do_per_step(step, ir->swap->nstswap))
+        {
+            bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle,
+                                             as_rvec_array(state->x.data()),
+                                             state->box,
+                                             MASTER(cr) && mdrunOptions.verbose,
+                                             bRerunMD);
+
+            if (bNeedRepartition && DOMAINDECOMP(cr))
+            {
+                dd_collect_state(cr->dd, state, state_global);
+            }
+        }
+
+        /* Replica exchange */
+        bExchanged = FALSE;
+        if (bDoReplEx)
+        {
+            bExchanged = replica_exchange(fplog, cr, ms, repl_ex,
+                                          state_global, enerd,
+                                          state, step, t);
+        }
+
+        if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) )
+        {
+            dd_partition_system(fplog, mdlog, step, cr, TRUE, 1,
+                                state_global, top_global, ir,
+                                state, &f, mdAtoms, top, fr,
+                                vsite, constr,
+                                nrnb, wcycle, FALSE);
+            shouldCheckNumberOfBondedInteractions = true;
+            update_realloc(upd, state->natoms);
+        }
+
+        bFirstStep             = FALSE;
+        bInitStep              = FALSE;
+        startingFromCheckpoint = false;
+
+        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+        /* With all integrators, except VV, we need to retain the pressure
+         * at the current step for coupling at the next step.
+         */
+        if ((state->flags & (1<<estPRES_PREV)) &&
+            (bGStatEveryStep ||
+             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+        {
+            /* Store the pressure in t_state for pressure coupling
+             * at the next MD step.
+             */
+            copy_mat(pres, state->pres_prev);
+        }
+
+        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+
+        if ( (membed != nullptr) && (!bLastStep) )
+        {
+            rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
+        }
+
+        cycles = wallcycle_stop(wcycle, ewcSTEP);
+        if (DOMAINDECOMP(cr) && wcycle)
+        {
+            dd_cycles_add(cr->dd, cycles, ddCyclStep);
+        }
+
+        /* increase the MD step number */
+        step++;
+        step_rel++;
+
+        resetHandler->resetCounters(
+                step, step_rel, mdlog, fplog, cr, (use_GPU(fr->nbv) ? fr->nbv : nullptr),
+                nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
+
+        /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to the VMD client */
+        IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle);
+
+    }
+    /* End of main MD loop */
+
+    /* Closing TNG files can include compressing data. Therefore it is good to do that
+     * before stopping the time measurements. */
+    mdoutf_tng_close(outf);
+
+    /* Stop measuring walltime */
+    walltime_accounting_end_time(walltime_accounting);
+
+    if (!thisRankHasDuty(cr, DUTY_PME))
+    {
+        /* Tell the PME only node to finish */
+        gmx_pme_send_finish(cr);
+    }
+
+    if (MASTER(cr))
+    {
+        if (ir->nstcalcenergy > 0)
+        {
+            print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t,
+                       eprAVER, mdebin, fcd, groups, &(ir->opts), awh.get());
+        }
+    }
+    done_mdebin(mdebin);
+    done_mdoutf(outf);
+
+    if (bPMETune)
+    {
+        pme_loadbal_done(pme_loadbal, fplog, mdlog, use_GPU(fr->nbv));
+    }
+
+    done_shellfc(fplog, shellfc, step_rel);
+
+    if (useReplicaExchange && MASTER(cr))
+    {
+        print_replica_exchange_statistics(fplog, repl_ex);
+    }
+
+    // Clean up swapcoords
+    if (ir->eSwapCoords != eswapNO)
+    {
+        finish_swapcoords(ir->swap);
+    }
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(ir->bIMD, ir->imd);
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
+
+    destroy_enerdata(enerd);
+    sfree(enerd);
+    sfree(top);
+}
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/minimize.cpp b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/minimize.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..62a5a07a08daec87adf60e3947b0c3ecb006b4ad
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/minimize.cpp
@@ -0,0 +1,3064 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file defines integrators for energy minimization
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <cmath>
+#include <cstring>
+#include <ctime>
+
+#include <algorithm>
+#include <vector>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/domdec/collect.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/partition.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/fileio/confio.h"
+#include "gromacs/fileio/mtxio.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/linearalgebra/sparsematrix.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdebin.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/mdsetup.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/shellfc.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/tgroup.h"
+#include "gromacs/mdlib/trajectory_writing.h"
+#include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/logger.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "integrator.h"
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+extern void(*plumedcmd)(plumed,const char*,const void*);
+/* END PLUMED */
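+/* Note on the call pattern used throughout this file: plumedcmd is a
+ * function pointer resolved at runtime, and (*plumedcmd)(plumedmain, key,
+ * ptr) mirrors plumed_cmd() from the PLUMED C API: the key string selects
+ * the action and the pointer carries its argument, e.g.
+ *   (*plumedcmd)(plumedmain, "setNatoms", &top_global->natoms);
+ */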
+
+//! Utility structure for manipulating states during EM
+typedef struct {
+    //! Copy of the global state
+    t_state                 s;
+    //! Force array
+    PaddedVector<gmx::RVec> f;
+    //! Potential energy
+    real                    epot;
+    //! Norm of the force
+    real                    fnorm;
+    //! Maximum force
+    real                    fmax;
+    //! Atom with the maximum force
+    int                     a_fmax;
+} em_state_t;
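+/* The minimizers below keep several of these states (e.g. s_min, s_a, s_b
+ * and s_c in do_cg) and exchange them with swap_em_state(), which swaps the
+ * pointers rather than copying the coordinate and force arrays. */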
+
+//! Print the EM starting conditions
+static void print_em_start(FILE                     *fplog,
+                           const t_commrec          *cr,
+                           gmx_walltime_accounting_t walltime_accounting,
+                           gmx_wallcycle_t           wcycle,
+                           const char               *name)
+{
+    walltime_accounting_start_time(walltime_accounting);
+    wallcycle_start(wcycle, ewcRUN);
+    print_start(fplog, cr, walltime_accounting, name);
+}
+
+//! Stop counting time for EM
+static void em_time_end(gmx_walltime_accounting_t walltime_accounting,
+                        gmx_wallcycle_t           wcycle)
+{
+    wallcycle_stop(wcycle, ewcRUN);
+
+    walltime_accounting_end_time(walltime_accounting);
+}
+
+//! Print a header to the log file and the console
+static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps)
+{
+    fprintf(out, "\n");
+    fprintf(out, "%s:\n", minimizer);
+    fprintf(out, "   Tolerance (Fmax)   = %12.5e\n", ftol);
+    fprintf(out, "   Number of steps    = %12d\n", nsteps);
+}
+
+//! Print warning message
+static void warn_step(FILE     *fp,
+                      real      ftol,
+                      real      fmax,
+                      gmx_bool  bLastStep,
+                      gmx_bool  bConstrain)
+{
+    constexpr bool realIsDouble = GMX_DOUBLE;
+    char           buffer[2048];
+
+    if (!std::isfinite(fmax))
+    {
+        sprintf(buffer,
+                "\nEnergy minimization has stopped because the force "
+                "on at least one atom is not finite. This usually means "
+                "atoms are overlapping. Modify the input coordinates to "
+                "remove atom overlap or use soft-core potentials with "
+                "the free energy code to avoid infinite forces.\n%s",
+                !realIsDouble ?
+                "You could also be lucky that switching to double precision "
+                "is sufficient to obtain finite forces.\n" :
+                "");
+    }
+    else if (bLastStep)
+    {
+        sprintf(buffer,
+                "\nEnergy minimization reached the maximum number "
+                "of steps before the forces reached the requested "
+                "precision Fmax < %g.\n", ftol);
+    }
+    else
+    {
+        sprintf(buffer,
+                "\nEnergy minimization has stopped, but the forces have "
+                "not converged to the requested precision Fmax < %g (which "
+                "may not be possible for your system). It stopped "
+                "because the algorithm tried to make a new step whose size "
+                "was too small, or there was no change in the energy since "
+                "last step. Either way, we regard the minimization as "
+                "converged to within the available machine precision, "
+                "given your starting configuration and EM parameters.\n%s%s",
+                ftol,
+                !realIsDouble ?
+                "\nDouble precision normally gives you higher accuracy, but "
+                "this is often not needed for preparing to run molecular "
+                "dynamics.\n" :
+                "",
+                bConstrain ?
+                "You might need to increase your constraint accuracy, or turn\n"
+                "off constraints altogether (set constraints = none in mdp file)\n" :
+                "");
+    }
+
+    fputs(wrap_lines(buffer, 78, 0, FALSE), stderr);
+    fputs(wrap_lines(buffer, 78, 0, FALSE), fp);
+}
+
+//! Print message about convergence of the EM
+static void print_converged(FILE *fp, const char *alg, real ftol,
+                            int64_t count, gmx_bool bDone, int64_t nsteps,
+                            const em_state_t *ems, double sqrtNumAtoms)
+{
+    char buf[STEPSTRSIZE];
+
+    if (bDone)
+    {
+        fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n",
+                alg, ftol, gmx_step_str(count, buf));
+    }
+    else if (count < nsteps)
+    {
+        fprintf(fp, "\n%s converged to machine precision in %s steps,\n"
+                "but did not reach the requested Fmax < %g.\n",
+                alg, gmx_step_str(count, buf), ftol);
+    }
+    else
+    {
+        fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n",
+                alg, ftol, gmx_step_str(count, buf));
+    }
+
+#if GMX_DOUBLE
+    fprintf(fp, "Potential Energy  = %21.14e\n", ems->epot);
+    fprintf(fp, "Maximum force     = %21.14e on atom %d\n", ems->fmax, ems->a_fmax + 1);
+    fprintf(fp, "Norm of force     = %21.14e\n", ems->fnorm/sqrtNumAtoms);
+#else
+    fprintf(fp, "Potential Energy  = %14.7e\n", ems->epot);
+    fprintf(fp, "Maximum force     = %14.7e on atom %d\n", ems->fmax, ems->a_fmax + 1);
+    fprintf(fp, "Norm of force     = %14.7e\n", ems->fnorm/sqrtNumAtoms);
+#endif
+}
+
+//! Compute the norm and max of the force array in parallel
+static void get_f_norm_max(const t_commrec *cr,
+                           t_grpopts *opts, t_mdatoms *mdatoms, const rvec *f,
+                           real *fnorm, real *fmax, int *a_fmax)
+{
+    double fnorm2, *sum;
+    real   fmax2, fam;
+    int    la_max, a_max, start, end, i, m, gf;
+
+    /* This routine finds the force norm, the largest force component
+     * and the atom carrying it. On parallel machines the global maximum is taken.
+     */
+    fnorm2 = 0;
+    fmax2  = 0;
+    la_max = -1;
+    start  = 0;
+    end    = mdatoms->homenr;
+    if (mdatoms->cFREEZE)
+    {
+        for (i = start; i < end; i++)
+        {
+            gf  = mdatoms->cFREEZE[i];
+            fam = 0;
+            for (m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    fam += gmx::square(f[i][m]);
+                }
+            }
+            fnorm2 += fam;
+            if (fam > fmax2)
+            {
+                fmax2  = fam;
+                la_max = i;
+            }
+        }
+    }
+    else
+    {
+        for (i = start; i < end; i++)
+        {
+            fam     = norm2(f[i]);
+            fnorm2 += fam;
+            if (fam > fmax2)
+            {
+                fmax2  = fam;
+                la_max = i;
+            }
+        }
+    }
+
+    if (la_max >= 0 && DOMAINDECOMP(cr))
+    {
+        a_max = cr->dd->globalAtomIndices[la_max];
+    }
+    else
+    {
+        a_max = la_max;
+    }
+    if (PAR(cr))
+    {
+        snew(sum, 2*cr->nnodes+1);
+        sum[2*cr->nodeid]   = fmax2;
+        sum[2*cr->nodeid+1] = a_max;
+        sum[2*cr->nnodes]   = fnorm2;
+        gmx_sumd(2*cr->nnodes+1, sum, cr);
+        fnorm2 = sum[2*cr->nnodes];
+        /* Determine the global maximum */
+        for (i = 0; i < cr->nnodes; i++)
+        {
+            if (sum[2*i] > fmax2)
+            {
+                fmax2 = sum[2*i];
+                a_max = gmx::roundToInt(sum[2*i+1]);
+            }
+        }
+        sfree(sum);
+    }
+
+    if (fnorm)
+    {
+        *fnorm = sqrt(fnorm2);
+    }
+    if (fmax)
+    {
+        *fmax  = sqrt(fmax2);
+    }
+    if (a_fmax)
+    {
+        *a_fmax = a_max;
+    }
+}
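+/* A sketch of the reduction buffer used above, for nnodes ranks:
+ *   sum[2*i]      squared local fmax of rank i
+ *   sum[2*i+1]    global index of the atom carrying it (stored as a double)
+ *   sum[2*nnodes] accumulated squared fnorm
+ * After gmx_sumd() every rank holds all contributions and scans the pairs
+ * to pick the global maximum. */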
+
+//! Compute the norm and maximum of the force for an EM state
+static void get_state_f_norm_max(const t_commrec *cr,
+                                 t_grpopts *opts, t_mdatoms *mdatoms,
+                                 em_state_t *ems)
+{
+    get_f_norm_max(cr, opts, mdatoms, ems->f.rvec_array(),
+                   &ems->fnorm, &ems->fmax, &ems->a_fmax);
+}
+
+//! Initialize the energy minimization
+static void init_em(FILE *fplog,
+                    const gmx::MDLogger &mdlog,
+                    const char *title,
+                    const t_commrec *cr,
+                    const gmx_multisim_t *ms,
+                    gmx::IMDOutputProvider *outputProvider,
+                    t_inputrec *ir,
+                    const MdrunOptions &mdrunOptions,
+                    t_state *state_global, gmx_mtop_t *top_global,
+                    em_state_t *ems, gmx_localtop_t **top,
+                    t_nrnb *nrnb, rvec mu_tot,
+                    t_forcerec *fr, gmx_enerdata_t **enerd,
+                    t_graph **graph, gmx::MDAtoms *mdAtoms, gmx_global_stat_t *gstat,
+                    gmx_vsite_t *vsite, gmx::Constraints *constr, gmx_shellfc_t **shellfc,
+                    int nfile, const t_filenm fnm[],
+                    gmx_mdoutf_t *outf, t_mdebin **mdebin,
+                    gmx_wallcycle_t wcycle)
+{
+    real dvdl_constr;
+
+    if (fplog)
+    {
+        fprintf(fplog, "Initiating %s\n", title);
+    }
+
+    if (MASTER(cr))
+    {
+        state_global->ngtc = 0;
+
+        /* Initialize lambda variables */
+        initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, nullptr);
+    }
+
+    init_nrnb(nrnb);
+
+    /* Interactive molecular dynamics */
+    init_IMD(ir, cr, ms, top_global, fplog, 1,
+             MASTER(cr) ? state_global->x.rvec_array() : nullptr,
+             nfile, fnm, nullptr, mdrunOptions);
+
+    if (ir->eI == eiNM)
+    {
+        GMX_ASSERT(shellfc != nullptr, "With NM we always support shells");
+
+        *shellfc = init_shell_flexcon(stdout,
+                                      top_global,
+                                      constr ? constr->numFlexibleConstraints() : 0,
+                                      ir->nstcalcenergy,
+                                      DOMAINDECOMP(cr));
+    }
+    else
+    {
+        GMX_ASSERT(EI_ENERGY_MINIMIZATION(ir->eI), "This else currently only handles energy minimizers, consider if your algorithm needs shell/flexible-constraint support");
+
+        /* With energy minimization, shells and flexible constraints are
+         * automatically minimized when treated like normal DOFs.
+         */
+        if (shellfc != nullptr)
+        {
+            *shellfc = nullptr;
+        }
+    }
+
+    auto mdatoms = mdAtoms->mdatoms();
+    if (DOMAINDECOMP(cr))
+    {
+        *top = dd_init_local_top(top_global);
+
+        dd_init_local_state(cr->dd, state_global, &ems->s);
+
+        /* Distribute the charge groups over the nodes from the master node */
+        dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1,
+                            state_global, top_global, ir,
+                            &ems->s, &ems->f, mdAtoms, *top,
+                            fr, vsite, constr,
+                            nrnb, nullptr, FALSE);
+        dd_store_state(cr->dd, &ems->s);
+
+        *graph = nullptr;
+    }
+    else
+    {
+        state_change_natoms(state_global, state_global->natoms);
+        /* Just copy the state */
+        ems->s = *state_global;
+        state_change_natoms(&ems->s, ems->s.natoms);
+        ems->f.resizeWithPadding(ems->s.natoms);
+
+        snew(*top, 1);
+        mdAlgorithmsSetupAtomData(cr, ir, top_global, *top, fr,
+                                  graph, mdAtoms,
+                                  constr, vsite, shellfc ? *shellfc : nullptr);
+
+        if (vsite)
+        {
+            set_vsite_top(vsite, *top, mdatoms);
+        }
+    }
+
+    update_mdatoms(mdAtoms->mdatoms(), ems->s.lambda[efptMASS]);
+
+    if (constr)
+    {
+        // TODO how should this cross-module support dependency be managed?
+        if (ir->eConstrAlg == econtSHAKE &&
+            gmx_mtop_ftype_count(top_global, F_CONSTR) > 0)
+        {
+            gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n",
+                      econstr_names[econtSHAKE], econstr_names[econtLINCS]);
+        }
+
+        if (!ir->bContinuation)
+        {
+            /* Constrain the starting coordinates */
+            dvdl_constr = 0;
+            constr->apply(TRUE, TRUE,
+                          -1, 0, 1.0,
+                          ems->s.x.rvec_array(),
+                          ems->s.x.rvec_array(),
+                          nullptr,
+                          ems->s.box,
+                          ems->s.lambda[efptFEP], &dvdl_constr,
+                          nullptr, nullptr, gmx::ConstraintVariable::Positions);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        *gstat = global_stat_init(ir);
+    }
+    else
+    {
+        *gstat = nullptr;
+    }
+
+    *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, top_global, nullptr, wcycle);
+
+    snew(*enerd, 1);
+    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+                  *enerd);
+
+    if (mdebin != nullptr)
+    {
+        /* Initialize the bin for energy output */
+        *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, nullptr);
+    }
+
+    clear_rvec(mu_tot);
+    calc_shifts(ems->s.box, fr->shift_vec);
+
+    /* PLUMED */
+    if(plumedswitch){
+      if(ms && ms->nsim>1) {
+        if(MASTER(cr)) (*plumedcmd) (plumedmain,"GREX setMPIIntercomm",&ms->mpi_comm_masters);
+        if(PAR(cr)){
+          if(DOMAINDECOMP(cr)) {
+            (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all);
+          }else{
+            (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim);
+          }
+        }
+        (*plumedcmd) (plumedmain,"GREX init",NULL);
+      }
+      if(PAR(cr)){
+        if(DOMAINDECOMP(cr)) {
+          (*plumedcmd) (plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
+        }else{
+          (*plumedcmd) (plumedmain,"setMPIComm",&cr->mpi_comm_mysim);
+        }
+      }
+      (*plumedcmd) (plumedmain,"setNatoms",&top_global->natoms);
+      (*plumedcmd) (plumedmain,"setMDEngine","gromacs");
+      (*plumedcmd) (plumedmain,"setLog",fplog);
+      real real_delta_t;
+      real_delta_t=ir->delta_t;
+      (*plumedcmd) (plumedmain,"setTimestep",&real_delta_t);
+      (*plumedcmd) (plumedmain,"init",NULL);
+
+      if(PAR(cr)){
+        if(DOMAINDECOMP(cr)) {
+          int nat_home = dd_numHomeAtoms(*cr->dd);
+          (*plumedcmd) (plumedmain,"setAtomsNlocal",&nat_home);
+          (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
+
+        }
+      }
+    }
+    /* END PLUMED */
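+    /* Note the ordering above: communicators and static properties (number
+     * of atoms, MD engine name, log file, timestep) are passed to PLUMED
+     * before "init", while the local atom count and global indices only
+     * apply with domain decomposition and must be refreshed whenever the
+     * decomposition changes. */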
+}
+
+//! Finalize the minimization
+static void finish_em(const t_commrec *cr, gmx_mdoutf_t outf,
+                      gmx_walltime_accounting_t walltime_accounting,
+                      gmx_wallcycle_t wcycle)
+{
+    if (!thisRankHasDuty(cr, DUTY_PME))
+    {
+        /* Tell the PME only node to finish */
+        gmx_pme_send_finish(cr);
+    }
+
+    done_mdoutf(outf);
+
+    em_time_end(walltime_accounting, wcycle);
+}
+
+//! Swap two different EM states during minimization
+static void swap_em_state(em_state_t **ems1, em_state_t **ems2)
+{
+    em_state_t *tmp;
+
+    tmp   = *ems1;
+    *ems1 = *ems2;
+    *ems2 = tmp;
+}
+
+//! Save the EM trajectory
+static void write_em_traj(FILE *fplog, const t_commrec *cr,
+                          gmx_mdoutf_t outf,
+                          gmx_bool bX, gmx_bool bF, const char *confout,
+                          gmx_mtop_t *top_global,
+                          t_inputrec *ir, int64_t step,
+                          em_state_t *state,
+                          t_state *state_global,
+                          ObservablesHistory *observablesHistory)
+{
+    int mdof_flags = 0;
+
+    if (bX)
+    {
+        mdof_flags |= MDOF_X;
+    }
+    if (bF)
+    {
+        mdof_flags |= MDOF_F;
+    }
+
+    /* If we want IMD output, set the appropriate MDOF flag */
+    if (ir->bIMD)
+    {
+        mdof_flags |= MDOF_IMD;
+    }
+
+    mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags,
+                                     top_global, step, static_cast<double>(step),
+                                     &state->s, state_global, observablesHistory,
+                                     state->f);
+
+    if (confout != nullptr)
+    {
+        if (DOMAINDECOMP(cr))
+        {
+            /* If bX=true, x was collected to state_global in the call above */
+            if (!bX)
+            {
+                gmx::ArrayRef<gmx::RVec> globalXRef = MASTER(cr) ? makeArrayRef(state_global->x) : gmx::EmptyArrayRef();
+                dd_collect_vec(cr->dd, &state->s, makeArrayRef(state->s.x), globalXRef);
+            }
+        }
+        else
+        {
+            /* Copy the local state pointer */
+            state_global = &state->s;
+        }
+
+        if (MASTER(cr))
+        {
+            if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr))
+            {
+                /* Make molecules whole only for confout writing */
+                do_pbc_mtop(fplog, ir->ePBC, state->s.box, top_global,
+                            state_global->x.rvec_array());
+            }
+
+            write_sto_conf_mtop(confout,
+                                *top_global->name, top_global,
+                                state_global->x.rvec_array(), nullptr, ir->ePBC, state->s.box);
+        }
+    }
+}
+
+//! \brief Do one minimization step
+//
+// \returns true when the step succeeded, false when a constraint error occurred
+static bool do_em_step(const t_commrec *cr,
+                       t_inputrec *ir, t_mdatoms *md,
+                       em_state_t *ems1, real a, const PaddedVector<gmx::RVec> *force,
+                       em_state_t *ems2,
+                       gmx::Constraints *constr,
+                       int64_t count)
+
+{
+    t_state *s1, *s2;
+    int      start, end;
+    real     dvdl_constr;
+    int      nthreads gmx_unused;
+
+    bool     validStep = true;
+
+    s1 = &ems1->s;
+    s2 = &ems2->s;
+
+    if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count)
+    {
+        gmx_incons("state mismatch in do_em_step");
+    }
+
+    s2->flags = s1->flags;
+
+    if (s2->natoms != s1->natoms)
+    {
+        state_change_natoms(s2, s1->natoms);
+        ems2->f.resizeWithPadding(s2->natoms);
+    }
+    if (DOMAINDECOMP(cr) && s2->cg_gl.size() != s1->cg_gl.size())
+    {
+        s2->cg_gl.resize(s1->cg_gl.size());
+    }
+
+    copy_mat(s1->box, s2->box);
+    /* Copy free energy state */
+    s2->lambda = s1->lambda;
+
+    start = 0;
+    end   = md->homenr;
+
+    nthreads = gmx_omp_nthreads_get(emntUpdate);
+#pragma omp parallel num_threads(nthreads)
+    {
+        const rvec *x1 = s1->x.rvec_array();
+        rvec       *x2 = s2->x.rvec_array();
+        const rvec *f  = force->rvec_array();
+
+        int         gf = 0;
+#pragma omp for schedule(static) nowait
+        for (int i = start; i < end; i++)
+        {
+            try
+            {
+                if (md->cFREEZE)
+                {
+                    gf = md->cFREEZE[i];
+                }
+                for (int m = 0; m < DIM; m++)
+                {
+                    if (ir->opts.nFreeze[gf][m])
+                    {
+                        x2[i][m] = x1[i][m];
+                    }
+                    else
+                    {
+                        x2[i][m] = x1[i][m] + a*f[i][m];
+                    }
+                }
+            }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+        }
+
+        if (s2->flags & (1<<estCGP))
+        {
+            /* Copy the CG p vector */
+            const rvec *p1 = s1->cg_p.rvec_array();
+            rvec       *p2 = s2->cg_p.rvec_array();
+#pragma omp for schedule(static) nowait
+            for (int i = start; i < end; i++)
+            {
+                // Trivial OpenMP block that does not throw
+                copy_rvec(p1[i], p2[i]);
+            }
+        }
+
+        if (DOMAINDECOMP(cr))
+        {
+            s2->ddp_count = s1->ddp_count;
+
+            /* OpenMP does not support unsigned loop variables */
+#pragma omp for schedule(static) nowait
+            for (int i = 0; i < static_cast<int>(s2->cg_gl.size()); i++)
+            {
+                s2->cg_gl[i] = s1->cg_gl[i];
+            }
+            s2->ddp_count_cg_gl = s1->ddp_count_cg_gl;
+        }
+    }
+
+    if (constr)
+    {
+        dvdl_constr = 0;
+        validStep   =
+            constr->apply(TRUE, TRUE,
+                          count, 0, 1.0,
+                          s1->x.rvec_array(), s2->x.rvec_array(),
+                          nullptr, s2->box,
+                          s2->lambda[efptBONDED], &dvdl_constr,
+                          nullptr, nullptr, gmx::ConstraintVariable::Positions);
+
+        if (cr->nnodes > 1)
+        {
+            /* This global reduction will affect performance at high
+             * parallelization, but we cannot really avoid it.
+             * Usually, however, EM is not run at high parallelization.
+             */
+            int reductionBuffer = static_cast<int>(!validStep);
+            gmx_sumi(1, &reductionBuffer, cr);
+            validStep           = (reductionBuffer == 0);
+        }
+
+        // We should move this check to the different minimizers
+        if (!validStep && ir->eI != eiSteep)
+        {
+            gmx_fatal(FARGS, "The coordinates could not be constrained. Minimizer '%s' can not handle constraint failures, use minimizer '%s' before using '%s'.",
+                      EI(ir->eI), EI(eiSteep), EI(ir->eI));
+        }
+    }
+
+    return validStep;
+}
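+/* In short, do_em_step() performs the generic trial move of the minimizers,
+ *   x2 = x1 + a*d   (componentwise; frozen dimensions are kept fixed)
+ * where d is the force for steepest descent or the search direction for CG,
+ * optionally followed by constraining the new coordinates. */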
+
+//! Prepare EM for using domain decomposition parallelization
+static void em_dd_partition_system(FILE *fplog,
+                                   const gmx::MDLogger &mdlog,
+                                   int step, const t_commrec *cr,
+                                   gmx_mtop_t *top_global, t_inputrec *ir,
+                                   em_state_t *ems, gmx_localtop_t *top,
+                                   gmx::MDAtoms *mdAtoms, t_forcerec *fr,
+                                   gmx_vsite_t *vsite, gmx::Constraints *constr,
+                                   t_nrnb *nrnb, gmx_wallcycle_t wcycle)
+{
+    /* Repartition the domain decomposition */
+    dd_partition_system(fplog, mdlog, step, cr, FALSE, 1,
+                        nullptr, top_global, ir,
+                        &ems->s, &ems->f,
+                        mdAtoms, top, fr, vsite, constr,
+                        nrnb, wcycle, FALSE);
+    dd_store_state(cr->dd, &ems->s);
+}
+
+namespace
+{
+
+/*! \brief Class to handle the work of setting and doing an energy evaluation.
+ *
+ * This class is a mere aggregate of parameters to pass to evaluate an
+ * energy, so that future changes to their names and types consume
+ * less time when refactoring other code.
+ *
+ * Aggregate initialization is used, for which the chief risk is that
+ * if a member is added at the end and not all initializer lists are
+ * updated, then the member will be value initialized, which will
+ * typically mean initialization to zero.
+ *
+ * We only want to construct one of these with an initializer list, so
+ * we explicitly delete the default constructor. */
+class EnergyEvaluator
+{
+    public:
+        //! We only intend to construct such objects with an initializer list.
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
+        // Aspects of the C++11 spec changed after GCC 4.8.5, and
+        // compilation of the initializer list construction in
+        // runner.cpp fails in GCC 4.8.5.
+        EnergyEvaluator() = delete;
+#endif
+        /*! \brief Evaluates an energy on the state in \c ems.
+         *
+         * \todo In practice, the same objects mu_tot, vir, and pres
+         * are always passed to this function, so we would rather have
+         * them as data members. However, their C-array types are
+         * unsuited for aggregate initialization. When the types
+         * improve, the call signature of this method can be reduced.
+         */
+        void run(em_state_t *ems, rvec mu_tot,
+                 tensor vir, tensor pres,
+                 int64_t count, gmx_bool bFirst);
+        //! Handles logging (deprecated).
+        FILE                 *fplog;
+        //! Handles logging.
+        const gmx::MDLogger  &mdlog;
+        //! Handles communication.
+        const t_commrec      *cr;
+        //! Coordinates multi-simulations.
+        const gmx_multisim_t *ms;
+        //! Holds the simulation topology.
+        gmx_mtop_t           *top_global;
+        //! Holds the domain topology.
+        gmx_localtop_t       *top;
+        //! User input options.
+        t_inputrec           *inputrec;
+        //! Manages flop accounting.
+        t_nrnb               *nrnb;
+        //! Manages wall cycle accounting.
+        gmx_wallcycle_t       wcycle;
+        //! Coordinates global reduction.
+        gmx_global_stat_t     gstat;
+        //! Handles virtual sites.
+        gmx_vsite_t          *vsite;
+        //! Handles constraints.
+        gmx::Constraints     *constr;
+        //! Data for special interactions such as restraints and tabulated bondeds.
+        t_fcdata             *fcd;
+        //! Molecular graph for SHAKE.
+        t_graph              *graph;
+        //! Per-atom data for this domain.
+        gmx::MDAtoms         *mdAtoms;
+        //! Handles how to calculate the forces.
+        t_forcerec           *fr;
+        //! Schedule of force-calculation work each step for this task.
+        gmx::PpForceWorkload *ppForceWorkload;
+        //! Stores the computed energies.
+        gmx_enerdata_t       *enerd;
+};
+
+void
+EnergyEvaluator::run(em_state_t *ems, rvec mu_tot,
+                     tensor vir, tensor pres,
+                     int64_t count, gmx_bool bFirst)
+{
+    real     t;
+    gmx_bool bNS;
+    tensor   force_vir, shake_vir, ekin;
+    real     dvdl_constr, prescorr, enercorr, dvdlcorr;
+    real     terminate = 0;
+
+    /* Set the time to the initial time; the time does not change during EM */
+    t = inputrec->init_t;
+
+    if (bFirst ||
+        (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count))
+    {
+        /* This is the first state or an old state used before the last ns */
+        bNS = TRUE;
+    }
+    else
+    {
+        bNS = (inputrec->nstlist > 0);
+    }
+
+    if (vsite)
+    {
+        construct_vsites(vsite, ems->s.x.rvec_array(), 1, nullptr,
+                         top->idef.iparams, top->idef.il,
+                         fr->ePBC, fr->bMolPBC, cr, ems->s.box);
+    }
+
+    if (DOMAINDECOMP(cr) && bNS)
+    {
+        /* Repartition the domain decomposition */
+        em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec,
+                               ems, top, mdAtoms, fr, vsite, constr,
+                               nrnb, wcycle);
+    }
+
+    /* Calc force & energy on new trial position  */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole in congrad.c
+     */
+    /* PLUMED */
+    int plumedNeedsEnergy=0;
+    matrix plumed_vir;
+    if(plumedswitch){
+      long int lstep=count; (*plumedcmd)(plumedmain,"setStepLong",&lstep);
+      (*plumedcmd) (plumedmain,"setPositions",&ems->s.x[0][0]);
+      (*plumedcmd) (plumedmain,"setMasses",&mdAtoms->mdatoms()->massT[0]);
+      (*plumedcmd) (plumedmain,"setCharges",&mdAtoms->mdatoms()->chargeA[0]);
+      (*plumedcmd) (plumedmain,"setBox",&ems->s.box[0][0]);
+      (*plumedcmd) (plumedmain,"prepareCalc",NULL);
+      (*plumedcmd) (plumedmain,"setForces",&ems->f[0][0]);
+      (*plumedcmd) (plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
+      clear_mat(plumed_vir);
+      (*plumedcmd) (plumedmain,"setVirial",&plumed_vir[0][0]);
+    }
+    /* END PLUMED */
+
+    do_force(fplog, cr, ms, inputrec, nullptr, nullptr,
+             count, nrnb, wcycle, top, &top_global->groups,
+             ems->s.box, ems->s.x.arrayRefWithPadding(), &ems->s.hist,
+             ems->f.arrayRefWithPadding(), force_vir, mdAtoms->mdatoms(), enerd, fcd,
+             ems->s.lambda, graph, fr, ppForceWorkload, vsite, mu_tot, t, nullptr,
+             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES |
+             GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY |
+             (bNS ? GMX_FORCE_NS : 0),
+             DOMAINDECOMP(cr) ?
+             DdOpenBalanceRegionBeforeForceComputation::yes :
+             DdOpenBalanceRegionBeforeForceComputation::no,
+             DOMAINDECOMP(cr) ?
+             DdCloseBalanceRegionAfterForceComputation::yes :
+             DdCloseBalanceRegionAfterForceComputation::no);
+    /* PLUMED */
+    if(plumedswitch){
+      if(plumedNeedsEnergy) {
+        msmul(force_vir,2.0,plumed_vir);
+        (*plumedcmd) (plumedmain,"setEnergy",&enerd->term[F_EPOT]);
+        (*plumedcmd) (plumedmain,"performCalc",NULL);
+        msmul(plumed_vir,0.5,force_vir);
+      } else {
+        msmul(plumed_vir,0.5,plumed_vir);
+        m_add(force_vir,plumed_vir,force_vir);
+      }
+    }
+    /* END PLUMED */
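+    /* The 2.0/0.5 scalings above reflect the differing virial conventions:
+     * GROMACS stores half the virial (vir = -0.5*sum_i r_i (x) f_i) while
+     * PLUMED works with the full virial in the buffer passed to "setVirial".
+     * When PLUMED needs the energy, the GROMACS virial is doubled before
+     * performCalc and halved back afterwards; otherwise only PLUMED's own
+     * contribution is halved before being added. */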
+
+    /* Clear the unused shake virial and pressure */
+    clear_mat(shake_vir);
+    clear_mat(pres);
+
+    /* Communicate stuff when parallel */
+    if (PAR(cr) && inputrec->eI != eiNM)
+    {
+        wallcycle_start(wcycle, ewcMoveE);
+
+        global_stat(gstat, cr, enerd, force_vir, shake_vir, mu_tot,
+                    inputrec, nullptr, nullptr, nullptr, 1, &terminate,
+                    nullptr, FALSE,
+                    CGLO_ENERGY |
+                    CGLO_PRESSURE |
+                    CGLO_CONSTRAINT);
+
+        wallcycle_stop(wcycle, ewcMoveE);
+    }
+
+    /* Calculate long range corrections to pressure and energy */
+    calc_dispcorr(inputrec, fr, ems->s.box, ems->s.lambda[efptVDW],
+                  pres, force_vir, &prescorr, &enercorr, &dvdlcorr);
+    enerd->term[F_DISPCORR] = enercorr;
+    enerd->term[F_EPOT]    += enercorr;
+    enerd->term[F_PRES]    += prescorr;
+    enerd->term[F_DVDL]    += dvdlcorr;
+
+    ems->epot = enerd->term[F_EPOT];
+
+    if (constr)
+    {
+        /* Project out the constraint components of the force */
+        dvdl_constr = 0;
+        rvec *f_rvec = ems->f.rvec_array();
+        constr->apply(FALSE, FALSE,
+                      count, 0, 1.0,
+                      ems->s.x.rvec_array(), f_rvec, f_rvec,
+                      ems->s.box,
+                      ems->s.lambda[efptBONDED], &dvdl_constr,
+                      nullptr, &shake_vir, gmx::ConstraintVariable::ForceDispl);
+        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+        m_add(force_vir, shake_vir, vir);
+    }
+    else
+    {
+        copy_mat(force_vir, vir);
+    }
+
+    clear_mat(ekin);
+    enerd->term[F_PRES] =
+        calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres);
+
+    sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals);
+
+    if (EI_ENERGY_MINIMIZATION(inputrec->eI))
+    {
+        get_state_f_norm_max(cr, &(inputrec->opts), mdAtoms->mdatoms(), ems);
+    }
+}
+
+} // namespace
+
+//! Parallel utility computing the conjugate-gradient partial sum when DD atom orders differ
+static double reorder_partsum(const t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
+                              gmx_mtop_t *top_global,
+                              em_state_t *s_min, em_state_t *s_b)
+{
+    t_block       *cgs_gl;
+    int            ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m;
+    double         partsum;
+    unsigned char *grpnrFREEZE;
+
+    if (debug)
+    {
+        fprintf(debug, "Doing reorder_partsum\n");
+    }
+
+    const rvec *fm = s_min->f.rvec_array();
+    const rvec *fb = s_b->f.rvec_array();
+
+    cgs_gl = dd_charge_groups_global(cr->dd);
+    index  = cgs_gl->index;
+
+    /* Collect fm in a global vector fmg.
+     * This conflicts with the spirit of domain decomposition,
+     * but to fully optimize this a much more complicated algorithm is required.
+     */
+    rvec *fmg;
+    snew(fmg, top_global->natoms);
+
+    ncg   = s_min->s.cg_gl.size();
+    cg_gl = s_min->s.cg_gl.data();
+    i     = 0;
+    for (c = 0; c < ncg; c++)
+    {
+        cg = cg_gl[c];
+        a0 = index[cg];
+        a1 = index[cg+1];
+        for (a = a0; a < a1; a++)
+        {
+            copy_rvec(fm[i], fmg[a]);
+            i++;
+        }
+    }
+    gmx_sum(top_global->natoms*3, fmg[0], cr);
+
+    /* Now we will determine the part of the sum for the cgs in state s_b */
+    ncg         = s_b->s.cg_gl.size();
+    cg_gl       = s_b->s.cg_gl.data();
+    partsum     = 0;
+    i           = 0;
+    gf          = 0;
+    grpnrFREEZE = top_global->groups.grpnr[egcFREEZE];
+    for (c = 0; c < ncg; c++)
+    {
+        cg = cg_gl[c];
+        a0 = index[cg];
+        a1 = index[cg+1];
+        for (a = a0; a < a1; a++)
+        {
+            if (mdatoms->cFREEZE && grpnrFREEZE)
+            {
+                gf = grpnrFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    partsum += (fb[i][m] - fmg[a][m])*fb[i][m];
+                }
+            }
+            i++;
+        }
+    }
+
+    sfree(fmg);
+
+    return partsum;
+}
+
+//! Compute beta, the coefficient for the Polak-Ribiere conjugate-gradient update
+static real pr_beta(const t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
+                    gmx_mtop_t *top_global,
+                    em_state_t *s_min, em_state_t *s_b)
+{
+    double sum;
+
+    /* This is just the classical Polak-Ribiere calculation of beta;
+     * it looks a bit complicated since we take freeze groups into account,
+     * and might have to sum it in parallel runs.
+     */
+
+    if (!DOMAINDECOMP(cr) ||
+        (s_min->s.ddp_count == cr->dd->ddp_count &&
+         s_b->s.ddp_count   == cr->dd->ddp_count))
+    {
+        const rvec *fm  = s_min->f.rvec_array();
+        const rvec *fb  = s_b->f.rvec_array();
+        sum             = 0;
+        int         gf  = 0;
+        /* This part of code can be incorrect with DD,
+         * since the atom ordering in s_b and s_min might differ.
+         */
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            if (mdatoms->cFREEZE)
+            {
+                gf = mdatoms->cFREEZE[i];
+            }
+            for (int m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    sum += (fb[i][m] - fm[i][m])*fb[i][m];
+                }
+            }
+        }
+    }
+    else
+    {
+        /* We need to reorder cgs while summing */
+        sum = reorder_partsum(cr, opts, mdatoms, top_global, s_min, s_b);
+    }
+    if (PAR(cr))
+    {
+        gmx_sumd(1, &sum, cr);
+    }
+
+    return sum/gmx::square(s_min->fnorm);
+}
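+/* In formula form, pr_beta() returns the Polak-Ribiere coefficient
+ *   beta = f_b . (f_b - f_min) / |f_min|^2
+ * restricted to the non-frozen degrees of freedom, where f is minus the
+ * gradient of the potential at the trial (b) and best-so-far (min) states. */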
+
+namespace gmx
+{
+
+void
+Integrator::do_cg()
+{
+    const char       *CG = "Polak-Ribiere Conjugate Gradients";
+
+    gmx_localtop_t   *top;
+    gmx_enerdata_t   *enerd;
+    gmx_global_stat_t gstat;
+    t_graph          *graph;
+    double            tmp, minstep;
+    real              stepsize;
+    real              a, b, c, beta = 0.0;
+    real              epot_repl = 0;
+    real              pnorm;
+    t_mdebin         *mdebin;
+    gmx_bool          converged, foundlower;
+    rvec              mu_tot;
+    gmx_bool          do_log = FALSE, do_ene = FALSE, do_x, do_f;
+    tensor            vir, pres;
+    int               number_steps, neval = 0, nstcg = inputrec->nstcgsteep;
+    gmx_mdoutf_t      outf;
+    int               m, step, nminstep;
+    auto              mdatoms = mdAtoms->mdatoms();
+
+    GMX_LOG(mdlog.info).asParagraph().
+        appendText("Note that activating conjugate gradient energy minimization via the "
+                   "integrator .mdp option and the command gmx mdrun may "
+                   "be available in a different form in a future version of GROMACS, "
+                   "e.g. gmx minimize and an .mdp option.");
+
+    step = 0;
+
+    if (MASTER(cr))
+    {
+        // In CG, the state is extended with a search direction
+        state_global->flags |= (1<<estCGP);
+
+        // Ensure the extra per-atom state array gets allocated
+        state_change_natoms(state_global, state_global->natoms);
+
+        // Initialize the search direction to zero
+        for (RVec &cg_p : state_global->cg_p)
+        {
+            cg_p = { 0, 0, 0 };
+        }
+    }
+
+    /* Create 4 states on the stack and extract pointers that we will swap */
+    em_state_t  s0 {}, s1 {}, s2 {}, s3 {};
+    em_state_t *s_min = &s0;
+    em_state_t *s_a   = &s1;
+    em_state_t *s_b   = &s2;
+    em_state_t *s_c   = &s3;
+
+    /* Init em and store the local state in s_min */
+    init_em(fplog, mdlog, CG, cr, ms, outputProvider, inputrec, mdrunOptions,
+            state_global, top_global, s_min, &top,
+            nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat,
+            vsite, constr, nullptr,
+            nfile, fnm, &outf, &mdebin, wcycle);
+
+    /* Print to log file */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, CG);
+
+    /* Max number of steps */
+    number_steps = inputrec->nsteps;
+
+    if (MASTER(cr))
+    {
+        sp_header(stderr, CG, inputrec->em_tol, number_steps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, CG, inputrec->em_tol, number_steps);
+    }
+
+    EnergyEvaluator energyEvaluator {
+        fplog, mdlog, cr, ms,
+        top_global, top,
+        inputrec, nrnb, wcycle, gstat,
+        vsite, constr, fcd, graph,
+        mdAtoms, fr, ppForceWorkload, enerd
+    };
+    /* Call the force routine and some auxiliary (neighboursearching etc.) */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole in congrad.c
+     */
+    energyEvaluator.run(s_min, mu_tot, vir, pres, -1, TRUE);
+
+    if (MASTER(cr))
+    {
+        /* Copy stuff to the energy bin for easy printing etc. */
+        matrix nullBox = {};
+        upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step),
+                   mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox,
+                   nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+
+        print_ebin_header(fplog, step, step);
+        print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+    }
+
+    /* Estimate/guess the initial stepsize */
+    stepsize = inputrec->em_stepsize/s_min->fnorm;
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fprintf(stderr, "   F-max             = %12.5e on atom %d\n",
+                s_min->fmax, s_min->a_fmax+1);
+        fprintf(stderr, "   F-Norm            = %12.5e\n",
+                s_min->fnorm/sqrtNumAtoms);
+        fprintf(stderr, "\n");
+        /* and copy to the log file too... */
+        fprintf(fplog, "   F-max             = %12.5e on atom %d\n",
+                s_min->fmax, s_min->a_fmax+1);
+        fprintf(fplog, "   F-Norm            = %12.5e\n",
+                s_min->fnorm/sqrtNumAtoms);
+        fprintf(fplog, "\n");
+    }
+    /* Start the loop over CG steps.
+     * Each successful step is counted, and we continue until
+     * we either converge or reach the max number of steps.
+     */
+    converged = FALSE;
+    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
+    {
+
+        /* start taking steps in a new direction
+         * First time we enter the routine, beta=0, and the direction is
+         * simply the negative gradient.
+         */
+
+        /* Calculate the new direction in p, and the gradient in this direction, gpa */
+        rvec       *pm  = s_min->s.cg_p.rvec_array();
+        const rvec *sfm = s_min->f.rvec_array();
+        double      gpa = 0;
+        int         gf  = 0;
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            if (mdatoms->cFREEZE)
+            {
+                gf = mdatoms->cFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!inputrec->opts.nFreeze[gf][m])
+                {
+                    pm[i][m] = sfm[i][m] + beta*pm[i][m];
+                    gpa     -= pm[i][m]*sfm[i][m];
+                    /* f is negative gradient, thus the sign */
+                }
+                else
+                {
+                    pm[i][m] = 0;
+                }
+            }
+        }
+
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpa, cr);
+        }
+
+        /* Calculate the norm of the search vector */
+        get_f_norm_max(cr, &(inputrec->opts), mdatoms, pm, &pnorm, nullptr, nullptr);
+
+        /* Just in case stepsize reaches zero due to numerical precision... */
+        if (stepsize <= 0)
+        {
+            stepsize = inputrec->em_stepsize/pnorm;
+        }
+
+        /*
+         * Double check the value of the derivative in the search direction.
+         * If it is positive it must be due to the old information in the
+         * CG formula, so just remove that and start over with beta=0.
+         * This corresponds to a steepest descent step.
+         */
+        if (gpa > 0)
+        {
+            beta = 0;
+            step--;   /* Don't count this step since we are restarting */
+            continue; /* Go back to the beginning of the big for-loop */
+        }
+
+        /* Calculate minimum allowed stepsize, before the average (norm)
+         * relative change in coordinate is smaller than precision
+         */
+        minstep = 0;
+        auto s_min_x = makeArrayRef(s_min->s.x);
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                tmp = fabs(s_min_x[i][m]);
+                if (tmp < 1.0)
+                {
+                    tmp = 1.0;
+                }
+                tmp      = pm[i][m]/tmp;
+                minstep += tmp*tmp;
+            }
+        }
+        /* Add up from all CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &minstep, cr);
+        }
+
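+        /* At this point minstep holds sum_i (p_i/max(|x_i|,1))^2 over all
+         * 3*natoms degrees of freedom, so sqrt(minstep/(3*natoms)) is the RMS
+         * relative coordinate change per unit of stepsize. Any step below
+         * GMX_REAL_EPS divided by that would be lost to machine precision.
+         */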
+        minstep = GMX_REAL_EPS/sqrt(minstep/(3*top_global->natoms));
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        write_em_traj(fplog, cr, outf, do_x, do_f, nullptr,
+                      top_global, inputrec, step,
+                      s_min, state_global, observablesHistory);
+
+        /* Take a step downhill.
+         * In theory, we should minimize the function along this direction.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we don't really need to find the exact minimum -
+         * it is much better to start a new CG step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize but
+         * then continue straight to the next CG step without trying to find any minimum.
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to even accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
+        s_a->epot = s_min->epot;
+        a         = 0.0;
+        c         = a + stepsize; /* reference position along line is zero */
+
+        if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count)
+        {
+            em_dd_partition_system(fplog, mdlog, step, cr, top_global, inputrec,
+                                   s_min, top, mdAtoms, fr, vsite, constr,
+                                   nrnb, wcycle);
+        }
+
+        /* Take a trial step (new coords in s_c) */
+        do_em_step(cr, inputrec, mdatoms, s_min, c, &s_min->s.cg_p, s_c,
+                   constr, -1);
+
+        neval++;
+        /* Calculate energy for the trial step */
+        energyEvaluator.run(s_c, mu_tot, vir, pres, -1, FALSE);
+
+        /* Calc derivative along line */
+        const rvec *pc  = s_c->s.cg_p.rvec_array();
+        const rvec *sfc = s_c->f.rvec_array();
+        double      gpc = 0;
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                gpc -= pc[i][m]*sfc[i][m]; /* f is negative gradient, thus the sign */
+            }
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        /* This is the max amount of increase in energy we tolerate */
+        tmp = std::sqrt(GMX_REAL_EPS)*fabs(s_a->epot);
+
+        /* Accept the step if the energy is lower, or if it is not significantly higher
+         * and the line derivative is still negative.
+         */
+        if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp)))
+        {
+            foundlower = TRUE;
+            /* Great, we found a better energy. Increase step for next iteration
+             * if we are still going down, decrease it otherwise
+             */
+            if (gpc < 0)
+            {
+                stepsize *= 1.618034; /* The golden section */
+            }
+            else
+            {
+                stepsize *= 0.618034; /* 1/golden section */
+            }
+        }
+        else
+        {
+            /* New energy is the same or higher. We will have to do some work
+             * to find a smaller value in the interval. Take smaller step next time!
+             */
+            foundlower = FALSE;
+            stepsize  *= 0.618034;
+        }
+
+        /* OK, if we didn't find a lower value we will have to locate one now - there must
+         * be one in the interval [a=0,c].
+         * The same thing is valid here, though: Don't spend dozens of iterations to find
+         * the line minimum. We try to interpolate based on the derivative at the endpoints,
+         * and only continue until we find a lower value. In most cases this means 1-2 iterations.
+         *
+         * I also have a safeguard for potentially really pathological functions so we never
+         * take more than 20 steps before we give up ...
+         *
+         * If we already found a lower value we just skip this step and continue to the update.
+         */
+        double gpb;
+        if (!foundlower)
+        {
+            nminstep = 0;
+
+            do
+            {
+                /* Select a new trial point.
+                 * If the derivatives at points a & c have different sign we interpolate to zero,
+                 * otherwise just do a bisection.
+                 */
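+                /* The interpolation is a secant step: approximating the line
+                 * derivative g linearly between the endpoints and solving
+                 * g(b) = 0 from g(a) = gpa and g(c) = gpc gives
+                 *   b = a - gpa*(c - a)/(gpc - gpa) = a + gpa*(a - c)/(gpc - gpa).
+                 */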
+                if (gpa < 0 && gpc > 0)
+                {
+                    b = a + gpa*(a-c)/(gpc-gpa);
+                }
+                else
+                {
+                    b = 0.5*(a+c);
+                }
+
+                /* safeguard if interpolation close to machine accuracy causes errors:
+                 * never go outside the interval
+                 */
+                if (b <= a || b >= c)
+                {
+                    b = 0.5*(a+c);
+                }
+
+                if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
+                {
+                    /* Reload the old state */
+                    em_dd_partition_system(fplog, mdlog, -1, cr, top_global, inputrec,
+                                           s_min, top, mdAtoms, fr, vsite, constr,
+                                           nrnb, wcycle);
+                }
+
+                /* Take a trial step to this new point - new coords in s_b */
+                do_em_step(cr, inputrec, mdatoms, s_min, b, &s_min->s.cg_p, s_b,
+                           constr, -1);
+
+                neval++;
+                /* Calculate energy for the trial step */
+                energyEvaluator.run(s_b, mu_tot, vir, pres, -1, FALSE);
+
+                /* p does not change within a step, but since the domain decomposition
+                 * might change, we have to use cg_p of s_b here.
+                 */
+                const rvec *pb  = s_b->s.cg_p.rvec_array();
+                const rvec *sfb = s_b->f.rvec_array();
+                gpb             = 0;
+                for (int i = 0; i < mdatoms->homenr; i++)
+                {
+                    for (m = 0; m < DIM; m++)
+                    {
+                        gpb -= pb[i][m]*sfb[i][m]; /* f is negative gradient, thus the sign */
+                    }
+                }
+                /* Sum the gradient along the line across CPUs */
+                if (PAR(cr))
+                {
+                    gmx_sumd(1, &gpb, cr);
+                }
+
+                if (debug)
+                {
+                    fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n",
+                            s_a->epot, s_b->epot, s_c->epot, gpb);
+                }
+
+                epot_repl = s_b->epot;
+
+                /* Keep one of the intervals based on the value of the derivative at the new point */
+                if (gpb > 0)
+                {
+                    /* Replace c endpoint with b */
+                    swap_em_state(&s_b, &s_c);
+                    c   = b;
+                    gpc = gpb;
+                }
+                else
+                {
+                    /* Replace a endpoint with b */
+                    swap_em_state(&s_b, &s_a);
+                    a   = b;
+                    gpa = gpb;
+                }
+
+                /*
+                 * Stop search as soon as we find a value smaller than the endpoints.
+                 * Never run more than 20 steps, no matter what.
+                 */
+                nminstep++;
+            }
+            while ((epot_repl > s_a->epot || epot_repl > s_c->epot) &&
+                   (nminstep < 20));
+
+            if (std::fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS ||
+                nminstep >= 20)
+            {
+                /* OK. We couldn't find a significantly lower energy.
+                 * If beta==0 this was steepest descent, and then we give up.
+                 * If not, set beta=0 and restart with steepest descent before quitting.
+                 */
+                if (beta == 0.0)
+                {
+                    /* Converged */
+                    converged = TRUE;
+                    break;
+                }
+                else
+                {
+                    /* Reset memory before giving up */
+                    beta = 0.0;
+                    continue;
+                }
+            }
+
+            /* Select min energy state of A & C, put the best in B.
+             */
+            if (s_c->epot < s_a->epot)
+            {
+                if (debug)
+                {
+                    fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n",
+                            s_c->epot, s_a->epot);
+                }
+                swap_em_state(&s_b, &s_c);
+                gpb = gpc;
+            }
+            else
+            {
+                if (debug)
+                {
+                    fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n",
+                            s_a->epot, s_c->epot);
+                }
+                swap_em_state(&s_b, &s_a);
+                gpb = gpa;
+            }
+
+        }
+        else
+        {
+            if (debug)
+            {
+                fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n",
+                        s_c->epot);
+            }
+            swap_em_state(&s_b, &s_c);
+            gpb = gpc;
+        }
+
+        /* new search direction */
+        /* beta = 0 means forget all memory and restart with steepest descents. */
+        if (nstcg && ((step % nstcg) == 0))
+        {
+            beta = 0.0;
+        }
+        else
+        {
+            /* s_min->fnorm cannot be zero, because then we would have converged
+             * and broken out.
+             */
+
+            /* Polak-Ribiere update.
+             * Change to fnorm2/fnorm2_old for Fletcher-Reeves
+             */
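+            /* For reference, the standard Polak-Ribiere coefficient is
+             *   beta = f_new . (f_new - f_old) / |f_old|^2,
+             * which pr_beta evaluates here with the new forces from s_b and
+             * the previous ones from s_min.
+             */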
+            beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b);
+        }
+        /* Limit beta to prevent oscillations */
+        if (fabs(beta) > 5.0)
+        {
+            beta = 0.0;
+        }
+
+
+        /* update positions */
+        swap_em_state(&s_min, &s_b);
+        gpa = gpb;
+
+        /* Print it if necessary */
+        if (MASTER(cr))
+        {
+            if (mdrunOptions.verbose)
+            {
+                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
+                        step, s_min->epot, s_min->fnorm/sqrtNumAtoms,
+                        s_min->fmax, s_min->a_fmax+1);
+                fflush(stderr);
+            }
+            /* Store the new (lower) energies */
+            matrix nullBox = {};
+            upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step),
+                       mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox,
+                       nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+
+            do_log = do_per_step(step, inputrec->nstlog);
+            do_ene = do_per_step(step, inputrec->nstenergy);
+
+            /* Prepare IMD energy record, if bIMD is TRUE. */
+            IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE);
+
+            if (do_log)
+            {
+                print_ebin_header(fplog, step, step);
+            }
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
+                       do_log ? fplog : nullptr, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+        }
+
+        /* Send energies and positions to the IMD client if bIMD is TRUE. */
+        if (MASTER(cr) && do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x.rvec_array(), inputrec, 0, wcycle))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        /* Stop when the maximum force lies below tolerance.
+         * If we have reached machine precision, converged is already set to true.
+         */
+        converged = converged || (s_min->fmax < inputrec->em_tol);
+
+    }   /* End of the loop */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (s_min->fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(fplog, inputrec->em_tol, s_min->fmax,
+                      step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    if (MASTER(cr))
+    {
+        /* If we printed energy and/or logfile last step (which was the last step)
+         * we don't have to do it again, but otherwise print the final values.
+         */
+        if (!do_log)
+        {
+            /* Write final value to log since we didn't do anything the last step */
+            print_ebin_header(fplog, step, step);
+        }
+        if (!do_ene || !do_log)
+        {
+            /* Write final energy file entries */
+            print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                       !do_log ? fplog : nullptr, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+        }
+    }
+
+    /* Print some stuff... */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    /* Note that with 0 < nstfout != nstxout we can end up with two frames
+     * in the trajectory with the same step number.
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout));
+
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  s_min, state_global, observablesHistory);
+
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps,
+                        s_min, sqrtNumAtoms);
+        print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps,
+                        s_min, sqrtNumAtoms);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+}
+
+
+void
+Integrator::do_lbfgs()
+{
+    static const char *LBFGS = "Low-Memory BFGS Minimizer";
+    em_state_t         ems;
+    gmx_localtop_t    *top;
+    gmx_enerdata_t    *enerd;
+    gmx_global_stat_t  gstat;
+    t_graph           *graph;
+    int                ncorr, nmaxcorr, point, cp, neval, nminstep;
+    double             stepsize, step_taken, gpa, gpb, gpc, tmp, minstep;
+    real              *rho, *alpha, *p, *s, **dx, **dg;
+    real               a, b, c, maxdelta, delta;
+    real               diag, Epot0;
+    real               dgdx, dgdg, sq, yr, beta;
+    t_mdebin          *mdebin;
+    gmx_bool           converged;
+    rvec               mu_tot;
+    gmx_bool           do_log, do_ene, do_x, do_f, foundlower, *frozen;
+    tensor             vir, pres;
+    int                start, end, number_steps;
+    gmx_mdoutf_t       outf;
+    int                i, k, m, n, gf, step;
+    int                mdof_flags;
+    auto               mdatoms = mdAtoms->mdatoms();
+
+    GMX_LOG(mdlog.info).asParagraph().
+        appendText("Note that activating L-BFGS energy minimization via the "
+                   "integrator .mdp option and the command gmx mdrun may "
+                   "be available in a different form in a future version of GROMACS, "
+                   "e.g. gmx minimize and an .mdp option.");
+
+    if (PAR(cr))
+    {
+        gmx_fatal(FARGS, "L-BFGS minimization only supports a single rank");
+    }
+
+    if (nullptr != constr)
+    {
+        gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent).");
+    }
+
+    n        = 3*state_global->natoms;
+    nmaxcorr = inputrec->nbfgscorr;
+
+    snew(frozen, n);
+
+    snew(p, n);
+    snew(rho, nmaxcorr);
+    snew(alpha, nmaxcorr);
+
+    snew(dx, nmaxcorr);
+    for (i = 0; i < nmaxcorr; i++)
+    {
+        snew(dx[i], n);
+    }
+
+    snew(dg, nmaxcorr);
+    for (i = 0; i < nmaxcorr; i++)
+    {
+        snew(dg[i], n);
+    }
+
+    step  = 0;
+    neval = 0;
+
+    /* Init em */
+    init_em(fplog, mdlog, LBFGS, cr, ms, outputProvider, inputrec, mdrunOptions,
+            state_global, top_global, &ems, &top,
+            nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat,
+            vsite, constr, nullptr,
+            nfile, fnm, &outf, &mdebin, wcycle);
+
+    start = 0;
+    end   = mdatoms->homenr;
+
+    /* We need 4 working states */
+    em_state_t  s0 {}, s1 {}, s2 {}, s3 {};
+    em_state_t *sa   = &s0;
+    em_state_t *sb   = &s1;
+    em_state_t *sc   = &s2;
+    em_state_t *last = &s3;
+    /* Initialize by copying the state from ems (we could skip x and f here) */
+    *sa              = ems;
+    *sb              = ems;
+    *sc              = ems;
+
+    /* Print to log file */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS);
+
+    do_log = do_ene = do_x = do_f = TRUE;
+
+    /* Max number of steps */
+    number_steps = inputrec->nsteps;
+
+    /* Create a 3*natoms index to tell whether each degree of freedom is frozen */
+    gf = 0;
+    for (i = start; i < end; i++)
+    {
+        if (mdatoms->cFREEZE)
+        {
+            gf = mdatoms->cFREEZE[i];
+        }
+        for (m = 0; m < DIM; m++)
+        {
+            frozen[3*i+m] = (inputrec->opts.nFreeze[gf][m] != 0);
+        }
+    }
+    if (MASTER(cr))
+    {
+        sp_header(stderr, LBFGS, inputrec->em_tol, number_steps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, LBFGS, inputrec->em_tol, number_steps);
+    }
+
+    if (vsite)
+    {
+        construct_vsites(vsite, state_global->x.rvec_array(), 1, nullptr,
+                         top->idef.iparams, top->idef.il,
+                         fr->ePBC, fr->bMolPBC, cr, state_global->box);
+    }
+
+    /* Call the force routine and some auxiliary (neighboursearching etc.) */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole
+     */
+    neval++;
+    EnergyEvaluator energyEvaluator {
+        fplog, mdlog, cr, ms,
+        top_global, top,
+        inputrec, nrnb, wcycle, gstat,
+        vsite, constr, fcd, graph,
+        mdAtoms, fr, ppForceWorkload, enerd
+    };
+    energyEvaluator.run(&ems, mu_tot, vir, pres, -1, TRUE);
+
+    if (MASTER(cr))
+    {
+        /* Copy stuff to the energy bin for easy printing etc. */
+        matrix nullBox = {};
+        upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step),
+                   mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox,
+                   nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+
+        print_ebin_header(fplog, step, step);
+        print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+    }
+
+    /* Set the initial step.
+     * Since it will be multiplied by the non-normalized search direction
+     * vector (the force vector the first time), we scale it by the
+     * inverse of the force norm.
+     */
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr);
+        fprintf(stderr, "   F-max             = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1);
+        fprintf(stderr, "   F-Norm            = %12.5e\n", ems.fnorm/sqrtNumAtoms);
+        fprintf(stderr, "\n");
+        /* and copy to the log file too... */
+        fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr);
+        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1);
+        fprintf(fplog, "   F-Norm            = %12.5e\n", ems.fnorm/sqrtNumAtoms);
+        fprintf(fplog, "\n");
+    }
+
+    // Point is an index to the memory of search directions, where 0 is the first one.
+    point = 0;
+
+    // Set initial search direction to the force (-gradient), or 0 for frozen particles.
+    real *fInit = static_cast<real *>(ems.f.rvec_array()[0]);
+    for (i = 0; i < n; i++)
+    {
+        if (!frozen[i])
+        {
+            dx[point][i] = fInit[i]; /* Initial search direction */
+        }
+        else
+        {
+            dx[point][i] = 0;
+        }
+    }
+
+    // Stepsize will be modified during the search, and it is actually not critical
+    // (the main efficiency of the algorithm comes from changing directions), but
+    // we still need an initial value, so we estimate it as the inverse of the force
+    // norm, so that we take small steps where the potential fluctuates a lot.
+    stepsize  = 1.0/ems.fnorm;
+
+    /* Start the loop over BFGS steps.
+     * Each successful step is counted, and we continue until
+     * we either converge or reach the max number of steps.
+     */
+
+    ncorr = 0;
+
+    /* Set the gradient from the force */
+    converged = FALSE;
+    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
+    {
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        mdof_flags = 0;
+        if (do_x)
+        {
+            mdof_flags |= MDOF_X;
+        }
+
+        if (do_f)
+        {
+            mdof_flags |= MDOF_F;
+        }
+
+        if (inputrec->bIMD)
+        {
+            mdof_flags |= MDOF_IMD;
+        }
+
+        mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags,
+                                         top_global, step, static_cast<real>(step), &ems.s, state_global, observablesHistory, ems.f);
+
+        /* Do the linesearching in the direction dx[point][0..(n-1)] */
+
+        /* make s a pointer to current search direction - point=0 first time we get here */
+        s = dx[point];
+
+        real *xx = static_cast<real *>(ems.s.x.rvec_array()[0]);
+        real *ff = static_cast<real *>(ems.f.rvec_array()[0]);
+
+        // calculate line gradient in position A
+        for (gpa = 0, i = 0; i < n; i++)
+        {
+            gpa -= s[i]*ff[i];
+        }
+
+        /* Calculate minimum allowed stepsize along the line, before the average (norm)
+         * relative change in coordinate is smaller than precision
+         */
+        for (minstep = 0, i = 0; i < n; i++)
+        {
+            tmp = fabs(xx[i]);
+            if (tmp < 1.0)
+            {
+                tmp = 1.0;
+            }
+            tmp      = s[i]/tmp;
+            minstep += tmp*tmp;
+        }
+        minstep = GMX_REAL_EPS/sqrt(minstep/n);
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        // Before taking any steps along the line, store the old position
+        *last       = ems;
+        real *lastx = static_cast<real *>(last->s.x.data()[0]);
+        real *lastf = static_cast<real *>(last->f.data()[0]);
+        Epot0       = ems.epot;
+
+        *sa         = ems;
+
+        /* Take a step downhill.
+         * In theory, we should find the actual minimum of the function in this
+         * direction, somewhere along the line.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we don't really need to find the exact minimum -
+         * it is much better to start a new BFGS step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize but
+         * continue straight to the next BFGS step without trying to find any minimum,
+         * i.e. we change the search direction too. If the line was smooth, it is
+         * likely we are in a smooth region, and then it makes sense to take longer
+         * steps in the modified search direction too.
+         *
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one. Then we need to start by finding a lower
+         * value before we change search direction. Since the energy was apparently
+         * quite rough, we need to decrease the step size.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
+
+        // State "A" is the first position along the line.
+        // reference position along line is initially zero
+        a          = 0.0;
+
+        // Check stepsize first. We do not allow displacements
+        // larger than em_stepsize.
+        do
+        {
+            // Pick a new position C by adding stepsize to A.
+            c        = a + stepsize;
+
+            // Calculate what the largest change in any individual coordinate
+            // would be (step length along the line * search direction component)
+            maxdelta = 0;
+            for (i = 0; i < n; i++)
+            {
+                delta = c*s[i];
+                if (delta > maxdelta)
+                {
+                    maxdelta = delta;
+                }
+            }
+            // If any displacement is larger than the stepsize limit, reduce the step
+            if (maxdelta > inputrec->em_stepsize)
+            {
+                stepsize *= 0.1;
+            }
+        }
+        while (maxdelta > inputrec->em_stepsize);
+
+        // Take a trial step and move the coordinate array xc[] to position C
+        real *xc = static_cast<real *>(sc->s.x.rvec_array()[0]);
+        for (i = 0; i < n; i++)
+        {
+            xc[i] = lastx[i] + c*s[i];
+        }
+
+        neval++;
+        // Calculate energy for the trial step in position C
+        energyEvaluator.run(sc, mu_tot, vir, pres, step, FALSE);
+
+        // Calc line gradient in position C
+        real *fc = static_cast<real *>(sc->f.rvec_array()[0]);
+        for (gpc = 0, i = 0; i < n; i++)
+        {
+            gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        // This is the max amount of increase in energy we tolerate.
+        // By allowing VERY small changes (close to numerical precision) we
+        // frequently find even better (lower) final energies.
+        tmp = std::sqrt(GMX_REAL_EPS)*fabs(sa->epot);
+
+        // Accept the step if the energy is lower in the new position C (compared to A),
+        // or if it is not significantly higher and the line derivative is still negative.
+        foundlower = sc->epot < sa->epot || (gpc < 0 && sc->epot < (sa->epot + tmp));
+        // If true, great, we found a better energy. We no longer try to alter the
+        // stepsize, but simply accept this new better position. Then we select a new
+        // search direction instead, which will be much more efficient than continuing
+        // to take smaller steps along a line. Set fnorm based on the new C position,
+        // which will be used to update the stepsize to 1/fnorm further down.
+
+        // If false, the energy is NOT lower in point C, i.e. it will be the same
+        // or higher than in point A. In this case it is pointless to move to point C,
+        // so we will have to do more iterations along the same line to find a smaller
+        // value in the interval [A=0.0,C].
+        // Here, A is still 0.0, but that will change when we do a search in the interval
+        // [0.0,C] below. That search we will do by interpolation or bisection rather
+        // than with the stepsize, so no need to modify it. For the next search direction
+        // it will be reset to 1/fnorm anyway.
+
+        if (!foundlower)
+        {
+            // OK, if we didn't find a lower value we will have to locate one now - there must
+            // be one in the interval [a,c].
+            // The same thing is valid here, though: Don't spend dozens of iterations to find
+            // the line minimum. We try to interpolate based on the derivative at the endpoints,
+            // and only continue until we find a lower value. In most cases this means 1-2 iterations.
+            // I also have a safeguard for potentially really pathological functions so we never
+            // take more than 20 steps before we give up.
+            // If we already found a lower value we just skip this step and continue to the update.
+            real fnorm = 0;
+            nminstep   = 0;
+            do
+            {
+                // Select a new trial point B in the interval [A,C].
+                // If the derivatives at points a & c have different sign we interpolate to zero,
+                // otherwise just do a bisection since there might be multiple minima/maxima
+                // inside the interval.
+                if (gpa < 0 && gpc > 0)
+                {
+                    b = a + gpa*(a-c)/(gpc-gpa);
+                }
+                else
+                {
+                    b = 0.5*(a+c);
+                }
+
+                /* safeguard if interpolation close to machine accuracy causes errors:
+                 * never go outside the interval
+                 */
+                if (b <= a || b >= c)
+                {
+                    b = 0.5*(a+c);
+                }
+
+                // Take a trial step to point B
+                real *xb = static_cast<real *>(sb->s.x.rvec_array()[0]);
+                for (i = 0; i < n; i++)
+                {
+                    xb[i] = lastx[i] + b*s[i];
+                }
+
+                neval++;
+                // Calculate energy for the trial step in point B
+                energyEvaluator.run(sb, mu_tot, vir, pres, step, FALSE);
+                fnorm = sb->fnorm;
+
+                // Calculate gradient in point B
+                real *fb = static_cast<real *>(sb->f.rvec_array()[0]);
+                for (gpb = 0, i = 0; i < n; i++)
+                {
+                    gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */
+
+                }
+                /* Sum the gradient along the line across CPUs */
+                if (PAR(cr))
+                {
+                    gmx_sumd(1, &gpb, cr);
+                }
+
+                // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative
+                // at the new point B, and rename the endpoints of this new interval A and C.
+                if (gpb > 0)
+                {
+                    /* Replace c endpoint with b */
+                    c   = b;
+                    /* swap states b and c */
+                    swap_em_state(&sb, &sc);
+                }
+                else
+                {
+                    /* Replace a endpoint with b */
+                    a   = b;
+                    /* swap states a and b */
+                    swap_em_state(&sa, &sb);
+                }
+
+                /*
+                 * Stop search as soon as we find a value smaller than the endpoints,
+                 * or if the tolerance is below machine precision.
+                 * Never run more than 20 steps, no matter what.
+                 */
+                nminstep++;
+            }
+            while ((sb->epot > sa->epot || sb->epot > sc->epot) && (nminstep < 20));
+
+            if (std::fabs(sb->epot - Epot0) < GMX_REAL_EPS || nminstep >= 20)
+            {
+                /* OK. We couldn't find a significantly lower energy.
+                 * If ncorr==0 this was steepest descent, and then we give up.
+                 * If not, reset memory to restart as steepest descent before quitting.
+                 */
+                if (ncorr == 0)
+                {
+                    /* Converged */
+                    converged = TRUE;
+                    break;
+                }
+                else
+                {
+                    /* Reset memory */
+                    ncorr = 0;
+                    /* Search in gradient direction */
+                    for (i = 0; i < n; i++)
+                    {
+                        dx[point][i] = ff[i];
+                    }
+                    /* Reset stepsize */
+                    stepsize = 1.0/fnorm;
+                    continue;
+                }
+            }
+
+            /* Select min energy state of A & C, put the best in xx/ff/Epot
+             */
+            if (sc->epot < sa->epot)
+            {
+                /* Use state C */
+                ems        = *sc;
+                step_taken = c;
+            }
+            else
+            {
+                /* Use state A */
+                ems        = *sa;
+                step_taken = a;
+            }
+
+        }
+        else
+        {
+            /* found lower */
+            /* Use state C */
+            ems        = *sc;
+            step_taken = c;
+        }
+
+        /* Update the memory information, and calculate a new
+         * approximation of the inverse hessian
+         */
+
+        /* Have new data in Epot, xx, ff */
+        if (ncorr < nmaxcorr)
+        {
+            ncorr++;
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            dg[point][i]  = lastf[i]-ff[i];
+            dx[point][i] *= step_taken;
+        }
+
+        dgdg = 0;
+        dgdx = 0;
+        for (i = 0; i < n; i++)
+        {
+            dgdg += dg[point][i]*dg[point][i];
+            dgdx += dg[point][i]*dx[point][i];
+        }
+
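+        /* Standard L-BFGS scaling of the initial inverse-Hessian guess:
+         * with s = dx (position change) and y = dg (gradient change),
+         * H0 = (s.y / y.y) * I, i.e. diag = dgdx/dgdg, while
+         * rho = 1/(y.s) is the curvature factor used in the recursion below.
+         */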
+        diag = dgdx/dgdg;
+
+        rho[point] = 1.0/dgdx;
+        point++;
+
+        if (point >= nmaxcorr)
+        {
+            point = 0;
+        }
+
+        /* Update */
+        for (i = 0; i < n; i++)
+        {
+            p[i] = ff[i];
+        }
+
+        cp = point;
+
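+        /* This is the classic two-loop recursion that applies the implicit
+         * inverse Hessian to the current force, p = H*f:
+         *   backward pass: alpha_k = rho_k*(dx_k . p);  p -= alpha_k*dg_k
+         *   scaling:       p *= diag
+         *   forward pass:  beta = rho_k*(dg_k . p);  p += (alpha_k - beta)*dx_k
+         */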
+        /* Recursive update. First go back over the memory points */
+        for (k = 0; k < ncorr; k++)
+        {
+            cp--;
+            if (cp < 0)
+            {
+                cp = ncorr-1;
+            }
+
+            sq = 0;
+            for (i = 0; i < n; i++)
+            {
+                sq += dx[cp][i]*p[i];
+            }
+
+            alpha[cp] = rho[cp]*sq;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] -= alpha[cp]*dg[cp][i];
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            p[i] *= diag;
+        }
+
+        /* And then go forward again */
+        for (k = 0; k < ncorr; k++)
+        {
+            yr = 0;
+            for (i = 0; i < n; i++)
+            {
+                yr += p[i]*dg[cp][i];
+            }
+
+            beta = rho[cp]*yr;
+            beta = alpha[cp]-beta;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] += beta*dx[cp][i];
+            }
+
+            cp++;
+            if (cp >= ncorr)
+            {
+                cp = 0;
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            if (!frozen[i])
+            {
+                dx[point][i] = p[i];
+            }
+            else
+            {
+                dx[point][i] = 0;
+            }
+        }
+
+        /* Print it if necessary */
+        if (MASTER(cr))
+        {
+            if (mdrunOptions.verbose)
+            {
+                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
+                        step, ems.epot, ems.fnorm/sqrtNumAtoms, ems.fmax, ems.a_fmax + 1);
+                fflush(stderr);
+            }
+            /* Store the new (lower) energies */
+            matrix nullBox = {};
+            upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step),
+                       mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox,
+                       nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+            do_log = do_per_step(step, inputrec->nstlog);
+            do_ene = do_per_step(step, inputrec->nstenergy);
+            if (do_log)
+            {
+                print_ebin_header(fplog, step, step);
+            }
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
+                       do_log ? fplog : nullptr, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+        }
+
+        /* Send x and E to IMD client, if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x.rvec_array(), inputrec, 0, wcycle) && MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        // Reset stepsize if we are doing more iterations
+        stepsize = 1.0/ems.fnorm;
+
+        /* Stop when the maximum force lies below tolerance.
+         * If we have reached machine precision, converged is already set to true.
+         */
+        converged = converged || (ems.fmax < inputrec->em_tol);
+
+    }   /* End of the loop */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (ems.fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(fplog, inputrec->em_tol, ems.fmax,
+                      step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    /* If we printed energy and/or logfile last step (which was the last step)
+     * we don't have to do it again, but otherwise print the final values.
+     */
+    if (!do_log) /* Write final value to log since we didn't do anything last step */
+    {
+        print_ebin_header(fplog, step, step);
+    }
+    if (!do_ene || !do_log) /* Write final energy file entries */
+    {
+        print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                   !do_log ? fplog : nullptr, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+    }
+
+    /* Print some stuff... */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = !do_per_step(step, inputrec->nstfout);
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  &ems, state_global, observablesHistory);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, &ems, sqrtNumAtoms);
+        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, &ems, sqrtNumAtoms);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+}
+
+void
+Integrator::do_steep()
+{
+    const char       *SD = "Steepest Descents";
+    gmx_localtop_t   *top;
+    gmx_enerdata_t   *enerd;
+    gmx_global_stat_t gstat;
+    t_graph          *graph;
+    real              stepsize;
+    real              ustep;
+    gmx_mdoutf_t      outf;
+    t_mdebin         *mdebin;
+    gmx_bool          bDone, bAbort, do_x, do_f;
+    tensor            vir, pres;
+    rvec              mu_tot;
+    int               nsteps;
+    int               count          = 0;
+    int               steps_accepted = 0;
+    auto              mdatoms        = mdAtoms->mdatoms();
+
+    GMX_LOG(mdlog.info).asParagraph().
+        appendText("Note that activating steepest-descent energy minimization via the "
+                   "integrator .mdp option and the command gmx mdrun may "
+                   "be available in a different form in a future version of GROMACS, "
+                   "e.g. gmx minimize and an .mdp option.");
+
+    /* Create 2 states on the stack and extract pointers that we will swap */
+    em_state_t  s0 {}, s1 {};
+    em_state_t *s_min = &s0;
+    em_state_t *s_try = &s1;
+
+    /* Init em and store the local state in s_try */
+    init_em(fplog, mdlog, SD, cr, ms, outputProvider, inputrec, mdrunOptions,
+            state_global, top_global, s_try, &top,
+            nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat,
+            vsite, constr, nullptr,
+            nfile, fnm, &outf, &mdebin, wcycle);
+
+    /* Print to log file  */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, SD);
+
+    /* Set variables for stepsize (in nm). This is the largest
+     * step that we are going to make in any direction.
+     */
+    ustep    = inputrec->em_stepsize;
+    stepsize = 0;
+
+    /* Max number of steps  */
+    nsteps = inputrec->nsteps;
+
+    if (MASTER(cr))
+    {
+        /* Print to the screen  */
+        sp_header(stderr, SD, inputrec->em_tol, nsteps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, SD, inputrec->em_tol, nsteps);
+    }
+    EnergyEvaluator energyEvaluator {
+        fplog, mdlog, cr, ms,
+        top_global, top,
+        inputrec, nrnb, wcycle, gstat,
+        vsite, constr, fcd, graph,
+        mdAtoms, fr, ppForceWorkload, enerd
+    };
+
+    /**** HERE STARTS THE LOOP ****
+     * count is the counter for the number of steps
+     * bDone will be TRUE when the minimization has converged
+     * bAbort will be TRUE when nsteps steps have been performed or when
+     * the stepsize becomes smaller than is reasonable for machine precision
+     */
+    count  = 0;
+    bDone  = FALSE;
+    bAbort = FALSE;
+    while (!bDone && !bAbort)
+    {
+        bAbort = (nsteps >= 0) && (count == nsteps);
+
+        /* set new coordinates, except for first step */
+        bool validStep = true;
+        if (count > 0)
+        {
+            validStep =
+                do_em_step(cr, inputrec, mdatoms,
+                           s_min, stepsize, &s_min->f, s_try,
+                           constr, count);
+        }
+
+        if (validStep)
+        {
+            energyEvaluator.run(s_try, mu_tot, vir, pres, count, count == 0);
+        }
+        else
+        {
+            // Signal constraint error during stepping with energy=inf
+            s_try->epot = std::numeric_limits<real>::infinity();
+        }
+
+        if (MASTER(cr))
+        {
+            print_ebin_header(fplog, count, count);
+        }
+
+        if (count == 0)
+        {
+            s_min->epot = s_try->epot;
+        }
+
+        /* Print it if necessary  */
+        if (MASTER(cr))
+        {
+            if (mdrunOptions.verbose)
+            {
+                fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c",
+                        count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1,
+                        ( (count == 0) || (s_try->epot < s_min->epot) ) ? '\n' : '\r');
+                fflush(stderr);
+            }
+
+            if ( (count == 0) || (s_try->epot < s_min->epot) )
+            {
+                /* Store the new (lower) energies  */
+                matrix nullBox = {};
+                upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(count),
+                           mdatoms->tmass, enerd, nullptr, nullptr, nullptr,
+                           nullBox, nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+
+                /* Prepare IMD energy record, if bIMD is TRUE. */
+                IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE);
+
+                print_ebin(mdoutf_get_fp_ene(outf), TRUE,
+                           do_per_step(steps_accepted, inputrec->nstdisreout),
+                           do_per_step(steps_accepted, inputrec->nstorireout),
+                           fplog, count, count, eprNORMAL,
+                           mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+                fflush(fplog);
+            }
+        }
+
+        /* Now if the new energy is smaller than the previous,
+         * or if this is the first step...
+         */
+
+        if ( (count == 0) || (s_try->epot < s_min->epot) )
+        {
+            steps_accepted++;
+
+            /* Test whether the convergence criterion is met...  */
+            bDone = (s_try->fmax < inputrec->em_tol);
+
+            /* Copy the arrays for force, positions and energy  */
+            /* The 'Min' array always holds the coords and forces of the minimal
+               sampled energy  */
+            swap_em_state(&s_min, &s_try);
+            if (count > 0)
+            {
+                ustep *= 1.2;
+            }
+
+            /* Write to trn, if necessary */
+            do_x = do_per_step(steps_accepted, inputrec->nstxout);
+            do_f = do_per_step(steps_accepted, inputrec->nstfout);
+            write_em_traj(fplog, cr, outf, do_x, do_f, nullptr,
+                          top_global, inputrec, count,
+                          s_min, state_global, observablesHistory);
+        }
+        else
+        {
+            /* If energy is not smaller make the step smaller...  */
+            ustep *= 0.5;
+
+            if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
+            {
+                /* Reload the old state */
+                em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec,
+                                       s_min, top, mdAtoms, fr, vsite, constr,
+                                       nrnb, wcycle);
+            }
+        }
+
+        /* Determine new step  */
+        stepsize = ustep/s_min->fmax;
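+        /* With this scaling the atom that feels the maximum force moves
+         * exactly ustep nm along its force; all other atoms move less.
+         */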
+
+        /* Check if stepsize is too small, with 1 nm as a characteristic length */
+#if GMX_DOUBLE
+        if (count == nsteps || ustep < 1e-12)
+#else
+        if (count == nsteps || ustep < 1e-6)
+#endif
+        {
+            if (MASTER(cr))
+            {
+                warn_step(fplog, inputrec->em_tol, s_min->fmax,
+                          count == nsteps, constr != nullptr);
+            }
+            bAbort = TRUE;
+        }
+
+        /* Send IMD energies and positions, if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box,
+                   MASTER(cr) ? state_global->x.rvec_array() : nullptr,
+                   inputrec, 0, wcycle) &&
+            MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        count++;
+    }   /* End of the loop  */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    /* Print some data...  */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+    write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout != 0, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, count,
+                  s_min, state_global, observablesHistory);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+
+        print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps,
+                        s_min, sqrtNumAtoms);
+        print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps,
+                        s_min, sqrtNumAtoms);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    inputrec->nsteps = count;
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, count);
+}
+
+void
+Integrator::do_nm()
+{
+    const char          *NM = "Normal Mode Analysis";
+    gmx_mdoutf_t         outf;
+    int                  nnodes, node;
+    gmx_localtop_t      *top;
+    gmx_enerdata_t      *enerd;
+    gmx_global_stat_t    gstat;
+    t_graph             *graph;
+    tensor               vir, pres;
+    rvec                 mu_tot;
+    rvec                *dfdx;
+    gmx_bool             bSparse; /* use sparse matrix storage format */
+    size_t               sz;
+    gmx_sparsematrix_t * sparse_matrix           = nullptr;
+    real           *     full_matrix             = nullptr;
+
+    /* added with respect to mdrun */
+    int                       row, col;
+    real                      der_range = 10.0*std::sqrt(GMX_REAL_EPS);
+    real                      x_min;
+    bool                      bIsMaster = MASTER(cr);
+    auto                      mdatoms   = mdAtoms->mdatoms();
+
+    GMX_LOG(mdlog.info).asParagraph().
+        appendText("Note that activating normal-mode analysis via the integrator "
+                   ".mdp option and the command gmx mdrun may "
+                   "be available in a different form in a future version of GROMACS, "
+                   "e.g. gmx normal-modes.");
+
+    if (constr != nullptr)
+    {
+        gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported");
+    }
+
+    gmx_shellfc_t *shellfc;
+
+    em_state_t     state_work {};
+
+    /* Init em and store the local state in state_minimum */
+    init_em(fplog, mdlog, NM, cr, ms, outputProvider, inputrec, mdrunOptions,
+            state_global, top_global, &state_work, &top,
+            nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat,
+            vsite, constr, &shellfc,
+            nfile, fnm, &outf, nullptr, wcycle);
+
+    std::vector<int>       atom_index = get_atom_index(top_global);
+    std::vector<gmx::RVec> fneg(atom_index.size(), {0, 0, 0});
+    snew(dfdx, atom_index.size());
+
+#if !GMX_DOUBLE
+    if (bIsMaster)
+    {
+        fprintf(stderr,
+                "NOTE: This version of GROMACS has been compiled in single precision,\n"
+                "      which MIGHT not be accurate enough for normal mode analysis.\n"
+                "      GROMACS now uses sparse matrix storage, so the memory requirements\n"
+                "      are fairly modest even if you recompile in double precision.\n\n");
+    }
+#endif
+
+    /* Check if we can/should use sparse storage format.
+     *
+     * Sparse format is only useful when the Hessian itself is sparse, which it
+     * will be when we use a cutoff.
+     * For small systems (n<1000) it is easier to always use full matrix format, though.
+     */
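+    /* For scale: the full format allocates sz*sz = (3N)^2 reals, which is
+     * already ~36 MB in single precision at N = 1000 atoms and grows
+     * quadratically, hence the switch to sparse storage for larger systems.
+     */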
+    if (EEL_FULL(fr->ic->eeltype) || fr->rlist == 0.0)
+    {
+        GMX_LOG(mdlog.warning).appendText("Non-cutoff electrostatics used, forcing full Hessian format.");
+        bSparse = FALSE;
+    }
+    else if (atom_index.size() < 1000)
+    {
+        GMX_LOG(mdlog.warning).appendTextFormatted("Small system size (N=%zu), using full Hessian format.",
+                                                   atom_index.size());
+        bSparse = FALSE;
+    }
+    else
+    {
+        GMX_LOG(mdlog.warning).appendText("Using compressed symmetric sparse Hessian format.");
+        bSparse = TRUE;
+    }
+
+    /* Number of dimensions, based on real atoms, that is not vsites or shell */
+    sz = DIM*atom_index.size();
+
+    fprintf(stderr, "Allocating Hessian memory...\n\n");
+
+    if (bSparse)
+    {
+        sparse_matrix = gmx_sparsematrix_init(sz);
+        sparse_matrix->compressed_symmetric = TRUE;
+    }
+    else
+    {
+        snew(full_matrix, sz*sz);
+    }
+
+    init_nrnb(nrnb);
+
+
+    /* Write start time and temperature */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, NM);
+
+    /* fudge nr of steps to nr of atoms */
+    inputrec->nsteps = atom_index.size()*2;
+
+    if (bIsMaster)
+    {
+        fprintf(stderr, "starting normal mode calculation '%s'\n%" PRId64 " steps.\n\n",
+                *(top_global->name), inputrec->nsteps);
+    }
+
+    nnodes = cr->nnodes;
+
+    /* Make evaluate_energy do a single node force calculation */
+    cr->nnodes = 1;
+    EnergyEvaluator energyEvaluator {
+        fplog, mdlog, cr, ms,
+        top_global, top,
+        inputrec, nrnb, wcycle, gstat,
+        vsite, constr, fcd, graph,
+        mdAtoms, fr, ppForceWorkload, enerd
+    };
+    energyEvaluator.run(&state_work, mu_tot, vir, pres, -1, TRUE);
+    cr->nnodes = nnodes;
+
+    /* if forces are not small, warn user */
+    get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, &state_work);
+
+    GMX_LOG(mdlog.warning).appendTextFormatted("Maximum force:%12.5e", state_work.fmax);
+    if (state_work.fmax > 1.0e-3)
+    {
+        GMX_LOG(mdlog.warning).appendText(
+                "The force is probably not small enough to "
+                "ensure that you are at a minimum.\n"
+                "Be aware that negative eigenvalues may occur\n"
+                "when the resulting matrix is diagonalized.");
+    }
+
+    /***********************************************************
+     *
+     *      Loop over all pairs in matrix
+     *
+     *      do_force called twice. Once with positive and
+     *      once with negative displacement
+     *
+     ************************************************************/
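+
+    /* Each Hessian element is obtained by a central finite difference of the
+     * forces (f = -dE/dx), with displacement delta = der_range:
+     *   H(row,col) = -df_col/dx_row
+     *             ~= -(f_col(x_row + delta) - f_col(x_row - delta))/(2*delta)
+     * which is what dfdx accumulates in the inner loops below.
+     */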
+
+    /* Steps are divided one by one over the nodes */
+    bool bNS          = true;
+    auto state_work_x = makeArrayRef(state_work.s.x);
+    auto state_work_f = makeArrayRef(state_work.f);
+    for (unsigned int aid = cr->nodeid; aid < atom_index.size(); aid += nnodes)
+    {
+        size_t atom = atom_index[aid];
+        for (size_t d = 0; d < DIM; d++)
+        {
+            int64_t     step        = 0;
+            int         force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES;
+            double      t           = 0;
+
+            x_min = state_work_x[atom][d];
+
+            for (unsigned int dx = 0; (dx < 2); dx++)
+            {
+                if (dx == 0)
+                {
+                    state_work_x[atom][d] = x_min - der_range;
+                }
+                else
+                {
+                    state_work_x[atom][d] = x_min + der_range;
+                }
+
+                /* Make evaluate_energy do a single node force calculation */
+                cr->nnodes = 1;
+                if (shellfc)
+                {
+                    /* Now is the time to relax the shells */
+                    relax_shell_flexcon(fplog,
+                                        cr,
+                                        ms,
+                                        mdrunOptions.verbose,
+                                        nullptr,
+                                        step,
+                                        inputrec,
+                                        bNS,
+                                        force_flags,
+                                        top,
+                                        constr,
+                                        enerd,
+                                        fcd,
+                                        &state_work.s,
+                                        state_work.f.arrayRefWithPadding(),
+                                        vir,
+                                        mdatoms,
+                                        nrnb,
+                                        wcycle,
+                                        graph,
+                                        &top_global->groups,
+                                        shellfc,
+                                        fr,
+                                        ppForceWorkload,
+                                        t,
+                                        mu_tot,
+                                        vsite,
+                                        DdOpenBalanceRegionBeforeForceComputation::no,
+                                        DdCloseBalanceRegionAfterForceComputation::no);
+                    bNS = false;
+                    step++;
+                }
+                else
+                {
+                    energyEvaluator.run(&state_work, mu_tot, vir, pres, aid*2+dx, FALSE);
+                }
+
+                cr->nnodes = nnodes;
+
+                if (dx == 0)
+                {
+                    std::copy(state_work_f.begin(), state_work_f.begin()+atom_index.size(), fneg.begin());
+                }
+            }
+
+            /* Restore x to its original value */
+            state_work_x[atom][d] = x_min;
+
+            for (size_t j = 0; j < atom_index.size(); j++)
+            {
+                for (size_t k = 0; (k < DIM); k++)
+                {
+                    dfdx[j][k] =
+                        -(state_work_f[atom_index[j]][k] - fneg[j][k])/(2*der_range);
+                }
+            }
+
+            if (!bIsMaster)
+            {
+#if GMX_MPI
+#define mpi_type GMX_MPI_REAL
+                MPI_Send(dfdx[0], atom_index.size()*DIM, mpi_type, MASTER(cr),
+                         cr->nodeid, cr->mpi_comm_mygroup);
+#endif
+            }
+            else
+            {
+                for (node = 0; (node < nnodes && aid+node < atom_index.size()); node++)
+                {
+                    if (node > 0)
+                    {
+#if GMX_MPI
+                        MPI_Status stat;
+                        MPI_Recv(dfdx[0], atom_index.size()*DIM, mpi_type, node, node,
+                                 cr->mpi_comm_mygroup, &stat);
+#undef mpi_type
+#endif
+                    }
+
+                    row = (aid + node)*DIM + d;
+
+                    for (size_t j = 0; j < atom_index.size(); j++)
+                    {
+                        for (size_t k = 0; k < DIM; k++)
+                        {
+                            col = j*DIM + k;
+
+                            if (bSparse)
+                            {
+                                if (col >= row && dfdx[j][k] != 0.0)
+                                {
+                                    gmx_sparsematrix_increment_value(sparse_matrix,
+                                                                     row, col, dfdx[j][k]);
+                                }
+                            }
+                            else
+                            {
+                                full_matrix[row*sz+col] = dfdx[j][k];
+                            }
+                        }
+                    }
+                }
+            }
+
+            if (mdrunOptions.verbose && fplog)
+            {
+                fflush(fplog);
+            }
+        }
+        /* write progress */
+        if (bIsMaster && mdrunOptions.verbose)
+        {
+            fprintf(stderr, "\rFinished step %d out of %d",
+                    static_cast<int>(std::min(atom+nnodes, atom_index.size())),
+                    static_cast<int>(atom_index.size()));
+            fflush(stderr);
+        }
+    }
+
+    if (bIsMaster)
+    {
+        fprintf(stderr, "\n\nWriting Hessian...\n");
+        gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size()*2);
+}
+
+} // namespace gmx
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/minimize.cpp.preplumed b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/minimize.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..eb8e910468fd99643d8d9f5aa061caa7a10c13ab
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/minimize.cpp.preplumed
@@ -0,0 +1,2988 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file defines integrators for energy minimization
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <cmath>
+#include <cstring>
+#include <ctime>
+
+#include <algorithm>
+#include <vector>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/domdec/collect.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/partition.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/fileio/confio.h"
+#include "gromacs/fileio/mtxio.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/linearalgebra/sparsematrix.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdebin.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/mdsetup.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/shellfc.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/tgroup.h"
+#include "gromacs/mdlib/trajectory_writing.h"
+#include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/logger.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "integrator.h"
+
+//! Utility structure for manipulating states during EM
+typedef struct {
+    //! Copy of the global state
+    t_state                 s;
+    //! Force array
+    PaddedVector<gmx::RVec> f;
+    //! Potential energy
+    real                    epot;
+    //! Norm of the force
+    real                    fnorm;
+    //! Maximum force
+    real                    fmax;
+    //! Atom index of the maximum force
+    int                     a_fmax;
+} em_state_t;
+
+//! Print the EM starting conditions
+static void print_em_start(FILE                     *fplog,
+                           const t_commrec          *cr,
+                           gmx_walltime_accounting_t walltime_accounting,
+                           gmx_wallcycle_t           wcycle,
+                           const char               *name)
+{
+    walltime_accounting_start_time(walltime_accounting);
+    wallcycle_start(wcycle, ewcRUN);
+    print_start(fplog, cr, walltime_accounting, name);
+}
+
+//! Stop counting time for EM
+static void em_time_end(gmx_walltime_accounting_t walltime_accounting,
+                        gmx_wallcycle_t           wcycle)
+{
+    wallcycle_stop(wcycle, ewcRUN);
+
+    walltime_accounting_end_time(walltime_accounting);
+}
+
+//! Printing a log file and console header
+static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps)
+{
+    fprintf(out, "\n");
+    fprintf(out, "%s:\n", minimizer);
+    fprintf(out, "   Tolerance (Fmax)   = %12.5e\n", ftol);
+    fprintf(out, "   Number of steps    = %12d\n", nsteps);
+}
+
+//! Print warning message
+static void warn_step(FILE     *fp,
+                      real      ftol,
+                      real      fmax,
+                      gmx_bool  bLastStep,
+                      gmx_bool  bConstrain)
+{
+    constexpr bool realIsDouble = GMX_DOUBLE;
+    char           buffer[2048];
+
+    if (!std::isfinite(fmax))
+    {
+        sprintf(buffer,
+                "\nEnergy minimization has stopped because the force "
+                "on at least one atom is not finite. This usually means "
+                "atoms are overlapping. Modify the input coordinates to "
+                "remove atom overlap or use soft-core potentials with "
+                "the free energy code to avoid infinite forces.\n%s",
+                !realIsDouble ?
+                "You could also be lucky that switching to double precision "
+                "is sufficient to obtain finite forces.\n" :
+                "");
+    }
+    else if (bLastStep)
+    {
+        sprintf(buffer,
+                "\nEnergy minimization reached the maximum number "
+                "of steps before the forces reached the requested "
+                "precision Fmax < %g.\n", ftol);
+    }
+    else
+    {
+        sprintf(buffer,
+                "\nEnergy minimization has stopped, but the forces have "
+                "not converged to the requested precision Fmax < %g (which "
+                "may not be possible for your system). It stopped "
+                "because the algorithm tried to make a new step whose size "
+                "was too small, or there was no change in the energy since "
+                "last step. Either way, we regard the minimization as "
+                "converged to within the available machine precision, "
+                "given your starting configuration and EM parameters.\n%s%s",
+                ftol,
+                !realIsDouble ?
+                "\nDouble precision normally gives you higher accuracy, but "
+                "this is often not needed for preparing to run molecular "
+                "dynamics.\n" :
+                "",
+                bConstrain ?
+                "You might need to increase your constraint accuracy, or turn\n"
+                "off constraints altogether (set constraints = none in mdp file)\n" :
+                "");
+    }
+
+    fputs(wrap_lines(buffer, 78, 0, FALSE), stderr);
+    fputs(wrap_lines(buffer, 78, 0, FALSE), fp);
+}
+
+//! Print message about convergence of the EM
+static void print_converged(FILE *fp, const char *alg, real ftol,
+                            int64_t count, gmx_bool bDone, int64_t nsteps,
+                            const em_state_t *ems, double sqrtNumAtoms)
+{
+    char buf[STEPSTRSIZE];
+
+    if (bDone)
+    {
+        fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n",
+                alg, ftol, gmx_step_str(count, buf));
+    }
+    else if (count < nsteps)
+    {
+        fprintf(fp, "\n%s converged to machine precision in %s steps,\n"
+                "but did not reach the requested Fmax < %g.\n",
+                alg, gmx_step_str(count, buf), ftol);
+    }
+    else
+    {
+        fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n",
+                alg, ftol, gmx_step_str(count, buf));
+    }
+
+#if GMX_DOUBLE
+    fprintf(fp, "Potential Energy  = %21.14e\n", ems->epot);
+    fprintf(fp, "Maximum force     = %21.14e on atom %d\n", ems->fmax, ems->a_fmax + 1);
+    fprintf(fp, "Norm of force     = %21.14e\n", ems->fnorm/sqrtNumAtoms);
+#else
+    fprintf(fp, "Potential Energy  = %14.7e\n", ems->epot);
+    fprintf(fp, "Maximum force     = %14.7e on atom %d\n", ems->fmax, ems->a_fmax + 1);
+    fprintf(fp, "Norm of force     = %14.7e\n", ems->fnorm/sqrtNumAtoms);
+#endif
+}
+
+//! Compute the norm and max of the force array in parallel
+static void get_f_norm_max(const t_commrec *cr,
+                           t_grpopts *opts, t_mdatoms *mdatoms, const rvec *f,
+                           real *fnorm, real *fmax, int *a_fmax)
+{
+    double fnorm2, *sum;
+    real   fmax2, fam;
+    int    la_max, a_max, start, end, i, m, gf;
+
+    /* This routine finds the largest force and returns it.
+     * On parallel machines the global max is taken.
+     */
+    fnorm2 = 0;
+    fmax2  = 0;
+    la_max = -1;
+    start  = 0;
+    end    = mdatoms->homenr;
+    if (mdatoms->cFREEZE)
+    {
+        for (i = start; i < end; i++)
+        {
+            gf  = mdatoms->cFREEZE[i];
+            fam = 0;
+            for (m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    fam += gmx::square(f[i][m]);
+                }
+            }
+            fnorm2 += fam;
+            if (fam > fmax2)
+            {
+                fmax2  = fam;
+                la_max = i;
+            }
+        }
+    }
+    else
+    {
+        for (i = start; i < end; i++)
+        {
+            fam     = norm2(f[i]);
+            fnorm2 += fam;
+            if (fam > fmax2)
+            {
+                fmax2  = fam;
+                la_max = i;
+            }
+        }
+    }
+
+    if (la_max >= 0 && DOMAINDECOMP(cr))
+    {
+        a_max = cr->dd->globalAtomIndices[la_max];
+    }
+    else
+    {
+        a_max = la_max;
+    }
+    if (PAR(cr))
+    {
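+        /* Layout of the reduction buffer (one fmax2/a_max pair per rank, plus
+         * the squared norm, summed over ranks, at the end):
+         *   sum = [ fmax2_0, a_max_0, fmax2_1, a_max_1, ..., fnorm2 ]
+         */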
+        snew(sum, 2*cr->nnodes+1);
+        sum[2*cr->nodeid]   = fmax2;
+        sum[2*cr->nodeid+1] = a_max;
+        sum[2*cr->nnodes]   = fnorm2;
+        gmx_sumd(2*cr->nnodes+1, sum, cr);
+        fnorm2 = sum[2*cr->nnodes];
+        /* Determine the global maximum */
+        for (i = 0; i < cr->nnodes; i++)
+        {
+            if (sum[2*i] > fmax2)
+            {
+                fmax2 = sum[2*i];
+                a_max = gmx::roundToInt(sum[2*i+1]);
+            }
+        }
+        sfree(sum);
+    }
+
+    if (fnorm)
+    {
+        *fnorm = sqrt(fnorm2);
+    }
+    if (fmax)
+    {
+        *fmax  = sqrt(fmax2);
+    }
+    if (a_fmax)
+    {
+        *a_fmax = a_max;
+    }
+}
+
+//! Compute the norm of the force
+static void get_state_f_norm_max(const t_commrec *cr,
+                                 t_grpopts *opts, t_mdatoms *mdatoms,
+                                 em_state_t *ems)
+{
+    get_f_norm_max(cr, opts, mdatoms, ems->f.rvec_array(),
+                   &ems->fnorm, &ems->fmax, &ems->a_fmax);
+}
+
+//! Initialize the energy minimization
+static void init_em(FILE *fplog,
+                    const gmx::MDLogger &mdlog,
+                    const char *title,
+                    const t_commrec *cr,
+                    const gmx_multisim_t *ms,
+                    gmx::IMDOutputProvider *outputProvider,
+                    t_inputrec *ir,
+                    const MdrunOptions &mdrunOptions,
+                    t_state *state_global, gmx_mtop_t *top_global,
+                    em_state_t *ems, gmx_localtop_t **top,
+                    t_nrnb *nrnb, rvec mu_tot,
+                    t_forcerec *fr, gmx_enerdata_t **enerd,
+                    t_graph **graph, gmx::MDAtoms *mdAtoms, gmx_global_stat_t *gstat,
+                    gmx_vsite_t *vsite, gmx::Constraints *constr, gmx_shellfc_t **shellfc,
+                    int nfile, const t_filenm fnm[],
+                    gmx_mdoutf_t *outf, t_mdebin **mdebin,
+                    gmx_wallcycle_t wcycle)
+{
+    real dvdl_constr;
+
+    if (fplog)
+    {
+        fprintf(fplog, "Initiating %s\n", title);
+    }
+
+    if (MASTER(cr))
+    {
+        state_global->ngtc = 0;
+
+        /* Initialize lambda variables */
+        initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, nullptr);
+    }
+
+    init_nrnb(nrnb);
+
+    /* Interactive molecular dynamics */
+    init_IMD(ir, cr, ms, top_global, fplog, 1,
+             MASTER(cr) ? state_global->x.rvec_array() : nullptr,
+             nfile, fnm, nullptr, mdrunOptions);
+
+    if (ir->eI == eiNM)
+    {
+        GMX_ASSERT(shellfc != nullptr, "With NM we always support shells");
+
+        *shellfc = init_shell_flexcon(stdout,
+                                      top_global,
+                                      constr ? constr->numFlexibleConstraints() : 0,
+                                      ir->nstcalcenergy,
+                                      DOMAINDECOMP(cr));
+    }
+    else
+    {
+        GMX_ASSERT(EI_ENERGY_MINIMIZATION(ir->eI), "This else currently only handles energy minimizers; consider if your algorithm needs shell/flexible-constraint support");
+
+        /* With energy minimization, shells and flexible constraints are
+         * automatically minimized when treated like normal DOFs.
+         */
+        if (shellfc != nullptr)
+        {
+            *shellfc = nullptr;
+        }
+    }
+
+    auto mdatoms = mdAtoms->mdatoms();
+    if (DOMAINDECOMP(cr))
+    {
+        *top = dd_init_local_top(top_global);
+
+        dd_init_local_state(cr->dd, state_global, &ems->s);
+
+        /* Distribute the charge groups over the nodes from the master node */
+        dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1,
+                            state_global, top_global, ir,
+                            &ems->s, &ems->f, mdAtoms, *top,
+                            fr, vsite, constr,
+                            nrnb, nullptr, FALSE);
+        dd_store_state(cr->dd, &ems->s);
+
+        *graph = nullptr;
+    }
+    else
+    {
+        state_change_natoms(state_global, state_global->natoms);
+        /* Just copy the state */
+        ems->s = *state_global;
+        state_change_natoms(&ems->s, ems->s.natoms);
+        ems->f.resizeWithPadding(ems->s.natoms);
+
+        snew(*top, 1);
+        mdAlgorithmsSetupAtomData(cr, ir, top_global, *top, fr,
+                                  graph, mdAtoms,
+                                  constr, vsite, shellfc ? *shellfc : nullptr);
+
+        if (vsite)
+        {
+            set_vsite_top(vsite, *top, mdatoms);
+        }
+    }
+
+    update_mdatoms(mdAtoms->mdatoms(), ems->s.lambda[efptMASS]);
+
+    if (constr)
+    {
+        // TODO how should this cross-module support dependency be managed?
+        if (ir->eConstrAlg == econtSHAKE &&
+            gmx_mtop_ftype_count(top_global, F_CONSTR) > 0)
+        {
+            gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n",
+                      econstr_names[econtSHAKE], econstr_names[econtLINCS]);
+        }
+
+        if (!ir->bContinuation)
+        {
+            /* Constrain the starting coordinates */
+            dvdl_constr = 0;
+            constr->apply(TRUE, TRUE,
+                          -1, 0, 1.0,
+                          ems->s.x.rvec_array(),
+                          ems->s.x.rvec_array(),
+                          nullptr,
+                          ems->s.box,
+                          ems->s.lambda[efptFEP], &dvdl_constr,
+                          nullptr, nullptr, gmx::ConstraintVariable::Positions);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        *gstat = global_stat_init(ir);
+    }
+    else
+    {
+        *gstat = nullptr;
+    }
+
+    *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, top_global, nullptr, wcycle);
+
+    snew(*enerd, 1);
+    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+                  *enerd);
+
+    if (mdebin != nullptr)
+    {
+        /* Init bin for energy stuff */
+        *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, nullptr);
+    }
+
+    clear_rvec(mu_tot);
+    calc_shifts(ems->s.box, fr->shift_vec);
+}
+
+//! Finalize the minimization
+static void finish_em(const t_commrec *cr, gmx_mdoutf_t outf,
+                      gmx_walltime_accounting_t walltime_accounting,
+                      gmx_wallcycle_t wcycle)
+{
+    if (!thisRankHasDuty(cr, DUTY_PME))
+    {
+        /* Tell the PME only node to finish */
+        gmx_pme_send_finish(cr);
+    }
+
+    done_mdoutf(outf);
+
+    em_time_end(walltime_accounting, wcycle);
+}
+
+//! Swap two different EM states during minimization
+static void swap_em_state(em_state_t **ems1, em_state_t **ems2)
+{
+    em_state_t *tmp;
+
+    tmp   = *ems1;
+    *ems1 = *ems2;
+    *ems2 = tmp;
+}
+
+//! Save the EM trajectory
+static void write_em_traj(FILE *fplog, const t_commrec *cr,
+                          gmx_mdoutf_t outf,
+                          gmx_bool bX, gmx_bool bF, const char *confout,
+                          gmx_mtop_t *top_global,
+                          t_inputrec *ir, int64_t step,
+                          em_state_t *state,
+                          t_state *state_global,
+                          ObservablesHistory *observablesHistory)
+{
+    int mdof_flags = 0;
+
+    if (bX)
+    {
+        mdof_flags |= MDOF_X;
+    }
+    if (bF)
+    {
+        mdof_flags |= MDOF_F;
+    }
+
+    /* If we want IMD output, set appropriate MDOF flag */
+    if (ir->bIMD)
+    {
+        mdof_flags |= MDOF_IMD;
+    }
+
+    mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags,
+                                     top_global, step, static_cast<double>(step),
+                                     &state->s, state_global, observablesHistory,
+                                     state->f);
+
+    if (confout != nullptr)
+    {
+        if (DOMAINDECOMP(cr))
+        {
+            /* If bX=true, x was collected to state_global in the call above */
+            if (!bX)
+            {
+                gmx::ArrayRef<gmx::RVec> globalXRef = MASTER(cr) ? makeArrayRef(state_global->x) : gmx::EmptyArrayRef();
+                dd_collect_vec(cr->dd, &state->s, makeArrayRef(state->s.x), globalXRef);
+            }
+        }
+        else
+        {
+            /* Copy the local state pointer */
+            state_global = &state->s;
+        }
+
+        if (MASTER(cr))
+        {
+            if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr))
+            {
+                /* Make molecules whole only for confout writing */
+                do_pbc_mtop(fplog, ir->ePBC, state->s.box, top_global,
+                            state_global->x.rvec_array());
+            }
+
+            write_sto_conf_mtop(confout,
+                                *top_global->name, top_global,
+                                state_global->x.rvec_array(), nullptr, ir->ePBC, state->s.box);
+        }
+    }
+}
+
+//! \brief Do one minimization step
+//
+// \returns true when the step succeeded, false when a constraint error occurred
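+//
+// A minimal sketch of the update applied below: for every non-frozen degree
+// of freedom, x2 = x1 + a*f (f is the force, i.e. the negative gradient, so
+// a positive step a moves downhill); frozen dimensions keep x2 = x1.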
+static bool do_em_step(const t_commrec *cr,
+                       t_inputrec *ir, t_mdatoms *md,
+                       em_state_t *ems1, real a, const PaddedVector<gmx::RVec> *force,
+                       em_state_t *ems2,
+                       gmx::Constraints *constr,
+                       int64_t count)
+
+{
+    t_state *s1, *s2;
+    int      start, end;
+    real     dvdl_constr;
+    int      nthreads gmx_unused;
+
+    bool     validStep = true;
+
+    s1 = &ems1->s;
+    s2 = &ems2->s;
+
+    if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count)
+    {
+        gmx_incons("state mismatch in do_em_step");
+    }
+
+    s2->flags = s1->flags;
+
+    if (s2->natoms != s1->natoms)
+    {
+        state_change_natoms(s2, s1->natoms);
+        ems2->f.resizeWithPadding(s2->natoms);
+    }
+    if (DOMAINDECOMP(cr) && s2->cg_gl.size() != s1->cg_gl.size())
+    {
+        s2->cg_gl.resize(s1->cg_gl.size());
+    }
+
+    copy_mat(s1->box, s2->box);
+    /* Copy free energy state */
+    s2->lambda = s1->lambda;
+
+    start = 0;
+    end   = md->homenr;
+
+    nthreads = gmx_omp_nthreads_get(emntUpdate);
+#pragma omp parallel num_threads(nthreads)
+    {
+        const rvec *x1 = s1->x.rvec_array();
+        rvec       *x2 = s2->x.rvec_array();
+        const rvec *f  = force->rvec_array();
+
+        int         gf = 0;
+#pragma omp for schedule(static) nowait
+        for (int i = start; i < end; i++)
+        {
+            try
+            {
+                if (md->cFREEZE)
+                {
+                    gf = md->cFREEZE[i];
+                }
+                for (int m = 0; m < DIM; m++)
+                {
+                    if (ir->opts.nFreeze[gf][m])
+                    {
+                        x2[i][m] = x1[i][m];
+                    }
+                    else
+                    {
+                        x2[i][m] = x1[i][m] + a*f[i][m];
+                    }
+                }
+            }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+        }
+
+        if (s2->flags & (1<<estCGP))
+        {
+            /* Copy the CG p vector */
+            const rvec *p1 = s1->cg_p.rvec_array();
+            rvec       *p2 = s2->cg_p.rvec_array();
+#pragma omp for schedule(static) nowait
+            for (int i = start; i < end; i++)
+            {
+                // Trivial OpenMP block that does not throw
+                copy_rvec(p1[i], p2[i]);
+            }
+        }
+
+        if (DOMAINDECOMP(cr))
+        {
+            s2->ddp_count = s1->ddp_count;
+
+            /* OpenMP does not support unsigned loop variables */
+#pragma omp for schedule(static) nowait
+            for (int i = 0; i < static_cast<int>(s2->cg_gl.size()); i++)
+            {
+                s2->cg_gl[i] = s1->cg_gl[i];
+            }
+            s2->ddp_count_cg_gl = s1->ddp_count_cg_gl;
+        }
+    }
+
+    if (constr)
+    {
+        dvdl_constr = 0;
+        validStep   =
+            constr->apply(TRUE, TRUE,
+                          count, 0, 1.0,
+                          s1->x.rvec_array(), s2->x.rvec_array(),
+                          nullptr, s2->box,
+                          s2->lambda[efptBONDED], &dvdl_constr,
+                          nullptr, nullptr, gmx::ConstraintVariable::Positions);
+
+        if (cr->nnodes > 1)
+        {
+            /* This global reduction will affect performance at high
+             * parallelization, but we cannot really avoid it.
+             * But usually EM is not run at high parallelization.
+             */
+            int reductionBuffer = static_cast<int>(!validStep);
+            gmx_sumi(1, &reductionBuffer, cr);
+            validStep           = (reductionBuffer == 0);
+        }
+
+        // We should move this check to the different minimizers
+        if (!validStep && ir->eI != eiSteep)
+        {
+            gmx_fatal(FARGS, "The coordinates could not be constrained. Minimizer '%s' can not handle constraint failures, use minimizer '%s' before using '%s'.",
+                      EI(ir->eI), EI(eiSteep), EI(ir->eI));
+        }
+    }
+
+    return validStep;
+}
+
+//! Prepare EM for using domain decomposition parallelization
+static void em_dd_partition_system(FILE *fplog,
+                                   const gmx::MDLogger &mdlog,
+                                   int step, const t_commrec *cr,
+                                   gmx_mtop_t *top_global, t_inputrec *ir,
+                                   em_state_t *ems, gmx_localtop_t *top,
+                                   gmx::MDAtoms *mdAtoms, t_forcerec *fr,
+                                   gmx_vsite_t *vsite, gmx::Constraints *constr,
+                                   t_nrnb *nrnb, gmx_wallcycle_t wcycle)
+{
+    /* Repartition the domain decomposition */
+    dd_partition_system(fplog, mdlog, step, cr, FALSE, 1,
+                        nullptr, top_global, ir,
+                        &ems->s, &ems->f,
+                        mdAtoms, top, fr, vsite, constr,
+                        nrnb, wcycle, FALSE);
+    dd_store_state(cr->dd, &ems->s);
+}
+
+namespace
+{
+
+/*! \brief Class to handle the work of setting and doing an energy evaluation.
+ *
+ * This class is a mere aggregate of parameters to pass to evaluate an
+ * energy, so that future changes to names and types of them consume
+ * less time when refactoring other code.
+ *
+ * Aggregate initialization is used, for which the chief risk is that
+ * if a member is added at the end and not all initializer lists are
+ * updated, then the member will be value initialized, which will
+ * typically mean initialization to zero.
+ *
+ * We only want to construct one of these with an initializer list, so
+ * we explicitly delete the default constructor. */
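+//
+// For example, an aggregate initialization like
+//   EnergyEvaluator ev { fplog, mdlog, cr };
+// that omits the trailing members would leave them value-initialized
+// (null pointers here), which is the risk described above.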
+class EnergyEvaluator
+{
+    public:
+        //! We only intend to construct such objects with an initializer list.
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
+        // Aspects of the C++11 spec changed after GCC 4.8.5, and
+        // compilation of the initializer list construction in
+        // runner.cpp fails in GCC 4.8.5.
+        EnergyEvaluator() = delete;
+#endif
+        /*! \brief Evaluates an energy on the state in \c ems.
+         *
+         * \todo In practice, the same objects mu_tot, vir, and pres
+         * are always passed to this function, so we would rather have
+         * them as data members. However, their C-array types are
+         * unsuited for aggregate initialization. When the types
+         * improve, the call signature of this method can be reduced.
+         */
+        void run(em_state_t *ems, rvec mu_tot,
+                 tensor vir, tensor pres,
+                 int64_t count, gmx_bool bFirst);
+        //! Handles logging (deprecated).
+        FILE                 *fplog;
+        //! Handles logging.
+        const gmx::MDLogger  &mdlog;
+        //! Handles communication.
+        const t_commrec      *cr;
+        //! Coordinates multi-simulations.
+        const gmx_multisim_t *ms;
+        //! Holds the simulation topology.
+        gmx_mtop_t           *top_global;
+        //! Holds the domain topology.
+        gmx_localtop_t       *top;
+        //! User input options.
+        t_inputrec           *inputrec;
+        //! Manages flop accounting.
+        t_nrnb               *nrnb;
+        //! Manages wall cycle accounting.
+        gmx_wallcycle_t       wcycle;
+        //! Coordinates global reduction.
+        gmx_global_stat_t     gstat;
+        //! Handles virtual sites.
+        gmx_vsite_t          *vsite;
+        //! Handles constraints.
+        gmx::Constraints     *constr;
+        //! Data for restraints and other special listed interactions.
+        t_fcdata             *fcd;
+        //! Molecular graph for SHAKE.
+        t_graph              *graph;
+        //! Per-atom data for this domain.
+        gmx::MDAtoms         *mdAtoms;
+        //! Handles how to calculate the forces.
+        t_forcerec           *fr;
+        //! Schedule of force-calculation work each step for this task.
+        gmx::PpForceWorkload *ppForceWorkload;
+        //! Stores the computed energies.
+        gmx_enerdata_t       *enerd;
+};
+
+void
+EnergyEvaluator::run(em_state_t *ems, rvec mu_tot,
+                     tensor vir, tensor pres,
+                     int64_t count, gmx_bool bFirst)
+{
+    real     t;
+    gmx_bool bNS;
+    tensor   force_vir, shake_vir, ekin;
+    real     dvdl_constr, prescorr, enercorr, dvdlcorr;
+    real     terminate = 0;
+
+    /* Set the time to the initial time; the time does not change during EM */
+    t = inputrec->init_t;
+
+    if (bFirst ||
+        (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count))
+    {
+        /* This is the first state or an old state used before the last ns */
+        bNS = TRUE;
+    }
+    else
+    {
+        bNS = FALSE;
+        if (inputrec->nstlist > 0)
+        {
+            bNS = TRUE;
+        }
+    }
+
+    if (vsite)
+    {
+        construct_vsites(vsite, ems->s.x.rvec_array(), 1, nullptr,
+                         top->idef.iparams, top->idef.il,
+                         fr->ePBC, fr->bMolPBC, cr, ems->s.box);
+    }
+
+    if (DOMAINDECOMP(cr) && bNS)
+    {
+        /* Repartition the domain decomposition */
+        em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec,
+                               ems, top, mdAtoms, fr, vsite, constr,
+                               nrnb, wcycle);
+    }
+
+    /* Calc force & energy on new trial position  */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole in congrad.c
+     */
+    do_force(fplog, cr, ms, inputrec, nullptr, nullptr,
+             count, nrnb, wcycle, top, &top_global->groups,
+             ems->s.box, ems->s.x.arrayRefWithPadding(), &ems->s.hist,
+             ems->f.arrayRefWithPadding(), force_vir, mdAtoms->mdatoms(), enerd, fcd,
+             ems->s.lambda, graph, fr, ppForceWorkload, vsite, mu_tot, t, nullptr,
+             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES |
+             GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY |
+             (bNS ? GMX_FORCE_NS : 0),
+             DOMAINDECOMP(cr) ?
+             DdOpenBalanceRegionBeforeForceComputation::yes :
+             DdOpenBalanceRegionBeforeForceComputation::no,
+             DOMAINDECOMP(cr) ?
+             DdCloseBalanceRegionAfterForceComputation::yes :
+             DdCloseBalanceRegionAfterForceComputation::no);
+
+    /* Clear the unused shake virial and pressure */
+    clear_mat(shake_vir);
+    clear_mat(pres);
+
+    /* Communicate stuff when parallel */
+    if (PAR(cr) && inputrec->eI != eiNM)
+    {
+        wallcycle_start(wcycle, ewcMoveE);
+
+        global_stat(gstat, cr, enerd, force_vir, shake_vir, mu_tot,
+                    inputrec, nullptr, nullptr, nullptr, 1, &terminate,
+                    nullptr, FALSE,
+                    CGLO_ENERGY |
+                    CGLO_PRESSURE |
+                    CGLO_CONSTRAINT);
+
+        wallcycle_stop(wcycle, ewcMoveE);
+    }
+
+    /* Calculate long range corrections to pressure and energy */
+    calc_dispcorr(inputrec, fr, ems->s.box, ems->s.lambda[efptVDW],
+                  pres, force_vir, &prescorr, &enercorr, &dvdlcorr);
+    enerd->term[F_DISPCORR] = enercorr;
+    enerd->term[F_EPOT]    += enercorr;
+    enerd->term[F_PRES]    += prescorr;
+    enerd->term[F_DVDL]    += dvdlcorr;
+
+    ems->epot = enerd->term[F_EPOT];
+
+    if (constr)
+    {
+        /* Project out the constraint components of the force */
+        dvdl_constr = 0;
+        rvec *f_rvec = ems->f.rvec_array();
+        constr->apply(FALSE, FALSE,
+                      count, 0, 1.0,
+                      ems->s.x.rvec_array(), f_rvec, f_rvec,
+                      ems->s.box,
+                      ems->s.lambda[efptBONDED], &dvdl_constr,
+                      nullptr, &shake_vir, gmx::ConstraintVariable::ForceDispl);
+        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+        m_add(force_vir, shake_vir, vir);
+    }
+    else
+    {
+        copy_mat(force_vir, vir);
+    }
+
+    clear_mat(ekin);
+    enerd->term[F_PRES] =
+        calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres);
+
+    sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals);
+
+    if (EI_ENERGY_MINIMIZATION(inputrec->eI))
+    {
+        get_state_f_norm_max(cr, &(inputrec->opts), mdAtoms->mdatoms(), ems);
+    }
+}
+
+} // namespace
+
+//! Parallel utility summing energies and forces
+static double reorder_partsum(const t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
+                              gmx_mtop_t *top_global,
+                              em_state_t *s_min, em_state_t *s_b)
+{
+    t_block       *cgs_gl;
+    int            ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m;
+    double         partsum;
+    unsigned char *grpnrFREEZE;
+
+    if (debug)
+    {
+        fprintf(debug, "Doing reorder_partsum\n");
+    }
+
+    const rvec *fm = s_min->f.rvec_array();
+    const rvec *fb = s_b->f.rvec_array();
+
+    cgs_gl = dd_charge_groups_global(cr->dd);
+    index  = cgs_gl->index;
+
+    /* Collect fm in a global vector fmg.
+     * This conflicts with the spirit of domain decomposition,
+     * but to fully optimize this a much more complicated algorithm is required.
+     */
+    rvec *fmg;
+    snew(fmg, top_global->natoms);
+
+    ncg   = s_min->s.cg_gl.size();
+    cg_gl = s_min->s.cg_gl.data();
+    i     = 0;
+    for (c = 0; c < ncg; c++)
+    {
+        cg = cg_gl[c];
+        a0 = index[cg];
+        a1 = index[cg+1];
+        for (a = a0; a < a1; a++)
+        {
+            copy_rvec(fm[i], fmg[a]);
+            i++;
+        }
+    }
+    gmx_sum(top_global->natoms*3, fmg[0], cr);
+
+    /* Now we will determine the part of the sum for the cgs in state s_b */
+    ncg         = s_b->s.cg_gl.size();
+    cg_gl       = s_b->s.cg_gl.data();
+    partsum     = 0;
+    i           = 0;
+    gf          = 0;
+    grpnrFREEZE = top_global->groups.grpnr[egcFREEZE];
+    for (c = 0; c < ncg; c++)
+    {
+        cg = cg_gl[c];
+        a0 = index[cg];
+        a1 = index[cg+1];
+        for (a = a0; a < a1; a++)
+        {
+            if (mdatoms->cFREEZE && grpnrFREEZE)
+            {
+                gf = grpnrFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    partsum += (fb[i][m] - fmg[a][m])*fb[i][m];
+                }
+            }
+            i++;
+        }
+    }
+
+    sfree(fmg);
+
+    return partsum;
+}
+
+//! Compute beta for the Polak-Ribiere conjugate-gradient update
+static real pr_beta(const t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
+                    gmx_mtop_t *top_global,
+                    em_state_t *s_min, em_state_t *s_b)
+{
+    double sum;
+
+    /* This is just the classical Polak-Ribiere calculation of beta;
+     * it looks a bit complicated since we take freeze groups into account,
+     * and might have to sum it in parallel runs.
+     */
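+    /* For reference, the classical Polak-Ribiere formula in terms of
+     * gradients g = -f is
+     *   beta = g_new . (g_new - g_old) / |g_old|^2
+     * which, written with forces, is sum((fb - fm)*fb) over all non-frozen
+     * components divided by fnorm^2 of the previous minimum state, exactly
+     * as accumulated below and returned at the end of this function.
+     */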
+
+    if (!DOMAINDECOMP(cr) ||
+        (s_min->s.ddp_count == cr->dd->ddp_count &&
+         s_b->s.ddp_count   == cr->dd->ddp_count))
+    {
+        const rvec *fm  = s_min->f.rvec_array();
+        const rvec *fb  = s_b->f.rvec_array();
+        sum             = 0;
+        int         gf  = 0;
+        /* This part of the code can be incorrect with DD,
+         * since the atom ordering in s_b and s_min might differ.
+         */
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            if (mdatoms->cFREEZE)
+            {
+                gf = mdatoms->cFREEZE[i];
+            }
+            for (int m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    sum += (fb[i][m] - fm[i][m])*fb[i][m];
+                }
+            }
+        }
+    }
+    else
+    {
+        /* We need to reorder cgs while summing */
+        sum = reorder_partsum(cr, opts, mdatoms, top_global, s_min, s_b);
+    }
+    if (PAR(cr))
+    {
+        gmx_sumd(1, &sum, cr);
+    }
+
+    return sum/gmx::square(s_min->fnorm);
+}
+
+namespace gmx
+{
+
+void
+Integrator::do_cg()
+{
+    const char       *CG = "Polak-Ribiere Conjugate Gradients";
+
+    gmx_localtop_t   *top;
+    gmx_enerdata_t   *enerd;
+    gmx_global_stat_t gstat;
+    t_graph          *graph;
+    double            tmp, minstep;
+    real              stepsize;
+    real              a, b, c, beta = 0.0;
+    real              epot_repl = 0;
+    real              pnorm;
+    t_mdebin         *mdebin;
+    gmx_bool          converged, foundlower;
+    rvec              mu_tot;
+    gmx_bool          do_log = FALSE, do_ene = FALSE, do_x, do_f;
+    tensor            vir, pres;
+    int               number_steps, neval = 0, nstcg = inputrec->nstcgsteep;
+    gmx_mdoutf_t      outf;
+    int               m, step, nminstep;
+    auto              mdatoms = mdAtoms->mdatoms();
+
+    GMX_LOG(mdlog.info).asParagraph().
+        appendText("Note that activating conjugate gradient energy minimization via the "
+                   "integrator .mdp option and the command gmx mdrun may "
+                   "be available in a different form in a future version of GROMACS, "
+                   "e.g. gmx minimize and an .mdp option.");
+
+    step = 0;
+
+    if (MASTER(cr))
+    {
+        // In CG, the state is extended with a search direction
+        state_global->flags |= (1<<estCGP);
+
+        // Ensure the extra per-atom state array gets allocated
+        state_change_natoms(state_global, state_global->natoms);
+
+        // Initialize the search direction to zero
+        for (RVec &cg_p : state_global->cg_p)
+        {
+            cg_p = { 0, 0, 0 };
+        }
+    }
+
+    /* Create 4 states on the stack and extract pointers that we will swap */
+    em_state_t  s0 {}, s1 {}, s2 {}, s3 {};
+    em_state_t *s_min = &s0;
+    em_state_t *s_a   = &s1;
+    em_state_t *s_b   = &s2;
+    em_state_t *s_c   = &s3;
+
+    /* Init em and store the local state in s_min */
+    init_em(fplog, mdlog, CG, cr, ms, outputProvider, inputrec, mdrunOptions,
+            state_global, top_global, s_min, &top,
+            nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat,
+            vsite, constr, nullptr,
+            nfile, fnm, &outf, &mdebin, wcycle);
+
+    /* Print to log file */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, CG);
+
+    /* Max number of steps */
+    number_steps = inputrec->nsteps;
+
+    if (MASTER(cr))
+    {
+        sp_header(stderr, CG, inputrec->em_tol, number_steps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, CG, inputrec->em_tol, number_steps);
+    }
+
+    EnergyEvaluator energyEvaluator {
+        fplog, mdlog, cr, ms,
+        top_global, top,
+        inputrec, nrnb, wcycle, gstat,
+        vsite, constr, fcd, graph,
+        mdAtoms, fr, ppForceWorkload, enerd
+    };
+    /* Call the force routine and some auxiliary (neighboursearching etc.) */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole in congrad.c
+     */
+    energyEvaluator.run(s_min, mu_tot, vir, pres, -1, TRUE);
+
+    if (MASTER(cr))
+    {
+        /* Copy stuff to the energy bin for easy printing etc. */
+        matrix nullBox = {};
+        upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step),
+                   mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox,
+                   nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+
+        print_ebin_header(fplog, step, step);
+        print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+    }
+
+    /* Estimate/guess the initial stepsize */
+    stepsize = inputrec->em_stepsize/s_min->fnorm;
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fprintf(stderr, "   F-max             = %12.5e on atom %d\n",
+                s_min->fmax, s_min->a_fmax+1);
+        fprintf(stderr, "   F-Norm            = %12.5e\n",
+                s_min->fnorm/sqrtNumAtoms);
+        fprintf(stderr, "\n");
+        /* and copy to the log file too... */
+        fprintf(fplog, "   F-max             = %12.5e on atom %d\n",
+                s_min->fmax, s_min->a_fmax+1);
+        fprintf(fplog, "   F-Norm            = %12.5e\n",
+                s_min->fnorm/sqrtNumAtoms);
+        fprintf(fplog, "\n");
+    }
+    /* Start the loop over CG steps.
+     * Each successful step is counted, and we continue until
+     * we either converge or reach the max number of steps.
+     */
+    converged = FALSE;
+    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
+    {
+
+        /* start taking steps in a new direction
+         * First time we enter the routine, beta=0, and the direction is
+         * simply the negative gradient.
+         */
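+
+        /* In gradient notation this is the standard CG direction update
+         *   p_new = -g + beta*p_old    (here f = -g, so p_new = f + beta*p_old)
+         * and gpa accumulates the directional derivative g.p = -f.p along the
+         * new direction.
+         */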
+
+        /* Calculate the new direction in p, and the gradient in this direction, gpa */
+        rvec       *pm  = s_min->s.cg_p.rvec_array();
+        const rvec *sfm = s_min->f.rvec_array();
+        double      gpa = 0;
+        int         gf  = 0;
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            if (mdatoms->cFREEZE)
+            {
+                gf = mdatoms->cFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!inputrec->opts.nFreeze[gf][m])
+                {
+                    pm[i][m] = sfm[i][m] + beta*pm[i][m];
+                    gpa     -= pm[i][m]*sfm[i][m];
+                    /* f is negative gradient, thus the sign */
+                }
+                else
+                {
+                    pm[i][m] = 0;
+                }
+            }
+        }
+
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpa, cr);
+        }
+
+        /* Calculate the norm of the search vector */
+        get_f_norm_max(cr, &(inputrec->opts), mdatoms, pm, &pnorm, nullptr, nullptr);
+
+        /* Just in case stepsize reaches zero due to numerical precision... */
+        if (stepsize <= 0)
+        {
+            stepsize = inputrec->em_stepsize/pnorm;
+        }
+
+        /*
+         * Double check the value of the derivative in the search direction.
+         * If it is positive it must be due to the old information in the
+         * CG formula, so just remove that and start over with beta=0.
+         * This corresponds to a steepest descent step.
+         */
+        if (gpa > 0)
+        {
+            beta = 0;
+            step--;   /* Don't count this step since we are restarting */
+            continue; /* Go back to the beginning of the big for-loop */
+        }
+
+        /* Calculate the minimum allowed stepsize: below it, the average (norm)
+         * relative change in coordinates would be smaller than the machine
+         * precision.
+         */
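+        /* Sketch of the estimate: each term of the sum is
+         *   (p[i][m]/max(|x[i][m]|, 1))^2
+         * so sqrt(minstep/(3*natoms)) is the RMS relative displacement per
+         * unit step, and requiring stepsize*RMS >= GMX_REAL_EPS yields the
+         * bound computed after the loop.
+         */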
+        minstep = 0;
+        auto s_min_x = makeArrayRef(s_min->s.x);
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                tmp = fabs(s_min_x[i][m]);
+                if (tmp < 1.0)
+                {
+                    tmp = 1.0;
+                }
+                tmp      = pm[i][m]/tmp;
+                minstep += tmp*tmp;
+            }
+        }
+        /* Add up from all CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &minstep, cr);
+        }
+
+        minstep = GMX_REAL_EPS/sqrt(minstep/(3*top_global->natoms));
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        write_em_traj(fplog, cr, outf, do_x, do_f, nullptr,
+                      top_global, inputrec, step,
+                      s_min, state_global, observablesHistory);
+
+        /* Take a step downhill.
+         * In theory, we should minimize the function along this direction.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we don't really need to find the exact minimum -
+         * it is much better to start a new CG step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize and
+         * continue straight to the next CG step without trying to find any minimum.
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to even accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
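+
+        /* Step-size bookkeeping used below, in numbers: a trial step that
+         * lowers the energy and still has a downhill derivative grows the
+         * next step by the golden ratio (x1.618034); otherwise the step
+         * shrinks by its inverse (x0.618034).
+         */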
+        s_a->epot = s_min->epot;
+        a         = 0.0;
+        c         = a + stepsize; /* reference position along line is zero */
+
+        if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count)
+        {
+            em_dd_partition_system(fplog, mdlog, step, cr, top_global, inputrec,
+                                   s_min, top, mdAtoms, fr, vsite, constr,
+                                   nrnb, wcycle);
+        }
+
+        /* Take a trial step (new coords in s_c) */
+        do_em_step(cr, inputrec, mdatoms, s_min, c, &s_min->s.cg_p, s_c,
+                   constr, -1);
+
+        neval++;
+        /* Calculate energy for the trial step */
+        energyEvaluator.run(s_c, mu_tot, vir, pres, -1, FALSE);
+
+        /* Calc derivative along line */
+        const rvec *pc  = s_c->s.cg_p.rvec_array();
+        const rvec *sfc = s_c->f.rvec_array();
+        double      gpc = 0;
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                gpc -= pc[i][m]*sfc[i][m]; /* f is negative gradient, thus the sign */
+            }
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        /* This is the max amount of increase in energy we tolerate */
+        tmp = std::sqrt(GMX_REAL_EPS)*fabs(s_a->epot);
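+        /* With single precision, sqrt(GMX_REAL_EPS) is about 3e-4, i.e. an
+         * increase of roughly 0.03% of |Epot| is tolerated.
+         */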
+
+        /* Accept the step if the energy is lower, or if it is not significantly higher
+         * and the line derivative is still negative.
+         */
+        if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp)))
+        {
+            foundlower = TRUE;
+            /* Great, we found a better energy. Increase step for next iteration
+             * if we are still going down, decrease it otherwise
+             */
+            if (gpc < 0)
+            {
+                stepsize *= 1.618034; /* The golden section */
+            }
+            else
+            {
+                stepsize *= 0.618034; /* 1/golden section */
+            }
+        }
+        else
+        {
+            /* New energy is the same or higher. We will have to do some work
+             * to find a smaller value in the interval. Take smaller step next time!
+             */
+            foundlower = FALSE;
+            stepsize  *= 0.618034;
+        }
+
+        /* OK, if we didn't find a lower value we will have to locate one now - there must
+         * be one in the interval [a=0,c].
+         * The same thing is valid here, though: Don't spend dozens of iterations to find
+         * the line minimum. We try to interpolate based on the derivative at the endpoints,
+         * and only continue until we find a lower value. In most cases this means 1-2 iterations.
+         *
+         * I also have a safeguard for potentially really pathological functions so we never
+         * take more than 20 steps before we give up ...
+         *
+         * If we already found a lower value we just skip this step and continue to the update.
+         */
+        double gpb;
+        if (!foundlower)
+        {
+            nminstep = 0;
+
+            do
+            {
+                /* Select a new trial point.
+                 * If the derivatives at points a & c have different sign we interpolate to zero,
+                 * otherwise just do a bisection.
+                 */
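+                /* The interpolation is a secant step on the line derivative:
+                 * modelling g(x) linearly between (a, gpa) and (c, gpc), the
+                 * zero crossing lies at
+                 *   b = a - gpa*(c - a)/(gpc - gpa) = a + gpa*(a - c)/(gpc - gpa)
+                 * which is the expression used below.
+                 */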
+                if (gpa < 0 && gpc > 0)
+                {
+                    b = a + gpa*(a-c)/(gpc-gpa);
+                }
+                else
+                {
+                    b = 0.5*(a+c);
+                }
+
+                /* safeguard if interpolation close to machine accuracy causes errors:
+                 * never go outside the interval
+                 */
+                if (b <= a || b >= c)
+                {
+                    b = 0.5*(a+c);
+                }
+
+                if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
+                {
+                    /* Reload the old state */
+                    em_dd_partition_system(fplog, mdlog, -1, cr, top_global, inputrec,
+                                           s_min, top, mdAtoms, fr, vsite, constr,
+                                           nrnb, wcycle);
+                }
+
+                /* Take a trial step to this new point - new coords in s_b */
+                do_em_step(cr, inputrec, mdatoms, s_min, b, &s_min->s.cg_p, s_b,
+                           constr, -1);
+
+                neval++;
+                /* Calculate energy for the trial step */
+                energyEvaluator.run(s_b, mu_tot, vir, pres, -1, FALSE);
+
+                /* p does not change within a step, but since the domain decomposition
+                 * might change, we have to use cg_p of s_b here.
+                 */
+                const rvec *pb  = s_b->s.cg_p.rvec_array();
+                const rvec *sfb = s_b->f.rvec_array();
+                gpb             = 0;
+                for (int i = 0; i < mdatoms->homenr; i++)
+                {
+                    for (m = 0; m < DIM; m++)
+                    {
+                        gpb -= pb[i][m]*sfb[i][m]; /* f is negative gradient, thus the sign */
+                    }
+                }
+                /* Sum the gradient along the line across CPUs */
+                if (PAR(cr))
+                {
+                    gmx_sumd(1, &gpb, cr);
+                }
+
+                if (debug)
+                {
+                    fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n",
+                            s_a->epot, s_b->epot, s_c->epot, gpb);
+                }
+
+                epot_repl = s_b->epot;
+
+                /* Keep one of the intervals based on the value of the derivative at the new point */
+                if (gpb > 0)
+                {
+                    /* Replace c endpoint with b */
+                    swap_em_state(&s_b, &s_c);
+                    c   = b;
+                    gpc = gpb;
+                }
+                else
+                {
+                    /* Replace a endpoint with b */
+                    swap_em_state(&s_b, &s_a);
+                    a   = b;
+                    gpa = gpb;
+                }
+
+                /*
+                 * Stop search as soon as we find a value smaller than the endpoints.
+                 * Never run more than 20 steps, no matter what.
+                 */
+                nminstep++;
+            }
+            while ((epot_repl > s_a->epot || epot_repl > s_c->epot) &&
+                   (nminstep < 20));
+
+            if (std::fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS ||
+                nminstep >= 20)
+            {
+                /* OK. We couldn't find a significantly lower energy.
+                 * If beta==0 this was steepest descent, and then we give up.
+                 * If not, set beta=0 and restart with steepest descent before quitting.
+                 */
+                if (beta == 0.0)
+                {
+                    /* Converged */
+                    converged = TRUE;
+                    break;
+                }
+                else
+                {
+                    /* Reset memory before giving up */
+                    beta = 0.0;
+                    continue;
+                }
+            }
+
+            /* Select min energy state of A & C, put the best in B.
+             */
+            if (s_c->epot < s_a->epot)
+            {
+                if (debug)
+                {
+                    fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n",
+                            s_c->epot, s_a->epot);
+                }
+                swap_em_state(&s_b, &s_c);
+                gpb = gpc;
+            }
+            else
+            {
+                if (debug)
+                {
+                    fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n",
+                            s_a->epot, s_c->epot);
+                }
+                swap_em_state(&s_b, &s_a);
+                gpb = gpa;
+            }
+
+        }
+        else
+        {
+            if (debug)
+            {
+                fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n",
+                        s_c->epot);
+            }
+            swap_em_state(&s_b, &s_c);
+            gpb = gpc;
+        }
+
+        /* new search direction */
+        /* beta = 0 means forget all memory and restart with steepest descents. */
+        if (nstcg && ((step % nstcg) == 0))
+        {
+            beta = 0.0;
+        }
+        else
+        {
+            /* s_min->fnorm cannot be zero, because then we would have converged
+             * and broken out.
+             */
+
+            /* Polak-Ribiere update.
+             * Change to fnorm2/fnorm2_old for Fletcher-Reeves
+             */
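+            /* With the gradient g = -f, Polak-Ribiere uses
+             *   beta = g_new.(g_new - g_old) / (g_old.g_old),
+             * whereas Fletcher-Reeves would use |g_new|^2 / |g_old|^2.
+             */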
+            beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b);
+        }
+        /* Limit beta to prevent oscillations */
+        if (fabs(beta) > 5.0)
+        {
+            beta = 0.0;
+        }
+
+
+        /* update positions */
+        swap_em_state(&s_min, &s_b);
+        gpa = gpb;
+
+        /* Print it if necessary */
+        if (MASTER(cr))
+        {
+            if (mdrunOptions.verbose)
+            {
+                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
+                        step, s_min->epot, s_min->fnorm/sqrtNumAtoms,
+                        s_min->fmax, s_min->a_fmax+1);
+                fflush(stderr);
+            }
+            /* Store the new (lower) energies */
+            matrix nullBox = {};
+            upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step),
+                       mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox,
+                       nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+
+            do_log = do_per_step(step, inputrec->nstlog);
+            do_ene = do_per_step(step, inputrec->nstenergy);
+
+            /* Prepare IMD energy record, if bIMD is TRUE. */
+            IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE);
+
+            if (do_log)
+            {
+                print_ebin_header(fplog, step, step);
+            }
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
+                       do_log ? fplog : nullptr, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+        }
+
+        /* Send energies and positions to the IMD client if bIMD is TRUE. */
+        if (MASTER(cr) && do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x.rvec_array(), inputrec, 0, wcycle))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        /* Stop when the maximum force lies below tolerance.
+         * If we have reached machine precision, converged is already set to true.
+         */
+        converged = converged || (s_min->fmax < inputrec->em_tol);
+
+    }   /* End of the loop */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (s_min->fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(fplog, inputrec->em_tol, s_min->fmax,
+                      step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    if (MASTER(cr))
+    {
+        /* If we printed energy and/or logfile last step (which was the last step)
+         * we don't have to do it again, but otherwise print the final values.
+         */
+        if (!do_log)
+        {
+            /* Write final value to log since we didn't do anything the last step */
+            print_ebin_header(fplog, step, step);
+        }
+        if (!do_ene || !do_log)
+        {
+            /* Write final energy file entries */
+            print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                       !do_log ? fplog : nullptr, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+        }
+    }
+
+    /* Print some stuff... */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    /* Note that with 0 < nstfout != nstxout we can end up with two frames
+     * in the trajectory with the same step number.
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout));
+
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  s_min, state_global, observablesHistory);
+
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps,
+                        s_min, sqrtNumAtoms);
+        print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps,
+                        s_min, sqrtNumAtoms);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+}
+
+
+void
+Integrator::do_lbfgs()
+{
+    static const char *LBFGS = "Low-Memory BFGS Minimizer";
+    em_state_t         ems;
+    gmx_localtop_t    *top;
+    gmx_enerdata_t    *enerd;
+    gmx_global_stat_t  gstat;
+    t_graph           *graph;
+    int                ncorr, nmaxcorr, point, cp, neval, nminstep;
+    double             stepsize, step_taken, gpa, gpb, gpc, tmp, minstep;
+    real              *rho, *alpha, *p, *s, **dx, **dg;
+    real               a, b, c, maxdelta, delta;
+    real               diag, Epot0;
+    real               dgdx, dgdg, sq, yr, beta;
+    t_mdebin          *mdebin;
+    gmx_bool           converged;
+    rvec               mu_tot;
+    gmx_bool           do_log, do_ene, do_x, do_f, foundlower, *frozen;
+    tensor             vir, pres;
+    int                start, end, number_steps;
+    gmx_mdoutf_t       outf;
+    int                i, k, m, n, gf, step;
+    int                mdof_flags;
+    auto               mdatoms = mdAtoms->mdatoms();
+
+    GMX_LOG(mdlog.info).asParagraph().
+        appendText("Note that activating L-BFGS energy minimization via the "
+                   "integrator .mdp option and the command gmx mdrun may "
+                   "be available in a different form in a future version of GROMACS, "
+                   "e.g. gmx minimize and an .mdp option.");
+
+    if (PAR(cr))
+    {
+        gmx_fatal(FARGS, "L-BFGS minimization only supports a single rank");
+    }
+
+    if (nullptr != constr)
+    {
+        gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent).");
+    }
+
+    n        = 3*state_global->natoms;
+    nmaxcorr = inputrec->nbfgscorr;
+
+    snew(frozen, n);
+
+    snew(p, n);
+    snew(rho, nmaxcorr);
+    snew(alpha, nmaxcorr);
+
+    snew(dx, nmaxcorr);
+    for (i = 0; i < nmaxcorr; i++)
+    {
+        snew(dx[i], n);
+    }
+
+    snew(dg, nmaxcorr);
+    for (i = 0; i < nmaxcorr; i++)
+    {
+        snew(dg[i], n);
+    }
+
+    step  = 0;
+    neval = 0;
+
+    /* Init em */
+    init_em(fplog, mdlog, LBFGS, cr, ms, outputProvider, inputrec, mdrunOptions,
+            state_global, top_global, &ems, &top,
+            nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat,
+            vsite, constr, nullptr,
+            nfile, fnm, &outf, &mdebin, wcycle);
+
+    start = 0;
+    end   = mdatoms->homenr;
+
+    /* We need 4 working states */
+    em_state_t  s0 {}, s1 {}, s2 {}, s3 {};
+    em_state_t *sa   = &s0;
+    em_state_t *sb   = &s1;
+    em_state_t *sc   = &s2;
+    em_state_t *last = &s3;
+    /* Initialize by copying the state from ems (we could skip x and f here) */
+    *sa              = ems;
+    *sb              = ems;
+    *sc              = ems;
+
+    /* Print to log file */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS);
+
+    do_log = do_ene = do_x = do_f = TRUE;
+
+    /* Max number of steps */
+    number_steps = inputrec->nsteps;
+
+    /* Create a 3*natoms index to tell whether each degree of freedom is frozen */
+    gf = 0;
+    for (i = start; i < end; i++)
+    {
+        if (mdatoms->cFREEZE)
+        {
+            gf = mdatoms->cFREEZE[i];
+        }
+        for (m = 0; m < DIM; m++)
+        {
+            frozen[3*i+m] = (inputrec->opts.nFreeze[gf][m] != 0);
+        }
+    }
+    if (MASTER(cr))
+    {
+        sp_header(stderr, LBFGS, inputrec->em_tol, number_steps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, LBFGS, inputrec->em_tol, number_steps);
+    }
+
+    if (vsite)
+    {
+        construct_vsites(vsite, state_global->x.rvec_array(), 1, nullptr,
+                         top->idef.iparams, top->idef.il,
+                         fr->ePBC, fr->bMolPBC, cr, state_global->box);
+    }
+
+    /* Call the force routine and some auxiliary (neighboursearching etc.) */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole
+     */
+    neval++;
+    EnergyEvaluator energyEvaluator {
+        fplog, mdlog, cr, ms,
+        top_global, top,
+        inputrec, nrnb, wcycle, gstat,
+        vsite, constr, fcd, graph,
+        mdAtoms, fr, ppForceWorkload, enerd
+    };
+    energyEvaluator.run(&ems, mu_tot, vir, pres, -1, TRUE);
+
+    if (MASTER(cr))
+    {
+        /* Copy stuff to the energy bin for easy printing etc. */
+        matrix nullBox = {};
+        upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step),
+                   mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox,
+                   nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+
+        print_ebin_header(fplog, step, step);
+        print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+    }
+
+    /* Set the initial step.
+     * Since it will be multiplied by the non-normalized search direction
+     * vector (force vector the first time), we scale it by the
+     * norm of the force.
+     */
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr);
+        fprintf(stderr, "   F-max             = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1);
+        fprintf(stderr, "   F-Norm            = %12.5e\n", ems.fnorm/sqrtNumAtoms);
+        fprintf(stderr, "\n");
+        /* and copy to the log file too... */
+        fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr);
+        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1);
+        fprintf(fplog, "   F-Norm            = %12.5e\n", ems.fnorm/sqrtNumAtoms);
+        fprintf(fplog, "\n");
+    }
+
+    // Point is an index to the memory of search directions, where 0 is the first one.
+    point = 0;
+
+    // Set initial search direction to the force (-gradient), or 0 for frozen particles.
+    real *fInit = static_cast<real *>(ems.f.rvec_array()[0]);
+    for (i = 0; i < n; i++)
+    {
+        if (!frozen[i])
+        {
+            dx[point][i] = fInit[i]; /* Initial search direction */
+        }
+        else
+        {
+            dx[point][i] = 0;
+        }
+    }
+
+    // Stepsize will be modified during the search, and actually it is not critical
+    // (the main efficiency of the algorithm comes from changing directions), but
+    // we still need an initial value, so estimate it as the inverse of the force
+    // norm, so that we take small steps where the potential fluctuates a lot.
+    stepsize  = 1.0/ems.fnorm;
+
+    /* Start the loop over BFGS steps.
+     * Each successful step is counted, and we continue until
+     * we either converge or reach the max number of steps.
+     */
+
+    ncorr = 0;
+
+    /* Set the gradient from the force */
+    converged = FALSE;
+    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
+    {
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        mdof_flags = 0;
+        if (do_x)
+        {
+            mdof_flags |= MDOF_X;
+        }
+
+        if (do_f)
+        {
+            mdof_flags |= MDOF_F;
+        }
+
+        if (inputrec->bIMD)
+        {
+            mdof_flags |= MDOF_IMD;
+        }
+
+        mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags,
+                                         top_global, step, static_cast<real>(step), &ems.s, state_global, observablesHistory, ems.f);
+
+        /* Do the linesearching in the direction dx[point][0..(n-1)] */
+
+        /* make s a pointer to current search direction - point=0 first time we get here */
+        s = dx[point];
+
+        real *xx = static_cast<real *>(ems.s.x.rvec_array()[0]);
+        real *ff = static_cast<real *>(ems.f.rvec_array()[0]);
+
+        // calculate line gradient in position A
+        for (gpa = 0, i = 0; i < n; i++)
+        {
+            gpa -= s[i]*ff[i];
+        }
+
+        /* Calculate minimum allowed stepsize along the line, before the average (norm)
+         * relative change in coordinate is smaller than precision
+         */
+        for (minstep = 0, i = 0; i < n; i++)
+        {
+            tmp = fabs(xx[i]);
+            if (tmp < 1.0)
+            {
+                tmp = 1.0;
+            }
+            tmp      = s[i]/tmp;
+            minstep += tmp*tmp;
+        }
+        minstep = GMX_REAL_EPS/sqrt(minstep/n);
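+        /* I.e.: minstep is the step length at which the RMS relative coordinate
+         * change, step*sqrt(sum_i (s_i/max(|x_i|,1))^2 / n), drops to
+         * GMX_REAL_EPS, so that a shorter step can no longer make progress.
+         */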
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        // Before taking any steps along the line, store the old position
+        *last       = ems;
+        real *lastx = static_cast<real *>(last->s.x.data()[0]);
+        real *lastf = static_cast<real *>(last->f.data()[0]);
+        Epot0       = ems.epot;
+
+        *sa         = ems;
+
+        /* Take a step downhill.
+         * In theory, we should find the actual minimum of the function in this
+         * direction, somewhere along the line.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we don't really need to find the exact minimum -
+         * it is much better to start a new BFGS step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize but
+         * continue straight to the next BFGS step without trying to find any minimum,
+         * i.e. we change the search direction too. If the line was smooth, it is
+         * likely we are in a smooth region, and then it makes sense to take longer
+         * steps in the modified search direction too.
+         *
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one. Then we need to start by finding a lower
+         * value before we change search direction. Since the energy was apparently
+         * quite rough, we need to decrease the step size.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
+
+        // State "A" is the first position along the line.
+        // reference position along line is initially zero
+        a          = 0.0;
+
+        // Check stepsize first. We do not allow displacements
+        // larger than emstep.
+        //
+        do
+        {
+            // Pick a new position C by adding stepsize to A.
+            c        = a + stepsize;
+
+            // Calculate what the largest change in any individual coordinate
+            // would be (translation along line * gradient along line)
+            maxdelta = 0;
+            for (i = 0; i < n; i++)
+            {
+                delta = c*s[i];
+                if (delta > maxdelta)
+                {
+                    maxdelta = delta;
+                }
+            }
+            // If any displacement is larger than the stepsize limit, reduce the step
+            if (maxdelta > inputrec->em_stepsize)
+            {
+                stepsize *= 0.1;
+            }
+        }
+        while (maxdelta > inputrec->em_stepsize);
+
+        // Take a trial step and move the coordinate array xc[] to position C
+        real *xc = static_cast<real *>(sc->s.x.rvec_array()[0]);
+        for (i = 0; i < n; i++)
+        {
+            xc[i] = lastx[i] + c*s[i];
+        }
+
+        neval++;
+        // Calculate energy for the trial step in position C
+        energyEvaluator.run(sc, mu_tot, vir, pres, step, FALSE);
+
+        // Calc line gradient in position C
+        real *fc = static_cast<real *>(sc->f.rvec_array()[0]);
+        for (gpc = 0, i = 0; i < n; i++)
+        {
+            gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        // This is the max amount of increase in energy we tolerate.
+        // By allowing VERY small changes (close to numerical precision) we
+        // frequently find even better (lower) final energies.
+        tmp = std::sqrt(GMX_REAL_EPS)*fabs(sa->epot);
+
+        // Accept the step if the energy is lower in the new position C (compared to A),
+        // or if it is not significantly higher and the line derivative is still negative.
+        foundlower = sc->epot < sa->epot || (gpc < 0 && sc->epot < (sa->epot + tmp));
+        // If true, great, we found a better energy. We no longer try to alter the
+        // stepsize, but simply accept this new better position. Then we select a new
+        // search direction instead, which will be much more efficient than continuing
+        // to take smaller steps along a line. Set fnorm based on the new C position,
+        // which will be used to update the stepsize to 1/fnorm further down.
+
+        // If false, the energy is NOT lower in point C, i.e. it will be the same
+        // or higher than in point A. In this case it is pointless to move to point C,
+        // so we will have to do more iterations along the same line to find a smaller
+        // value in the interval [A=0.0,C].
+        // Here, A is still 0.0, but that will change when we do a search in the interval
+        // [0.0,C] below. That search we will do by interpolation or bisection rather
+        // than with the stepsize, so no need to modify it. For the next search direction
+        // it will be reset to 1/fnorm anyway.
+
+        if (!foundlower)
+        {
+            // OK, if we didn't find a lower value we will have to locate one now - there must
+            // be one in the interval [a,c].
+            // The same thing is valid here, though: Don't spend dozens of iterations to find
+            // the line minimum. We try to interpolate based on the derivative at the endpoints,
+            // and only continue until we find a lower value. In most cases this means 1-2 iterations.
+            // I also have a safeguard for potentially really pathological functions so we never
+            // take more than 20 steps before we give up.
+            // If we already found a lower value we just skip this step and continue to the update.
+            real fnorm = 0;
+            nminstep   = 0;
+            do
+            {
+                // Select a new trial point B in the interval [A,C].
+                // If the derivatives at points a & c have different sign we interpolate to zero,
+                // otherwise just do a bisection since there might be multiple minima/maxima
+                // inside the interval.
+                if (gpa < 0 && gpc > 0)
+                {
+                    b = a + gpa*(a-c)/(gpc-gpa);
+                }
+                else
+                {
+                    b = 0.5*(a+c);
+                }
+
+                /* safeguard if interpolation close to machine accuracy causes errors:
+                 * never go outside the interval
+                 */
+                if (b <= a || b >= c)
+                {
+                    b = 0.5*(a+c);
+                }
+
+                // Take a trial step to point B
+                real *xb = static_cast<real *>(sb->s.x.rvec_array()[0]);
+                for (i = 0; i < n; i++)
+                {
+                    xb[i] = lastx[i] + b*s[i];
+                }
+
+                neval++;
+                // Calculate energy for the trial step in point B
+                energyEvaluator.run(sb, mu_tot, vir, pres, step, FALSE);
+                fnorm = sb->fnorm;
+
+                // Calculate gradient in point B
+                real *fb = static_cast<real *>(sb->f.rvec_array()[0]);
+                for (gpb = 0, i = 0; i < n; i++)
+                {
+                    gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */
+
+                }
+                /* Sum the gradient along the line across CPUs */
+                if (PAR(cr))
+                {
+                    gmx_sumd(1, &gpb, cr);
+                }
+
+                // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative
+                // at the new point B, and rename the endpoints of this new interval A and C.
+                if (gpb > 0)
+                {
+                    /* Replace c endpoint with b */
+                    c   = b;
+                    /* swap states b and c */
+                    swap_em_state(&sb, &sc);
+                }
+                else
+                {
+                    /* Replace a endpoint with b */
+                    a   = b;
+                    /* swap states a and b */
+                    swap_em_state(&sa, &sb);
+                }
+
+                /*
+                 * Stop search as soon as we find a value smaller than the endpoints,
+                 * or if the tolerance is below machine precision.
+                 * Never run more than 20 steps, no matter what.
+                 */
+                nminstep++;
+            }
+            while ((sb->epot > sa->epot || sb->epot > sc->epot) && (nminstep < 20));
+
+            if (std::fabs(sb->epot - Epot0) < GMX_REAL_EPS || nminstep >= 20)
+            {
+                /* OK. We couldn't find a significantly lower energy.
+                 * If ncorr==0 this was steepest descent, and then we give up.
+                 * If not, reset memory to restart as steepest descent before quitting.
+                 */
+                if (ncorr == 0)
+                {
+                    /* Converged */
+                    converged = TRUE;
+                    break;
+                }
+                else
+                {
+                    /* Reset memory */
+                    ncorr = 0;
+                    /* Search in gradient direction */
+                    for (i = 0; i < n; i++)
+                    {
+                        dx[point][i] = ff[i];
+                    }
+                    /* Reset stepsize */
+                    stepsize = 1.0/fnorm;
+                    continue;
+                }
+            }
+
+            /* Select min energy state of A & C, put the best in xx/ff/Epot
+             */
+            if (sc->epot < sa->epot)
+            {
+                /* Use state C */
+                ems        = *sc;
+                step_taken = c;
+            }
+            else
+            {
+                /* Use state A */
+                ems        = *sa;
+                step_taken = a;
+            }
+
+        }
+        else
+        {
+            /* found lower */
+            /* Use state C */
+            ems        = *sc;
+            step_taken = c;
+        }
+
+        /* Update the memory information, and calculate a new
+         * approximation of the inverse hessian
+         */
+
+        /* Have new data in Epot, xx, ff */
+        if (ncorr < nmaxcorr)
+        {
+            ncorr++;
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            dg[point][i]  = lastf[i]-ff[i];
+            dx[point][i] *= step_taken;
+        }
+
+        dgdg = 0;
+        dgdx = 0;
+        for (i = 0; i < n; i++)
+        {
+            dgdg += dg[point][i]*dg[point][i];
+            dgdx += dg[point][i]*dx[point][i];
+        }
+
+        diag = dgdx/dgdg;
+
+        rho[point] = 1.0/dgdx;
+        point++;
+
+        if (point >= nmaxcorr)
+        {
+            point = 0;
+        }
+
+        /* Update */
+        for (i = 0; i < n; i++)
+        {
+            p[i] = ff[i];
+        }
+
+        cp = point;
+
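+        /* Standard L-BFGS two-loop recursion (cf. Nocedal, Math. Comp. 1980):
+         * starting from p = f = -grad(E), the backward loop over the stored pairs
+         * (s_i = dx, y_i = dg) computes alpha_i = rho_i*(s_i.p) and p -= alpha_i*y_i;
+         * p is then scaled by the initial inverse-Hessian estimate
+         * gamma = (s.y)/(y.y) (diag), and the forward loop adds
+         * (alpha_i - rho_i*(y_i.p))*s_i, yielding p ~ -H^{-1} grad(E),
+         * the new quasi-Newton search direction.
+         */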
+        /* Recursive update. First go back over the memory points */
+        for (k = 0; k < ncorr; k++)
+        {
+            cp--;
+            if (cp < 0)
+            {
+                cp = ncorr-1;
+            }
+
+            sq = 0;
+            for (i = 0; i < n; i++)
+            {
+                sq += dx[cp][i]*p[i];
+            }
+
+            alpha[cp] = rho[cp]*sq;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] -= alpha[cp]*dg[cp][i];
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            p[i] *= diag;
+        }
+
+        /* And then go forward again */
+        for (k = 0; k < ncorr; k++)
+        {
+            yr = 0;
+            for (i = 0; i < n; i++)
+            {
+                yr += p[i]*dg[cp][i];
+            }
+
+            beta = rho[cp]*yr;
+            beta = alpha[cp]-beta;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] += beta*dx[cp][i];
+            }
+
+            cp++;
+            if (cp >= ncorr)
+            {
+                cp = 0;
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            if (!frozen[i])
+            {
+                dx[point][i] = p[i];
+            }
+            else
+            {
+                dx[point][i] = 0;
+            }
+        }
+
+        /* Print it if necessary */
+        if (MASTER(cr))
+        {
+            if (mdrunOptions.verbose)
+            {
+                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
+                        step, ems.epot, ems.fnorm/sqrtNumAtoms, ems.fmax, ems.a_fmax + 1);
+                fflush(stderr);
+            }
+            /* Store the new (lower) energies */
+            matrix nullBox = {};
+            upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(step),
+                       mdatoms->tmass, enerd, nullptr, nullptr, nullptr, nullBox,
+                       nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+            do_log = do_per_step(step, inputrec->nstlog);
+            do_ene = do_per_step(step, inputrec->nstenergy);
+            if (do_log)
+            {
+                print_ebin_header(fplog, step, step);
+            }
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
+                       do_log ? fplog : nullptr, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+        }
+
+        /* Send x and E to IMD client, if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x.rvec_array(), inputrec, 0, wcycle) && MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        // Reset stepsize in case we are doing more iterations
+        stepsize = 1.0/ems.fnorm;
+
+        /* Stop when the maximum force lies below tolerance.
+         * If we have reached machine precision, converged is already set to true.
+         */
+        converged = converged || (ems.fmax < inputrec->em_tol);
+
+    }   /* End of the loop */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (ems.fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(fplog, inputrec->em_tol, ems.fmax,
+                      step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    /* If we printed energy and/or logfile last step (which was the last step)
+     * we don't have to do it again, but otherwise print the final values.
+     */
+    if (!do_log) /* Write final value to log since we didn't do anything the last step */
+    {
+        print_ebin_header(fplog, step, step);
+    }
+    if (!do_ene || !do_log) /* Write final energy file entries */
+    {
+        print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                   !do_log ? fplog : nullptr, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+    }
+
+    /* Print some stuff... */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = !do_per_step(step, inputrec->nstfout);
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  &ems, state_global, observablesHistory);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, &ems, sqrtNumAtoms);
+        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, &ems, sqrtNumAtoms);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+}
+
+void
+Integrator::do_steep()
+{
+    const char       *SD = "Steepest Descents";
+    gmx_localtop_t   *top;
+    gmx_enerdata_t   *enerd;
+    gmx_global_stat_t gstat;
+    t_graph          *graph;
+    real              stepsize;
+    real              ustep;
+    gmx_mdoutf_t      outf;
+    t_mdebin         *mdebin;
+    gmx_bool          bDone, bAbort, do_x, do_f;
+    tensor            vir, pres;
+    rvec              mu_tot;
+    int               nsteps;
+    int               count          = 0;
+    int               steps_accepted = 0;
+    auto              mdatoms        = mdAtoms->mdatoms();
+
+    GMX_LOG(mdlog.info).asParagraph().
+        appendText("Note that activating steepest-descent energy minimization via the "
+                   "integrator .mdp option and the command gmx mdrun may "
+                   "be available in a different form in a future version of GROMACS, "
+                   "e.g. gmx minimize and an .mdp option.");
+
+    /* Create 2 states on the stack and extract pointers that we will swap */
+    em_state_t  s0 {}, s1 {};
+    em_state_t *s_min = &s0;
+    em_state_t *s_try = &s1;
+
+    /* Init em and store the local state in s_try */
+    init_em(fplog, mdlog, SD, cr, ms, outputProvider, inputrec, mdrunOptions,
+            state_global, top_global, s_try, &top,
+            nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat,
+            vsite, constr, nullptr,
+            nfile, fnm, &outf, &mdebin, wcycle);
+
+    /* Print to log file  */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, SD);
+
+    /* Set variables for stepsize (in nm). This is the largest
+     * step that we are going to make in any direction.
+     */
+    ustep    = inputrec->em_stepsize;
+    stepsize = 0;
+
+    /* Max number of steps  */
+    nsteps = inputrec->nsteps;
+
+    if (MASTER(cr))
+    {
+        /* Print to the screen  */
+        sp_header(stderr, SD, inputrec->em_tol, nsteps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, SD, inputrec->em_tol, nsteps);
+    }
+    EnergyEvaluator energyEvaluator {
+        fplog, mdlog, cr, ms,
+        top_global, top,
+        inputrec, nrnb, wcycle, gstat,
+        vsite, constr, fcd, graph,
+        mdAtoms, fr, ppForceWorkload, enerd
+    };
+
+    /**** HERE STARTS THE LOOP ****
+     * count is the counter for the number of steps
+     * bDone will be TRUE when the minimization has converged
+     * bAbort will be TRUE when nsteps steps have been performed or when
+     * the stepsize becomes smaller than is reasonable for machine precision
+     */
+    count  = 0;
+    bDone  = FALSE;
+    bAbort = FALSE;
+    while (!bDone && !bAbort)
+    {
+        bAbort = (nsteps >= 0) && (count == nsteps);
+
+        /* set new coordinates, except for first step */
+        bool validStep = true;
+        if (count > 0)
+        {
+            validStep =
+                do_em_step(cr, inputrec, mdatoms,
+                           s_min, stepsize, &s_min->f, s_try,
+                           constr, count);
+        }
+
+        if (validStep)
+        {
+            energyEvaluator.run(s_try, mu_tot, vir, pres, count, count == 0);
+        }
+        else
+        {
+            // Signal constraint error during stepping with energy=inf
+            s_try->epot = std::numeric_limits<real>::infinity();
+        }
+
+        if (MASTER(cr))
+        {
+            print_ebin_header(fplog, count, count);
+        }
+
+        if (count == 0)
+        {
+            s_min->epot = s_try->epot;
+        }
+
+        /* Print it if necessary  */
+        if (MASTER(cr))
+        {
+            if (mdrunOptions.verbose)
+            {
+                fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c",
+                        count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1,
+                        ( (count == 0) || (s_try->epot < s_min->epot) ) ? '\n' : '\r');
+                fflush(stderr);
+            }
+
+            if ( (count == 0) || (s_try->epot < s_min->epot) )
+            {
+                /* Store the new (lower) energies  */
+                matrix nullBox = {};
+                upd_mdebin(mdebin, FALSE, FALSE, static_cast<double>(count),
+                           mdatoms->tmass, enerd, nullptr, nullptr, nullptr,
+                           nullBox, nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+
+                /* Prepare IMD energy record, if bIMD is TRUE. */
+                IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE);
+
+                print_ebin(mdoutf_get_fp_ene(outf), TRUE,
+                           do_per_step(steps_accepted, inputrec->nstdisreout),
+                           do_per_step(steps_accepted, inputrec->nstorireout),
+                           fplog, count, count, eprNORMAL,
+                           mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+                fflush(fplog);
+            }
+        }
+
+        /* Now if the new energy is smaller than the previous...
+         * or if this is the first step!
+         * or if we did random steps!
+         */
+
+        if ( (count == 0) || (s_try->epot < s_min->epot) )
+        {
+            steps_accepted++;
+
+            /* Test whether the convergence criterion is met...  */
+            bDone = (s_try->fmax < inputrec->em_tol);
+
+            /* Copy the arrays for force, positions and energy  */
+            /* The 'Min' array always holds the coords and forces of the minimal
+               sampled energy  */
+            swap_em_state(&s_min, &s_try);
+            if (count > 0)
+            {
+                ustep *= 1.2;
+            }
+
+            /* Write to trn, if necessary */
+            do_x = do_per_step(steps_accepted, inputrec->nstxout);
+            do_f = do_per_step(steps_accepted, inputrec->nstfout);
+            write_em_traj(fplog, cr, outf, do_x, do_f, nullptr,
+                          top_global, inputrec, count,
+                          s_min, state_global, observablesHistory);
+        }
+        else
+        {
+            /* If energy is not smaller make the step smaller...  */
+            ustep *= 0.5;
+
+            if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
+            {
+                /* Reload the old state */
+                em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec,
+                                       s_min, top, mdAtoms, fr, vsite, constr,
+                                       nrnb, wcycle);
+            }
+        }
+
+        /* Determine new step  */
+        stepsize = ustep/s_min->fmax;
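+        /* Since the coordinate update in do_em_step is x += stepsize*f, dividing
+         * by fmax means the atom feeling the largest force moves by exactly ustep nm.
+         */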
+
+        /* Check if stepsize is too small, with 1 nm as a characteristic length */
+#if GMX_DOUBLE
+        if (count == nsteps || ustep < 1e-12)
+#else
+        if (count == nsteps || ustep < 1e-6)
+#endif
+        {
+            if (MASTER(cr))
+            {
+                warn_step(fplog, inputrec->em_tol, s_min->fmax,
+                          count == nsteps, constr != nullptr);
+            }
+            bAbort = TRUE;
+        }
+
+        /* Send IMD energies and positions, if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box,
+                   MASTER(cr) ? state_global->x.rvec_array() : nullptr,
+                   inputrec, 0, wcycle) &&
+            MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        count++;
+    }   /* End of the loop  */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    /* Print some data...  */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+    write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout != 0, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, count,
+                  s_min, state_global, observablesHistory);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+
+        print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps,
+                        s_min, sqrtNumAtoms);
+        print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps,
+                        s_min, sqrtNumAtoms);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    inputrec->nsteps = count;
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, count);
+}
+
+void
+Integrator::do_nm()
+{
+    const char          *NM = "Normal Mode Analysis";
+    gmx_mdoutf_t         outf;
+    int                  nnodes, node;
+    gmx_localtop_t      *top;
+    gmx_enerdata_t      *enerd;
+    gmx_global_stat_t    gstat;
+    t_graph             *graph;
+    tensor               vir, pres;
+    rvec                 mu_tot;
+    rvec                *dfdx;
+    gmx_bool             bSparse; /* use sparse matrix storage format */
+    size_t               sz;
+    gmx_sparsematrix_t * sparse_matrix           = nullptr;
+    real           *     full_matrix             = nullptr;
+
+    /* added with respect to mdrun */
+    int                       row, col;
+    real                      der_range = 10.0*std::sqrt(GMX_REAL_EPS);
+    real                      x_min;
+    bool                      bIsMaster = MASTER(cr);
+    auto                      mdatoms   = mdAtoms->mdatoms();
+
+    GMX_LOG(mdlog.info).asParagraph().
+        appendText("Note that activating normal-mode analysis via the integrator "
+                   ".mdp option and the command gmx mdrun may "
+                   "be available in a different form in a future version of GROMACS, "
+                   "e.g. gmx normal-modes.");
+
+    if (constr != nullptr)
+    {
+        gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported");
+    }
+
+    gmx_shellfc_t *shellfc;
+
+    em_state_t     state_work {};
+
+    /* Init em and store the local state in state_minimum */
+    init_em(fplog, mdlog, NM, cr, ms, outputProvider, inputrec, mdrunOptions,
+            state_global, top_global, &state_work, &top,
+            nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat,
+            vsite, constr, &shellfc,
+            nfile, fnm, &outf, nullptr, wcycle);
+
+    std::vector<int>       atom_index = get_atom_index(top_global);
+    std::vector<gmx::RVec> fneg(atom_index.size(), {0, 0, 0});
+    snew(dfdx, atom_index.size());
+
+#if !GMX_DOUBLE
+    if (bIsMaster)
+    {
+        fprintf(stderr,
+                "NOTE: This version of GROMACS has been compiled in single precision,\n"
+                "      which MIGHT not be accurate enough for normal mode analysis.\n"
+                "      GROMACS now uses sparse matrix storage, so the memory requirements\n"
+                "      are fairly modest even if you recompile in double precision.\n\n");
+    }
+#endif
+
+    /* Check if we can/should use sparse storage format.
+     *
+     * Sparse format is only useful when the Hessian itself is sparse, which it
+     * will be when we use a cutoff.
+     * For small systems (n<1000) it is easier to always use full matrix format, though.
+     */
+    if (EEL_FULL(fr->ic->eeltype) || fr->rlist == 0.0)
+    {
+        GMX_LOG(mdlog.warning).appendText("Non-cutoff electrostatics used, forcing full Hessian format.");
+        bSparse = FALSE;
+    }
+    else if (atom_index.size() < 1000)
+    {
+        GMX_LOG(mdlog.warning).appendTextFormatted("Small system size (N=%zu), using full Hessian format.",
+                                                   atom_index.size());
+        bSparse = FALSE;
+    }
+    else
+    {
+        GMX_LOG(mdlog.warning).appendText("Using compressed symmetric sparse Hessian format.");
+        bSparse = TRUE;
+    }
+
+    /* Number of dimensions, based on real atoms, that is, not vsites or shells */
+    sz = DIM*atom_index.size();
+
+    fprintf(stderr, "Allocating Hessian memory...\n\n");
+
+    if (bSparse)
+    {
+        sparse_matrix = gmx_sparsematrix_init(sz);
+        sparse_matrix->compressed_symmetric = TRUE;
+    }
+    else
+    {
+        snew(full_matrix, sz*sz);
+    }
+
+    init_nrnb(nrnb);
+
+
+    /* Write start time and temperature */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, NM);
+
+    /* fudge nr of steps to nr of atoms */
+    inputrec->nsteps = atom_index.size()*2;
+
+    if (bIsMaster)
+    {
+        fprintf(stderr, "starting normal mode calculation '%s'\n%" PRId64 " steps.\n\n",
+                *(top_global->name), inputrec->nsteps);
+    }
+
+    nnodes = cr->nnodes;
+
+    /* Make evaluate_energy do a single node force calculation */
+    cr->nnodes = 1;
+    EnergyEvaluator energyEvaluator {
+        fplog, mdlog, cr, ms,
+        top_global, top,
+        inputrec, nrnb, wcycle, gstat,
+        vsite, constr, fcd, graph,
+        mdAtoms, fr, ppForceWorkload, enerd
+    };
+    energyEvaluator.run(&state_work, mu_tot, vir, pres, -1, TRUE);
+    cr->nnodes = nnodes;
+
+    /* if forces are not small, warn user */
+    get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, &state_work);
+
+    GMX_LOG(mdlog.warning).appendTextFormatted("Maximum force:%12.5e", state_work.fmax);
+    if (state_work.fmax > 1.0e-3)
+    {
+        GMX_LOG(mdlog.warning).appendText(
+                "The force is probably not small enough to "
+                "ensure that you are at a minimum.\n"
+                "Be aware that negative eigenvalues may occur\n"
+                "when the resulting matrix is diagonalized.");
+    }
+
+    /***********************************************************
+     *
+     *      Loop over all pairs in matrix
+     *
+     *      do_force called twice. Once with positive and
+     *      once with negative displacement
+     *
+     ************************************************************/
+
+    /* Steps are divided one by one over the nodes */
+    bool bNS          = true;
+    auto state_work_x = makeArrayRef(state_work.s.x);
+    auto state_work_f = makeArrayRef(state_work.f);
+    for (unsigned int aid = cr->nodeid; aid < atom_index.size(); aid += nnodes)
+    {
+        size_t atom = atom_index[aid];
+        for (size_t d = 0; d < DIM; d++)
+        {
+            int64_t     step        = 0;
+            int         force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES;
+            double      t           = 0;
+
+            x_min = state_work_x[atom][d];
+
+            for (unsigned int dx = 0; (dx < 2); dx++)
+            {
+                if (dx == 0)
+                {
+                    state_work_x[atom][d] = x_min - der_range;
+                }
+                else
+                {
+                    state_work_x[atom][d] = x_min + der_range;
+                }
+
+                /* Make evaluate_energy do a single node force calculation */
+                cr->nnodes = 1;
+                if (shellfc)
+                {
+                    /* Now is the time to relax the shells */
+                    relax_shell_flexcon(fplog,
+                                        cr,
+                                        ms,
+                                        mdrunOptions.verbose,
+                                        nullptr,
+                                        step,
+                                        inputrec,
+                                        bNS,
+                                        force_flags,
+                                        top,
+                                        constr,
+                                        enerd,
+                                        fcd,
+                                        &state_work.s,
+                                        state_work.f.arrayRefWithPadding(),
+                                        vir,
+                                        mdatoms,
+                                        nrnb,
+                                        wcycle,
+                                        graph,
+                                        &top_global->groups,
+                                        shellfc,
+                                        fr,
+                                        ppForceWorkload,
+                                        t,
+                                        mu_tot,
+                                        vsite,
+                                        DdOpenBalanceRegionBeforeForceComputation::no,
+                                        DdCloseBalanceRegionAfterForceComputation::no);
+                    bNS = false;
+                    step++;
+                }
+                else
+                {
+                    energyEvaluator.run(&state_work, mu_tot, vir, pres, aid*2+dx, FALSE);
+                }
+
+                cr->nnodes = nnodes;
+
+                if (dx == 0)
+                {
+                    std::copy(state_work_f.begin(), state_work_f.begin()+atom_index.size(), fneg.begin());
+                }
+            }
+
+            /* x is restored to original */
+            state_work_x[atom][d] = x_min;
+
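+            /* One row of the Hessian by central differences:
+             *   H(row,col) = d2E/(dx_row dx_col) = -df_col/dx_row
+             *             ~= -(f_col(x+h) - f_col(x-h)) / (2h),   h = der_range
+             */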
+            for (size_t j = 0; j < atom_index.size(); j++)
+            {
+                for (size_t k = 0; (k < DIM); k++)
+                {
+                    dfdx[j][k] =
+                        -(state_work_f[atom_index[j]][k] - fneg[j][k])/(2*der_range);
+                }
+            }
+
+            if (!bIsMaster)
+            {
+#if GMX_MPI
+#define mpi_type GMX_MPI_REAL
+                MPI_Send(dfdx[0], atom_index.size()*DIM, mpi_type, MASTER(cr),
+                         cr->nodeid, cr->mpi_comm_mygroup);
+#endif
+            }
+            else
+            {
+                for (node = 0; (node < nnodes && aid+node < atom_index.size()); node++)
+                {
+                    if (node > 0)
+                    {
+#if GMX_MPI
+                        MPI_Status stat;
+                        MPI_Recv(dfdx[0], atom_index.size()*DIM, mpi_type, node, node,
+                                 cr->mpi_comm_mygroup, &stat);
+#undef mpi_type
+#endif
+                    }
+
+                    row = (aid + node)*DIM + d;
+
+                    for (size_t j = 0; j < atom_index.size(); j++)
+                    {
+                        for (size_t k = 0; k < DIM; k++)
+                        {
+                            col = j*DIM + k;
+
+                            if (bSparse)
+                            {
+                                if (col >= row && dfdx[j][k] != 0.0)
+                                {
+                                    gmx_sparsematrix_increment_value(sparse_matrix,
+                                                                     row, col, dfdx[j][k]);
+                                }
+                            }
+                            else
+                            {
+                                full_matrix[row*sz+col] = dfdx[j][k];
+                            }
+                        }
+                    }
+                }
+            }
+
+            if (mdrunOptions.verbose && fplog)
+            {
+                fflush(fplog);
+            }
+        }
+        /* write progress */
+        if (bIsMaster && mdrunOptions.verbose)
+        {
+            fprintf(stderr, "\rFinished step %d out of %d",
+                    static_cast<int>(std::min(atom+nnodes, atom_index.size())),
+                    static_cast<int>(atom_index.size()));
+            fflush(stderr);
+        }
+    }
+
+    if (bIsMaster)
+    {
+        fprintf(stderr, "\n\nWriting Hessian...\n");
+        gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size()*2);
+}
+
+} // namespace gmx
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.cpp b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f0306c64f05d0c66387e05b8fe68c4c96fef437c
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.cpp
@@ -0,0 +1,1487 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *
+ * \brief Implements the replica exchange routines.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "replicaexchange.h"
+
+#include "config.h"
+
+#include <cmath>
+
+#include <random>
+
+#include "gromacs/domdec/collect.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdrun/multisim.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/enerdata.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/random/threefry.h"
+#include "gromacs/random/uniformintdistribution.h"
+#include "gromacs/random/uniformrealdistribution.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/smalloc.h"
+
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+/* END PLUMED */
+
+/* PLUMED HREX */
+extern int plumed_hrex;
+/* END PLUMED HREX */
+
+//! Cutoff for the exponent of the exchange probability; moves with delta above this are rejected outright.
+constexpr int c_probabilityCutoff = 100;
+
+/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
+
+//! Rank in the multisimulation
+#define MSRANK(ms, nodeid)  (nodeid)
+
+//! Enum for replica exchange flavours
+enum {
+    ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR
+};
+/*! \brief Strings describing replica exchange flavours.
+ *
+ *  end_single_marker merely notes the end of single variable replica
+ *  exchange. All types higher than it are multiple replica exchange
+ *  methods.
+ *
+ * Eventually, should add 'pressure', 'temperature and pressure',
+ *  'lambda_and_pressure', 'temperature_lambda_pressure'?; Let's wait
+ *  until we feel better about the pressure control methods giving
+ *  exact ensembles.  Right now, we assume constant pressure */
+static const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"};
+
+//! Working data for replica exchange.
+struct gmx_repl_ex
+{
+    //! Replica ID
+    int       repl;
+    //! Total number of replicas
+    int       nrepl;
+    //! Temperature
+    real      temp;
+    //! Replica exchange type from ere enum
+    int       type;
+    //! Quantity, e.g. temperature or lambda; first index is ere, second index is replica ID
+    real    **q;
+    //! Use constant pressure and temperature
+    gmx_bool  bNPT;
+    //! Replica pressures
+    real     *pres;
+    //! Replica indices
+    int      *ind;
+    //! Used for keeping track of all the replica swaps
+    int      *allswaps;
+    //! Replica exchange interval (number of steps)
+    int       nst;
+    //! Number of exchanges per interval
+    int       nex;
+    //! Random seed
+    int       seed;
+    //! Number of even and odd replica change attempts
+    int       nattempt[2];
+    //! Sum of probabilities
+    real     *prob_sum;
+    //! Number of moves between replicas i and j
+    int     **nmoves;
+    //! i-th element of the array is the number of exchanges between replica i-1 and i
+    int      *nexchange;
+
+    /*! \brief Helper arrays for replica exchange; allocated here
+     * so they don't have to be allocated each time */
+    //! \{
+    int      *destinations;
+    int     **cyclic;
+    int     **order;
+    int      *tmpswap;
+    gmx_bool *incycle;
+    gmx_bool *bEx;
+    //! \}
+
+    //! Helper arrays to hold the quantities that are exchanged.
+    //! \{
+    real  *prob;
+    real  *Epot;
+    real  *beta;
+    real  *Vol;
+    real **de;
+    //! \}
+};
+
+// TODO We should add Doxygen here some time.
+//! \cond
+
+static gmx_bool repl_quantity(const gmx_multisim_t *ms,
+                              struct gmx_repl_ex *re, int ere, real q)
+{
+    real    *qall;
+    gmx_bool bDiff;
+    int      s;
+
+    snew(qall, ms->nsim);
+    qall[re->repl] = q;
+    gmx_sum_sim(ms->nsim, qall, ms);
+
+    /* PLUMED */
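+    /* Note: PLUMED deliberately comments out the equality test below so that bDiff is always TRUE;
+       this lets replicas share the same value of the exchanged quantity, as needed e.g. for
+       bias-exchange metadynamics (see the exchange-pattern code further down). */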
+    //bDiff = FALSE;
+    //for (s = 1; s < ms->nsim; s++)
+    //{
+    //    if (qall[s] != qall[0])
+    //    {
+              bDiff = TRUE;
+    //    }
+    //}
+    /* END PLUMED */
+
+    if (bDiff)
+    {
+        /* Set the replica exchange type and quantities */
+        re->type = ere;
+
+        snew(re->q[ere], re->nrepl);
+        for (s = 0; s < ms->nsim; s++)
+        {
+            re->q[ere][s] = qall[s];
+        }
+    }
+    sfree(qall);
+    return bDiff;
+}
+
+gmx_repl_ex_t
+init_replica_exchange(FILE                            *fplog,
+                      const gmx_multisim_t            *ms,
+                      int                              numAtomsInSystem,
+                      const t_inputrec                *ir,
+                      const ReplicaExchangeParameters &replExParams)
+{
+    real                pres;
+    int                 i, j;
+    struct gmx_repl_ex *re;
+    gmx_bool            bTemp;
+    gmx_bool            bLambda = FALSE;
+
+    fprintf(fplog, "\nInitializing Replica Exchange\n");
+
+    if (!isMultiSim(ms) || ms->nsim == 1)
+    {
+        gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multidir option of mdrun?");
+    }
+    if (!EI_DYNAMICS(ir->eI))
+    {
+        gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations");
+        /* Note that PAR(cr) is defined by cr->nnodes > 1, which is
+         * distinct from isMultiSim(ms). A multi-simulation only runs
+         * with real MPI parallelism, but this does not imply PAR(cr)
+         * is true!
+         *
+         * Since we are using a dynamical integrator, the only
+         * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are
+         * synonymous. The only way for cr->nnodes > 1 to be true is
+         * if we are using DD. */
+    }
+
+    snew(re, 1);
+
+    re->repl     = ms->sim;
+    re->nrepl    = ms->nsim;
+    snew(re->q, ereENDSINGLE);
+
+    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
+
+    /* We only check that the number of atoms in the systems match.
+     * This, of course, does not guarantee that the systems are the same,
+     * but it does guarantee that we can perform replica exchange.
+     */
+    check_multi_int(fplog, ms, numAtomsInSystem, "the number of atoms", FALSE);
+    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
+    check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE);
+    const int nst = replExParams.exchangeInterval;
+    check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst,
+                      "first exchange step: init_step/-replex", FALSE);
+    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
+    check_multi_int(fplog, ms, ir->opts.ngtc,
+                    "the number of temperature coupling groups", FALSE);
+    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
+    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
+    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
+
+    re->temp = ir->opts.ref_t[0];
+    for (i = 1; (i < ir->opts.ngtc); i++)
+    {
+        if (ir->opts.ref_t[i] != re->temp)
+        {
+            fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+            fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+        }
+    }
+
+    re->type = -1;
+    bTemp    = repl_quantity(ms, re, ereTEMP, re->temp);
+    if (ir->efep != efepNO)
+    {
+        bLambda = repl_quantity(ms, re, ereLAMBDA, static_cast<real>(ir->fepvals->init_fep_state));
+    }
+    if (re->type == -1)  /* nothing was assigned */
+    {
+        gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl);
+    }
+    if (bLambda && bTemp)
+    {
+        re->type = ereTL;
+    }
+
+    if (bTemp)
+    {
+        please_cite(fplog, "Sugita1999a");
+        if (ir->epc != epcNO)
+        {
+            re->bNPT = TRUE;
+            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
+            please_cite(fplog, "Okabe2001a");
+        }
+        if (ir->etc == etcBERENDSEN)
+        {
+            gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
+                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
+        }
+    }
+    if (bLambda)
+    {
+        if (ir->fepvals->delta_lambda != 0)   /* check this? */
+        {
+            gmx_fatal(FARGS, "delta_lambda is not zero");
+        }
+    }
+    if (re->bNPT)
+    {
+        snew(re->pres, re->nrepl);
+        if (ir->epct == epctSURFACETENSION)
+        {
+            pres = ir->ref_p[ZZ][ZZ];
+        }
+        else
+        {
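+            /* average the diagonal reference pressure over the compressible dimensions */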
+            pres = 0;
+            j    = 0;
+            for (i = 0; i < DIM; i++)
+            {
+                if (ir->compress[i][i] != 0)
+                {
+                    pres += ir->ref_p[i][i];
+                    j++;
+                }
+            }
+            pres /= j;
+        }
+        re->pres[re->repl] = pres;
+        gmx_sum_sim(re->nrepl, re->pres, ms);
+    }
+
+    /* Make an index for increasing replica order */
+    /* only makes sense if one or the other is varying, not both!
+       if both are varying, we trust the order the person gave. */
+    snew(re->ind, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->ind[i] = i;
+    }
+
+    /* PLUMED */
+    // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD)
+    // in those cases replicas can share the same temperature.
+    /*
+    if (re->type < ereENDSINGLE)
+    {
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = i+1; j < re->nrepl; j++)
+            {
+                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
+                {*/
+                    /* Unordered replicas are supposed to work, but there
+                     * is still an issue somewhere.
+                     * Note that at this point still re->ind[i]=i.
+                     */
+                 /*
+                    gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s",
+                              i, j,
+                              erename[re->type],
+                              re->q[re->type][i], re->q[re->type][j],
+                              erename[re->type]);
+                }
+                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
+                {
+                    gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
+                }
+            }
+        }
+    }
+    */
+    /* END PLUMED */
+
+    /* keep track of all the swaps, starting with the initial placement. */
+    snew(re->allswaps, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->allswaps[i] = re->ind[i];
+    }
+
+    switch (re->type)
+    {
+        case ereTEMP:
+            fprintf(fplog, "\nReplica exchange in temperature\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereLAMBDA:
+            fprintf(fplog, "\nReplica exchange in lambda\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %3d", static_cast<int>(re->q[re->type][re->ind[i]]));
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereTL:
+            fprintf(fplog, "\nReplica exchange in temperature and lambda state\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5d", static_cast<int>(re->q[ereLAMBDA][re->ind[i]]));
+            }
+            fprintf(fplog, "\n");
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (re->bNPT)
+    {
+        fprintf(fplog, "\nRepl  p");
+        for (i = 0; i < re->nrepl; i++)
+        {
+            fprintf(fplog, " %5.2f", re->pres[re->ind[i]]);
+        }
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]]))
+            {
+                fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+                fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+            }
+        }
+    }
+    re->nst = nst;
+    if (replExParams.randomSeed == -1)
+    {
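+        /* no seed given: the master simulation draws one, and the sum below hands the same value to every replica */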
+        if (isMasterSim(ms))
+        {
+            re->seed = static_cast<int>(gmx::makeRandomSeed());
+        }
+        else
+        {
+            re->seed = 0;
+        }
+        gmx_sumi_sim(1, &(re->seed), ms);
+    }
+    else
+    {
+        re->seed = replExParams.randomSeed;
+    }
+    fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst);
+    fprintf(fplog, "\nReplica random seed: %d\n", re->seed);
+
+    re->nattempt[0] = 0;
+    re->nattempt[1] = 0;
+
+    snew(re->prob_sum, re->nrepl);
+    snew(re->nexchange, re->nrepl);
+    snew(re->nmoves, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->nmoves[i], re->nrepl);
+    }
+    fprintf(fplog, "Replica exchange information below: ex and x = exchange, pr = probability\n");
+
+    /* generate space for the helper arrays so we don't have to snew each time */
+
+    snew(re->destinations, re->nrepl);
+    snew(re->incycle, re->nrepl);
+    snew(re->tmpswap, re->nrepl);
+    snew(re->cyclic, re->nrepl);
+    snew(re->order, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->cyclic[i], re->nrepl+1);
+        snew(re->order[i], re->nrepl);
+    }
+    /* allocate space for the arrays storing the data for the replicas */
+    /* not all of these arrays are needed in all cases, but they don't take
+       up much space, since the max size is nrepl**2 */
+    snew(re->prob, re->nrepl);
+    snew(re->bEx, re->nrepl);
+    snew(re->beta, re->nrepl);
+    snew(re->Vol, re->nrepl);
+    snew(re->Epot, re->nrepl);
+    snew(re->de, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->de[i], re->nrepl);
+    }
+    re->nex = replExParams.numExchanges;
+    return re;
+}
+
+static void exchange_reals(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, real *v, int n)
+{
+    real *buf;
+    int   i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v,  n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+           buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
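+            /* a non-blocking send paired with a blocking receive lets both exchange partners post their transfers at the same time without deadlocking */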
+            MPI_Request mpi_req;
+
+            MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            v[i] = buf[i];
+        }
+        sfree(buf);
+    }
+}
+
+
+static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n)
+{
+    double *buf;
+    int     i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v,  n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+           buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            v[i] = buf[i];
+        }
+        sfree(buf);
+    }
+}
+
+static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n)
+{
+    rvec *buf;
+    int   i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v[0],  n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+           buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            copy_rvec(buf[i], v[i]);
+        }
+        sfree(buf);
+    }
+}
+
+/* PLUMED HREX */
+void exchange_state(const gmx_multisim_t *ms, int b, t_state *state)
+/* END PLUMED HREX */
+{
+    /* When t_state changes, this code should be updated. */
+    int ngtc, nnhpres;
+    ngtc    = state->ngtc * state->nhchainlength;
+    nnhpres = state->nnhpres* state->nhchainlength;
+    exchange_rvecs(ms, b, state->box, DIM);
+    exchange_rvecs(ms, b, state->box_rel, DIM);
+    exchange_rvecs(ms, b, state->boxv, DIM);
+    exchange_reals(ms, b, &(state->veta), 1);
+    exchange_reals(ms, b, &(state->vol0), 1);
+    exchange_rvecs(ms, b, state->svir_prev, DIM);
+    exchange_rvecs(ms, b, state->fvir_prev, DIM);
+    exchange_rvecs(ms, b, state->pres_prev, DIM);
+    exchange_doubles(ms, b, state->nosehoover_xi.data(), ngtc);
+    exchange_doubles(ms, b, state->nosehoover_vxi.data(), ngtc);
+    exchange_doubles(ms, b, state->nhpres_xi.data(), nnhpres);
+    exchange_doubles(ms, b, state->nhpres_vxi.data(), nnhpres);
+    exchange_doubles(ms, b, state->therm_integral.data(), state->ngtc);
+    exchange_doubles(ms, b, &state->baros_integral, 1);
+    exchange_rvecs(ms, b, state->x.rvec_array(), state->natoms);
+    exchange_rvecs(ms, b, state->v.rvec_array(), state->natoms);
+}
+
+/* PLUMED HREX */
+void copy_state_serial(const t_state *src, t_state *dest)
+/* END PLUMED HREX */
+{
+    if (dest != src)
+    {
+        /* Currently the local state is always a pointer to the global
+         * in serial, so we should never end up here.
+         * TODO: Implement a (trivial) t_state copy once converted to C++.
+         */
+        GMX_RELEASE_ASSERT(false, "State copying is currently not implemented in replica exchange");
+    }
+}
+
+static void scale_velocities(gmx::ArrayRef<gmx::RVec> velocities, real fac)
+{
+    for (auto &v : velocities)
+    {
+        v *= fac;
+    }
+}
+
+static void print_transition_matrix(FILE *fplog, int n, int **nmoves, const int *nattempt)
+{
+    int   i, j, ntot;
+    float Tprint;
+
+    ntot = nattempt[0] + nattempt[1];
+    fprintf(fplog, "\n");
+    fprintf(fplog, "Repl");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "    ");  /* put the title closer to the center */
+    }
+    fprintf(fplog, "Empirical Transition Matrix\n");
+
+    fprintf(fplog, "Repl");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%8d", (i+1));
+    }
+    fprintf(fplog, "\n");
+
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "Repl");
+        for (j = 0; j < n; j++)
+        {
+            Tprint = 0.0;
+            if (nmoves[i][j] > 0)
+            {
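+                /* every accepted swap is counted twice in nmoves (once per direction), hence the factor of 2 */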
+                Tprint = nmoves[i][j]/(2.0*ntot);
+            }
+            fprintf(fplog, "%8.4f", Tprint);
+        }
+        fprintf(fplog, "%3d\n", i);
+    }
+}
+
+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, const gmx_bool *bEx)
+{
+    int i;
+
+    fprintf(fplog, "Repl %2s %2d", leg, ind[0]);
+    for (i = 1; i < n; i++)
+    {
+        fprintf(fplog, " %c %2d", (bEx != nullptr && bEx[i]) ? 'x' : ' ', ind[i]);
+    }
+    fprintf(fplog, "\n");
+}
+
+static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap)
+{
+    int i;
+
+    for (i = 0; i < n; i++)
+    {
+        tmpswap[i] = allswaps[i];
+    }
+    for (i = 0; i < n; i++)
+    {
+        allswaps[i] = tmpswap[pind[i]];
+    }
+
+    fprintf(fplog, "\nAccepted Exchanges:   ");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%d ", pind[i]);
+    }
+    fprintf(fplog, "\n");
+
+    /* the "Order After Exchange" is the state label corresponding to the configuration that
+       started in the state listed in this order, i.e.
+
+       3 0 1 2
+
+       means that the:
+       configuration starting in simulation 3 is now in simulation 0,
+       configuration starting in simulation 0 is now in simulation 1,
+       configuration starting in simulation 1 is now in simulation 2,
+       configuration starting in simulation 2 is now in simulation 3
+     */
+    fprintf(fplog, "Order After Exchange: ");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%d ", allswaps[i]);
+    }
+    fprintf(fplog, "\n\n");
+}
+
+static void print_prob(FILE *fplog, const char *leg, int n, real *prob)
+{
+    int  i;
+    char buf[8];
+
+    fprintf(fplog, "Repl %2s ", leg);
+    for (i = 1; i < n; i++)
+    {
+        if (prob[i] >= 0)
+        {
+            sprintf(buf, "%4.2f", prob[i]);
+            fprintf(fplog, "  %3s", buf[0] == '1' ? "1.0" : buf+1);
+        }
+        else
+        {
+            fprintf(fplog, "     ");
+        }
+    }
+    fprintf(fplog, "\n");
+}
+
+static void print_count(FILE *fplog, const char *leg, int n, int *count)
+{
+    int i;
+
+    fprintf(fplog, "Repl %2s ", leg);
+    for (i = 1; i < n; i++)
+    {
+        fprintf(fplog, " %4d", count[i]);
+    }
+    fprintf(fplog, "\n");
+}
+
+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp)
+{
+
+    real   ediff, dpV, delta = 0;
+    real  *Epot = re->Epot;
+    real  *Vol  = re->Vol;
+    real **de   = re->de;
+    real  *beta = re->beta;
+
+    /* Two cases: permuted and not permuted.  In all cases, setting ap = a and bp = b reduces
+       to the non-permuted case */
+
+    switch (re->type)
+    {
+        case ereTEMP:
+            /*
+             * Okabe et al., Chem. Phys. Lett. 335 (2001) 435-439
+             */
+            ediff = Epot[b] - Epot[a];
+            delta = -(beta[bp] - beta[ap])*ediff;
+            break;
+        case ereLAMBDA:
+            /* two cases:  when we are permuted, and not.  */
+            /* non-permuted:
+               ediff =  E_new - E_old
+                     =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
+                     =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
+                     =  de[b][a] + de[a][b] */
+
+            /* permuted:
+               ediff =  E_new - E_old
+                     =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
+                     =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - [H_bp(x_b) - H_b(x_b)]
+                     =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
+            /* but, in the current code implementation, we flip configurations, not indices . . .
+               So let's examine that.
+                     =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - [H_b(x_bp) - H_b(x_b)]
+                     =  [H_b(x_ap) - H_a(x_ap)]  + [H_a(x_bp) - H_b(x_bp)]
+                     = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
+                     So, if we exchange b<=> bp and a<=> ap, we return to the same result.
+                     So the simple solution is to flip the
+                     position of perturbed and original indices in the tests.
+             */
+
+            ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
+            delta = ediff*beta[a]; /* assume all same temperature in this case */
+            break;
+        case ereTL:
+            /* not permuted:  */
+            /* delta =  reduced E_new - reduced E_old
+                     =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
+                     =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
+                     =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
+                        [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) +
+                        beta_b (H_a(x_a) - H_b(x_b)) - beta_a (H_a(x_a) - H_b(x_b))
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a)) */
+            /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]); */
+            /* permuted (big breath!) */
+            /*   delta =  reduced E_new - reduced E_old
+                     =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                        - beta_bp H_a(x_a) + beta_ap H_a(x_a) + beta_bp H_a(x_a) - beta_ap H_a(x_a)
+                        - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                     =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
+                        [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
+                        + beta_bp H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                     =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
+                        [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
+                        + (beta_bp - beta_ap)(H_a(x_a) - H_b(x_b))
+                     =  (beta_bp de[bp][a] - beta_ap de[ap][a]) + (beta_ap de[ap][b] - beta_bp de[bp][b])
+                        + (beta_bp - beta_ap)(H_a(x_a) - H_b(x_b))  */
+            delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]);
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (bPrint)
+    {
+        fprintf(fplog, "Repl %d <-> %d  dE_term = %10.3e (kT)\n", a, b, delta);
+    }
+/* PLUMED HREX */
+/* this is necessary because with plumed HREX the energy contribution is
+   already taken into account */
+    if(plumed_hrex) delta=0.0;
+/* END PLUMED HREX */
+    if (re->bNPT)
+    {
+        /* revisit the calculation for 5.0.  Might be some improvements. */
+        dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
+        if (bPrint)
+        {
+            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
+        }
+        delta += dpV;
+    }
+    return delta;
+}
+
+static void
+test_for_replica_exchange(FILE                 *fplog,
+                          const gmx_multisim_t *ms,
+                          struct gmx_repl_ex   *re,
+                          const gmx_enerdata_t *enerd,
+                          real                  vol,
+                          int64_t               step,
+                          real                  time)
+{
+    int                                  m, i, j, a, b, ap, bp, i0, i1, tmp;
+    real                                 delta = 0;
+    gmx_bool                             bPrint, bMultiEx;
+    gmx_bool                            *bEx      = re->bEx;
+    real                                *prob     = re->prob;
+    int                                 *pind     = re->destinations; /* permuted index */
+    gmx_bool                             bEpot    = FALSE;
+    gmx_bool                             bDLambda = FALSE;
+    gmx_bool                             bVol     = FALSE;
+    gmx::ThreeFry2x64<64>                rng(re->seed, gmx::RandomDomain::ReplicaExchange);
+    gmx::UniformRealDistribution<real>   uniformRealDist;
+    gmx::UniformIntDistribution<int>     uniformNreplDist(0, re->nrepl-1);
+
+    bMultiEx = (re->nex > 1);  /* multiple exchanges at each state */
+    fprintf(fplog, "Replica exchange at step %" PRId64 " time %.5f\n", step, time);
+
+    if (re->bNPT)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Vol[i] = 0;
+        }
+        bVol               = TRUE;
+        re->Vol[re->repl]  = vol;
+    }
+    if ((re->type == ereTEMP || re->type == ereTL))
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Epot[i] = 0;
+        }
+        bEpot              = TRUE;
+        re->Epot[re->repl] = enerd->term[F_EPOT];
+        /* temperatures of different states*/
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ);
+        }
+    }
+    else
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->temp*BOLTZ);  /* we have a single temperature */
+        }
+    }
+    if (re->type == ereLAMBDA || re->type == ereTL)
+    {
+        bDLambda = TRUE;
+        /* lambda differences. */
+        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
+           minus the energy of the jth simulation in the jth Hamiltonian */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->de[i][j] = 0;
+            }
+        }
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->de[i][re->repl] = (enerd->enerpart_lambda[static_cast<int>(re->q[ereLAMBDA][i])+1]-enerd->enerpart_lambda[0]);
+        }
+    }
+
+    /* now actually do the communication */
+    if (bVol)
+    {
+        gmx_sum_sim(re->nrepl, re->Vol, ms);
+    }
+    if (bEpot)
+    {
+        gmx_sum_sim(re->nrepl, re->Epot, ms);
+    }
+    if (bDLambda)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            gmx_sum_sim(re->nrepl, re->de[i], ms);
+        }
+    }
+
+    /* make a duplicate set of indices for shuffling */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        pind[i] = re->ind[i];
+    }
+
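+    /* restart the counter-based RNG at this step so that all replicas independently draw an identical random sequence for this attempt */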
+    rng.restart( step, 0 );
+
+    /* PLUMED */
+    int plumed_test_exchange_pattern=0;
+    /* END PLUMED */
+
+    if (bMultiEx)
+    {
+        /* multiple random switch exchange */
+        int nself = 0;
+
+
+        for (i = 0; i < re->nex + nself; i++)
+        {
+            // For now this is superfluous, but just in case we ever add more
+            // calls in different branches it is safer to always reset the distribution.
+            uniformNreplDist.reset();
+
+            /* randomly select a pair  */
+            /* in theory, could reduce this by identifying only which switches had a non-negligible
+               probability of occurring (log p > -100) and only operating on those switches */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work than useful. */
+            i0 = uniformNreplDist(rng);
+            i1 = uniformNreplDist(rng);
+            if (i0 == i1)
+            {
+                nself++;
+                continue;  /* self-exchange, back up and do it again */
+            }
+
+            a  = re->ind[i0]; /* what are the indices of these states? */
+            b  = re->ind[i1];
+            ap = pind[i0];
+            bp = pind[i1];
+
+            bPrint = FALSE; /* too noisy */
+            /* calculate the energy difference */
+            /* if the code changes to flip the STATES, rather than the configurations,
+               use the commented version of the code */
+            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+            delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
+
+            /* we actually only use the first space in the prob and bEx array,
+               since there are actually many switches between pairs. */
+
+            if (delta <= 0)
+            {
+                /* accepted */
+                prob[0] = 1;
+                bEx[0]  = TRUE;
+            }
+            else
+            {
+                if (delta > c_probabilityCutoff)
+                {
+                    prob[0] = 0;
+                }
+                else
+                {
+                    prob[0] = exp(-delta);
+                }
+                // roll a number to determine if accepted. For now it is superfluous to
+                // reset, but just in case we ever add more calls in different branches
+                // it is safer to always reset the distribution.
+                uniformRealDist.reset();
+                bEx[0] = uniformRealDist(rng) < prob[0];
+            }
+            re->prob_sum[0] += prob[0];
+
+            if (bEx[0])
+            {
+                /* swap the states */
+                tmp      = pind[i0];
+                pind[i0] = pind[i1];
+                pind[i1] = tmp;
+            }
+        }
+        re->nattempt[0]++;  /* keep track of total permutation trials here */
+        print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap);
+    }
+    else
+    {
+        /* standard nearest neighbor replica exchange */
+
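+        /* alternate between the even (m == 0) and odd (m == 1) neighbour pairs on successive exchange intervals */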
+        m = (step / re->nst) % 2;
+        /* PLUMED */
+        if(plumedswitch){
+          int partner=re->repl;
+          plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern);
+          if(plumed_test_exchange_pattern>0){
+            /* the hrex compatibility check belongs here, once the flag has actually been read from PLUMED */
+            if(plumed_hrex) gmx_fatal(FARGS,"hrex not compatible with ad hoc exchange patterns");
+            int *list;
+            snew(list,re->nrepl);
+            plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl));
+            plumed_cmd(plumedmain,"getExchangesList",list);
+            for(i=0; i<re->nrepl; i++) re->ind[i]=list[i];
+            sfree(list);
+          }
+
+          for(i=1; i<re->nrepl; i++) {
+            if (i % 2 != m) continue;
+            a = re->ind[i-1];
+            b = re->ind[i];
+            if(re->repl==a) partner=b;
+            if(re->repl==b) partner=a;
+          }
+          plumed_cmd(plumedmain,"GREX setPartner",&partner);
+          plumed_cmd(plumedmain,"GREX calculate",NULL);
+          plumed_cmd(plumedmain,"GREX shareAllDeltaBias",NULL);
+        }
+        /* END PLUMED */
+        for (i = 1; i < re->nrepl; i++)
+        {
+            a = re->ind[i-1];
+            b = re->ind[i];
+
+            bPrint = (re->repl == a || re->repl == b);
+            if (i % 2 == m)
+            {
+                delta = calc_delta(fplog, bPrint, re, a, b, a, b);
+                /* PLUMED */
+                if(plumedswitch){
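+                  /* fold the PLUMED bias differences of both partners into the exchange criterion, converted to kT via the respective betas */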
+                  real adb,bdb,dplumed;
+                  char buf[300];
+                  sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb);
+                  sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb);
+                  dplumed=adb*re->beta[a]+bdb*re->beta[b];
+                  delta+=dplumed;
+                  if (bPrint)
+                    fprintf(fplog,"dplumed = %10.3e  dE_Term = %10.3e (kT)\n",dplumed,delta);
+                }
+                /* END PLUMED */
+                if (delta <= 0)
+                {
+                    /* accepted */
+                    prob[i] = 1;
+                    bEx[i]  = TRUE;
+                }
+                else
+                {
+                    if (delta > c_probabilityCutoff)
+                    {
+                        prob[i] = 0;
+                    }
+                    else
+                    {
+                        prob[i] = exp(-delta);
+                    }
+                    // roll a number to determine if accepted. For now it is superfluous to
+                    // reset, but just in case we ever add more calls in different branches
+                    // it is safer to always reset the distribution.
+                    uniformRealDist.reset();
+                    bEx[i] = uniformRealDist(rng) < prob[i];
+                }
+                re->prob_sum[i] += prob[i];
+
+                if (bEx[i])
+                {
+                  /* PLUMED */
+                  if(!plumed_test_exchange_pattern) {
+                    /* standard neighbour swapping */
+                    /* swap these two */
+                    tmp       = pind[i-1];
+                    pind[i-1] = pind[i];
+                    pind[i]   = tmp;
+                    re->nexchange[i]++;  /* statistics for back compatibility */
+                  } else {
+                    /* alternative swapping patterns */
+                    tmp       = pind[a];
+                    pind[a]   = pind[b];
+                    pind[b]   = tmp;
+                    re->nexchange[i]++;  /* statistics for back compatibility */
+                  }
+                  /* END PLUMED */
+                }
+            }
+            else
+            {
+                prob[i] = -1;
+                bEx[i]  = FALSE;
+            }
+        }
+        /* print some statistics */
+        print_ind(fplog, "ex", re->nrepl, re->ind, bEx);
+        print_prob(fplog, "pr", re->nrepl, prob);
+        fprintf(fplog, "\n");
+        re->nattempt[m]++;
+    }
+
+    /* PLUMED */
+    if(plumed_test_exchange_pattern>0) {
+      for (i = 0; i < re->nrepl; i++)
+      {
+          re->ind[i] = i;
+      }
+    }
+    /* END PLUMED */
+
+    /* record which moves were made and accepted */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->nmoves[re->ind[i]][pind[i]] += 1;
+        re->nmoves[pind[i]][re->ind[i]] += 1;
+    }
+    fflush(fplog); /* make sure we can see what the last exchange was */
+}
+
+static void
+cyclic_decomposition(const int *destinations,
+                     int      **cyclic,
+                     gmx_bool  *incycle,
+                     const int  nrepl,
+                     int       *nswap)
+{
+
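+    /* Decompose the destination permutation into disjoint cycles; *nswap returns the longest
+       cycle length minus one, i.e. the number of pairwise swap rounds needed to realise the
+       permutation. */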
+    int i, j, c, p;
+    int maxlen = 1;
+    for (i = 0; i < nrepl; i++)
+    {
+        incycle[i] = FALSE;
+    }
+    for (i = 0; i < nrepl; i++)  /* one cycle for each replica */
+    {
+        if (incycle[i])
+        {
+            cyclic[i][0] = -1;
+            continue;
+        }
+        cyclic[i][0] = i;
+        incycle[i]   = TRUE;
+        c            = 1;
+        p            = i;
+        for (j = 0; j < nrepl; j++) /* a cycle can involve at most all nrepl replicas, but we break out as soon as it closes */
+        {
+            p = destinations[p];    /* start permuting */
+            if (p == i)
+            {
+                cyclic[i][c] = -1;
+                if (c > maxlen)
+                {
+                    maxlen = c;
+                }
+                break; /* we've reached the original element, the cycle is complete, and we marked the end. */
+            }
+            else
+            {
+                cyclic[i][c] = p;  /* each permutation gives a new member of the cycle */
+                incycle[p]   = TRUE;
+                c++;
+            }
+        }
+    }
+    *nswap = maxlen - 1;
+
+    if (debug)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Cycle %d:", i);
+            for (j = 0; j < nrepl; j++)
+            {
+                if (cyclic[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", cyclic[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
+
+static void
+compute_exchange_order(int     **cyclic,
+                       int     **order,
+                       const int nrepl,
+                       const int maxswap)
+{
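+    /* Translate the cycle decomposition into per-round swap partners: afterwards, order[i][j]
+       is the replica that i exchanges with in round j, or i itself when it sits that round out. */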
+    int i, j;
+
+    for (j = 0; j < maxswap; j++)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            if (cyclic[i][j+1] >= 0)
+            {
+                order[cyclic[i][j+1]][j] = cyclic[i][j];
+                order[cyclic[i][j]][j]   = cyclic[i][j+1];
+            }
+        }
+        for (i = 0; i < nrepl; i++)
+        {
+            if (order[i][j] < 0)
+            {
+                order[i][j] = i; /* if it's not exchanging, it should stay this round*/
+            }
+        }
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "Replica Exchange Order\n");
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Replica %d:", i);
+            for (j = 0; j < maxswap; j++)
+            {
+                if (order[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", order[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
+
+static void
+prepare_to_do_exchange(struct gmx_repl_ex *re,
+                       const int           replica_id,
+                       int                *maxswap,
+                       gmx_bool           *bThisReplicaExchanged)
+{
+    int i, j;
+    /* Hold the cyclic decomposition of the (multiple) replica
+     * exchange. */
+    gmx_bool bAnyReplicaExchanged = FALSE;
+    *bThisReplicaExchanged = FALSE;
+
+    for (i = 0; i < re->nrepl; i++)
+    {
+        if (re->destinations[i] != re->ind[i])
+        {
+            /* only mark as exchanged if the index has been shuffled */
+            bAnyReplicaExchanged = TRUE;
+            break;
+        }
+    }
+    if (bAnyReplicaExchanged)
+    {
+        /* reinitialize the placeholder arrays */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->cyclic[i][j] = -1;
+                re->order[i][j]  = -1;
+            }
+        }
+
+        /* Identify the cyclic decomposition of the permutation (very
+         * fast if neighbor replica exchange). */
+        cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap);
+
+        /* Now translate the decomposition into a replica exchange
+         * order at each step. */
+        compute_exchange_order(re->cyclic, re->order, re->nrepl, *maxswap);
+
+        /* Did this replica do any exchange at any point? */
+        for (j = 0; j < *maxswap; j++)
+        {
+            if (replica_id != re->order[replica_id][j])
+            {
+                *bThisReplicaExchanged = TRUE;
+                break;
+            }
+        }
+    }
+}
+
+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr,
+                          const gmx_multisim_t *ms, struct gmx_repl_ex *re,
+                          t_state *state, const gmx_enerdata_t *enerd,
+                          t_state *state_local, int64_t step, real time)
+{
+    int j;
+    int replica_id = 0;
+    int exchange_partner;
+    int maxswap = 0;
+    /* Number of rounds of exchanges needed to deal with any multiple
+     * exchanges. */
+    /* Where each replica ends up after the exchange attempt(s). */
+    /* The order in which multiple exchanges will occur. */
+    gmx_bool bThisReplicaExchanged = FALSE;
+
+    /* PLUMED */
+    if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",NULL);
+    /* END PLUMED */
+
+    if (MASTER(cr))
+    {
+        replica_id  = re->repl;
+        test_for_replica_exchange(fplog, ms, re, enerd, det(state_local->box), step, time);
+        prepare_to_do_exchange(re, replica_id, &maxswap, &bThisReplicaExchanged);
+    }
+    /* Do intra-simulation broadcast so all processors belonging to
+     * each simulation know whether they need to participate in
+     * collecting the state. Otherwise, they might as well get on with
+     * the next thing to do. */
+    if (DOMAINDECOMP(cr))
+    {
+#if GMX_MPI
+        MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr),
+                  cr->mpi_comm_mygroup);
+#endif
+    }
+
+    if (bThisReplicaExchanged)
+    {
+        /* Exchange the states */
+        /* Collect the global state on the master node */
+        if (DOMAINDECOMP(cr))
+        {
+            dd_collect_state(cr->dd, state_local, state);
+        }
+        else
+        {
+            copy_state_serial(state_local, state);
+        }
+
+        if (MASTER(cr))
+        {
+            /* There will be only one swap cycle with standard replica
+             * exchange, but there may be multiple swap cycles if we
+             * allow multiple swaps. */
+
+            for (j = 0; j < maxswap; j++)
+            {
+                exchange_partner = re->order[replica_id][j];
+
+                if (exchange_partner != replica_id)
+                {
+                    /* Exchange the global states between the master nodes */
+                    if (debug)
+                    {
+                        fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
+                    }
+                    exchange_state(ms, exchange_partner, state);
+                }
+            }
+            /* For temperature-type replica exchange, we need to scale
+             * the velocities. */
+            if (re->type == ereTEMP || re->type == ereTL)
+            {
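+                /* the incoming velocities were generated at the partner's temperature; scaling by sqrt(T_here/T_partner) adapts them to this replica */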
+                scale_velocities(state->v,
+                                 std::sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]]));
+            }
+
+        }
+
+        /* With domain decomposition the global state is distributed later */
+        if (!DOMAINDECOMP(cr))
+        {
+            /* Copy the global state to the local state data structure */
+            copy_state_serial(state, state_local);
+        }
+    }
+
+    return bThisReplicaExchanged;
+}
+
+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re)
+{
+    int  i;
+
+    fprintf(fplog, "\nReplica exchange statistics\n");
+
+    if (re->nex == 0)
+    {
+        fprintf(fplog, "Repl  %d attempts, %d odd, %d even\n",
+                re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]);
+
+        fprintf(fplog, "Repl  average probabilities:\n");
+        for (i = 1; i < re->nrepl; i++)
+        {
+            if (re->nattempt[i%2] == 0)
+            {
+                re->prob[i] = 0;
+            }
+            else
+            {
+                re->prob[i] =  re->prob_sum[i]/re->nattempt[i%2];
+            }
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        fprintf(fplog, "Repl  number of exchanges:\n");
+        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
+        print_count(fplog, "", re->nrepl, re->nexchange);
+
+        fprintf(fplog, "Repl  average number of exchanges:\n");
+        for (i = 1; i < re->nrepl; i++)
+        {
+            if (re->nattempt[i%2] == 0)
+            {
+                re->prob[i] = 0;
+            }
+            else
+            {
+                re->prob[i] =  (static_cast<real>(re->nexchange[i]))/re->nattempt[i%2];
+            }
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        fprintf(fplog, "\n");
+    }
+    /* print the transition matrix */
+    print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt);
+}
+
+/* PLUMED HREX */
+int replica_exchange_get_repl(const gmx_repl_ex_t re){
+  return re->repl;
+}
+
+int replica_exchange_get_nrepl(const gmx_repl_ex_t re){
+  return re->nrepl;
+}
+/* END PLUMED HREX */
+//! \endcond
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..a633d688c67c1755effb063557635296875bc876
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed
@@ -0,0 +1,1383 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *
+ * \brief Implements the replica exchange routines.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "replicaexchange.h"
+
+#include "config.h"
+
+#include <cmath>
+
+#include <random>
+
+#include "gromacs/domdec/collect.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdrun/multisim.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/enerdata.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/random/threefry.h"
+#include "gromacs/random/uniformintdistribution.h"
+#include "gromacs/random/uniformrealdistribution.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/smalloc.h"
+
+//! Cutoff for the exponent of the exchange probability; moves with delta above this are rejected outright.
+constexpr int c_probabilityCutoff = 100;
+
+/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
+
+//! Rank in the multisimulation
+#define MSRANK(ms, nodeid)  (nodeid)
+
+//! Enum for replica exchange flavours
+enum {
+    ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR
+};
+/*! \brief Strings describing replica exchange flavours.
+ *
+ *  end_single_marker merely notes the end of single variable replica
+ *  exchange. All types higher than it are multiple replica exchange
+ *  methods.
+ *
+ * Eventually, should add 'pressure', 'temperature and pressure',
+ *  'lambda_and_pressure', 'temperature_lambda_pressure'?; Let's wait
+ *  until we feel better about the pressure control methods giving
+ *  exact ensembles.  Right now, we assume constant pressure */
+static const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"};
+
+//! Working data for replica exchange.
+struct gmx_repl_ex
+{
+    //! Replica ID
+    int       repl;
+    //! Total number of replicas
+    int       nrepl;
+    //! Temperature
+    real      temp;
+    //! Replica exchange type from ere enum
+    int       type;
+    //! Quantity, e.g. temperature or lambda; first index is ere, second index is replica ID
+    real    **q;
+    //! Use constant pressure and temperature
+    gmx_bool  bNPT;
+    //! Replica pressures
+    real     *pres;
+    //! Replica indices
+    int      *ind;
+    //! Used for keeping track of all the replica swaps
+    int      *allswaps;
+    //! Replica exchange interval (number of steps)
+    int       nst;
+    //! Number of exchanges per interval
+    int       nex;
+    //! Random seed
+    int       seed;
+    //! Number of even and odd replica change attempts
+    int       nattempt[2];
+    //! Sum of probabilities
+    real     *prob_sum;
+    //! Number of moves between replicas i and j
+    int     **nmoves;
+    //! i-th element of the array is the number of exchanges between replica i-1 and i
+    int      *nexchange;
+
+    /*! \brief Helper arrays for replica exchange; allocated here
+     * so they don't have to be allocated each time */
+    //! \{
+    int      *destinations;
+    int     **cyclic;
+    int     **order;
+    int      *tmpswap;
+    gmx_bool *incycle;
+    gmx_bool *bEx;
+    //! \}
+
+    //! Helper arrays to hold the quantities that are exchanged.
+    //! \{
+    real  *prob;
+    real  *Epot;
+    real  *beta;
+    real  *Vol;
+    real **de;
+    //! \}
+};
+
+// TODO We should add Doxygen here some time.
+//! \cond
+
+static gmx_bool repl_quantity(const gmx_multisim_t *ms,
+                              struct gmx_repl_ex *re, int ere, real q)
+{
+    real    *qall;
+    gmx_bool bDiff;
+    int      s;
+
+    snew(qall, ms->nsim);
+    qall[re->repl] = q;
+    gmx_sum_sim(ms->nsim, qall, ms);
+
+    bDiff = FALSE;
+    for (s = 1; s < ms->nsim; s++)
+    {
+        if (qall[s] != qall[0])
+        {
+            bDiff = TRUE;
+        }
+    }
+
+    if (bDiff)
+    {
+        /* Set the replica exchange type and quantities */
+        re->type = ere;
+
+        snew(re->q[ere], re->nrepl);
+        for (s = 0; s < ms->nsim; s++)
+        {
+            re->q[ere][s] = qall[s];
+        }
+    }
+    sfree(qall);
+    return bDiff;
+}
+
+gmx_repl_ex_t
+init_replica_exchange(FILE                            *fplog,
+                      const gmx_multisim_t            *ms,
+                      int                              numAtomsInSystem,
+                      const t_inputrec                *ir,
+                      const ReplicaExchangeParameters &replExParams)
+{
+    real                pres;
+    int                 i, j;
+    struct gmx_repl_ex *re;
+    gmx_bool            bTemp;
+    gmx_bool            bLambda = FALSE;
+
+    fprintf(fplog, "\nInitializing Replica Exchange\n");
+
+    if (!isMultiSim(ms) || ms->nsim == 1)
+    {
+        gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multidir option of mdrun?");
+    }
+    if (!EI_DYNAMICS(ir->eI))
+    {
+        gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations");
+        /* Note that PAR(cr) is defined by cr->nnodes > 1, which is
+         * distinct from isMultiSim(ms). A multi-simulation only runs
+         * with real MPI parallelism, but this does not imply PAR(cr)
+         * is true!
+         *
+         * Since we are using a dynamical integrator, the only
+         * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are
+         * synonymous. The only way for cr->nnodes > 1 to be true is
+         * if we are using DD. */
+    }
+
+    snew(re, 1);
+
+    re->repl     = ms->sim;
+    re->nrepl    = ms->nsim;
+    snew(re->q, ereENDSINGLE);
+
+    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
+
+    /* We only check that the number of atoms in the systems match.
+     * This, of course, does not guarantee that the systems are the same,
+     * but it does guarantee that we can perform replica exchange.
+     */
+    check_multi_int(fplog, ms, numAtomsInSystem, "the number of atoms", FALSE);
+    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
+    check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE);
+    const int nst = replExParams.exchangeInterval;
+    check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst,
+                      "first exchange step: init_step/-replex", FALSE);
+    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
+    check_multi_int(fplog, ms, ir->opts.ngtc,
+                    "the number of temperature coupling groups", FALSE);
+    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
+    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
+    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
+
+    re->temp = ir->opts.ref_t[0];
+    for (i = 1; (i < ir->opts.ngtc); i++)
+    {
+        if (ir->opts.ref_t[i] != re->temp)
+        {
+            fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+            fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+        }
+    }
+
+    re->type = -1;
+    bTemp    = repl_quantity(ms, re, ereTEMP, re->temp);
+    if (ir->efep != efepNO)
+    {
+        bLambda = repl_quantity(ms, re, ereLAMBDA, static_cast<real>(ir->fepvals->init_fep_state));
+    }
+    if (re->type == -1)  /* nothing was assigned */
+    {
+        gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl);
+    }
+    if (bLambda && bTemp)
+    {
+        re->type = ereTL;
+    }
+
+    if (bTemp)
+    {
+        please_cite(fplog, "Sugita1999a");
+        if (ir->epc != epcNO)
+        {
+            re->bNPT = TRUE;
+            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
+            please_cite(fplog, "Okabe2001a");
+        }
+        if (ir->etc == etcBERENDSEN)
+        {
+            gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
+                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
+        }
+    }
+    if (bLambda)
+    {
+        if (ir->fepvals->delta_lambda != 0)   /* check this? */
+        {
+            gmx_fatal(FARGS, "delta_lambda is not zero");
+        }
+    }
+    if (re->bNPT)
+    {
+        snew(re->pres, re->nrepl);
+        if (ir->epct == epctSURFACETENSION)
+        {
+            pres = ir->ref_p[ZZ][ZZ];
+        }
+        else
+        {
+            pres = 0;
+            j    = 0;
+            for (i = 0; i < DIM; i++)
+            {
+                if (ir->compress[i][i] != 0)
+                {
+                    pres += ir->ref_p[i][i];
+                    j++;
+                }
+            }
+            pres /= j;
+        }
+        re->pres[re->repl] = pres;
+        gmx_sum_sim(re->nrepl, re->pres, ms);
+    }
+
+    /* Make an index for increasing replica order */
+    /* This only makes sense if one quantity or the other is varying, not both!
+       If both are varying, we trust the order the user gave. */
+    snew(re->ind, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->ind[i] = i;
+    }
+
+    if (re->type < ereENDSINGLE)
+    {
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = i+1; j < re->nrepl; j++)
+            {
+                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
+                {
+                    /* Unordered replicas are supposed to work, but there
+                     * is still an issue somewhere.
+                     * Note that at this point we still have re->ind[i] == i.
+                     */
+                    gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s",
+                              i, j,
+                              erename[re->type],
+                              re->q[re->type][i], re->q[re->type][j],
+                              erename[re->type]);
+                }
+                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
+                {
+                    gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
+                }
+            }
+        }
+    }
+
+    /* keep track of all the swaps, starting with the initial placement. */
+    snew(re->allswaps, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->allswaps[i] = re->ind[i];
+    }
+
+    switch (re->type)
+    {
+        case ereTEMP:
+            fprintf(fplog, "\nReplica exchange in temperature\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereLAMBDA:
+            fprintf(fplog, "\nReplica exchange in lambda\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %3d", static_cast<int>(re->q[re->type][re->ind[i]]));
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereTL:
+            fprintf(fplog, "\nReplica exchange in temperature and lambda state\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5d", static_cast<int>(re->q[ereLAMBDA][re->ind[i]]));
+            }
+            fprintf(fplog, "\n");
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (re->bNPT)
+    {
+        fprintf(fplog, "\nRepl  p");
+        for (i = 0; i < re->nrepl; i++)
+        {
+            fprintf(fplog, " %5.2f", re->pres[re->ind[i]]);
+        }
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]]))
+            {
+                fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+                fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+            }
+        }
+    }
+    re->nst = nst;
+    if (replExParams.randomSeed == -1)
+    {
+        if (isMasterSim(ms))
+        {
+            re->seed = static_cast<int>(gmx::makeRandomSeed());
+        }
+        else
+        {
+            re->seed = 0;
+        }
+        gmx_sumi_sim(1, &(re->seed), ms);
+    }
+    else
+    {
+        re->seed = replExParams.randomSeed;
+    }
+    fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst);
+    fprintf(fplog, "\nReplica random seed: %d\n", re->seed);
+
+    re->nattempt[0] = 0;
+    re->nattempt[1] = 0;
+
+    snew(re->prob_sum, re->nrepl);
+    snew(re->nexchange, re->nrepl);
+    snew(re->nmoves, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->nmoves[i], re->nrepl);
+    }
+    fprintf(fplog, "Replica exchange information below: ex and x = exchange, pr = probability\n");
+
+    /* generate space for the helper functions so we don't have to snew each time */
+
+    snew(re->destinations, re->nrepl);
+    snew(re->incycle, re->nrepl);
+    snew(re->tmpswap, re->nrepl);
+    snew(re->cyclic, re->nrepl);
+    snew(re->order, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->cyclic[i], re->nrepl+1);
+        snew(re->order[i], re->nrepl);
+    }
+    /* allocate space for the functions storing the data for the replicas */
+    /* not all of these arrays are needed in all cases, but they don't take
+       up much space, since the max size is nrepl**2 */
+    snew(re->prob, re->nrepl);
+    snew(re->bEx, re->nrepl);
+    snew(re->beta, re->nrepl);
+    snew(re->Vol, re->nrepl);
+    snew(re->Epot, re->nrepl);
+    snew(re->de, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->de[i], re->nrepl);
+    }
+    re->nex = replExParams.numExchanges;
+    return re;
+}
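+
+/* A usage sketch (illustrative, not part of the patch): the
+ * ReplicaExchangeParameters consumed above are filled from the mdrun
+ * command line, e.g.
+ *
+ *   gmx mdrun -multidir sim0 sim1 sim2 sim3 -replex 500 -nex 0 -plumed plumed.dat
+ *
+ * which requests a nearest-neighbor exchange attempt every 500 steps
+ * with a generated random seed. */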
+
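+/* The exchange_* helpers below swap an array pairwise between the master
+ * ranks of two simulations: each partner posts a nonblocking send of its
+ * own data, blocks until the partner's copy arrives, waits for its own
+ * send to complete, and only then overwrites v with the received buffer. */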
+static void exchange_reals(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, real *v, int n)
+{
+    real *buf;
+    int   i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v,  n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+           buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            v[i] = buf[i];
+        }
+        sfree(buf);
+    }
+}
+
+
+static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n)
+{
+    double *buf;
+    int     i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v,  n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+           buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            v[i] = buf[i];
+        }
+        sfree(buf);
+    }
+}
+
+static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n)
+{
+    rvec *buf;
+    int   i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v[0],  n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+           buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            copy_rvec(buf[i], v[i]);
+        }
+        sfree(buf);
+    }
+}
+
+static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state)
+{
+    /* When t_state changes, this code should be updated. */
+    int ngtc, nnhpres;
+    ngtc    = state->ngtc * state->nhchainlength;
+    nnhpres = state->nnhpres* state->nhchainlength;
+    exchange_rvecs(ms, b, state->box, DIM);
+    exchange_rvecs(ms, b, state->box_rel, DIM);
+    exchange_rvecs(ms, b, state->boxv, DIM);
+    exchange_reals(ms, b, &(state->veta), 1);
+    exchange_reals(ms, b, &(state->vol0), 1);
+    exchange_rvecs(ms, b, state->svir_prev, DIM);
+    exchange_rvecs(ms, b, state->fvir_prev, DIM);
+    exchange_rvecs(ms, b, state->pres_prev, DIM);
+    exchange_doubles(ms, b, state->nosehoover_xi.data(), ngtc);
+    exchange_doubles(ms, b, state->nosehoover_vxi.data(), ngtc);
+    exchange_doubles(ms, b, state->nhpres_xi.data(), nnhpres);
+    exchange_doubles(ms, b, state->nhpres_vxi.data(), nnhpres);
+    exchange_doubles(ms, b, state->therm_integral.data(), state->ngtc);
+    exchange_doubles(ms, b, &state->baros_integral, 1);
+    exchange_rvecs(ms, b, state->x.rvec_array(), state->natoms);
+    exchange_rvecs(ms, b, state->v.rvec_array(), state->natoms);
+}
+
+static void copy_state_serial(const t_state *src, t_state *dest)
+{
+    if (dest != src)
+    {
+        /* Currently the local state is always a pointer to the global
+         * in serial, so we should never end up here.
+         * TODO: Implement a (trivial) t_state copy once converted to C++.
+         */
+        GMX_RELEASE_ASSERT(false, "State copying is currently not implemented in replica exchange");
+    }
+}
+
+static void scale_velocities(gmx::ArrayRef<gmx::RVec> velocities, real fac)
+{
+    for (auto &v : velocities)
+    {
+        v *= fac;
+    }
+}
+
+static void print_transition_matrix(FILE *fplog, int n, int **nmoves, const int *nattempt)
+{
+    int   i, j, ntot;
+    float Tprint;
+
+    ntot = nattempt[0] + nattempt[1];
+    fprintf(fplog, "\n");
+    fprintf(fplog, "Repl");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "    ");  /* put the title closer to the center */
+    }
+    fprintf(fplog, "Empirical Transition Matrix\n");
+
+    fprintf(fplog, "Repl");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%8d", (i+1));
+    }
+    fprintf(fplog, "\n");
+
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "Repl");
+        for (j = 0; j < n; j++)
+        {
+            Tprint = 0.0;
+            if (nmoves[i][j] > 0)
+            {
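+                /* nmoves records every accepted move twice, once as i->j
+                   and once as j->i, hence the factor 2 in the normalization */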
+                Tprint = nmoves[i][j]/(2.0*ntot);
+            }
+            fprintf(fplog, "%8.4f", Tprint);
+        }
+        fprintf(fplog, "%3d\n", i);
+    }
+}
+
+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, const gmx_bool *bEx)
+{
+    int i;
+
+    fprintf(fplog, "Repl %2s %2d", leg, ind[0]);
+    for (i = 1; i < n; i++)
+    {
+        fprintf(fplog, " %c %2d", (bEx != nullptr && bEx[i]) ? 'x' : ' ', ind[i]);
+    }
+    fprintf(fplog, "\n");
+}
+
+static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap)
+{
+    int i;
+
+    for (i = 0; i < n; i++)
+    {
+        tmpswap[i] = allswaps[i];
+    }
+    for (i = 0; i < n; i++)
+    {
+        allswaps[i] = tmpswap[pind[i]];
+    }
+
+    fprintf(fplog, "\nAccepted Exchanges:   ");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%d ", pind[i]);
+    }
+    fprintf(fplog, "\n");
+
+    /* the "Order After Exchange" is the state label corresponding to the configuration that
+       started in state listed in order, i.e.
+
+       3 0 1 2
+
+       means that the:
+       configuration starting in simulation 3 is now in simulation 0,
+       configuration starting in simulation 0 is now in simulation 1,
+       configuration starting in simulation 1 is now in simulation 2,
+       configuration starting in simulation 2 is now in simulation 3
+     */
+    fprintf(fplog, "Order After Exchange: ");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%d ", allswaps[i]);
+    }
+    fprintf(fplog, "\n\n");
+}
+
+static void print_prob(FILE *fplog, const char *leg, int n, real *prob)
+{
+    int  i;
+    char buf[8];
+
+    fprintf(fplog, "Repl %2s ", leg);
+    for (i = 1; i < n; i++)
+    {
+        if (prob[i] >= 0)
+        {
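+            /* print without the leading zero, e.g. ".75"; an exact 1 prints as "1.0" */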
+            sprintf(buf, "%4.2f", prob[i]);
+            fprintf(fplog, "  %3s", buf[0] == '1' ? "1.0" : buf+1);
+        }
+        else
+        {
+            fprintf(fplog, "     ");
+        }
+    }
+    fprintf(fplog, "\n");
+}
+
+static void print_count(FILE *fplog, const char *leg, int n, int *count)
+{
+    int i;
+
+    fprintf(fplog, "Repl %2s ", leg);
+    for (i = 1; i < n; i++)
+    {
+        fprintf(fplog, " %4d", count[i]);
+    }
+    fprintf(fplog, "\n");
+}
+
+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp)
+{
+
+    real   ediff, dpV, delta = 0;
+    real  *Epot = re->Epot;
+    real  *Vol  = re->Vol;
+    real **de   = re->de;
+    real  *beta = re->beta;
+
+    /* Two cases: permuted and not.  In all cases, setting ap = a and bp = b
+       reduces to the non-permuted case */
+
+    switch (re->type)
+    {
+        case ereTEMP:
+            /*
+             * Okabe et al. Chem. Phys. Lett. 335 (2001) 435-439
+             */
+            ediff = Epot[b] - Epot[a];
+            delta = -(beta[bp] - beta[ap])*ediff;
+            break;
+        case ereLAMBDA:
+            /* two cases:  when we are permuted, and not.  */
+            /* non-permuted:
+               ediff =  E_new - E_old
+                     =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
+                     =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
+                     =  de[b][a] + de[a][b] */
+
+            /* permuted:
+               ediff =  E_new - E_old
+                     =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
+                     =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - [H_bp(x_b) - H_b(x_b)]
+                     =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
+            /* but, in the current code implementation, we flip configurations, not indices . . .
+               So let's examine that.
+                     =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - [H_b(x_bp) - H_b(x_b)]
+                     =  [H_b(x_ap) - H_a(x_ap)]  + [H_a(x_bp) - H_b(x_bp)]
+                     =  (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
+                     So, if we exchange b<=> bp and a<=> ap, we return to the same result.
+                     So the simple solution is to flip the
+                     position of perturbed and original indices in the tests.
+             */
+
+            ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
+            delta = ediff*beta[a]; /* assume all same temperature in this case */
+            break;
+        case ereTL:
+            /* not permuted:  */
+            /* delta =  reduced E_new - reduced E_old
+                     =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
+                     =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
+                     =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
+                        [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) +
+                        (beta_b - beta_a)(H_a(x_a) - H_b(x_b))
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a)) */
+            /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]); */
+            /* permuted (big breath!) */
+            /*   delta =  reduced E_new - reduced E_old
+                     =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                        (now add and subtract beta_bp H_a(x_a), beta_ap H_a(x_a),
+                         beta_ap H_b(x_b) and beta_bp H_b(x_b), which cancel in pairs)
+                     =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
+                        [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
+                        + (beta_bp - beta_ap) (H_a(x_a) - H_b(x_b))
+                     =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
+                        [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
+                        + (beta_bp - beta_ap) (H_a(x_a) - H_b(x_b))
+                     =  (beta_bp de[bp][a] - beta_ap de[ap][a]) + (beta_ap de[ap][b] - beta_bp de[bp][b])
+                        + (beta_bp - beta_ap) (H_a(x_a) - H_b(x_b))  */
+            delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]);
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (bPrint)
+    {
+        fprintf(fplog, "Repl %d <-> %d  dE_term = %10.3e (kT)\n", a, b, delta);
+    }
+    if (re->bNPT)
+    {
+        /* revisit the calculation for 5.0; there might be some improvements. */
+        dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
+        if (bPrint)
+        {
+            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
+        }
+        delta += dpV;
+    }
+    return delta;
+}
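+
+/* A minimal worked example of the ereTEMP branch above (illustrative only,
+ * not part of the patch), for T_a = 300 K and T_b = 310 K:
+ *
+ *   const real kB    = BOLTZ;                 // kJ/(mol K)
+ *   real       betaA = 1.0/(kB*300.0);
+ *   real       betaB = 1.0/(kB*310.0);
+ *   real       delta = -(betaB - betaA)*(epotB - epotA);
+ *   real       pAcc  = (delta <= 0) ? 1.0 : std::exp(-delta);
+ *
+ * A swap that lowers the combined reduced energy is always accepted;
+ * otherwise it is accepted with the Metropolis probability exp(-delta). */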
+
+static void
+test_for_replica_exchange(FILE                 *fplog,
+                          const gmx_multisim_t *ms,
+                          struct gmx_repl_ex   *re,
+                          const gmx_enerdata_t *enerd,
+                          real                  vol,
+                          int64_t               step,
+                          real                  time)
+{
+    int                                  m, i, j, a, b, ap, bp, i0, i1, tmp;
+    real                                 delta = 0;
+    gmx_bool                             bPrint, bMultiEx;
+    gmx_bool                            *bEx      = re->bEx;
+    real                                *prob     = re->prob;
+    int                                 *pind     = re->destinations; /* permuted index */
+    gmx_bool                             bEpot    = FALSE;
+    gmx_bool                             bDLambda = FALSE;
+    gmx_bool                             bVol     = FALSE;
+    gmx::ThreeFry2x64<64>                rng(re->seed, gmx::RandomDomain::ReplicaExchange);
+    gmx::UniformRealDistribution<real>   uniformRealDist;
+    gmx::UniformIntDistribution<int>     uniformNreplDist(0, re->nrepl-1);
+
+    bMultiEx = (re->nex > 1);  /* multiple exchanges at each state */
+    fprintf(fplog, "Replica exchange at step %" PRId64 " time %.5f\n", step, time);
+
+    if (re->bNPT)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Vol[i] = 0;
+        }
+        bVol               = TRUE;
+        re->Vol[re->repl]  = vol;
+    }
+    if ((re->type == ereTEMP || re->type == ereTL))
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Epot[i] = 0;
+        }
+        bEpot              = TRUE;
+        re->Epot[re->repl] = enerd->term[F_EPOT];
+        /* temperatures of the different states */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ);
+        }
+    }
+    else
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->temp*BOLTZ);  /* we have a single temperature */
+        }
+    }
+    if (re->type == ereLAMBDA || re->type == ereTL)
+    {
+        bDLambda = TRUE;
+        /* lambda differences. */
+        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
+           minus the energy of the jth simulation in the jth Hamiltonian */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->de[i][j] = 0;
+            }
+        }
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->de[i][re->repl] = (enerd->enerpart_lambda[static_cast<int>(re->q[ereLAMBDA][i])+1]-enerd->enerpart_lambda[0]);
+        }
+    }
+
+    /* now actually do the communication */
+    if (bVol)
+    {
+        gmx_sum_sim(re->nrepl, re->Vol, ms);
+    }
+    if (bEpot)
+    {
+        gmx_sum_sim(re->nrepl, re->Epot, ms);
+    }
+    if (bDLambda)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            gmx_sum_sim(re->nrepl, re->de[i], ms);
+        }
+    }
+
+    /* make a duplicate set of indices for shuffling */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        pind[i] = re->ind[i];
+    }
+
+    rng.restart(step, 0);
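+    // ThreeFry is a counter-based generator: since all replicas share
+    // re->seed and restart the stream at the current step, every rank
+    // draws the identical random sequence for this exchange attempt
+    // without any extra communication.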
+
+    if (bMultiEx)
+    {
+        /* multiple random switch exchange */
+        int nself = 0;
+
+
+        for (i = 0; i < re->nex + nself; i++)
+        {
+            // For now this is superfluous, but just in case we ever add more
+            // calls in different branches it is safer to always reset the distribution.
+            uniformNreplDist.reset();
+
+            /* randomly select a pair */
+            /* in theory, we could reduce this by identifying only those switches with a non-negligible
+               probability of occurring (log p > -100) and only operating on those switches */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work than it is worth. */
+            i0 = uniformNreplDist(rng);
+            i1 = uniformNreplDist(rng);
+            if (i0 == i1)
+            {
+                nself++;
+                continue;  /* self-exchange, back up and do it again */
+            }
+
+            a  = re->ind[i0]; /* what are the indices of these states? */
+            b  = re->ind[i1];
+            ap = pind[i0];
+            bp = pind[i1];
+
+            bPrint = FALSE; /* too noisy */
+            /* calculate the energy difference */
+            /* if the code changes to flip the STATES, rather than the configurations,
+               use the commented version of the code */
+            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+            delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
+
+            /* we actually only use the first space in the prob and bEx array,
+               since there are actually many switches between pairs. */
+
+            if (delta <= 0)
+            {
+                /* accepted */
+                prob[0] = 1;
+                bEx[0]  = TRUE;
+            }
+            else
+            {
+                if (delta > c_probabilityCutoff)
+                {
+                    prob[0] = 0;
+                }
+                else
+                {
+                    prob[0] = exp(-delta);
+                }
+                // roll a number to determine if accepted. For now it is superfluous to
+                // reset, but just in case we ever add more calls in different branches
+                // it is safer to always reset the distribution.
+                uniformRealDist.reset();
+                bEx[0] = uniformRealDist(rng) < prob[0];
+            }
+            re->prob_sum[0] += prob[0];
+
+            if (bEx[0])
+            {
+                /* swap the states */
+                tmp      = pind[i0];
+                pind[i0] = pind[i1];
+                pind[i1] = tmp;
+            }
+        }
+        re->nattempt[0]++;  /* keep track of total permutation trials here */
+        print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap);
+    }
+    else
+    {
+        /* standard nearest neighbor replica exchange */
+
+        m = (step / re->nst) % 2;
+        for (i = 1; i < re->nrepl; i++)
+        {
+            a = re->ind[i-1];
+            b = re->ind[i];
+
+            bPrint = (re->repl == a || re->repl == b);
+            if (i % 2 == m)
+            {
+                delta = calc_delta(fplog, bPrint, re, a, b, a, b);
+                if (delta <= 0)
+                {
+                    /* accepted */
+                    prob[i] = 1;
+                    bEx[i]  = TRUE;
+                }
+                else
+                {
+                    if (delta > c_probabilityCutoff)
+                    {
+                        prob[i] = 0;
+                    }
+                    else
+                    {
+                        prob[i] = exp(-delta);
+                    }
+                    // roll a number to determine if accepted. For now it is superfluous to
+                    // reset, but just in case we ever add more calls in different branches
+                    // it is safer to always reset the distribution.
+                    uniformRealDist.reset();
+                    bEx[i] = uniformRealDist(rng) < prob[i];
+                }
+                re->prob_sum[i] += prob[i];
+
+                if (bEx[i])
+                {
+                    /* swap these two */
+                    tmp       = pind[i-1];
+                    pind[i-1] = pind[i];
+                    pind[i]   = tmp;
+                    re->nexchange[i]++;  /* statistics for back compatibility */
+                }
+            }
+            else
+            {
+                prob[i] = -1;
+                bEx[i]  = FALSE;
+            }
+        }
+        /* print some statistics */
+        print_ind(fplog, "ex", re->nrepl, re->ind, bEx);
+        print_prob(fplog, "pr", re->nrepl, prob);
+        fprintf(fplog, "\n");
+        re->nattempt[m]++;
+    }
+
+    /* record which moves were made and accepted */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->nmoves[re->ind[i]][pind[i]] += 1;
+        re->nmoves[pind[i]][re->ind[i]] += 1;
+    }
+    fflush(fplog); /* make sure we can see what the last exchange was */
+}
+
+static void
+cyclic_decomposition(const int *destinations,
+                     int      **cyclic,
+                     gmx_bool  *incycle,
+                     const int  nrepl,
+                     int       *nswap)
+{
+
+    int i, j, c, p;
+    int maxlen = 1;
+    for (i = 0; i < nrepl; i++)
+    {
+        incycle[i] = FALSE;
+    }
+    for (i = 0; i < nrepl; i++)  /* one cycle for each replica */
+    {
+        if (incycle[i])
+        {
+            cyclic[i][0] = -1;
+            continue;
+        }
+        cyclic[i][0] = i;
+        incycle[i]   = TRUE;
+        c            = 1;
+        p            = i;
+        for (j = 0; j < nrepl; j++) /* a cycle can contain at most nrepl members; we break once it closes */
+        {
+            p = destinations[p];    /* start permuting */
+            if (p == i)
+            {
+                cyclic[i][c] = -1;
+                if (c > maxlen)
+                {
+                    maxlen = c;
+                }
+                break; /* we've reached the original element, the cycle is complete, and we marked the end. */
+            }
+            else
+            {
+                cyclic[i][c] = p;  /* each permutation gives a new member of the cycle */
+                incycle[p]   = TRUE;
+                c++;
+            }
+        }
+    }
+    *nswap = maxlen - 1;
+
+    if (debug)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Cycle %d:", i);
+            for (j = 0; j < nrepl; j++)
+            {
+                if (cyclic[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", cyclic[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
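+
+/* Worked example (illustrative): for destinations = {1, 0, 3, 2} the loop
+ * above finds the two 2-cycles (0 1) and (2 3):
+ *
+ *   cyclic[0] = { 0, 1, -1, ... }    cyclic[2] = { 2, 3, -1, ... }
+ *   cyclic[1] = { -1, ... }          cyclic[3] = { -1, ... }
+ *
+ * so maxlen = 2 and *nswap = 1: one round of pairwise swaps suffices. */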
+
+static void
+compute_exchange_order(int     **cyclic,
+                       int     **order,
+                       const int nrepl,
+                       const int maxswap)
+{
+    int i, j;
+
+    for (j = 0; j < maxswap; j++)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            if (cyclic[i][j+1] >= 0)
+            {
+                order[cyclic[i][j+1]][j] = cyclic[i][j];
+                order[cyclic[i][j]][j]   = cyclic[i][j+1];
+            }
+        }
+        for (i = 0; i < nrepl; i++)
+        {
+            if (order[i][j] < 0)
+            {
+                order[i][j] = i; /* if it's not exchanging, it should stay put this round */
+            }
+        }
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "Replica Exchange Order\n");
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Replica %d:", i);
+            for (j = 0; j < maxswap; j++)
+            {
+                if (order[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", order[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
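+
+/* Continuing the example above with nrepl = 4 and maxswap = 1, the cycles
+ * (0 1) and (2 3) translate into order[i][0] = {1, 0, 3, 2}: in the single
+ * swap round replica 0 exchanges state with replica 1, and 2 with 3. */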
+
+static void
+prepare_to_do_exchange(struct gmx_repl_ex *re,
+                       const int           replica_id,
+                       int                *maxswap,
+                       gmx_bool           *bThisReplicaExchanged)
+{
+    int i, j;
+    /* Hold the cyclic decomposition of the (multiple) replica
+     * exchange. */
+    gmx_bool bAnyReplicaExchanged = FALSE;
+    *bThisReplicaExchanged = FALSE;
+
+    for (i = 0; i < re->nrepl; i++)
+    {
+        if (re->destinations[i] != re->ind[i])
+        {
+            /* only mark as exchanged if the index has been shuffled */
+            bAnyReplicaExchanged = TRUE;
+            break;
+        }
+    }
+    if (bAnyReplicaExchanged)
+    {
+        /* reinitialize the placeholder arrays */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->cyclic[i][j] = -1;
+                re->order[i][j]  = -1;
+            }
+        }
+
+        /* Identify the cyclic decomposition of the permutation (very
+         * fast if neighbor replica exchange). */
+        cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap);
+
+        /* Now translate the decomposition into a replica exchange
+         * order at each step. */
+        compute_exchange_order(re->cyclic, re->order, re->nrepl, *maxswap);
+
+        /* Did this replica do any exchange at any point? */
+        for (j = 0; j < *maxswap; j++)
+        {
+            if (replica_id != re->order[replica_id][j])
+            {
+                *bThisReplicaExchanged = TRUE;
+                break;
+            }
+        }
+    }
+}
+
+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr,
+                          const gmx_multisim_t *ms, struct gmx_repl_ex *re,
+                          t_state *state, const gmx_enerdata_t *enerd,
+                          t_state *state_local, int64_t step, real time)
+{
+    int j;
+    int replica_id = 0;
+    int exchange_partner;
+    int maxswap = 0;
+    /* Number of rounds of exchanges needed to deal with any multiple
+     * exchanges; where each replica ends up and the order in which the
+     * exchanges occur are kept in re->destinations and re->order. */
+    gmx_bool bThisReplicaExchanged = FALSE;
+
+    if (MASTER(cr))
+    {
+        replica_id  = re->repl;
+        test_for_replica_exchange(fplog, ms, re, enerd, det(state_local->box), step, time);
+        prepare_to_do_exchange(re, replica_id, &maxswap, &bThisReplicaExchanged);
+    }
+    /* Do intra-simulation broadcast so all processors belonging to
+     * each simulation know whether they need to participate in
+     * collecting the state. Otherwise, they might as well get on with
+     * the next thing to do. */
+    if (DOMAINDECOMP(cr))
+    {
+#if GMX_MPI
+        MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr),
+                  cr->mpi_comm_mygroup);
+#endif
+    }
+
+    if (bThisReplicaExchanged)
+    {
+        /* Exchange the states */
+        /* Collect the global state on the master node */
+        if (DOMAINDECOMP(cr))
+        {
+            dd_collect_state(cr->dd, state_local, state);
+        }
+        else
+        {
+            copy_state_serial(state_local, state);
+        }
+
+        if (MASTER(cr))
+        {
+            /* There will be only one swap cycle with standard replica
+             * exchange, but there may be multiple swap cycles if we
+             * allow multiple swaps. */
+
+            for (j = 0; j < maxswap; j++)
+            {
+                exchange_partner = re->order[replica_id][j];
+
+                if (exchange_partner != replica_id)
+                {
+                    /* Exchange the global states between the master nodes */
+                    if (debug)
+                    {
+                        fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
+                    }
+                    exchange_state(ms, exchange_partner, state);
+                }
+            }
+            /* For temperature-type replica exchange, we need to scale
+             * the velocities. */
+            if (re->type == ereTEMP || re->type == ereTL)
+            {
+                scale_velocities(state->v,
+                                 std::sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]]));
+            }
+
+        }
+
+        /* With domain decomposition the global state is distributed later */
+        if (!DOMAINDECOMP(cr))
+        {
+            /* Copy the global state to the local state data structure */
+            copy_state_serial(state, state_local);
+        }
+    }
+
+    return bThisReplicaExchanged;
+}
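+
+/* A sketch (names are illustrative) of how the MD loop is expected to drive
+ * this routine every re->nst steps:
+ *
+ *   if (bDoReplEx)   // step > 0 && (step % repl_ex->nst == 0)
+ *   {
+ *       bExchanged = replica_exchange(fplog, cr, ms, repl_ex,
+ *                                     state_global, enerd, state, step, t);
+ *   }
+ *   if (bExchanged && DOMAINDECOMP(cr))
+ *   {
+ *       // redistribute the newly received global state over the ranks
+ *       dd_partition_system(...);
+ *   }
+ */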
+
+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re)
+{
+    int  i;
+
+    fprintf(fplog, "\nReplica exchange statistics\n");
+
+    if (re->nex == 0)
+    {
+        fprintf(fplog, "Repl  %d attempts, %d odd, %d even\n",
+                re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]);
+
+        fprintf(fplog, "Repl  average probabilities:\n");
+        for (i = 1; i < re->nrepl; i++)
+        {
+            if (re->nattempt[i%2] == 0)
+            {
+                re->prob[i] = 0;
+            }
+            else
+            {
+                re->prob[i] =  re->prob_sum[i]/re->nattempt[i%2];
+            }
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        fprintf(fplog, "Repl  number of exchanges:\n");
+        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
+        print_count(fplog, "", re->nrepl, re->nexchange);
+
+        fprintf(fplog, "Repl  average number of exchanges:\n");
+        for (i = 1; i < re->nrepl; i++)
+        {
+            if (re->nattempt[i%2] == 0)
+            {
+                re->prob[i] = 0;
+            }
+            else
+            {
+                re->prob[i] =  (static_cast<real>(re->nexchange[i]))/re->nattempt[i%2];
+            }
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        fprintf(fplog, "\n");
+    }
+    /* print the transition matrix */
+    print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt);
+}
+
+//! \endcond
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.h b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.h
new file mode 100644
index 0000000000000000000000000000000000000000..bff7c22ff9ac47719ec2a283253792f5dfec4e1d
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.h
@@ -0,0 +1,116 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief Declares the routines for replica exchange.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_mdrun
+ */
+#ifndef GMX_MDRUN_REPLICAEXCHANGE_H
+#define GMX_MDRUN_REPLICAEXCHANGE_H
+
+#include <cstdio>
+
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/real.h"
+
+struct gmx_enerdata_t;
+struct gmx_multisim_t;
+struct t_commrec;
+struct t_inputrec;
+class t_state;
+
+/*! \libinternal
+ * \brief The parameters for the replica exchange algorithm. */
+struct ReplicaExchangeParameters
+{
+    //! Interval in steps at which to attempt exchanges, 0 means no replica exchange.
+    int exchangeInterval = 0;
+    //! The number of exchanges to attempt at an exchange step.
+    int numExchanges = 0;
+    //! The random seed, -1 means generate a seed.
+    int randomSeed = -1;
+};
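+
+/* For example (illustrative), "mdrun -replex 500 -nex 300 -reseed 1993"
+ * is expected to correspond to exchangeInterval = 500, numExchanges = 300
+ * and randomSeed = 1993. */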
+
+//! Abstract type for replica exchange
+typedef struct gmx_repl_ex *gmx_repl_ex_t;
+
+/*! \brief Setup function.
+ *
+ * Should only be called on the master ranks */
+gmx_repl_ex_t
+init_replica_exchange(FILE                            *fplog,
+                      const gmx_multisim_t            *ms,
+                      int                              numAtomsInSystem,
+                      const t_inputrec                *ir,
+                      const ReplicaExchangeParameters &replExParams);
+
+/*! \brief Attempts replica exchange.
+ *
+ * Should be called on all ranks.  When running each replica in
+ * parallel, this routine collects the state on the master rank before
+ * exchange.  With domain decomposition, the global state after
+ * exchange is stored in state and still needs to be redistributed
+ * over the ranks.
+ *
+ * \returns TRUE if the state has been exchanged.
+ */
+gmx_bool replica_exchange(FILE *fplog,
+                          const t_commrec *cr,
+                          const gmx_multisim_t *ms,
+                          gmx_repl_ex_t re,
+                          t_state *state, const gmx_enerdata_t *enerd,
+                          t_state *state_local,
+                          int64_t step, real time);
+
+/*! \brief Prints replica exchange statistics to the log file.
+ *
+ * Should only be called on the master ranks */
+void print_replica_exchange_statistics(FILE *fplog, gmx_repl_ex_t re);
+
+/* PLUMED HREX */
+extern int replica_exchange_get_repl(const gmx_repl_ex_t re);
+extern int replica_exchange_get_nrepl(const gmx_repl_ex_t re);
+extern void pd_collect_state(const t_commrec *cr, t_state *state);
+extern void exchange_state(const gmx_multisim_t *ms, int b, t_state *state);
+extern void copy_state_serial(const t_state *src, t_state *dest);
+/* END PLUMED HREX */
+
+#endif
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.h.preplumed b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.h.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..8f4211febe707fc6d474d2a5f840c2a2c79ca0de
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/replicaexchange.h.preplumed
@@ -0,0 +1,108 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief Declares the routines for replica exchange.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_mdrun
+ */
+#ifndef GMX_MDRUN_REPLICAEXCHANGE_H
+#define GMX_MDRUN_REPLICAEXCHANGE_H
+
+#include <cstdio>
+
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/real.h"
+
+struct gmx_enerdata_t;
+struct gmx_multisim_t;
+struct t_commrec;
+struct t_inputrec;
+class t_state;
+
+/*! \libinternal
+ * \brief The parameters for the replica exchange algorithm. */
+struct ReplicaExchangeParameters
+{
+    //! Interval in steps at which to attempt exchanges, 0 means no replica exchange.
+    int exchangeInterval = 0;
+    //! The number of exchanges to attempt at an exchange step.
+    int numExchanges = 0;
+    //! The random seed, -1 means generate a seed.
+    int randomSeed = -1;
+};
+
+//! Abstract type for replica exchange
+typedef struct gmx_repl_ex *gmx_repl_ex_t;
+
+/*! \brief Setup function.
+ *
+ * Should only be called on the master ranks */
+gmx_repl_ex_t
+init_replica_exchange(FILE                            *fplog,
+                      const gmx_multisim_t            *ms,
+                      int                              numAtomsInSystem,
+                      const t_inputrec                *ir,
+                      const ReplicaExchangeParameters &replExParams);
+
+/*! \brief Attempts replica exchange.
+ *
+ * Should be called on all ranks.  When running each replica in
+ * parallel, this routine collects the state on the master rank before
+ * exchange.  With domain decomposition, the global state after
+ * exchange is stored in state and still needs to be redistributed
+ * over the ranks.
+ *
+ * \returns TRUE if the state has been exchanged.
+ */
+gmx_bool replica_exchange(FILE *fplog,
+                          const t_commrec *cr,
+                          const gmx_multisim_t *ms,
+                          gmx_repl_ex_t re,
+                          t_state *state, const gmx_enerdata_t *enerd,
+                          t_state *state_local,
+                          int64_t step, real time);
+
+/*! \brief Prints replica exchange statistics to the log file.
+ *
+ * Should only be called on the master ranks */
+void print_replica_exchange_statistics(FILE *fplog, gmx_repl_ex_t re);
+
+#endif
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/runner.cpp b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/runner.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7d206adf79a142aeb3bddc42b5e8e69c81dff63b
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/runner.cpp
@@ -0,0 +1,1956 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the MD runner routine calling all integrators.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "runner.h"
+
+#include "config.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <csignal>
+#include <cstdlib>
+#include <cstring>
+
+#include <algorithm>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/compat/make_unique.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/localatomsetmanager.h"
+#include "gromacs/ewald/ewald-utils.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-program.h"
+#include "gromacs/fileio/checkpoint.h"
+#include "gromacs/fileio/gmxfio.h"
+#include "gromacs/fileio/oenv.h"
+#include "gromacs/fileio/tpxio.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/clfftinitializer.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/hardware/printhardware.h"
+#include "gromacs/listed-forces/disre.h"
+#include "gromacs/listed-forces/gpubonded.h"
+#include "gromacs/listed-forces/orires.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/boxdeformation.h"
+#include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/makeconstraints.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/membed.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/nbnxn_tuning.h"
+#include "gromacs/mdlib/ppforceworkload.h"
+#include "gromacs/mdlib/qmmm.h"
+#include "gromacs/mdlib/sighandler.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/stophandler.h"
+#include "gromacs/mdrun/legacymdrunoptions.h"
+#include "gromacs/mdrun/logging.h"
+#include "gromacs/mdrun/multisim.h"
+#include "gromacs/mdrun/simulationcontext.h"
+#include "gromacs/mdrunutility/mdmodules.h"
+#include "gromacs/mdrunutility/threadaffinity.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/observableshistory.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/output.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
+#include "gromacs/restraint/manager.h"
+#include "gromacs/restraint/restraintmdmodule.h"
+#include "gromacs/restraint/restraintpotential.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/taskassignment/decidegpuusage.h"
+#include "gromacs/taskassignment/resourcedivision.h"
+#include "gromacs/taskassignment/taskassignment.h"
+#include "gromacs/taskassignment/usergpuids.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/basenetwork.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/filestream.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/logger.h"
+#include "gromacs/utility/loggerbuilder.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
+#include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/programcontext.h"
+#include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/stringutil.h"
+
+#include "integrator.h"
+#include "replicaexchange.h"
+
+#if GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
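+/* plumedswitch is set when mdrun is invoked with the -plumed option and
+ * plumedmain holds the global PLUMED handle used by the MD loop; both are
+ * defined elsewhere in the patched sources. */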
+/* END PLUMED */
+namespace gmx
+{
+
+/*! \brief Barrier for safe simultaneous thread access to mdrunner data
+ *
+ * Used to ensure that the master thread does not modify mdrunner during copy
+ * on the spawned threads. */
+static void threadMpiMdrunnerAccessBarrier()
+{
+#if GMX_THREAD_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+}
+
+Mdrunner Mdrunner::cloneOnSpawnedThread() const
+{
+    auto newRunner = Mdrunner();
+
+    // All runners in the same process share a restraint manager resource because it is
+    // part of the interface to the client code, which is associated only with the
+    // original thread. Handles to the same resources can be obtained by copy.
+    {
+        newRunner.restraintManager_ = compat::make_unique<RestraintManager>(*restraintManager_);
+    }
+
+    // Copy original cr pointer before master thread can pass the thread barrier
+    newRunner.cr  = reinitialize_commrec_for_this_thread(cr);
+
+    // Copy members of master runner.
+    // \todo Replace with builder when Simulation context and/or runner phases are better defined.
+    // Ref https://redmine.gromacs.org/issues/2587 and https://redmine.gromacs.org/issues/2375
+    newRunner.hw_opt    = hw_opt;
+    newRunner.filenames = filenames;
+
+    newRunner.oenv                = oenv;
+    newRunner.mdrunOptions        = mdrunOptions;
+    newRunner.domdecOptions       = domdecOptions;
+    newRunner.nbpu_opt            = nbpu_opt;
+    newRunner.pme_opt             = pme_opt;
+    newRunner.pme_fft_opt         = pme_fft_opt;
+    newRunner.bonded_opt          = bonded_opt;
+    newRunner.nstlist_cmdline     = nstlist_cmdline;
+    newRunner.replExParams        = replExParams;
+    newRunner.pforce              = pforce;
+    newRunner.ms                  = ms;
+    newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
+
+    threadMpiMdrunnerAccessBarrier();
+
+    GMX_RELEASE_ASSERT(!MASTER(newRunner.cr), "cloneOnSpawnedThread should only be called on spawned threads");
+
+    return newRunner;
+}
+
+/*! \brief The callback used for running on spawned threads.
+ *
+ * Obtains the pointer to the master mdrunner object from the one
+ * argument permitted to the thread-launch API call, copies it to make
+ * a new runner for this thread, reinitializes necessary data, and
+ * proceeds to the simulation. */
+static void mdrunner_start_fn(const void *arg)
+{
+    try
+    {
+        auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner *>(arg);
+        /* copy the arg list to make sure that it's thread-local. This
+           doesn't copy pointed-to items, of course; fnm, cr and fplog
+           are reset in the call below, all others should be const. */
+        gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
+        mdrunner.mdrunner();
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+}
+
+
+/*! \brief Start thread-MPI threads.
+ *
+ * Called by mdrunner() to start a specific number of threads
+ * (including the main thread) for thread-parallel runs. This in turn
+ * calls mdrunner() for each thread. All options are the same as for
+ * mdrunner(). */
+t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) const
+{
+
+    /* first check whether we even need to start tMPI */
+    if (numThreadsToLaunch < 2)
+    {
+        return cr;
+    }
+
+#if GMX_THREAD_MPI
+    /* Now spawn new threads that start mdrunner_start_fn() while the main
+       thread returns. We set thread affinity later. */
+    if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE,
+                     mdrunner_start_fn, static_cast<const void*>(this)) != TMPI_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
+    }
+
+    threadMpiMdrunnerAccessBarrier();
+#else
+    GMX_UNUSED_VALUE(mdrunner_start_fn);
+#endif
+
+    return reinitialize_commrec_for_this_thread(cr);
+}
+
+}  // namespace gmx
+
+/*! \brief Initialize variables for Verlet scheme simulation */
+static void prepare_verlet_scheme(FILE                           *fplog,
+                                  t_commrec                      *cr,
+                                  t_inputrec                     *ir,
+                                  int                             nstlist_cmdline,
+                                  const gmx_mtop_t               *mtop,
+                                  const matrix                    box,
+                                  bool                            makeGpuPairList,
+                                  const gmx::CpuInfo             &cpuinfo)
+{
+    /* For NVE simulations, we will retain the initial list buffer */
+    if (EI_DYNAMICS(ir->eI) &&
+        ir->verletbuf_tol > 0 &&
+        !(EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        /* Update the Verlet buffer size for the current run setup */
+
+        /* Here we assume SIMD-enabled kernels are being used. But as
+         * calc_verlet_buffer_size currently gives the same results for 4x8
+         * and 4x4, and 4x2 gives a larger buffer than 4x4, this is OK.
+         */
+        ListSetupType      listType  = (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
+        VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
+
+        real               rlist_new;
+        calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &listSetup, nullptr, &rlist_new);
+
+        if (rlist_new != ir->rlist)
+        {
+            if (fplog != nullptr)
+            {
+                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+                        ir->rlist, rlist_new,
+                        listSetup.cluster_size_i, listSetup.cluster_size_j);
+            }
+            ir->rlist     = rlist_new;
+        }
+    }
+
+    if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
+    {
+        gmx_fatal(FARGS, "Can not set nstlist without %s",
+                  !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
+    }
+
+    if (EI_DYNAMICS(ir->eI))
+    {
+        /* Set or try nstlist values */
+        increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
+    }
+}
+
+/*! \brief Override the nsteps value in inputrec
+ *
+ * with the value passed on the command line (if any)
+ */
+static void override_nsteps_cmdline(const gmx::MDLogger &mdlog,
+                                    int64_t              nsteps_cmdline,
+                                    t_inputrec          *ir)
+{
+    assert(ir);
+
+    /* override with anything other than the default of -2 */
+    if (nsteps_cmdline > -2)
+    {
+        char sbuf_steps[STEPSTRSIZE];
+        char sbuf_msg[STRLEN];
+
+        ir->nsteps = nsteps_cmdline;
+        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps),
+                    fabs(nsteps_cmdline*ir->delta_t));
+        }
+        else
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps));
+        }
+
+        GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg);
+    }
+    else if (nsteps_cmdline < -2)
+    {
+        gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64,
+                  nsteps_cmdline);
+    }
+    /* Do nothing if nsteps_cmdline == -2 */
+}
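+/* For example, "gmx mdrun -nsteps 1000" replaces the nsteps value from the
+ * .tpr file with 1000 and logs the override; the default of -2 leaves the
+ * .tpr value untouched, and values below -2 are rejected as invalid. */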
+
+namespace gmx
+{
+
+/*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings.
+ *
+ * If not, and if a warning may be issued, logs a warning about
+ * falling back to CPU code. With thread-MPI, only the first
+ * call to this function should have \c issueWarning true. */
+static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger   &mdlog,
+                                               const t_inputrec *ir,
+                                               bool              issueWarning)
+{
+    if (ir->opts.ngener - ir->nwall > 1)
+    {
+        /* The GPU code does not support more than one energy group.
+         * If the user requested GPUs explicitly, a fatal error is given later.
+         */
+        if (issueWarning)
+        {
+            GMX_LOG(mdlog.warning).asParagraph()
+                .appendText("Multiple energy groups is not implemented for GPUs, falling back to the CPU. "
+                            "For better performance, run on the GPU without energy groups and then do "
+                            "gmx mdrun -rerun option on the trajectory with an energy group .tpr file.");
+        }
+        return false;
+    }
+    return true;
+}
+
+//! Initializes the logger for mdrun.
+static gmx::LoggerOwner buildLogger(FILE *fplog, const t_commrec *cr)
+{
+    gmx::LoggerBuilder builder;
+    if (fplog != nullptr)
+    {
+        builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog);
+    }
+    if (cr == nullptr || SIMMASTER(cr))
+    {
+        builder.addTargetStream(gmx::MDLogger::LogLevel::Warning,
+                                &gmx::TextOutputFile::standardError());
+    }
+    return builder.build();
+}
+
+//! Make a TaskTarget from an mdrun argument string.
+static TaskTarget findTaskTarget(const char *optionString)
+{
+    TaskTarget returnValue = TaskTarget::Auto;
+
+    if (strcmp(optionString, "auto") == 0)
+    {
+        returnValue = TaskTarget::Auto;
+    }
+    else if (strcmp(optionString, "cpu") == 0)
+    {
+        returnValue = TaskTarget::Cpu;
+    }
+    else if (strcmp(optionString, "gpu") == 0)
+    {
+        returnValue = TaskTarget::Gpu;
+    }
+    else
+    {
+        GMX_ASSERT(false, "Option string should have been checked for sanity already");
+    }
+
+    return returnValue;
+}
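+/* For example, the strings given to mdrun's -nb, -pme, -pmefft and -bonded
+ * options ("auto", "cpu" or "gpu") are mapped here onto the TaskTarget
+ * values consumed by the task-assignment logic below. */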
+
+int Mdrunner::mdrunner()
+{
+    matrix                    box;
+    t_nrnb                   *nrnb;
+    t_forcerec               *fr               = nullptr;
+    t_fcdata                 *fcd              = nullptr;
+    real                      ewaldcoeff_q     = 0;
+    real                      ewaldcoeff_lj    = 0;
+    int                       nChargePerturbed = -1, nTypePerturbed = 0;
+    gmx_wallcycle_t           wcycle;
+    gmx_walltime_accounting_t walltime_accounting = nullptr;
+    int                       rc;
+    int64_t                   reset_counters;
+    int                       nthreads_pme = 1;
+    gmx_membed_t *            membed       = nullptr;
+    gmx_hw_info_t            *hwinfo       = nullptr;
+
+    /* CAUTION: threads may be started later on in this function, so
+       cr doesn't reflect the final parallel state right now */
+    std::unique_ptr<gmx::MDModules> mdModules(new gmx::MDModules);
+    t_inputrec                      inputrecInstance;
+    t_inputrec                     *inputrec = &inputrecInstance;
+    gmx_mtop_t                      mtop;
+
+    bool doMembed = opt2bSet("-membed", filenames.size(), filenames.data());
+    bool doRerun  = mdrunOptions.rerun;
+
+    // Handle task-assignment related user options.
+    EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ?
+                                               EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
+    std::vector<int>    gpuIdsAvailable;
+    try
+    {
+        gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable);
+        // TODO We could put the GPU IDs into a std::map to find
+        // duplicates, but for the small numbers of IDs involved, this
+        // code is simple and fast.
+        for (size_t i = 0; i != gpuIdsAvailable.size(); ++i)
+        {
+            for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j)
+            {
+                if (gpuIdsAvailable[i] == gpuIdsAvailable[j])
+                {
+                    GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str())));
+                }
+            }
+        }
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
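+    /* For example, "-gpu_id 01" makes device IDs 0 and 1 available, while a
+     * string such as "-gpu_id 00" is rejected above as a duplicate ID. */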
+
+    std::vector<int> userGpuTaskAssignment;
+    try
+    {
+        userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+    auto       nonbondedTarget = findTaskTarget(nbpu_opt);
+    auto       pmeTarget       = findTaskTarget(pme_opt);
+    auto       pmeFftTarget    = findTaskTarget(pme_fft_opt);
+    auto       bondedTarget    = findTaskTarget(bonded_opt);
+    PmeRunMode pmeRunMode      = PmeRunMode::None;
+
+    // Here we assume that SIMMASTER(cr) does not change even after the
+    // threads are started.
+
+    FILE *fplog = nullptr;
+    // If we are appending, we don't write log output because we need
+    // to check that the old log file matches what the checkpoint file
+    // expects. Otherwise, we should start to write log output now if
+    // there is a file ready for it.
+    if (logFileHandle != nullptr && !mdrunOptions.continuationOptions.appendFiles)
+    {
+        fplog = gmx_fio_getfp(logFileHandle);
+    }
+    gmx::LoggerOwner logOwner(buildLogger(fplog, cr));
+    gmx::MDLogger    mdlog(logOwner.logger());
+
+    // TODO The thread-MPI master rank makes a working
+    // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
+    // after the threads have been launched. This works because no use
+    // is made of that communicator until after the execution paths
+    // have rejoined. But it is likely that we can improve the way
+    // this is expressed, e.g. by expressly running detection only on the
+    // master rank for thread-MPI, rather than relying on the mutex
+    // and reference count.
+    PhysicalNodeCommunicator physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
+    hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
+
+    gmx_print_detected_hardware(fplog, cr, ms, mdlog, hwinfo);
+
+    std::vector<int> gpuIdsToUse;
+    auto             compatibleGpus = getCompatibleGpus(hwinfo->gpu_info);
+    if (gpuIdsAvailable.empty())
+    {
+        gpuIdsToUse = compatibleGpus;
+    }
+    else
+    {
+        for (const auto &availableGpuId : gpuIdsAvailable)
+        {
+            bool availableGpuIsCompatible = false;
+            for (const auto &compatibleGpuId : compatibleGpus)
+            {
+                if (availableGpuId == compatibleGpuId)
+                {
+                    availableGpuIsCompatible = true;
+                    break;
+                }
+            }
+            if (!availableGpuIsCompatible)
+            {
+                gmx_fatal(FARGS, "You limited the set of compatible GPUs to a set that included ID #%d, but that ID is not for a compatible GPU. List only compatible GPUs.", availableGpuId);
+            }
+            gpuIdsToUse.push_back(availableGpuId);
+        }
+    }
+
+    if (fplog != nullptr)
+    {
+        /* Print references after all software/hardware printing */
+        please_cite(fplog, "Abraham2015");
+        please_cite(fplog, "Pall2015");
+        please_cite(fplog, "Pronk2013");
+        please_cite(fplog, "Hess2008b");
+        please_cite(fplog, "Spoel2005a");
+        please_cite(fplog, "Lindahl2001a");
+        please_cite(fplog, "Berendsen95a");
+        writeSourceDoi(fplog);
+    }
+
+    std::unique_ptr<t_state> globalState;
+
+    if (SIMMASTER(cr))
+    {
+        /* Only the master rank has the global state */
+        globalState = compat::make_unique<t_state>();
+
+        /* Read (nearly) all data required for the simulation */
+        read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()), inputrec, globalState.get(), &mtop);
+
+        /* In rerun, set velocities to zero if present */
+        if (doRerun && ((globalState->flags & (1 << estV)) != 0))
+        {
+            // rerun does not use velocities
+            GMX_LOG(mdlog.info).asParagraph().appendText(
+                    "Rerun trajectory contains velocities. Rerun does only evaluate "
+                    "potential energy and forces. The velocities will be ignored.");
+            for (int i = 0; i < globalState->natoms; i++)
+            {
+                clear_rvec(globalState->v[i]);
+            }
+            globalState->flags &= ~(1 << estV);
+        }
+
+        if (inputrec->cutoff_scheme != ecutsVERLET)
+        {
+            if (nstlist_cmdline > 0)
+            {
+                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
+            }
+
+            if (!compatibleGpus.empty())
+            {
+                GMX_LOG(mdlog.warning).asParagraph().appendText(
+                        "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+                        "      To use a GPU, set the mdp option: cutoff-scheme = Verlet");
+            }
+        }
+    }
+
+    /* Check and update the hardware options for internal consistency */
+    check_and_update_hw_opt_1(mdlog, &hw_opt, cr, domdecOptions.numPmeRanks);
+
+    /* Early check for externally set process affinity. */
+    gmx_check_thread_affinity_set(mdlog, cr,
+                                  &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+
+    if (GMX_THREAD_MPI && SIMMASTER(cr))
+    {
+        if (domdecOptions.numPmeRanks > 0 && hw_opt.nthreads_tmpi <= 0)
+        {
+            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
+        }
+
+        /* Since the master knows the cut-off scheme, update hw_opt for this.
+         * This is done later for normal MPI and also once more with tMPI
+         * for all tMPI ranks.
+         */
+        check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme);
+
+        bool useGpuForNonbonded = false;
+        bool useGpuForPme       = false;
+        try
+        {
+            // If the user specified the number of ranks, then we must
+            // respect that, but in default mode, we need to allow for
+            // the number of GPUs to choose the number of ranks.
+            auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
+            useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi
+                    (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded,
+                    canUseGpuForNonbonded,
+                    inputrec->cutoff_scheme == ecutsVERLET,
+                    gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, GMX_THREAD_MPI),
+                    hw_opt.nthreads_tmpi);
+            useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi
+                    (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment,
+                    *hwinfo, *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
+
+        }
+        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+        /* Determine how many thread-MPI ranks to start.
+         *
+         * TODO Over-writing the user-supplied value here prevents
+         * any possible subsequent checks from working correctly. */
+        hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo,
+                                                &hw_opt,
+                                                gpuIdsToUse,
+                                                useGpuForNonbonded,
+                                                useGpuForPme,
+                                                inputrec, &mtop,
+                                                mdlog,
+                                                doMembed);
+
+        // Now start the threads for thread MPI.
+        cr = spawnThreads(hw_opt.nthreads_tmpi);
+        /* The main thread continues here with a new cr. We don't deallocate
+           the old cr because other threads may still be reading it. */
+        // TODO Both master and spawned threads call dup_tfn and
+        // reinitialize_commrec_for_this_thread. Find a way to express
+        // this better.
+        physicalNodeComm = PhysicalNodeCommunicator(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
+    }
+    // END OF CAUTION: cr and physicalNodeComm are now reliable
+
+    if (PAR(cr))
+    {
+        /* now broadcast everything to the non-master nodes/threads: */
+        init_parallel(cr, inputrec, &mtop);
+    }
+
+    // Now each rank knows the inputrec that SIMMASTER read and used,
+    // and (if applicable) cr->nnodes has been assigned the number of
+    // thread-MPI ranks that have been chosen. The ranks can now all
+    // run the task-deciding functions and will agree on the result
+    // without needing to communicate.
+    //
+    // TODO Should we do the communication in debug mode to support
+    // having an assertion?
+    //
+    // Note that these variables describe only their own node.
+    //
+    // Note that when bonded interactions run on a GPU they always run
+    // alongside a nonbonded task, so do not influence task assignment
+    // even though they affect the force calculation workload.
+    bool useGpuForNonbonded = false;
+    bool useGpuForPme       = false;
+    bool useGpuForBonded    = false;
+    try
+    {
+        // It's possible that there are different numbers of GPUs on
+        // different nodes, which is the user's responsibility to
+        // handle. If unsuitable, we will notice that during task
+        // assignment.
+        bool gpusWereDetected      = hwinfo->ngpu_compatible_tot > 0;
+        bool usingVerletScheme     = inputrec->cutoff_scheme == ecutsVERLET;
+        auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
+        useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment,
+                                                                emulateGpuNonbonded,
+                                                                canUseGpuForNonbonded,
+                                                                usingVerletScheme,
+                                                                gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, !GMX_THREAD_MPI),
+                                                                gpusWereDetected);
+        useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment,
+                                                    *hwinfo, *inputrec, mtop,
+                                                    cr->nnodes, domdecOptions.numPmeRanks,
+                                                    gpusWereDetected);
+        auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
+        useGpuForBonded =
+            decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme, usingVerletScheme,
+                                            bondedTarget, canUseGpuForBonded,
+                                            EVDW_PME(inputrec->vdwtype),
+                                            EEL_PME_EWALD(inputrec->coulombtype),
+                                            domdecOptions.numPmeRanks, gpusWereDetected);
+
+        pmeRunMode   = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU);
+        if (pmeRunMode == PmeRunMode::GPU)
+        {
+            if (pmeFftTarget == TaskTarget::Cpu)
+            {
+                pmeRunMode = PmeRunMode::Mixed;
+            }
+        }
+        else if (pmeFftTarget == TaskTarget::Gpu)
+        {
+            gmx_fatal(FARGS, "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME on CPU you should not be using -pmefft.");
+        }
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
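+    /* The resulting pmeRunMode is thus CPU, GPU, or Mixed; Mixed means PME
+     * runs on the GPU while its FFTs stay on the CPU ("-pmefft cpu"). */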
+
+    // Build restraints.
+    // TODO: hide restraint implementation details from Mdrunner.
+    // There is nothing unique about restraints at this point as far as the
+    // Mdrunner is concerned. The Mdrunner should just be getting a sequence of
+    // factory functions from the SimulationContext on which to call mdModules->add().
+    // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
+    for (auto && restraint : restraintManager_->getRestraints())
+    {
+        auto module = RestraintMDModule::create(restraint,
+                                                restraint->sites());
+        mdModules->add(std::move(module));
+    }
+
+    // TODO: Error handling
+    mdModules->assignOptionsToModules(*inputrec->params, nullptr);
+
+    if (fplog != nullptr)
+    {
+        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+        fprintf(fplog, "\n");
+    }
+
+    if (SIMMASTER(cr))
+    {
+        /* now make sure the state is initialized and propagated */
+        set_state_entries(globalState.get(), inputrec);
+    }
+
+    /* NM and TPI parallelize over force/energy calculations, not atoms,
+     * so we need to initialize and broadcast the global state.
+     */
+    if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
+    {
+        if (!MASTER(cr))
+        {
+            globalState = compat::make_unique<t_state>();
+        }
+        broadcastStateWithoutDynamics(cr, globalState.get());
+    }
+
+    /* A parallel command line option consistency check that we can
+       only do after any threads have started. */
+    if (!PAR(cr) && (domdecOptions.numCells[XX] > 1 ||
+                     domdecOptions.numCells[YY] > 1 ||
+                     domdecOptions.numCells[ZZ] > 1 ||
+                     domdecOptions.numPmeRanks > 0))
+    {
+        gmx_fatal(FARGS,
+                  "The -dd or -npme option request a parallel simulation, "
+#if !GMX_MPI
+                  "but %s was compiled without threads or MPI enabled", output_env_get_program_display_name(oenv));
+#else
+#if GMX_THREAD_MPI
+                  "but the number of MPI-threads (option -ntmpi) is not set or is 1");
+#else
+                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec", output_env_get_program_display_name(oenv));
+#endif
+#endif
+    }
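+    /* Note that the gmx_fatal() call above is assembled from a common prefix
+     * plus one preprocessor-selected continuation, so exactly one of the
+     * three branches completes the argument list. */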
+
+    if (doRerun &&
+        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+    {
+        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+    }
+
+    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
+    }
+
+    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
+    {
+        if (domdecOptions.numPmeRanks > 0)
+        {
+            gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
+                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
+        }
+
+        domdecOptions.numPmeRanks = 0;
+    }
+
+    if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
+    {
+        /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
+         * improve performance with many threads per GPU, since our OpenMP
+         * scaling is bad, but it's difficult to automate the setup.
+         */
+        domdecOptions.numPmeRanks = 0;
+    }
+    if (useGpuForPme)
+    {
+        if (domdecOptions.numPmeRanks < 0)
+        {
+            domdecOptions.numPmeRanks = 0;
+            // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
+        }
+        else
+        {
+            GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported");
+        }
+    }
+
+#if GMX_FAHCORE
+    if (MASTER(cr))
+    {
+        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+    }
+#endif
+
+    /* NMR restraints must be initialized before load_checkpoint,
+     * since with time averaging the history is added to t_state.
+     * For a proper consistency check we therefore need to extend
+     * t_state here.
+     * So the PME-only nodes (if present) will also initialize
+     * the distance restraints.
+     */
+    snew(fcd, 1);
+
+    /* This needs to be called before read_checkpoint to extend the state */
+    init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0);
+
+    init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires));
+
+    auto                 deform = prepareBoxDeformation(globalState->box, cr, *inputrec);
+
+    ObservablesHistory   observablesHistory = {};
+
+    ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions;
+
+    if (continuationOptions.startedFromCheckpoint)
+    {
+        /* Check if checkpoint file exists before doing continuation.
+         * This way we can use identical input options for the first and subsequent runs...
+         */
+        gmx_bool bReadEkin;
+
+        load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
+                        logFileHandle,
+                        cr, domdecOptions.numCells,
+                        inputrec, globalState.get(),
+                        &bReadEkin, &observablesHistory,
+                        continuationOptions.appendFiles,
+                        continuationOptions.appendFilesOptionSet,
+                        mdrunOptions.reproducible);
+
+        if (bReadEkin)
+        {
+            continuationOptions.haveReadEkin = true;
+        }
+
+        if (continuationOptions.appendFiles && logFileHandle)
+        {
+            // Now we can start normal logging to the truncated log file.
+            fplog    = gmx_fio_getfp(logFileHandle);
+            prepareLogAppending(fplog);
+            logOwner = buildLogger(fplog, cr);
+            mdlog    = logOwner.logger();
+        }
+    }
+
+    if (mdrunOptions.numStepsCommandline > -2)
+    {
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -nsteps functionality is deprecated, and may be removed in a future version. "
+                       "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field.");
+    }
+    /* override nsteps with the value set on the command line */
+    override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec);
+
+    if (SIMMASTER(cr))
+    {
+        copy_mat(globalState->box, box);
+    }
+
+    if (PAR(cr))
+    {
+        gmx_bcast(sizeof(box), box, cr);
+    }
+
+    /* Update rlist and nstlist. */
+    if (inputrec->cutoff_scheme == ecutsVERLET)
+    {
+        prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
+                              useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo);
+    }
+
+    LocalAtomSetManager atomSets;
+
+    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
+                     inputrec->eI == eiNM))
+    {
+        cr->dd = init_domain_decomposition(mdlog, cr, domdecOptions, mdrunOptions,
+                                           &mtop, inputrec,
+                                           box, positionsFromStatePointer(globalState.get()),
+                                           &atomSets);
+        // Note that local state still does not exist yet.
+    }
+    else
+    {
+        /* PME, if used, is done on all nodes with 1D decomposition */
+        cr->npmenodes = 0;
+        cr->duty      = (DUTY_PP | DUTY_PME);
+
+        if (inputrec->ePBC == epbcSCREW)
+        {
+            gmx_fatal(FARGS,
+                      "pbc=screw is only implemented with domain decomposition");
+        }
+    }
+
+    if (PAR(cr))
+    {
+        /* After possible communicator splitting in make_dd_communicators.
+         * we can set up the intra/inter node communication.
+         */
+        gmx_setup_nodecomm(fplog, cr);
+    }
+
+#if GMX_MPI
+    if (isMultiSim(ms))
+    {
+        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                "This is simulation %d out of %d running as a composite GROMACS\n"
+                "multi-simulation job. Setup for this simulation:\n",
+                ms->sim, ms->nsim);
+    }
+    GMX_LOG(mdlog.warning).appendTextFormatted(
+            "Using %d MPI %s\n",
+            cr->nnodes,
+#if GMX_THREAD_MPI
+            cr->nnodes == 1 ? "thread" : "threads"
+#else
+            cr->nnodes == 1 ? "process" : "processes"
+#endif
+            );
+    fflush(stderr);
+#endif
+
+    /* Check and update hw_opt for the cut-off scheme */
+    check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme);
+
+    /* Check and update the number of OpenMP threads requested */
+    checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
+                                            pmeRunMode, mtop);
+
+    gmx_omp_nthreads_init(mdlog, cr,
+                          hwinfo->nthreads_hw_avail,
+                          physicalNodeComm.size_,
+                          hw_opt.nthreads_omp,
+                          hw_opt.nthreads_omp_pme,
+                          !thisRankHasDuty(cr, DUTY_PP),
+                          inputrec->cutoff_scheme == ecutsVERLET);
+
+    // Enable FP exception detection for the Verlet scheme, but not in
+    // Release mode and not for compilers with known buggy FP
+    // exception support (clang with any optimization) or suspected
+    // buggy FP exception support (gcc 7.* with optimization).
+#if !defined NDEBUG && \
+    !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
+    && defined __OPTIMIZE__)
+    const bool bEnableFPE = inputrec->cutoff_scheme == ecutsVERLET;
+#else
+    const bool bEnableFPE = false;
+#endif
+    //FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
+    if (bEnableFPE)
+    {
+        gmx_feenableexcept();
+    }
+
+    // Build a data structure that expresses which kinds of non-bonded
+    // task are handled by this rank.
+    //
+    // TODO Later, this might become a loop over all registered modules
+    // relevant to the mdp inputs, to find those that have such tasks.
+    //
+    // TODO This could move before init_domain_decomposition() as part
+    // of refactoring that separates the responsibility for duty
+    // assignment from setup for communication between tasks, and
+    // setup for tasks handled with a domain (ie including short-ranged
+    // tasks, bonded tasks, etc.).
+    //
+    // Note that in general useGpuForNonbonded, etc. can have a value
+    // that is inconsistent with the presence of actual GPUs on any
+    // rank, and that is not known to be a problem until the
+    // duty of the ranks on a node become known.
+    //
+    // TODO Later we might need the concept of computeTasksOnThisRank,
+    // from which we construct gpuTasksOnThisRank.
+    //
+    // Currently the DD code assigns duty to ranks that can
+    // include PP work that currently can be executed on a single
+    // GPU, if present and compatible.  This has to be coordinated
+    // across PP ranks on a node, with possible multiple devices
+    // or sharing devices on a node, either from the user
+    // selection, or automatically.
+    auto                 haveGpus = !gpuIdsToUse.empty();
+    std::vector<GpuTask> gpuTasksOnThisRank;
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        if (useGpuForNonbonded)
+        {
+            // Note that any bonded tasks on a GPU always accompany a
+            // non-bonded task.
+            if (haveGpus)
+            {
+                gpuTasksOnThisRank.push_back(GpuTask::Nonbonded);
+            }
+            else if (nonbondedTarget == TaskTarget::Gpu)
+            {
+                gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because there is none detected.");
+            }
+            else if (bondedTarget == TaskTarget::Gpu)
+            {
+                gmx_fatal(FARGS, "Cannot run bonded interactions on a GPU because there is none detected.");
+            }
+        }
+    }
+    // TODO cr->duty & DUTY_PME should imply that a PME algorithm is active, but currently does not.
+    if (EEL_PME(inputrec->coulombtype) && (thisRankHasDuty(cr, DUTY_PME)))
+    {
+        if (useGpuForPme)
+        {
+            if (haveGpus)
+            {
+                gpuTasksOnThisRank.push_back(GpuTask::Pme);
+            }
+            else if (pmeTarget == TaskTarget::Gpu)
+            {
+                gmx_fatal(FARGS, "Cannot run PME on a GPU because there is none detected.");
+            }
+        }
+    }
+
+    GpuTaskAssignment gpuTaskAssignment;
+    try
+    {
+        // Produce the task assignment for this rank.
+        gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
+                                              mdlog, cr, ms, physicalNodeComm, gpuTasksOnThisRank,
+                                              useGpuForBonded, pmeRunMode);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+    /* Prevent other ranks from continuing after an issue was found
+     * and reported as a fatal error.
+     *
+     * TODO This function implements a barrier so that MPI runtimes
+     * can organize an orderly shutdown if one of the ranks has had to
+     * issue a fatal error in various code already run. When we have
+     * MPI-aware error handling and reporting, this should be
+     * improved. */
+#if GMX_MPI
+    if (PAR(cr))
+    {
+        MPI_Barrier(cr->mpi_comm_mysim);
+    }
+    if (isMultiSim(ms))
+    {
+        if (SIMMASTER(cr))
+        {
+            MPI_Barrier(ms->mpi_comm_masters);
+        }
+        /* We need another barrier to prevent non-master ranks from continuing
+         * when an error occurred in a different simulation.
+         */
+        MPI_Barrier(cr->mpi_comm_mysim);
+    }
+#endif
+
+    /* Now that we know the setup is consistent, check for efficiency */
+    check_resource_division_efficiency(hwinfo, !gpuTaskAssignment.empty(), mdrunOptions.ntompOptionIsSet,
+                                       cr, mdlog);
+
+    gmx_device_info_t *nonbondedDeviceInfo = nullptr;
+
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        // This works because only one task of each type is currently permitted.
+        auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
+                                             hasTaskType<GpuTask::Nonbonded>);
+        if (nbGpuTaskMapping != gpuTaskAssignment.end())
+        {
+            int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
+            nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
+            init_gpu(nonbondedDeviceInfo);
+
+            if (DOMAINDECOMP(cr))
+            {
+                /* When we share GPUs over ranks, we need to know this for the DLB */
+                dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+            }
+
+        }
+    }
+
+    std::unique_ptr<ClfftInitializer> initializedClfftLibrary;
+
+    gmx_device_info_t                *pmeDeviceInfo = nullptr;
+    // Later, this program could contain kernels that get re-used as
+    // auto-tuning progresses, or as subsequent simulations are invoked.
+    PmeGpuProgramStorage pmeGpuProgram;
+    // This works because only one task of each type is currently permitted.
+    auto                 pmeGpuTaskMapping     = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
+    const bool           thisRankHasPmeGpuTask = (pmeGpuTaskMapping != gpuTaskAssignment.end());
+    if (thisRankHasPmeGpuTask)
+    {
+        pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
+        init_gpu(pmeDeviceInfo);
+        pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
+        // TODO It would be nice to move this logic into the factory
+        // function. See Redmine #2535.
+        bool isMasterThread = !GMX_THREAD_MPI || MASTER(cr);
+        if (pmeRunMode == PmeRunMode::GPU && !initializedClfftLibrary && isMasterThread)
+        {
+            initializedClfftLibrary = initializeClfftLibrary();
+        }
+    }
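+    /* With thread-MPI all ranks share one process, so the clFFT library
+     * (used here for PME FFTs, presumably on OpenCL builds) needs to be
+     * initialized only once, by the master thread. */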
+
+    /* Get the number of PP/PME threads. The PME environment variable should
+       be read on only one node, to make sure it is identical everywhere.
+     */
+    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+    /* threads on this MPI process or TMPI thread */
+    int numThreadsOnThisRank;
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        numThreadsOnThisRank = gmx_omp_nthreads_get(emntNonbonded);
+    }
+    else
+    {
+        numThreadsOnThisRank = nthreads_pme;
+    }
+
+    checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid,
+                                  *hwinfo->hardwareTopology,
+                                  physicalNodeComm, mdlog);
+
+    if (hw_opt.thread_affinity != threadaffOFF)
+    {
+        /* Before setting affinity, check whether the affinity has changed
+         * (which indicates that probably the OpenMP library has changed it
+         * since we first checked).
+         */
+        gmx_check_thread_affinity_set(mdlog, cr,
+                                      &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+        int numThreadsOnThisNode, intraNodeThreadOffset;
+        analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
+                                 &intraNodeThreadOffset);
+
+        /* Set the CPU affinity */
+        gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology,
+                                numThreadsOnThisRank, numThreadsOnThisNode,
+                                intraNodeThreadOffset, nullptr);
+    }
+
+    if (mdrunOptions.timingOptions.resetStep > -1)
+    {
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -resetstep functionality is deprecated, and may be removed in a future version.");
+    }
+    wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
+
+    if (PAR(cr))
+    {
+        /* Master synchronizes its value of reset_counters with all nodes
+         * including PME-only nodes */
+        reset_counters = wcycle_get_reset_counters(wcycle);
+        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+        wcycle_set_reset_counters(wcycle, reset_counters);
+    }
+
+    // Membrane embedding must be initialized before we call init_forcerec()
+    if (doMembed)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Initializing membed");
+        }
+        /* Note that membed cannot work in parallel because mtop is
+         * changed here. Fix this if we ever want to make it run with
+         * multiple ranks. */
+        membed = init_membed(fplog, filenames.size(), filenames.data(), &mtop, inputrec, globalState.get(), cr,
+                             &mdrunOptions
+                                 .checkpointOptions.period);
+    }
+
+    std::unique_ptr<MDAtoms>     mdAtoms;
+    std::unique_ptr<gmx_vsite_t> vsite;
+
+    snew(nrnb, 1);
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        /* Initiate forcerecord */
+        fr                 = mk_forcerec();
+        fr->forceProviders = mdModules->initForceProviders();
+        init_forcerec(fplog, mdlog, fr, fcd,
+                      inputrec, &mtop, cr, box,
+                      opt2fn("-table", filenames.size(), filenames.data()),
+                      opt2fn("-tablep", filenames.size(), filenames.data()),
+                      opt2fns("-tableb", filenames.size(), filenames.data()),
+                      *hwinfo, nonbondedDeviceInfo,
+                      useGpuForBonded,
+                      FALSE,
+                      pforce);
+
+        /* Initialize the mdAtoms structure.
+         * mdAtoms is not filled with atom data,
+         * as this can not be done now with domain decomposition.
+         */
+        mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask);
+        if (globalState && thisRankHasPmeGpuTask)
+        {
+            // The pinning of coordinates in the global state object works, because we only use
+            // PME on GPU without DD or on a separate PME rank, and because the local state pointer
+            // points to the global state object without DD.
+            // FIXME: MD and EM separately set up the local state - this should happen in the same function,
+            // which should also perform the pinning.
+            changePinningPolicy(&globalState->x, pme_get_pinning_policy());
+        }
+
+        /* Initialize the virtual site communication */
+        vsite = initVsite(mtop, cr);
+
+        calc_shifts(box, fr->shift_vec);
+
+        /* With periodic molecules the charge groups should be whole at start up
+         * and the virtual sites should not be far from their proper positions.
+         */
+        if (!inputrec->bContinuation && MASTER(cr) &&
+            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        {
+            /* Make molecules whole at start of run */
+            if (fr->ePBC != epbcNONE)
+            {
+                do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array());
+            }
+            if (vsite)
+            {
+                /* Correct initial vsite positions are required
+                 * for the initial distribution in the domain decomposition
+                 * and for the initial shell prediction.
+                 */
+                constructVsitesGlobal(mtop, globalState->x);
+            }
+        }
+
+        if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
+        {
+            ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
+            ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
+        }
+    }
+    else
+    {
+        /* This is a PME-only node */
+
+        GMX_ASSERT(globalState == nullptr, "We don't need the state on a PME-only rank and expect it to be uninitialized");
+
+        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
+        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
+    }
+
+    gmx_pme_t *sepPmeData = nullptr;
+    // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
+    GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr), "Double-checking that only PME-only ranks have no forcerec");
+    gmx_pme_t * &pmedata = fr ? fr->pmedata : sepPmeData;
+
+    /* Initiate PME if necessary,
+     * either on all nodes or on dedicated PME nodes only. */
+    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
+    {
+        if (mdAtoms && mdAtoms->mdatoms())
+        {
+            nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
+            if (EVDW_PME(inputrec->vdwtype))
+            {
+                nTypePerturbed   = mdAtoms->mdatoms()->nTypePerturbed;
+            }
+        }
+        if (cr->npmenodes > 0)
+        {
+            /* The PME-only nodes need to know nChargePerturbed (FEP on Q) and nTypePerturbed (FEP on LJ) */
+            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+        }
+
+        if (thisRankHasDuty(cr, DUTY_PME))
+        {
+            try
+            {
+                pmedata = gmx_pme_init(cr,
+                                       getNumPmeDomains(cr->dd),
+                                       inputrec,
+                                       mtop.natoms, nChargePerturbed != 0, nTypePerturbed != 0,
+                                       mdrunOptions.reproducible,
+                                       ewaldcoeff_q, ewaldcoeff_lj,
+                                       nthreads_pme,
+                                       pmeRunMode, nullptr,
+                                       pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
+            }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+        }
+    }
+
+
+    if (EI_DYNAMICS(inputrec->eI))
+    {
+        /* Turn on signal handling on all nodes */
+        /*
+         * A user signal from the PME nodes (if any)
+         * is communicated to the PP nodes.
+         */
+        signal_handler_install();
+    }
+
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        /* Assumes uniform use of the number of OpenMP threads */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+        if (inputrec->bPull)
+        {
+            /* Initialize pull code */
+            inputrec->pull_work =
+                init_pull(fplog, inputrec->pull, inputrec,
+                          &mtop, cr, &atomSets, inputrec->fepvals->init_lambda);
+            if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
+            {
+                initPullHistory(inputrec->pull_work, &observablesHistory);
+            }
+            if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
+            {
+                init_pull_output_files(inputrec->pull_work,
+                                       filenames.size(), filenames.data(), oenv,
+                                       continuationOptions);
+            }
+        }
+
+        std::unique_ptr<EnforcedRotation> enforcedRotation;
+        if (inputrec->bRot)
+        {
+            /* Initialize enforced rotation code */
+            enforcedRotation = init_rot(fplog,
+                                        inputrec,
+                                        filenames.size(),
+                                        filenames.data(),
+                                        cr,
+                                        &atomSets,
+                                        globalState.get(),
+                                        &mtop,
+                                        oenv,
+                                        mdrunOptions);
+        }
+
+        if (inputrec->eSwapCoords != eswapNO)
+        {
+            /* Initialize ion swapping code */
+            init_swapcoords(fplog, inputrec, opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
+                            &mtop, globalState.get(), &observablesHistory,
+                            cr, &atomSets, oenv, mdrunOptions);
+        }
+
+        /* Let makeConstraints know whether we have essential dynamics constraints.
+         * TODO: inputrec should tell us whether we use an algorithm, not a file option or the checkpoint
+         */
+        bool doEssentialDynamics = (opt2fn_null("-ei", filenames.size(), filenames.data()) != nullptr
+                                    || observablesHistory.edsamHistory);
+        auto constr              = makeConstraints(mtop, *inputrec, doEssentialDynamics,
+                                                   fplog, *mdAtoms->mdatoms(),
+                                                   cr, ms, nrnb, wcycle, fr->bMolPBC);
+
+        if (DOMAINDECOMP(cr))
+        {
+            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
+            /* This call is not included in init_domain_decomposition mainly
+             * because fr->cginfo_mb is set later.
+             */
+            dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
+                            domdecOptions.checkBondedInteractions,
+                            fr->cginfo_mb);
+        }
+
+        // TODO This is not the right place to manage the lifetime of
+        // this data structure, but currently it's the easiest way to
+        // make it work. Later, it should probably be made/updated
+        // after the workload for the lifetime of a PP domain is
+        // understood.
+        PpForceWorkload ppForceWorkload;
+
+        GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to integrator.");
+
+        /* PLUMED */
+        if(plumedswitch){
+          /* detect the PLUMED API version */
+          int pversion=0;
+          plumed_cmd(plumedmain,"getApiVersion",&pversion);
+          if(pversion>5) {
+             /* API versions above 5 accept the number of OpenMP threads */
+             int nth = gmx_omp_nthreads_get(emntDefault);
+             plumed_cmd(plumedmain,"setNumOMPthreads",&nth);
+          }
+        }
+        /* END PLUMED */
+
+        /* Now do whatever the user wants us to do (how flexible...) */
+        Integrator integrator {
+            fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(),
+            oenv,
+            mdrunOptions,
+            vsite.get(), constr.get(),
+            enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr,
+            deform.get(),
+            mdModules->outputProvider(),
+            inputrec, &mtop,
+            fcd,
+            globalState.get(),
+            &observablesHistory,
+            mdAtoms.get(), nrnb, wcycle, fr,
+            &ppForceWorkload,
+            replExParams,
+            membed,
+            walltime_accounting,
+            std::move(stopHandlerBuilder_)
+        };
+        integrator.run(inputrec->eI, doRerun);
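+        /* integrator.run() dispatches on inputrec->eI to the matching
+         * dynamics, minimization, normal-mode or TPI loop; with PLUMED
+         * enabled, the forces evaluated there include the PLUMED bias
+         * applied by this patch. */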
+
+        if (inputrec->bPull)
+        {
+            finish_pull(inputrec->pull_work);
+        }
+
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
+        /* do PME only */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+        gmx_pmeonly(pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+
+    /* Finish up and write some stuff.
+     * If rerunMD, don't write the last frame again.
+     */
+    finish_run(fplog, mdlog, cr,
+               inputrec, nrnb, wcycle, walltime_accounting,
+               fr ? fr->nbv : nullptr,
+               pmedata,
+               EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
+
+    // Free PME data
+    if (pmedata)
+    {
+        gmx_pme_destroy(pmedata);
+        pmedata = nullptr;
+    }
+
+    // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
+    // before we destroy the GPU context(s) in free_gpu_resources().
+    // Pinned buffers are associated with contexts in CUDA.
+    // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
+    mdAtoms.reset(nullptr);
+    globalState.reset(nullptr);
+    mdModules.reset(nullptr);   // destruct force providers here as they might also use the GPU
+
+    /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
+    free_gpu_resources(fr, physicalNodeComm);
+    free_gpu(nonbondedDeviceInfo);
+    free_gpu(pmeDeviceInfo);
+    done_forcerec(fr, mtop.molblock.size(), mtop.groups.grps[egcENER].nr);
+    sfree(fcd);
+
+    if (doMembed)
+    {
+        free_membed(membed);
+    }
+
+    gmx_hardware_info_free();
+
+    /* Does what it says */
+    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
+    walltime_accounting_destroy(walltime_accounting);
+    sfree(nrnb);
+
+    /* PLUMED */
+    if(plumedswitch){
+      plumed_finalize(plumedmain);
+    }
+    /* END PLUMED */
+
+    // Ensure log file content is written
+    if (logFileHandle)
+    {
+        gmx_fio_flush(logFileHandle);
+    }
+
+    /* Reset FPEs (important for unit tests) by disabling them. Assumes no
+     * exceptions were enabled before function was called. */
+    if (bEnableFPE)
+    {
+        gmx_fedisableexcept();
+    }
+
+    rc = static_cast<int>(gmx_get_stop_condition());
+
+#if GMX_THREAD_MPI
+    /* we need to join all threads. The sub-threads join when they
+       exit this function, but the master thread needs to be told to
+       wait for that. */
+    if (PAR(cr) && MASTER(cr))
+    {
+        done_commrec(cr);
+        tMPI_Finalize();
+    }
+#endif
+
+    return rc;
+}
+
+Mdrunner::~Mdrunner()
+{
+    // Clean up of the Manager.
+    // This will end up getting called on every thread-MPI rank, which is unnecessary,
+    // but okay as long as threads synchronize some time before adding or accessing
+    // a new set of restraints.
+    if (restraintManager_)
+    {
+        restraintManager_->clear();
+        GMX_ASSERT(restraintManager_->countRestraints() == 0,
+                   "restraints added during runner life time should be cleared at runner destruction.");
+    }
+};
+
+void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller,
+                            std::string                               name)
+{
+    GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager.");
+    // Not sure if this should be logged through the md logger or something else,
+    // but it is helpful to have some sort of INFO level message sent somewhere.
+    //    std::cout << "Registering restraint named " << name << std::endl;
+
+    // When multiple restraints are used, it may be wasteful to register them separately.
+    // Maybe instead register an entire Restraint Manager as a force provider.
+    restraintManager_->addToSpec(std::move(puller),
+                                 std::move(name));
+}
+
+Mdrunner::Mdrunner(Mdrunner &&) noexcept = default;
+
+//NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265
+Mdrunner &Mdrunner::operator=(Mdrunner && /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
+
+class Mdrunner::BuilderImplementation
+{
+    public:
+        BuilderImplementation() = delete;
+        explicit BuilderImplementation(SimulationContext* context);
+        ~BuilderImplementation();
+
+        BuilderImplementation &setExtraMdrunOptions(const MdrunOptions &options,
+                                                    real                forceWarningThreshold);
+
+        void addDomdec(const DomdecOptions &options);
+
+        void addVerletList(int nstlist);
+
+        void addReplicaExchange(const ReplicaExchangeParameters &params);
+
+        void addMultiSim(gmx_multisim_t* multisim);
+
+        void addNonBonded(const char* nbpu_opt);
+
+        void addPME(const char* pme_opt_, const char* pme_fft_opt_);
+
+        void addBondedTaskAssignment(const char* bonded_opt);
+
+        void addHardwareOptions(const gmx_hw_opt_t &hardwareOptions);
+
+        void addFilenames(ArrayRef <const t_filenm> filenames);
+
+        void addOutputEnvironment(gmx_output_env_t* outputEnvironment);
+
+        void addLogFile(t_fileio *logFileHandle);
+
+        void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder);
+
+        Mdrunner build();
+
+    private:
+        // Default parameters copied from runner.h
+        // \todo Clarify source(s) of default parameters.
+
+        const char* nbpu_opt_    = nullptr;
+        const char* pme_opt_     = nullptr;
+        const char* pme_fft_opt_ = nullptr;
+        const char *bonded_opt_  = nullptr;
+
+        MdrunOptions                          mdrunOptions_;
+
+        DomdecOptions                         domdecOptions_;
+
+        ReplicaExchangeParameters             replicaExchangeParameters_;
+
+        //! Command-line override for the duration of a neighbor list with the Verlet scheme.
+        int         nstlist_ = 0;
+
+        //! Non-owning multisim communicator handle.
+        std::unique_ptr<gmx_multisim_t*>      multisim_ = nullptr;
+
+        //! Print a warning if any force is larger than this (in kJ/mol nm).
+        real forceWarningThreshold_ = -1;
+
+        /*! \brief  Non-owning pointer to SimulationContext (owned and managed by client)
+         *
+         * \internal
+         * \todo Establish robust protocol to make sure resources remain valid.
+         * SimulationContext will likely be separated into multiple layers for
+         * different levels of access and different phases of execution. Ref
+         * https://redmine.gromacs.org/issues/2375
+         * https://redmine.gromacs.org/issues/2587
+         */
+        SimulationContext* context_ = nullptr;
+
+        //! \brief Parallelism information.
+        gmx_hw_opt_t hardwareOptions_;
+
+        //! filename options for simulation.
+        ArrayRef<const t_filenm> filenames_;
+
+        /*! \brief Handle to output environment.
+         *
+         * \todo gmx_output_env_t needs lifetime management.
+         */
+        gmx_output_env_t*    outputEnvironment_ = nullptr;
+
+        /*! \brief Non-owning handle to MD log file.
+         *
+         * \todo Context should own output facilities for client.
+         * \todo Improve log file handle management.
+         * \internal
+         * Code managing the FILE* relies on the ability to set it to
+         * nullptr to check whether the filehandle is valid.
+         */
+        t_fileio* logFileHandle_ = nullptr;
+
+        /*!
+         * \brief Builder for simulation stop signal handler.
+         */
+        std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
+};
+
+Mdrunner::BuilderImplementation::BuilderImplementation(SimulationContext* context) :
+    context_(context)
+{
+    GMX_ASSERT(context_, "Bug found. It should not be possible to construct builder without a valid context.");
+}
+
+Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
+
+Mdrunner::BuilderImplementation &
+Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions &options,
+                                                      real                forceWarningThreshold)
+{
+    mdrunOptions_          = options;
+    forceWarningThreshold_ = forceWarningThreshold;
+    return *this;
+}
+
+void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions &options)
+{
+    domdecOptions_ = options;
+}
+
+void Mdrunner::BuilderImplementation::addVerletList(int nstlist)
+{
+    nstlist_ = nstlist;
+}
+
+void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters &params)
+{
+    replicaExchangeParameters_ = params;
+}
+
+void Mdrunner::BuilderImplementation::addMultiSim(gmx_multisim_t* multisim)
+{
+    multisim_ = compat::make_unique<gmx_multisim_t*>(multisim);
+}
+
+Mdrunner Mdrunner::BuilderImplementation::build()
+{
+    auto newRunner = Mdrunner();
+
+    GMX_ASSERT(context_, "Bug found. It should not be possible to call build() without a valid context.");
+
+    newRunner.mdrunOptions    = mdrunOptions_;
+    newRunner.domdecOptions   = domdecOptions_;
+
+    // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid
+    newRunner.hw_opt          = hardwareOptions_;
+
+    // No invariant to check. This parameter exists to optionally override other behavior.
+    newRunner.nstlist_cmdline = nstlist_;
+
+    newRunner.replExParams    = replicaExchangeParameters_;
+
+    newRunner.filenames = filenames_;
+
+    GMX_ASSERT(context_->communicationRecord_, "SimulationContext communications not initialized.");
+    newRunner.cr = context_->communicationRecord_;
+
+    if (multisim_)
+    {
+        // nullptr is a valid value for the multisim handle, so we don't check the pointed-to pointer.
+        newRunner.ms = *multisim_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addMultiSim() is required before build()"));
+    }
+
+    // \todo Clarify ownership and lifetime management for gmx_output_env_t
+    // \todo Update sanity checking when output environment has clearly specified invariants.
+    // Initialization and default values for oenv are not well specified in the current version.
+    if (outputEnvironment_)
+    {
+        newRunner.oenv  = outputEnvironment_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addOutputEnvironment() is required before build()"));
+    }
+
+    newRunner.logFileHandle = logFileHandle_;
+
+    if (nbpu_opt_)
+    {
+        newRunner.nbpu_opt    = nbpu_opt_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()"));
+    }
+
+    if (pme_opt_ && pme_fft_opt_)
+    {
+        newRunner.pme_opt     = pme_opt_;
+        newRunner.pme_fft_opt = pme_fft_opt_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()"));
+    }
+
+    if (bonded_opt_)
+    {
+        newRunner.bonded_opt = bonded_opt_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
+    }
+
+    newRunner.restraintManager_ = compat::make_unique<gmx::RestraintManager>();
+
+    if (stopHandlerBuilder_)
+    {
+        newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_);
+    }
+    else
+    {
+        newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>();
+    }
+
+    return newRunner;
+}
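+
+// As the APIError checks above imply, addMultiSim(), addOutputEnvironment(),
+// addNonBonded(), addPME() and addBondedTaskAssignment() must all be called
+// before build(); the stop handler builder is the only optional dependency,
+// for which a default is constructed here.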
+
+void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
+{
+    nbpu_opt_ = nbpu_opt;
+}
+
+void Mdrunner::BuilderImplementation::addPME(const char* pme_opt,
+                                             const char* pme_fft_opt)
+{
+    pme_opt_     = pme_opt;
+    pme_fft_opt_ = pme_fft_opt;
+}
+
+void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt)
+{
+    bonded_opt_ = bonded_opt;
+}
+
+void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
+{
+    hardwareOptions_ = hardwareOptions;
+}
+
+void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames)
+{
+    filenames_ = filenames;
+}
+
+void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
+{
+    outputEnvironment_ = outputEnvironment;
+}
+
+void Mdrunner::BuilderImplementation::addLogFile(t_fileio *logFileHandle)
+{
+    logFileHandle_ = logFileHandle;
+}
+
+void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
+{
+    stopHandlerBuilder_ = std::move(builder);
+}
+
+MdrunnerBuilder::MdrunnerBuilder(compat::not_null<SimulationContext*> context) :
+    impl_ {gmx::compat::make_unique<Mdrunner::BuilderImplementation>(context)}
+{
+}
+
+MdrunnerBuilder::~MdrunnerBuilder() = default;
+
+MdrunnerBuilder &MdrunnerBuilder::addSimulationMethod(const MdrunOptions &options,
+                                                      real                forceWarningThreshold)
+{
+    impl_->setExtraMdrunOptions(options, forceWarningThreshold);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addDomainDecomposition(const DomdecOptions &options)
+{
+    impl_->addDomdec(options);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addNeighborList(int nstlist)
+{
+    impl_->addVerletList(nstlist);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters &params)
+{
+    impl_->addReplicaExchange(params);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addMultiSim(gmx_multisim_t* multisim)
+{
+    impl_->addMultiSim(multisim);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addNonBonded(const char* nbpu_opt)
+{
+    impl_->addNonBonded(nbpu_opt);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addElectrostatics(const char* pme_opt,
+                                                    const char* pme_fft_opt)
+{
+    // The builder method may become more general in the future, but in this version,
+    // parameters for PME electrostatics are both required and the only parameters
+    // available.
+    if (pme_opt && pme_fft_opt)
+    {
+        impl_->addPME(pme_opt, pme_fft_opt);
+    }
+    else
+    {
+        GMX_THROW(gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers."));
+    }
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt)
+{
+    impl_->addBondedTaskAssignment(bonded_opt);
+    return *this;
+}
+
+Mdrunner MdrunnerBuilder::build()
+{
+    return impl_->build();
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
+{
+    impl_->addHardwareOptions(hardwareOptions);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames)
+{
+    impl_->addFilenames(filenames);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
+{
+    impl_->addOutputEnvironment(outputEnvironment);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addLogFile(t_fileio *logFileHandle)
+{
+    impl_->addLogFile(logFileHandle);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
+{
+    impl_->addStopHandlerBuilder(std::move(builder));
+    return *this;
+}
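+
+// A minimal sketch of how a client (such as the mdrun command-line module)
+// might drive this builder; the option variables are illustrative names:
+//
+//     MdrunnerBuilder builder(compat::not_null<SimulationContext*>(&context));
+//     Mdrunner runner = builder.addSimulationMethod(mdrunOptions, pforce)
+//                              .addDomainDecomposition(domdecOptions)
+//                              .addNonBonded(nbpu_opt)
+//                              .addElectrostatics(pme_opt, pme_fft_opt)
+//                              .addBondedTaskAssignment(bonded_opt)
+//                              .addMultiSim(ms)
+//                              .addHardwareOptions(hw_opt)
+//                              .addFilenames(filenames)
+//                              .addOutputEnvironment(oenv)
+//                              .addLogFile(logFileHandle)
+//                              .build();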
+
+MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder &&) noexcept = default;
+
+MdrunnerBuilder &MdrunnerBuilder::operator=(MdrunnerBuilder &&) noexcept = default;
+
+} // namespace gmx
diff --git a/patches/gromacs-2019.1.diff/src/gromacs/mdrun/runner.cpp.preplumed b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/runner.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..ae158a4984deefd1d61ca9e2a912143db1c75e05
--- /dev/null
+++ b/patches/gromacs-2019.1.diff/src/gromacs/mdrun/runner.cpp.preplumed
@@ -0,0 +1,1932 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the MD runner routine calling all integrators.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "runner.h"
+
+#include "config.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <csignal>
+#include <cstdlib>
+#include <cstring>
+
+#include <algorithm>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/compat/make_unique.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/localatomsetmanager.h"
+#include "gromacs/ewald/ewald-utils.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-program.h"
+#include "gromacs/fileio/checkpoint.h"
+#include "gromacs/fileio/gmxfio.h"
+#include "gromacs/fileio/oenv.h"
+#include "gromacs/fileio/tpxio.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/clfftinitializer.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/hardware/printhardware.h"
+#include "gromacs/listed-forces/disre.h"
+#include "gromacs/listed-forces/gpubonded.h"
+#include "gromacs/listed-forces/orires.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/boxdeformation.h"
+#include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/makeconstraints.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/membed.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/nbnxn_tuning.h"
+#include "gromacs/mdlib/ppforceworkload.h"
+#include "gromacs/mdlib/qmmm.h"
+#include "gromacs/mdlib/sighandler.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/stophandler.h"
+#include "gromacs/mdrun/legacymdrunoptions.h"
+#include "gromacs/mdrun/logging.h"
+#include "gromacs/mdrun/multisim.h"
+#include "gromacs/mdrun/simulationcontext.h"
+#include "gromacs/mdrunutility/mdmodules.h"
+#include "gromacs/mdrunutility/threadaffinity.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/observableshistory.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/output.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
+#include "gromacs/restraint/manager.h"
+#include "gromacs/restraint/restraintmdmodule.h"
+#include "gromacs/restraint/restraintpotential.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/taskassignment/decidegpuusage.h"
+#include "gromacs/taskassignment/resourcedivision.h"
+#include "gromacs/taskassignment/taskassignment.h"
+#include "gromacs/taskassignment/usergpuids.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/basenetwork.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/filestream.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/logger.h"
+#include "gromacs/utility/loggerbuilder.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
+#include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/programcontext.h"
+#include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/stringutil.h"
+
+#include "integrator.h"
+#include "replicaexchange.h"
+
+#if GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+namespace gmx
+{
+
+/*! \brief Barrier for safe simultaneous thread access to mdrunner data
+ *
+ * Used to ensure that the master thread does not modify mdrunner during copy
+ * on the spawned threads. */
+static void threadMpiMdrunnerAccessBarrier()
+{
+#if GMX_THREAD_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+}
+
+Mdrunner Mdrunner::cloneOnSpawnedThread() const
+{
+    auto newRunner = Mdrunner();
+
+    // All runners in the same process share a restraint manager resource because it is
+    // part of the interface to the client code, which is associated only with the
+    // original thread. Handles to the same resources can be obtained by copy.
+    {
+        newRunner.restraintManager_ = compat::make_unique<RestraintManager>(*restraintManager_);
+    }
+
+    // Copy original cr pointer before master thread can pass the thread barrier
+    newRunner.cr  = reinitialize_commrec_for_this_thread(cr);
+
+    // Copy members of master runner.
+    // \todo Replace with builder when Simulation context and/or runner phases are better defined.
+    // Ref https://redmine.gromacs.org/issues/2587 and https://redmine.gromacs.org/issues/2375
+    newRunner.hw_opt    = hw_opt;
+    newRunner.filenames = filenames;
+
+    newRunner.oenv                = oenv;
+    newRunner.mdrunOptions        = mdrunOptions;
+    newRunner.domdecOptions       = domdecOptions;
+    newRunner.nbpu_opt            = nbpu_opt;
+    newRunner.pme_opt             = pme_opt;
+    newRunner.pme_fft_opt         = pme_fft_opt;
+    newRunner.bonded_opt          = bonded_opt;
+    newRunner.nstlist_cmdline     = nstlist_cmdline;
+    newRunner.replExParams        = replExParams;
+    newRunner.pforce              = pforce;
+    newRunner.ms                  = ms;
+    newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
+
+    threadMpiMdrunnerAccessBarrier();
+
+    GMX_RELEASE_ASSERT(!MASTER(newRunner.cr), "cloneOnSpawnedThread should only be called on spawned threads");
+
+    return newRunner;
+}
+
+/*! \brief The callback used for running on spawned threads.
+ *
+ * Obtains the pointer to the master mdrunner object from the one
+ * argument permitted to the thread-launch API call, copies it to make
+ * a new runner for this thread, reinitializes necessary data, and
+ * proceeds to the simulation. */
+static void mdrunner_start_fn(const void *arg)
+{
+    try
+    {
+        auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner *>(arg);
+        /* copy the arg list to make sure that it's thread-local. This
+           doesn't copy pointed-to items, of course; fnm, cr and fplog
+           are reset in the call below, all others should be const. */
+        gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
+        mdrunner.mdrunner();
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+}
+
+
+/*! \brief Start thread-MPI threads.
+ *
+ * Called by mdrunner() to start a specific number of threads
+ * (including the main thread) for thread-parallel runs. This in turn
+ * calls mdrunner() for each thread. All options are the same as for
+ * mdrunner(). */
+t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) const
+{
+
+    /* first check whether we even need to start tMPI */
+    if (numThreadsToLaunch < 2)
+    {
+        return cr;
+    }
+
+#if GMX_THREAD_MPI
+    /* now spawn new threads that start mdrunner_start_fn(), while
+       the main thread returns; we set thread affinity later */
+    if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE,
+                     mdrunner_start_fn, static_cast<const void*>(this)) != TMPI_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
+    }
+
+    threadMpiMdrunnerAccessBarrier();
+#else
+    GMX_UNUSED_VALUE(mdrunner_start_fn);
+#endif
+
+    return reinitialize_commrec_for_this_thread(cr);
+}
+
+}  // namespace gmx
+
+/*! \brief Initialize variables for Verlet scheme simulation */
+static void prepare_verlet_scheme(FILE                           *fplog,
+                                  t_commrec                      *cr,
+                                  t_inputrec                     *ir,
+                                  int                             nstlist_cmdline,
+                                  const gmx_mtop_t               *mtop,
+                                  const matrix                    box,
+                                  bool                            makeGpuPairList,
+                                  const gmx::CpuInfo             &cpuinfo)
+{
+    /* For NVE simulations, we will retain the initial list buffer */
+    if (EI_DYNAMICS(ir->eI) &&
+        ir->verletbuf_tol > 0 &&
+        !(EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        /* Update the Verlet buffer size for the current run setup */
+
+        /* Here we assume SIMD-enabled kernels are being used. But as currently
+         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+         * and 4x2 gives a larger buffer than 4x4, this is ok.
+         */
+        ListSetupType      listType  = (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
+        VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
+
+        real               rlist_new;
+        calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &listSetup, nullptr, &rlist_new);
+
+        if (rlist_new != ir->rlist)
+        {
+            if (fplog != nullptr)
+            {
+                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+                        ir->rlist, rlist_new,
+                        listSetup.cluster_size_i, listSetup.cluster_size_j);
+            }
+            ir->rlist     = rlist_new;
+        }
+    }
+
+    if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
+    {
+        gmx_fatal(FARGS, "Can not set nstlist without %s",
+                  !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
+    }
+
+    if (EI_DYNAMICS(ir->eI))
+    {
+        /* Set or try nstlist values */
+        increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
+    }
+}
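+
+// The .mdp settings that feed the logic above look like the following
+// (values are illustrative; verlet-buffer-tolerance is in kJ/mol/ps, and a
+// non-positive value skips the automatic rlist update, as checked above):
+//
+//     cutoff-scheme           = Verlet
+//     verlet-buffer-tolerance = 0.005
+//     nstlist                 = 10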
+
+/*! \brief Override the nsteps value in inputrec
+ *
+ * with the value passed on the command line (if any)
+ */
+static void override_nsteps_cmdline(const gmx::MDLogger &mdlog,
+                                    int64_t              nsteps_cmdline,
+                                    t_inputrec          *ir)
+{
+    assert(ir);
+
+    /* override with anything other than the default -2 */
+    if (nsteps_cmdline > -2)
+    {
+        char sbuf_steps[STEPSTRSIZE];
+        char sbuf_msg[STRLEN];
+
+        ir->nsteps = nsteps_cmdline;
+        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps),
+                    fabs(nsteps_cmdline*ir->delta_t));
+        }
+        else
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps));
+        }
+
+        GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg);
+    }
+    else if (nsteps_cmdline < -2)
+    {
+        gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64,
+                  nsteps_cmdline);
+    }
+    /* Do nothing if nsteps_cmdline == -2 */
+}
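+
+// In brief, the semantics implemented above for the value passed via
+// mdrun -nsteps are:
+//     nsteps_cmdline == -2  ->  keep nsteps from the .tpr (the default)
+//     nsteps_cmdline == -1  ->  override; for dynamics, no duration in ps
+//                               is reported
+//     nsteps_cmdline >=  0  ->  override with the given number of steps
+//     nsteps_cmdline <  -2  ->  fatal error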
+
+namespace gmx
+{
+
+/*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings.
+ *
+ * If not, and if a warning may be issued, logs a warning about
+ * falling back to CPU code. With thread-MPI, only the first
+ * call to this function should have \c issueWarning true. */
+static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger   &mdlog,
+                                               const t_inputrec *ir,
+                                               bool              issueWarning)
+{
+    if (ir->opts.ngener - ir->nwall > 1)
+    {
+        /* The GPU code does not support more than one energy group.
+         * If the user requested GPUs explicitly, a fatal error is given later.
+         */
+        if (issueWarning)
+        {
+            GMX_LOG(mdlog.warning).asParagraph()
+                .appendText("Multiple energy groups is not implemented for GPUs, falling back to the CPU. "
+                            "For better performance, run on the GPU without energy groups and then do "
+                            "gmx mdrun -rerun option on the trajectory with an energy group .tpr file.");
+        }
+        return false;
+    }
+    return true;
+}
+
+//! Initializes the logger for mdrun.
+static gmx::LoggerOwner buildLogger(FILE *fplog, const t_commrec *cr)
+{
+    gmx::LoggerBuilder builder;
+    if (fplog != nullptr)
+    {
+        builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog);
+    }
+    if (cr == nullptr || SIMMASTER(cr))
+    {
+        builder.addTargetStream(gmx::MDLogger::LogLevel::Warning,
+                                &gmx::TextOutputFile::standardError());
+    }
+    return builder.build();
+}
+
+//! Make a TaskTarget from an mdrun argument string.
+static TaskTarget findTaskTarget(const char *optionString)
+{
+    TaskTarget returnValue = TaskTarget::Auto;
+
+    if (strncmp(optionString, "auto", 3) == 0)
+    {
+        returnValue = TaskTarget::Auto;
+    }
+    else if (strncmp(optionString, "cpu", 3) == 0)
+    {
+        returnValue = TaskTarget::Cpu;
+    }
+    else if (strncmp(optionString, "gpu", 3) == 0)
+    {
+        returnValue = TaskTarget::Gpu;
+    }
+    else
+    {
+        GMX_ASSERT(false, "Option string should have been checked for sanity already");
+    }
+
+    return returnValue;
+}
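+
+// This maps the mdrun task-assignment flags, e.g. (illustrative command line)
+//
+//     gmx mdrun -nb gpu -pme gpu -pmefft cpu -bonded auto
+//
+// Each of -nb, -pme, -pmefft and -bonded accepts "auto", "cpu" or "gpu";
+// the strings are validated before this function is reached, hence the
+// assertion in the final branch.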
+
+int Mdrunner::mdrunner()
+{
+    matrix                    box;
+    t_nrnb                   *nrnb;
+    t_forcerec               *fr               = nullptr;
+    t_fcdata                 *fcd              = nullptr;
+    real                      ewaldcoeff_q     = 0;
+    real                      ewaldcoeff_lj    = 0;
+    int                       nChargePerturbed = -1, nTypePerturbed = 0;
+    gmx_wallcycle_t           wcycle;
+    gmx_walltime_accounting_t walltime_accounting = nullptr;
+    int                       rc;
+    int64_t                   reset_counters;
+    int                       nthreads_pme = 1;
+    gmx_membed_t *            membed       = nullptr;
+    gmx_hw_info_t            *hwinfo       = nullptr;
+
+    /* CAUTION: threads may be started later on in this function, so
+       cr doesn't reflect the final parallel state right now */
+    std::unique_ptr<gmx::MDModules> mdModules(new gmx::MDModules);
+    t_inputrec                      inputrecInstance;
+    t_inputrec                     *inputrec = &inputrecInstance;
+    gmx_mtop_t                      mtop;
+
+    bool doMembed = opt2bSet("-membed", filenames.size(), filenames.data());
+    bool doRerun  = mdrunOptions.rerun;
+
+    // Handle task-assignment related user options.
+    EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ?
+                                               EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
+    std::vector<int>    gpuIdsAvailable;
+    try
+    {
+        gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable);
+        // TODO We could put the GPU IDs into a std::map to find
+        // duplicates, but for the small numbers of IDs involved, this
+        // code is simple and fast.
+        for (size_t i = 0; i != gpuIdsAvailable.size(); ++i)
+        {
+            for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j)
+            {
+                if (gpuIdsAvailable[i] == gpuIdsAvailable[j])
+                {
+                    GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str())));
+                }
+            }
+        }
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+    std::vector<int> userGpuTaskAssignment;
+    try
+    {
+        userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
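+    // For illustration: hw_opt.gpuIdsAvailable comes from mdrun -gpu_id and
+    // restricts which detected devices may be used (e.g. "-gpu_id 01"),
+    // while hw_opt.userGpuTaskAssignment comes from -gputasks and maps this
+    // rank's GPU tasks to device IDs (e.g. "-gputasks 0011" for four GPU
+    // tasks on two devices).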
+    auto       nonbondedTarget = findTaskTarget(nbpu_opt);
+    auto       pmeTarget       = findTaskTarget(pme_opt);
+    auto       pmeFftTarget    = findTaskTarget(pme_fft_opt);
+    auto       bondedTarget    = findTaskTarget(bonded_opt);
+    PmeRunMode pmeRunMode      = PmeRunMode::None;
+
+    // Here we assume that SIMMASTER(cr) does not change even after the
+    // threads are started.
+
+    FILE *fplog = nullptr;
+    // If we are appending, we don't write log output because we need
+    // to check that the old log file matches what the checkpoint file
+    // expects. Otherwise, we should start to write log output now if
+    // there is a file ready for it.
+    if (logFileHandle != nullptr && !mdrunOptions.continuationOptions.appendFiles)
+    {
+        fplog = gmx_fio_getfp(logFileHandle);
+    }
+    gmx::LoggerOwner logOwner(buildLogger(fplog, cr));
+    gmx::MDLogger    mdlog(logOwner.logger());
+
+    // TODO The thread-MPI master rank makes a working
+    // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
+    // after the threads have been launched. This works because no use
+    // is made of that communicator until after the execution paths
+    // have rejoined. But it is likely that we can improve the way
+    // this is expressed, e.g. by expressly running detection only on the
+    // master rank for thread-MPI, rather than relying on the mutex
+    // and reference count.
+    PhysicalNodeCommunicator physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
+    hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
+
+    gmx_print_detected_hardware(fplog, cr, ms, mdlog, hwinfo);
+
+    std::vector<int> gpuIdsToUse;
+    auto             compatibleGpus = getCompatibleGpus(hwinfo->gpu_info);
+    if (gpuIdsAvailable.empty())
+    {
+        gpuIdsToUse = compatibleGpus;
+    }
+    else
+    {
+        for (const auto &availableGpuId : gpuIdsAvailable)
+        {
+            bool availableGpuIsCompatible = false;
+            for (const auto &compatibleGpuId : compatibleGpus)
+            {
+                if (availableGpuId == compatibleGpuId)
+                {
+                    availableGpuIsCompatible = true;
+                    break;
+                }
+            }
+            if (!availableGpuIsCompatible)
+            {
+                gmx_fatal(FARGS, "You limited the set of compatible GPUs to a set that included ID #%d, but that ID is not for a compatible GPU. List only compatible GPUs.", availableGpuId);
+            }
+            gpuIdsToUse.push_back(availableGpuId);
+        }
+    }
+
+    if (fplog != nullptr)
+    {
+        /* Print references after all software/hardware printing */
+        please_cite(fplog, "Abraham2015");
+        please_cite(fplog, "Pall2015");
+        please_cite(fplog, "Pronk2013");
+        please_cite(fplog, "Hess2008b");
+        please_cite(fplog, "Spoel2005a");
+        please_cite(fplog, "Lindahl2001a");
+        please_cite(fplog, "Berendsen95a");
+        writeSourceDoi(fplog);
+    }
+
+    std::unique_ptr<t_state> globalState;
+
+    if (SIMMASTER(cr))
+    {
+        /* Only the master rank has the global state */
+        globalState = compat::make_unique<t_state>();
+
+        /* Read (nearly) all data required for the simulation */
+        read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()), inputrec, globalState.get(), &mtop);
+
+        /* In rerun, set velocities to zero if present */
+        if (doRerun && ((globalState->flags & (1 << estV)) != 0))
+        {
+            // rerun does not use velocities
+            GMX_LOG(mdlog.info).asParagraph().appendText(
+                    "Rerun trajectory contains velocities. Rerun does only evaluate "
+                    "potential energy and forces. The velocities will be ignored.");
+            for (int i = 0; i < globalState->natoms; i++)
+            {
+                clear_rvec(globalState->v[i]);
+            }
+            globalState->flags &= ~(1 << estV);
+        }
+
+        if (inputrec->cutoff_scheme != ecutsVERLET)
+        {
+            if (nstlist_cmdline > 0)
+            {
+                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
+            }
+
+            if (!compatibleGpus.empty())
+            {
+                GMX_LOG(mdlog.warning).asParagraph().appendText(
+                        "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+                        "      To use a GPU, set the mdp option: cutoff-scheme = Verlet");
+            }
+        }
+    }
+
+    /* Check and update the hardware options for internal consistency */
+    check_and_update_hw_opt_1(mdlog, &hw_opt, cr, domdecOptions.numPmeRanks);
+
+    /* Early check for externally set process affinity. */
+    gmx_check_thread_affinity_set(mdlog, cr,
+                                  &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+
+    if (GMX_THREAD_MPI && SIMMASTER(cr))
+    {
+        if (domdecOptions.numPmeRanks > 0 && hw_opt.nthreads_tmpi <= 0)
+        {
+            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
+        }
+
+        /* Since the master knows the cut-off scheme, update hw_opt for this.
+         * This is done later for normal MPI and also once more with tMPI
+         * for all tMPI ranks.
+         */
+        check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme);
+
+        bool useGpuForNonbonded = false;
+        bool useGpuForPme       = false;
+        try
+        {
+            // If the user specified the number of ranks, then we must
+            // respect that, but in default mode, we need to allow for
+            // the number of GPUs to choose the number of ranks.
+            auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
+            useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi
+                    (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded,
+                    canUseGpuForNonbonded,
+                    inputrec->cutoff_scheme == ecutsVERLET,
+                    gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, GMX_THREAD_MPI),
+                    hw_opt.nthreads_tmpi);
+            useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi
+                    (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment,
+                    *hwinfo, *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
+
+        }
+        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+        /* Determine how many thread-MPI ranks to start.
+         *
+         * TODO Over-writing the user-supplied value here does
+         * prevent any possible subsequent checks from working
+         * correctly. */
+        hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo,
+                                                &hw_opt,
+                                                gpuIdsToUse,
+                                                useGpuForNonbonded,
+                                                useGpuForPme,
+                                                inputrec, &mtop,
+                                                mdlog,
+                                                doMembed);
+
+        // Now start the threads for thread MPI.
+        cr = spawnThreads(hw_opt.nthreads_tmpi);
+        /* The main thread continues here with a new cr. We don't deallocate
+           the old cr because other threads may still be reading it. */
+        // TODO Both master and spawned threads call dup_tfn and
+        // reinitialize_commrec_for_this_thread. Find a way to express
+        // this better.
+        physicalNodeComm = PhysicalNodeCommunicator(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
+    }
+    // END OF CAUTION: cr and physicalNodeComm are now reliable
+
+    if (PAR(cr))
+    {
+        /* now broadcast everything to the non-master nodes/threads: */
+        init_parallel(cr, inputrec, &mtop);
+    }
+
+    // Now each rank knows the inputrec that SIMMASTER read and used,
+    // and (if applicable) cr->nnodes has been assigned the number of
+    // thread-MPI ranks that have been chosen. The ranks can now all
+    // run the task-deciding functions and will agree on the result
+    // without needing to communicate.
+    //
+    // TODO Should we do the communication in debug mode to support
+    // having an assertion?
+    //
+    // Note that these variables describe only their own node.
+    //
+    // Note that when bonded interactions run on a GPU they always run
+    // alongside a nonbonded task, so do not influence task assignment
+    // even though they affect the force calculation workload.
+    bool useGpuForNonbonded = false;
+    bool useGpuForPme       = false;
+    bool useGpuForBonded    = false;
+    try
+    {
+        // It's possible that there are different numbers of GPUs on
+        // different nodes, which is the user's responsibility to
+        // handle. If unsuitable, we will notice that during task
+        // assignment.
+        bool gpusWereDetected      = hwinfo->ngpu_compatible_tot > 0;
+        bool usingVerletScheme     = inputrec->cutoff_scheme == ecutsVERLET;
+        auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
+        useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment,
+                                                                emulateGpuNonbonded,
+                                                                canUseGpuForNonbonded,
+                                                                usingVerletScheme,
+                                                                gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, !GMX_THREAD_MPI),
+                                                                gpusWereDetected);
+        useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment,
+                                                    *hwinfo, *inputrec, mtop,
+                                                    cr->nnodes, domdecOptions.numPmeRanks,
+                                                    gpusWereDetected);
+        auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
+        useGpuForBonded =
+            decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme, usingVerletScheme,
+                                            bondedTarget, canUseGpuForBonded,
+                                            EVDW_PME(inputrec->vdwtype),
+                                            EEL_PME_EWALD(inputrec->coulombtype),
+                                            domdecOptions.numPmeRanks, gpusWereDetected);
+
+        pmeRunMode   = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU);
+        if (pmeRunMode == PmeRunMode::GPU)
+        {
+            if (pmeFftTarget == TaskTarget::Cpu)
+            {
+                pmeRunMode = PmeRunMode::Mixed;
+            }
+        }
+        else if (pmeFftTarget == TaskTarget::Gpu)
+        {
+            gmx_fatal(FARGS, "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME on CPU you should not be using -pmefft.");
+        }
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
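+    // When the targets are set explicitly, the decisions above amount to:
+    //     -pme cpu              ->  PmeRunMode::CPU
+    //     -pme gpu -pmefft gpu  ->  PmeRunMode::GPU   (all PME stages on GPU)
+    //     -pme gpu -pmefft cpu  ->  PmeRunMode::Mixed (FFTs on the CPU)
+    //     -pme cpu -pmefft gpu  ->  fatal error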
+
+    // Build restraints.
+    // TODO: hide restraint implementation details from Mdrunner.
+    // There is nothing unique about restraints at this point as far as the
+    // Mdrunner is concerned. The Mdrunner should just be getting a sequence of
+    // factory functions from the SimulationContext on which to call mdModules->add().
+    // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
+    for (auto && restraint : restraintManager_->getRestraints())
+    {
+        auto module = RestraintMDModule::create(restraint,
+                                                restraint->sites());
+        mdModules->add(std::move(module));
+    }
+
+    // TODO: Error handling
+    mdModules->assignOptionsToModules(*inputrec->params, nullptr);
+
+    if (fplog != nullptr)
+    {
+        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+        fprintf(fplog, "\n");
+    }
+
+    if (SIMMASTER(cr))
+    {
+        /* now make sure the state is initialized and propagated */
+        set_state_entries(globalState.get(), inputrec);
+    }
+
+    /* NM and TPI parallelize over force/energy calculations, not atoms,
+     * so we need to initialize and broadcast the global state.
+     */
+    if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
+    {
+        if (!MASTER(cr))
+        {
+            globalState = compat::make_unique<t_state>();
+        }
+        broadcastStateWithoutDynamics(cr, globalState.get());
+    }
+
+    /* A parallel command line option consistency check that we can
+       only do after any threads have started. */
+    if (!PAR(cr) && (domdecOptions.numCells[XX] > 1 ||
+                     domdecOptions.numCells[YY] > 1 ||
+                     domdecOptions.numCells[ZZ] > 1 ||
+                     domdecOptions.numPmeRanks > 0))
+    {
+        gmx_fatal(FARGS,
+                  "The -dd or -npme option request a parallel simulation, "
+#if !GMX_MPI
+                  "but %s was compiled without threads or MPI enabled", output_env_get_program_display_name(oenv));
+#else
+#if GMX_THREAD_MPI
+                  "but the number of MPI-threads (option -ntmpi) is not set or is 1");
+#else
+                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec", output_env_get_program_display_name(oenv));
+#endif
+#endif
+    }
+
+    if (doRerun &&
+        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+    {
+        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+    }
+
+    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
+    }
+
+    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
+    {
+        if (domdecOptions.numPmeRanks > 0)
+        {
+            gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
+                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
+        }
+
+        domdecOptions.numPmeRanks = 0;
+    }
+
+    if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
+    {
+        /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
+         * improve performance with many threads per GPU, since our OpenMP
+         * scaling is bad, but it's difficult to automate the setup.
+         */
+        domdecOptions.numPmeRanks = 0;
+    }
+    if (useGpuForPme)
+    {
+        if (domdecOptions.numPmeRanks < 0)
+        {
+            domdecOptions.numPmeRanks = 0;
+            // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
+        }
+        else
+        {
+            GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported");
+        }
+    }
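+    // Summarizing the handling of -npme (domdecOptions.numPmeRanks) above:
+    //     -npme -1  ->  automatic choice, forced to 0 here when nonbondeds
+    //                   or PME run on a GPU
+    //     -npme  0  ->  no separate PME ranks
+    //     -npme  1  ->  the most that is allowed with PME on a GPU, since
+    //                   PME GPU decomposition is not supported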
+
+#if GMX_FAHCORE
+    if (MASTER(cr))
+    {
+        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+    }
+#endif
+
+    /* NMR restraints must be initialized before load_checkpoint,
+     * since with time averaging the history is added to t_state.
+     * For proper consistency check we therefore need to extend
+     * t_state here.
+     * So the PME-only nodes (if present) will also initialize
+     * the distance restraints.
+     */
+    snew(fcd, 1);
+
+    /* This needs to be called before read_checkpoint to extend the state */
+    init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0);
+
+    init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires));
+
+    auto                 deform = prepareBoxDeformation(globalState->box, cr, *inputrec);
+
+    ObservablesHistory   observablesHistory = {};
+
+    ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions;
+
+    if (continuationOptions.startedFromCheckpoint)
+    {
+        /* Check if checkpoint file exists before doing continuation.
+         * This way we can use identical input options for the first and subsequent runs...
+         */
+        gmx_bool bReadEkin;
+
+        load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
+                        logFileHandle,
+                        cr, domdecOptions.numCells,
+                        inputrec, globalState.get(),
+                        &bReadEkin, &observablesHistory,
+                        continuationOptions.appendFiles,
+                        continuationOptions.appendFilesOptionSet,
+                        mdrunOptions.reproducible);
+
+        if (bReadEkin)
+        {
+            continuationOptions.haveReadEkin = true;
+        }
+
+        if (continuationOptions.appendFiles && logFileHandle)
+        {
+            // Now we can start normal logging to the truncated log file.
+            fplog    = gmx_fio_getfp(logFileHandle);
+            prepareLogAppending(fplog);
+            logOwner = buildLogger(fplog, cr);
+            mdlog    = logOwner.logger();
+        }
+    }
+
+    if (mdrunOptions.numStepsCommandline > -2)
+    {
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -nsteps functionality is deprecated, and may be removed in a future version. "
+                       "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field.");
+    }
+    /* override nsteps with the value set on the command line */
+    override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec);
+
+    if (SIMMASTER(cr))
+    {
+        copy_mat(globalState->box, box);
+    }
+
+    if (PAR(cr))
+    {
+        gmx_bcast(sizeof(box), box, cr);
+    }
+
+    /* Update rlist and nstlist. */
+    if (inputrec->cutoff_scheme == ecutsVERLET)
+    {
+        prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
+                              useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo);
+    }
+
+    LocalAtomSetManager atomSets;
+
+    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
+                     inputrec->eI == eiNM))
+    {
+        cr->dd = init_domain_decomposition(mdlog, cr, domdecOptions, mdrunOptions,
+                                           &mtop, inputrec,
+                                           box, positionsFromStatePointer(globalState.get()),
+                                           &atomSets);
+        // Note that local state still does not exist yet.
+    }
+    else
+    {
+        /* PME, if used, is done on all nodes with 1D decomposition */
+        cr->npmenodes = 0;
+        cr->duty      = (DUTY_PP | DUTY_PME);
+
+        if (inputrec->ePBC == epbcSCREW)
+        {
+            gmx_fatal(FARGS,
+                      "pbc=screw is only implemented with domain decomposition");
+        }
+    }
+
+    if (PAR(cr))
+    {
+        /* After possible communicator splitting in make_dd_communicators.
+         * we can set up the intra/inter node communication.
+         */
+        gmx_setup_nodecomm(fplog, cr);
+    }
+
+#if GMX_MPI
+    if (isMultiSim(ms))
+    {
+        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                "This is simulation %d out of %d running as a composite GROMACS\n"
+                "multi-simulation job. Setup for this simulation:\n",
+                ms->sim, ms->nsim);
+    }
+    GMX_LOG(mdlog.warning).appendTextFormatted(
+            "Using %d MPI %s\n",
+            cr->nnodes,
+#if GMX_THREAD_MPI
+            cr->nnodes == 1 ? "thread" : "threads"
+#else
+            cr->nnodes == 1 ? "process" : "processes"
+#endif
+            );
+    fflush(stderr);
+#endif
+
+    /* Check and update hw_opt for the cut-off scheme */
+    check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme);
+
+    /* Check and update the number of OpenMP threads requested */
+    checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
+                                            pmeRunMode, mtop);
+
+    gmx_omp_nthreads_init(mdlog, cr,
+                          hwinfo->nthreads_hw_avail,
+                          physicalNodeComm.size_,
+                          hw_opt.nthreads_omp,
+                          hw_opt.nthreads_omp_pme,
+                          !thisRankHasDuty(cr, DUTY_PP),
+                          inputrec->cutoff_scheme == ecutsVERLET);
+
+    // Enable FP exception detection for the Verlet scheme, but not in
+    // Release mode and not for compilers with known buggy FP
+    // exception support (clang with any optimization) or suspected
+    // buggy FP exception support (gcc 7.* with optimization).
+#if !defined NDEBUG && \
+    !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
+    && defined __OPTIMIZE__)
+    const bool bEnableFPE = inputrec->cutoff_scheme == ecutsVERLET;
+#else
+    const bool bEnableFPE = false;
+#endif
+    //FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
+    if (bEnableFPE)
+    {
+        gmx_feenableexcept();
+    }
+
+    // Build a data structure that expresses which kinds of non-bonded
+    // task are handled by this rank.
+    //
+    // TODO Later, this might become a loop over all registered modules
+    // relevant to the mdp inputs, to find those that have such tasks.
+    //
+    // TODO This could move before init_domain_decomposition() as part
+    // of refactoring that separates the responsibility for duty
+    // assignment from setup for communication between tasks, and
+    // setup for tasks handled with a domain (ie including short-ranged
+    // tasks, bonded tasks, etc.).
+    //
+    // Note that in general useGpuForNonbonded, etc. can have a value
+    // that is inconsistent with the presence of actual GPUs on any
+    // rank, and that is not known to be a problem until the
+    // duty of the ranks on a node become known.
+    //
+    // TODO Later we might need the concept of computeTasksOnThisRank,
+    // from which we construct gpuTasksOnThisRank.
+    //
+    // Currently the DD code assigns duty to ranks that can
+    // include PP work that currently can be executed on a single
+    // GPU, if present and compatible.  This has to be coordinated
+    // across PP ranks on a node, with possible multiple devices
+    // or sharing devices on a node, either from the user
+    // selection, or automatically.
+    auto                 haveGpus = !gpuIdsToUse.empty();
+    std::vector<GpuTask> gpuTasksOnThisRank;
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        if (useGpuForNonbonded)
+        {
+            // Note that any bonded tasks on a GPU always accompany a
+            // non-bonded task.
+            if (haveGpus)
+            {
+                gpuTasksOnThisRank.push_back(GpuTask::Nonbonded);
+            }
+            else if (nonbondedTarget == TaskTarget::Gpu)
+            {
+                gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because there is none detected.");
+            }
+            else if (bondedTarget == TaskTarget::Gpu)
+            {
+                gmx_fatal(FARGS, "Cannot run bonded interactions on a GPU because there is none detected.");
+            }
+        }
+    }
+    // TODO cr->duty & DUTY_PME should imply that a PME algorithm is active, but currently does not.
+    if (EEL_PME(inputrec->coulombtype) && (thisRankHasDuty(cr, DUTY_PME)))
+    {
+        if (useGpuForPme)
+        {
+            if (haveGpus)
+            {
+                gpuTasksOnThisRank.push_back(GpuTask::Pme);
+            }
+            else if (pmeTarget == TaskTarget::Gpu)
+            {
+                gmx_fatal(FARGS, "Cannot run PME on a GPU because there is none detected.");
+            }
+        }
+    }
+
+    GpuTaskAssignment gpuTaskAssignment;
+    try
+    {
+        // Produce the task assignment for this rank.
+        gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
+                                              mdlog, cr, ms, physicalNodeComm, gpuTasksOnThisRank,
+                                              useGpuForBonded, pmeRunMode);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+    /* Prevent other ranks from continuing after an issue was found
+     * and reported as a fatal error.
+     *
+     * TODO This function implements a barrier so that MPI runtimes
+     * can organize an orderly shutdown if one of the ranks has had to
+     * issue a fatal error in various code already run. When we have
+     * MPI-aware error handling and reporting, this should be
+     * improved. */
+#if GMX_MPI
+    if (PAR(cr))
+    {
+        MPI_Barrier(cr->mpi_comm_mysim);
+    }
+    if (isMultiSim(ms))
+    {
+        if (SIMMASTER(cr))
+        {
+            MPI_Barrier(ms->mpi_comm_masters);
+        }
+        /* We need another barrier to prevent non-master ranks from continuing
+         * when an error occurred in a different simulation.
+         */
+        MPI_Barrier(cr->mpi_comm_mysim);
+    }
+#endif
+
+    /* Now that we know the setup is consistent, check for efficiency */
+    check_resource_division_efficiency(hwinfo, !gpuTaskAssignment.empty(), mdrunOptions.ntompOptionIsSet,
+                                       cr, mdlog);
+
+    gmx_device_info_t *nonbondedDeviceInfo = nullptr;
+
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        // This works because only one task of each type is currently permitted.
+        auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
+                                             hasTaskType<GpuTask::Nonbonded>);
+        if (nbGpuTaskMapping != gpuTaskAssignment.end())
+        {
+            int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
+            nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
+            init_gpu(nonbondedDeviceInfo);
+
+            if (DOMAINDECOMP(cr))
+            {
+                /* When we share GPUs over ranks, we need to know this for the DLB */
+                dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+            }
+        }
+    }
+
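+    // Keeps the clFFT library (used by the OpenCL PME implementation)
+    // initialized for the lifetime of this runner; it is set up below only
+    // when this rank runs PME on a GPU.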
+    std::unique_ptr<ClfftInitializer> initializedClfftLibrary;
+
+    gmx_device_info_t                *pmeDeviceInfo = nullptr;
+    // Later, this program could contain kernels that are re-used as
+    // auto-tuning progresses, or when subsequent simulations are invoked.
+    PmeGpuProgramStorage pmeGpuProgram;
+    // This works because only one task of each type is currently permitted.
+    auto                 pmeGpuTaskMapping     = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
+    const bool           thisRankHasPmeGpuTask = (pmeGpuTaskMapping != gpuTaskAssignment.end());
+    if (thisRankHasPmeGpuTask)
+    {
+        pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
+        init_gpu(pmeDeviceInfo);
+        pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
+        // TODO It would be nice to move this logic into the factory
+        // function. See Redmine #2535.
+        bool isMasterThread = !GMX_THREAD_MPI || MASTER(cr);
+        if (pmeRunMode == PmeRunMode::GPU && !initializedClfftLibrary && isMasterThread)
+        {
+            initializedClfftLibrary = initializeClfftLibrary();
+        }
+    }
+
+    /* Get the number of PP/PME threads. The PME environment variable
+       should be read on only one node, to make sure the value is
+       identical everywhere.
+     */
+    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+    int numThreadsOnThisRank;
+    /* threads on this MPI process or TMPI thread */
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        numThreadsOnThisRank = gmx_omp_nthreads_get(emntNonbonded);
+    }
+    else
+    {
+        numThreadsOnThisRank = nthreads_pme;
+    }
+
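+    // Warn the user if the threads on this physical node oversubscribe the
+    // available hardware threads.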
+    checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid,
+                                  *hwinfo->hardwareTopology,
+                                  physicalNodeComm, mdlog);
+
+    if (hw_opt.thread_affinity != threadaffOFF)
+    {
+        /* Before setting affinity, check whether the affinity has changed
+         * since we first checked - which would indicate that probably the
+         * OpenMP library has changed it in the meantime.
+         */
+        gmx_check_thread_affinity_set(mdlog, cr,
+                                      &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+        int numThreadsOnThisNode, intraNodeThreadOffset;
+        analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
+                                 &intraNodeThreadOffset);
+
+        /* Set the CPU affinity */
+        gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology,
+                                numThreadsOnThisRank, numThreadsOnThisNode,
+                                intraNodeThreadOffset, nullptr);
+    }
+
+    if (mdrunOptions.timingOptions.resetStep > -1)
+    {
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -resetstep functionality is deprecated, and may be removed in a future version.");
+    }
+    wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
+
+    if (PAR(cr))
+    {
+        /* The master synchronizes its value of reset_counters with all nodes,
+         * including PME-only nodes */
+        reset_counters = wcycle_get_reset_counters(wcycle);
+        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+        wcycle_set_reset_counters(wcycle, reset_counters);
+    }
+
+    // Membrane embedding must be initialized before we call init_forcerec()
+    if (doMembed)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Initializing membed");
+        }
+        /* Note that membed cannot work in parallel because mtop is
+         * changed here. Fix this if we ever want to make it run with
+         * multiple ranks. */
+        membed = init_membed(fplog, filenames.size(), filenames.data(), &mtop, inputrec, globalState.get(), cr,
+                             &mdrunOptions.checkpointOptions.period);
+    }
+
+    std::unique_ptr<MDAtoms>     mdAtoms;
+    std::unique_ptr<gmx_vsite_t> vsite;
+
+    snew(nrnb, 1);
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        /* Initiate forcerecord */
+        fr                 = mk_forcerec();
+        fr->forceProviders = mdModules->initForceProviders();
+        init_forcerec(fplog, mdlog, fr, fcd,
+                      inputrec, &mtop, cr, box,
+                      opt2fn("-table", filenames.size(), filenames.data()),
+                      opt2fn("-tablep", filenames.size(), filenames.data()),
+                      opt2fns("-tableb", filenames.size(), filenames.data()),
+                      *hwinfo, nonbondedDeviceInfo,
+                      useGpuForBonded,
+                      FALSE,
+                      pforce);
+
+        /* Initialize the mdAtoms structure.
+         * mdAtoms is not filled with atom data here,
+         * as that cannot be done yet when domain decomposition is in use.
+         */
+        mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask);
+        if (globalState && thisRankHasPmeGpuTask)
+        {
+            // The pinning of coordinates in the global state object works, because we only use
+            // PME on GPU without DD or on a separate PME rank, and because the local state pointer
+            // points to the global state object without DD.
+            // FIXME: MD and EM separately set up the local state - this should happen in the same function,
+            // which should also perform the pinning.
+            changePinningPolicy(&globalState->x, pme_get_pinning_policy());
+        }
+
+        /* Initialize the virtual site communication */
+        vsite = initVsite(mtop, cr);
+
+        calc_shifts(box, fr->shift_vec);
+
+        /* With periodic molecules, the charge groups should be whole at start-up
+         * and the virtual sites should not be far from their proper positions.
+         */
+        if (!inputrec->bContinuation && MASTER(cr) &&
+            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        {
+            /* Make molecules whole at start of run */
+            if (fr->ePBC != epbcNONE)
+            {
+                do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array());
+            }
+            if (vsite)
+            {
+                /* Correct initial vsite positions are required
+                 * for the initial distribution in the domain decomposition
+                 * and for the initial shell prediction.
+                 */
+                constructVsitesGlobal(mtop, globalState->x);
+            }
+        }
+
+        if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
+        {
+            ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
+            ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
+        }
+    }
+    else
+    {
+        /* This is a PME-only node */
+
+        GMX_ASSERT(globalState == nullptr, "We don't need the state on a PME-only rank and expect it to be uninitialized");
+
+        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
+        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
+    }
+
+    gmx_pme_t *sepPmeData = nullptr;
+    // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
+    GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr), "Double-checking that only PME-only ranks have no forcerec");
+    gmx_pme_t * &pmedata = fr ? fr->pmedata : sepPmeData;
+
+    /* Initiate PME if necessary,
+     * either on all nodes or on dedicated PME nodes only. */
+    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
+    {
+        if (mdAtoms && mdAtoms->mdatoms())
+        {
+            nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
+            if (EVDW_PME(inputrec->vdwtype))
+            {
+                nTypePerturbed   = mdAtoms->mdatoms()->nTypePerturbed;
+            }
+        }
+        if (cr->npmenodes > 0)
+        {
+            /* The PME-only nodes need to know nChargePerturbed (FEP on Q) and nTypePerturbed (FEP on LJ) */
+            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+        }
+
+        if (thisRankHasDuty(cr, DUTY_PME))
+        {
+            try
+            {
+                pmedata = gmx_pme_init(cr,
+                                       getNumPmeDomains(cr->dd),
+                                       inputrec,
+                                       mtop.natoms, nChargePerturbed != 0, nTypePerturbed != 0,
+                                       mdrunOptions.reproducible,
+                                       ewaldcoeff_q, ewaldcoeff_lj,
+                                       nthreads_pme,
+                                       pmeRunMode, nullptr,
+                                       pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
+            }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+        }
+    }
+
+    if (EI_DYNAMICS(inputrec->eI))
+    {
+        /* Turn on signal handling on all nodes.
+         * A user signal from the PME nodes (if any)
+         * is communicated to the PP nodes.
+         */
+        signal_handler_install();
+    }
+
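+    // PP ranks set up the remaining modules (pull, enforced rotation, ion
+    // swapping, constraints) and run the integrator; PME-only ranks instead
+    // enter the gmx_pmeonly() loop in the else-branch below.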
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        /* Assumes uniform use of the number of OpenMP threads */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+        if (inputrec->bPull)
+        {
+            /* Initialize pull code */
+            inputrec->pull_work =
+                init_pull(fplog, inputrec->pull, inputrec,
+                          &mtop, cr, &atomSets, inputrec->fepvals->init_lambda);
+            if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
+            {
+                initPullHistory(inputrec->pull_work, &observablesHistory);
+            }
+            if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
+            {
+                init_pull_output_files(inputrec->pull_work,
+                                       filenames.size(), filenames.data(), oenv,
+                                       continuationOptions);
+            }
+        }
+
+        std::unique_ptr<EnforcedRotation> enforcedRotation;
+        if (inputrec->bRot)
+        {
+            /* Initialize enforced rotation code */
+            enforcedRotation = init_rot(fplog,
+                                        inputrec,
+                                        filenames.size(),
+                                        filenames.data(),
+                                        cr,
+                                        &atomSets,
+                                        globalState.get(),
+                                        &mtop,
+                                        oenv,
+                                        mdrunOptions);
+        }
+
+        if (inputrec->eSwapCoords != eswapNO)
+        {
+            /* Initialize ion swapping code */
+            init_swapcoords(fplog, inputrec, opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
+                            &mtop, globalState.get(), &observablesHistory,
+                            cr, &atomSets, oenv, mdrunOptions);
+        }
+
+        /* Let makeConstraints know whether we have essential dynamics constraints.
+         * TODO: inputrec should tell us whether we use an algorithm, not a file option or the checkpoint
+         */
+        bool doEssentialDynamics = (opt2fn_null("-ei", filenames.size(), filenames.data()) != nullptr
+                                    || observablesHistory.edsamHistory);
+        auto constr              = makeConstraints(mtop, *inputrec, doEssentialDynamics,
+                                                   fplog, *mdAtoms->mdatoms(),
+                                                   cr, ms, nrnb, wcycle, fr->bMolPBC);
+
+        if (DOMAINDECOMP(cr))
+        {
+            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
+            /* This call is not included in init_domain_decomposition mainly
+             * because fr->cginfo_mb is set later.
+             */
+            dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
+                            domdecOptions.checkBondedInteractions,
+                            fr->cginfo_mb);
+        }
+
+        // TODO This is not the right place to manage the lifetime of
+        // this data structure, but currently it's the easiest way to
+        // make it work. Later, it should probably be made/updated
+        // after the workload for the lifetime of a PP domain is
+        // understood.
+        PpForceWorkload ppForceWorkload;
+
+        GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to integrator.");
+        /* Now do whatever the user wants us to do (how flexible...) */
+        Integrator integrator {
+            fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(),
+            oenv,
+            mdrunOptions,
+            vsite.get(), constr.get(),
+            enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr,
+            deform.get(),
+            mdModules->outputProvider(),
+            inputrec, &mtop,
+            fcd,
+            globalState.get(),
+            &observablesHistory,
+            mdAtoms.get(), nrnb, wcycle, fr,
+            &ppForceWorkload,
+            replExParams,
+            membed,
+            walltime_accounting,
+            std::move(stopHandlerBuilder_)
+        };
+        integrator.run(inputrec->eI, doRerun);
+
+        if (inputrec->bPull)
+        {
+            finish_pull(inputrec->pull_work);
+        }
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
+        /* do PME only */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+        gmx_pmeonly(pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+
+    /* Finish up and write the final output.
+     * With rerun MD, don't write the last frame again.
+     */
+    finish_run(fplog, mdlog, cr,
+               inputrec, nrnb, wcycle, walltime_accounting,
+               fr ? fr->nbv : nullptr,
+               pmedata,
+               EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
+
+    // Free PME data
+    if (pmedata)
+    {
+        gmx_pme_destroy(pmedata);
+        pmedata = nullptr;
+    }
+
+    // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
+    // before we destroy the GPU context(s) in free_gpu_resources().
+    // Pinned buffers are associated with contexts in CUDA.
+    // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
+    mdAtoms.reset(nullptr);
+    globalState.reset(nullptr);
+    mdModules.reset(nullptr);   // destruct force providers here as they might also use the GPU
+
+    /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
+    free_gpu_resources(fr, physicalNodeComm);
+    free_gpu(nonbondedDeviceInfo);
+    free_gpu(pmeDeviceInfo);
+    done_forcerec(fr, mtop.molblock.size(), mtop.groups.grps[egcENER].nr);
+    sfree(fcd);
+
+    if (doMembed)
+    {
+        free_membed(membed);
+    }
+
+    gmx_hardware_info_free();
+
+    /* Print the finishing date and time to the log */
+    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
+    walltime_accounting_destroy(walltime_accounting);
+    sfree(nrnb);
+
+    // Ensure log file content is written
+    if (logFileHandle)
+    {
+        gmx_fio_flush(logFileHandle);
+    }
+
+    /* Reset FPEs (important for unit tests) by disabling them. Assumes no
+     * exceptions were enabled before function was called. */
+    if (bEnableFPE)
+    {
+        gmx_fedisableexcept();
+    }
+
+    rc = static_cast<int>(gmx_get_stop_condition());
+
+#if GMX_THREAD_MPI
+    /* we need to join all threads. The sub-threads join when they
+       exit this function, but the master thread needs to be told to
+       wait for that. */
+    if (PAR(cr) && MASTER(cr))
+    {
+        done_commrec(cr);
+        tMPI_Finalize();
+    }
+#endif
+
+    return rc;
+}
+
+Mdrunner::~Mdrunner()
+{
+    // Clean up the restraint manager.
+    // This will end up getting called on every thread-MPI rank, which is unnecessary,
+    // but is harmless as long as threads synchronize some time before adding or
+    // accessing a new set of restraints.
+    if (restraintManager_)
+    {
+        restraintManager_->clear();
+        GMX_ASSERT(restraintManager_->countRestraints() == 0,
+                   "Restraints added during the runner's lifetime should be cleared at runner destruction.");
+    }
+}
+
+void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller,
+                            std::string                               name)
+{
+    GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager.");
+    // Not sure if this should be logged through the md logger or something else,
+    // but it is helpful to have some sort of INFO level message sent somewhere.
+    //    std::cout << "Registering restraint named " << name << std::endl;
+
+    // When multiple restraints are used, it may be wasteful to register them separately.
+    // Maybe instead register an entire Restraint Manager as a force provider.
+    restraintManager_->addToSpec(std::move(puller),
+                                 std::move(name));
+}
+
+Mdrunner::Mdrunner(Mdrunner &&) noexcept = default;
+
+//NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265
+Mdrunner &Mdrunner::operator=(Mdrunner && /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
+
+class Mdrunner::BuilderImplementation
+{
+    public:
+        BuilderImplementation() = delete;
+        explicit BuilderImplementation(SimulationContext* context);
+        ~BuilderImplementation();
+
+        BuilderImplementation &setExtraMdrunOptions(const MdrunOptions &options,
+                                                    real                forceWarningThreshold);
+
+        void addDomdec(const DomdecOptions &options);
+
+        void addVerletList(int nstlist);
+
+        void addReplicaExchange(const ReplicaExchangeParameters &params);
+
+        void addMultiSim(gmx_multisim_t* multisim);
+
+        void addNonBonded(const char* nbpu_opt);
+
+        void addPME(const char* pme_opt_, const char* pme_fft_opt_);
+
+        void addBondedTaskAssignment(const char* bonded_opt);
+
+        void addHardwareOptions(const gmx_hw_opt_t &hardwareOptions);
+
+        void addFilenames(ArrayRef <const t_filenm> filenames);
+
+        void addOutputEnvironment(gmx_output_env_t* outputEnvironment);
+
+        void addLogFile(t_fileio *logFileHandle);
+
+        void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder);
+
+        Mdrunner build();
+
+    private:
+        // Default parameters copied from runner.h
+        // \todo Clarify source(s) of default parameters.
+
+        const char* nbpu_opt_    = nullptr;
+        const char* pme_opt_     = nullptr;
+        const char* pme_fft_opt_ = nullptr;
+        const char *bonded_opt_  = nullptr;
+
+        MdrunOptions                          mdrunOptions_;
+
+        DomdecOptions                         domdecOptions_;
+
+        ReplicaExchangeParameters             replicaExchangeParameters_;
+
+        //! Command-line override for the neighbor-list update interval (in steps) with the Verlet scheme.
+        int         nstlist_ = 0;
+
+        //! Non-owning multisim communicator handle.
+        std::unique_ptr<gmx_multisim_t*>      multisim_ = nullptr;
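+        // Wrapped in a unique_ptr so that build() can distinguish
+        // "addMultiSim() was called with a null handle" (which is valid)
+        // from "addMultiSim() was never called" (which is an APIError).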
+
+        //! Print a warning if any force is larger than this (in kJ/mol/nm).
+        real forceWarningThreshold_ = -1;
+
+        /*! \brief  Non-owning pointer to SimulationContext (owned and managed by client)
+         *
+         * \internal
+         * \todo Establish robust protocol to make sure resources remain valid.
+         * SimulationContext will likely be separated into multiple layers for
+         * different levels of access and different phases of execution. Ref
+         * https://redmine.gromacs.org/issues/2375
+         * https://redmine.gromacs.org/issues/2587
+         */
+        SimulationContext* context_ = nullptr;
+
+        //! \brief Parallelism information.
+        gmx_hw_opt_t hardwareOptions_;
+
+        //! Filename options for the simulation.
+        ArrayRef<const t_filenm> filenames_;
+
+        /*! \brief Handle to output environment.
+         *
+         * \todo gmx_output_env_t needs lifetime management.
+         */
+        gmx_output_env_t*    outputEnvironment_ = nullptr;
+
+        /*! \brief Non-owning handle to MD log file.
+         *
+         * \todo Context should own output facilities for client.
+         * \todo Improve log file handle management.
+         * \internal
+         * Code managing the FILE* relies on the ability to set it to
+         * nullptr to check whether the filehandle is valid.
+         */
+        t_fileio* logFileHandle_ = nullptr;
+
+        /*!
+         * \brief Builder for simulation stop signal handler.
+         */
+        std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
+};
+
+Mdrunner::BuilderImplementation::BuilderImplementation(SimulationContext* context) :
+    context_(context)
+{
+    GMX_ASSERT(context_, "Bug found. It should not be possible to construct builder without a valid context.");
+}
+
+Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
+
+Mdrunner::BuilderImplementation &
+Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions &options,
+                                                      real                forceWarningThreshold)
+{
+    mdrunOptions_          = options;
+    forceWarningThreshold_ = forceWarningThreshold;
+    return *this;
+}
+
+void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions &options)
+{
+    domdecOptions_ = options;
+}
+
+void Mdrunner::BuilderImplementation::addVerletList(int nstlist)
+{
+    nstlist_ = nstlist;
+}
+
+void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters &params)
+{
+    replicaExchangeParameters_ = params;
+}
+
+void Mdrunner::BuilderImplementation::addMultiSim(gmx_multisim_t* multisim)
+{
+    multisim_ = compat::make_unique<gmx_multisim_t*>(multisim);
+}
+
+Mdrunner Mdrunner::BuilderImplementation::build()
+{
+    auto newRunner = Mdrunner();
+
+    GMX_ASSERT(context_, "Bug found. It should not be possible to call build() without a valid context.");
+
+    newRunner.mdrunOptions    = mdrunOptions_;
+    newRunner.domdecOptions   = domdecOptions_;
+
+    // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid
+    newRunner.hw_opt          = hardwareOptions_;
+
+    // No invariant to check. This parameter exists to optionally override other behavior.
+    newRunner.nstlist_cmdline = nstlist_;
+
+    newRunner.replExParams    = replicaExchangeParameters_;
+
+    newRunner.filenames = filenames_;
+
+    GMX_ASSERT(context_->communicationRecord_, "SimulationContext communications not initialized.");
+    newRunner.cr = context_->communicationRecord_;
+
+    if (multisim_)
+    {
+        // nullptr is a valid value for the multisim handle, so we don't check the pointed-to pointer.
+        newRunner.ms = *multisim_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addMultiSim() is required before build()"));
+    }
+
+    // \todo Clarify ownership and lifetime management for gmx_output_env_t
+    // \todo Update sanity checking when output environment has clearly specified invariants.
+    // Initialization and default values for oenv are not well specified in the current version.
+    if (outputEnvironment_)
+    {
+        newRunner.oenv  = outputEnvironment_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addOutputEnvironment() is required before build()"));
+    }
+
+    newRunner.logFileHandle = logFileHandle_;
+
+    if (nbpu_opt_)
+    {
+        newRunner.nbpu_opt    = nbpu_opt_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()"));
+    }
+
+    if (pme_opt_ && pme_fft_opt_)
+    {
+        newRunner.pme_opt     = pme_opt_;
+        newRunner.pme_fft_opt = pme_fft_opt_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()"));
+    }
+
+    if (bonded_opt_)
+    {
+        newRunner.bonded_opt = bonded_opt_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
+    }
+
+    newRunner.restraintManager_ = compat::make_unique<gmx::RestraintManager>();
+
+    if (stopHandlerBuilder_)
+    {
+        newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_);
+    }
+    else
+    {
+        newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>();
+    }
+
+    return newRunner;
+}
+
+void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
+{
+    nbpu_opt_ = nbpu_opt;
+}
+
+void Mdrunner::BuilderImplementation::addPME(const char* pme_opt,
+                                             const char* pme_fft_opt)
+{
+    pme_opt_     = pme_opt;
+    pme_fft_opt_ = pme_fft_opt;
+}
+
+void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt)
+{
+    bonded_opt_ = bonded_opt;
+}
+
+void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
+{
+    hardwareOptions_ = hardwareOptions;
+}
+
+void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames)
+{
+    filenames_ = filenames;
+}
+
+void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
+{
+    outputEnvironment_ = outputEnvironment;
+}
+
+void Mdrunner::BuilderImplementation::addLogFile(t_fileio *logFileHandle)
+{
+    logFileHandle_ = logFileHandle;
+}
+
+void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
+{
+    stopHandlerBuilder_ = std::move(builder);
+}
+
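+// The MdrunnerBuilder methods below forward to the BuilderImplementation
+// above (PIMPL idiom), keeping the public builder interface decoupled from
+// the implementation details of Mdrunner.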
+MdrunnerBuilder::MdrunnerBuilder(compat::not_null<SimulationContext*> context) :
+    impl_ {gmx::compat::make_unique<Mdrunner::BuilderImplementation>(context)}
+{
+}
+
+MdrunnerBuilder::~MdrunnerBuilder() = default;
+
+MdrunnerBuilder &MdrunnerBuilder::addSimulationMethod(const MdrunOptions &options,
+                                                      real                forceWarningThreshold)
+{
+    impl_->setExtraMdrunOptions(options, forceWarningThreshold);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addDomainDecomposition(const DomdecOptions &options)
+{
+    impl_->addDomdec(options);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addNeighborList(int nstlist)
+{
+    impl_->addVerletList(nstlist);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters &params)
+{
+    impl_->addReplicaExchange(params);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addMultiSim(gmx_multisim_t* multisim)
+{
+    impl_->addMultiSim(multisim);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addNonBonded(const char* nbpu_opt)
+{
+    impl_->addNonBonded(nbpu_opt);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addElectrostatics(const char* pme_opt,
+                                                    const char* pme_fft_opt)
+{
+    // The builder method may become more general in the future, but in this version,
+    // parameters for PME electrostatics are both required and the only parameters
+    // available.
+    if (pme_opt && pme_fft_opt)
+    {
+        impl_->addPME(pme_opt, pme_fft_opt);
+    }
+    else
+    {
+        GMX_THROW(gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers."));
+    }
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt)
+{
+    impl_->addBondedTaskAssignment(bonded_opt);
+    return *this;
+}
+
+Mdrunner MdrunnerBuilder::build()
+{
+    return impl_->build();
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
+{
+    impl_->addHardwareOptions(hardwareOptions);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames)
+{
+    impl_->addFilenames(filenames);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
+{
+    impl_->addOutputEnvironment(outputEnvironment);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addLogFile(t_fileio *logFileHandle)
+{
+    impl_->addLogFile(logFileHandle);
+    return *this;
+}
+
+MdrunnerBuilder &MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
+{
+    impl_->addStopHandlerBuilder(std::move(builder));
+    return *this;
+}
+
+MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder &&) noexcept = default;
+
+MdrunnerBuilder &MdrunnerBuilder::operator=(MdrunnerBuilder &&) noexcept = default;
+
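+// A minimal usage sketch for the builder (hypothetical client code; the
+// variable names are illustrative and not defined in this file). Per the
+// checks in build(), addNonBonded(), addElectrostatics(),
+// addBondedTaskAssignment(), addMultiSim() and addOutputEnvironment() must
+// all be called before build():
+//
+//     MdrunnerBuilder builder(compat::not_null<SimulationContext*>(&context));
+//     builder.addSimulationMethod(mdrunOptions, forceWarningThreshold);
+//     builder.addNonBonded(nbpu_opt);
+//     builder.addElectrostatics(pme_opt, pme_fft_opt);
+//     builder.addBondedTaskAssignment(bonded_opt);
+//     builder.addMultiSim(ms);            // a null handle is accepted
+//     builder.addOutputEnvironment(oenv);
+//     Mdrunner runner = builder.build();
+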
+} // namespace gmx