diff --git a/CHANGES/v2.4.md b/CHANGES/v2.4.md
index a890fe6ecd5e8eb803345108dc230dc5c7d96e79..d682fb6e6c45e16e3d75325f20f9d63f950716ae 100644
--- a/CHANGES/v2.4.md
+++ b/CHANGES/v2.4.md
@@ -177,3 +177,8 @@ For developers:
 - `plumed patch -p` command can be used twice without triggering an error. This will allow e.g. building
   again on MacPorts in cases where the build was interrupted. Notice that this only works for patches
   without special after/before patch/revert functions.
+
+## Version 2.4.2 (unreleased)
+
+For users:
+  - GROMACS patch for gromacs-2018.1.
diff --git a/patches/gromacs-2018.1.config b/patches/gromacs-2018.1.config
new file mode 100644
index 0000000000000000000000000000000000000000..4348d93fd2f9c9db2567ee332f7ba123c58b7abc
--- /dev/null
+++ b/patches/gromacs-2018.1.config
@@ -0,0 +1,33 @@
+
+
+function plumed_preliminary_test(){
+# check if the README contains the word GROMACS and if gromacs has already been configured
+ grep -q GROMACS README 1>/dev/null 2>/dev/null
+}
+
+function plumed_patch_info(){
+cat << EOF
+PLUMED can be incorporated into gromacs using the standard patching procedure.
+Patching must be done in the gromacs root directory _before_ the cmake command is invoked.
+
+On clusters you may want to patch gromacs using the static version of plumed. In this case,
+building gromacs can result in multiple errors; one possible solution is to configure gromacs
+with these additional options:
+
+cmake -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON
+
+To enable PLUMED in a gromacs simulation, one should use
+mdrun with an extra -plumed flag. The flag can be used to
+specify the name of the PLUMED input file, e.g.:
+
+gmx mdrun -plumed plumed.dat
+
+For more information on gromacs you should visit http://www.gromacs.org
+
+EOF
+}
+
+plumed_before_patch(){
+ plumed_patch_info
+}
+
diff --git a/patches/gromacs-2018.1.diff/src/gromacs/CMakeLists.txt b/patches/gromacs-2018.1.diff/src/gromacs/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..41d5aa5a821363e40f96c1826dd639bf1cee3400
--- /dev/null
+++ b/patches/gromacs-2018.1.diff/src/gromacs/CMakeLists.txt
@@ -0,0 +1,268 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable.
We will be happy to +# consider code for inclusion in the official distribution, but +# derived work must not be called official GROMACS. Details are found +# in the README & COPYING files - if they are missing, get the +# official version at http://www.gromacs.org. +# +# To help us fund GROMACS development, we humbly ask that you cite +# the research papers on the package. Check out http://www.gromacs.org. + +include(${CMAKE_SOURCE_DIR}/Plumed.cmake) + +set(LIBGROMACS_SOURCES) + +if (GMX_CLANG_CUDA) + include(gmxClangCudaUtils) +endif() + +set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES) +set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS) +set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE) + +function (_gmx_add_files_to_property PROPERTY) + foreach (_file ${ARGN}) + if (IS_ABSOLUTE "${_file}") + set_property(GLOBAL APPEND PROPERTY ${PROPERTY} ${_file}) + else() + set_property(GLOBAL APPEND PROPERTY ${PROPERTY} + ${CMAKE_CURRENT_LIST_DIR}/${_file}) + endif() + endforeach() +endfunction () + +function (gmx_add_libgromacs_sources) + _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN}) +endfunction () + +function (gmx_install_headers) + if (NOT GMX_BUILD_MDRUN_ONLY) + file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR}) + install(FILES ${ARGN} + DESTINATION "${INCL_INSTALL_DIR}/${_dest}" + COMPONENT development) + endif() + _gmx_add_files_to_property(GMX_INSTALLED_HEADERS ${ARGN}) +endfunction () + +function (gmx_write_installed_header_list) + get_property(_list GLOBAL PROPERTY GMX_INSTALLED_HEADERS) + string(REPLACE ";" "\n" _list "${_list}") + # TODO: Make this only update the file timestamp if the contents actually change. + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/installed-headers.txt "${_list}") +endfunction() + +add_subdirectory(gmxlib) +add_subdirectory(mdlib) +add_subdirectory(applied-forces) +add_subdirectory(listed-forces) +add_subdirectory(commandline) +add_subdirectory(domdec) +add_subdirectory(ewald) +add_subdirectory(fft) +add_subdirectory(gpu_utils) +add_subdirectory(hardware) +add_subdirectory(linearalgebra) +add_subdirectory(math) +add_subdirectory(mdrunutility) +add_subdirectory(mdtypes) +add_subdirectory(onlinehelp) +add_subdirectory(options) +add_subdirectory(pbcutil) +add_subdirectory(random) +add_subdirectory(tables) +add_subdirectory(taskassignment) +add_subdirectory(timing) +add_subdirectory(topology) +add_subdirectory(trajectory) +add_subdirectory(utility) +add_subdirectory(fileio) +add_subdirectory(swap) +add_subdirectory(essentialdynamics) +add_subdirectory(pulling) +add_subdirectory(awh) +add_subdirectory(simd) +add_subdirectory(imd) +if (NOT GMX_BUILD_MDRUN_ONLY) + add_subdirectory(gmxana) + add_subdirectory(gmxpreprocess) + add_subdirectory(correlationfunctions) + add_subdirectory(statistics) + add_subdirectory(analysisdata) + add_subdirectory(selection) + add_subdirectory(trajectoryanalysis) + add_subdirectory(energyanalysis) + add_subdirectory(tools) +endif() + +get_property(PROPERTY_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES) +list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES} ${PROPERTY_SOURCES}) + +# This would be the standard way to include thread_mpi, but +# we want libgromacs to link the functions directly +#if(GMX_THREAD_MPI) +# add_subdirectory(thread_mpi) +#endif() +#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB}) + +tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src) +list(APPEND LIBGROMACS_SOURCES ${THREAD_MPI_SOURCES}) + 
+get_lmfit_properties(LMFIT_SOURCES LMFIT_LIBRARIES_TO_LINK LMFIT_INCLUDE_DIRECTORY LMFIT_INCLUDE_DIR_ORDER) +include_directories(${LMFIT_INCLUDE_DIR_ORDER} SYSTEM "${LMFIT_INCLUDE_DIRECTORY}") +list(APPEND LIBGROMACS_SOURCES ${LMFIT_SOURCES}) + +configure_file(version.h.cmakein version.h) +gmx_install_headers( + analysisdata.h + commandline.h + options.h + random.h + selection.h + trajectoryanalysis.h + utility.h + ${CMAKE_CURRENT_BINARY_DIR}/version.h + ) + +# This code is here instead of utility/CMakeLists.txt, because CMake +# custom commands and source file properties can only be set in the directory +# that contains the target that uses them. +# TODO: Generate a header instead that can be included from baseversion.c. +# That probably simplifies things somewhat. +set(GENERATED_VERSION_FILE utility/baseversion-gen.c) +gmx_configure_version_file( + utility/baseversion-gen.c.cmakein ${GENERATED_VERSION_FILE} + REMOTE_HASH) +list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE}) + +# set up CUDA compilation with clang +if (GMX_CLANG_CUDA) + foreach (_file ${LIBGROMACS_SOURCES}) + get_filename_component(_ext ${_file} EXT) + if (${_ext} STREQUAL ".cu") + gmx_compile_cuda_file_with_clang(${_file}) + endif() + endforeach() +endif() + +if (GMX_USE_CUDA) + # Work around FindCUDA that prevents using target_link_libraries() + # with keywords otherwise... + set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES}) + if (NOT GMX_CLANG_CUDA) + cuda_add_library(libgromacs ${LIBGROMACS_SOURCES}) + else() + add_library(libgromacs ${LIBGROMACS_SOURCES}) + endif() + target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES}) +else() + add_library(libgromacs ${LIBGROMACS_SOURCES}) +endif() + +# Recent versions of gcc and clang give warnings on scanner.cpp, which +# is a generated source file. These are awkward to suppress inline, so +# we do it in the compilation command (after testing that the compiler +# supports the suppressions). 
+include(CheckCXXCompilerFlag) +check_cxx_compiler_flag(-Wno-unused HAS_NO_UNUSED) +if (HAS_NO_UNUSED) + set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-unused-parameter -Wno-unused-function") +endif() +set_source_files_properties(selection/scanner.cpp PROPERTIES COMPILE_FLAGS "${_scanner_cpp_compiler_flags}") + +if(SIMD_AVX_512_CXX_SUPPORTED) + # Since we might be overriding -march=core-avx2, add a flag so we don't warn for this specific file + set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS} ${CXX_NO_UNUSED_OPTION_WARNING_FLAGS}") +endif() + +gmx_setup_tng_for_libgromacs() + +target_link_libraries(libgromacs + PRIVATE + ${EXTRAE_LIBRARIES} + ${GMX_EXTRA_LIBRARIES} + ${GMX_COMMON_LIBRARIES} + ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES} + ${LMFIT_LIBRARIES_TO_LINK} + ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${OPENCL_LIBRARIES} + ${GMX_STDLIB_LIBRARIES} + PUBLIC + ${GMX_PUBLIC_LIBRARIES} + ${PLUMED_LOAD} + ) +set_target_properties(libgromacs PROPERTIES + OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}" + SOVERSION ${LIBRARY_SOVERSION_MAJOR} + VERSION ${LIBRARY_VERSION} + COMPILE_FLAGS "${OpenMP_C_FLAGS}") + +gmx_write_installed_header_list() + +# Only install the library in mdrun-only mode if it is actually necessary +# for the binary +if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS) + install(TARGETS libgromacs + EXPORT libgromacs + LIBRARY DESTINATION ${LIB_INSTALL_DIR} + RUNTIME DESTINATION ${BIN_INSTALL_DIR} + ARCHIVE DESTINATION ${LIB_INSTALL_DIR} + COMPONENT libraries) +endif() + +if (NOT GMX_BUILD_MDRUN_ONLY) + include(InstallLibInfo.cmake) +endif() + +# Technically, the user could want to do this for an OpenCL build +# using the CUDA runtime, but currently there's no reason to want to +# do that. +if (INSTALL_CUDART_LIB) #can be set manual by user + if (GMX_USE_CUDA) + foreach(CUDA_LIB ${CUDA_LIBRARIES}) + string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB}) + if(IS_CUDART) #libcuda should not be installed + #install also name-links (linker uses those) + file(GLOB CUDA_LIBS ${CUDA_LIB}*) + install(FILES ${CUDA_LIBS} DESTINATION + ${LIB_INSTALL_DIR} COMPONENT libraries) + endif() + endforeach() + else() + message(WARNING "INSTALL_CUDART_LIB only makes sense when configuring for CUDA support") + endif() +endif() + +if(GMX_USE_OPENCL) + set(OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS}) + + install(FILES ${OPENCL_KERNELS} DESTINATION + ${OCL_INSTALL_DIR} COMPONENT libraries) +endif() + +if (BUILD_TESTING) + add_subdirectory(compat/tests) +endif() diff --git a/patches/gromacs-2018.1.diff/src/gromacs/CMakeLists.txt.preplumed b/patches/gromacs-2018.1.diff/src/gromacs/CMakeLists.txt.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..a88015302587636c593e91149a664756f292b4da --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/gromacs/CMakeLists.txt.preplumed @@ -0,0 +1,265 @@ +# +# This file is part of the GROMACS molecular simulation package. +# +# Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by +# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, +# and including many others, as listed in the AUTHORS file in the +# top-level source directory and at http://www.gromacs.org. +# +# GROMACS is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License +# as published by the Free Software Foundation; either version 2.1 +# of the License, or (at your option) any later version. 
+# +# GROMACS is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with GROMACS; if not, see +# http://www.gnu.org/licenses, or write to the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# If you want to redistribute modifications to GROMACS, please +# consider that scientific software is very special. Version +# control is crucial - bugs must be traceable. We will be happy to +# consider code for inclusion in the official distribution, but +# derived work must not be called official GROMACS. Details are found +# in the README & COPYING files - if they are missing, get the +# official version at http://www.gromacs.org. +# +# To help us fund GROMACS development, we humbly ask that you cite +# the research papers on the package. Check out http://www.gromacs.org. + +set(LIBGROMACS_SOURCES) + +if (GMX_CLANG_CUDA) + include(gmxClangCudaUtils) +endif() + +set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES) +set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS) +set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE) + +function (_gmx_add_files_to_property PROPERTY) + foreach (_file ${ARGN}) + if (IS_ABSOLUTE "${_file}") + set_property(GLOBAL APPEND PROPERTY ${PROPERTY} ${_file}) + else() + set_property(GLOBAL APPEND PROPERTY ${PROPERTY} + ${CMAKE_CURRENT_LIST_DIR}/${_file}) + endif() + endforeach() +endfunction () + +function (gmx_add_libgromacs_sources) + _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN}) +endfunction () + +function (gmx_install_headers) + if (NOT GMX_BUILD_MDRUN_ONLY) + file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR}) + install(FILES ${ARGN} + DESTINATION "${INCL_INSTALL_DIR}/${_dest}" + COMPONENT development) + endif() + _gmx_add_files_to_property(GMX_INSTALLED_HEADERS ${ARGN}) +endfunction () + +function (gmx_write_installed_header_list) + get_property(_list GLOBAL PROPERTY GMX_INSTALLED_HEADERS) + string(REPLACE ";" "\n" _list "${_list}") + # TODO: Make this only update the file timestamp if the contents actually change. 
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/installed-headers.txt "${_list}") +endfunction() + +add_subdirectory(gmxlib) +add_subdirectory(mdlib) +add_subdirectory(applied-forces) +add_subdirectory(listed-forces) +add_subdirectory(commandline) +add_subdirectory(domdec) +add_subdirectory(ewald) +add_subdirectory(fft) +add_subdirectory(gpu_utils) +add_subdirectory(hardware) +add_subdirectory(linearalgebra) +add_subdirectory(math) +add_subdirectory(mdrunutility) +add_subdirectory(mdtypes) +add_subdirectory(onlinehelp) +add_subdirectory(options) +add_subdirectory(pbcutil) +add_subdirectory(random) +add_subdirectory(tables) +add_subdirectory(taskassignment) +add_subdirectory(timing) +add_subdirectory(topology) +add_subdirectory(trajectory) +add_subdirectory(utility) +add_subdirectory(fileio) +add_subdirectory(swap) +add_subdirectory(essentialdynamics) +add_subdirectory(pulling) +add_subdirectory(awh) +add_subdirectory(simd) +add_subdirectory(imd) +if (NOT GMX_BUILD_MDRUN_ONLY) + add_subdirectory(gmxana) + add_subdirectory(gmxpreprocess) + add_subdirectory(correlationfunctions) + add_subdirectory(statistics) + add_subdirectory(analysisdata) + add_subdirectory(selection) + add_subdirectory(trajectoryanalysis) + add_subdirectory(energyanalysis) + add_subdirectory(tools) +endif() + +get_property(PROPERTY_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES) +list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES} ${PROPERTY_SOURCES}) + +# This would be the standard way to include thread_mpi, but +# we want libgromacs to link the functions directly +#if(GMX_THREAD_MPI) +# add_subdirectory(thread_mpi) +#endif() +#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB}) + +tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src) +list(APPEND LIBGROMACS_SOURCES ${THREAD_MPI_SOURCES}) + +get_lmfit_properties(LMFIT_SOURCES LMFIT_LIBRARIES_TO_LINK LMFIT_INCLUDE_DIRECTORY LMFIT_INCLUDE_DIR_ORDER) +include_directories(${LMFIT_INCLUDE_DIR_ORDER} SYSTEM "${LMFIT_INCLUDE_DIRECTORY}") +list(APPEND LIBGROMACS_SOURCES ${LMFIT_SOURCES}) + +configure_file(version.h.cmakein version.h) +gmx_install_headers( + analysisdata.h + commandline.h + options.h + random.h + selection.h + trajectoryanalysis.h + utility.h + ${CMAKE_CURRENT_BINARY_DIR}/version.h + ) + +# This code is here instead of utility/CMakeLists.txt, because CMake +# custom commands and source file properties can only be set in the directory +# that contains the target that uses them. +# TODO: Generate a header instead that can be included from baseversion.c. +# That probably simplifies things somewhat. +set(GENERATED_VERSION_FILE utility/baseversion-gen.c) +gmx_configure_version_file( + utility/baseversion-gen.c.cmakein ${GENERATED_VERSION_FILE} + REMOTE_HASH) +list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE}) + +# set up CUDA compilation with clang +if (GMX_CLANG_CUDA) + foreach (_file ${LIBGROMACS_SOURCES}) + get_filename_component(_ext ${_file} EXT) + if (${_ext} STREQUAL ".cu") + gmx_compile_cuda_file_with_clang(${_file}) + endif() + endforeach() +endif() + +if (GMX_USE_CUDA) + # Work around FindCUDA that prevents using target_link_libraries() + # with keywords otherwise... 
+ set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES}) + if (NOT GMX_CLANG_CUDA) + cuda_add_library(libgromacs ${LIBGROMACS_SOURCES}) + else() + add_library(libgromacs ${LIBGROMACS_SOURCES}) + endif() + target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES}) +else() + add_library(libgromacs ${LIBGROMACS_SOURCES}) +endif() + +# Recent versions of gcc and clang give warnings on scanner.cpp, which +# is a generated source file. These are awkward to suppress inline, so +# we do it in the compilation command (after testing that the compiler +# supports the suppressions). +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag(-Wno-unused HAS_NO_UNUSED) +if (HAS_NO_UNUSED) + set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-unused-parameter -Wno-unused-function") +endif() +set_source_files_properties(selection/scanner.cpp PROPERTIES COMPILE_FLAGS "${_scanner_cpp_compiler_flags}") + +if(SIMD_AVX_512_CXX_SUPPORTED) + # Since we might be overriding -march=core-avx2, add a flag so we don't warn for this specific file + set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS} ${CXX_NO_UNUSED_OPTION_WARNING_FLAGS}") +endif() + +gmx_setup_tng_for_libgromacs() + +target_link_libraries(libgromacs + PRIVATE + ${EXTRAE_LIBRARIES} + ${GMX_EXTRA_LIBRARIES} + ${GMX_COMMON_LIBRARIES} + ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES} + ${LMFIT_LIBRARIES_TO_LINK} + ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${OPENCL_LIBRARIES} + ${GMX_STDLIB_LIBRARIES} + PUBLIC + ${GMX_PUBLIC_LIBRARIES} + ) +set_target_properties(libgromacs PROPERTIES + OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}" + SOVERSION ${LIBRARY_SOVERSION_MAJOR} + VERSION ${LIBRARY_VERSION} + COMPILE_FLAGS "${OpenMP_C_FLAGS}") + +gmx_write_installed_header_list() + +# Only install the library in mdrun-only mode if it is actually necessary +# for the binary +if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS) + install(TARGETS libgromacs + EXPORT libgromacs + LIBRARY DESTINATION ${LIB_INSTALL_DIR} + RUNTIME DESTINATION ${BIN_INSTALL_DIR} + ARCHIVE DESTINATION ${LIB_INSTALL_DIR} + COMPONENT libraries) +endif() + +if (NOT GMX_BUILD_MDRUN_ONLY) + include(InstallLibInfo.cmake) +endif() + +# Technically, the user could want to do this for an OpenCL build +# using the CUDA runtime, but currently there's no reason to want to +# do that. +if (INSTALL_CUDART_LIB) #can be set manual by user + if (GMX_USE_CUDA) + foreach(CUDA_LIB ${CUDA_LIBRARIES}) + string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB}) + if(IS_CUDART) #libcuda should not be installed + #install also name-links (linker uses those) + file(GLOB CUDA_LIBS ${CUDA_LIB}*) + install(FILES ${CUDA_LIBS} DESTINATION + ${LIB_INSTALL_DIR} COMPONENT libraries) + endif() + endforeach() + else() + message(WARNING "INSTALL_CUDART_LIB only makes sense when configuring for CUDA support") + endif() +endif() + +if(GMX_USE_OPENCL) + set(OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS}) + + install(FILES ${OPENCL_KERNELS} DESTINATION + ${OCL_INSTALL_DIR} COMPONENT libraries) +endif() + +if (BUILD_TESTING) + add_subdirectory(compat/tests) +endif() diff --git a/patches/gromacs-2018.1.diff/src/gromacs/mdlib/force.cpp b/patches/gromacs-2018.1.diff/src/gromacs/mdlib/force.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bc1e961f1c77387a4dd7f103c1fc51acbbca9119 --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/gromacs/mdlib/force.cpp @@ -0,0 +1,893 @@ +/* + * This file is part of the GROMACS molecular simulation package. 
+ * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. 
+ */ +#include "gmxpre.h" + +#include "force.h" + +#include "config.h" + +#include <assert.h> +#include <string.h> + +#include <cmath> + +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/ewald.h" +#include "gromacs/ewald/long-range-correction.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gmxlib/nonbonded/nonbonded.h" +#include "gromacs/listed-forces/listed-forces.h" +#include "gromacs/math/vec.h" +#include "gromacs/math/vecdump.h" +#include "gromacs/mdlib/forcerec-threading.h" +#include "gromacs/mdlib/genborn.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/qmmm.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/forceoutput.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/pbcutil/ishift.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/smalloc.h" +/* PLUMED */ +#include "../../../Plumed.h" +int plumedswitch=0; +plumed plumedmain; +void(*plumedcmd)(plumed,const char*,const void*)=NULL; +/* END PLUMED */ + + +void ns(FILE *fp, + t_forcerec *fr, + matrix box, + gmx_groups_t *groups, + gmx_localtop_t *top, + t_mdatoms *md, + t_commrec *cr, + t_nrnb *nrnb, + gmx_bool bFillGrid) +{ + int nsearch; + + + if (!fr->ns->nblist_initialized) + { + init_neighbor_list(fp, fr, md->homenr); + } + + nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md, + bFillGrid); + if (debug) + { + fprintf(debug, "nsearch = %d\n", nsearch); + } + + /* Check whether we have to do dynamic load balancing */ + /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0)) + count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr, + &(top->idef),opts->ngener); + */ + if (fr->ns->dump_nl > 0) + { + dump_nblist(fp, cr, fr, fr->ns->dump_nl); + } +} + +static void clearEwaldThreadOutput(ewald_corr_thread_t *ewc_t) +{ + ewc_t->Vcorr_q = 0; + ewc_t->Vcorr_lj = 0; + ewc_t->dvdl[efptCOUL] = 0; + ewc_t->dvdl[efptVDW] = 0; + clear_mat(ewc_t->vir_q); + clear_mat(ewc_t->vir_lj); +} + +static void reduceEwaldThreadOuput(int nthreads, ewald_corr_thread_t *ewc_t) +{ + ewald_corr_thread_t &dest = ewc_t[0]; + + for (int t = 1; t < nthreads; t++) + { + dest.Vcorr_q += ewc_t[t].Vcorr_q; + dest.Vcorr_lj += ewc_t[t].Vcorr_lj; + dest.dvdl[efptCOUL] += ewc_t[t].dvdl[efptCOUL]; + dest.dvdl[efptVDW] += ewc_t[t].dvdl[efptVDW]; + m_add(dest.vir_q, ewc_t[t].vir_q, dest.vir_q); + m_add(dest.vir_lj, ewc_t[t].vir_lj, dest.vir_lj); + } +} + +void do_force_lowlevel(t_forcerec *fr, t_inputrec *ir, + t_idef *idef, t_commrec *cr, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_mdatoms *md, + rvec x[], history_t *hist, + rvec *forceForUseWithShiftForces, + gmx::ForceWithVirial *forceWithVirial, + gmx_enerdata_t *enerd, + t_fcdata *fcd, + gmx_localtop_t *top, + gmx_genborn_t *born, + gmx_bool bBornRadii, + matrix box, + t_lambda *fepvals, + real *lambda, + t_graph *graph, + t_blocka *excl, + rvec mu_tot[], + int flags, + float *cycles_pme) +{ + int i, j; + int donb_flags; + int pme_flags; + t_pbc pbc; + real dvdl_dum[efptNR], dvdl_nb[efptNR]; + +#if GMX_MPI + double t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */ +#endif + + set_pbc(&pbc, fr->ePBC, box); + + /* reset free energy components */ 
+ for (i = 0; i < efptNR; i++) + { + dvdl_nb[i] = 0; + dvdl_dum[i] = 0; + } + + /* do QMMM first if requested */ + if (fr->bQMMM) + { + enerd->term[F_EQM] = calculate_QMMM(cr, forceForUseWithShiftForces, fr); + } + + /* Call the short range functions all in one go. */ + +#if GMX_MPI + /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/ +#define TAKETIME FALSE + if (TAKETIME) + { + MPI_Barrier(cr->mpi_comm_mygroup); + t0 = MPI_Wtime(); + } +#endif + + if (ir->nwall) + { + /* foreign lambda component for walls */ + real dvdl_walls = do_walls(ir, fr, box, md, x, forceForUseWithShiftForces, lambda[efptVDW], + enerd->grpp.ener[egLJSR], nrnb); + enerd->dvdl_lin[efptVDW] += dvdl_walls; + } + + /* If doing GB, reset dvda and calculate the Born radii */ + if (ir->implicit_solvent) + { + wallcycle_sub_start(wcycle, ewcsNONBONDED); + + for (i = 0; i < born->nr; i++) + { + fr->dvda[i] = 0; + } + + if (bBornRadii) + { + calc_gb_rad(cr, fr, ir, top, x, fr->gblist, born, md, nrnb); + } + + wallcycle_sub_stop(wcycle, ewcsNONBONDED); + } + + where(); + /* We only do non-bonded calculation with group scheme here, the verlet + * calls are done from do_force_cutsVERLET(). */ + if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED)) + { + donb_flags = 0; + /* Add short-range interactions */ + donb_flags |= GMX_NONBONDED_DO_SR; + + /* Currently all group scheme kernels always calculate (shift-)forces */ + if (flags & GMX_FORCE_FORCES) + { + donb_flags |= GMX_NONBONDED_DO_FORCE; + } + if (flags & GMX_FORCE_VIRIAL) + { + donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; + } + if (flags & GMX_FORCE_ENERGY) + { + donb_flags |= GMX_NONBONDED_DO_POTENTIAL; + } + + wallcycle_sub_start(wcycle, ewcsNONBONDED); + do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl, + &enerd->grpp, nrnb, + lambda, dvdl_nb, -1, -1, donb_flags); + + /* If we do foreign lambda and we have soft-core interactions + * we have to recalculate the (non-linear) energies contributions. + */ + if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0) + { + for (i = 0; i < enerd->n_lambda; i++) + { + real lam_i[efptNR]; + + for (j = 0; j < efptNR; j++) + { + lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); + } + reset_foreign_enerdata(enerd); + do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl, + &(enerd->foreign_grpp), nrnb, + lam_i, dvdl_dum, -1, -1, + (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA); + sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); + enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; + } + } + wallcycle_sub_stop(wcycle, ewcsNONBONDED); + where(); + } + + /* If we are doing GB, calculate bonded forces and apply corrections + * to the solvation forces */ + /* MRS: Eventually, many need to include free energy contribution here! 
*/ + if (ir->implicit_solvent) + { + wallcycle_sub_start(wcycle, ewcsLISTED); + calc_gb_forces(cr, md, born, top, x, forceForUseWithShiftForces, fr, idef, + ir->gb_algorithm, ir->sa_algorithm, nrnb, &pbc, graph, enerd); + wallcycle_sub_stop(wcycle, ewcsLISTED); + } + +#if GMX_MPI + if (TAKETIME) + { + t1 = MPI_Wtime(); + fr->t_fnbf += t1-t0; + } +#endif + + if (fepvals->sc_alpha != 0) + { + enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW]; + } + else + { + enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW]; + } + + if (fepvals->sc_alpha != 0) + + /* even though coulomb part is linear, we already added it, beacuse we + need to go through the vdw calculation anyway */ + { + enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL]; + } + else + { + enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL]; + } + + if (debug) + { + pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS); + } + + /* Shift the coordinates. Must be done before listed forces and PPPM, + * but is also necessary for SHAKE and update, therefore it can NOT + * go when no listed forces have to be evaluated. + * + * The shifting and PBC code is deliberately not timed, since with + * the Verlet scheme it only takes non-zero time with triclinic + * boxes, and even then the time is around a factor of 100 less + * than the next smallest counter. + */ + + + /* Here sometimes we would not need to shift with NBFonly, + * but we do so anyhow for consistency of the returned coordinates. + */ + if (graph) + { + shift_self(graph, box, x); + if (TRICLINIC(box)) + { + inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes); + } + else + { + inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes); + } + } + /* Check whether we need to do listed interactions or correct for exclusions */ + if (fr->bMolPBC && + ((flags & GMX_FORCE_LISTED) + || EEL_RF(fr->ic->eeltype) || EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))) + { + /* TODO There are no electrostatics methods that require this + transformation, when using the Verlet scheme, so update the + above conditional. */ + /* Since all atoms are in the rectangular or triclinic unit-cell, + * only single box vector shifts (2 in x) are required. + */ + set_pbc_dd(&pbc, fr->ePBC, DOMAINDECOMP(cr) ? cr->dd->nc : nullptr, + TRUE, box); + } + + do_force_listed(wcycle, box, ir->fepvals, cr, + idef, (const rvec *) x, hist, + forceForUseWithShiftForces, forceWithVirial, + fr, &pbc, graph, enerd, nrnb, lambda, md, fcd, + DOMAINDECOMP(cr) ? cr->dd->gatindex : nullptr, + flags); + + where(); + + *cycles_pme = 0; + + /* Do long-range electrostatics and/or LJ-PME, including related short-range + * corrections. + */ + if (EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + int status = 0; + real Vlr_q = 0, Vlr_lj = 0; + + /* We reduce all virial, dV/dlambda and energy contributions, except + * for the reciprocal energies (Vlr_q, Vlr_lj) into the same struct. + */ + ewald_corr_thread_t &ewaldOutput = fr->ewc_t[0]; + clearEwaldThreadOutput(&ewaldOutput); + + if (EEL_PME_EWALD(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + /* With the Verlet scheme exclusion forces are calculated + * in the non-bonded kernel. + */ + /* The TPI molecule does not have exclusions with the rest + * of the system and no intra-molecular PME grid + * contributions will be calculated in + * gmx_pme_calc_energy. 
+ */ + if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) || + ir->ewald_geometry != eewg3D || + ir->epsilon_surface != 0) + { + int nthreads, t; + + wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION); + + if (fr->n_tpi > 0) + { + gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions"); + } + + nthreads = fr->nthread_ewc; +#pragma omp parallel for num_threads(nthreads) schedule(static) + for (t = 0; t < nthreads; t++) + { + try + { + ewald_corr_thread_t &ewc_t = fr->ewc_t[t]; + if (t > 0) + { + clearEwaldThreadOutput(&ewc_t); + } + + /* Threading is only supported with the Verlet cut-off + * scheme and then only single particle forces (no + * exclusion forces) are calculated, so we can store + * the forces in the normal, single forceWithVirial->force_ array. + */ + ewald_LRcorrection(md->homenr, cr, nthreads, t, fr, ir, + md->chargeA, md->chargeB, + md->sqrt_c6A, md->sqrt_c6B, + md->sigmaA, md->sigmaB, + md->sigma3A, md->sigma3B, + md->nChargePerturbed || md->nTypePerturbed, + ir->cutoff_scheme != ecutsVERLET, + excl, x, box, mu_tot, + ir->ewald_geometry, + ir->epsilon_surface, + as_rvec_array(forceWithVirial->force_.data()), + ewc_t.vir_q, ewc_t.vir_lj, + &ewc_t.Vcorr_q, &ewc_t.Vcorr_lj, + lambda[efptCOUL], lambda[efptVDW], + &ewc_t.dvdl[efptCOUL], &ewc_t.dvdl[efptVDW]); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + if (nthreads > 1) + { + reduceEwaldThreadOuput(nthreads, fr->ewc_t); + } + wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); + } + + if (EEL_PME_EWALD(fr->ic->eeltype) && fr->n_tpi == 0) + { + /* This is not in a subcounter because it takes a + negligible and constant-sized amount of time */ + ewaldOutput.Vcorr_q += + ewald_charge_correction(cr, fr, lambda[efptCOUL], box, + &ewaldOutput.dvdl[efptCOUL], + ewaldOutput.vir_q); + } + + if ((EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) && + thisRankHasDuty(cr, DUTY_PME) && (pme_run_mode(fr->pmedata) == PmeRunMode::CPU)) + { + /* Do reciprocal PME for Coulomb and/or LJ. */ + assert(fr->n_tpi >= 0); + if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED)) + { + pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE; + + if (flags & GMX_FORCE_FORCES) + { + pme_flags |= GMX_PME_CALC_F; + } + if (flags & GMX_FORCE_VIRIAL) + { + pme_flags |= GMX_PME_CALC_ENER_VIR; + } + if (fr->n_tpi > 0) + { + /* We don't calculate f, but we do want the potential */ + pme_flags |= GMX_PME_CALC_POT; + } + + /* With domain decomposition we close the CPU side load + * balancing region here, because PME does global + * communication that acts as a global barrier. + */ + if (DOMAINDECOMP(cr)) + { + ddCloseBalanceRegionCpu(cr->dd); + } + + wallcycle_start(wcycle, ewcPMEMESH); + status = gmx_pme_do(fr->pmedata, + 0, md->homenr - fr->n_tpi, + x, + as_rvec_array(forceWithVirial->force_.data()), + md->chargeA, md->chargeB, + md->sqrt_c6A, md->sqrt_c6B, + md->sigmaA, md->sigmaB, + box, cr, + DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0, + DOMAINDECOMP(cr) ? 
dd_pme_maxshift_y(cr->dd) : 0, + nrnb, wcycle, + ewaldOutput.vir_q, ewaldOutput.vir_lj, + &Vlr_q, &Vlr_lj, + lambda[efptCOUL], lambda[efptVDW], + &ewaldOutput.dvdl[efptCOUL], + &ewaldOutput.dvdl[efptVDW], + pme_flags); + *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH); + if (status != 0) + { + gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status); + } + + /* We should try to do as little computation after + * this as possible, because parallel PME synchronizes + * the nodes, so we want all load imbalance of the + * rest of the force calculation to be before the PME + * call. DD load balancing is done on the whole time + * of the force call (without PME). + */ + } + if (fr->n_tpi > 0) + { + if (EVDW_PME(ir->vdwtype)) + { + + gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME"); + } + /* Determine the PME grid energy of the test molecule + * with the PME grid potential of the other charges. + */ + gmx_pme_calc_energy(fr->pmedata, fr->n_tpi, + x + md->homenr - fr->n_tpi, + md->chargeA + md->homenr - fr->n_tpi, + &Vlr_q); + } + } + } + + if (!EEL_PME(fr->ic->eeltype) && EEL_PME_EWALD(fr->ic->eeltype)) + { + Vlr_q = do_ewald(ir, x, as_rvec_array(forceWithVirial->force_.data()), + md->chargeA, md->chargeB, + box, cr, md->homenr, + ewaldOutput.vir_q, fr->ic->ewaldcoeff_q, + lambda[efptCOUL], &ewaldOutput.dvdl[efptCOUL], + fr->ewald_table); + } + + /* Note that with separate PME nodes we get the real energies later */ + forceWithVirial->addVirialContribution(ewaldOutput.vir_q); + forceWithVirial->addVirialContribution(ewaldOutput.vir_lj); + enerd->dvdl_lin[efptCOUL] += ewaldOutput.dvdl[efptCOUL]; + enerd->dvdl_lin[efptVDW] += ewaldOutput.dvdl[efptVDW]; + enerd->term[F_COUL_RECIP] = Vlr_q + ewaldOutput.Vcorr_q; + enerd->term[F_LJ_RECIP] = Vlr_lj + ewaldOutput.Vcorr_lj; + + if (debug) + { + fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n", + Vlr_q, ewaldOutput.Vcorr_q, enerd->term[F_COUL_RECIP]); + pr_rvecs(debug, 0, "vir_el_recip after corr", ewaldOutput.vir_q, DIM); + pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS); + fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n", + Vlr_lj, ewaldOutput.Vcorr_lj, enerd->term[F_LJ_RECIP]); + pr_rvecs(debug, 0, "vir_lj_recip after corr", ewaldOutput.vir_lj, DIM); + } + } + else + { + /* Is there a reaction-field exclusion correction needed? + * With the Verlet scheme, exclusion forces are calculated + * in the non-bonded kernel. 
+ */ + if (ir->cutoff_scheme != ecutsVERLET && EEL_RF(fr->ic->eeltype)) + { + real dvdl_rf_excl = 0; + enerd->term[F_RF_EXCL] = + RF_excl_correction(fr, graph, md, excl, x, forceForUseWithShiftForces, + fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl); + + enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl; + } + } + where(); + + if (debug) + { + print_nrnb(debug, nrnb); + } + +#if GMX_MPI + if (TAKETIME) + { + t2 = MPI_Wtime(); + MPI_Barrier(cr->mpi_comm_mygroup); + t3 = MPI_Wtime(); + fr->t_wait += t3-t2; + if (fr->timesteps == 11) + { + char buf[22]; + fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n", + cr->nodeid, gmx_step_str(fr->timesteps, buf), + 100*fr->t_wait/(fr->t_wait+fr->t_fnbf), + (fr->t_fnbf+fr->t_wait)/fr->t_fnbf); + } + fr->timesteps++; + } +#endif + + if (debug) + { + pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); + } + + /* PLUMED */ + if(plumedswitch){ + int plumedNeedsEnergy; + (*plumedcmd)(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); + if(!plumedNeedsEnergy) (*plumedcmd)(plumedmain,"performCalc",NULL); + } + /* END PLUMED */ +} + +void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) +{ + int i, n2; + + for (i = 0; i < F_NRE; i++) + { + enerd->term[i] = 0; + enerd->foreign_term[i] = 0; + } + + + for (i = 0; i < efptNR; i++) + { + enerd->dvdl_lin[i] = 0; + enerd->dvdl_nonlin[i] = 0; + } + + n2 = ngener*ngener; + if (debug) + { + fprintf(debug, "Creating %d sized group matrix for energies\n", n2); + } + enerd->grpp.nener = n2; + enerd->foreign_grpp.nener = n2; + for (i = 0; (i < egNR); i++) + { + snew(enerd->grpp.ener[i], n2); + snew(enerd->foreign_grpp.ener[i], n2); + } + + if (n_lambda) + { + enerd->n_lambda = 1 + n_lambda; + snew(enerd->enerpart_lambda, enerd->n_lambda); + } + else + { + enerd->n_lambda = 0; + } +} + +void destroy_enerdata(gmx_enerdata_t *enerd) +{ + int i; + + for (i = 0; (i < egNR); i++) + { + sfree(enerd->grpp.ener[i]); + } + + for (i = 0; (i < egNR); i++) + { + sfree(enerd->foreign_grpp.ener[i]); + } + + if (enerd->n_lambda) + { + sfree(enerd->enerpart_lambda); + } +} + +static real sum_v(int n, real v[]) +{ + real t; + int i; + + t = 0.0; + for (i = 0; (i < n); i++) + { + t = t + v[i]; + } + + return t; +} + +void sum_epot(gmx_grppairener_t *grpp, real *epot) +{ + int i; + + /* Accumulate energies */ + epot[F_COUL_SR] = sum_v(grpp->nener, grpp->ener[egCOULSR]); + epot[F_LJ] = sum_v(grpp->nener, grpp->ener[egLJSR]); + epot[F_LJ14] = sum_v(grpp->nener, grpp->ener[egLJ14]); + epot[F_COUL14] = sum_v(grpp->nener, grpp->ener[egCOUL14]); + /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */ + epot[F_GBPOL] += sum_v(grpp->nener, grpp->ener[egGB]); + +/* lattice part of LR doesnt belong to any group + * and has been added earlier + */ + epot[F_BHAM] = sum_v(grpp->nener, grpp->ener[egBHAMSR]); + + epot[F_EPOT] = 0; + for (i = 0; (i < F_EPOT); i++) + { + if (i != F_DISRESVIOL && i != F_ORIRESDEV) + { + epot[F_EPOT] += epot[i]; + } + } +} + +void sum_dhdl(gmx_enerdata_t *enerd, gmx::ArrayRef<const real> lambda, t_lambda *fepvals) +{ + int index; + double dlam; + + enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW]; /* include dispersion correction */ + enerd->term[F_DVDL] = 0.0; + for (int i = 0; i < efptNR; i++) + { + if (fepvals->separate_dvdl[i]) + { + /* could this be done more readably/compactly? 
*/ + switch (i) + { + case (efptMASS): + index = F_DKDL; + break; + case (efptCOUL): + index = F_DVDL_COUL; + break; + case (efptVDW): + index = F_DVDL_VDW; + break; + case (efptBONDED): + index = F_DVDL_BONDED; + break; + case (efptRESTRAINT): + index = F_DVDL_RESTRAINT; + break; + default: + index = F_DVDL; + break; + } + enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; + if (debug) + { + fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n", + efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); + } + } + else + { + enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; + if (debug) + { + fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n", + efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); + } + } + } + + /* Notes on the foreign lambda free energy difference evaluation: + * Adding the potential and ekin terms that depend linearly on lambda + * as delta lam * dvdl to the energy differences is exact. + * For the constraints this is not exact, but we have no other option + * without literally changing the lengths and reevaluating the energies at each step. + * (try to remedy this post 4.6 - MRS) + */ + if (fepvals->separate_dvdl[efptBONDED]) + { + enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR]; + } + else + { + enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR]; + } + enerd->term[F_DVDL_CONSTR] = 0; + + for (int i = 0; i < fepvals->n_lambda; i++) + { + /* note we are iterating over fepvals here! + For the current lam, dlam = 0 automatically, + so we don't need to add anything to the + enerd->enerpart_lambda[0] */ + + /* we don't need to worry about dvdl_lin contributions to dE at + current lambda, because the contributions to the current + lambda are automatically zeroed */ + + for (size_t j = 0; j < lambda.size(); j++) + { + /* Note that this loop is over all dhdl components, not just the separated ones */ + dlam = (fepvals->all_lambda[j][i] - lambda[j]); + enerd->enerpart_lambda[i+1] += dlam*enerd->dvdl_lin[j]; + if (debug) + { + fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n", + fepvals->all_lambda[j][i], efpt_names[j], + (enerd->enerpart_lambda[i+1] - enerd->enerpart_lambda[0]), + dlam, enerd->dvdl_lin[j]); + } + } + } +} + + +void reset_foreign_enerdata(gmx_enerdata_t *enerd) +{ + int i, j; + + /* First reset all foreign energy components. Foreign energies always called on + neighbor search steps */ + for (i = 0; (i < egNR); i++) + { + for (j = 0; (j < enerd->grpp.nener); j++) + { + enerd->foreign_grpp.ener[i][j] = 0.0; + } + } + + /* potential energy components */ + for (i = 0; (i <= F_EPOT); i++) + { + enerd->foreign_term[i] = 0.0; + } +} + +void reset_enerdata(gmx_enerdata_t *enerd) +{ + int i, j; + + /* First reset all energy components. 
*/ + for (i = 0; (i < egNR); i++) + { + for (j = 0; (j < enerd->grpp.nener); j++) + { + enerd->grpp.ener[i][j] = 0.0; + } + } + for (i = 0; i < efptNR; i++) + { + enerd->dvdl_lin[i] = 0.0; + enerd->dvdl_nonlin[i] = 0.0; + } + + /* Normal potential energy components */ + for (i = 0; (i <= F_EPOT); i++) + { + enerd->term[i] = 0.0; + } + enerd->term[F_DVDL] = 0.0; + enerd->term[F_DVDL_COUL] = 0.0; + enerd->term[F_DVDL_VDW] = 0.0; + enerd->term[F_DVDL_BONDED] = 0.0; + enerd->term[F_DVDL_RESTRAINT] = 0.0; + enerd->term[F_DKDL] = 0.0; + if (enerd->n_lambda > 0) + { + for (i = 0; i < enerd->n_lambda; i++) + { + enerd->enerpart_lambda[i] = 0.0; + } + } + /* reset foreign energy data - separate function since we also call it elsewhere */ + reset_foreign_enerdata(enerd); +} diff --git a/patches/gromacs-2018.1.diff/src/gromacs/mdlib/force.cpp.preplumed b/patches/gromacs-2018.1.diff/src/gromacs/mdlib/force.cpp.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..f01ea4d333a52e51e9f192758e13b44295114843 --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/gromacs/mdlib/force.cpp.preplumed @@ -0,0 +1,879 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. 
+ */ +#include "gmxpre.h" + +#include "force.h" + +#include "config.h" + +#include <assert.h> +#include <string.h> + +#include <cmath> + +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/ewald.h" +#include "gromacs/ewald/long-range-correction.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gmxlib/nonbonded/nonbonded.h" +#include "gromacs/listed-forces/listed-forces.h" +#include "gromacs/math/vec.h" +#include "gromacs/math/vecdump.h" +#include "gromacs/mdlib/forcerec-threading.h" +#include "gromacs/mdlib/genborn.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/qmmm.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/forceoutput.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/pbcutil/ishift.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/smalloc.h" + +void ns(FILE *fp, + t_forcerec *fr, + matrix box, + gmx_groups_t *groups, + gmx_localtop_t *top, + t_mdatoms *md, + t_commrec *cr, + t_nrnb *nrnb, + gmx_bool bFillGrid) +{ + int nsearch; + + + if (!fr->ns->nblist_initialized) + { + init_neighbor_list(fp, fr, md->homenr); + } + + nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md, + bFillGrid); + if (debug) + { + fprintf(debug, "nsearch = %d\n", nsearch); + } + + /* Check whether we have to do dynamic load balancing */ + /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0)) + count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr, + &(top->idef),opts->ngener); + */ + if (fr->ns->dump_nl > 0) + { + dump_nblist(fp, cr, fr, fr->ns->dump_nl); + } +} + +static void clearEwaldThreadOutput(ewald_corr_thread_t *ewc_t) +{ + ewc_t->Vcorr_q = 0; + ewc_t->Vcorr_lj = 0; + ewc_t->dvdl[efptCOUL] = 0; + ewc_t->dvdl[efptVDW] = 0; + clear_mat(ewc_t->vir_q); + clear_mat(ewc_t->vir_lj); +} + +static void reduceEwaldThreadOuput(int nthreads, ewald_corr_thread_t *ewc_t) +{ + ewald_corr_thread_t &dest = ewc_t[0]; + + for (int t = 1; t < nthreads; t++) + { + dest.Vcorr_q += ewc_t[t].Vcorr_q; + dest.Vcorr_lj += ewc_t[t].Vcorr_lj; + dest.dvdl[efptCOUL] += ewc_t[t].dvdl[efptCOUL]; + dest.dvdl[efptVDW] += ewc_t[t].dvdl[efptVDW]; + m_add(dest.vir_q, ewc_t[t].vir_q, dest.vir_q); + m_add(dest.vir_lj, ewc_t[t].vir_lj, dest.vir_lj); + } +} + +void do_force_lowlevel(t_forcerec *fr, t_inputrec *ir, + t_idef *idef, t_commrec *cr, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_mdatoms *md, + rvec x[], history_t *hist, + rvec *forceForUseWithShiftForces, + gmx::ForceWithVirial *forceWithVirial, + gmx_enerdata_t *enerd, + t_fcdata *fcd, + gmx_localtop_t *top, + gmx_genborn_t *born, + gmx_bool bBornRadii, + matrix box, + t_lambda *fepvals, + real *lambda, + t_graph *graph, + t_blocka *excl, + rvec mu_tot[], + int flags, + float *cycles_pme) +{ + int i, j; + int donb_flags; + int pme_flags; + t_pbc pbc; + real dvdl_dum[efptNR], dvdl_nb[efptNR]; + +#if GMX_MPI + double t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */ +#endif + + set_pbc(&pbc, fr->ePBC, box); + + /* reset free energy components */ + for (i = 0; i < efptNR; i++) + { + dvdl_nb[i] = 0; + dvdl_dum[i] = 0; + } + + /* do QMMM first if requested */ + if (fr->bQMMM) + { + enerd->term[F_EQM] = 
calculate_QMMM(cr, forceForUseWithShiftForces, fr); + } + + /* Call the short range functions all in one go. */ + +#if GMX_MPI + /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/ +#define TAKETIME FALSE + if (TAKETIME) + { + MPI_Barrier(cr->mpi_comm_mygroup); + t0 = MPI_Wtime(); + } +#endif + + if (ir->nwall) + { + /* foreign lambda component for walls */ + real dvdl_walls = do_walls(ir, fr, box, md, x, forceForUseWithShiftForces, lambda[efptVDW], + enerd->grpp.ener[egLJSR], nrnb); + enerd->dvdl_lin[efptVDW] += dvdl_walls; + } + + /* If doing GB, reset dvda and calculate the Born radii */ + if (ir->implicit_solvent) + { + wallcycle_sub_start(wcycle, ewcsNONBONDED); + + for (i = 0; i < born->nr; i++) + { + fr->dvda[i] = 0; + } + + if (bBornRadii) + { + calc_gb_rad(cr, fr, ir, top, x, fr->gblist, born, md, nrnb); + } + + wallcycle_sub_stop(wcycle, ewcsNONBONDED); + } + + where(); + /* We only do non-bonded calculation with group scheme here, the verlet + * calls are done from do_force_cutsVERLET(). */ + if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED)) + { + donb_flags = 0; + /* Add short-range interactions */ + donb_flags |= GMX_NONBONDED_DO_SR; + + /* Currently all group scheme kernels always calculate (shift-)forces */ + if (flags & GMX_FORCE_FORCES) + { + donb_flags |= GMX_NONBONDED_DO_FORCE; + } + if (flags & GMX_FORCE_VIRIAL) + { + donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; + } + if (flags & GMX_FORCE_ENERGY) + { + donb_flags |= GMX_NONBONDED_DO_POTENTIAL; + } + + wallcycle_sub_start(wcycle, ewcsNONBONDED); + do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl, + &enerd->grpp, nrnb, + lambda, dvdl_nb, -1, -1, donb_flags); + + /* If we do foreign lambda and we have soft-core interactions + * we have to recalculate the (non-linear) energies contributions. + */ + if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0) + { + for (i = 0; i < enerd->n_lambda; i++) + { + real lam_i[efptNR]; + + for (j = 0; j < efptNR; j++) + { + lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); + } + reset_foreign_enerdata(enerd); + do_nonbonded(fr, x, forceForUseWithShiftForces, md, excl, + &(enerd->foreign_grpp), nrnb, + lam_i, dvdl_dum, -1, -1, + (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA); + sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); + enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; + } + } + wallcycle_sub_stop(wcycle, ewcsNONBONDED); + where(); + } + + /* If we are doing GB, calculate bonded forces and apply corrections + * to the solvation forces */ + /* MRS: Eventually, many need to include free energy contribution here! 
*/ + if (ir->implicit_solvent) + { + wallcycle_sub_start(wcycle, ewcsLISTED); + calc_gb_forces(cr, md, born, top, x, forceForUseWithShiftForces, fr, idef, + ir->gb_algorithm, ir->sa_algorithm, nrnb, &pbc, graph, enerd); + wallcycle_sub_stop(wcycle, ewcsLISTED); + } + +#if GMX_MPI + if (TAKETIME) + { + t1 = MPI_Wtime(); + fr->t_fnbf += t1-t0; + } +#endif + + if (fepvals->sc_alpha != 0) + { + enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW]; + } + else + { + enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW]; + } + + if (fepvals->sc_alpha != 0) + + /* even though coulomb part is linear, we already added it, beacuse we + need to go through the vdw calculation anyway */ + { + enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL]; + } + else + { + enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL]; + } + + if (debug) + { + pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS); + } + + /* Shift the coordinates. Must be done before listed forces and PPPM, + * but is also necessary for SHAKE and update, therefore it can NOT + * go when no listed forces have to be evaluated. + * + * The shifting and PBC code is deliberately not timed, since with + * the Verlet scheme it only takes non-zero time with triclinic + * boxes, and even then the time is around a factor of 100 less + * than the next smallest counter. + */ + + + /* Here sometimes we would not need to shift with NBFonly, + * but we do so anyhow for consistency of the returned coordinates. + */ + if (graph) + { + shift_self(graph, box, x); + if (TRICLINIC(box)) + { + inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes); + } + else + { + inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes); + } + } + /* Check whether we need to do listed interactions or correct for exclusions */ + if (fr->bMolPBC && + ((flags & GMX_FORCE_LISTED) + || EEL_RF(fr->ic->eeltype) || EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))) + { + /* TODO There are no electrostatics methods that require this + transformation, when using the Verlet scheme, so update the + above conditional. */ + /* Since all atoms are in the rectangular or triclinic unit-cell, + * only single box vector shifts (2 in x) are required. + */ + set_pbc_dd(&pbc, fr->ePBC, DOMAINDECOMP(cr) ? cr->dd->nc : nullptr, + TRUE, box); + } + + do_force_listed(wcycle, box, ir->fepvals, cr, + idef, (const rvec *) x, hist, + forceForUseWithShiftForces, forceWithVirial, + fr, &pbc, graph, enerd, nrnb, lambda, md, fcd, + DOMAINDECOMP(cr) ? cr->dd->gatindex : nullptr, + flags); + + where(); + + *cycles_pme = 0; + + /* Do long-range electrostatics and/or LJ-PME, including related short-range + * corrections. + */ + if (EEL_FULL(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + int status = 0; + real Vlr_q = 0, Vlr_lj = 0; + + /* We reduce all virial, dV/dlambda and energy contributions, except + * for the reciprocal energies (Vlr_q, Vlr_lj) into the same struct. + */ + ewald_corr_thread_t &ewaldOutput = fr->ewc_t[0]; + clearEwaldThreadOutput(&ewaldOutput); + + if (EEL_PME_EWALD(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + /* With the Verlet scheme exclusion forces are calculated + * in the non-bonded kernel. + */ + /* The TPI molecule does not have exclusions with the rest + * of the system and no intra-molecular PME grid + * contributions will be calculated in + * gmx_pme_calc_energy. 
+ */ + if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) || + ir->ewald_geometry != eewg3D || + ir->epsilon_surface != 0) + { + int nthreads, t; + + wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION); + + if (fr->n_tpi > 0) + { + gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions"); + } + + nthreads = fr->nthread_ewc; +#pragma omp parallel for num_threads(nthreads) schedule(static) + for (t = 0; t < nthreads; t++) + { + try + { + ewald_corr_thread_t &ewc_t = fr->ewc_t[t]; + if (t > 0) + { + clearEwaldThreadOutput(&ewc_t); + } + + /* Threading is only supported with the Verlet cut-off + * scheme and then only single particle forces (no + * exclusion forces) are calculated, so we can store + * the forces in the normal, single forceWithVirial->force_ array. + */ + ewald_LRcorrection(md->homenr, cr, nthreads, t, fr, ir, + md->chargeA, md->chargeB, + md->sqrt_c6A, md->sqrt_c6B, + md->sigmaA, md->sigmaB, + md->sigma3A, md->sigma3B, + md->nChargePerturbed || md->nTypePerturbed, + ir->cutoff_scheme != ecutsVERLET, + excl, x, box, mu_tot, + ir->ewald_geometry, + ir->epsilon_surface, + as_rvec_array(forceWithVirial->force_.data()), + ewc_t.vir_q, ewc_t.vir_lj, + &ewc_t.Vcorr_q, &ewc_t.Vcorr_lj, + lambda[efptCOUL], lambda[efptVDW], + &ewc_t.dvdl[efptCOUL], &ewc_t.dvdl[efptVDW]); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + if (nthreads > 1) + { + reduceEwaldThreadOuput(nthreads, fr->ewc_t); + } + wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); + } + + if (EEL_PME_EWALD(fr->ic->eeltype) && fr->n_tpi == 0) + { + /* This is not in a subcounter because it takes a + negligible and constant-sized amount of time */ + ewaldOutput.Vcorr_q += + ewald_charge_correction(cr, fr, lambda[efptCOUL], box, + &ewaldOutput.dvdl[efptCOUL], + ewaldOutput.vir_q); + } + + if ((EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) && + thisRankHasDuty(cr, DUTY_PME) && (pme_run_mode(fr->pmedata) == PmeRunMode::CPU)) + { + /* Do reciprocal PME for Coulomb and/or LJ. */ + assert(fr->n_tpi >= 0); + if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED)) + { + pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE; + + if (flags & GMX_FORCE_FORCES) + { + pme_flags |= GMX_PME_CALC_F; + } + if (flags & GMX_FORCE_VIRIAL) + { + pme_flags |= GMX_PME_CALC_ENER_VIR; + } + if (fr->n_tpi > 0) + { + /* We don't calculate f, but we do want the potential */ + pme_flags |= GMX_PME_CALC_POT; + } + + /* With domain decomposition we close the CPU side load + * balancing region here, because PME does global + * communication that acts as a global barrier. + */ + if (DOMAINDECOMP(cr)) + { + ddCloseBalanceRegionCpu(cr->dd); + } + + wallcycle_start(wcycle, ewcPMEMESH); + status = gmx_pme_do(fr->pmedata, + 0, md->homenr - fr->n_tpi, + x, + as_rvec_array(forceWithVirial->force_.data()), + md->chargeA, md->chargeB, + md->sqrt_c6A, md->sqrt_c6B, + md->sigmaA, md->sigmaB, + box, cr, + DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0, + DOMAINDECOMP(cr) ? 
dd_pme_maxshift_y(cr->dd) : 0, + nrnb, wcycle, + ewaldOutput.vir_q, ewaldOutput.vir_lj, + &Vlr_q, &Vlr_lj, + lambda[efptCOUL], lambda[efptVDW], + &ewaldOutput.dvdl[efptCOUL], + &ewaldOutput.dvdl[efptVDW], + pme_flags); + *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH); + if (status != 0) + { + gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status); + } + + /* We should try to do as little computation after + * this as possible, because parallel PME synchronizes + * the nodes, so we want all load imbalance of the + * rest of the force calculation to be before the PME + * call. DD load balancing is done on the whole time + * of the force call (without PME). + */ + } + if (fr->n_tpi > 0) + { + if (EVDW_PME(ir->vdwtype)) + { + + gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME"); + } + /* Determine the PME grid energy of the test molecule + * with the PME grid potential of the other charges. + */ + gmx_pme_calc_energy(fr->pmedata, fr->n_tpi, + x + md->homenr - fr->n_tpi, + md->chargeA + md->homenr - fr->n_tpi, + &Vlr_q); + } + } + } + + if (!EEL_PME(fr->ic->eeltype) && EEL_PME_EWALD(fr->ic->eeltype)) + { + Vlr_q = do_ewald(ir, x, as_rvec_array(forceWithVirial->force_.data()), + md->chargeA, md->chargeB, + box, cr, md->homenr, + ewaldOutput.vir_q, fr->ic->ewaldcoeff_q, + lambda[efptCOUL], &ewaldOutput.dvdl[efptCOUL], + fr->ewald_table); + } + + /* Note that with separate PME nodes we get the real energies later */ + forceWithVirial->addVirialContribution(ewaldOutput.vir_q); + forceWithVirial->addVirialContribution(ewaldOutput.vir_lj); + enerd->dvdl_lin[efptCOUL] += ewaldOutput.dvdl[efptCOUL]; + enerd->dvdl_lin[efptVDW] += ewaldOutput.dvdl[efptVDW]; + enerd->term[F_COUL_RECIP] = Vlr_q + ewaldOutput.Vcorr_q; + enerd->term[F_LJ_RECIP] = Vlr_lj + ewaldOutput.Vcorr_lj; + + if (debug) + { + fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n", + Vlr_q, ewaldOutput.Vcorr_q, enerd->term[F_COUL_RECIP]); + pr_rvecs(debug, 0, "vir_el_recip after corr", ewaldOutput.vir_q, DIM); + pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS); + fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n", + Vlr_lj, ewaldOutput.Vcorr_lj, enerd->term[F_LJ_RECIP]); + pr_rvecs(debug, 0, "vir_lj_recip after corr", ewaldOutput.vir_lj, DIM); + } + } + else + { + /* Is there a reaction-field exclusion correction needed? + * With the Verlet scheme, exclusion forces are calculated + * in the non-bonded kernel. 
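+ * The correction below is therefore only applied with the group cut-off scheme.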
+ */ + if (ir->cutoff_scheme != ecutsVERLET && EEL_RF(fr->ic->eeltype)) + { + real dvdl_rf_excl = 0; + enerd->term[F_RF_EXCL] = + RF_excl_correction(fr, graph, md, excl, x, forceForUseWithShiftForces, + fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl); + + enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl; + } + } + where(); + + if (debug) + { + print_nrnb(debug, nrnb); + } + +#if GMX_MPI + if (TAKETIME) + { + t2 = MPI_Wtime(); + MPI_Barrier(cr->mpi_comm_mygroup); + t3 = MPI_Wtime(); + fr->t_wait += t3-t2; + if (fr->timesteps == 11) + { + char buf[22]; + fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n", + cr->nodeid, gmx_step_str(fr->timesteps, buf), + 100*fr->t_wait/(fr->t_wait+fr->t_fnbf), + (fr->t_fnbf+fr->t_wait)/fr->t_fnbf); + } + fr->timesteps++; + } +#endif + + if (debug) + { + pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); + } + +} + +void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) +{ + int i, n2; + + for (i = 0; i < F_NRE; i++) + { + enerd->term[i] = 0; + enerd->foreign_term[i] = 0; + } + + + for (i = 0; i < efptNR; i++) + { + enerd->dvdl_lin[i] = 0; + enerd->dvdl_nonlin[i] = 0; + } + + n2 = ngener*ngener; + if (debug) + { + fprintf(debug, "Creating %d sized group matrix for energies\n", n2); + } + enerd->grpp.nener = n2; + enerd->foreign_grpp.nener = n2; + for (i = 0; (i < egNR); i++) + { + snew(enerd->grpp.ener[i], n2); + snew(enerd->foreign_grpp.ener[i], n2); + } + + if (n_lambda) + { + enerd->n_lambda = 1 + n_lambda; + snew(enerd->enerpart_lambda, enerd->n_lambda); + } + else + { + enerd->n_lambda = 0; + } +} + +void destroy_enerdata(gmx_enerdata_t *enerd) +{ + int i; + + for (i = 0; (i < egNR); i++) + { + sfree(enerd->grpp.ener[i]); + } + + for (i = 0; (i < egNR); i++) + { + sfree(enerd->foreign_grpp.ener[i]); + } + + if (enerd->n_lambda) + { + sfree(enerd->enerpart_lambda); + } +} + +static real sum_v(int n, real v[]) +{ + real t; + int i; + + t = 0.0; + for (i = 0; (i < n); i++) + { + t = t + v[i]; + } + + return t; +} + +void sum_epot(gmx_grppairener_t *grpp, real *epot) +{ + int i; + + /* Accumulate energies */ + epot[F_COUL_SR] = sum_v(grpp->nener, grpp->ener[egCOULSR]); + epot[F_LJ] = sum_v(grpp->nener, grpp->ener[egLJSR]); + epot[F_LJ14] = sum_v(grpp->nener, grpp->ener[egLJ14]); + epot[F_COUL14] = sum_v(grpp->nener, grpp->ener[egCOUL14]); + /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */ + epot[F_GBPOL] += sum_v(grpp->nener, grpp->ener[egGB]); + +/* lattice part of LR doesn't belong to any group + * and has been added earlier + */ + epot[F_BHAM] = sum_v(grpp->nener, grpp->ener[egBHAMSR]); + + epot[F_EPOT] = 0; + for (i = 0; (i < F_EPOT); i++) + { + if (i != F_DISRESVIOL && i != F_ORIRESDEV) + { + epot[F_EPOT] += epot[i]; + } + } +} + +void sum_dhdl(gmx_enerdata_t *enerd, gmx::ArrayRef<const real> lambda, t_lambda *fepvals) +{ + int index; + double dlam; + + enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW]; /* include dispersion correction */ + enerd->term[F_DVDL] = 0.0; + for (int i = 0; i < efptNR; i++) + { + if (fepvals->separate_dvdl[i]) + { + /* could this be done more readably/compactly?
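+ * The switch below just maps each separated lambda component to its F_DVDL_* energy-term index.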
*/ + switch (i) + { + case (efptMASS): + index = F_DKDL; + break; + case (efptCOUL): + index = F_DVDL_COUL; + break; + case (efptVDW): + index = F_DVDL_VDW; + break; + case (efptBONDED): + index = F_DVDL_BONDED; + break; + case (efptRESTRAINT): + index = F_DVDL_RESTRAINT; + break; + default: + index = F_DVDL; + break; + } + enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; + if (debug) + { + fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n", + efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); + } + } + else + { + enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; + if (debug) + { + fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n", + efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); + } + } + } + + /* Notes on the foreign lambda free energy difference evaluation: + * Adding the potential and ekin terms that depend linearly on lambda + * as delta lam * dvdl to the energy differences is exact. + * For the constraints this is not exact, but we have no other option + * without literally changing the lengths and reevaluating the energies at each step. + * (try to remedy this post 4.6 - MRS) + */ + if (fepvals->separate_dvdl[efptBONDED]) + { + enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR]; + } + else + { + enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR]; + } + enerd->term[F_DVDL_CONSTR] = 0; + + for (int i = 0; i < fepvals->n_lambda; i++) + { + /* note we are iterating over fepvals here! + For the current lam, dlam = 0 automatically, + so we don't need to add anything to the + enerd->enerpart_lambda[0] */ + + /* we don't need to worry about dvdl_lin contributions to dE at + current lambda, because the contributions to the current + lambda are automatically zeroed */ + + for (size_t j = 0; j < lambda.size(); j++) + { + /* Note that this loop is over all dhdl components, not just the separated ones */ + dlam = (fepvals->all_lambda[j][i] - lambda[j]); + enerd->enerpart_lambda[i+1] += dlam*enerd->dvdl_lin[j]; + if (debug) + { + fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n", + fepvals->all_lambda[j][i], efpt_names[j], + (enerd->enerpart_lambda[i+1] - enerd->enerpart_lambda[0]), + dlam, enerd->dvdl_lin[j]); + } + } + } +} + + +void reset_foreign_enerdata(gmx_enerdata_t *enerd) +{ + int i, j; + + /* First reset all foreign energy components. Foreign energy evaluation is always done on + neighbor search steps */ + for (i = 0; (i < egNR); i++) + { + for (j = 0; (j < enerd->grpp.nener); j++) + { + enerd->foreign_grpp.ener[i][j] = 0.0; + } + } + + /* potential energy components */ + for (i = 0; (i <= F_EPOT); i++) + { + enerd->foreign_term[i] = 0.0; + } +} + +void reset_enerdata(gmx_enerdata_t *enerd) +{ + int i, j; + + /* First reset all energy components.
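+ * This zeroes the group-pair energies, the dvdl components and the potential-energy terms; the foreign-lambda data is reset via reset_foreign_enerdata() at the end.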
*/ + for (i = 0; (i < egNR); i++) + { + for (j = 0; (j < enerd->grpp.nener); j++) + { + enerd->grpp.ener[i][j] = 0.0; + } + } + for (i = 0; i < efptNR; i++) + { + enerd->dvdl_lin[i] = 0.0; + enerd->dvdl_nonlin[i] = 0.0; + } + + /* Normal potential energy components */ + for (i = 0; (i <= F_EPOT); i++) + { + enerd->term[i] = 0.0; + } + enerd->term[F_DVDL] = 0.0; + enerd->term[F_DVDL_COUL] = 0.0; + enerd->term[F_DVDL_VDW] = 0.0; + enerd->term[F_DVDL_BONDED] = 0.0; + enerd->term[F_DVDL_RESTRAINT] = 0.0; + enerd->term[F_DKDL] = 0.0; + if (enerd->n_lambda > 0) + { + for (i = 0; i < enerd->n_lambda; i++) + { + enerd->enerpart_lambda[i] = 0.0; + } + } + /* reset foreign energy data - separate function since we also call it elsewhere */ + reset_foreign_enerdata(enerd); +} diff --git a/patches/gromacs-2018.1.diff/src/gromacs/mdlib/minimize.cpp b/patches/gromacs-2018.1.diff/src/gromacs/mdlib/minimize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8fcef56734af9af59e45975c542c22f2ffadef62 --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/gromacs/mdlib/minimize.cpp @@ -0,0 +1,3052 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! 
\internal \file + * + * \brief This file defines integrators for energy minimization + * + * \author Berk Hess <hess@kth.se> + * \author Erik Lindahl <erik@kth.se> + * \ingroup module_mdlib + */ +#include "gmxpre.h" + +#include "minimize.h" + +#include "config.h" + +#include <cmath> +#include <cstring> +#include <ctime> + +#include <algorithm> +#include <vector> + +#include "gromacs/commandline/filenm.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/fileio/confio.h" +#include "gromacs/fileio/mtxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/imd/imd.h" +#include "gromacs/linearalgebra/sparsematrix.h" +#include "gromacs/listed-forces/manage-threading.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdebin.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/mdsetup.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/shellfc.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/tgroup.h" +#include "gromacs/mdlib/trajectory_writing.h" +#include "gromacs/mdlib/update.h" +#include "gromacs/mdlib/vsite.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/timing/walltime_accounting.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/topology/topology.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/smalloc.h" + +/* PLUMED */ +#include "../../../Plumed.h" +extern int plumedswitch; +extern plumed plumedmain; +extern void(*plumedcmd)(plumed,const char*,const void*); +/* END PLUMED */ + +//! Utility structure for manipulating states during EM +typedef struct { + //! Copy of the global state + t_state s; + //! Force array + PaddedRVecVector f; + //! Potential energy + real epot; + //! Norm of the force + real fnorm; + //! Maximum force + real fmax; + //! Direction + int a_fmax; +} em_state_t; + +//! Print the EM starting conditions +static void print_em_start(FILE *fplog, + t_commrec *cr, + gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle, + const char *name) +{ + walltime_accounting_start(walltime_accounting); + wallcycle_start(wcycle, ewcRUN); + print_start(fplog, cr, walltime_accounting, name); +} + +//! Stop counting time for EM +static void em_time_end(gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle) +{ + wallcycle_stop(wcycle, ewcRUN); + + walltime_accounting_end(walltime_accounting); +} + +//! Printing a log file and console header +static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps) +{ + fprintf(out, "\n"); + fprintf(out, "%s:\n", minimizer); + fprintf(out, " Tolerance (Fmax) = %12.5e\n", ftol); + fprintf(out, " Number of steps = %12d\n", nsteps); +} + +//! 
Print warning message +static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain) +{ + char buffer[2048]; + if (bLastStep) + { + sprintf(buffer, + "\nEnergy minimization reached the maximum number " + "of steps before the forces reached the requested " + "precision Fmax < %g.\n", ftol); + } + else + { + sprintf(buffer, + "\nEnergy minimization has stopped, but the forces have " + "not converged to the requested precision Fmax < %g (which " + "may not be possible for your system). It stopped " + "because the algorithm tried to make a new step whose size " + "was too small, or there was no change in the energy since " + "last step. Either way, we regard the minimization as " + "converged to within the available machine precision, " + "given your starting configuration and EM parameters.\n%s%s", + ftol, + sizeof(real) < sizeof(double) ? + "\nDouble precision normally gives you higher accuracy, but " + "this is often not needed for preparing to run molecular " + "dynamics.\n" : + "", + bConstrain ? + "You might need to increase your constraint accuracy, or turn\n" + "off constraints altogether (set constraints = none in mdp file)\n" : + ""); + } + fputs(wrap_lines(buffer, 78, 0, FALSE), fp); +} + +//! Print message about convergence of the EM +static void print_converged(FILE *fp, const char *alg, real ftol, + gmx_int64_t count, gmx_bool bDone, gmx_int64_t nsteps, + const em_state_t *ems, double sqrtNumAtoms) +{ + char buf[STEPSTRSIZE]; + + if (bDone) + { + fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", + alg, ftol, gmx_step_str(count, buf)); + } + else if (count < nsteps) + { + fprintf(fp, "\n%s converged to machine precision in %s steps,\n" + "but did not reach the requested Fmax < %g.\n", + alg, gmx_step_str(count, buf), ftol); + } + else + { + fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", + alg, ftol, gmx_step_str(count, buf)); + } + +#if GMX_DOUBLE + fprintf(fp, "Potential Energy = %21.14e\n", ems->epot); + fprintf(fp, "Maximum force = %21.14e on atom %d\n", ems->fmax, ems->a_fmax + 1); + fprintf(fp, "Norm of force = %21.14e\n", ems->fnorm/sqrtNumAtoms); +#else + fprintf(fp, "Potential Energy = %14.7e\n", ems->epot); + fprintf(fp, "Maximum force = %14.7e on atom %d\n", ems->fmax, ems->a_fmax + 1); + fprintf(fp, "Norm of force = %14.7e\n", ems->fnorm/sqrtNumAtoms); +#endif +} + +//! Compute the norm and max of the force array in parallel +static void get_f_norm_max(t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, const rvec *f, + real *fnorm, real *fmax, int *a_fmax) +{ + double fnorm2, *sum; + real fmax2, fam; + int la_max, a_max, start, end, i, m, gf; + + /* This routine finds the largest force and returns it. + * On parallel machines the global max is taken. 
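+ * The squared norms are accumulated locally and then reduced across ranks with gmx_sumd().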
+ */ + fnorm2 = 0; + fmax2 = 0; + la_max = -1; + start = 0; + end = mdatoms->homenr; + if (mdatoms->cFREEZE) + { + for (i = start; i < end; i++) + { + gf = mdatoms->cFREEZE[i]; + fam = 0; + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + fam += gmx::square(f[i][m]); + } + } + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + else + { + for (i = start; i < end; i++) + { + fam = norm2(f[i]); + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + + if (la_max >= 0 && DOMAINDECOMP(cr)) + { + a_max = cr->dd->gatindex[la_max]; + } + else + { + a_max = la_max; + } + if (PAR(cr)) + { + snew(sum, 2*cr->nnodes+1); + sum[2*cr->nodeid] = fmax2; + sum[2*cr->nodeid+1] = a_max; + sum[2*cr->nnodes] = fnorm2; + gmx_sumd(2*cr->nnodes+1, sum, cr); + fnorm2 = sum[2*cr->nnodes]; + /* Determine the global maximum */ + for (i = 0; i < cr->nnodes; i++) + { + if (sum[2*i] > fmax2) + { + fmax2 = sum[2*i]; + a_max = (int)(sum[2*i+1] + 0.5); + } + } + sfree(sum); + } + + if (fnorm) + { + *fnorm = sqrt(fnorm2); + } + if (fmax) + { + *fmax = sqrt(fmax2); + } + if (a_fmax) + { + *a_fmax = a_max; + } +} + +//! Compute the norm of the force +static void get_state_f_norm_max(t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, + em_state_t *ems) +{ + get_f_norm_max(cr, opts, mdatoms, as_rvec_array(ems->f.data()), + &ems->fnorm, &ems->fmax, &ems->a_fmax); +} + +//! Initialize the energy minimization +static void init_em(FILE *fplog, const char *title, + t_commrec *cr, gmx::IMDOutputProvider *outputProvider, + t_inputrec *ir, + const MdrunOptions &mdrunOptions, + t_state *state_global, gmx_mtop_t *top_global, + em_state_t *ems, gmx_localtop_t **top, + t_nrnb *nrnb, rvec mu_tot, + t_forcerec *fr, gmx_enerdata_t **enerd, + t_graph **graph, gmx::MDAtoms *mdAtoms, gmx_global_stat_t *gstat, + gmx_vsite_t *vsite, gmx_constr_t constr, gmx_shellfc_t **shellfc, + int nfile, const t_filenm fnm[], + gmx_mdoutf_t *outf, t_mdebin **mdebin, + gmx_wallcycle_t wcycle) +{ + real dvdl_constr; + + if (fplog) + { + fprintf(fplog, "Initiating %s\n", title); + } + + if (MASTER(cr)) + { + state_global->ngtc = 0; + + /* Initialize lambda variables */ + initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, nullptr); + } + + init_nrnb(nrnb); + + /* Interactive molecular dynamics */ + init_IMD(ir, cr, top_global, fplog, 1, + MASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr, + nfile, fnm, nullptr, mdrunOptions); + + if (ir->eI == eiNM) + { + GMX_ASSERT(shellfc != nullptr, "With NM we always support shells"); + + *shellfc = init_shell_flexcon(stdout, + top_global, + n_flexible_constraints(constr), + ir->nstcalcenergy, + DOMAINDECOMP(cr)); + } + else + { + GMX_ASSERT(EI_ENERGY_MINIMIZATION(ir->eI), "This else currently only handles energy minimizers, consider if your algorithm needs shell/flexible-constraint support"); + + /* With energy minimization, shells and flexible constraints are + * automatically minimized when treated like normal DOFS. 
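+ * so *shellfc is simply set to nullptr below.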
+ */ + if (shellfc != nullptr) + { + *shellfc = nullptr; + } + } + + auto mdatoms = mdAtoms->mdatoms(); + if (DOMAINDECOMP(cr)) + { + *top = dd_init_local_top(top_global); + + dd_init_local_state(cr->dd, state_global, &ems->s); + + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + &ems->s, &ems->f, mdAtoms, *top, + fr, vsite, constr, + nrnb, nullptr, FALSE); + dd_store_state(cr->dd, &ems->s); + + *graph = nullptr; + } + else + { + state_change_natoms(state_global, state_global->natoms); + /* Just copy the state */ + ems->s = *state_global; + state_change_natoms(&ems->s, ems->s.natoms); + /* We need to allocate one element extra, since we might use + * (unaligned) 4-wide SIMD loads to access rvec entries. + */ + ems->f.resize(gmx::paddedRVecVectorSize(ems->s.natoms)); + + snew(*top, 1); + mdAlgorithmsSetupAtomData(cr, ir, top_global, *top, fr, + graph, mdAtoms, + vsite, shellfc ? *shellfc : nullptr); + + if (vsite) + { + set_vsite_top(vsite, *top, mdatoms); + } + } + + update_mdatoms(mdAtoms->mdatoms(), ems->s.lambda[efptMASS]); + + if (constr) + { + if (ir->eConstrAlg == econtSHAKE && + gmx_mtop_ftype_count(top_global, F_CONSTR) > 0) + { + gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n", + econstr_names[econtSHAKE], econstr_names[econtLINCS]); + } + + if (!DOMAINDECOMP(cr)) + { + set_constraints(constr, *top, ir, mdatoms, cr); + } + + if (!ir->bContinuation) + { + /* Constrain the starting coordinates */ + dvdl_constr = 0; + constrain(PAR(cr) ? nullptr : fplog, TRUE, TRUE, constr, &(*top)->idef, + ir, cr, -1, 0, 1.0, mdatoms, + as_rvec_array(ems->s.x.data()), + as_rvec_array(ems->s.x.data()), + nullptr, + fr->bMolPBC, ems->s.box, + ems->s.lambda[efptFEP], &dvdl_constr, + nullptr, nullptr, nrnb, econqCoord); + } + } + + if (PAR(cr)) + { + *gstat = global_stat_init(ir); + } + else + { + *gstat = nullptr; + } + + *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, top_global, nullptr, wcycle); + + snew(*enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + *enerd); + + if (mdebin != nullptr) + { + /* Init bin for energy stuff */ + *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, nullptr); + } + + clear_rvec(mu_tot); + calc_shifts(ems->s.box, fr->shift_vec); + + /* PLUMED */ + if(plumedswitch){ + if(cr->ms && cr->ms->nsim>1) { + if(MASTER(cr)) (*plumedcmd) (plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters); + if(PAR(cr)){ + if(DOMAINDECOMP(cr)) { + (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all); + }else{ + (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); + } + } + (*plumedcmd) (plumedmain,"GREX init",NULL); + } + if(PAR(cr)){ + if(DOMAINDECOMP(cr)) { + (*plumedcmd) (plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); + }else{ + (*plumedcmd) (plumedmain,"setMPIComm",&cr->mpi_comm_mysim); + } + } + (*plumedcmd) (plumedmain,"setNatoms",&top_global->natoms); + (*plumedcmd) (plumedmain,"setMDEngine","gromacs"); + (*plumedcmd) (plumedmain,"setLog",fplog); + real real_delta_t; + real_delta_t=ir->delta_t; + (*plumedcmd) (plumedmain,"setTimestep",&real_delta_t); + (*plumedcmd) (plumedmain,"init",NULL); + + if(PAR(cr)){ + if(DOMAINDECOMP(cr)) { + (*plumedcmd) (plumedmain,"setAtomsNlocal",&cr->dd->nat_home); + (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->gatindex); + } + } + } + /* END PLUMED */ +} + +//! 
Finalize the minimization +static void finish_em(t_commrec *cr, gmx_mdoutf_t outf, + gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle) +{ + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + done_mdoutf(outf); + + em_time_end(walltime_accounting, wcycle); +} + +//! Swap two different EM states during minimization +static void swap_em_state(em_state_t **ems1, em_state_t **ems2) +{ + em_state_t *tmp; + + tmp = *ems1; + *ems1 = *ems2; + *ems2 = tmp; +} + +//! Save the EM trajectory +static void write_em_traj(FILE *fplog, t_commrec *cr, + gmx_mdoutf_t outf, + gmx_bool bX, gmx_bool bF, const char *confout, + gmx_mtop_t *top_global, + t_inputrec *ir, gmx_int64_t step, + em_state_t *state, + t_state *state_global, + ObservablesHistory *observablesHistory) +{ + int mdof_flags = 0; + + if (bX) + { + mdof_flags |= MDOF_X; + } + if (bF) + { + mdof_flags |= MDOF_F; + } + + /* If we want IMD output, set appropriate MDOF flag */ + if (ir->bIMD) + { + mdof_flags |= MDOF_IMD; + } + + mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, + top_global, step, (double)step, + &state->s, state_global, observablesHistory, + state->f); + + if (confout != nullptr && MASTER(cr)) + { + GMX_RELEASE_ASSERT(bX, "The code below assumes that (with domain decomposition), x is collected to state_global in the call above."); + /* With domain decomposition the call above collected the state->s.x + * into state_global->x. Without DD we copy the local state pointer. + */ + if (!DOMAINDECOMP(cr)) + { + state_global = &state->s; + } + + if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr)) + { + /* Make molecules whole only for confout writing */ + do_pbc_mtop(fplog, ir->ePBC, state->s.box, top_global, + as_rvec_array(state_global->x.data())); + } + + write_sto_conf_mtop(confout, + *top_global->name, top_global, + as_rvec_array(state_global->x.data()), nullptr, ir->ePBC, state->s.box); + } +} + +//! \brief Do one minimization step +// +// \returns true when the step succeeded, false when a constraint error occurred +static bool do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, + gmx_bool bMolPBC, + em_state_t *ems1, real a, const PaddedRVecVector *force, + em_state_t *ems2, + gmx_constr_t constr, gmx_localtop_t *top, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_int64_t count) + +{ + t_state *s1, *s2; + int start, end; + real dvdl_constr; + int nthreads gmx_unused; + + bool validStep = true; + + s1 = &ems1->s; + s2 = &ems2->s; + + if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) + { + gmx_incons("state mismatch in do_em_step"); + } + + s2->flags = s1->flags; + + if (s2->natoms != s1->natoms) + { + state_change_natoms(s2, s1->natoms); + /* We need to allocate one element extra, since we might use + * (unaligned) 4-wide SIMD loads to access rvec entries. 
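+ * gmx::paddedRVecVectorSize() returns the padded size used in the resize below.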
+ */ + ems2->f.resize(gmx::paddedRVecVectorSize(s2->natoms)); + } + if (DOMAINDECOMP(cr) && s2->cg_gl.size() != s1->cg_gl.size()) + { + s2->cg_gl.resize(s1->cg_gl.size()); + } + + copy_mat(s1->box, s2->box); + /* Copy free energy state */ + s2->lambda = s1->lambda; + + start = 0; + end = md->homenr; + + // cppcheck-suppress unreadVariable + nthreads = gmx_omp_nthreads_get(emntUpdate); +#pragma omp parallel num_threads(nthreads) + { + const rvec *x1 = as_rvec_array(s1->x.data()); + rvec *x2 = as_rvec_array(s2->x.data()); + const rvec *f = as_rvec_array(force->data()); + + int gf = 0; +#pragma omp for schedule(static) nowait + for (int i = start; i < end; i++) + { + try + { + if (md->cFREEZE) + { + gf = md->cFREEZE[i]; + } + for (int m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[gf][m]) + { + x2[i][m] = x1[i][m]; + } + else + { + x2[i][m] = x1[i][m] + a*f[i][m]; + } + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + + if (s2->flags & (1<<estCGP)) + { + /* Copy the CG p vector */ + const rvec *p1 = as_rvec_array(s1->cg_p.data()); + rvec *p2 = as_rvec_array(s2->cg_p.data()); +#pragma omp for schedule(static) nowait + for (int i = start; i < end; i++) + { + // Trivial OpenMP block that does not throw + copy_rvec(p1[i], p2[i]); + } + } + + if (DOMAINDECOMP(cr)) + { + s2->ddp_count = s1->ddp_count; + + /* OpenMP does not support unsigned loop variables */ +#pragma omp for schedule(static) nowait + for (int i = 0; i < static_cast<int>(s2->cg_gl.size()); i++) + { + s2->cg_gl[i] = s1->cg_gl[i]; + } + s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; + } + } + + if (constr) + { + wallcycle_start(wcycle, ewcCONSTR); + dvdl_constr = 0; + validStep = + constrain(nullptr, TRUE, TRUE, constr, &top->idef, + ir, cr, count, 0, 1.0, md, + as_rvec_array(s1->x.data()), as_rvec_array(s2->x.data()), + nullptr, bMolPBC, s2->box, + s2->lambda[efptBONDED], &dvdl_constr, + nullptr, nullptr, nrnb, econqCoord); + wallcycle_stop(wcycle, ewcCONSTR); + + // We should move this check to the different minimizers + if (!validStep && ir->eI != eiSteep) + { + gmx_fatal(FARGS, "The coordinates could not be constrained. Minimizer '%s' can not handle constraint failures, use minimizer '%s' before using '%s'.", + EI(ir->eI), EI(eiSteep), EI(ir->eI)); + } + } + + return validStep; +} + +//! Prepare EM for using domain decomposition parallelization +static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr, + gmx_mtop_t *top_global, t_inputrec *ir, + em_state_t *ems, gmx_localtop_t *top, + gmx::MDAtoms *mdAtoms, t_forcerec *fr, + gmx_vsite_t *vsite, gmx_constr_t constr, + t_nrnb *nrnb, gmx_wallcycle_t wcycle) +{ + /* Repartition the domain decomposition */ + dd_partition_system(fplog, step, cr, FALSE, 1, + nullptr, top_global, ir, + &ems->s, &ems->f, + mdAtoms, top, fr, vsite, constr, + nrnb, wcycle, FALSE); + dd_store_state(cr->dd, &ems->s); +} + +//!
Do one energy evaluation +static void evaluate_energy(FILE *fplog, t_commrec *cr, + gmx_mtop_t *top_global, + em_state_t *ems, gmx_localtop_t *top, + t_inputrec *inputrec, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_global_stat_t gstat, + gmx_vsite_t *vsite, gmx_constr_t constr, + t_fcdata *fcd, + t_graph *graph, gmx::MDAtoms *mdAtoms, + t_forcerec *fr, rvec mu_tot, + gmx_enerdata_t *enerd, tensor vir, tensor pres, + gmx_int64_t count, gmx_bool bFirst) +{ + real t; + gmx_bool bNS; + tensor force_vir, shake_vir, ekin; + real dvdl_constr, prescorr, enercorr, dvdlcorr; + real terminate = 0; + + /* Set the time to the initial time, the time does not change during EM */ + t = inputrec->init_t; + + if (bFirst || + (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) + { + /* This is the first state or an old state used before the last ns */ + bNS = TRUE; + } + else + { + bNS = FALSE; + if (inputrec->nstlist > 0) + { + bNS = TRUE; + } + } + + if (vsite) + { + construct_vsites(vsite, as_rvec_array(ems->s.x.data()), 1, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, ems->s.box); + } + + if (DOMAINDECOMP(cr) && bNS) + { + /* Repartition the domain decomposition */ + em_dd_partition_system(fplog, count, cr, top_global, inputrec, + ems, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Calc force & energy on new trial position */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + /* PLUMED */ + int plumedNeedsEnergy=0; + matrix plumed_vir; + if(plumedswitch){ + long int lstep=count; (*plumedcmd)(plumedmain,"setStepLong",&lstep); + (*plumedcmd) (plumedmain,"setPositions",&ems->s.x[0][0]); + (*plumedcmd) (plumedmain,"setMasses",&mdAtoms->mdatoms()->massT[0]); + (*plumedcmd) (plumedmain,"setCharges",&mdAtoms->mdatoms()->chargeA[0]); + (*plumedcmd) (plumedmain,"setBox",&ems->s.box[0][0]); + (*plumedcmd) (plumedmain,"prepareCalc",NULL); + (*plumedcmd) (plumedmain,"setForces",&ems->f[0][0]); + (*plumedcmd) (plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); + clear_mat(plumed_vir); + (*plumedcmd) (plumedmain,"setVirial",&plumed_vir[0][0]); + } + /* END PLUMED */ + + do_force(fplog, cr, inputrec, + count, nrnb, wcycle, top, &top_global->groups, + ems->s.box, ems->s.x, &ems->s.hist, + ems->f, force_vir, mdAtoms->mdatoms(), enerd, fcd, + ems->s.lambda, graph, fr, vsite, mu_tot, t, nullptr, TRUE, + GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | + GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | + (bNS ? GMX_FORCE_NS : 0), + DOMAINDECOMP(cr) ? + DdOpenBalanceRegionBeforeForceComputation::yes : + DdOpenBalanceRegionBeforeForceComputation::no, + DOMAINDECOMP(cr) ?
+ DdCloseBalanceRegionAfterForceComputation::yes : + DdCloseBalanceRegionAfterForceComputation::no); + /* PLUMED */ + if(plumedswitch){ + if(plumedNeedsEnergy) { + msmul(force_vir,2.0,plumed_vir); + (*plumedcmd) (plumedmain,"setEnergy",&enerd->term[F_EPOT]); + (*plumedcmd) (plumedmain,"performCalc",NULL); + msmul(plumed_vir,0.5,force_vir); + } else { + msmul(plumed_vir,0.5,plumed_vir); + m_add(force_vir,plumed_vir,force_vir); + } + } + /* END PLUMED */ + + /* Clear the unused shake virial and pressure */ + clear_mat(shake_vir); + clear_mat(pres); + + /* Communicate stuff when parallel */ + if (PAR(cr) && inputrec->eI != eiNM) + { + wallcycle_start(wcycle, ewcMoveE); + + global_stat(gstat, cr, enerd, force_vir, shake_vir, mu_tot, + inputrec, nullptr, nullptr, nullptr, 1, &terminate, + nullptr, FALSE, + CGLO_ENERGY | + CGLO_PRESSURE | + CGLO_CONSTRAINT); + + wallcycle_stop(wcycle, ewcMoveE); + } + + /* Calculate long range corrections to pressure and energy */ + calc_dispcorr(inputrec, fr, ems->s.box, ems->s.lambda[efptVDW], + pres, force_vir, &prescorr, &enercorr, &dvdlcorr); + enerd->term[F_DISPCORR] = enercorr; + enerd->term[F_EPOT] += enercorr; + enerd->term[F_PRES] += prescorr; + enerd->term[F_DVDL] += dvdlcorr; + + ems->epot = enerd->term[F_EPOT]; + + if (constr) + { + /* Project out the constraint components of the force */ + wallcycle_start(wcycle, ewcCONSTR); + dvdl_constr = 0; + rvec *f_rvec = as_rvec_array(ems->f.data()); + constrain(nullptr, FALSE, FALSE, constr, &top->idef, + inputrec, cr, count, 0, 1.0, mdAtoms->mdatoms(), + as_rvec_array(ems->s.x.data()), f_rvec, f_rvec, + fr->bMolPBC, ems->s.box, + ems->s.lambda[efptBONDED], &dvdl_constr, + nullptr, &shake_vir, nrnb, econqForceDispl); + enerd->term[F_DVDL_CONSTR] += dvdl_constr; + m_add(force_vir, shake_vir, vir); + wallcycle_stop(wcycle, ewcCONSTR); + } + else + { + copy_mat(force_vir, vir); + } + + clear_mat(ekin); + enerd->term[F_PRES] = + calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres); + + sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals); + + if (EI_ENERGY_MINIMIZATION(inputrec->eI)) + { + get_state_f_norm_max(cr, &(inputrec->opts), mdAtoms->mdatoms(), ems); + } +} + +//! Parallel utility summing energies and forces +static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *top_global, + em_state_t *s_min, em_state_t *s_b) +{ + t_block *cgs_gl; + int ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m; + double partsum; + unsigned char *grpnrFREEZE; + + if (debug) + { + fprintf(debug, "Doing reorder_partsum\n"); + } + + const rvec *fm = as_rvec_array(s_min->f.data()); + const rvec *fb = as_rvec_array(s_b->f.data()); + + cgs_gl = dd_charge_groups_global(cr->dd); + index = cgs_gl->index; + + /* Collect fm in a global vector fmg. + * This conflicts with the spirit of domain decomposition, + * but to fully optimize this a much more complicated algorithm is required. 
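+ * fmg is indexed by global atom number, so the forces of the two states can be matched even though their local atom orderings differ.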
+ */ + rvec *fmg; + snew(fmg, top_global->natoms); + + ncg = s_min->s.cg_gl.size(); + cg_gl = s_min->s.cg_gl.data(); + i = 0; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + copy_rvec(fm[i], fmg[a]); + i++; + } + } + gmx_sum(top_global->natoms*3, fmg[0], cr); + + /* Now we will determine the part of the sum for the cgs in state s_b */ + ncg = s_b->s.cg_gl.size(); + cg_gl = s_b->s.cg_gl.data(); + partsum = 0; + i = 0; + gf = 0; + grpnrFREEZE = top_global->groups.grpnr[egcFREEZE]; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + if (mdatoms->cFREEZE && grpnrFREEZE) + { + gf = grpnrFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + partsum += (fb[i][m] - fmg[a][m])*fb[i][m]; + } + } + i++; + } + } + + sfree(fmg); + + return partsum; +} + +//! Print some stuff, like beta, whatever that means. +static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *top_global, + em_state_t *s_min, em_state_t *s_b) +{ + double sum; + + /* This is just the classical Polak-Ribiere calculation of beta; + * it looks a bit complicated since we take freeze groups into account, + * and might have to sum it in parallel runs. + */ + + if (!DOMAINDECOMP(cr) || + (s_min->s.ddp_count == cr->dd->ddp_count && + s_b->s.ddp_count == cr->dd->ddp_count)) + { + const rvec *fm = as_rvec_array(s_min->f.data()); + const rvec *fb = as_rvec_array(s_b->f.data()); + sum = 0; + int gf = 0; + /* This part of code can be incorrect with DD, + * since the atom ordering in s_b and s_min might differ. + */ + for (int i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (int m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + sum += (fb[i][m] - fm[i][m])*fb[i][m]; + } + } + } + } + else + { + /* We need to reorder cgs while summing */ + sum = reorder_partsum(cr, opts, mdatoms, top_global, s_min, s_b); + } + if (PAR(cr)) + { + gmx_sumd(1, &sum, cr); + } + + return sum/gmx::square(s_min->fnorm); +} + +namespace gmx +{ + +/*! 
\brief Do conjugate gradients minimization + \copydoc integrator_t(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_edsam_t ed, + t_forcerec *fr, + const ReplicaExchangeParameters &replExParams, + gmx_membed_t gmx_unused *membed, + gmx_walltime_accounting_t walltime_accounting) + */ +double do_cg(FILE *fplog, t_commrec *cr, const gmx::MDLogger gmx_unused &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t gmx_unused *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + ObservablesHistory *observablesHistory, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_forcerec *fr, + const ReplicaExchangeParameters gmx_unused &replExParams, + gmx_membed_t gmx_unused *membed, + gmx_walltime_accounting_t walltime_accounting) +{ + const char *CG = "Polak-Ribiere Conjugate Gradients"; + + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + double tmp, minstep; + real stepsize; + real a, b, c, beta = 0.0; + real epot_repl = 0; + real pnorm; + t_mdebin *mdebin; + gmx_bool converged, foundlower; + rvec mu_tot; + gmx_bool do_log = FALSE, do_ene = FALSE, do_x, do_f; + tensor vir, pres; + int number_steps, neval = 0, nstcg = inputrec->nstcgsteep; + gmx_mdoutf_t outf; + int m, step, nminstep; + auto mdatoms = mdAtoms->mdatoms(); + + step = 0; + + // Ensure the extra per-atom state array gets allocated + state_global->flags |= (1<<estCGP); + + /* Create 4 states on the stack and extract pointers that we will swap */ + em_state_t s0 {}, s1 {}, s2 {}, s3 {}; + em_state_t *s_min = &s0; + em_state_t *s_a = &s1; + em_state_t *s_b = &s2; + em_state_t *s_c = &s3; + + /* Init em and store the local state in s_min */ + init_em(fplog, CG, cr, outputProvider, inputrec, mdrunOptions, + state_global, top_global, s_min, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, CG); + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + if (MASTER(cr)) + { + sp_header(stderr, CG, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, CG, inputrec->em_tol, number_steps); + } + + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + evaluate_energy(fplog, cr, + top_global, s_min, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, -1, TRUE); + where(); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. 
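+ * (upd_mdebin stores the values that print_ebin writes out below).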
*/ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + print_ebin_header(fplog, step, step); + print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + where(); + + /* Estimate/guess the initial stepsize */ + stepsize = inputrec->em_stepsize/s_min->fnorm; + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(stderr, " F-Norm = %12.5e\n", + s_min->fnorm/sqrtNumAtoms); + fprintf(stderr, "\n"); + /* and copy to the log file too... */ + fprintf(fplog, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(fplog, " F-Norm = %12.5e\n", + s_min->fnorm/sqrtNumAtoms); + fprintf(fplog, "\n"); + } + /* Start the loop over CG steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + converged = FALSE; + for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++) + { + + /* start taking steps in a new direction + * First time we enter the routine, beta=0, and the direction is + * simply the negative gradient. + */ + + /* Calculate the new direction in p, and the gradient in this direction, gpa */ + rvec *pm = as_rvec_array(s_min->s.cg_p.data()); + const rvec *sfm = as_rvec_array(s_min->f.data()); + double gpa = 0; + int gf = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!inputrec->opts.nFreeze[gf][m]) + { + pm[i][m] = sfm[i][m] + beta*pm[i][m]; + gpa -= pm[i][m]*sfm[i][m]; + /* f is negative gradient, thus the sign */ + } + else + { + pm[i][m] = 0; + } + } + } + + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpa, cr); + } + + /* Calculate the norm of the search vector */ + get_f_norm_max(cr, &(inputrec->opts), mdatoms, pm, &pnorm, nullptr, nullptr); + + /* Just in case stepsize reaches zero due to numerical precision... */ + if (stepsize <= 0) + { + stepsize = inputrec->em_stepsize/pnorm; + } + + /* + * Double check the value of the derivative in the search direction. + * If it is positive it must be due to the old information in the + * CG formula, so just remove that and start over with beta=0. + * This corresponds to a steepest descent step. 
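+ * (beta is reset and the step counter decremented below, so the step is simply retried).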
+ */ + if (gpa > 0) + { + beta = 0; + step--; /* Don't count this step since we are restarting */ + continue; /* Go back to the beginning of the big for-loop */ + } + + /* Calculate minimum allowed stepsize, before the average (norm) + * relative change in coordinate is smaller than precision + */ + minstep = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + tmp = fabs(s_min->s.x[i][m]); + if (tmp < 1.0) + { + tmp = 1.0; + } + tmp = pm[i][m]/tmp; + minstep += tmp*tmp; + } + } + /* Add up from all CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &minstep, cr); + } + + minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms)); + + if (stepsize < minstep) + { + converged = TRUE; + break; + } + + /* Write coordinates if necessary */ + do_x = do_per_step(step, inputrec->nstxout); + do_f = do_per_step(step, inputrec->nstfout); + + write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, + top_global, inputrec, step, + s_min, state_global, observablesHistory); + + /* Take a step downhill. + * In theory, we should minimize the function along this direction. + * That is quite possible, but it turns out to take 5-10 function evaluations + * for each line. However, we don't really need to find the exact minimum - + * it is much better to start a new CG step in a modified direction as soon + * as we are close to it. This will save a lot of energy evaluations. + * + * In practice, we just try to take a single step. + * If it worked (i.e. lowered the energy), we increase the stepsize but + * then continue straight to the next CG step without trying to find any minimum. + * If it didn't work (higher energy), there must be a minimum somewhere between + * the old position and the new one. + * + * Due to the finite numerical accuracy, it turns out that it is a good idea + * to even accept a SMALL increase in energy, if the derivative is still downhill. + * This leads to lower final energies in the tests I've done. / Erik + */ + s_a->epot = s_min->epot; + a = 0.0; + c = a + stepsize; /* reference position along line is zero */ + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) + { + em_dd_partition_system(fplog, step, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step (new coords in s_c) */ + do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, &s_min->s.cg_p, s_c, + constr, top, nrnb, wcycle, -1); + + neval++; + /* Calculate energy for the trial step */ + evaluate_energy(fplog, cr, + top_global, s_c, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, -1, FALSE); + + /* Calc derivative along line */ + const rvec *pc = as_rvec_array(s_c->s.cg_p.data()); + const rvec *sfc = as_rvec_array(s_c->f.data()); + double gpc = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpc -= pc[i][m]*sfc[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpc, cr); + } + + /* This is the max amount of increase in energy we tolerate */ + tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot); + + /* Accept the step if the energy is lower, or if it is not significantly higher + * and the line derivative is still negative. + */ + if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) + { + foundlower = TRUE; + /* Great, we found a better energy.
Increase step for next iteration + * if we are still going down, decrease it otherwise + */ + if (gpc < 0) + { + stepsize *= 1.618034; /* The golden section */ + } + else + { + stepsize *= 0.618034; /* 1/golden section */ + } + } + else + { + /* New energy is the same or higher. We will have to do some work + * to find a smaller value in the interval. Take smaller step next time! + */ + foundlower = FALSE; + stepsize *= 0.618034; + } + + + + + /* OK, if we didn't find a lower value we will have to locate one now - there must + * be one in the interval [a=0,c]. + * The same thing is valid here, though: Don't spend dozens of iterations to find + * the line minimum. We try to interpolate based on the derivative at the endpoints, + * and only continue until we find a lower value. In most cases this means 1-2 iterations. + * + * I also have a safeguard for potentially really pathological functions so we never + * take more than 20 steps before we give up ... + * + * If we already found a lower value we just skip this step and continue to the update. + */ + double gpb; + if (!foundlower) + { + nminstep = 0; + + do + { + /* Select a new trial point. + * If the derivatives at points a & c have different sign we interpolate to zero, + * otherwise just do a bisection. + */ + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, -1, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step to this new point - new coords in s_b */ + do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, &s_min->s.cg_p, s_b, + constr, top, nrnb, wcycle, -1); + + neval++; + /* Calculate energy for the trial step */ + evaluate_energy(fplog, cr, + top_global, s_b, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, -1, FALSE); + + /* p does not change within a step, but since the domain decomposition + * might change, we have to use cg_p of s_b here. + */ + const rvec *pb = as_rvec_array(s_b->s.cg_p.data()); + const rvec *sfb = as_rvec_array(s_b->f.data()); + gpb = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpb -= pb[i][m]*sfb[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + if (debug) + { + fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", + s_a->epot, s_b->epot, s_c->epot, gpb); + } + + epot_repl = s_b->epot; + + /* Keep one of the intervals based on the value of the derivative at the new point */ + if (gpb > 0) + { + /* Replace c endpoint with b */ + swap_em_state(&s_b, &s_c); + c = b; + gpc = gpb; + } + else + { + /* Replace a endpoint with b */ + swap_em_state(&s_b, &s_a); + a = b; + gpa = gpb; + } + + /* + * Stop search as soon as we find a value smaller than the endpoints. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && + (nminstep < 20)); + + if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS || + nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. 
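+ * (either the relative energy change is below machine precision or 20 bisection steps were taken).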
+ * If beta==0 this was steepest descent, and then we give up. + * If not, set beta=0 and restart with steepest descent before quitting. + */ + if (beta == 0.0) + { + /* Converged */ + converged = TRUE; + break; + } + else + { + /* Reset memory before giving up */ + beta = 0.0; + continue; + } + } + + /* Select min energy state of A & C, put the best in B. + */ + if (s_c->epot < s_a->epot) + { + if (debug) + { + fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", + s_c->epot, s_a->epot); + } + swap_em_state(&s_b, &s_c); + gpb = gpc; + } + else + { + if (debug) + { + fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", + s_a->epot, s_c->epot); + } + swap_em_state(&s_b, &s_a); + gpb = gpa; + } + + } + else + { + if (debug) + { + fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", + s_c->epot); + } + swap_em_state(&s_b, &s_c); + gpb = gpc; + } + + /* new search direction */ + /* beta = 0 means forget all memory and restart with steepest descents. */ + if (nstcg && ((step % nstcg) == 0)) + { + beta = 0.0; + } + else + { + /* s_min->fnorm cannot be zero, because then we would have converged + * and broken out. + */ + + /* Polak-Ribiere update. + * Change to fnorm2/fnorm2_old for Fletcher-Reeves + */ + beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b); + } + /* Limit beta to prevent oscillations */ + if (fabs(beta) > 5.0) + { + beta = 0.0; + } + + + /* update positions */ + swap_em_state(&s_min, &s_b); + gpa = gpb; + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", + step, s_min->epot, s_min->fnorm/sqrtNumAtoms, + s_min->fmax, s_min->a_fmax+1); + fflush(stderr); + } + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + do_log = do_per_step(step, inputrec->nstlog); + do_ene = do_per_step(step, inputrec->nstenergy); + + /* Prepare IMD energy record, if bIMD is TRUE. */ + IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE); + + if (do_log) + { + print_ebin_header(fplog, step, step); + } + print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, + do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Send energies and positions to the IMD client if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, as_rvec_array(state_global->x.data()), inputrec, 0, wcycle) && MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + /* Stop when the maximum force lies below tolerance. + * If we have reached machine precision, converged is already set to true. + */ + converged = converged || (s_min->fmax < inputrec->em_tol); + + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. 
*/ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + if (converged) + { + step--; /* we never took that last step in this case */ + + } + if (s_min->fmax > inputrec->em_tol) + { + if (MASTER(cr)) + { + warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); + warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); + } + converged = FALSE; + } + + if (MASTER(cr)) + { + /* If we printed energy and/or logfile last step (which was the last step) + * we don't have to do it again, but otherwise print the final values. + */ + if (!do_log) + { + /* Write final value to log since we didn't do anything the last step */ + print_ebin_header(fplog, step, step); + } + if (!do_ene || !do_log) + { + /* Write final energy file entries */ + print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, + !do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + } + + /* Print some stuff... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + + /* IMPORTANT! + * For accurate normal mode calculation it is imperative that we + * store the last conformation into the full precision binary trajectory. + * + * However, we should only do it if we did NOT already write this step + * above (which we did if do_x or do_f was true). + */ + do_x = !do_per_step(step, inputrec->nstxout); + do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout)); + + write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, step, + s_min, state_global, observablesHistory); + + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, + s_min, sqrtNumAtoms); + print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, + s_min, sqrtNumAtoms); + + fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + walltime_accounting_set_nsteps_done(walltime_accounting, step); + + return 0; +} /* That's all folks */ + + +/*! 
\brief Do L-BFGS minimization
+	\copydoc integrator_t(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog,
+                       int nfile, const t_filenm fnm[],
+                       const gmx_output_env_t *oenv,
+                       const MdrunOptions &mdrunOptions,
+                       gmx_vsite_t *vsite, gmx_constr_t constr,
+                       gmx::IMDOutputProvider *outputProvider,
+                       t_inputrec *inputrec,
+                       gmx_mtop_t *top_global, t_fcdata *fcd,
+                       t_state *state_global,
+                       gmx::MDAtoms *mdAtoms,
+                       t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                       gmx_edsam_t ed,
+                       t_forcerec *fr,
+                       const ReplicaExchangeParameters &replExParams,
+                       gmx_membed_t gmx_unused *membed,
+                       gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_lbfgs(FILE *fplog, t_commrec *cr, const gmx::MDLogger gmx_unused &mdlog,
+                int nfile, const t_filenm fnm[],
+                const gmx_output_env_t gmx_unused *oenv,
+                const MdrunOptions &mdrunOptions,
+                gmx_vsite_t *vsite, gmx_constr_t constr,
+                gmx::IMDOutputProvider *outputProvider,
+                t_inputrec *inputrec,
+                gmx_mtop_t *top_global, t_fcdata *fcd,
+                t_state *state_global,
+                ObservablesHistory *observablesHistory,
+                gmx::MDAtoms *mdAtoms,
+                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                t_forcerec *fr,
+                const ReplicaExchangeParameters gmx_unused &replExParams,
+                gmx_membed_t gmx_unused *membed,
+                gmx_walltime_accounting_t walltime_accounting)
+{
+    static const char *LBFGS = "Low-Memory BFGS Minimizer";
+    em_state_t         ems;
+    gmx_localtop_t    *top;
+    gmx_enerdata_t    *enerd;
+    gmx_global_stat_t  gstat;
+    t_graph           *graph;
+    int                ncorr, nmaxcorr, point, cp, neval, nminstep;
+    double             stepsize, step_taken, gpa, gpb, gpc, tmp, minstep;
+    real              *rho, *alpha, *p, *s, **dx, **dg;
+    real               a, b, c, maxdelta, delta;
+    real               diag, Epot0;
+    real               dgdx, dgdg, sq, yr, beta;
+    t_mdebin          *mdebin;
+    gmx_bool           converged;
+    rvec               mu_tot;
+    gmx_bool           do_log, do_ene, do_x, do_f, foundlower, *frozen;
+    tensor             vir, pres;
+    int                start, end, number_steps;
+    gmx_mdoutf_t       outf;
+    int                i, k, m, n, gf, step;
+    int                mdof_flags;
+    auto               mdatoms = mdAtoms->mdatoms();
+
+    if (PAR(cr))
+    {
+        gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n");
+    }
+
+    if (nullptr != constr)
+    {
+        gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g.
steepest descent)."); + } + + n = 3*state_global->natoms; + nmaxcorr = inputrec->nbfgscorr; + + snew(frozen, n); + + snew(p, n); + snew(rho, nmaxcorr); + snew(alpha, nmaxcorr); + + snew(dx, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dx[i], n); + } + + snew(dg, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dg[i], n); + } + + step = 0; + neval = 0; + + /* Init em */ + init_em(fplog, LBFGS, cr, outputProvider, inputrec, mdrunOptions, + state_global, top_global, &ems, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + start = 0; + end = mdatoms->homenr; + + /* We need 4 working states */ + em_state_t s0 {}, s1 {}, s2 {}, s3 {}; + em_state_t *sa = &s0; + em_state_t *sb = &s1; + em_state_t *sc = &s2; + em_state_t *last = &s3; + /* Initialize by copying the state from ems (we could skip x and f here) */ + *sa = ems; + *sb = ems; + *sc = ems; + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS); + + do_log = do_ene = do_x = do_f = TRUE; + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + /* Create a 3*natoms index to tell whether each degree of freedom is frozen */ + gf = 0; + for (i = start; i < end; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + frozen[3*i+m] = inputrec->opts.nFreeze[gf][m]; + } + } + if (MASTER(cr)) + { + sp_header(stderr, LBFGS, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, LBFGS, inputrec->em_tol, number_steps); + } + + if (vsite) + { + construct_vsites(vsite, as_rvec_array(state_global->x.data()), 1, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state_global->box); + } + + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole + */ + neval++; + evaluate_energy(fplog, cr, + top_global, &ems, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, -1, TRUE); + where(); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, state_global, inputrec->fepvals, inputrec->expandedvals, state_global->box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + print_ebin_header(fplog, step, step); + print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + where(); + + /* Set the initial step. + * since it will be multiplied by the non-normalized search direction + * vector (force vector the first time), we scale it by the + * norm of the force. + */ + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(stderr, " F-max = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1); + fprintf(stderr, " F-Norm = %12.5e\n", ems.fnorm/sqrtNumAtoms); + fprintf(stderr, "\n"); + /* and copy to the log file too... 
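The snew() calls above lay out the L-BFGS history: nbfgscorr correction pairs of length n = 3*natoms plus their scalars, reused cyclically through the index 'point'. A minimal sketch of the same ring-buffer layout, in standalone C++ for illustration only (LbfgsHistory and its members are our names, not GROMACS API):

    #include <vector>

    struct LbfgsHistory {
        int m;          // max correction pairs (inputrec->nbfgscorr here)
        int n;          // degrees of freedom, 3*natoms
        int point = 0;  // next slot to overwrite; wraps modulo m
        std::vector<std::vector<double>> dx, dg;  // position and gradient differences
        std::vector<double> rho;                  // 1 / (dg[k] . dx[k])

        LbfgsHistory(int m_, int n_)
            : m(m_), n(n_), dx(m_, std::vector<double>(n_)),
              dg(m_, std::vector<double>(n_)), rho(m_) {}

        // Store a new pair and advance the ring pointer.
        void push(const std::vector<double> &sdx, const std::vector<double> &sdg)
        {
            double dgdx = 0;
            for (int i = 0; i < n; i++) { dgdx += sdg[i]*sdx[i]; }
            dx[point] = sdx;
            dg[point] = sdg;
            rho[point] = 1.0/dgdx;
            point = (point + 1) % m;
        }
    };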
*/ + fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(fplog, " F-max = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1); + fprintf(fplog, " F-Norm = %12.5e\n", ems.fnorm/sqrtNumAtoms); + fprintf(fplog, "\n"); + } + + // Point is an index to the memory of search directions, where 0 is the first one. + point = 0; + + // Set initial search direction to the force (-gradient), or 0 for frozen particles. + real *fInit = static_cast<real *>(as_rvec_array(ems.f.data())[0]); + for (i = 0; i < n; i++) + { + if (!frozen[i]) + { + dx[point][i] = fInit[i]; /* Initial search direction */ + } + else + { + dx[point][i] = 0; + } + } + + // Stepsize will be modified during the search, and actually it is not critical + // (the main efficiency in the algorithm comes from changing directions), but + // we still need an initial value, so estimate it as the inverse of the norm + // so we take small steps where the potential fluctuates a lot. + stepsize = 1.0/ems.fnorm; + + /* Start the loop over BFGS steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + + ncorr = 0; + + /* Set the gradient from the force */ + converged = FALSE; + for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++) + { + + /* Write coordinates if necessary */ + do_x = do_per_step(step, inputrec->nstxout); + do_f = do_per_step(step, inputrec->nstfout); + + mdof_flags = 0; + if (do_x) + { + mdof_flags |= MDOF_X; + } + + if (do_f) + { + mdof_flags |= MDOF_F; + } + + if (inputrec->bIMD) + { + mdof_flags |= MDOF_IMD; + } + + mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, + top_global, step, (real)step, &ems.s, state_global, observablesHistory, ems.f); + + /* Do the linesearching in the direction dx[point][0..(n-1)] */ + + /* make s a pointer to current search direction - point=0 first time we get here */ + s = dx[point]; + + real *xx = static_cast<real *>(as_rvec_array(ems.s.x.data())[0]); + real *ff = static_cast<real *>(as_rvec_array(ems.f.data())[0]); + + // calculate line gradient in position A + for (gpa = 0, i = 0; i < n; i++) + { + gpa -= s[i]*ff[i]; + } + + /* Calculate minimum allowed stepsize along the line, before the average (norm) + * relative change in coordinate is smaller than precision + */ + for (minstep = 0, i = 0; i < n; i++) + { + tmp = fabs(xx[i]); + if (tmp < 1.0) + { + tmp = 1.0; + } + tmp = s[i]/tmp; + minstep += tmp*tmp; + } + minstep = GMX_REAL_EPS/sqrt(minstep/n); + + if (stepsize < minstep) + { + converged = TRUE; + break; + } + + // Before taking any steps along the line, store the old position + *last = ems; + real *lastx = static_cast<real *>(as_rvec_array(last->s.x.data())[0]); + real *lastf = static_cast<real *>(as_rvec_array(last->f.data())[0]); + Epot0 = ems.epot; + + *sa = ems; + + /* Take a step downhill. + * In theory, we should find the actual minimum of the function in this + * direction, somewhere along the line. + * That is quite possible, but it turns out to take 5-10 function evaluations + * for each line. However, we dont really need to find the exact minimum - + * it is much better to start a new BFGS step in a modified direction as soon + * as we are close to it. This will save a lot of energy evaluations. + * + * In practice, we just try to take a single step. + * If it worked (i.e. lowered the energy), we increase the stepsize but + * continue straight to the next BFGS step without trying to find any minimum, + * i.e. we change the search direction too. 
If the line was smooth, it is + * likely we are in a smooth region, and then it makes sense to take longer + * steps in the modified search direction too. + * + * If it didn't work (higher energy), there must be a minimum somewhere between + * the old position and the new one. Then we need to start by finding a lower + * value before we change search direction. Since the energy was apparently + * quite rough, we need to decrease the step size. + * + * Due to the finite numerical accuracy, it turns out that it is a good idea + * to accept a SMALL increase in energy, if the derivative is still downhill. + * This leads to lower final energies in the tests I've done. / Erik + */ + + // State "A" is the first position along the line. + // reference position along line is initially zero + a = 0.0; + + // Check stepsize first. We do not allow displacements + // larger than emstep. + // + do + { + // Pick a new position C by adding stepsize to A. + c = a + stepsize; + + // Calculate what the largest change in any individual coordinate + // would be (translation along line * gradient along line) + maxdelta = 0; + for (i = 0; i < n; i++) + { + delta = c*s[i]; + if (delta > maxdelta) + { + maxdelta = delta; + } + } + // If any displacement is larger than the stepsize limit, reduce the step + if (maxdelta > inputrec->em_stepsize) + { + stepsize *= 0.1; + } + } + while (maxdelta > inputrec->em_stepsize); + + // Take a trial step and move the coordinate array xc[] to position C + real *xc = static_cast<real *>(as_rvec_array(sc->s.x.data())[0]); + for (i = 0; i < n; i++) + { + xc[i] = lastx[i] + c*s[i]; + } + + neval++; + // Calculate energy for the trial step in position C + evaluate_energy(fplog, cr, + top_global, sc, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, step, FALSE); + + // Calc line gradient in position C + real *fc = static_cast<real *>(as_rvec_array(sc->f.data())[0]); + for (gpc = 0, i = 0; i < n; i++) + { + gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */ + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpc, cr); + } + + // This is the max amount of increase in energy we tolerate. + // By allowing VERY small changes (close to numerical precision) we + // frequently find even better (lower) final energies. + tmp = sqrt(GMX_REAL_EPS)*fabs(sa->epot); + + // Accept the step if the energy is lower in the new position C (compared to A), + // or if it is not significantly higher and the line derivative is still negative. + if (sc->epot < sa->epot || (gpc < 0 && sc->epot < (sa->epot + tmp))) + { + // Great, we found a better energy. We no longer try to alter the + // stepsize, but simply accept this new better position. The we select a new + // search direction instead, which will be much more efficient than continuing + // to take smaller steps along a line. Set fnorm based on the new C position, + // which will be used to update the stepsize to 1/fnorm further down. + foundlower = TRUE; + } + else + { + // If we got here, the energy is NOT lower in point C, i.e. it will be the same + // or higher than in point A. In this case it is pointless to move to point C, + // so we will have to do more iterations along the same line to find a smaller + // value in the interval [A=0.0,C]. + // Here, A is still 0.0, but that will change when we do a search in the interval + // [0.0,C] below. 
That search we will do by interpolation or bisection rather + // than with the stepsize, so no need to modify it. For the next search direction + // it will be reset to 1/fnorm anyway. + foundlower = FALSE; + } + + if (!foundlower) + { + // OK, if we didn't find a lower value we will have to locate one now - there must + // be one in the interval [a,c]. + // The same thing is valid here, though: Don't spend dozens of iterations to find + // the line minimum. We try to interpolate based on the derivative at the endpoints, + // and only continue until we find a lower value. In most cases this means 1-2 iterations. + // I also have a safeguard for potentially really pathological functions so we never + // take more than 20 steps before we give up. + // If we already found a lower value we just skip this step and continue to the update. + real fnorm = 0; + nminstep = 0; + do + { + // Select a new trial point B in the interval [A,C]. + // If the derivatives at points a & c have different sign we interpolate to zero, + // otherwise just do a bisection since there might be multiple minima/maxima + // inside the interval. + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + // Take a trial step to point B + real *xb = static_cast<real *>(as_rvec_array(sb->s.x.data())[0]); + for (i = 0; i < n; i++) + { + xb[i] = lastx[i] + b*s[i]; + } + + neval++; + // Calculate energy for the trial step in point B + evaluate_energy(fplog, cr, + top_global, sb, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, step, FALSE); + fnorm = sb->fnorm; + + // Calculate gradient in point B + real *fb = static_cast<real *>(as_rvec_array(sb->f.data())[0]); + for (gpb = 0, i = 0; i < n; i++) + { + gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */ + + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative + // at the new point B, and rename the endpoints of this new interval A and C. + if (gpb > 0) + { + /* Replace c endpoint with b */ + c = b; + /* swap states b and c */ + swap_em_state(&sb, &sc); + } + else + { + /* Replace a endpoint with b */ + a = b; + /* swap states a and b */ + swap_em_state(&sa, &sb); + } + + /* + * Stop search as soon as we find a value smaller than the endpoints, + * or if the tolerance is below machine precision. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((sb->epot > sa->epot || sb->epot > sc->epot) && (nminstep < 20)); + + if (fabs(sb->epot - Epot0) < GMX_REAL_EPS || nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. + * If ncorr==0 this was steepest descent, and then we give up. + * If not, reset memory to restart as steepest descent before quitting. 
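The trial point chosen in the bracketing loop above is just the zero of the linear interpolant of the directional derivative: for the line through (a, g_a) and (c, g_c), solving g(b) = 0 gives

    b = a - g_a (c - a) / (g_c - g_a) = a + g_a (a - c) / (g_c - g_a),

which is the expression assigned to b when gpa < 0 < gpc; if the signs do not bracket a zero, or rounding pushes b outside (a, c), the code bisects with b = (a + c)/2 instead.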
+ */ + if (ncorr == 0) + { + /* Converged */ + converged = TRUE; + break; + } + else + { + /* Reset memory */ + ncorr = 0; + /* Search in gradient direction */ + for (i = 0; i < n; i++) + { + dx[point][i] = ff[i]; + } + /* Reset stepsize */ + stepsize = 1.0/fnorm; + continue; + } + } + + /* Select min energy state of A & C, put the best in xx/ff/Epot + */ + if (sc->epot < sa->epot) + { + /* Use state C */ + ems = *sc; + step_taken = c; + } + else + { + /* Use state A */ + ems = *sa; + step_taken = a; + } + + } + else + { + /* found lower */ + /* Use state C */ + ems = *sc; + step_taken = c; + } + + /* Update the memory information, and calculate a new + * approximation of the inverse hessian + */ + + /* Have new data in Epot, xx, ff */ + if (ncorr < nmaxcorr) + { + ncorr++; + } + + for (i = 0; i < n; i++) + { + dg[point][i] = lastf[i]-ff[i]; + dx[point][i] *= step_taken; + } + + dgdg = 0; + dgdx = 0; + for (i = 0; i < n; i++) + { + dgdg += dg[point][i]*dg[point][i]; + dgdx += dg[point][i]*dx[point][i]; + } + + diag = dgdx/dgdg; + + rho[point] = 1.0/dgdx; + point++; + + if (point >= nmaxcorr) + { + point = 0; + } + + /* Update */ + for (i = 0; i < n; i++) + { + p[i] = ff[i]; + } + + cp = point; + + /* Recursive update. First go back over the memory points */ + for (k = 0; k < ncorr; k++) + { + cp--; + if (cp < 0) + { + cp = ncorr-1; + } + + sq = 0; + for (i = 0; i < n; i++) + { + sq += dx[cp][i]*p[i]; + } + + alpha[cp] = rho[cp]*sq; + + for (i = 0; i < n; i++) + { + p[i] -= alpha[cp]*dg[cp][i]; + } + } + + for (i = 0; i < n; i++) + { + p[i] *= diag; + } + + /* And then go forward again */ + for (k = 0; k < ncorr; k++) + { + yr = 0; + for (i = 0; i < n; i++) + { + yr += p[i]*dg[cp][i]; + } + + beta = rho[cp]*yr; + beta = alpha[cp]-beta; + + for (i = 0; i < n; i++) + { + p[i] += beta*dx[cp][i]; + } + + cp++; + if (cp >= ncorr) + { + cp = 0; + } + } + + for (i = 0; i < n; i++) + { + if (!frozen[i]) + { + dx[point][i] = p[i]; + } + else + { + dx[point][i] = 0; + } + } + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", + step, ems.epot, ems.fnorm/sqrtNumAtoms, ems.fmax, ems.a_fmax + 1); + fflush(stderr); + } + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, state_global, inputrec->fepvals, inputrec->expandedvals, state_global->box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + do_log = do_per_step(step, inputrec->nstlog); + do_ene = do_per_step(step, inputrec->nstenergy); + if (do_log) + { + print_ebin_header(fplog, step, step); + } + print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, + do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Send x and E to IMD client, if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, as_rvec_array(state_global->x.data()), inputrec, 0, wcycle) && MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + // Reset stepsize in we are doing more iterations + stepsize = 1.0/ems.fnorm; + + /* Stop when the maximum force lies below tolerance. + * If we have reached machine precision, converged is already set to true. + */ + converged = converged || (ems.fmax < inputrec->em_tol); + + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. 
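The memory update and the pair of loops over the stored correction pairs above are the standard L-BFGS two-loop recursion. For reference, a compact self-contained version, with f the negative gradient as in this file; the function and argument names are ours, and the pairs are assumed ordered oldest to newest rather than kept in a ring as above:

    #include <vector>

    // dx[k], dg[k]: position/gradient differences; rho[k] = 1/(dg[k].dx[k]);
    // diag: the H0 scaling dgdx/dgdg computed above; f: current force (= -gradient).
    std::vector<double> lbfgsTwoLoop(const std::vector<std::vector<double>> &dx,
                                     const std::vector<std::vector<double>> &dg,
                                     const std::vector<double> &rho,
                                     double diag,
                                     const std::vector<double> &f)
    {
        const int m = static_cast<int>(rho.size());
        const int n = static_cast<int>(f.size());
        std::vector<double> p(f), alpha(m);
        for (int k = m - 1; k >= 0; k--)            // backward pass: newest first
        {
            double sq = 0;
            for (int i = 0; i < n; i++) { sq += dx[k][i]*p[i]; }
            alpha[k] = rho[k]*sq;
            for (int i = 0; i < n; i++) { p[i] -= alpha[k]*dg[k][i]; }
        }
        for (int i = 0; i < n; i++) { p[i] *= diag; }  // apply H0 = diag*I
        for (int k = 0; k < m; k++)                 // forward pass: oldest first
        {
            double yr = 0;
            for (int i = 0; i < n; i++) { yr += dg[k][i]*p[i]; }
            const double beta = alpha[k] - rho[k]*yr;
            for (int i = 0; i < n; i++) { p[i] += beta*dx[k][i]; }
        }
        return p;   // new search direction; the caller zeroes frozen components
    }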
*/
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (ems.fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE);
+            warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    /* If we printed energy and/or logfile last step (which was the last step)
+     * we don't have to do it again, but otherwise print the final values.
+     */
+    if (!do_log) /* Write final value to log since we didn't do anything last step */
+    {
+        print_ebin_header(fplog, step, step);
+    }
+    if (!do_ene || !do_log) /* Write final energy file entries */
+    {
+        print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                   !do_log ? fplog : nullptr, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+    }
+
+    /* Print some stuff... */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = !do_per_step(step, inputrec->nstfout);
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  &ems, state_global, observablesHistory);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, &ems, sqrtNumAtoms);
+        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, &ems, sqrtNumAtoms);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+
+    return 0;
+} /* That's all folks */
+
+/*!
\brief Do steepest descents minimization + \copydoc integrator_t(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_edsam_t ed, + t_forcerec *fr, + const ReplicaExchangeParameters &replExParams, + gmx_walltime_accounting_t walltime_accounting) + */ +double do_steep(FILE *fplog, t_commrec *cr, const gmx::MDLogger gmx_unused &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t gmx_unused *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + ObservablesHistory *observablesHistory, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_forcerec *fr, + const ReplicaExchangeParameters gmx_unused &replExParams, + gmx_membed_t gmx_unused *membed, + gmx_walltime_accounting_t walltime_accounting) +{ + const char *SD = "Steepest Descents"; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + real stepsize; + real ustep; + gmx_mdoutf_t outf; + t_mdebin *mdebin; + gmx_bool bDone, bAbort, do_x, do_f; + tensor vir, pres; + rvec mu_tot; + int nsteps; + int count = 0; + int steps_accepted = 0; + auto mdatoms = mdAtoms->mdatoms(); + + /* Create 2 states on the stack and extract pointers that we will swap */ + em_state_t s0 {}, s1 {}; + em_state_t *s_min = &s0; + em_state_t *s_try = &s1; + + /* Init em and store the local state in s_try */ + init_em(fplog, SD, cr, outputProvider, inputrec, mdrunOptions, + state_global, top_global, s_try, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, SD); + + /* Set variables for stepsize (in nm). This is the largest + * step that we are going to make in any direction. 
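Because each iteration below rescales stepsize = ustep/fmax before stepping, the move x' = x + stepsize*F displaces the atom feeling the largest force by exactly ustep and every other atom by less:

    max_i |dx_i| = (ustep / F_max) * max_i |F_i| = ustep

so ustep acts as a trust-radius-style cap on the per-step displacement, grown by a factor 1.2 after each accepted step and halved after each rejection.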
+ */ + ustep = inputrec->em_stepsize; + stepsize = 0; + + /* Max number of steps */ + nsteps = inputrec->nsteps; + + if (MASTER(cr)) + { + /* Print to the screen */ + sp_header(stderr, SD, inputrec->em_tol, nsteps); + } + if (fplog) + { + sp_header(fplog, SD, inputrec->em_tol, nsteps); + } + + /**** HERE STARTS THE LOOP **** + * count is the counter for the number of steps + * bDone will be TRUE when the minimization has converged + * bAbort will be TRUE when nsteps steps have been performed or when + * the stepsize becomes smaller than is reasonable for machine precision + */ + count = 0; + bDone = FALSE; + bAbort = FALSE; + while (!bDone && !bAbort) + { + bAbort = (nsteps >= 0) && (count == nsteps); + + /* set new coordinates, except for first step */ + bool validStep = true; + if (count > 0) + { + validStep = + do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, + s_min, stepsize, &s_min->f, s_try, + constr, top, nrnb, wcycle, count); + } + + if (validStep) + { + evaluate_energy(fplog, cr, + top_global, s_try, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, count, count == 0); + } + else + { + // Signal constraint error during stepping with energy=inf + s_try->epot = std::numeric_limits<real>::infinity(); + } + + if (MASTER(cr)) + { + print_ebin_header(fplog, count, count); + } + + if (count == 0) + { + s_min->epot = s_try->epot; + } + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c", + count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1, + ( (count == 0) || (s_try->epot < s_min->epot) ) ? '\n' : '\r'); + fflush(stderr); + } + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)count, + mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals, + s_try->s.box, nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + /* Prepare IMD energy record, if bIMD is TRUE. */ + IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE); + + print_ebin(mdoutf_get_fp_ene(outf), TRUE, + do_per_step(steps_accepted, inputrec->nstdisreout), + do_per_step(steps_accepted, inputrec->nstorireout), + fplog, count, count, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + fflush(fplog); + } + } + + /* Now if the new energy is smaller than the previous... + * or if this is the first step! + * or if we did random steps! + */ + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + steps_accepted++; + + /* Test whether the convergence criterion is met... */ + bDone = (s_try->fmax < inputrec->em_tol); + + /* Copy the arrays for force, positions and energy */ + /* The 'Min' array always holds the coords and forces of the minimal + sampled energy */ + swap_em_state(&s_min, &s_try); + if (count > 0) + { + ustep *= 1.2; + } + + /* Write to trn, if necessary */ + do_x = do_per_step(steps_accepted, inputrec->nstxout); + do_f = do_per_step(steps_accepted, inputrec->nstfout); + write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, + top_global, inputrec, count, + s_min, state_global, observablesHistory); + } + else + { + /* If energy is not smaller make the step smaller... 
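Stripped of I/O, IMD and domain-decomposition bookkeeping, the accept/grow, reject/shrink control flow of this loop reduces to the sketch below. takeStep and accept are assumed callbacks standing in for do_em_step/evaluate_energy and the state swap, and the 1e-6 floor is the single-precision limit used by this file:

    #include <functional>

    struct Trial { double epot; double fmax; };

    void steepestDescentSketch(const std::function<Trial(double)> &takeStep,
                               const std::function<void()> &accept,
                               double ustep, double ftol, double fmax0, int nsteps)
    {
        double fmaxCur = fmax0;
        double epotMin = 1e30;                        // nothing accepted yet
        bool   done    = false;
        for (int count = 0; count < nsteps && !done && ustep > 1e-6; count++)
        {
            Trial trial = takeStep(ustep / fmaxCur);  // max displacement == ustep
            if (trial.epot < epotMin)                 // downhill: accept and grow
            {
                accept();
                epotMin  = trial.epot;
                fmaxCur  = trial.fmax;
                done     = (trial.fmax < ftol);
                ustep   *= 1.2;
            }
            else                                      // uphill: halve and retry
            {
                ustep *= 0.5;
            }
        }
    }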
*/ + ustep *= 0.5; + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, count, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + } + + /* Determine new step */ + stepsize = ustep/s_min->fmax; + + /* Check if stepsize is too small, with 1 nm as a characteristic length */ +#if GMX_DOUBLE + if (count == nsteps || ustep < 1e-12) +#else + if (count == nsteps || ustep < 1e-6) +#endif + { + if (MASTER(cr)) + { + warn_step(stderr, inputrec->em_tol, count == nsteps, constr != nullptr); + warn_step(fplog, inputrec->em_tol, count == nsteps, constr != nullptr); + } + bAbort = TRUE; + } + + /* Send IMD energies and positions, if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box, + MASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr, + inputrec, 0, wcycle) && + MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + count++; + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + /* Print some data... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, count, + s_min, state_global, observablesHistory); + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + + print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, + s_min, sqrtNumAtoms); + print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, + s_min, sqrtNumAtoms); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + inputrec->nsteps = count; + + walltime_accounting_set_nsteps_done(walltime_accounting, count); + + return 0; +} /* That's all folks */ + +/*! 
\brief Do normal modes analysis + \copydoc integrator_t(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_edsam_t ed, + t_forcerec *fr, + const ReplicaExchangeParameters &replExParams, + gmx_walltime_accounting_t walltime_accounting) + */ +double do_nm(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t gmx_unused *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + ObservablesHistory gmx_unused *observablesHistory, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_forcerec *fr, + const ReplicaExchangeParameters gmx_unused &replExParams, + gmx_membed_t gmx_unused *membed, + gmx_walltime_accounting_t walltime_accounting) +{ + const char *NM = "Normal Mode Analysis"; + gmx_mdoutf_t outf; + int nnodes, node; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + tensor vir, pres; + rvec mu_tot; + rvec *fneg, *dfdx; + gmx_bool bSparse; /* use sparse matrix storage format */ + size_t sz; + gmx_sparsematrix_t * sparse_matrix = nullptr; + real * full_matrix = nullptr; + + /* added with respect to mdrun */ + int row, col; + real der_range = 10.0*sqrt(GMX_REAL_EPS); + real x_min; + bool bIsMaster = MASTER(cr); + auto mdatoms = mdAtoms->mdatoms(); + + if (constr != nullptr) + { + gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported"); + } + + gmx_shellfc_t *shellfc; + + em_state_t state_work {}; + + /* Init em and store the local state in state_minimum */ + init_em(fplog, NM, cr, outputProvider, inputrec, mdrunOptions, + state_global, top_global, &state_work, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, &shellfc, + nfile, fnm, &outf, nullptr, wcycle); + + std::vector<size_t> atom_index = get_atom_index(top_global); + snew(fneg, atom_index.size()); + snew(dfdx, atom_index.size()); + +#if !GMX_DOUBLE + if (bIsMaster) + { + fprintf(stderr, + "NOTE: This version of GROMACS has been compiled in single precision,\n" + " which MIGHT not be accurate enough for normal mode analysis.\n" + " GROMACS now uses sparse matrix storage, so the memory requirements\n" + " are fairly modest even if you recompile in double precision.\n\n"); + } +#endif + + /* Check if we can/should use sparse storage format. + * + * Sparse format is only useful when the Hessian itself is sparse, which it + * will be when we use a cutoff. + * For small systems (n<1000) it is easier to always use full matrix format, though. 
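The storage argument is easy to quantify: the full format needs (3N)^2 reals. At the N = 1000 crossover used below that is 9e6 reals, about 36 MB in single precision, which is harmless; at N = 1e5 it would already be 9e10 reals, roughly 360 GB, which is why the sparse format is preferred whenever a cutoff makes the Hessian sparse.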
+ */ + if (EEL_FULL(fr->ic->eeltype) || fr->rlist == 0.0) + { + GMX_LOG(mdlog.warning).appendText("Non-cutoff electrostatics used, forcing full Hessian format."); + bSparse = FALSE; + } + else if (atom_index.size() < 1000) + { + GMX_LOG(mdlog.warning).appendTextFormatted("Small system size (N=%d), using full Hessian format.", + atom_index.size()); + bSparse = FALSE; + } + else + { + GMX_LOG(mdlog.warning).appendText("Using compressed symmetric sparse Hessian format."); + bSparse = TRUE; + } + + /* Number of dimensions, based on real atoms, that is not vsites or shell */ + sz = DIM*atom_index.size(); + + fprintf(stderr, "Allocating Hessian memory...\n\n"); + + if (bSparse) + { + sparse_matrix = gmx_sparsematrix_init(sz); + sparse_matrix->compressed_symmetric = TRUE; + } + else + { + snew(full_matrix, sz*sz); + } + + init_nrnb(nrnb); + + where(); + + /* Write start time and temperature */ + print_em_start(fplog, cr, walltime_accounting, wcycle, NM); + + /* fudge nr of steps to nr of atoms */ + inputrec->nsteps = atom_index.size()*2; + + if (bIsMaster) + { + fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n", + *(top_global->name), (int)inputrec->nsteps); + } + + nnodes = cr->nnodes; + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + evaluate_energy(fplog, cr, + top_global, &state_work, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, -1, TRUE); + cr->nnodes = nnodes; + + /* if forces are not small, warn user */ + get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, &state_work); + + GMX_LOG(mdlog.warning).appendTextFormatted("Maximum force:%12.5e", state_work.fmax); + if (state_work.fmax > 1.0e-3) + { + GMX_LOG(mdlog.warning).appendText( + "The force is probably not small enough to " + "ensure that you are at a minimum.\n" + "Be aware that negative eigenvalues may occur\n" + "when the resulting matrix is diagonalized."); + } + + /*********************************************************** + * + * Loop over all pairs in matrix + * + * do_force called twice. 
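Each pair of force evaluations below yields one block row of the Hessian by central differences: displacing coordinate r by +/- h, with h = der_range,

    H_{r,c} = d^2 E / (dx_r dx_c) = -( f_c(x + h e_r) - f_c(x - h e_r) ) / (2h),

which is exactly the dfdx expression computed from state_work.f and fneg after the two displaced evaluations (f is the force, i.e. the negative gradient, hence the leading minus sign).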
Once with positive and + * once with negative displacement + * + ************************************************************/ + + /* Steps are divided one by one over the nodes */ + bool bNS = true; + for (unsigned int aid = cr->nodeid; aid < atom_index.size(); aid += nnodes) + { + size_t atom = atom_index[aid]; + for (size_t d = 0; d < DIM; d++) + { + gmx_bool bBornRadii = FALSE; + gmx_int64_t step = 0; + int force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES; + double t = 0; + + x_min = state_work.s.x[atom][d]; + + for (unsigned int dx = 0; (dx < 2); dx++) + { + if (dx == 0) + { + state_work.s.x[atom][d] = x_min - der_range; + } + else + { + state_work.s.x[atom][d] = x_min + der_range; + } + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + if (shellfc) + { + /* Now is the time to relax the shells */ + (void) relax_shell_flexcon(fplog, cr, mdrunOptions.verbose, step, + inputrec, bNS, force_flags, + top, + constr, enerd, fcd, + &state_work.s, &state_work.f, vir, mdatoms, + nrnb, wcycle, graph, &top_global->groups, + shellfc, fr, bBornRadii, t, mu_tot, + vsite, + DdOpenBalanceRegionBeforeForceComputation::no, + DdCloseBalanceRegionAfterForceComputation::no); + bNS = false; + step++; + } + else + { + evaluate_energy(fplog, cr, + top_global, &state_work, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, atom*2+dx, FALSE); + } + + cr->nnodes = nnodes; + + if (dx == 0) + { + for (size_t i = 0; i < atom_index.size(); i++) + { + copy_rvec(state_work.f[atom_index[i]], fneg[i]); + } + } + } + + /* x is restored to original */ + state_work.s.x[atom][d] = x_min; + + for (size_t j = 0; j < atom_index.size(); j++) + { + for (size_t k = 0; (k < DIM); k++) + { + dfdx[j][k] = + -(state_work.f[atom_index[j]][k] - fneg[j][k])/(2*der_range); + } + } + + if (!bIsMaster) + { +#if GMX_MPI +#define mpi_type GMX_MPI_REAL + MPI_Send(dfdx[0], atom_index.size()*DIM, mpi_type, MASTER(cr), + cr->nodeid, cr->mpi_comm_mygroup); +#endif + } + else + { + for (node = 0; (node < nnodes && atom+node < atom_index.size()); node++) + { + if (node > 0) + { +#if GMX_MPI + MPI_Status stat; + MPI_Recv(dfdx[0], atom_index.size()*DIM, mpi_type, node, node, + cr->mpi_comm_mygroup, &stat); +#undef mpi_type +#endif + } + + row = (atom + node)*DIM + d; + + for (size_t j = 0; j < atom_index.size(); j++) + { + for (size_t k = 0; k < DIM; k++) + { + col = j*DIM + k; + + if (bSparse) + { + if (col >= row && dfdx[j][k] != 0.0) + { + gmx_sparsematrix_increment_value(sparse_matrix, + row, col, dfdx[j][k]); + } + } + else + { + full_matrix[row*sz+col] = dfdx[j][k]; + } + } + } + } + } + + if (mdrunOptions.verbose && fplog) + { + fflush(fplog); + } + } + /* write progress */ + if (bIsMaster && mdrunOptions.verbose) + { + fprintf(stderr, "\rFinished step %d out of %d", + static_cast<int>(std::min(atom+nnodes, atom_index.size())), + static_cast<int>(atom_index.size())); + fflush(stderr); + } + } + + if (bIsMaster) + { + fprintf(stderr, "\n\nWriting Hessian...\n"); + gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size()*2); + + return 0; +} + +} // namespace gmx diff --git a/patches/gromacs-2018.1.diff/src/gromacs/mdlib/minimize.cpp.preplumed b/patches/gromacs-2018.1.diff/src/gromacs/mdlib/minimize.cpp.preplumed new file mode 100644 index 
0000000000000000000000000000000000000000..a27575016d14266131847ff9d5e0c6bf55b85ba2 --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/gromacs/mdlib/minimize.cpp.preplumed @@ -0,0 +1,2978 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! 
\internal \file + * + * \brief This file defines integrators for energy minimization + * + * \author Berk Hess <hess@kth.se> + * \author Erik Lindahl <erik@kth.se> + * \ingroup module_mdlib + */ +#include "gmxpre.h" + +#include "minimize.h" + +#include "config.h" + +#include <cmath> +#include <cstring> +#include <ctime> + +#include <algorithm> +#include <vector> + +#include "gromacs/commandline/filenm.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/fileio/confio.h" +#include "gromacs/fileio/mtxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/imd/imd.h" +#include "gromacs/linearalgebra/sparsematrix.h" +#include "gromacs/listed-forces/manage-threading.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdebin.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/mdsetup.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/shellfc.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/tgroup.h" +#include "gromacs/mdlib/trajectory_writing.h" +#include "gromacs/mdlib/update.h" +#include "gromacs/mdlib/vsite.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/timing/walltime_accounting.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/topology/topology.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/smalloc.h" + +//! Utility structure for manipulating states during EM +typedef struct { + //! Copy of the global state + t_state s; + //! Force array + PaddedRVecVector f; + //! Potential energy + real epot; + //! Norm of the force + real fnorm; + //! Maximum force + real fmax; + //! Direction + int a_fmax; +} em_state_t; + +//! Print the EM starting conditions +static void print_em_start(FILE *fplog, + t_commrec *cr, + gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle, + const char *name) +{ + walltime_accounting_start(walltime_accounting); + wallcycle_start(wcycle, ewcRUN); + print_start(fplog, cr, walltime_accounting, name); +} + +//! Stop counting time for EM +static void em_time_end(gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle) +{ + wallcycle_stop(wcycle, ewcRUN); + + walltime_accounting_end(walltime_accounting); +} + +//! Printing a log file and console header +static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps) +{ + fprintf(out, "\n"); + fprintf(out, "%s:\n", minimizer); + fprintf(out, " Tolerance (Fmax) = %12.5e\n", ftol); + fprintf(out, " Number of steps = %12d\n", nsteps); +} + +//! 
Print warning message +static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain) +{ + char buffer[2048]; + if (bLastStep) + { + sprintf(buffer, + "\nEnergy minimization reached the maximum number " + "of steps before the forces reached the requested " + "precision Fmax < %g.\n", ftol); + } + else + { + sprintf(buffer, + "\nEnergy minimization has stopped, but the forces have " + "not converged to the requested precision Fmax < %g (which " + "may not be possible for your system). It stopped " + "because the algorithm tried to make a new step whose size " + "was too small, or there was no change in the energy since " + "last step. Either way, we regard the minimization as " + "converged to within the available machine precision, " + "given your starting configuration and EM parameters.\n%s%s", + ftol, + sizeof(real) < sizeof(double) ? + "\nDouble precision normally gives you higher accuracy, but " + "this is often not needed for preparing to run molecular " + "dynamics.\n" : + "", + bConstrain ? + "You might need to increase your constraint accuracy, or turn\n" + "off constraints altogether (set constraints = none in mdp file)\n" : + ""); + } + fputs(wrap_lines(buffer, 78, 0, FALSE), fp); +} + +//! Print message about convergence of the EM +static void print_converged(FILE *fp, const char *alg, real ftol, + gmx_int64_t count, gmx_bool bDone, gmx_int64_t nsteps, + const em_state_t *ems, double sqrtNumAtoms) +{ + char buf[STEPSTRSIZE]; + + if (bDone) + { + fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", + alg, ftol, gmx_step_str(count, buf)); + } + else if (count < nsteps) + { + fprintf(fp, "\n%s converged to machine precision in %s steps,\n" + "but did not reach the requested Fmax < %g.\n", + alg, gmx_step_str(count, buf), ftol); + } + else + { + fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", + alg, ftol, gmx_step_str(count, buf)); + } + +#if GMX_DOUBLE + fprintf(fp, "Potential Energy = %21.14e\n", ems->epot); + fprintf(fp, "Maximum force = %21.14e on atom %d\n", ems->fmax, ems->a_fmax + 1); + fprintf(fp, "Norm of force = %21.14e\n", ems->fnorm/sqrtNumAtoms); +#else + fprintf(fp, "Potential Energy = %14.7e\n", ems->epot); + fprintf(fp, "Maximum force = %14.7e on atom %d\n", ems->fmax, ems->a_fmax + 1); + fprintf(fp, "Norm of force = %14.7e\n", ems->fnorm/sqrtNumAtoms); +#endif +} + +//! Compute the norm and max of the force array in parallel +static void get_f_norm_max(t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, const rvec *f, + real *fnorm, real *fmax, int *a_fmax) +{ + double fnorm2, *sum; + real fmax2, fam; + int la_max, a_max, start, end, i, m, gf; + + /* This routine finds the largest force and returns it. + * On parallel machines the global max is taken. 
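The quantities assembled here drive all of the convergence printouts: fnorm = sqrt(sum_i |f_i|^2) over the non-frozen components, and fmax the largest per-atom force. Every report divides fnorm by sqrt(N), so what the user actually sees is the root-mean-square per-atom force:

    Fnorm_reported = |F| / sqrt(N) = sqrt( (1/N) sum_i |f_i|^2 )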
+ */ + fnorm2 = 0; + fmax2 = 0; + la_max = -1; + start = 0; + end = mdatoms->homenr; + if (mdatoms->cFREEZE) + { + for (i = start; i < end; i++) + { + gf = mdatoms->cFREEZE[i]; + fam = 0; + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + fam += gmx::square(f[i][m]); + } + } + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + else + { + for (i = start; i < end; i++) + { + fam = norm2(f[i]); + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + + if (la_max >= 0 && DOMAINDECOMP(cr)) + { + a_max = cr->dd->gatindex[la_max]; + } + else + { + a_max = la_max; + } + if (PAR(cr)) + { + snew(sum, 2*cr->nnodes+1); + sum[2*cr->nodeid] = fmax2; + sum[2*cr->nodeid+1] = a_max; + sum[2*cr->nnodes] = fnorm2; + gmx_sumd(2*cr->nnodes+1, sum, cr); + fnorm2 = sum[2*cr->nnodes]; + /* Determine the global maximum */ + for (i = 0; i < cr->nnodes; i++) + { + if (sum[2*i] > fmax2) + { + fmax2 = sum[2*i]; + a_max = (int)(sum[2*i+1] + 0.5); + } + } + sfree(sum); + } + + if (fnorm) + { + *fnorm = sqrt(fnorm2); + } + if (fmax) + { + *fmax = sqrt(fmax2); + } + if (a_fmax) + { + *a_fmax = a_max; + } +} + +//! Compute the norm of the force +static void get_state_f_norm_max(t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, + em_state_t *ems) +{ + get_f_norm_max(cr, opts, mdatoms, as_rvec_array(ems->f.data()), + &ems->fnorm, &ems->fmax, &ems->a_fmax); +} + +//! Initialize the energy minimization +static void init_em(FILE *fplog, const char *title, + t_commrec *cr, gmx::IMDOutputProvider *outputProvider, + t_inputrec *ir, + const MdrunOptions &mdrunOptions, + t_state *state_global, gmx_mtop_t *top_global, + em_state_t *ems, gmx_localtop_t **top, + t_nrnb *nrnb, rvec mu_tot, + t_forcerec *fr, gmx_enerdata_t **enerd, + t_graph **graph, gmx::MDAtoms *mdAtoms, gmx_global_stat_t *gstat, + gmx_vsite_t *vsite, gmx_constr_t constr, gmx_shellfc_t **shellfc, + int nfile, const t_filenm fnm[], + gmx_mdoutf_t *outf, t_mdebin **mdebin, + gmx_wallcycle_t wcycle) +{ + real dvdl_constr; + + if (fplog) + { + fprintf(fplog, "Initiating %s\n", title); + } + + if (MASTER(cr)) + { + state_global->ngtc = 0; + + /* Initialize lambda variables */ + initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, nullptr); + } + + init_nrnb(nrnb); + + /* Interactive molecular dynamics */ + init_IMD(ir, cr, top_global, fplog, 1, + MASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr, + nfile, fnm, nullptr, mdrunOptions); + + if (ir->eI == eiNM) + { + GMX_ASSERT(shellfc != nullptr, "With NM we always support shells"); + + *shellfc = init_shell_flexcon(stdout, + top_global, + n_flexible_constraints(constr), + ir->nstcalcenergy, + DOMAINDECOMP(cr)); + } + else + { + GMX_ASSERT(EI_ENERGY_MINIMIZATION(ir->eI), "This else currently only handles energy minimizers, consider if your algorithm needs shell/flexible-constraint support"); + + /* With energy minimization, shells and flexible constraints are + * automatically minimized when treated like normal DOFS. 
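The parallel reduction in get_f_norm_max above packs each rank's {fmax2, atom index} pair plus fnorm2 into one double array, sums it with gmx_sumd, and rescans for the winner; that formulation lets a single collective also accumulate fnorm2. For comparison only, the max-with-location part can be written with MPI's built-in reduction (globalMaxForce is our name, not GROMACS API):

    #include <mpi.h>

    struct ValIdx { double val; int idx; };   // layout required by MPI_DOUBLE_INT

    ValIdx globalMaxForce(double localFmax2, int localAtom, MPI_Comm comm)
    {
        ValIdx in  = { localFmax2, localAtom };
        ValIdx out = { 0.0, -1 };
        MPI_Allreduce(&in, &out, 1, MPI_DOUBLE_INT, MPI_MAXLOC, comm);
        return out;   // out.val: global max squared force; out.idx: its atom
    }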
+ */ + if (shellfc != nullptr) + { + *shellfc = nullptr; + } + } + + auto mdatoms = mdAtoms->mdatoms(); + if (DOMAINDECOMP(cr)) + { + *top = dd_init_local_top(top_global); + + dd_init_local_state(cr->dd, state_global, &ems->s); + + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + &ems->s, &ems->f, mdAtoms, *top, + fr, vsite, constr, + nrnb, nullptr, FALSE); + dd_store_state(cr->dd, &ems->s); + + *graph = nullptr; + } + else + { + state_change_natoms(state_global, state_global->natoms); + /* Just copy the state */ + ems->s = *state_global; + state_change_natoms(&ems->s, ems->s.natoms); + /* We need to allocate one element extra, since we might use + * (unaligned) 4-wide SIMD loads to access rvec entries. + */ + ems->f.resize(gmx::paddedRVecVectorSize(ems->s.natoms)); + + snew(*top, 1); + mdAlgorithmsSetupAtomData(cr, ir, top_global, *top, fr, + graph, mdAtoms, + vsite, shellfc ? *shellfc : nullptr); + + if (vsite) + { + set_vsite_top(vsite, *top, mdatoms); + } + } + + update_mdatoms(mdAtoms->mdatoms(), ems->s.lambda[efptMASS]); + + if (constr) + { + if (ir->eConstrAlg == econtSHAKE && + gmx_mtop_ftype_count(top_global, F_CONSTR) > 0) + { + gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n", + econstr_names[econtSHAKE], econstr_names[econtLINCS]); + } + + if (!DOMAINDECOMP(cr)) + { + set_constraints(constr, *top, ir, mdatoms, cr); + } + + if (!ir->bContinuation) + { + /* Constrain the starting coordinates */ + dvdl_constr = 0; + constrain(PAR(cr) ? nullptr : fplog, TRUE, TRUE, constr, &(*top)->idef, + ir, cr, -1, 0, 1.0, mdatoms, + as_rvec_array(ems->s.x.data()), + as_rvec_array(ems->s.x.data()), + nullptr, + fr->bMolPBC, ems->s.box, + ems->s.lambda[efptFEP], &dvdl_constr, + nullptr, nullptr, nrnb, econqCoord); + } + } + + if (PAR(cr)) + { + *gstat = global_stat_init(ir); + } + else + { + *gstat = nullptr; + } + + *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, top_global, nullptr, wcycle); + + snew(*enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + *enerd); + + if (mdebin != nullptr) + { + /* Init bin for energy stuff */ + *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, nullptr); + } + + clear_rvec(mu_tot); + calc_shifts(ems->s.box, fr->shift_vec); +} + +//! Finalize the minimization +static void finish_em(t_commrec *cr, gmx_mdoutf_t outf, + gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle) +{ + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + done_mdoutf(outf); + + em_time_end(walltime_accounting, wcycle); +} + +//! Swap two different EM states during minimization +static void swap_em_state(em_state_t **ems1, em_state_t **ems2) +{ + em_state_t *tmp; + + tmp = *ems1; + *ems1 = *ems2; + *ems2 = tmp; +} + +//! 
Save the EM trajectory +static void write_em_traj(FILE *fplog, t_commrec *cr, + gmx_mdoutf_t outf, + gmx_bool bX, gmx_bool bF, const char *confout, + gmx_mtop_t *top_global, + t_inputrec *ir, gmx_int64_t step, + em_state_t *state, + t_state *state_global, + ObservablesHistory *observablesHistory) +{ + int mdof_flags = 0; + + if (bX) + { + mdof_flags |= MDOF_X; + } + if (bF) + { + mdof_flags |= MDOF_F; + } + + /* If we want IMD output, set appropriate MDOF flag */ + if (ir->bIMD) + { + mdof_flags |= MDOF_IMD; + } + + mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, + top_global, step, (double)step, + &state->s, state_global, observablesHistory, + state->f); + + if (confout != nullptr && MASTER(cr)) + { + GMX_RELEASE_ASSERT(bX, "The code below assumes that (with domain decomposition), x is collected to state_global in the call above."); + /* With domain decomposition the call above collected the state->s.x + * into state_global->x. Without DD we copy the local state pointer. + */ + if (!DOMAINDECOMP(cr)) + { + state_global = &state->s; + } + + if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr)) + { + /* Make molecules whole only for confout writing */ + do_pbc_mtop(fplog, ir->ePBC, state->s.box, top_global, + as_rvec_array(state_global->x.data())); + } + + write_sto_conf_mtop(confout, + *top_global->name, top_global, + as_rvec_array(state_global->x.data()), nullptr, ir->ePBC, state->s.box); + } +} + +//! \brief Do one minimization step +// +// \returns true when the step succeeded, false when a constraint error occurred +static bool do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, + gmx_bool bMolPBC, + em_state_t *ems1, real a, const PaddedRVecVector *force, + em_state_t *ems2, + gmx_constr_t constr, gmx_localtop_t *top, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_int64_t count) + +{ + t_state *s1, *s2; + int start, end; + real dvdl_constr; + int nthreads gmx_unused; + + bool validStep = true; + + s1 = &ems1->s; + s2 = &ems2->s; + + if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) + { + gmx_incons("state mismatch in do_em_step"); + } + + s2->flags = s1->flags; + + if (s2->natoms != s1->natoms) + { + state_change_natoms(s2, s1->natoms); + /* We need to allocate one element extra, since we might use + * (unaligned) 4-wide SIMD loads to access rvec entries. 
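Underneath the padding, constraint and OpenMP machinery, the step that do_em_step() takes below is a freeze-masked displacement along the force, x2 = x1 + a*f. Schematically (a standalone sketch; freezeDim mirrors opts.nFreeze, group mirrors md->cFREEZE, and the names are ours):

    // x2 = x1 + a*f, leaving frozen dimensions untouched.
    void emStepSketch(int natoms, const double (*x1)[3], const double (*f)[3],
                      double a, const int (*freezeDim)[3], const int *group,
                      double (*x2)[3])
    {
        for (int i = 0; i < natoms; i++)
        {
            const int g = group ? group[i] : 0;   // freeze group of atom i
            for (int m = 0; m < 3; m++)
            {
                x2[i][m] = freezeDim[g][m] ? x1[i][m]
                                           : x1[i][m] + a*f[i][m];
            }
        }
    }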
+ */ + ems2->f.resize(gmx::paddedRVecVectorSize(s2->natoms)); + } + if (DOMAINDECOMP(cr) && s2->cg_gl.size() != s1->cg_gl.size()) + { + s2->cg_gl.resize(s1->cg_gl.size()); + } + + copy_mat(s1->box, s2->box); + /* Copy free energy state */ + s2->lambda = s1->lambda; + copy_mat(s1->box, s2->box); + + start = 0; + end = md->homenr; + + // cppcheck-suppress unreadVariable + nthreads = gmx_omp_nthreads_get(emntUpdate); +#pragma omp parallel num_threads(nthreads) + { + const rvec *x1 = as_rvec_array(s1->x.data()); + rvec *x2 = as_rvec_array(s2->x.data()); + const rvec *f = as_rvec_array(force->data()); + + int gf = 0; +#pragma omp for schedule(static) nowait + for (int i = start; i < end; i++) + { + try + { + if (md->cFREEZE) + { + gf = md->cFREEZE[i]; + } + for (int m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[gf][m]) + { + x2[i][m] = x1[i][m]; + } + else + { + x2[i][m] = x1[i][m] + a*f[i][m]; + } + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + + if (s2->flags & (1<<estCGP)) + { + /* Copy the CG p vector */ + const rvec *p1 = as_rvec_array(s1->cg_p.data()); + rvec *p2 = as_rvec_array(s2->cg_p.data()); +#pragma omp for schedule(static) nowait + for (int i = start; i < end; i++) + { + // Trivial OpenMP block that does not throw + copy_rvec(p1[i], p2[i]); + } + } + + if (DOMAINDECOMP(cr)) + { + s2->ddp_count = s1->ddp_count; + + /* OpenMP does not supported unsigned loop variables */ +#pragma omp for schedule(static) nowait + for (int i = 0; i < static_cast<int>(s2->cg_gl.size()); i++) + { + s2->cg_gl[i] = s1->cg_gl[i]; + } + s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; + } + } + + if (constr) + { + wallcycle_start(wcycle, ewcCONSTR); + dvdl_constr = 0; + validStep = + constrain(nullptr, TRUE, TRUE, constr, &top->idef, + ir, cr, count, 0, 1.0, md, + as_rvec_array(s1->x.data()), as_rvec_array(s2->x.data()), + nullptr, bMolPBC, s2->box, + s2->lambda[efptBONDED], &dvdl_constr, + nullptr, nullptr, nrnb, econqCoord); + wallcycle_stop(wcycle, ewcCONSTR); + + // We should move this check to the different minimizers + if (!validStep && ir->eI != eiSteep) + { + gmx_fatal(FARGS, "The coordinates could not be constrained. Minimizer '%s' can not handle constraint failures, use minimizer '%s' before using '%s'.", + EI(ir->eI), EI(eiSteep), EI(ir->eI)); + } + } + + return validStep; +} + +//! Prepare EM for using domain decomposition parallellization +static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr, + gmx_mtop_t *top_global, t_inputrec *ir, + em_state_t *ems, gmx_localtop_t *top, + gmx::MDAtoms *mdAtoms, t_forcerec *fr, + gmx_vsite_t *vsite, gmx_constr_t constr, + t_nrnb *nrnb, gmx_wallcycle_t wcycle) +{ + /* Repartition the domain decomposition */ + dd_partition_system(fplog, step, cr, FALSE, 1, + nullptr, top_global, ir, + &ems->s, &ems->f, + mdAtoms, top, fr, vsite, constr, + nrnb, wcycle, FALSE); + dd_store_state(cr->dd, &ems->s); +} + +//! 
Do one energy evaluation
+static void evaluate_energy(FILE *fplog, t_commrec *cr,
+                            gmx_mtop_t *top_global,
+                            em_state_t *ems, gmx_localtop_t *top,
+                            t_inputrec *inputrec,
+                            t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                            gmx_global_stat_t gstat,
+                            gmx_vsite_t *vsite, gmx_constr_t constr,
+                            t_fcdata *fcd,
+                            t_graph *graph, gmx::MDAtoms *mdAtoms,
+                            t_forcerec *fr, rvec mu_tot,
+                            gmx_enerdata_t *enerd, tensor vir, tensor pres,
+                            gmx_int64_t count, gmx_bool bFirst)
+{
+    real     t;
+    gmx_bool bNS;
+    tensor   force_vir, shake_vir, ekin;
+    real     dvdl_constr, prescorr, enercorr, dvdlcorr;
+    real     terminate = 0;
+
+    /* Set the time to the initial time, the time does not change during EM */
+    t = inputrec->init_t;
+
+    if (bFirst ||
+        (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count))
+    {
+        /* This is the first state or an old state used before the last ns */
+        bNS = TRUE;
+    }
+    else
+    {
+        bNS = FALSE;
+        if (inputrec->nstlist > 0)
+        {
+            bNS = TRUE;
+        }
+    }
+
+    if (vsite)
+    {
+        construct_vsites(vsite, as_rvec_array(ems->s.x.data()), 1, nullptr,
+                         top->idef.iparams, top->idef.il,
+                         fr->ePBC, fr->bMolPBC, cr, ems->s.box);
+    }
+
+    if (DOMAINDECOMP(cr) && bNS)
+    {
+        /* Repartition the domain decomposition */
+        em_dd_partition_system(fplog, count, cr, top_global, inputrec,
+                               ems, top, mdAtoms, fr, vsite, constr,
+                               nrnb, wcycle);
+    }
+
+    /* Calc force & energy on new trial position */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole in congrad.c
+     */
+    do_force(fplog, cr, inputrec,
+             count, nrnb, wcycle, top, &top_global->groups,
+             ems->s.box, ems->s.x, &ems->s.hist,
+             ems->f, force_vir, mdAtoms->mdatoms(), enerd, fcd,
+             ems->s.lambda, graph, fr, vsite, mu_tot, t, nullptr, TRUE,
+             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES |
+             GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY |
+             (bNS ? GMX_FORCE_NS : 0),
+             DOMAINDECOMP(cr) ?
+             DdOpenBalanceRegionBeforeForceComputation::yes :
+             DdOpenBalanceRegionBeforeForceComputation::no,
+             DOMAINDECOMP(cr) ?
+ DdCloseBalanceRegionAfterForceComputation::yes : + DdCloseBalanceRegionAfterForceComputation::no); + + /* Clear the unused shake virial and pressure */ + clear_mat(shake_vir); + clear_mat(pres); + + /* Communicate stuff when parallel */ + if (PAR(cr) && inputrec->eI != eiNM) + { + wallcycle_start(wcycle, ewcMoveE); + + global_stat(gstat, cr, enerd, force_vir, shake_vir, mu_tot, + inputrec, nullptr, nullptr, nullptr, 1, &terminate, + nullptr, FALSE, + CGLO_ENERGY | + CGLO_PRESSURE | + CGLO_CONSTRAINT); + + wallcycle_stop(wcycle, ewcMoveE); + } + + /* Calculate long range corrections to pressure and energy */ + calc_dispcorr(inputrec, fr, ems->s.box, ems->s.lambda[efptVDW], + pres, force_vir, &prescorr, &enercorr, &dvdlcorr); + enerd->term[F_DISPCORR] = enercorr; + enerd->term[F_EPOT] += enercorr; + enerd->term[F_PRES] += prescorr; + enerd->term[F_DVDL] += dvdlcorr; + + ems->epot = enerd->term[F_EPOT]; + + if (constr) + { + /* Project out the constraint components of the force */ + wallcycle_start(wcycle, ewcCONSTR); + dvdl_constr = 0; + rvec *f_rvec = as_rvec_array(ems->f.data()); + constrain(nullptr, FALSE, FALSE, constr, &top->idef, + inputrec, cr, count, 0, 1.0, mdAtoms->mdatoms(), + as_rvec_array(ems->s.x.data()), f_rvec, f_rvec, + fr->bMolPBC, ems->s.box, + ems->s.lambda[efptBONDED], &dvdl_constr, + nullptr, &shake_vir, nrnb, econqForceDispl); + enerd->term[F_DVDL_CONSTR] += dvdl_constr; + m_add(force_vir, shake_vir, vir); + wallcycle_stop(wcycle, ewcCONSTR); + } + else + { + copy_mat(force_vir, vir); + } + + clear_mat(ekin); + enerd->term[F_PRES] = + calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres); + + sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals); + + if (EI_ENERGY_MINIMIZATION(inputrec->eI)) + { + get_state_f_norm_max(cr, &(inputrec->opts), mdAtoms->mdatoms(), ems); + } +} + +//! Parallel utility summing energies and forces +static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *top_global, + em_state_t *s_min, em_state_t *s_b) +{ + t_block *cgs_gl; + int ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m; + double partsum; + unsigned char *grpnrFREEZE; + + if (debug) + { + fprintf(debug, "Doing reorder_partsum\n"); + } + + const rvec *fm = as_rvec_array(s_min->f.data()); + const rvec *fb = as_rvec_array(s_b->f.data()); + + cgs_gl = dd_charge_groups_global(cr->dd); + index = cgs_gl->index; + + /* Collect fm in a global vector fmg. + * This conflicts with the spirit of domain decomposition, + * but to fully optimize this a much more complicated algorithm is required. + */ + rvec *fmg; + snew(fmg, top_global->natoms); + + ncg = s_min->s.cg_gl.size(); + cg_gl = s_min->s.cg_gl.data(); + i = 0; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + copy_rvec(fm[i], fmg[a]); + i++; + } + } + gmx_sum(top_global->natoms*3, fmg[0], cr); + + /* Now we will determine the part of the sum for the cgs in state s_b */ + ncg = s_b->s.cg_gl.size(); + cg_gl = s_b->s.cg_gl.data(); + partsum = 0; + i = 0; + gf = 0; + grpnrFREEZE = top_global->groups.grpnr[egcFREEZE]; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + if (mdatoms->cFREEZE && grpnrFREEZE) + { + gf = grpnrFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + partsum += (fb[i][m] - fmg[a][m])*fb[i][m]; + } + } + i++; + } + } + + sfree(fmg); + + return partsum; +} + +//! 
Compute beta for the Polak-Ribiere conjugate-gradient update
+static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
+                    gmx_mtop_t *top_global,
+                    em_state_t *s_min, em_state_t *s_b)
+{
+    double sum;
+
+    /* This is just the classical Polak-Ribiere calculation of beta,
+     * beta = sum_i (f_b,i - f_min,i) . f_b,i / |f_min|^2 (with f = -grad E);
+     * it looks a bit complicated since we take freeze groups into account,
+     * and might have to sum it in parallel runs.
+     */
+
+    if (!DOMAINDECOMP(cr) ||
+        (s_min->s.ddp_count == cr->dd->ddp_count &&
+         s_b->s.ddp_count == cr->dd->ddp_count))
+    {
+        const rvec *fm = as_rvec_array(s_min->f.data());
+        const rvec *fb = as_rvec_array(s_b->f.data());
+        sum = 0;
+        int         gf = 0;
+        /* This part of code can be incorrect with DD,
+         * since the atom ordering in s_b and s_min might differ.
+         */
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            if (mdatoms->cFREEZE)
+            {
+                gf = mdatoms->cFREEZE[i];
+            }
+            for (int m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    sum += (fb[i][m] - fm[i][m])*fb[i][m];
+                }
+            }
+        }
+    }
+    else
+    {
+        /* We need to reorder cgs while summing */
+        sum = reorder_partsum(cr, opts, mdatoms, top_global, s_min, s_b);
+    }
+    if (PAR(cr))
+    {
+        gmx_sumd(1, &sum, cr);
+    }
+
+    return sum/gmx::square(s_min->fnorm);
+}
+
+namespace gmx
+{
+
+/*! \brief Do conjugate gradients minimization
+    \copydoc integrator_t(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog,
+                          int nfile, const t_filenm fnm[],
+                          const gmx_output_env_t *oenv,
+                          const MdrunOptions &mdrunOptions,
+                          gmx_vsite_t *vsite, gmx_constr_t constr,
+                          gmx::IMDOutputProvider *outputProvider,
+                          t_inputrec *inputrec,
+                          gmx_mtop_t *top_global, t_fcdata *fcd,
+                          t_state *state_global,
+                          gmx::MDAtoms *mdAtoms,
+                          t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                          gmx_edsam_t ed,
+                          t_forcerec *fr,
+                          const ReplicaExchangeParameters &replExParams,
+                          gmx_membed_t gmx_unused *membed,
+                          gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_cg(FILE *fplog, t_commrec *cr, const gmx::MDLogger gmx_unused &mdlog,
+             int nfile, const t_filenm fnm[],
+             const gmx_output_env_t gmx_unused *oenv,
+             const MdrunOptions &mdrunOptions,
+             gmx_vsite_t *vsite, gmx_constr_t constr,
+             gmx::IMDOutputProvider *outputProvider,
+             t_inputrec *inputrec,
+             gmx_mtop_t *top_global, t_fcdata *fcd,
+             t_state *state_global,
+             ObservablesHistory *observablesHistory,
+             gmx::MDAtoms *mdAtoms,
+             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+             t_forcerec *fr,
+             const ReplicaExchangeParameters gmx_unused &replExParams,
+             gmx_membed_t gmx_unused *membed,
+             gmx_walltime_accounting_t walltime_accounting)
+{
+    const char       *CG = "Polak-Ribiere Conjugate Gradients";
+
+    gmx_localtop_t   *top;
+    gmx_enerdata_t   *enerd;
+    gmx_global_stat_t gstat;
+    t_graph          *graph;
+    double            tmp, minstep;
+    real              stepsize;
+    real              a, b, c, beta = 0.0;
+    real              epot_repl = 0;
+    real              pnorm;
+    t_mdebin         *mdebin;
+    gmx_bool          converged, foundlower;
+    rvec              mu_tot;
+    gmx_bool          do_log = FALSE, do_ene = FALSE, do_x, do_f;
+    tensor            vir, pres;
+    int               number_steps, neval = 0, nstcg = inputrec->nstcgsteep;
+    gmx_mdoutf_t      outf;
+    int               m, step, nminstep;
+    auto              mdatoms = mdAtoms->mdatoms();
+
+    step = 0;
+
+    // Ensure the extra per-atom state array gets allocated
+    state_global->flags |= (1<<estCGP);
+
+    /* Create 4 states on the stack and extract pointers that we will swap */
+    em_state_t  s0 {}, s1 {}, s2 {}, s3 {};
+    em_state_t *s_min = &s0;
+    em_state_t *s_a   = &s1;
+    em_state_t *s_b   = &s2;
+    em_state_t *s_c   = &s3;
+
+    /* Init em and store the local state in s_min */
+    init_em(fplog, CG, cr, outputProvider, inputrec, mdrunOptions,
+            state_global, top_global, s_min,
&top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, CG); + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + if (MASTER(cr)) + { + sp_header(stderr, CG, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, CG, inputrec->em_tol, number_steps); + } + + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + evaluate_energy(fplog, cr, + top_global, s_min, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, -1, TRUE); + where(); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + print_ebin_header(fplog, step, step); + print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + where(); + + /* Estimate/guess the initial stepsize */ + stepsize = inputrec->em_stepsize/s_min->fnorm; + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(stderr, " F-Norm = %12.5e\n", + s_min->fnorm/sqrtNumAtoms); + fprintf(stderr, "\n"); + /* and copy to the log file too... */ + fprintf(fplog, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(fplog, " F-Norm = %12.5e\n", + s_min->fnorm/sqrtNumAtoms); + fprintf(fplog, "\n"); + } + /* Start the loop over CG steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + converged = FALSE; + for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++) + { + + /* start taking steps in a new direction + * First time we enter the routine, beta=0, and the direction is + * simply the negative gradient. + */ + + /* Calculate the new direction in p, and the gradient in this direction, gpa */ + rvec *pm = as_rvec_array(s_min->s.cg_p.data()); + const rvec *sfm = as_rvec_array(s_min->f.data()); + double gpa = 0; + int gf = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!inputrec->opts.nFreeze[gf][m]) + { + pm[i][m] = sfm[i][m] + beta*pm[i][m]; + gpa -= pm[i][m]*sfm[i][m]; + /* f is negative gradient, thus the sign */ + } + else + { + pm[i][m] = 0; + } + } + } + + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpa, cr); + } + + /* Calculate the norm of the search vector */ + get_f_norm_max(cr, &(inputrec->opts), mdatoms, pm, &pnorm, nullptr, nullptr); + + /* Just in case stepsize reaches zero due to numerical precision... */ + if (stepsize <= 0) + { + stepsize = inputrec->em_stepsize/pnorm; + } + + /* + * Double check the value of the derivative in the search direction. + * If it is positive it must be due to the old information in the + * CG formula, so just remove that and start over with beta=0. + * This corresponds to a steepest descent step. 
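+         *
+         * (In symbols, using only quantities computed above: f stores the
+         * negative gradient, so the line derivative is
+         * gpa = -sum_i p_i . f_i = p . grad E. A positive gpa means p points
+         * uphill, which can only come from the beta*p_old memory term; with
+         * beta = 0 the next direction is p = f = -grad E, i.e. exactly a
+         * steepest-descent step.)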
+         */
+        if (gpa > 0)
+        {
+            beta = 0;
+            step--;   /* Don't count this step since we are restarting */
+            continue; /* Go back to the beginning of the big for-loop */
+        }
+
+        /* Calculate minimum allowed stepsize, below which the average (norm)
+         * relative change in coordinate is smaller than precision
+         */
+        minstep = 0;
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                tmp = fabs(s_min->s.x[i][m]);
+                if (tmp < 1.0)
+                {
+                    tmp = 1.0;
+                }
+                tmp      = pm[i][m]/tmp;
+                minstep += tmp*tmp;
+            }
+        }
+        /* Add up from all CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &minstep, cr);
+        }
+
+        minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms));
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        write_em_traj(fplog, cr, outf, do_x, do_f, nullptr,
+                      top_global, inputrec, step,
+                      s_min, state_global, observablesHistory);
+
+        /* Take a step downhill.
+         * In theory, we should minimize the function along this direction.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we don't really need to find the exact minimum -
+         * it is much better to start a new CG step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize but
+         * then continue straight to the next CG step without trying to find any minimum.
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to even accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
+        s_a->epot = s_min->epot;
+        a         = 0.0;
+        c         = a + stepsize; /* reference position along line is zero */
+
+        if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count)
+        {
+            em_dd_partition_system(fplog, step, cr, top_global, inputrec,
+                                   s_min, top, mdAtoms, fr, vsite, constr,
+                                   nrnb, wcycle);
+        }
+
+        /* Take a trial step (new coords in s_c) */
+        do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, &s_min->s.cg_p, s_c,
+                   constr, top, nrnb, wcycle, -1);
+
+        neval++;
+        /* Calculate energy for the trial step */
+        evaluate_energy(fplog, cr,
+                        top_global, s_c, top,
+                        inputrec, nrnb, wcycle, gstat,
+                        vsite, constr, fcd, graph, mdAtoms, fr,
+                        mu_tot, enerd, vir, pres, -1, FALSE);
+
+        /* Calc derivative along line */
+        const rvec *pc  = as_rvec_array(s_c->s.cg_p.data());
+        const rvec *sfc = as_rvec_array(s_c->f.data());
+        double      gpc = 0;
+        for (int i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                gpc -= pc[i][m]*sfc[i][m]; /* f is negative gradient, thus the sign */
+            }
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        /* This is the max amount of increase in energy we tolerate */
+        tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot);
+
+        /* Accept the step if the energy is lower, or if it is not significantly higher
+         * and the line derivative is still negative.
+         */
+        if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp)))
+        {
+            foundlower = TRUE;
+            /* Great, we found a better energy.
Increase step for next iteration + * if we are still going down, decrease it otherwise + */ + if (gpc < 0) + { + stepsize *= 1.618034; /* The golden section */ + } + else + { + stepsize *= 0.618034; /* 1/golden section */ + } + } + else + { + /* New energy is the same or higher. We will have to do some work + * to find a smaller value in the interval. Take smaller step next time! + */ + foundlower = FALSE; + stepsize *= 0.618034; + } + + + + + /* OK, if we didn't find a lower value we will have to locate one now - there must + * be one in the interval [a=0,c]. + * The same thing is valid here, though: Don't spend dozens of iterations to find + * the line minimum. We try to interpolate based on the derivative at the endpoints, + * and only continue until we find a lower value. In most cases this means 1-2 iterations. + * + * I also have a safeguard for potentially really pathological functions so we never + * take more than 20 steps before we give up ... + * + * If we already found a lower value we just skip this step and continue to the update. + */ + double gpb; + if (!foundlower) + { + nminstep = 0; + + do + { + /* Select a new trial point. + * If the derivatives at points a & c have different sign we interpolate to zero, + * otherwise just do a bisection. + */ + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, -1, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step to this new point - new coords in s_b */ + do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, &s_min->s.cg_p, s_b, + constr, top, nrnb, wcycle, -1); + + neval++; + /* Calculate energy for the trial step */ + evaluate_energy(fplog, cr, + top_global, s_b, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, -1, FALSE); + + /* p does not change within a step, but since the domain decomposition + * might change, we have to use cg_p of s_b here. + */ + const rvec *pb = as_rvec_array(s_b->s.cg_p.data()); + const rvec *sfb = as_rvec_array(s_b->f.data()); + gpb = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpb -= pb[i][m]*sfb[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + if (debug) + { + fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", + s_a->epot, s_b->epot, s_c->epot, gpb); + } + + epot_repl = s_b->epot; + + /* Keep one of the intervals based on the value of the derivative at the new point */ + if (gpb > 0) + { + /* Replace c endpoint with b */ + swap_em_state(&s_b, &s_c); + c = b; + gpc = gpb; + } + else + { + /* Replace a endpoint with b */ + swap_em_state(&s_b, &s_a); + a = b; + gpa = gpb; + } + + /* + * Stop search as soon as we find a value smaller than the endpoints. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && + (nminstep < 20)); + + if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS || + nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. 
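+                 * (For scale, assuming single precision where GMX_REAL_EPS is
+                 * about 1.2e-7: with a potential energy near -1e5 kJ/mol, an
+                 * improvement below roughly 0.01 kJ/mol is indistinguishable
+                 * from round-off and lands in this branch.)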
+ * If beta==0 this was steepest descent, and then we give up. + * If not, set beta=0 and restart with steepest descent before quitting. + */ + if (beta == 0.0) + { + /* Converged */ + converged = TRUE; + break; + } + else + { + /* Reset memory before giving up */ + beta = 0.0; + continue; + } + } + + /* Select min energy state of A & C, put the best in B. + */ + if (s_c->epot < s_a->epot) + { + if (debug) + { + fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", + s_c->epot, s_a->epot); + } + swap_em_state(&s_b, &s_c); + gpb = gpc; + } + else + { + if (debug) + { + fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", + s_a->epot, s_c->epot); + } + swap_em_state(&s_b, &s_a); + gpb = gpa; + } + + } + else + { + if (debug) + { + fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", + s_c->epot); + } + swap_em_state(&s_b, &s_c); + gpb = gpc; + } + + /* new search direction */ + /* beta = 0 means forget all memory and restart with steepest descents. */ + if (nstcg && ((step % nstcg) == 0)) + { + beta = 0.0; + } + else + { + /* s_min->fnorm cannot be zero, because then we would have converged + * and broken out. + */ + + /* Polak-Ribiere update. + * Change to fnorm2/fnorm2_old for Fletcher-Reeves + */ + beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b); + } + /* Limit beta to prevent oscillations */ + if (fabs(beta) > 5.0) + { + beta = 0.0; + } + + + /* update positions */ + swap_em_state(&s_min, &s_b); + gpa = gpb; + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", + step, s_min->epot, s_min->fnorm/sqrtNumAtoms, + s_min->fmax, s_min->a_fmax+1); + fflush(stderr); + } + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + do_log = do_per_step(step, inputrec->nstlog); + do_ene = do_per_step(step, inputrec->nstenergy); + + /* Prepare IMD energy record, if bIMD is TRUE. */ + IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE); + + if (do_log) + { + print_ebin_header(fplog, step, step); + } + print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, + do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Send energies and positions to the IMD client if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, as_rvec_array(state_global->x.data()), inputrec, 0, wcycle) && MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + /* Stop when the maximum force lies below tolerance. + * If we have reached machine precision, converged is already set to true. + */ + converged = converged || (s_min->fmax < inputrec->em_tol); + + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. 
*/ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + if (converged) + { + step--; /* we never took that last step in this case */ + + } + if (s_min->fmax > inputrec->em_tol) + { + if (MASTER(cr)) + { + warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); + warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); + } + converged = FALSE; + } + + if (MASTER(cr)) + { + /* If we printed energy and/or logfile last step (which was the last step) + * we don't have to do it again, but otherwise print the final values. + */ + if (!do_log) + { + /* Write final value to log since we didn't do anything the last step */ + print_ebin_header(fplog, step, step); + } + if (!do_ene || !do_log) + { + /* Write final energy file entries */ + print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, + !do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + } + + /* Print some stuff... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + + /* IMPORTANT! + * For accurate normal mode calculation it is imperative that we + * store the last conformation into the full precision binary trajectory. + * + * However, we should only do it if we did NOT already write this step + * above (which we did if do_x or do_f was true). + */ + do_x = !do_per_step(step, inputrec->nstxout); + do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout)); + + write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, step, + s_min, state_global, observablesHistory); + + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, + s_min, sqrtNumAtoms); + print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, + s_min, sqrtNumAtoms); + + fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + walltime_accounting_set_nsteps_done(walltime_accounting, step); + + return 0; +} /* That's all folks */ + + +/*! 
\brief Do L-BFGS minimization
+    \copydoc integrator_t(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog,
+                          int nfile, const t_filenm fnm[],
+                          const gmx_output_env_t *oenv,
+                          const MdrunOptions &mdrunOptions,
+                          gmx_vsite_t *vsite, gmx_constr_t constr,
+                          gmx::IMDOutputProvider *outputProvider,
+                          t_inputrec *inputrec,
+                          gmx_mtop_t *top_global, t_fcdata *fcd,
+                          t_state *state_global,
+                          gmx::MDAtoms *mdAtoms,
+                          t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                          gmx_edsam_t ed,
+                          t_forcerec *fr,
+                          const ReplicaExchangeParameters &replExParams,
+                          gmx_membed_t gmx_unused *membed,
+                          gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_lbfgs(FILE *fplog, t_commrec *cr, const gmx::MDLogger gmx_unused &mdlog,
+                int nfile, const t_filenm fnm[],
+                const gmx_output_env_t gmx_unused *oenv,
+                const MdrunOptions &mdrunOptions,
+                gmx_vsite_t *vsite, gmx_constr_t constr,
+                gmx::IMDOutputProvider *outputProvider,
+                t_inputrec *inputrec,
+                gmx_mtop_t *top_global, t_fcdata *fcd,
+                t_state *state_global,
+                ObservablesHistory *observablesHistory,
+                gmx::MDAtoms *mdAtoms,
+                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                t_forcerec *fr,
+                const ReplicaExchangeParameters gmx_unused &replExParams,
+                gmx_membed_t gmx_unused *membed,
+                gmx_walltime_accounting_t walltime_accounting)
+{
+    static const char *LBFGS = "Low-Memory BFGS Minimizer";
+    em_state_t         ems;
+    gmx_localtop_t    *top;
+    gmx_enerdata_t    *enerd;
+    gmx_global_stat_t  gstat;
+    t_graph           *graph;
+    int                ncorr, nmaxcorr, point, cp, neval, nminstep;
+    double             stepsize, step_taken, gpa, gpb, gpc, tmp, minstep;
+    real              *rho, *alpha, *p, *s, **dx, **dg;
+    real               a, b, c, maxdelta, delta;
+    real               diag, Epot0;
+    real               dgdx, dgdg, sq, yr, beta;
+    t_mdebin          *mdebin;
+    gmx_bool           converged;
+    rvec               mu_tot;
+    gmx_bool           do_log, do_ene, do_x, do_f, foundlower, *frozen;
+    tensor             vir, pres;
+    int                start, end, number_steps;
+    gmx_mdoutf_t       outf;
+    int                i, k, m, n, gf, step;
+    int                mdof_flags;
+    auto               mdatoms = mdAtoms->mdatoms();
+
+    if (PAR(cr))
+    {
+        gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n");
+    }
+
+    if (nullptr != constr)
+    {
+        gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g.
steepest descent)."); + } + + n = 3*state_global->natoms; + nmaxcorr = inputrec->nbfgscorr; + + snew(frozen, n); + + snew(p, n); + snew(rho, nmaxcorr); + snew(alpha, nmaxcorr); + + snew(dx, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dx[i], n); + } + + snew(dg, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dg[i], n); + } + + step = 0; + neval = 0; + + /* Init em */ + init_em(fplog, LBFGS, cr, outputProvider, inputrec, mdrunOptions, + state_global, top_global, &ems, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + start = 0; + end = mdatoms->homenr; + + /* We need 4 working states */ + em_state_t s0 {}, s1 {}, s2 {}, s3 {}; + em_state_t *sa = &s0; + em_state_t *sb = &s1; + em_state_t *sc = &s2; + em_state_t *last = &s3; + /* Initialize by copying the state from ems (we could skip x and f here) */ + *sa = ems; + *sb = ems; + *sc = ems; + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS); + + do_log = do_ene = do_x = do_f = TRUE; + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + /* Create a 3*natoms index to tell whether each degree of freedom is frozen */ + gf = 0; + for (i = start; i < end; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + frozen[3*i+m] = inputrec->opts.nFreeze[gf][m]; + } + } + if (MASTER(cr)) + { + sp_header(stderr, LBFGS, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, LBFGS, inputrec->em_tol, number_steps); + } + + if (vsite) + { + construct_vsites(vsite, as_rvec_array(state_global->x.data()), 1, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state_global->box); + } + + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole + */ + neval++; + evaluate_energy(fplog, cr, + top_global, &ems, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, -1, TRUE); + where(); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, state_global, inputrec->fepvals, inputrec->expandedvals, state_global->box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + print_ebin_header(fplog, step, step); + print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + where(); + + /* Set the initial step. + * since it will be multiplied by the non-normalized search direction + * vector (force vector the first time), we scale it by the + * norm of the force. + */ + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(stderr, " F-max = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1); + fprintf(stderr, " F-Norm = %12.5e\n", ems.fnorm/sqrtNumAtoms); + fprintf(stderr, "\n"); + /* and copy to the log file too... 
*/ + fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(fplog, " F-max = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1); + fprintf(fplog, " F-Norm = %12.5e\n", ems.fnorm/sqrtNumAtoms); + fprintf(fplog, "\n"); + } + + // Point is an index to the memory of search directions, where 0 is the first one. + point = 0; + + // Set initial search direction to the force (-gradient), or 0 for frozen particles. + real *fInit = static_cast<real *>(as_rvec_array(ems.f.data())[0]); + for (i = 0; i < n; i++) + { + if (!frozen[i]) + { + dx[point][i] = fInit[i]; /* Initial search direction */ + } + else + { + dx[point][i] = 0; + } + } + + // Stepsize will be modified during the search, and actually it is not critical + // (the main efficiency in the algorithm comes from changing directions), but + // we still need an initial value, so estimate it as the inverse of the norm + // so we take small steps where the potential fluctuates a lot. + stepsize = 1.0/ems.fnorm; + + /* Start the loop over BFGS steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + + ncorr = 0; + + /* Set the gradient from the force */ + converged = FALSE; + for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++) + { + + /* Write coordinates if necessary */ + do_x = do_per_step(step, inputrec->nstxout); + do_f = do_per_step(step, inputrec->nstfout); + + mdof_flags = 0; + if (do_x) + { + mdof_flags |= MDOF_X; + } + + if (do_f) + { + mdof_flags |= MDOF_F; + } + + if (inputrec->bIMD) + { + mdof_flags |= MDOF_IMD; + } + + mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, + top_global, step, (real)step, &ems.s, state_global, observablesHistory, ems.f); + + /* Do the linesearching in the direction dx[point][0..(n-1)] */ + + /* make s a pointer to current search direction - point=0 first time we get here */ + s = dx[point]; + + real *xx = static_cast<real *>(as_rvec_array(ems.s.x.data())[0]); + real *ff = static_cast<real *>(as_rvec_array(ems.f.data())[0]); + + // calculate line gradient in position A + for (gpa = 0, i = 0; i < n; i++) + { + gpa -= s[i]*ff[i]; + } + + /* Calculate minimum allowed stepsize along the line, before the average (norm) + * relative change in coordinate is smaller than precision + */ + for (minstep = 0, i = 0; i < n; i++) + { + tmp = fabs(xx[i]); + if (tmp < 1.0) + { + tmp = 1.0; + } + tmp = s[i]/tmp; + minstep += tmp*tmp; + } + minstep = GMX_REAL_EPS/sqrt(minstep/n); + + if (stepsize < minstep) + { + converged = TRUE; + break; + } + + // Before taking any steps along the line, store the old position + *last = ems; + real *lastx = static_cast<real *>(as_rvec_array(last->s.x.data())[0]); + real *lastf = static_cast<real *>(as_rvec_array(last->f.data())[0]); + Epot0 = ems.epot; + + *sa = ems; + + /* Take a step downhill. + * In theory, we should find the actual minimum of the function in this + * direction, somewhere along the line. + * That is quite possible, but it turns out to take 5-10 function evaluations + * for each line. However, we dont really need to find the exact minimum - + * it is much better to start a new BFGS step in a modified direction as soon + * as we are close to it. This will save a lot of energy evaluations. + * + * In practice, we just try to take a single step. + * If it worked (i.e. lowered the energy), we increase the stepsize but + * continue straight to the next BFGS step without trying to find any minimum, + * i.e. we change the search direction too. 
If the line was smooth, it is
+         * likely we are in a smooth region, and then it makes sense to take longer
+         * steps in the modified search direction too.
+         *
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one. Then we need to start by finding a lower
+         * value before we change search direction. Since the energy was apparently
+         * quite rough, we need to decrease the step size.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
+
+        // State "A" is the first position along the line.
+        // reference position along line is initially zero
+        a = 0.0;
+
+        // Check stepsize first. We do not allow displacements
+        // larger than emstep.
+        //
+        do
+        {
+            // Pick a new position C by adding stepsize to A.
+            c = a + stepsize;
+
+            // Calculate what the largest change in any individual coordinate
+            // would be (translation along line * gradient along line)
+            maxdelta = 0;
+            for (i = 0; i < n; i++)
+            {
+                delta = c*s[i];
+                if (delta > maxdelta)
+                {
+                    maxdelta = delta;
+                }
+            }
+            // If any displacement is larger than the stepsize limit, reduce the step
+            if (maxdelta > inputrec->em_stepsize)
+            {
+                stepsize *= 0.1;
+            }
+        }
+        while (maxdelta > inputrec->em_stepsize);
+
+        // Take a trial step and move the coordinate array xc[] to position C
+        real *xc = static_cast<real *>(as_rvec_array(sc->s.x.data())[0]);
+        for (i = 0; i < n; i++)
+        {
+            xc[i] = lastx[i] + c*s[i];
+        }
+
+        neval++;
+        // Calculate energy for the trial step in position C
+        evaluate_energy(fplog, cr,
+                        top_global, sc, top,
+                        inputrec, nrnb, wcycle, gstat,
+                        vsite, constr, fcd, graph, mdAtoms, fr,
+                        mu_tot, enerd, vir, pres, step, FALSE);
+
+        // Calc line gradient in position C
+        real *fc = static_cast<real *>(as_rvec_array(sc->f.data())[0]);
+        for (gpc = 0, i = 0; i < n; i++)
+        {
+            gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        // This is the max amount of increase in energy we tolerate.
+        // By allowing VERY small changes (close to numerical precision) we
+        // frequently find even better (lower) final energies.
+        tmp = sqrt(GMX_REAL_EPS)*fabs(sa->epot);
+
+        // Accept the step if the energy is lower in the new position C (compared to A),
+        // or if it is not significantly higher and the line derivative is still negative.
+        if (sc->epot < sa->epot || (gpc < 0 && sc->epot < (sa->epot + tmp)))
+        {
+            // Great, we found a better energy. We no longer try to alter the
+            // stepsize, but simply accept this new better position. Then we select a new
+            // search direction instead, which will be much more efficient than continuing
+            // to take smaller steps along a line. Set fnorm based on the new C position,
+            // which will be used to update the stepsize to 1/fnorm further down.
+            foundlower = TRUE;
+        }
+        else
+        {
+            // If we got here, the energy is NOT lower in point C, i.e. it will be the same
+            // or higher than in point A. In this case it is pointless to move to point C,
+            // so we will have to do more iterations along the same line to find a smaller
+            // value in the interval [A=0.0,C].
+            // Here, A is still 0.0, but that will change when we do a search in the interval
+            // [0.0,C] below.
That search we will do by interpolation or bisection rather + // than with the stepsize, so no need to modify it. For the next search direction + // it will be reset to 1/fnorm anyway. + foundlower = FALSE; + } + + if (!foundlower) + { + // OK, if we didn't find a lower value we will have to locate one now - there must + // be one in the interval [a,c]. + // The same thing is valid here, though: Don't spend dozens of iterations to find + // the line minimum. We try to interpolate based on the derivative at the endpoints, + // and only continue until we find a lower value. In most cases this means 1-2 iterations. + // I also have a safeguard for potentially really pathological functions so we never + // take more than 20 steps before we give up. + // If we already found a lower value we just skip this step and continue to the update. + real fnorm = 0; + nminstep = 0; + do + { + // Select a new trial point B in the interval [A,C]. + // If the derivatives at points a & c have different sign we interpolate to zero, + // otherwise just do a bisection since there might be multiple minima/maxima + // inside the interval. + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + // Take a trial step to point B + real *xb = static_cast<real *>(as_rvec_array(sb->s.x.data())[0]); + for (i = 0; i < n; i++) + { + xb[i] = lastx[i] + b*s[i]; + } + + neval++; + // Calculate energy for the trial step in point B + evaluate_energy(fplog, cr, + top_global, sb, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, step, FALSE); + fnorm = sb->fnorm; + + // Calculate gradient in point B + real *fb = static_cast<real *>(as_rvec_array(sb->f.data())[0]); + for (gpb = 0, i = 0; i < n; i++) + { + gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */ + + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative + // at the new point B, and rename the endpoints of this new interval A and C. + if (gpb > 0) + { + /* Replace c endpoint with b */ + c = b; + /* swap states b and c */ + swap_em_state(&sb, &sc); + } + else + { + /* Replace a endpoint with b */ + a = b; + /* swap states a and b */ + swap_em_state(&sa, &sb); + } + + /* + * Stop search as soon as we find a value smaller than the endpoints, + * or if the tolerance is below machine precision. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((sb->epot > sa->epot || sb->epot > sc->epot) && (nminstep < 20)); + + if (fabs(sb->epot - Epot0) < GMX_REAL_EPS || nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. + * If ncorr==0 this was steepest descent, and then we give up. + * If not, reset memory to restart as steepest descent before quitting. 
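+                 *
+                 * (For orientation: the reset below sets dx[point] = f directly,
+                 * so the next direction is the bare downhill force, i.e. steepest
+                 * descent. Only after new (dx, dg) correction pairs are stored
+                 * does the two-loop recursion further down bend the force into an
+                 * approximate inverse-Hessian * gradient quasi-Newton direction.)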
+                 */
+                if (ncorr == 0)
+                {
+                    /* Converged */
+                    converged = TRUE;
+                    break;
+                }
+                else
+                {
+                    /* Reset memory */
+                    ncorr = 0;
+                    /* Search in gradient direction */
+                    for (i = 0; i < n; i++)
+                    {
+                        dx[point][i] = ff[i];
+                    }
+                    /* Reset stepsize */
+                    stepsize = 1.0/fnorm;
+                    continue;
+                }
+            }
+
+            /* Select min energy state of A & C, put the best in xx/ff/Epot
+             */
+            if (sc->epot < sa->epot)
+            {
+                /* Use state C */
+                ems        = *sc;
+                step_taken = c;
+            }
+            else
+            {
+                /* Use state A */
+                ems        = *sa;
+                step_taken = a;
+            }
+
+        }
+        else
+        {
+            /* found lower */
+            /* Use state C */
+            ems        = *sc;
+            step_taken = c;
+        }
+
+        /* Update the memory information, and calculate a new
+         * approximation of the inverse hessian
+         */
+
+        /* Have new data in Epot, xx, ff */
+        if (ncorr < nmaxcorr)
+        {
+            ncorr++;
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            dg[point][i]  = lastf[i]-ff[i];
+            dx[point][i] *= step_taken;
+        }
+
+        dgdg = 0;
+        dgdx = 0;
+        for (i = 0; i < n; i++)
+        {
+            dgdg += dg[point][i]*dg[point][i];
+            dgdx += dg[point][i]*dx[point][i];
+        }
+
+        diag = dgdx/dgdg;
+
+        rho[point] = 1.0/dgdx;
+        point++;
+
+        if (point >= nmaxcorr)
+        {
+            point = 0;
+        }
+
+        /* Update */
+        for (i = 0; i < n; i++)
+        {
+            p[i] = ff[i];
+        }
+
+        cp = point;
+
+        /* Recursive update. First go back over the memory points */
+        for (k = 0; k < ncorr; k++)
+        {
+            cp--;
+            if (cp < 0)
+            {
+                cp = ncorr-1;
+            }
+
+            sq = 0;
+            for (i = 0; i < n; i++)
+            {
+                sq += dx[cp][i]*p[i];
+            }
+
+            alpha[cp] = rho[cp]*sq;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] -= alpha[cp]*dg[cp][i];
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            p[i] *= diag;
+        }
+
+        /* And then go forward again */
+        for (k = 0; k < ncorr; k++)
+        {
+            yr = 0;
+            for (i = 0; i < n; i++)
+            {
+                yr += p[i]*dg[cp][i];
+            }
+
+            beta = rho[cp]*yr;
+            beta = alpha[cp]-beta;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] += beta*dx[cp][i];
+            }
+
+            cp++;
+            if (cp >= ncorr)
+            {
+                cp = 0;
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            if (!frozen[i])
+            {
+                dx[point][i] = p[i];
+            }
+            else
+            {
+                dx[point][i] = 0;
+            }
+        }
+
+        /* Print it if necessary */
+        if (MASTER(cr))
+        {
+            if (mdrunOptions.verbose)
+            {
+                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
+                        step, ems.epot, ems.fnorm/sqrtNumAtoms, ems.fmax, ems.a_fmax + 1);
+                fflush(stderr);
+            }
+            /* Store the new (lower) energies */
+            upd_mdebin(mdebin, FALSE, FALSE, (double)step,
+                       mdatoms->tmass, enerd, state_global, inputrec->fepvals, inputrec->expandedvals, state_global->box,
+                       nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
+            do_log = do_per_step(step, inputrec->nstlog);
+            do_ene = do_per_step(step, inputrec->nstenergy);
+            if (do_log)
+            {
+                print_ebin_header(fplog, step, step);
+            }
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
+                       do_log ? fplog : nullptr, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+        }
+
+        /* Send x and E to IMD client, if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, as_rvec_array(state_global->x.data()), inputrec, 0, wcycle) && MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        // Reset stepsize if we are doing more iterations
+        stepsize = 1.0/ems.fnorm;
+
+        /* Stop when the maximum force lies below tolerance.
+         * If we have reached machine precision, converged is already set to true.
+         */
+        converged = converged || (ems.fmax < inputrec->em_tol);
+
+    }   /* End of the loop */
+
+    /* IMD cleanup, if bIMD is TRUE.
*/
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (ems.fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE);
+            warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    /* If we printed energy and/or logfile last step (which was the last step)
+     * we don't have to do it again, but otherwise print the final values.
+     */
+    if (!do_log) /* Write final value to log since we didn't do anything last step */
+    {
+        print_ebin_header(fplog, step, step);
+    }
+    if (!do_ene || !do_log) /* Write final energy file entries */
+    {
+        print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                   !do_log ? fplog : nullptr, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr);
+    }
+
+    /* Print some stuff... */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = !do_per_step(step, inputrec->nstfout);
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  &ems, state_global, observablesHistory);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, &ems, sqrtNumAtoms);
+        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, &ems, sqrtNumAtoms);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+
+    return 0;
+}   /* That's all folks */
+
+/*!
\brief Do steepest descents minimization + \copydoc integrator_t(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_edsam_t ed, + t_forcerec *fr, + const ReplicaExchangeParameters &replExParams, + gmx_walltime_accounting_t walltime_accounting) + */ +double do_steep(FILE *fplog, t_commrec *cr, const gmx::MDLogger gmx_unused &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t gmx_unused *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + ObservablesHistory *observablesHistory, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_forcerec *fr, + const ReplicaExchangeParameters gmx_unused &replExParams, + gmx_membed_t gmx_unused *membed, + gmx_walltime_accounting_t walltime_accounting) +{ + const char *SD = "Steepest Descents"; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + real stepsize; + real ustep; + gmx_mdoutf_t outf; + t_mdebin *mdebin; + gmx_bool bDone, bAbort, do_x, do_f; + tensor vir, pres; + rvec mu_tot; + int nsteps; + int count = 0; + int steps_accepted = 0; + auto mdatoms = mdAtoms->mdatoms(); + + /* Create 2 states on the stack and extract pointers that we will swap */ + em_state_t s0 {}, s1 {}; + em_state_t *s_min = &s0; + em_state_t *s_try = &s1; + + /* Init em and store the local state in s_try */ + init_em(fplog, SD, cr, outputProvider, inputrec, mdrunOptions, + state_global, top_global, s_try, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, SD); + + /* Set variables for stepsize (in nm). This is the largest + * step that we are going to make in any direction. 
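+     *
+     * (Concretely, given the update used below: each accepted move scales the
+     * force by stepsize = ustep/fmax, so atom i is displaced by
+     * (ustep/fmax)*f_i and no coordinate moves farther than ustep nm, since
+     * |f_i| <= fmax by construction.)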
+ */ + ustep = inputrec->em_stepsize; + stepsize = 0; + + /* Max number of steps */ + nsteps = inputrec->nsteps; + + if (MASTER(cr)) + { + /* Print to the screen */ + sp_header(stderr, SD, inputrec->em_tol, nsteps); + } + if (fplog) + { + sp_header(fplog, SD, inputrec->em_tol, nsteps); + } + + /**** HERE STARTS THE LOOP **** + * count is the counter for the number of steps + * bDone will be TRUE when the minimization has converged + * bAbort will be TRUE when nsteps steps have been performed or when + * the stepsize becomes smaller than is reasonable for machine precision + */ + count = 0; + bDone = FALSE; + bAbort = FALSE; + while (!bDone && !bAbort) + { + bAbort = (nsteps >= 0) && (count == nsteps); + + /* set new coordinates, except for first step */ + bool validStep = true; + if (count > 0) + { + validStep = + do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, + s_min, stepsize, &s_min->f, s_try, + constr, top, nrnb, wcycle, count); + } + + if (validStep) + { + evaluate_energy(fplog, cr, + top_global, s_try, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, count, count == 0); + } + else + { + // Signal constraint error during stepping with energy=inf + s_try->epot = std::numeric_limits<real>::infinity(); + } + + if (MASTER(cr)) + { + print_ebin_header(fplog, count, count); + } + + if (count == 0) + { + s_min->epot = s_try->epot; + } + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c", + count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1, + ( (count == 0) || (s_try->epot < s_min->epot) ) ? '\n' : '\r'); + fflush(stderr); + } + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)count, + mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals, + s_try->s.box, nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + /* Prepare IMD energy record, if bIMD is TRUE. */ + IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE); + + print_ebin(mdoutf_get_fp_ene(outf), TRUE, + do_per_step(steps_accepted, inputrec->nstdisreout), + do_per_step(steps_accepted, inputrec->nstorireout), + fplog, count, count, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + fflush(fplog); + } + } + + /* Now if the new energy is smaller than the previous... + * or if this is the first step! + * or if we did random steps! + */ + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + steps_accepted++; + + /* Test whether the convergence criterion is met... */ + bDone = (s_try->fmax < inputrec->em_tol); + + /* Copy the arrays for force, positions and energy */ + /* The 'Min' array always holds the coords and forces of the minimal + sampled energy */ + swap_em_state(&s_min, &s_try); + if (count > 0) + { + ustep *= 1.2; + } + + /* Write to trn, if necessary */ + do_x = do_per_step(steps_accepted, inputrec->nstxout); + do_f = do_per_step(steps_accepted, inputrec->nstfout); + write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, + top_global, inputrec, count, + s_min, state_global, observablesHistory); + } + else + { + /* If energy is not smaller make the step smaller... 
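+             * (Seen together with the ustep *= 1.2 growth applied above on
+             * accepted steps, this halving forms a simple adaptive scheme:
+             * ustep settles near the largest displacement the local energy
+             * surface will tolerate.)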
*/ + ustep *= 0.5; + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, count, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + } + + /* Determine new step */ + stepsize = ustep/s_min->fmax; + + /* Check if stepsize is too small, with 1 nm as a characteristic length */ +#if GMX_DOUBLE + if (count == nsteps || ustep < 1e-12) +#else + if (count == nsteps || ustep < 1e-6) +#endif + { + if (MASTER(cr)) + { + warn_step(stderr, inputrec->em_tol, count == nsteps, constr != nullptr); + warn_step(fplog, inputrec->em_tol, count == nsteps, constr != nullptr); + } + bAbort = TRUE; + } + + /* Send IMD energies and positions, if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box, + MASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr, + inputrec, 0, wcycle) && + MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + count++; + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + /* Print some data... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, count, + s_min, state_global, observablesHistory); + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms)); + + print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, + s_min, sqrtNumAtoms); + print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, + s_min, sqrtNumAtoms); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + inputrec->nsteps = count; + + walltime_accounting_set_nsteps_done(walltime_accounting, count); + + return 0; +} /* That's all folks */ + +/*! 
\brief Do normal modes analysis + \copydoc integrator_t(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_edsam_t ed, + t_forcerec *fr, + const ReplicaExchangeParameters &replExParams, + gmx_walltime_accounting_t walltime_accounting) + */ +double do_nm(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t gmx_unused *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + ObservablesHistory gmx_unused *observablesHistory, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_forcerec *fr, + const ReplicaExchangeParameters gmx_unused &replExParams, + gmx_membed_t gmx_unused *membed, + gmx_walltime_accounting_t walltime_accounting) +{ + const char *NM = "Normal Mode Analysis"; + gmx_mdoutf_t outf; + int nnodes, node; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + tensor vir, pres; + rvec mu_tot; + rvec *fneg, *dfdx; + gmx_bool bSparse; /* use sparse matrix storage format */ + size_t sz; + gmx_sparsematrix_t * sparse_matrix = nullptr; + real * full_matrix = nullptr; + + /* added with respect to mdrun */ + int row, col; + real der_range = 10.0*sqrt(GMX_REAL_EPS); + real x_min; + bool bIsMaster = MASTER(cr); + auto mdatoms = mdAtoms->mdatoms(); + + if (constr != nullptr) + { + gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported"); + } + + gmx_shellfc_t *shellfc; + + em_state_t state_work {}; + + /* Init em and store the local state in state_minimum */ + init_em(fplog, NM, cr, outputProvider, inputrec, mdrunOptions, + state_global, top_global, &state_work, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, &shellfc, + nfile, fnm, &outf, nullptr, wcycle); + + std::vector<size_t> atom_index = get_atom_index(top_global); + snew(fneg, atom_index.size()); + snew(dfdx, atom_index.size()); + +#if !GMX_DOUBLE + if (bIsMaster) + { + fprintf(stderr, + "NOTE: This version of GROMACS has been compiled in single precision,\n" + " which MIGHT not be accurate enough for normal mode analysis.\n" + " GROMACS now uses sparse matrix storage, so the memory requirements\n" + " are fairly modest even if you recompile in double precision.\n\n"); + } +#endif + + /* Check if we can/should use sparse storage format. + * + * Sparse format is only useful when the Hessian itself is sparse, which it + * will be when we use a cutoff. + * For small systems (n<1000) it is easier to always use full matrix format, though. 
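+     *
+     * (Rough numbers for illustration, assuming 4-byte reals in single
+     * precision: the full format below stores sz*sz = (3N)^2 entries, i.e.
+     * about 36 MB for N = 1000 but already about 3.6 GB for N = 10000, which
+     * is where the sparse format pays off.)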
+     */
+    if (EEL_FULL(fr->ic->eeltype) || fr->rlist == 0.0)
+    {
+        GMX_LOG(mdlog.warning).appendText("Non-cutoff electrostatics used, forcing full Hessian format.");
+        bSparse = FALSE;
+    }
+    else if (atom_index.size() < 1000)
+    {
+        GMX_LOG(mdlog.warning).appendTextFormatted("Small system size (N=%d), using full Hessian format.",
+                                                   static_cast<int>(atom_index.size()));
+        bSparse = FALSE;
+    }
+    else
+    {
+        GMX_LOG(mdlog.warning).appendText("Using compressed symmetric sparse Hessian format.");
+        bSparse = TRUE;
+    }
+
+    /* Number of dimensions, based on real atoms, that is, not vsites or shells */
+    sz = DIM*atom_index.size();
+
+    fprintf(stderr, "Allocating Hessian memory...\n\n");
+
+    if (bSparse)
+    {
+        sparse_matrix                       = gmx_sparsematrix_init(sz);
+        sparse_matrix->compressed_symmetric = TRUE;
+    }
+    else
+    {
+        snew(full_matrix, sz*sz);
+    }
+
+    init_nrnb(nrnb);
+
+    where();
+
+    /* Write start time and temperature */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, NM);
+
+    /* fudge nr of steps to nr of atoms */
+    inputrec->nsteps = atom_index.size()*2;
+
+    if (bIsMaster)
+    {
+        fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n",
+                *(top_global->name), (int)inputrec->nsteps);
+    }
+
+    nnodes = cr->nnodes;
+
+    /* Make evaluate_energy do a single node force calculation */
+    cr->nnodes = 1;
+    evaluate_energy(fplog, cr,
+                    top_global, &state_work, top,
+                    inputrec, nrnb, wcycle, gstat,
+                    vsite, constr, fcd, graph, mdAtoms, fr,
+                    mu_tot, enerd, vir, pres, -1, TRUE);
+    cr->nnodes = nnodes;
+
+    /* if forces are not small, warn user */
+    get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, &state_work);
+
+    GMX_LOG(mdlog.warning).appendTextFormatted("Maximum force:%12.5e", state_work.fmax);
+    if (state_work.fmax > 1.0e-3)
+    {
+        GMX_LOG(mdlog.warning).appendText(
+                "The force is probably not small enough to "
+                "ensure that you are at a minimum.\n"
+                "Be aware that negative eigenvalues may occur\n"
+                "when the resulting matrix is diagonalized.");
+    }
+
+    /***********************************************************
+     *
+     *      Loop over all pairs in matrix
+     *
+     *      do_force called twice.
Once with positive and + * once with negative displacement + * + ************************************************************/ + + /* Steps are divided one by one over the nodes */ + bool bNS = true; + for (unsigned int aid = cr->nodeid; aid < atom_index.size(); aid += nnodes) + { + size_t atom = atom_index[aid]; + for (size_t d = 0; d < DIM; d++) + { + gmx_bool bBornRadii = FALSE; + gmx_int64_t step = 0; + int force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES; + double t = 0; + + x_min = state_work.s.x[atom][d]; + + for (unsigned int dx = 0; (dx < 2); dx++) + { + if (dx == 0) + { + state_work.s.x[atom][d] = x_min - der_range; + } + else + { + state_work.s.x[atom][d] = x_min + der_range; + } + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + if (shellfc) + { + /* Now is the time to relax the shells */ + (void) relax_shell_flexcon(fplog, cr, mdrunOptions.verbose, step, + inputrec, bNS, force_flags, + top, + constr, enerd, fcd, + &state_work.s, &state_work.f, vir, mdatoms, + nrnb, wcycle, graph, &top_global->groups, + shellfc, fr, bBornRadii, t, mu_tot, + vsite, + DdOpenBalanceRegionBeforeForceComputation::no, + DdCloseBalanceRegionAfterForceComputation::no); + bNS = false; + step++; + } + else + { + evaluate_energy(fplog, cr, + top_global, &state_work, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdAtoms, fr, + mu_tot, enerd, vir, pres, atom*2+dx, FALSE); + } + + cr->nnodes = nnodes; + + if (dx == 0) + { + for (size_t i = 0; i < atom_index.size(); i++) + { + copy_rvec(state_work.f[atom_index[i]], fneg[i]); + } + } + } + + /* x is restored to original */ + state_work.s.x[atom][d] = x_min; + + for (size_t j = 0; j < atom_index.size(); j++) + { + for (size_t k = 0; (k < DIM); k++) + { + dfdx[j][k] = + -(state_work.f[atom_index[j]][k] - fneg[j][k])/(2*der_range); + } + } + + if (!bIsMaster) + { +#if GMX_MPI +#define mpi_type GMX_MPI_REAL + MPI_Send(dfdx[0], atom_index.size()*DIM, mpi_type, MASTER(cr), + cr->nodeid, cr->mpi_comm_mygroup); +#endif + } + else + { + for (node = 0; (node < nnodes && atom+node < atom_index.size()); node++) + { + if (node > 0) + { +#if GMX_MPI + MPI_Status stat; + MPI_Recv(dfdx[0], atom_index.size()*DIM, mpi_type, node, node, + cr->mpi_comm_mygroup, &stat); +#undef mpi_type +#endif + } + + row = (atom + node)*DIM + d; + + for (size_t j = 0; j < atom_index.size(); j++) + { + for (size_t k = 0; k < DIM; k++) + { + col = j*DIM + k; + + if (bSparse) + { + if (col >= row && dfdx[j][k] != 0.0) + { + gmx_sparsematrix_increment_value(sparse_matrix, + row, col, dfdx[j][k]); + } + } + else + { + full_matrix[row*sz+col] = dfdx[j][k]; + } + } + } + } + } + + if (mdrunOptions.verbose && fplog) + { + fflush(fplog); + } + } + /* write progress */ + if (bIsMaster && mdrunOptions.verbose) + { + fprintf(stderr, "\rFinished step %d out of %d", + static_cast<int>(std::min(atom+nnodes, atom_index.size())), + static_cast<int>(atom_index.size())); + fflush(stderr); + } + } + + if (bIsMaster) + { + fprintf(stderr, "\n\nWriting Hessian...\n"); + gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size()*2); + + return 0; +} + +} // namespace gmx diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/md.cpp b/patches/gromacs-2018.1.diff/src/programs/mdrun/md.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..132d93d0dc1fed0bd8406176da9b8dc31eb9dbea
--- /dev/null
+++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/md.cpp
@@ -0,0 +1,2242 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */ +#include "gmxpre.h" + +#include "md.h" + +#include "config.h" + +#include <stdio.h> +#include <stdlib.h> + +#include <cmath> + +#include <algorithm> +#include <memory> + +#include "thread_mpi/threads.h" + +#include "gromacs/awh/awh.h" +#include "gromacs/commandline/filenm.h" +#include "gromacs/compat/make_unique.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_network.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/essentialdynamics/edsam.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/ewald/pme-load-balancing.h" +#include "gromacs/fileio/trxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/imd/imd.h" +#include "gromacs/listed-forces/manage-threading.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/units.h" +#include "gromacs/math/utilities.h" +#include "gromacs/math/vec.h" +#include "gromacs/math/vectypes.h" +#include "gromacs/mdlib/compute_io.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/ebin.h" +#include "gromacs/mdlib/expanded.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/force_flags.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdebin.h" +#include "gromacs/mdlib/mdoutf.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/mdsetup.h" +#include "gromacs/mdlib/nb_verlet.h" +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/shellfc.h" +#include "gromacs/mdlib/sighandler.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/simulationsignal.h" +#include "gromacs/mdlib/tgroup.h" +#include "gromacs/mdlib/trajectory_writing.h" +#include "gromacs/mdlib/update.h" +#include "gromacs/mdlib/vcm.h" +#include "gromacs/mdlib/vsite.h" +#include "gromacs/mdtypes/awh-history.h" +#include "gromacs/mdtypes/awh-params.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/df_history.h" +#include "gromacs/mdtypes/energyhistory.h" +#include "gromacs/mdtypes/fcdata.h" +#include "gromacs/mdtypes/forcerec.h" +#include "gromacs/mdtypes/group.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/interaction_const.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/mdatom.h" +#include "gromacs/mdtypes/observableshistory.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/pulling/pull.h" +#include "gromacs/swap/swapcoords.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/timing/walltime_accounting.h" +#include "gromacs/topology/atoms.h" +#include "gromacs/topology/idef.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/topology/topology.h" +#include "gromacs/trajectory/trajectoryframe.h" +#include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/real.h" +#include "gromacs/utility/smalloc.h" + +#include "deform.h" +#include "membed.h" +#include "repl_ex.h" + +/* PLUMED */ +#include "../../../Plumed.h" +extern int plumedswitch; +extern plumed plumedmain; +/* END PLUMED */ + +/* PLUMED HREX */ +extern int plumed_hrex; +/* END PLUMED HREX */ + +#ifdef GMX_FAHCORE +#include "corewrap.h" +#endif + +using gmx::SimulationSignaller; + +/*! 
\brief Check whether bonded interactions are missing, if appropriate + * + * \param[in] fplog Log file pointer + * \param[in] cr Communication object + * \param[in] totalNumberOfBondedInteractions Result of the global reduction over the number of bonds treated in each domain + * \param[in] top_global Global topology for the error message + * \param[in] top_local Local topology for the error message + * \param[in] state Global state for the error message + * \param[inout] shouldCheckNumberOfBondedInteractions Whether we should do the check. + * + * \return Nothing, except that shouldCheckNumberOfBondedInteractions + * is always set to false after exit. + */ +static void checkNumberOfBondedInteractions(FILE *fplog, t_commrec *cr, int totalNumberOfBondedInteractions, + gmx_mtop_t *top_global, gmx_localtop_t *top_local, t_state *state, + bool *shouldCheckNumberOfBondedInteractions) +{ + if (*shouldCheckNumberOfBondedInteractions) + { + if (totalNumberOfBondedInteractions != cr->dd->nbonded_global) + { + dd_print_missing_interactions(fplog, cr, totalNumberOfBondedInteractions, top_global, top_local, state); // Does not return + } + *shouldCheckNumberOfBondedInteractions = false; + } +} + +static void reset_all_counters(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr, + gmx_int64_t step, + gmx_int64_t *step_rel, t_inputrec *ir, + gmx_wallcycle_t wcycle, t_nrnb *nrnb, + gmx_walltime_accounting_t walltime_accounting, + struct nonbonded_verlet_t *nbv, + struct gmx_pme_t *pme) +{ + char sbuf[STEPSTRSIZE]; + + /* Reset all the counters related to performance over the run */ + GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( + "step %s: resetting all time and cycle counters", + gmx_step_str(step, sbuf)); + + if (use_GPU(nbv)) + { + nbnxn_gpu_reset_timings(nbv); + } + + if (pme_gpu_task_enabled(pme)) + { + pme_gpu_reset_timings(pme); + } + + if (use_GPU(nbv) || pme_gpu_task_enabled(pme)) + { + resetGpuProfiler(); + } + + wallcycle_stop(wcycle, ewcRUN); + wallcycle_reset_all(wcycle); + if (DOMAINDECOMP(cr)) + { + reset_dd_statistics_counters(cr->dd); + } + init_nrnb(nrnb); + ir->init_step += *step_rel; + ir->nsteps -= *step_rel; + *step_rel = 0; + wallcycle_start(wcycle, ewcRUN); + walltime_accounting_start(walltime_accounting); + print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime()); +} + +/*! 
\brief Copy the state from \p rerunFrame to \p globalState and, if requested, construct vsites + * + * \param[in] rerunFrame The trajectory frame to compute energy/forces for + * \param[in,out] globalState The global state container + * \param[in] constructVsites When true, vsite coordinates are constructed + * \param[in] vsite Vsite setup, can be nullptr when \p constructVsites = false + * \param[in] idef Topology parameters, used for constructing vsites + * \param[in] timeStep Time step, used for constructing vsites + * \param[in] forceRec Force record, used for constructing vsites + * \param[in,out] graph The molecular graph, used for constructing vsites when != nullptr + * \param[in,out] warnWhenNoV When true, issue a warning when no velocities are present in \p rerunFrame; is set to false when a warning was issued + */ +static void prepareRerunState(const t_trxframe &rerunFrame, + t_state *globalState, + bool constructVsites, + const gmx_vsite_t *vsite, + const t_idef &idef, + double timeStep, + const t_forcerec &forceRec, + t_graph *graph, + gmx_bool *warnWhenNoV) +{ + for (int i = 0; i < globalState->natoms; i++) + { + copy_rvec(rerunFrame.x[i], globalState->x[i]); + } + if (rerunFrame.bV) + { + for (int i = 0; i < globalState->natoms; i++) + { + copy_rvec(rerunFrame.v[i], globalState->v[i]); + } + } + else + { + for (int i = 0; i < globalState->natoms; i++) + { + clear_rvec(globalState->v[i]); + } + if (*warnWhenNoV) + { + fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n" + " Ekin, temperature and pressure are incorrect,\n" + " the virial will be incorrect when constraints are present.\n" + "\n"); + *warnWhenNoV = FALSE; + } + } + copy_mat(rerunFrame.box, globalState->box); + + if (constructVsites) + { + GMX_ASSERT(vsite, "Need valid vsite for constructing vsites"); + + if (graph) + { + /* Following is necessary because the graph may get out of sync + * with the coordinates if we only have every N'th coordinate set + */ + mk_mshift(nullptr, graph, forceRec.ePBC, globalState->box, as_rvec_array(globalState->x.data())); + shift_self(graph, globalState->box, as_rvec_array(globalState->x.data())); + } + construct_vsites(vsite, as_rvec_array(globalState->x.data()), timeStep, as_rvec_array(globalState->v.data()), + idef.iparams, idef.il, + forceRec.ePBC, forceRec.bMolPBC, nullptr, globalState->box); + if (graph) + { + unshift_self(graph, globalState->box, as_rvec_array(globalState->x.data())); + } + } +} + +/*! 
\libinternal + \copydoc integrator_t (FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + t_mdatoms *mdatoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_forcerec *fr, + const ReplicaExchangeParameters &replExParams, + gmx_membed_t *membed, + gmx_walltime_accounting_t walltime_accounting) + */ +double gmx::do_md(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *ir, + gmx_mtop_t *top_global, + t_fcdata *fcd, + t_state *state_global, + ObservablesHistory *observablesHistory, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_forcerec *fr, + const ReplicaExchangeParameters &replExParams, + gmx_membed_t *membed, + gmx_walltime_accounting_t walltime_accounting) +{ + gmx_mdoutf_t outf = nullptr; + gmx_int64_t step, step_rel; + double elapsed_time; + double t, t0, lam0[efptNR]; + gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner; + gmx_bool bNS, bNStList, bSimAnn, bStopCM, + bFirstStep, bInitStep, bLastStep = FALSE, + bBornRadii; + gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE; + gmx_bool do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE, + bForceUpdate = FALSE, bCPT; + gmx_bool bMasterState; + int force_flags, cglo_flags; + tensor force_vir, shake_vir, total_vir, tmp_vir, pres; + int i, m; + t_trxstatus *status; + rvec mu_tot; + t_vcm *vcm; + matrix parrinellorahmanMu, M; + t_trxframe rerun_fr; + gmx_repl_ex_t repl_ex = nullptr; + int nchkpt = 1; + gmx_localtop_t *top; + t_mdebin *mdebin = nullptr; + gmx_enerdata_t *enerd; + PaddedRVecVector f {}; + gmx_global_stat_t gstat; + gmx_update_t *upd = nullptr; + t_graph *graph = nullptr; + gmx_groups_t *groups; + gmx_ekindata_t *ekind; + gmx_shellfc_t *shellfc; + gmx_bool bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition; + gmx_bool bResetCountersHalfMaxH = FALSE; + gmx_bool bTemp, bPres, bTrotter; + real dvdl_constr; + rvec *cbuf = nullptr; + int cbuf_nalloc = 0; + matrix lastbox; + int lamnew = 0; + /* for FEP */ + int nstfep = 0; + double cycles; + real saved_conserved_quantity = 0; + real last_ekin = 0; + t_extmass MassQ; + int **trotter_seq; + char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE]; + int handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/ + + + /* PME load balancing data for GPU kernels */ + pme_load_balancing_t *pme_loadbal = nullptr; + gmx_bool bPMETune = FALSE; + gmx_bool bPMETunePrinting = FALSE; + + /* Interactive MD */ + gmx_bool bIMDstep = FALSE; + + /* PLUMED */ + int plumedNeedsEnergy=0; + int plumedWantsToStop=0; + matrix plumed_vir; + /* END PLUMED */ + +#ifdef GMX_FAHCORE + /* Temporary addition for FAHCORE checkpointing */ + int chkpt_ret; +#endif + /* Domain decomposition could incorrectly miss a bonded + interaction, but checking for that requires a global + communication stage, which does not otherwise happen in DD + code. So we do that alongside the first global energy reduction + after a new DD is made. These variables handle whether the + check happens, and the result it returns. 
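
[Editor's aside — illustrative sketch, not part of the patch] The two variables declared just below implement a "validate once, piggybacked on the next global reduction" pattern: the flag is raised after each domain repartition and cleared by checkNumberOfBondedInteractions(). A simplified standalone mirror of that latch, with hypothetical names:

#include <stdexcept>

// One-shot validation latched to the next global reduction, mirroring
// shouldCheckNumberOfBondedInteractions below.
void checkBondedCountOnce(int reducedCount, int expectedCount, bool* shouldCheck)
{
    if (*shouldCheck)
    {
        if (reducedCount != expectedCount)
        {
            throw std::runtime_error("missing bonded interactions after repartitioning");
        }
        *shouldCheck = false; // do not pay for the check on later steps
    }
}
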
*/ + bool shouldCheckNumberOfBondedInteractions = false; + int totalNumberOfBondedInteractions = -1; + + SimulationSignals signals; + // Most global communnication stages don't propagate mdrun + // signals, and will use this object to achieve that. + SimulationSignaller nullSignaller(nullptr, nullptr, false, false); + + if (!mdrunOptions.writeConfout) + { + // This is on by default, and the main known use case for + // turning it off is for convenience in benchmarking, which is + // something that should not show up in the general user + // interface. + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -noconfout functionality is deprecated, and may be removed in a future version."); + } + if (mdrunOptions.timingOptions.resetHalfway) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -resethway functionality is deprecated, and may be removed in a future version."); + if (ir->nsteps > 0) + { + /* Signal to reset the counters half the simulation steps. */ + wcycle_set_reset_counters(wcycle, ir->nsteps/2); + } + /* Signal to reset the counters halfway the simulation time. */ + bResetCountersHalfMaxH = (mdrunOptions.maximumHoursToRun > 0); + } + + /* md-vv uses averaged full step velocities for T-control + md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control) + md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */ + bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir))); + + const gmx_bool bRerunMD = mdrunOptions.rerun; + int nstglobalcomm = mdrunOptions.globalCommunicationInterval; + + if (bRerunMD) + { + /* Since we don't know if the frames read are related in any way, + * rebuild the neighborlist at every step. 
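
[Editor's aside — illustrative sketch, not part of the patch] Because rerun frames may be completely unrelated configurations, the block below forces every continuity-dependent interval to 1. A hypothetical condensation of those overrides:

// Rerun frames are treated as unrelated configurations, so intervals that
// assume continuity are forced to every-step, as in the block below.
struct Intervals { int nstlist; int nstcalcenergy; int nstglobalcomm; };

Intervals rerunIntervals(Intervals in, bool rerun)
{
    if (rerun)
    {
        in.nstlist       = 1; // neighbor search for every frame
        in.nstcalcenergy = 1; // energies for every frame
        in.nstglobalcomm = 1; // global reduction for every frame
    }
    return in;
}
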
+ */ + ir->nstlist = 1; + ir->nstcalcenergy = 1; + nstglobalcomm = 1; + } + + nstglobalcomm = check_nstglobalcomm(mdlog, nstglobalcomm, ir); + bGStatEveryStep = (nstglobalcomm == 1); + + if (bRerunMD) + { + ir->nstxout_compressed = 0; + } + groups = &top_global->groups; + + gmx_edsam *ed = nullptr; + if (opt2bSet("-ei", nfile, fnm) || observablesHistory->edsamHistory != nullptr) + { + /* Initialize essential dynamics sampling */ + ed = init_edsam(opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), + top_global, + ir, cr, constr, + state_global, observablesHistory, + oenv, mdrunOptions.continuationOptions.appendFiles); + } + + if (ir->eSwapCoords != eswapNO) + { + /* Initialize ion swapping code */ + init_swapcoords(fplog, ir, opt2fn_master("-swap", nfile, fnm, cr), + top_global, + state_global, observablesHistory, + cr, oenv, mdrunOptions); + } + + /* Initial values */ + init_md(fplog, cr, outputProvider, ir, oenv, mdrunOptions, + &t, &t0, state_global, lam0, + nrnb, top_global, &upd, + nfile, fnm, &outf, &mdebin, + force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, wcycle); + + clear_mat(total_vir); + clear_mat(pres); + /* Energy terms and groups */ + snew(enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + enerd); + + /* Kinetic energy data */ + snew(ekind, 1); + init_ekindata(fplog, top_global, &(ir->opts), ekind); + /* Copy the cos acceleration to the groups struct */ + ekind->cosacc.cos_accel = ir->cos_accel; + + gstat = global_stat_init(ir); + + /* Check for polarizable models and flexible constraints */ + shellfc = init_shell_flexcon(fplog, + top_global, n_flexible_constraints(constr), + ir->nstcalcenergy, DOMAINDECOMP(cr)); + + if (shellfc && ir->nstcalcenergy != 1) + { + gmx_fatal(FARGS, "You have nstcalcenergy set to a value (%d) that is different from 1.\nThis is not supported in combinations with shell particles.\nPlease make a new tpr file.", ir->nstcalcenergy); + } + if (shellfc && DOMAINDECOMP(cr)) + { + gmx_fatal(FARGS, "Shell particles are not implemented with domain decomposition, use a single rank"); + } + if (shellfc && ir->bDoAwh) + { + gmx_fatal(FARGS, "AWH biasing does not support shell particles."); + } + + if (inputrecDeform(ir)) + { + tMPI_Thread_mutex_lock(&deform_init_box_mutex); + set_deform_reference_box(upd, + deform_init_init_step_tpx, + deform_init_box_tpx); + tMPI_Thread_mutex_unlock(&deform_init_box_mutex); + } + + { + double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1); + if ((io > 2000) && MASTER(cr)) + { + fprintf(stderr, + "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", + io); + } + } + + // Local state only becomes valid now. + std::unique_ptr<t_state> stateInstance; + t_state * state; + + if (DOMAINDECOMP(cr)) + { + top = dd_init_local_top(top_global); + + stateInstance = gmx::compat::make_unique<t_state>(); + state = stateInstance.get(); + dd_init_local_state(cr->dd, state_global, state); + } + else + { + state_change_natoms(state_global, state_global->natoms); + /* We need to allocate one element extra, since we might use + * (unaligned) 4-wide SIMD loads to access rvec entries. 
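
[Editor's aside — illustrative sketch, not part of the patch] The resize just below uses gmx::paddedRVecVectorSize() so that an unaligned 4-wide SIMD load starting at the last rvec stays inside the allocation. A minimal sketch of the rationale (the real helper may pad differently):

#include <cstddef>
#include <vector>

struct rvec3 { float x, y, z; };

// A 4-wide load starting at the last rvec touches one element past the end,
// so pad the force buffer by at least one extra entry.
std::vector<rvec3> makePaddedForceBuffer(std::size_t natoms)
{
    return std::vector<rvec3>(natoms + 1); // +1 keeps unaligned 4-wide loads in bounds
}
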
+ */ + f.resize(gmx::paddedRVecVectorSize(state_global->natoms)); + /* Copy the pointer to the global state */ + state = state_global; + + snew(top, 1); + mdAlgorithmsSetupAtomData(cr, ir, top_global, top, fr, + &graph, mdAtoms, vsite, shellfc); + + update_realloc(upd, state->natoms); + } + + /* Set up interactive MD (IMD) */ + init_IMD(ir, cr, top_global, fplog, ir->nstcalcenergy, MASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr, + nfile, fnm, oenv, mdrunOptions); + + if (DOMAINDECOMP(cr)) + { + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, nullptr, FALSE); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + } + + auto mdatoms = mdAtoms->mdatoms(); + + // NOTE: The global state is no longer used at this point. + // But state_global is still used as temporary storage space for writing + // the global state to file and potentially for replica exchange. + // (Global topology should persist.) + + update_mdatoms(mdatoms, state->lambda[efptMASS]); + + const ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + bool startingFromCheckpoint = continuationOptions.startedFromCheckpoint; + + if (ir->bExpanded) + { + init_expanded_ensemble(startingFromCheckpoint, ir, state->dfhist); + } + + if (MASTER(cr)) + { + if (startingFromCheckpoint) + { + /* Update mdebin with energy history if appending to output files */ + if (continuationOptions.appendFiles) + { + restore_energyhistory_from_state(mdebin, observablesHistory->energyHistory.get()); + } + else if (observablesHistory->energyHistory.get() != nullptr) + { + /* We might have read an energy history from checkpoint. + * As we are not appending, we want to restart the statistics. + * Free the allocated memory and reset the counts. + */ + observablesHistory->energyHistory = {}; + } + } + if (observablesHistory->energyHistory.get() == nullptr) + { + observablesHistory->energyHistory = std::unique_ptr<energyhistory_t>(new energyhistory_t {}); + } + /* Set the initial energy history in state by updating once */ + update_energyhistory(observablesHistory->energyHistory.get(), mdebin); + } + + /* Initialize constraints */ + if (constr && !DOMAINDECOMP(cr)) + { + set_constraints(constr, top, ir, mdatoms, cr); + } + + /* Initialize AWH and restore state from history in checkpoint if needed. */ + if (ir->bDoAwh) + { + ir->awh = new gmx::Awh(fplog, *ir, cr, *ir->awhParams, opt2fn("-awh", nfile, fnm), ir->pull_work); + + if (startingFromCheckpoint) + { + /* Restore the AWH history read from checkpoint */ + ir->awh->restoreStateFromHistory(MASTER(cr) ? state_global->awhHistory.get() : nullptr); + } + else if (MASTER(cr)) + { + /* Initialize the AWH history here */ + state_global->awhHistory = ir->awh->initHistoryFromState(); + } + } + + const bool useReplicaExchange = (replExParams.exchangeInterval > 0); + if (useReplicaExchange && MASTER(cr)) + { + repl_ex = init_replica_exchange(fplog, cr->ms, top_global->natoms, ir, + replExParams); + } + + /* PME tuning is only supported in the Verlet scheme, with PME for + * Coulomb. It is not supported with only LJ PME, or for + * reruns. 
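
[Editor's aside — illustrative sketch, not part of the patch] The bPMETune gate assembled just below can be read as a pure predicate; here it is restated with plain booleans (hypothetical names, enum tests reduced to flags):

// PME tuning is enabled only with PME Coulomb, only for real runs, and never
// when exact reproducibility is requested or the group scheme is in use.
bool pmeTuningEnabled(bool tunePmeRequested, bool usesPmeElectrostatics,
                      bool rerun, bool reproducible, bool groupCutoffScheme)
{
    return tunePmeRequested && usesPmeElectrostatics && !rerun
           && !reproducible && !groupCutoffScheme;
}
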
*/ + bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !bRerunMD && + !mdrunOptions.reproducible && ir->cutoff_scheme != ecutsGROUP); + if (bPMETune) + { + pme_loadbal_init(&pme_loadbal, cr, mdlog, ir, state->box, + fr->ic, fr->nbv->listParams.get(), fr->pmedata, use_GPU(fr->nbv), + &bPMETunePrinting); + } + + if (!ir->bContinuation && !bRerunMD) + { + if (state->flags & (1 << estV)) + { + /* Set the velocities of vsites, shells and frozen atoms to zero */ + for (i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->ptype[i] == eptVSite || + mdatoms->ptype[i] == eptShell) + { + clear_rvec(state->v[i]); + } + else if (mdatoms->cFREEZE) + { + for (m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m]) + { + state->v[i][m] = 0; + } + } + } + } + } + + if (constr) + { + /* Constrain the initial coordinates and velocities */ + do_constrain_first(fplog, constr, ir, mdatoms, state, + cr, nrnb, fr, top); + } + if (vsite) + { + /* Construct the virtual sites for the initial configuration */ + construct_vsites(vsite, as_rvec_array(state->x.data()), ir->delta_t, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state->box); + } + } + + if (ir->efep != efepNO) + { + /* Set free energy calculation frequency as the greatest common + * denominator of nstdhdl and repl_ex_nst. */ + nstfep = ir->fepvals->nstdhdl; + if (ir->bExpanded) + { + nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep); + } + if (useReplicaExchange) + { + nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep); + } + } + + /* Be REALLY careful about what flags you set here. You CANNOT assume + * this is the first step, since we might be restarting from a checkpoint, + * and in that case we should not do any modifications to the state. + */ + bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation); + + if (continuationOptions.haveReadEkin) + { + restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate); + } + + cglo_flags = (CGLO_INITIALIZATION | CGLO_TEMPERATURE | CGLO_GSTAT + | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0) + | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) + | (continuationOptions.haveReadEkin ? CGLO_READEKIN : 0)); + + bSumEkinhOld = FALSE; + /* To minimize communication, compute_globals computes the COM velocity + * and the kinetic energy for the velocities without COM motion removed. + * Thus to get the kinetic energy without the COM contribution, we need + * to call compute_globals twice. + */ + for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++) + { + int cglo_flags_iteration = cglo_flags; + if (bStopCM && cgloIteration == 0) + { + cglo_flags_iteration |= CGLO_STOPCM; + cglo_flags_iteration &= ~CGLO_TEMPERATURE; + } + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags_iteration + | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)); + } + checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + if (ir->eI == eiVVAK) + { + /* a second call to get the half step temperature initialized as well */ + /* we do the same call as above, but turn the pressure off -- internally to + compute_globals, this is recognized as a velocity verlet half-step + kinetic energy calculation. 
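
[Editor's aside — illustrative sketch, not part of the patch] The cgloIteration loop above calls compute_globals twice when stopping COM motion: the first pass finds and removes the center-of-mass velocity, the second measures the temperature from the COM-free velocities. A standalone sketch of that two-stage reasoning:

#include <cstddef>
#include <vector>

struct Vel { double x, y, z; };

// Kinetic energy after removing center-of-mass motion; removing the COM
// velocity first (the CGLO_STOPCM stage) is what makes the second kinetic
// energy pass meaningful for temperature.
double comFreeKineticEnergy(std::vector<Vel> v, const std::vector<double>& mass)
{
    double mtot = 0, px = 0, py = 0, pz = 0;
    for (std::size_t i = 0; i < v.size(); i++)
    {
        mtot += mass[i];
        px += mass[i] * v[i].x; py += mass[i] * v[i].y; pz += mass[i] * v[i].z;
    }
    const double cx = px / mtot, cy = py / mtot, cz = pz / mtot; // COM velocity
    double ekin = 0;
    for (std::size_t i = 0; i < v.size(); i++)
    {
        v[i].x -= cx; v[i].y -= cy; v[i].z -= cz;
        ekin += 0.5 * mass[i] * (v[i].x * v[i].x + v[i].y * v[i].y + v[i].z * v[i].z);
    }
    return ekin;
}
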
This minimized excess variables, but + perhaps loses some logic?*/ + + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, state->box, + nullptr, &bSumEkinhOld, + cglo_flags & ~CGLO_PRESSURE); + } + + /* Calculate the initial half step temperature, and save the ekinh_old */ + if (!continuationOptions.startedFromCheckpoint) + { + for (i = 0; (i < ir->opts.ngtc); i++) + { + copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old); + } + } + + /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter + temperature control */ + trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter); + + if (MASTER(cr)) + { + if (!ir->bContinuation) + { + if (constr && ir->eConstrAlg == econtLINCS) + { + fprintf(fplog, + "RMS relative constraint deviation after constraining: %.2e\n", + constr_rmsd(constr)); + } + if (EI_STATE_VELOCITY(ir->eI)) + { + real temp = enerd->term[F_TEMP]; + if (ir->eI != eiVV) + { + /* Result of Ekin averaged over velocities of -half + * and +half step, while we only have -half step here. + */ + temp *= 2; + } + fprintf(fplog, "Initial temperature: %g K\n", temp); + } + } + + if (bRerunMD) + { + fprintf(stderr, "starting md rerun '%s', reading coordinates from" + " input trajectory '%s'\n\n", + *(top_global->name), opt2fn("-rerun", nfile, fnm)); + if (mdrunOptions.verbose) + { + fprintf(stderr, "Calculated time to finish depends on nsteps from " + "run input file,\nwhich may not correspond to the time " + "needed to process input trajectory.\n\n"); + } + } + else + { + char tbuf[20]; + fprintf(stderr, "starting mdrun '%s'\n", + *(top_global->name)); + if (ir->nsteps >= 0) + { + sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t); + } + else + { + sprintf(tbuf, "%s", "infinite"); + } + if (ir->init_step > 0) + { + fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n", + gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf, + gmx_step_str(ir->init_step, sbuf2), + ir->init_step*ir->delta_t); + } + else + { + fprintf(stderr, "%s steps, %s ps.\n", + gmx_step_str(ir->nsteps, sbuf), tbuf); + } + } + fprintf(fplog, "\n"); + } + + /* PLUMED */ + if(plumedswitch){ + /* detect plumed API version */ + int pversion=0; + plumed_cmd(plumedmain,"getApiVersion",&pversion); + /* setting kbT is only implemented with api>1) */ + real kbT=ir->opts.ref_t[0]*BOLTZ; + if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT); + if(pversion>2){ + int res=1; + if( (continuationOptions.startedFromCheckpoint) ) plumed_cmd(plumedmain,"setRestart",&res); + } + + if(cr->ms && cr->ms->nsim>1) { + if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters); + if(PAR(cr)){ + if(DOMAINDECOMP(cr)) { + plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all); + }else{ + plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); + } + } + plumed_cmd(plumedmain,"GREX init",NULL); + } + if(PAR(cr)){ + if(DOMAINDECOMP(cr)) { + plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); + } + } + plumed_cmd(plumedmain,"setNatoms",&top_global->natoms); + plumed_cmd(plumedmain,"setMDEngine","gromacs"); + plumed_cmd(plumedmain,"setLog",fplog); + real real_delta_t=ir->delta_t; + plumed_cmd(plumedmain,"setTimestep",&real_delta_t); + plumed_cmd(plumedmain,"init",NULL); + + if(PAR(cr)){ + if(DOMAINDECOMP(cr)) { + plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); + 
plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); + } + } + } + /* END PLUMED */ + + walltime_accounting_start(walltime_accounting); + wallcycle_start(wcycle, ewcRUN); + print_start(fplog, cr, walltime_accounting, "mdrun"); + + /* safest point to do file checkpointing is here. More general point would be immediately before integrator call */ +#ifdef GMX_FAHCORE + chkpt_ret = fcCheckPointParallel( cr->nodeid, + NULL, 0); + if (chkpt_ret == 0) + { + gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 ); + } +#endif + + /*********************************************************** + * + * Loop over MD steps + * + ************************************************************/ + + /* if rerunMD then read coordinates and velocities from input trajectory */ + if (bRerunMD) + { + if (getenv("GMX_FORCE_UPDATE")) + { + bForceUpdate = TRUE; + } + + rerun_fr.natoms = 0; + if (MASTER(cr)) + { + bLastStep = !read_first_frame(oenv, &status, + opt2fn("-rerun", nfile, fnm), + &rerun_fr, TRX_NEED_X | TRX_READ_V); + if (rerun_fr.natoms != top_global->natoms) + { + gmx_fatal(FARGS, + "Number of atoms in trajectory (%d) does not match the " + "run input file (%d)\n", + rerun_fr.natoms, top_global->natoms); + } + if (ir->ePBC != epbcNONE) + { + if (!rerun_fr.bBox) + { + gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time); + } + if (max_cutoff2(ir->ePBC, rerun_fr.box) < gmx::square(fr->rlist)) + { + gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time); + } + } + } + + if (PAR(cr)) + { + rerun_parallel_comm(cr, &rerun_fr, &bLastStep); + } + + if (ir->ePBC != epbcNONE) + { + /* Set the shift vectors. + * Necessary here when have a static box different from the tpr box. + */ + calc_shifts(rerun_fr.box, fr->shift_vec); + } + } + + /* Loop over MD steps or if rerunMD to end of input trajectory, + * or, if max_hours>0, until max_hours is reached. + */ + real max_hours = mdrunOptions.maximumHoursToRun; + bFirstStep = TRUE; + /* Skip the first Nose-Hoover integration when we get the state from tpx */ + bInitStep = !startingFromCheckpoint || EI_VV(ir->eI); + bSumEkinhOld = FALSE; + bExchanged = FALSE; + bNeedRepartition = FALSE; + + bool simulationsShareState = false; + int nstSignalComm = nstglobalcomm; + { + // TODO This implementation of ensemble orientation restraints is nasty because + // a user can't just do multi-sim with single-sim orientation restraints. + bool usingEnsembleRestraints = (fcd->disres.nsystems > 1) || (cr->ms && fcd->orires.nr); + bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && cr->ms); + + // Replica exchange, ensemble restraints and AWH need all + // simulations to remain synchronized, so they need + // checkpoints and stop conditions to act on the same step, so + // the propagation of such signals must take place between + // simulations, not just within simulations. + // TODO: Make algorithm initializers set these flags. 
+ simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim || (plumedswitch && cr->ms); // PLUMED hack, if we have multiple sim and plumed we usually want them to be in sync + bool resetCountersIsLocal = true; + signals[eglsCHKPT] = SimulationSignal(!simulationsShareState); + signals[eglsSTOPCOND] = SimulationSignal(!simulationsShareState); + signals[eglsRESETCOUNTERS] = SimulationSignal(resetCountersIsLocal); + + if (simulationsShareState) + { + // Inter-simulation signal communication does not need to happen + // often, so we use a minimum of 200 steps to reduce overhead. + const int c_minimumInterSimulationSignallingInterval = 200; + nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1)/nstglobalcomm)*nstglobalcomm; + } + } + + DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion = (DOMAINDECOMP(cr) ? DdOpenBalanceRegionBeforeForceComputation::yes : DdOpenBalanceRegionBeforeForceComputation::no); + DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion = (DOMAINDECOMP(cr) ? DdCloseBalanceRegionAfterForceComputation::yes : DdCloseBalanceRegionAfterForceComputation::no); + + step = ir->init_step; + step_rel = 0; + + // TODO extract this to new multi-simulation module + if (MASTER(cr) && MULTISIM(cr) && !useReplicaExchange) + { + if (!multisim_int_all_are_equal(cr->ms, ir->nsteps)) + { + GMX_LOG(mdlog.warning).appendText( + "Note: The number of steps is not consistent across multi simulations,\n" + "but we are proceeding anyway!"); + } + if (!multisim_int_all_are_equal(cr->ms, ir->init_step)) + { + GMX_LOG(mdlog.warning).appendText( + "Note: The initial step is not consistent across multi simulations,\n" + "but we are proceeding anyway!"); + } + } + + /* and stop now if we should */ + bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps)); + while (!bLastStep) + { + + /* Determine if this is a neighbor search step */ + bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); + + if (bPMETune && bNStList) + { + /* PME grid + cut-off optimization with GPUs or PME nodes */ + pme_loadbal_do(pme_loadbal, cr, + (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr, + fplog, mdlog, + ir, fr, state, + wcycle, + step, step_rel, + &bPMETunePrinting); + } + + wallcycle_start(wcycle, ewcSTEP); + + if (bRerunMD) + { + if (rerun_fr.bStep) + { + step = rerun_fr.step; + step_rel = step - ir->init_step; + } + if (rerun_fr.bTime) + { + t = rerun_fr.time; + } + else + { + t = step; + } + } + else + { + bLastStep = (step_rel == ir->nsteps); + t = t0 + step*ir->delta_t; + } + + // TODO Refactor this, so that nstfep does not need a default value of zero + if (ir->efep != efepNO || ir->bSimTemp) + { + /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value, + requiring different logic. 
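
[Editor's aside — illustrative sketch, not part of the patch] Almost all of the per-interval gating below (bDoDHDL, bDoFEP, bDoExpanded, bDoReplEx, do_ene, do_log, ...) goes through do_per_step(). A sketch of its semantics (the real helper also handles 64-bit step counters):

// True when work scheduled every n steps is due on this step;
// n <= 0 disables the interval entirely.
bool doPerStep(long long step, long long n)
{
    return n > 0 && step % n == 0;
}
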
*/ + if (bRerunMD) + { + if (MASTER(cr)) + { + setCurrentLambdasRerun(step, ir->fepvals, &rerun_fr, lam0, state_global); + } + } + else + { + setCurrentLambdasLocal(step, ir->fepvals, lam0, state); + } + bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl); + bDoFEP = ((ir->efep != efepNO) && do_per_step(step, nstfep)); + bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) + && (ir->bExpanded) && (step > 0) && (!startingFromCheckpoint)); + } + + bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep && + do_per_step(step, replExParams.exchangeInterval)); + + if (bSimAnn) + { + update_annealing_target_temp(ir, t, upd); + } + + if (bRerunMD && MASTER(cr)) + { + const bool constructVsites = (vsite && mdrunOptions.rerunConstructVsites); + if (constructVsites && DOMAINDECOMP(cr)) + { + gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented with domain decomposition, use a single rank"); + } + prepareRerunState(rerun_fr, state_global, constructVsites, vsite, top->idef, ir->delta_t, *fr, graph, &bRerunWarnNoV); + } + + /* Stop Center of Mass motion */ + bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm)); + + if (bRerunMD) + { + /* for rerun MD always do Neighbour Searching */ + bNS = (bFirstStep || ir->nstlist != 0); + } + else + { + /* Determine whether or not to do Neighbour Searching */ + bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition); + } + + /* < 0 means stop at next step, > 0 means stop at next NS step */ + if ( (signals[eglsSTOPCOND].set < 0) || + ( (signals[eglsSTOPCOND].set > 0 ) && ( bNS || ir->nstlist == 0))) + { + bLastStep = TRUE; + } + + /* Determine whether or not to update the Born radii if doing GB */ + bBornRadii = bFirstStep; + if (ir->implicit_solvent && (step % ir->nstgbradii == 0)) + { + bBornRadii = TRUE; + } + + /* do_log triggers energy and virial calculation. Because this leads + * to different code paths, forces can be different. Thus for exact + * continuation we should avoid extra log output. + * Note that the || bLastStep can result in non-exact continuation + * beyond the last step. But we don't consider that to be an issue. + */ + do_log = do_per_step(step, ir->nstlog) || (bFirstStep && !startingFromCheckpoint) || bLastStep || bRerunMD; + do_verbose = mdrunOptions.verbose && + (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep || bRerunMD); + + if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD)) + { + if (bRerunMD) + { + bMasterState = TRUE; + } + else + { + bMasterState = FALSE; + /* Correct the new box if it is too skewed */ + if (inputrecDynamicBox(ir)) + { + if (correct_box(fplog, step, state->box, graph)) + { + bMasterState = TRUE; + } + } + if (DOMAINDECOMP(cr) && bMasterState) + { + dd_collect_state(cr->dd, state, state_global); + } + } + + if (DOMAINDECOMP(cr)) + { + /* Repartition the domain decomposition */ + dd_partition_system(fplog, step, cr, + bMasterState, nstglobalcomm, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, wcycle, + do_verbose && !bPMETunePrinting); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + + /* PLUMED */ + if(plumedswitch){ + plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); + plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); + } + /* END PLUMED */ + } + } + + if (MASTER(cr) && do_log) + { + print_ebin_header(fplog, step, t); /* can we improve the information printed here? 
*/ + } + + if (ir->efep != efepNO) + { + update_mdatoms(mdatoms, state->lambda[efptMASS]); + } + + if ((bRerunMD && rerun_fr.bV) || bExchanged) + { + + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . */ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS); + checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + } + clear_mat(force_vir); + + /* PLUMED HREX */ + gmx_bool bHREX = bDoReplEx && plumed_hrex; + + if (plumedswitch && bHREX) { + gmx_enerdata_t *hrex_enerd; + snew(hrex_enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr,ir->fepvals->n_lambda,hrex_enerd); + int repl = -1; + int nrepl = -1; + if (MASTER(cr)){ + repl = replica_exchange_get_repl(repl_ex); + nrepl = replica_exchange_get_nrepl(repl_ex); + } + + if (DOMAINDECOMP(cr)) { + dd_collect_state(cr->dd,state,state_global); + } else { + copy_state_serial(state, state_global); + } + + if(MASTER(cr)){ + if(repl%2==step/replExParams.exchangeInterval%2){ + if(repl-1>=0) exchange_state(cr->ms,repl-1,state_global); + }else{ + if(repl+1<nrepl) exchange_state(cr->ms,repl+1,state_global); + } + } + if (!DOMAINDECOMP(cr)) { + copy_state_serial(state_global, state); + } + if(PAR(cr)){ + if (DOMAINDECOMP(cr)) { + dd_partition_system(fplog,step,cr,TRUE,1, + state_global,top_global,ir, + state,&f,mdAtoms,top,fr,vsite,constr, + nrnb,wcycle,FALSE); + } + } + do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups, + state->box, state->x, &state->hist, + f, force_vir, mdatoms, hrex_enerd, fcd, + state->lambda, graph, + fr, vsite, mu_tot, t, ed, bBornRadii, + GMX_FORCE_STATECHANGED | + GMX_FORCE_DYNAMICBOX | + GMX_FORCE_ALLFORCES | + GMX_FORCE_VIRIAL | + GMX_FORCE_ENERGY | + GMX_FORCE_DHDL | + GMX_FORCE_NS, + ddOpenBalanceRegion, ddCloseBalanceRegion); + + plumed_cmd(plumedmain,"GREX cacheLocalUSwap",&hrex_enerd->term[F_EPOT]); + sfree(hrex_enerd); + + /* exchange back */ + if (DOMAINDECOMP(cr)) { + dd_collect_state(cr->dd,state,state_global); + } else { + copy_state_serial(state, state_global); + } + + if(MASTER(cr)){ + if(repl%2==step/replExParams.exchangeInterval%2){ + if(repl-1>=0) exchange_state(cr->ms,repl-1,state_global); + }else{ + if(repl+1<nrepl) exchange_state(cr->ms,repl+1,state_global); + } + } + + if (!DOMAINDECOMP(cr)) { + copy_state_serial(state_global, state); + } + if(PAR(cr)){ + if (DOMAINDECOMP(cr)) { + dd_partition_system(fplog,step,cr,TRUE,1, + state_global,top_global,ir, + state,&f,mdAtoms,top,fr,vsite,constr, + nrnb,wcycle,FALSE); + plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); + plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); + } + } + } + /* END PLUMED HREX */ + + /* We write a checkpoint at this MD step when: + * either at an NS step when we signalled through gs, + * or at the last step (but not when we do not want confout), + * but never at the first step or with rerun. 
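
[Editor's aside — illustrative sketch, not part of the patch] The HREX block above pairs replicas by the parity of the exchange round: when repl%2 equals (step/exchangeInterval)%2 a replica swaps coordinates with its lower neighbor, otherwise with its upper neighbor, so every rank independently agrees on the pairing. A simplified mirror of that partner selection:

// Which replica this one swaps coordinates with in the HREX block above;
// -1 means no partner this round (edges of the replica ladder).
int hrexPartner(int repl, int nrepl, long long exchangeRound)
{
    const int partner = (repl % 2 == exchangeRound % 2) ? repl - 1 : repl + 1;
    return (partner >= 0 && partner < nrepl) ? partner : -1;
}
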
+ */ + bCPT = (((signals[eglsCHKPT].set && (bNS || ir->nstlist == 0)) || + (bLastStep && mdrunOptions.writeConfout)) && + step > ir->init_step && !bRerunMD); + if (bCPT) + { + signals[eglsCHKPT].set = 0; + } + + /* Determine the energy and pressure: + * at nstcalcenergy steps and at energy output steps (set below). + */ + if (EI_VV(ir->eI) && (!bInitStep)) + { + /* for vv, the first half of the integration actually corresponds + to the previous step. bCalcEner is only required to be evaluated on the 'next' step, + but the virial needs to be calculated on both the current step and the 'next' step. Future + reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */ + + /* TODO: This is probably not what we want, we will write to energy file one step after nstcalcenergy steps. */ + bCalcEnerStep = do_per_step(step - 1, ir->nstcalcenergy); + bCalcVir = bCalcEnerStep || + (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple))); + } + else + { + bCalcEnerStep = do_per_step(step, ir->nstcalcenergy); + bCalcVir = bCalcEnerStep || + (ir->epc != epcNO && do_per_step(step, ir->nstpcouple)); + } + bCalcEner = bCalcEnerStep; + + do_ene = (do_per_step(step, ir->nstenergy) || bLastStep || bRerunMD); + + if (do_ene || do_log || bDoReplEx) + { + bCalcVir = TRUE; + bCalcEner = TRUE; + } + + /* Do we need global communication ? */ + bGStat = (bCalcVir || bCalcEner || bStopCM || + do_per_step(step, nstglobalcomm) || + (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm))); + + force_flags = (GMX_FORCE_STATECHANGED | + ((inputrecDynamicBox(ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) | + GMX_FORCE_ALLFORCES | + (bCalcVir ? GMX_FORCE_VIRIAL : 0) | + (bCalcEner ? GMX_FORCE_ENERGY : 0) | + (bDoFEP ? GMX_FORCE_DHDL : 0) + ); + + if (shellfc) + { + /* Now is the time to relax the shells */ + relax_shell_flexcon(fplog, cr, mdrunOptions.verbose, step, + ir, bNS, force_flags, top, + constr, enerd, fcd, + state, &f, force_vir, mdatoms, + nrnb, wcycle, graph, groups, + shellfc, fr, bBornRadii, t, mu_tot, + vsite, + ddOpenBalanceRegion, ddCloseBalanceRegion); + } + else + { + /* The AWH history need to be saved _before_ doing force calculations where the AWH bias is updated + (or the AWH update will be performed twice for one step when continuing). It would be best to + call this update function from do_md_trajectory_writing but that would occur after do_force. + One would have to divide the update_awh function into one function applying the AWH force + and one doing the AWH bias update. The update AWH bias function could then be called after + do_md_trajectory_writing (then containing update_awh_history). + The checkpointing will in the future probably moved to the start of the md loop which will + rid of this issue. */ + if (ir->bDoAwh && bCPT && MASTER(cr)) + { + ir->awh->updateHistory(state_global->awhHistory.get()); + } + + /* The coordinates (x) are shifted (to get whole molecules) + * in do_force. + * This is parallellized as well, and does communication too. 
+ * Check comments in sim_util.c + */ + + /* PLUMED */ + plumedNeedsEnergy=0; + if(plumedswitch){ + int pversion=0; + plumed_cmd(plumedmain,"getApiVersion",&pversion); + long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep); + plumed_cmd(plumedmain,"setPositions",&state->x[0][0]); + plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[0]); + plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[0]); + plumed_cmd(plumedmain,"setBox",&state->box[0][0]); + plumed_cmd(plumedmain,"prepareCalc",NULL); + plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop); + int checkp=0; if(bCPT) checkp=1; + if(pversion>3) plumed_cmd(plumedmain,"doCheckPoint",&checkp); + plumed_cmd(plumedmain,"setForces",&f[0][0]); + plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); + if(plumedNeedsEnergy) force_flags |= GMX_FORCE_ENERGY | GMX_FORCE_VIRIAL; + clear_mat(plumed_vir); + plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]); + } + /* END PLUMED */ + do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups, + state->box, state->x, &state->hist, + f, force_vir, mdatoms, enerd, fcd, + state->lambda, graph, + fr, vsite, mu_tot, t, ed, bBornRadii, + (bNS ? GMX_FORCE_NS : 0) | force_flags, + ddOpenBalanceRegion, ddCloseBalanceRegion); + /* PLUMED */ + if(plumedswitch){ + if(plumedNeedsEnergy){ + msmul(force_vir,2.0,plumed_vir); + plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]); + plumed_cmd(plumedmain,"performCalc",NULL); + msmul(plumed_vir,0.5,force_vir); + } else { + msmul(plumed_vir,0.5,plumed_vir); + m_add(force_vir,plumed_vir,force_vir); + } + if(bDoReplEx) plumed_cmd(plumedmain,"GREX savePositions",NULL); + if(plumedWantsToStop) ir->nsteps=step_rel+1; + if(bHREX) plumed_cmd(plumedmain,"GREX cacheLocalUNow",&enerd->term[F_EPOT]); + } + /* END PLUMED */ + } + + if (EI_VV(ir->eI) && !startingFromCheckpoint && !bRerunMD) + /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ + { + rvec *vbuf = nullptr; + + wallcycle_start(wcycle, ewcUPDATE); + if (ir->eI == eiVV && bInitStep) + { + /* if using velocity verlet with full time step Ekin, + * take the first half step only to compute the + * virial for the first step. From there, + * revert back to the initial coordinates + * so that the input is actually the initial step. + */ + snew(vbuf, state->natoms); + copy_rvecn(as_rvec_array(state->v.data()), vbuf, 0, state->natoms); /* should make this better for parallelizing? */ + } + else + { + /* this is for NHC in the Ekin(t+dt/2) version of vv */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); + } + + update_coords(fplog, step, ir, mdatoms, state, f, fcd, + ekind, M, upd, etrtVELOCITY1, + cr, constr); + + if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. */ + { + wallcycle_stop(wcycle, ewcUPDATE); + update_constraints(fplog, step, nullptr, ir, mdatoms, + state, fr->bMolPBC, graph, f, + &top->idef, shake_vir, + cr, nrnb, wcycle, upd, constr, + TRUE, bCalcVir); + wallcycle_start(wcycle, ewcUPDATE); + } + else if (graph) + { + /* Need to unshift here if a do_force has been + called in the previous step */ + unshift_self(graph, state->box, as_rvec_array(state->x.data())); + } + /* if VV, compute the pressure and constraints */ + /* For VV2, we strictly only need this if using pressure + * control, but we really would like to have accurate pressures + * printed out. + * Think about ways around this in the future? + * For now, keep this choice in comments. 
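
[Editor's aside — illustrative sketch, not part of the patch] The PLUMED hook above drives one biased force evaluation per MD step. Condensed into a standalone driver using only plumed_cmd() keys that appear in the patch (the argument types are a sketch; a real caller must pass GROMACS reals matching the PLUMED build's precision, and the exact call ordering around do_force follows the hook above):

#include "Plumed.h" // PLUMED C interface, as included by the patch

// One MD step's worth of PLUMED calls; all arrays are caller-provided.
void plumedStep(plumed p, long int step, double* x, double* f,
                double* masses, double* charges, double* box,
                double* virial, double epot)
{
    plumed_cmd(p, "setStepLong",  &step);
    plumed_cmd(p, "setPositions", x);
    plumed_cmd(p, "setMasses",    masses);
    plumed_cmd(p, "setCharges",   charges);
    plumed_cmd(p, "setBox",       box);
    plumed_cmd(p, "prepareCalc",  NULL);
    plumed_cmd(p, "setForces",    f);      // PLUMED adds bias forces in place
    plumed_cmd(p, "setVirial",    virial); // bias virial accumulated here
    int needsEnergy = 0;
    plumed_cmd(p, "isEnergyNeeded", &needsEnergy);
    if (needsEnergy)
    {
        plumed_cmd(p, "setEnergy", &epot); // only when an energy-dependent CV is active
    }
    plumed_cmd(p, "performCalc", NULL);
}
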
+ */ + /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */ + /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/ + bPres = TRUE; + bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK)); + if (bCalcEner && ir->eI == eiVVAK) + { + bSumEkinhOld = TRUE; + } + /* for vv, the first half of the integration actually corresponds to the previous step. + So we need information from the last step in the first half of the integration */ + if (bGStat || do_per_step(step-1, nstglobalcomm)) + { + wallcycle_stop(wcycle, ewcUPDATE); + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + (bGStat ? CGLO_GSTAT : 0) + | CGLO_ENERGY + | (bTemp ? CGLO_TEMPERATURE : 0) + | (bPres ? CGLO_PRESSURE : 0) + | (bPres ? CGLO_CONSTRAINT : 0) + | (bStopCM ? CGLO_STOPCM : 0) + | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0) + | CGLO_SCALEEKIN + ); + /* explanation of above: + a) We compute Ekin at the full time step + if 1) we are using the AveVel Ekin, and it's not the + initial step, or 2) if we are using AveEkin, but need the full + time step kinetic energy for the pressure (always true now, since we want accurate statistics). + b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in + EkinAveVel because it's needed for the pressure */ + checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + wallcycle_start(wcycle, ewcUPDATE); + } + /* temperature scaling and pressure scaling to produce the extended variables at t+dt */ + if (!bInitStep) + { + if (bTrotter) + { + m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2); + + /* TODO This is only needed when we're about to write + * a checkpoint, because we use it after the restart + * (in a kludge?). But what should we be doing if + * startingFromCheckpoint or bInitStep are true? */ + if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)) + { + copy_mat(shake_vir, state->svir_prev); + copy_mat(force_vir, state->fvir_prev); + } + if (inputrecNvtTrotter(ir) && ir->eI == eiVV) + { + /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */ + enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE); + enerd->term[F_EKIN] = trace(ekind->ekin); + } + } + else if (bExchanged) + { + wallcycle_stop(wcycle, ewcUPDATE); + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . 
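
[Editor's aside — illustrative sketch, not part of the patch] As the Ekin-averaging comments above note, leap-frog temperature comes from the average of the two half-step kinetic energies bracketing time t. With Ndf degrees of freedom and Boltzmann constant kB, that is:

// T = 2 <Ekin> / (Ndf * kB), with <Ekin> the mean of the half-step values.
double temperatureFromHalfSteps(double ekinhOld, double ekinhNew,
                                double ndf, double kB)
{
    const double ekin = 0.5 * (ekinhOld + ekinhNew);
    return 2.0 * ekin / (ndf * kB);
}
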
*/ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot, + constr, &nullSignaller, state->box, + nullptr, &bSumEkinhOld, + CGLO_GSTAT | CGLO_TEMPERATURE); + wallcycle_start(wcycle, ewcUPDATE); + } + } + /* if it's the initial step, we performed this first step just to get the constraint virial */ + if (ir->eI == eiVV && bInitStep) + { + copy_rvecn(vbuf, as_rvec_array(state->v.data()), 0, state->natoms); + sfree(vbuf); + } + wallcycle_stop(wcycle, ewcUPDATE); + } + + /* compute the conserved quantity */ + if (EI_VV(ir->eI)) + { + saved_conserved_quantity = NPT_energy(ir, state, &MassQ); + if (ir->eI == eiVV) + { + last_ekin = enerd->term[F_EKIN]; + } + if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) + { + saved_conserved_quantity -= enerd->term[F_DISPCORR]; + } + /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */ + if (ir->efep != efepNO && !bRerunMD) + { + sum_dhdl(enerd, state->lambda, ir->fepvals); + } + } + + /* ######## END FIRST UPDATE STEP ############## */ + /* ######## If doing VV, we now have v(dt) ###### */ + if (bDoExpanded) + { + /* perform extended ensemble sampling in lambda - we don't + actually move to the new state before outputting + statistics, but if performing simulated tempering, we + do update the velocities and the tau_t. */ + + lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, state->dfhist, step, as_rvec_array(state->v.data()), mdatoms); + /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */ + if (MASTER(cr)) + { + copy_df_history(state_global->dfhist, state->dfhist); + } + } + + /* Now we have the energies and forces corresponding to the + * coordinates at time t. We must output all of this before + * the update. + */ + do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, + ir, state, state_global, observablesHistory, + top_global, fr, + outf, mdebin, ekind, f, + &nchkpt, + bCPT, bRerunMD, bLastStep, + mdrunOptions.writeConfout, + bSumEkinhOld); + /* Check if IMD step and do IMD communication, if bIMD is TRUE. */ + bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, as_rvec_array(state->x.data()), ir, t, wcycle); + + /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */ + if (startingFromCheckpoint && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))) + { + copy_mat(state->svir_prev, shake_vir); + copy_mat(state->fvir_prev, force_vir); + } + + elapsed_time = walltime_accounting_get_current_elapsed_time(walltime_accounting); + + /* Check whether everything is still allright */ + if (((int)gmx_get_stop_condition() > handled_stop_condition) +#if GMX_THREAD_MPI + && MASTER(cr) +#endif + ) + { + int nsteps_stop = -1; + + /* this just makes signals[].sig compatible with the hack + of sending signals around by MPI_Reduce together with + other floats */ + if ((gmx_get_stop_condition() == gmx_stop_cond_next_ns) || + (mdrunOptions.reproducible && + gmx_get_stop_condition() == gmx_stop_cond_next)) + { + /* We need at least two global communication steps to pass + * around the signal. We stop at a pair-list creation step + * to allow for exact continuation, when possible. 
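
[Editor's aside — illustrative sketch, not part of the patch] The nsteps_stop value set just below encodes both constraints from the comment above: the signal needs two global-communication rounds to reach every rank, and exact continuation requires stopping on a pair-list (NS) step:

#include <algorithm>

// How far away the run may stop after a signal, as computed below.
int stepsUntilStop(int nstlist, int nstSignalComm, bool stopAtNextNsStep)
{
    return stopAtNextNsStep ? std::max(nstlist, 2 * nstSignalComm)
                            : nstSignalComm + 1; // breaks exact continuation
}
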
+ */ + signals[eglsSTOPCOND].sig = 1; + nsteps_stop = std::max(ir->nstlist, 2*nstSignalComm); + } + else if (gmx_get_stop_condition() == gmx_stop_cond_next) + { + /* Stop directly after the next global communication step. + * This breaks exact continuation. + */ + signals[eglsSTOPCOND].sig = -1; + nsteps_stop = nstSignalComm + 1; + } + if (fplog) + { + fprintf(fplog, + "\n\nReceived the %s signal, stopping within %d steps\n\n", + gmx_get_signal_name(), nsteps_stop); + fflush(fplog); + } + fprintf(stderr, + "\n\nReceived the %s signal, stopping within %d steps\n\n", + gmx_get_signal_name(), nsteps_stop); + fflush(stderr); + handled_stop_condition = (int)gmx_get_stop_condition(); + } + else if (MASTER(cr) && (bNS || ir->nstlist <= 0) && + (max_hours > 0 && elapsed_time > max_hours*60.0*60.0*0.99) && + signals[eglsSTOPCOND].sig == 0 && signals[eglsSTOPCOND].set == 0) + { + /* Signal to terminate the run */ + signals[eglsSTOPCOND].sig = 1; + if (fplog) + { + fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); + } + fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); + } + + if (bResetCountersHalfMaxH && MASTER(cr) && + elapsed_time > max_hours*60.0*60.0*0.495) + { + /* Set flag that will communicate the signal to all ranks in the simulation */ + signals[eglsRESETCOUNTERS].sig = 1; + } + + /* In parallel we only have to check for checkpointing in steps + * where we do global communication, + * otherwise the other nodes don't know. + */ + const real cpt_period = mdrunOptions.checkpointOptions.period; + if (MASTER(cr) && ((bGStat || !PAR(cr)) && + cpt_period >= 0 && + (cpt_period == 0 || + elapsed_time >= nchkpt*cpt_period*60.0)) && + signals[eglsCHKPT].set == 0) + { + signals[eglsCHKPT].sig = 1; + } + + /* ######### START SECOND UPDATE STEP ################# */ + + /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen controlled + in preprocessing */ + + if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */ + { + gmx_bool bIfRandomize; + bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state, upd, constr); + /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */ + if (constr && bIfRandomize) + { + update_constraints(fplog, step, nullptr, ir, mdatoms, + state, fr->bMolPBC, graph, f, + &top->idef, tmp_vir, + cr, nrnb, wcycle, upd, constr, + TRUE, bCalcVir); + } + } + /* Box is changed in update() when we do pressure coupling, + * but we should still use the old box for energy corrections and when + * writing it to the energy file, so it matches the trajectory files for + * the same timestep above. Make a copy in a separate array. + */ + copy_mat(state->box, lastbox); + + dvdl_constr = 0; + + if (!bRerunMD || rerun_fr.bV || bForceUpdate) + { + wallcycle_start(wcycle, ewcUPDATE); + /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */ + if (bTrotter) + { + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3); + /* We can only do Berendsen coupling after we have summed + * the kinetic energy or virial. Since the happens + * in global_state after update, we should only do it at + * step % nstlist = 1 with bGStatEveryStep=FALSE. 
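
[Editor's aside — illustrative sketch, not part of the patch] The elapsed-time checkpoint trigger above reads as a small predicate: a negative period disables checkpointing, zero means "checkpoint at every opportunity", and otherwise the signal is raised once nchkpt periods (in minutes) have elapsed:

// Mirror of the cpt_period test above (elapsed time in seconds).
bool timeForCheckpoint(double elapsedSeconds, double periodMinutes, int nchkpt)
{
    return periodMinutes >= 0
           && (periodMinutes == 0 || elapsedSeconds >= nchkpt * periodMinutes * 60.0);
}
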
+ */ + } + else + { + update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); + update_pcouple_before_coordinates(fplog, step, ir, state, + parrinellorahmanMu, M, + bInitStep); + } + + if (EI_VV(ir->eI)) + { + /* velocity half-step update */ + update_coords(fplog, step, ir, mdatoms, state, f, fcd, + ekind, M, upd, etrtVELOCITY2, + cr, constr); + } + + /* Above, initialize just copies ekinh into ekin, + * it doesn't copy position (for VV), + * and entire integrator for MD. + */ + + if (ir->eI == eiVVAK) + { + /* We probably only need md->homenr, not state->natoms */ + if (state->natoms > cbuf_nalloc) + { + cbuf_nalloc = state->natoms; + srenew(cbuf, cbuf_nalloc); + } + copy_rvecn(as_rvec_array(state->x.data()), cbuf, 0, state->natoms); + } + + update_coords(fplog, step, ir, mdatoms, state, f, fcd, + ekind, M, upd, etrtPOSITION, cr, constr); + wallcycle_stop(wcycle, ewcUPDATE); + + update_constraints(fplog, step, &dvdl_constr, ir, mdatoms, state, + fr->bMolPBC, graph, f, + &top->idef, shake_vir, + cr, nrnb, wcycle, upd, constr, + FALSE, bCalcVir); + + if (ir->eI == eiVVAK) + { + /* erase F_EKIN and F_TEMP here? */ + /* just compute the kinetic energy at the half step to perform a trotter step */ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, lastbox, + nullptr, &bSumEkinhOld, + (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE + ); + wallcycle_start(wcycle, ewcUPDATE); + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4); + /* now we know the scaling, we can compute the positions again again */ + copy_rvecn(cbuf, as_rvec_array(state->x.data()), 0, state->natoms); + + update_coords(fplog, step, ir, mdatoms, state, f, fcd, + ekind, M, upd, etrtPOSITION, cr, constr); + wallcycle_stop(wcycle, ewcUPDATE); + + /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */ + /* are the small terms in the shake_vir here due + * to numerical errors, or are they important + * physically? I'm thinking they are just errors, but not completely sure. + * For now, will call without actually constraining, constr=NULL*/ + update_constraints(fplog, step, nullptr, ir, mdatoms, + state, fr->bMolPBC, graph, f, + &top->idef, tmp_vir, + cr, nrnb, wcycle, upd, nullptr, + FALSE, bCalcVir); + } + if (EI_VV(ir->eI)) + { + /* this factor or 2 correction is necessary + because half of the constraint force is removed + in the vv step, so we have to double it. See + the Redmine issue #1255. It is not yet clear + if the factor of 2 is exact, or just a very + good approximation, and this will be + investigated. The next step is to see if this + can be done adding a dhdl contribution from the + rattle step, but this is somewhat more + complicated with the current code. Will be + investigated, hopefully for 4.6.3. However, + this current solution is much better than + having it completely wrong. 
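+
+                   (Editor's note, an illustration rather than upstream
+                   text: this is why the VV branch below accumulates
+                   enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr, while the
+                   non-VV branch adds dvdl_constr only once.)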
+ */ + enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr; + } + else + { + enerd->term[F_DVDL_CONSTR] += dvdl_constr; + } + } + else if (graph) + { + /* Need to unshift here */ + unshift_self(graph, state->box, as_rvec_array(state->x.data())); + } + + if (vsite != nullptr) + { + wallcycle_start(wcycle, ewcVSITECONSTR); + if (graph != nullptr) + { + shift_self(graph, state->box, as_rvec_array(state->x.data())); + } + construct_vsites(vsite, as_rvec_array(state->x.data()), ir->delta_t, as_rvec_array(state->v.data()), + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state->box); + + if (graph != nullptr) + { + unshift_self(graph, state->box, as_rvec_array(state->x.data())); + } + wallcycle_stop(wcycle, ewcVSITECONSTR); + } + + /* ############## IF NOT VV, Calculate globals HERE ############ */ + /* With Leap-Frog we can skip compute_globals at + * non-communication steps, but we need to calculate + * the kinetic energy one step before communication. + */ + { + // Organize to do inter-simulation signalling on steps if + // and when algorithms require it. + bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm)); + + if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal) + { + // Since we're already communicating at this step, we + // can propagate intra-simulation signals. Note that + // check_nstglobalcomm has the responsibility for + // choosing the value of nstglobalcomm that is one way + // bGStat becomes true, so we can't get into a + // situation where e.g. checkpointing can't be + // signalled. + bool doIntraSimSignal = true; + SimulationSignaller signaller(&signals, cr, doInterSimSignal, doIntraSimSignal); + + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &signaller, + lastbox, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + (bGStat ? CGLO_GSTAT : 0) + | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0) + | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0) + | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) + | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) + | CGLO_CONSTRAINT + | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0) + ); + checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + } + } + + /* ############# END CALC EKIN AND PRESSURE ################# */ + + /* Note: this is OK, but there are some numerical precision issues with using the convergence of + the virial that should probably be addressed eventually. state->veta has better properies, + but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could + generate the new shake_vir, but test the veta value for convergence. This will take some thought. */ + + if (ir->efep != efepNO && (!EI_VV(ir->eI) || bRerunMD)) + { + /* Sum up the foreign energy and dhdl terms for md and sd. + Currently done every step so that dhdl is correct in the .edr */ + sum_dhdl(enerd, state->lambda, ir->fepvals); + } + + update_pcouple_after_coordinates(fplog, step, ir, mdatoms, + pres, force_vir, shake_vir, + parrinellorahmanMu, + state, nrnb, upd); + + /* ################# END UPDATE STEP 2 ################# */ + /* #### We now have r(t+dt) and v(t+dt/2) ############# */ + + /* The coordinates (x) were unshifted in update */ + if (!bGStat) + { + /* We will not sum ekinh_old, + * so signal that we still have to do it. 
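+             *
+             * (Editor's note, not upstream text: the flag is handed to the
+             * next compute_globals call, so the half-step kinetic energy
+             * skipped here can still be summed at the next global
+             * communication step.)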
+ */ + bSumEkinhOld = TRUE; + } + + if (bCalcEner) + { + /* ######### BEGIN PREPARING EDR OUTPUT ########### */ + + /* use the directly determined last velocity, not actually the averaged half steps */ + if (bTrotter && ir->eI == eiVV) + { + enerd->term[F_EKIN] = last_ekin; + } + enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN]; + + if (integratorHasConservedEnergyQuantity(ir)) + { + if (EI_VV(ir->eI)) + { + enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity; + } + else + { + enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ); + } + } + /* ######### END PREPARING EDR OUTPUT ########### */ + } + + /* Output stuff */ + if (MASTER(cr)) + { + if (fplog && do_log && bDoExpanded) + { + /* only needed if doing expanded ensemble */ + PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : nullptr, + state_global->dfhist, state->fep_state, ir->nstlog, step); + } + if (bCalcEner) + { + upd_mdebin(mdebin, bDoDHDL, bCalcEnerStep, + t, mdatoms->tmass, enerd, state, + ir->fepvals, ir->expandedvals, lastbox, + shake_vir, force_vir, total_vir, pres, + ekind, mu_tot, constr); + } + else + { + upd_mdebin_step(mdebin); + } + + gmx_bool do_dr = do_per_step(step, ir->nstdisreout); + gmx_bool do_or = do_per_step(step, ir->nstorireout); + + print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : nullptr, + step, t, + eprNORMAL, mdebin, fcd, groups, &(ir->opts), ir->awh); + + if (ir->bPull) + { + pull_print_output(ir->pull_work, step, t); + } + + if (do_per_step(step, ir->nstlog)) + { + if (fflush(fplog) != 0) + { + gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?"); + } + } + } + if (bDoExpanded) + { + /* Have to do this part _after_ outputting the logfile and the edr file */ + /* Gets written into the state at the beginning of next loop*/ + state->fep_state = lamnew; + } + /* Print the remaining wall clock time for the run */ + if (MULTIMASTER(cr) && + (do_verbose || gmx_got_usr_signal()) && + !bPMETunePrinting) + { + if (shellfc) + { + fprintf(stderr, "\n"); + } + print_time(stderr, walltime_accounting, step, ir, cr); + } + + /* Ion/water position swapping. + * Not done in last step since trajectory writing happens before this call + * in the MD loop and exchanges would be lost anyway. */ + bNeedRepartition = FALSE; + if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && + do_per_step(step, ir->swap->nstswap)) + { + bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle, + bRerunMD ? rerun_fr.x : as_rvec_array(state->x.data()), + bRerunMD ? 
rerun_fr.box : state->box, + MASTER(cr) && mdrunOptions.verbose, + bRerunMD); + + if (bNeedRepartition && DOMAINDECOMP(cr)) + { + dd_collect_state(cr->dd, state, state_global); + } + } + + /* Replica exchange */ + bExchanged = FALSE; + if (bDoReplEx) + { + bExchanged = replica_exchange(fplog, cr, repl_ex, + state_global, enerd, + state, step, t); + } + + if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) ) + { + dd_partition_system(fplog, step, cr, TRUE, 1, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, wcycle, FALSE); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + } + + bFirstStep = FALSE; + bInitStep = FALSE; + startingFromCheckpoint = false; + + /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */ + /* With all integrators, except VV, we need to retain the pressure + * at the current step for coupling at the next step. + */ + if ((state->flags & (1<<estPRES_PREV)) && + (bGStatEveryStep || + (ir->nstpcouple > 0 && step % ir->nstpcouple == 0))) + { + /* Store the pressure in t_state for pressure coupling + * at the next MD step. + */ + copy_mat(pres, state->pres_prev); + } + + /* ####### END SET VARIABLES FOR NEXT ITERATION ###### */ + + if ( (membed != nullptr) && (!bLastStep) ) + { + rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data())); + } + + if (bRerunMD) + { + if (MASTER(cr)) + { + /* read next frame from input trajectory */ + bLastStep = !read_next_frame(oenv, status, &rerun_fr); + } + + if (PAR(cr)) + { + rerun_parallel_comm(cr, &rerun_fr, &bLastStep); + } + } + + cycles = wallcycle_stop(wcycle, ewcSTEP); + if (DOMAINDECOMP(cr) && wcycle) + { + dd_cycles_add(cr->dd, cycles, ddCyclStep); + } + + if (!bRerunMD || !rerun_fr.bStep) + { + /* increase the MD step number */ + step++; + step_rel++; + } + + /* TODO make a counter-reset module */ + /* If it is time to reset counters, set a flag that remains + true until counters actually get reset */ + if (step_rel == wcycle_get_reset_counters(wcycle) || + signals[eglsRESETCOUNTERS].set != 0) + { + if (pme_loadbal_is_active(pme_loadbal)) + { + /* Do not permit counter reset while PME load + * balancing is active. The only purpose for resetting + * counters is to measure reliable performance data, + * and that can't be done before balancing + * completes. + * + * TODO consider fixing this by delaying the reset + * until after load balancing completes, + * e.g. https://gerrit.gromacs.org/#/c/4964/2 */ + gmx_fatal(FARGS, "PME tuning was still active when attempting to " + "reset mdrun counters at step %" GMX_PRId64 ". Try " + "resetting counters later in the run, e.g. with gmx " + "mdrun -resetstep.", step); + } + reset_all_counters(fplog, mdlog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting, + use_GPU(fr->nbv) ? fr->nbv : nullptr, fr->pmedata); + wcycle_set_reset_counters(wcycle, -1); + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell our PME node to reset its counters */ + gmx_pme_send_resetcounters(cr, step); + } + /* Correct max_hours for the elapsed time */ + max_hours -= elapsed_time/(60.0*60.0); + /* If mdrun -maxh -resethway was active, it can only trigger once */ + bResetCountersHalfMaxH = FALSE; /* TODO move this to where signals[eglsRESETCOUNTERS].sig is set */ + /* Reset can only happen once, so clear the triggering flag. 
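+         *
+         * (Editor's sketch, not upstream code: the bookkeeping done above
+         * and inside reset_all_counters() amounts to
+         *     ir->init_step += step_rel;
+         *     ir->nsteps   -= step_rel;
+         *     step_rel      = 0;
+         *     max_hours    -= elapsed_time/3600.0;
+         * so that later -maxh checks measure against the restarted clock.)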
*/ + signals[eglsRESETCOUNTERS].set = 0; + } + + /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */ + IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle); + + } + /* End of main MD loop */ + + /* Closing TNG files can include compressing data. Therefore it is good to do that + * before stopping the time measurements. */ + mdoutf_tng_close(outf); + + /* Stop measuring walltime */ + walltime_accounting_end(walltime_accounting); + + if (bRerunMD && MASTER(cr)) + { + close_trx(status); + } + + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + if (MASTER(cr)) + { + if (ir->nstcalcenergy > 0 && !bRerunMD) + { + print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t, + eprAVER, mdebin, fcd, groups, &(ir->opts), ir->awh); + } + } + // TODO Enable the next line when merging to master branch + // done_ebin(mdebin->ebin); + done_mdoutf(outf); + + if (bPMETune) + { + pme_loadbal_done(pme_loadbal, fplog, mdlog, use_GPU(fr->nbv)); + } + + done_shellfc(fplog, shellfc, step_rel); + + if (useReplicaExchange && MASTER(cr)) + { + print_replica_exchange_statistics(fplog, repl_ex); + } + + if (ir->bDoAwh) + { + delete ir->awh; + } + + // Clean up swapcoords + if (ir->eSwapCoords != eswapNO) + { + finish_swapcoords(ir->swap); + } + + /* Do essential dynamics cleanup if needed. Close .edo file */ + done_ed(&ed); + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(ir->bIMD, ir->imd); + + walltime_accounting_set_nsteps_done(walltime_accounting, step_rel); + if (step_rel >= wcycle_get_reset_counters(wcycle) && + signals[eglsRESETCOUNTERS].set == 0 && + !bResetCountersHalfMaxH) + { + walltime_accounting_set_valid_finish(walltime_accounting); + } + + return 0; +} diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/md.cpp.preplumed b/patches/gromacs-2018.1.diff/src/programs/mdrun/md.cpp.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..1695e0b72d53729f4ece973596131895d9a122b8 --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/md.cpp.preplumed @@ -0,0 +1,2049 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#include "gmxpre.h" + +#include "md.h" + +#include "config.h" + +#include <stdio.h> +#include <stdlib.h> + +#include <cmath> + +#include <algorithm> +#include <memory> + +#include "thread_mpi/threads.h" + +#include "gromacs/awh/awh.h" +#include "gromacs/commandline/filenm.h" +#include "gromacs/compat/make_unique.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_network.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/essentialdynamics/edsam.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/ewald/pme-load-balancing.h" +#include "gromacs/fileio/trxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/imd/imd.h" +#include "gromacs/listed-forces/manage-threading.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/utilities.h" +#include "gromacs/math/vec.h" +#include "gromacs/math/vectypes.h" +#include "gromacs/mdlib/compute_io.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/ebin.h" +#include "gromacs/mdlib/expanded.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/force_flags.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdebin.h" +#include "gromacs/mdlib/mdoutf.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/mdsetup.h" +#include "gromacs/mdlib/nb_verlet.h" +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/shellfc.h" +#include "gromacs/mdlib/sighandler.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/simulationsignal.h" +#include "gromacs/mdlib/tgroup.h" +#include "gromacs/mdlib/trajectory_writing.h" +#include "gromacs/mdlib/update.h" +#include "gromacs/mdlib/vcm.h" +#include "gromacs/mdlib/vsite.h" +#include "gromacs/mdtypes/awh-history.h" +#include "gromacs/mdtypes/awh-params.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/df_history.h" +#include "gromacs/mdtypes/energyhistory.h" +#include "gromacs/mdtypes/fcdata.h" +#include "gromacs/mdtypes/forcerec.h" +#include "gromacs/mdtypes/group.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/interaction_const.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/mdatom.h" +#include "gromacs/mdtypes/observableshistory.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/pulling/pull.h" +#include "gromacs/swap/swapcoords.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/timing/walltime_accounting.h" +#include "gromacs/topology/atoms.h" +#include "gromacs/topology/idef.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/topology/topology.h" +#include "gromacs/trajectory/trajectoryframe.h" +#include "gromacs/utility/basedefinitions.h" +#include 
"gromacs/utility/cstringutil.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/real.h" +#include "gromacs/utility/smalloc.h" + +#include "deform.h" +#include "membed.h" +#include "repl_ex.h" + +#ifdef GMX_FAHCORE +#include "corewrap.h" +#endif + +using gmx::SimulationSignaller; + +/*! \brief Check whether bonded interactions are missing, if appropriate + * + * \param[in] fplog Log file pointer + * \param[in] cr Communication object + * \param[in] totalNumberOfBondedInteractions Result of the global reduction over the number of bonds treated in each domain + * \param[in] top_global Global topology for the error message + * \param[in] top_local Local topology for the error message + * \param[in] state Global state for the error message + * \param[inout] shouldCheckNumberOfBondedInteractions Whether we should do the check. + * + * \return Nothing, except that shouldCheckNumberOfBondedInteractions + * is always set to false after exit. + */ +static void checkNumberOfBondedInteractions(FILE *fplog, t_commrec *cr, int totalNumberOfBondedInteractions, + gmx_mtop_t *top_global, gmx_localtop_t *top_local, t_state *state, + bool *shouldCheckNumberOfBondedInteractions) +{ + if (*shouldCheckNumberOfBondedInteractions) + { + if (totalNumberOfBondedInteractions != cr->dd->nbonded_global) + { + dd_print_missing_interactions(fplog, cr, totalNumberOfBondedInteractions, top_global, top_local, state); // Does not return + } + *shouldCheckNumberOfBondedInteractions = false; + } +} + +static void reset_all_counters(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr, + gmx_int64_t step, + gmx_int64_t *step_rel, t_inputrec *ir, + gmx_wallcycle_t wcycle, t_nrnb *nrnb, + gmx_walltime_accounting_t walltime_accounting, + struct nonbonded_verlet_t *nbv, + struct gmx_pme_t *pme) +{ + char sbuf[STEPSTRSIZE]; + + /* Reset all the counters related to performance over the run */ + GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( + "step %s: resetting all time and cycle counters", + gmx_step_str(step, sbuf)); + + if (use_GPU(nbv)) + { + nbnxn_gpu_reset_timings(nbv); + } + + if (pme_gpu_task_enabled(pme)) + { + pme_gpu_reset_timings(pme); + } + + if (use_GPU(nbv) || pme_gpu_task_enabled(pme)) + { + resetGpuProfiler(); + } + + wallcycle_stop(wcycle, ewcRUN); + wallcycle_reset_all(wcycle); + if (DOMAINDECOMP(cr)) + { + reset_dd_statistics_counters(cr->dd); + } + init_nrnb(nrnb); + ir->init_step += *step_rel; + ir->nsteps -= *step_rel; + *step_rel = 0; + wallcycle_start(wcycle, ewcRUN); + walltime_accounting_start(walltime_accounting); + print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime()); +} + +/*! 
\brief Copy the state from \p rerunFrame to \p globalState and, if requested, construct vsites + * + * \param[in] rerunFrame The trajectory frame to compute energy/forces for + * \param[in,out] globalState The global state container + * \param[in] constructVsites When true, vsite coordinates are constructed + * \param[in] vsite Vsite setup, can be nullptr when \p constructVsites = false + * \param[in] idef Topology parameters, used for constructing vsites + * \param[in] timeStep Time step, used for constructing vsites + * \param[in] forceRec Force record, used for constructing vsites + * \param[in,out] graph The molecular graph, used for constructing vsites when != nullptr + * \param[in,out] warnWhenNoV When true, issue a warning when no velocities are present in \p rerunFrame; is set to false when a warning was issued + */ +static void prepareRerunState(const t_trxframe &rerunFrame, + t_state *globalState, + bool constructVsites, + const gmx_vsite_t *vsite, + const t_idef &idef, + double timeStep, + const t_forcerec &forceRec, + t_graph *graph, + gmx_bool *warnWhenNoV) +{ + for (int i = 0; i < globalState->natoms; i++) + { + copy_rvec(rerunFrame.x[i], globalState->x[i]); + } + if (rerunFrame.bV) + { + for (int i = 0; i < globalState->natoms; i++) + { + copy_rvec(rerunFrame.v[i], globalState->v[i]); + } + } + else + { + for (int i = 0; i < globalState->natoms; i++) + { + clear_rvec(globalState->v[i]); + } + if (*warnWhenNoV) + { + fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n" + " Ekin, temperature and pressure are incorrect,\n" + " the virial will be incorrect when constraints are present.\n" + "\n"); + *warnWhenNoV = FALSE; + } + } + copy_mat(rerunFrame.box, globalState->box); + + if (constructVsites) + { + GMX_ASSERT(vsite, "Need valid vsite for constructing vsites"); + + if (graph) + { + /* Following is necessary because the graph may get out of sync + * with the coordinates if we only have every N'th coordinate set + */ + mk_mshift(nullptr, graph, forceRec.ePBC, globalState->box, as_rvec_array(globalState->x.data())); + shift_self(graph, globalState->box, as_rvec_array(globalState->x.data())); + } + construct_vsites(vsite, as_rvec_array(globalState->x.data()), timeStep, as_rvec_array(globalState->v.data()), + idef.iparams, idef.il, + forceRec.ePBC, forceRec.bMolPBC, nullptr, globalState->box); + if (graph) + { + unshift_self(graph, globalState->box, as_rvec_array(globalState->x.data())); + } + } +} + +/*! 
\libinternal + \copydoc integrator_t (FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + t_mdatoms *mdatoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_forcerec *fr, + const ReplicaExchangeParameters &replExParams, + gmx_membed_t *membed, + gmx_walltime_accounting_t walltime_accounting) + */ +double gmx::do_md(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, + int nfile, const t_filenm fnm[], + const gmx_output_env_t *oenv, + const MdrunOptions &mdrunOptions, + gmx_vsite_t *vsite, gmx_constr_t constr, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *ir, + gmx_mtop_t *top_global, + t_fcdata *fcd, + t_state *state_global, + ObservablesHistory *observablesHistory, + gmx::MDAtoms *mdAtoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + t_forcerec *fr, + const ReplicaExchangeParameters &replExParams, + gmx_membed_t *membed, + gmx_walltime_accounting_t walltime_accounting) +{ + gmx_mdoutf_t outf = nullptr; + gmx_int64_t step, step_rel; + double elapsed_time; + double t, t0, lam0[efptNR]; + gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner; + gmx_bool bNS, bNStList, bSimAnn, bStopCM, + bFirstStep, bInitStep, bLastStep = FALSE, + bBornRadii; + gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE; + gmx_bool do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE, + bForceUpdate = FALSE, bCPT; + gmx_bool bMasterState; + int force_flags, cglo_flags; + tensor force_vir, shake_vir, total_vir, tmp_vir, pres; + int i, m; + t_trxstatus *status; + rvec mu_tot; + t_vcm *vcm; + matrix parrinellorahmanMu, M; + t_trxframe rerun_fr; + gmx_repl_ex_t repl_ex = nullptr; + int nchkpt = 1; + gmx_localtop_t *top; + t_mdebin *mdebin = nullptr; + gmx_enerdata_t *enerd; + PaddedRVecVector f {}; + gmx_global_stat_t gstat; + gmx_update_t *upd = nullptr; + t_graph *graph = nullptr; + gmx_groups_t *groups; + gmx_ekindata_t *ekind; + gmx_shellfc_t *shellfc; + gmx_bool bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition; + gmx_bool bResetCountersHalfMaxH = FALSE; + gmx_bool bTemp, bPres, bTrotter; + real dvdl_constr; + rvec *cbuf = nullptr; + int cbuf_nalloc = 0; + matrix lastbox; + int lamnew = 0; + /* for FEP */ + int nstfep = 0; + double cycles; + real saved_conserved_quantity = 0; + real last_ekin = 0; + t_extmass MassQ; + int **trotter_seq; + char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE]; + int handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/ + + + /* PME load balancing data for GPU kernels */ + pme_load_balancing_t *pme_loadbal = nullptr; + gmx_bool bPMETune = FALSE; + gmx_bool bPMETunePrinting = FALSE; + + /* Interactive MD */ + gmx_bool bIMDstep = FALSE; + +#ifdef GMX_FAHCORE + /* Temporary addition for FAHCORE checkpointing */ + int chkpt_ret; +#endif + /* Domain decomposition could incorrectly miss a bonded + interaction, but checking for that requires a global + communication stage, which does not otherwise happen in DD + code. So we do that alongside the first global energy reduction + after a new DD is made. These variables handle whether the + check happens, and the result it returns. 
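+
+       (Editor's note, not upstream text: the reduced count lands in
+       totalNumberOfBondedInteractions and is compared with
+       cr->dd->nbonded_global by checkNumberOfBondedInteractions()
+       defined earlier in this file; a mismatch aborts the run.)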
*/ + bool shouldCheckNumberOfBondedInteractions = false; + int totalNumberOfBondedInteractions = -1; + + SimulationSignals signals; + // Most global communnication stages don't propagate mdrun + // signals, and will use this object to achieve that. + SimulationSignaller nullSignaller(nullptr, nullptr, false, false); + + if (!mdrunOptions.writeConfout) + { + // This is on by default, and the main known use case for + // turning it off is for convenience in benchmarking, which is + // something that should not show up in the general user + // interface. + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -noconfout functionality is deprecated, and may be removed in a future version."); + } + if (mdrunOptions.timingOptions.resetHalfway) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -resethway functionality is deprecated, and may be removed in a future version."); + if (ir->nsteps > 0) + { + /* Signal to reset the counters half the simulation steps. */ + wcycle_set_reset_counters(wcycle, ir->nsteps/2); + } + /* Signal to reset the counters halfway the simulation time. */ + bResetCountersHalfMaxH = (mdrunOptions.maximumHoursToRun > 0); + } + + /* md-vv uses averaged full step velocities for T-control + md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control) + md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */ + bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir))); + + const gmx_bool bRerunMD = mdrunOptions.rerun; + int nstglobalcomm = mdrunOptions.globalCommunicationInterval; + + if (bRerunMD) + { + /* Since we don't know if the frames read are related in any way, + * rebuild the neighborlist at every step. 
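+         * (Editor's note, not upstream text: this is why nstlist,
+         * nstcalcenergy and nstglobalcomm are all forced to 1 just below,
+         * giving every rerun frame a fresh pair list, an energy evaluation
+         * and a global reduction.)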
+ */ + ir->nstlist = 1; + ir->nstcalcenergy = 1; + nstglobalcomm = 1; + } + + nstglobalcomm = check_nstglobalcomm(mdlog, nstglobalcomm, ir); + bGStatEveryStep = (nstglobalcomm == 1); + + if (bRerunMD) + { + ir->nstxout_compressed = 0; + } + groups = &top_global->groups; + + gmx_edsam *ed = nullptr; + if (opt2bSet("-ei", nfile, fnm) || observablesHistory->edsamHistory != nullptr) + { + /* Initialize essential dynamics sampling */ + ed = init_edsam(opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), + top_global, + ir, cr, constr, + state_global, observablesHistory, + oenv, mdrunOptions.continuationOptions.appendFiles); + } + + if (ir->eSwapCoords != eswapNO) + { + /* Initialize ion swapping code */ + init_swapcoords(fplog, ir, opt2fn_master("-swap", nfile, fnm, cr), + top_global, + state_global, observablesHistory, + cr, oenv, mdrunOptions); + } + + /* Initial values */ + init_md(fplog, cr, outputProvider, ir, oenv, mdrunOptions, + &t, &t0, state_global, lam0, + nrnb, top_global, &upd, + nfile, fnm, &outf, &mdebin, + force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, wcycle); + + clear_mat(total_vir); + clear_mat(pres); + /* Energy terms and groups */ + snew(enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + enerd); + + /* Kinetic energy data */ + snew(ekind, 1); + init_ekindata(fplog, top_global, &(ir->opts), ekind); + /* Copy the cos acceleration to the groups struct */ + ekind->cosacc.cos_accel = ir->cos_accel; + + gstat = global_stat_init(ir); + + /* Check for polarizable models and flexible constraints */ + shellfc = init_shell_flexcon(fplog, + top_global, n_flexible_constraints(constr), + ir->nstcalcenergy, DOMAINDECOMP(cr)); + + if (shellfc && ir->nstcalcenergy != 1) + { + gmx_fatal(FARGS, "You have nstcalcenergy set to a value (%d) that is different from 1.\nThis is not supported in combinations with shell particles.\nPlease make a new tpr file.", ir->nstcalcenergy); + } + if (shellfc && DOMAINDECOMP(cr)) + { + gmx_fatal(FARGS, "Shell particles are not implemented with domain decomposition, use a single rank"); + } + if (shellfc && ir->bDoAwh) + { + gmx_fatal(FARGS, "AWH biasing does not support shell particles."); + } + + if (inputrecDeform(ir)) + { + tMPI_Thread_mutex_lock(&deform_init_box_mutex); + set_deform_reference_box(upd, + deform_init_init_step_tpx, + deform_init_box_tpx); + tMPI_Thread_mutex_unlock(&deform_init_box_mutex); + } + + { + double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1); + if ((io > 2000) && MASTER(cr)) + { + fprintf(stderr, + "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", + io); + } + } + + // Local state only becomes valid now. + std::unique_ptr<t_state> stateInstance; + t_state * state; + + if (DOMAINDECOMP(cr)) + { + top = dd_init_local_top(top_global); + + stateInstance = gmx::compat::make_unique<t_state>(); + state = stateInstance.get(); + dd_init_local_state(cr->dd, state_global, state); + } + else + { + state_change_natoms(state_global, state_global->natoms); + /* We need to allocate one element extra, since we might use + * (unaligned) 4-wide SIMD loads to access rvec entries. 
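+         * (Editor's illustration, not upstream text: for
+         * state_global->natoms == N the resize below reserves at least
+         * N + 1 rvecs, so a 4-wide load that begins at the last real entry
+         * stays inside the allocation.)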
+ */ + f.resize(gmx::paddedRVecVectorSize(state_global->natoms)); + /* Copy the pointer to the global state */ + state = state_global; + + snew(top, 1); + mdAlgorithmsSetupAtomData(cr, ir, top_global, top, fr, + &graph, mdAtoms, vsite, shellfc); + + update_realloc(upd, state->natoms); + } + + /* Set up interactive MD (IMD) */ + init_IMD(ir, cr, top_global, fplog, ir->nstcalcenergy, MASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr, + nfile, fnm, oenv, mdrunOptions); + + if (DOMAINDECOMP(cr)) + { + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, nullptr, FALSE); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + } + + auto mdatoms = mdAtoms->mdatoms(); + + // NOTE: The global state is no longer used at this point. + // But state_global is still used as temporary storage space for writing + // the global state to file and potentially for replica exchange. + // (Global topology should persist.) + + update_mdatoms(mdatoms, state->lambda[efptMASS]); + + const ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + bool startingFromCheckpoint = continuationOptions.startedFromCheckpoint; + + if (ir->bExpanded) + { + init_expanded_ensemble(startingFromCheckpoint, ir, state->dfhist); + } + + if (MASTER(cr)) + { + if (startingFromCheckpoint) + { + /* Update mdebin with energy history if appending to output files */ + if (continuationOptions.appendFiles) + { + restore_energyhistory_from_state(mdebin, observablesHistory->energyHistory.get()); + } + else if (observablesHistory->energyHistory.get() != nullptr) + { + /* We might have read an energy history from checkpoint. + * As we are not appending, we want to restart the statistics. + * Free the allocated memory and reset the counts. + */ + observablesHistory->energyHistory = {}; + } + } + if (observablesHistory->energyHistory.get() == nullptr) + { + observablesHistory->energyHistory = std::unique_ptr<energyhistory_t>(new energyhistory_t {}); + } + /* Set the initial energy history in state by updating once */ + update_energyhistory(observablesHistory->energyHistory.get(), mdebin); + } + + /* Initialize constraints */ + if (constr && !DOMAINDECOMP(cr)) + { + set_constraints(constr, top, ir, mdatoms, cr); + } + + /* Initialize AWH and restore state from history in checkpoint if needed. */ + if (ir->bDoAwh) + { + ir->awh = new gmx::Awh(fplog, *ir, cr, *ir->awhParams, opt2fn("-awh", nfile, fnm), ir->pull_work); + + if (startingFromCheckpoint) + { + /* Restore the AWH history read from checkpoint */ + ir->awh->restoreStateFromHistory(MASTER(cr) ? state_global->awhHistory.get() : nullptr); + } + else if (MASTER(cr)) + { + /* Initialize the AWH history here */ + state_global->awhHistory = ir->awh->initHistoryFromState(); + } + } + + const bool useReplicaExchange = (replExParams.exchangeInterval > 0); + if (useReplicaExchange && MASTER(cr)) + { + repl_ex = init_replica_exchange(fplog, cr->ms, top_global->natoms, ir, + replExParams); + } + + /* PME tuning is only supported in the Verlet scheme, with PME for + * Coulomb. It is not supported with only LJ PME, or for + * reruns. 
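+     * (Editor's note, not upstream text: the boolean computed below
+     * encodes exactly these restrictions, and additionally skips tuning
+     * for reproducible runs, presumably because the balancing outcome
+     * depends on run-to-run timing.)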
*/ + bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !bRerunMD && + !mdrunOptions.reproducible && ir->cutoff_scheme != ecutsGROUP); + if (bPMETune) + { + pme_loadbal_init(&pme_loadbal, cr, mdlog, ir, state->box, + fr->ic, fr->nbv->listParams.get(), fr->pmedata, use_GPU(fr->nbv), + &bPMETunePrinting); + } + + if (!ir->bContinuation && !bRerunMD) + { + if (state->flags & (1 << estV)) + { + /* Set the velocities of vsites, shells and frozen atoms to zero */ + for (i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->ptype[i] == eptVSite || + mdatoms->ptype[i] == eptShell) + { + clear_rvec(state->v[i]); + } + else if (mdatoms->cFREEZE) + { + for (m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m]) + { + state->v[i][m] = 0; + } + } + } + } + } + + if (constr) + { + /* Constrain the initial coordinates and velocities */ + do_constrain_first(fplog, constr, ir, mdatoms, state, + cr, nrnb, fr, top); + } + if (vsite) + { + /* Construct the virtual sites for the initial configuration */ + construct_vsites(vsite, as_rvec_array(state->x.data()), ir->delta_t, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state->box); + } + } + + if (ir->efep != efepNO) + { + /* Set free energy calculation frequency as the greatest common + * denominator of nstdhdl and repl_ex_nst. */ + nstfep = ir->fepvals->nstdhdl; + if (ir->bExpanded) + { + nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep); + } + if (useReplicaExchange) + { + nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep); + } + } + + /* Be REALLY careful about what flags you set here. You CANNOT assume + * this is the first step, since we might be restarting from a checkpoint, + * and in that case we should not do any modifications to the state. + */ + bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation); + + if (continuationOptions.haveReadEkin) + { + restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate); + } + + cglo_flags = (CGLO_INITIALIZATION | CGLO_TEMPERATURE | CGLO_GSTAT + | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0) + | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) + | (continuationOptions.haveReadEkin ? CGLO_READEKIN : 0)); + + bSumEkinhOld = FALSE; + /* To minimize communication, compute_globals computes the COM velocity + * and the kinetic energy for the velocities without COM motion removed. + * Thus to get the kinetic energy without the COM contribution, we need + * to call compute_globals twice. + */ + for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++) + { + int cglo_flags_iteration = cglo_flags; + if (bStopCM && cgloIteration == 0) + { + cglo_flags_iteration |= CGLO_STOPCM; + cglo_flags_iteration &= ~CGLO_TEMPERATURE; + } + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags_iteration + | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)); + } + checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + if (ir->eI == eiVVAK) + { + /* a second call to get the half step temperature initialized as well */ + /* we do the same call as above, but turn the pressure off -- internally to + compute_globals, this is recognized as a velocity verlet half-step + kinetic energy calculation. 
This minimized excess variables, but + perhaps loses some logic?*/ + + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, state->box, + nullptr, &bSumEkinhOld, + cglo_flags & ~CGLO_PRESSURE); + } + + /* Calculate the initial half step temperature, and save the ekinh_old */ + if (!continuationOptions.startedFromCheckpoint) + { + for (i = 0; (i < ir->opts.ngtc); i++) + { + copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old); + } + } + + /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter + temperature control */ + trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter); + + if (MASTER(cr)) + { + if (!ir->bContinuation) + { + if (constr && ir->eConstrAlg == econtLINCS) + { + fprintf(fplog, + "RMS relative constraint deviation after constraining: %.2e\n", + constr_rmsd(constr)); + } + if (EI_STATE_VELOCITY(ir->eI)) + { + real temp = enerd->term[F_TEMP]; + if (ir->eI != eiVV) + { + /* Result of Ekin averaged over velocities of -half + * and +half step, while we only have -half step here. + */ + temp *= 2; + } + fprintf(fplog, "Initial temperature: %g K\n", temp); + } + } + + if (bRerunMD) + { + fprintf(stderr, "starting md rerun '%s', reading coordinates from" + " input trajectory '%s'\n\n", + *(top_global->name), opt2fn("-rerun", nfile, fnm)); + if (mdrunOptions.verbose) + { + fprintf(stderr, "Calculated time to finish depends on nsteps from " + "run input file,\nwhich may not correspond to the time " + "needed to process input trajectory.\n\n"); + } + } + else + { + char tbuf[20]; + fprintf(stderr, "starting mdrun '%s'\n", + *(top_global->name)); + if (ir->nsteps >= 0) + { + sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t); + } + else + { + sprintf(tbuf, "%s", "infinite"); + } + if (ir->init_step > 0) + { + fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n", + gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf, + gmx_step_str(ir->init_step, sbuf2), + ir->init_step*ir->delta_t); + } + else + { + fprintf(stderr, "%s steps, %s ps.\n", + gmx_step_str(ir->nsteps, sbuf), tbuf); + } + } + fprintf(fplog, "\n"); + } + + walltime_accounting_start(walltime_accounting); + wallcycle_start(wcycle, ewcRUN); + print_start(fplog, cr, walltime_accounting, "mdrun"); + + /* safest point to do file checkpointing is here. 
More general point would be immediately before integrator call */ +#ifdef GMX_FAHCORE + chkpt_ret = fcCheckPointParallel( cr->nodeid, + NULL, 0); + if (chkpt_ret == 0) + { + gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 ); + } +#endif + + /*********************************************************** + * + * Loop over MD steps + * + ************************************************************/ + + /* if rerunMD then read coordinates and velocities from input trajectory */ + if (bRerunMD) + { + if (getenv("GMX_FORCE_UPDATE")) + { + bForceUpdate = TRUE; + } + + rerun_fr.natoms = 0; + if (MASTER(cr)) + { + bLastStep = !read_first_frame(oenv, &status, + opt2fn("-rerun", nfile, fnm), + &rerun_fr, TRX_NEED_X | TRX_READ_V); + if (rerun_fr.natoms != top_global->natoms) + { + gmx_fatal(FARGS, + "Number of atoms in trajectory (%d) does not match the " + "run input file (%d)\n", + rerun_fr.natoms, top_global->natoms); + } + if (ir->ePBC != epbcNONE) + { + if (!rerun_fr.bBox) + { + gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time); + } + if (max_cutoff2(ir->ePBC, rerun_fr.box) < gmx::square(fr->rlist)) + { + gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time); + } + } + } + + if (PAR(cr)) + { + rerun_parallel_comm(cr, &rerun_fr, &bLastStep); + } + + if (ir->ePBC != epbcNONE) + { + /* Set the shift vectors. + * Necessary here when have a static box different from the tpr box. + */ + calc_shifts(rerun_fr.box, fr->shift_vec); + } + } + + /* Loop over MD steps or if rerunMD to end of input trajectory, + * or, if max_hours>0, until max_hours is reached. + */ + real max_hours = mdrunOptions.maximumHoursToRun; + bFirstStep = TRUE; + /* Skip the first Nose-Hoover integration when we get the state from tpx */ + bInitStep = !startingFromCheckpoint || EI_VV(ir->eI); + bSumEkinhOld = FALSE; + bExchanged = FALSE; + bNeedRepartition = FALSE; + + bool simulationsShareState = false; + int nstSignalComm = nstglobalcomm; + { + // TODO This implementation of ensemble orientation restraints is nasty because + // a user can't just do multi-sim with single-sim orientation restraints. + bool usingEnsembleRestraints = (fcd->disres.nsystems > 1) || (cr->ms && fcd->orires.nr); + bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && cr->ms); + + // Replica exchange, ensemble restraints and AWH need all + // simulations to remain synchronized, so they need + // checkpoints and stop conditions to act on the same step, so + // the propagation of such signals must take place between + // simulations, not just within simulations. + // TODO: Make algorithm initializers set these flags. + simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim; + bool resetCountersIsLocal = true; + signals[eglsCHKPT] = SimulationSignal(!simulationsShareState); + signals[eglsSTOPCOND] = SimulationSignal(!simulationsShareState); + signals[eglsRESETCOUNTERS] = SimulationSignal(resetCountersIsLocal); + + if (simulationsShareState) + { + // Inter-simulation signal communication does not need to happen + // often, so we use a minimum of 200 steps to reduce overhead. + const int c_minimumInterSimulationSignallingInterval = 200; + nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1)/nstglobalcomm)*nstglobalcomm; + } + } + + DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion = (DOMAINDECOMP(cr) ? 
DdOpenBalanceRegionBeforeForceComputation::yes : DdOpenBalanceRegionBeforeForceComputation::no); + DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion = (DOMAINDECOMP(cr) ? DdCloseBalanceRegionAfterForceComputation::yes : DdCloseBalanceRegionAfterForceComputation::no); + + step = ir->init_step; + step_rel = 0; + + // TODO extract this to new multi-simulation module + if (MASTER(cr) && MULTISIM(cr) && !useReplicaExchange) + { + if (!multisim_int_all_are_equal(cr->ms, ir->nsteps)) + { + GMX_LOG(mdlog.warning).appendText( + "Note: The number of steps is not consistent across multi simulations,\n" + "but we are proceeding anyway!"); + } + if (!multisim_int_all_are_equal(cr->ms, ir->init_step)) + { + GMX_LOG(mdlog.warning).appendText( + "Note: The initial step is not consistent across multi simulations,\n" + "but we are proceeding anyway!"); + } + } + + /* and stop now if we should */ + bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps)); + while (!bLastStep) + { + + /* Determine if this is a neighbor search step */ + bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); + + if (bPMETune && bNStList) + { + /* PME grid + cut-off optimization with GPUs or PME nodes */ + pme_loadbal_do(pme_loadbal, cr, + (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr, + fplog, mdlog, + ir, fr, state, + wcycle, + step, step_rel, + &bPMETunePrinting); + } + + wallcycle_start(wcycle, ewcSTEP); + + if (bRerunMD) + { + if (rerun_fr.bStep) + { + step = rerun_fr.step; + step_rel = step - ir->init_step; + } + if (rerun_fr.bTime) + { + t = rerun_fr.time; + } + else + { + t = step; + } + } + else + { + bLastStep = (step_rel == ir->nsteps); + t = t0 + step*ir->delta_t; + } + + // TODO Refactor this, so that nstfep does not need a default value of zero + if (ir->efep != efepNO || ir->bSimTemp) + { + /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value, + requiring different logic. 
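+
+               (Editor's note, not upstream text: in the rerun case the
+               master rank takes lambda from the trajectory frame via
+               setCurrentLambdasRerun(), otherwise every rank derives it
+               from the step count via setCurrentLambdasLocal().)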
*/ + if (bRerunMD) + { + if (MASTER(cr)) + { + setCurrentLambdasRerun(step, ir->fepvals, &rerun_fr, lam0, state_global); + } + } + else + { + setCurrentLambdasLocal(step, ir->fepvals, lam0, state); + } + bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl); + bDoFEP = ((ir->efep != efepNO) && do_per_step(step, nstfep)); + bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) + && (ir->bExpanded) && (step > 0) && (!startingFromCheckpoint)); + } + + bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep && + do_per_step(step, replExParams.exchangeInterval)); + + if (bSimAnn) + { + update_annealing_target_temp(ir, t, upd); + } + + if (bRerunMD && MASTER(cr)) + { + const bool constructVsites = (vsite && mdrunOptions.rerunConstructVsites); + if (constructVsites && DOMAINDECOMP(cr)) + { + gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented with domain decomposition, use a single rank"); + } + prepareRerunState(rerun_fr, state_global, constructVsites, vsite, top->idef, ir->delta_t, *fr, graph, &bRerunWarnNoV); + } + + /* Stop Center of Mass motion */ + bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm)); + + if (bRerunMD) + { + /* for rerun MD always do Neighbour Searching */ + bNS = (bFirstStep || ir->nstlist != 0); + } + else + { + /* Determine whether or not to do Neighbour Searching */ + bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition); + } + + /* < 0 means stop at next step, > 0 means stop at next NS step */ + if ( (signals[eglsSTOPCOND].set < 0) || + ( (signals[eglsSTOPCOND].set > 0 ) && ( bNS || ir->nstlist == 0))) + { + bLastStep = TRUE; + } + + /* Determine whether or not to update the Born radii if doing GB */ + bBornRadii = bFirstStep; + if (ir->implicit_solvent && (step % ir->nstgbradii == 0)) + { + bBornRadii = TRUE; + } + + /* do_log triggers energy and virial calculation. Because this leads + * to different code paths, forces can be different. Thus for exact + * continuation we should avoid extra log output. + * Note that the || bLastStep can result in non-exact continuation + * beyond the last step. But we don't consider that to be an issue. + */ + do_log = do_per_step(step, ir->nstlog) || (bFirstStep && !startingFromCheckpoint) || bLastStep || bRerunMD; + do_verbose = mdrunOptions.verbose && + (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep || bRerunMD); + + if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD)) + { + if (bRerunMD) + { + bMasterState = TRUE; + } + else + { + bMasterState = FALSE; + /* Correct the new box if it is too skewed */ + if (inputrecDynamicBox(ir)) + { + if (correct_box(fplog, step, state->box, graph)) + { + bMasterState = TRUE; + } + } + if (DOMAINDECOMP(cr) && bMasterState) + { + dd_collect_state(cr->dd, state, state_global); + } + } + + if (DOMAINDECOMP(cr)) + { + /* Repartition the domain decomposition */ + dd_partition_system(fplog, step, cr, + bMasterState, nstglobalcomm, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, wcycle, + do_verbose && !bPMETunePrinting); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + } + } + + if (MASTER(cr) && do_log) + { + print_ebin_header(fplog, step, t); /* can we improve the information printed here? 
*/ + } + + if (ir->efep != efepNO) + { + update_mdatoms(mdatoms, state->lambda[efptMASS]); + } + + if ((bRerunMD && rerun_fr.bV) || bExchanged) + { + + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . */ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS); + checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + } + clear_mat(force_vir); + + /* We write a checkpoint at this MD step when: + * either at an NS step when we signalled through gs, + * or at the last step (but not when we do not want confout), + * but never at the first step or with rerun. + */ + bCPT = (((signals[eglsCHKPT].set && (bNS || ir->nstlist == 0)) || + (bLastStep && mdrunOptions.writeConfout)) && + step > ir->init_step && !bRerunMD); + if (bCPT) + { + signals[eglsCHKPT].set = 0; + } + + /* Determine the energy and pressure: + * at nstcalcenergy steps and at energy output steps (set below). + */ + if (EI_VV(ir->eI) && (!bInitStep)) + { + /* for vv, the first half of the integration actually corresponds + to the previous step. bCalcEner is only required to be evaluated on the 'next' step, + but the virial needs to be calculated on both the current step and the 'next' step. Future + reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */ + + /* TODO: This is probably not what we want, we will write to energy file one step after nstcalcenergy steps. */ + bCalcEnerStep = do_per_step(step - 1, ir->nstcalcenergy); + bCalcVir = bCalcEnerStep || + (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple))); + } + else + { + bCalcEnerStep = do_per_step(step, ir->nstcalcenergy); + bCalcVir = bCalcEnerStep || + (ir->epc != epcNO && do_per_step(step, ir->nstpcouple)); + } + bCalcEner = bCalcEnerStep; + + do_ene = (do_per_step(step, ir->nstenergy) || bLastStep || bRerunMD); + + if (do_ene || do_log || bDoReplEx) + { + bCalcVir = TRUE; + bCalcEner = TRUE; + } + + /* Do we need global communication ? */ + bGStat = (bCalcVir || bCalcEner || bStopCM || + do_per_step(step, nstglobalcomm) || + (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm))); + + force_flags = (GMX_FORCE_STATECHANGED | + ((inputrecDynamicBox(ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) | + GMX_FORCE_ALLFORCES | + (bCalcVir ? GMX_FORCE_VIRIAL : 0) | + (bCalcEner ? GMX_FORCE_ENERGY : 0) | + (bDoFEP ? GMX_FORCE_DHDL : 0) + ); + + if (shellfc) + { + /* Now is the time to relax the shells */ + relax_shell_flexcon(fplog, cr, mdrunOptions.verbose, step, + ir, bNS, force_flags, top, + constr, enerd, fcd, + state, &f, force_vir, mdatoms, + nrnb, wcycle, graph, groups, + shellfc, fr, bBornRadii, t, mu_tot, + vsite, + ddOpenBalanceRegion, ddCloseBalanceRegion); + } + else + { + /* The AWH history need to be saved _before_ doing force calculations where the AWH bias is updated + (or the AWH update will be performed twice for one step when continuing). It would be best to + call this update function from do_md_trajectory_writing but that would occur after do_force. 
+ One would have to divide the update_awh function into one function applying the AWH force + and one doing the AWH bias update. The update AWH bias function could then be called after + do_md_trajectory_writing (then containing update_awh_history). + The checkpointing will in the future probably moved to the start of the md loop which will + rid of this issue. */ + if (ir->bDoAwh && bCPT && MASTER(cr)) + { + ir->awh->updateHistory(state_global->awhHistory.get()); + } + + /* The coordinates (x) are shifted (to get whole molecules) + * in do_force. + * This is parallellized as well, and does communication too. + * Check comments in sim_util.c + */ + do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups, + state->box, state->x, &state->hist, + f, force_vir, mdatoms, enerd, fcd, + state->lambda, graph, + fr, vsite, mu_tot, t, ed, bBornRadii, + (bNS ? GMX_FORCE_NS : 0) | force_flags, + ddOpenBalanceRegion, ddCloseBalanceRegion); + } + + if (EI_VV(ir->eI) && !startingFromCheckpoint && !bRerunMD) + /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ + { + rvec *vbuf = nullptr; + + wallcycle_start(wcycle, ewcUPDATE); + if (ir->eI == eiVV && bInitStep) + { + /* if using velocity verlet with full time step Ekin, + * take the first half step only to compute the + * virial for the first step. From there, + * revert back to the initial coordinates + * so that the input is actually the initial step. + */ + snew(vbuf, state->natoms); + copy_rvecn(as_rvec_array(state->v.data()), vbuf, 0, state->natoms); /* should make this better for parallelizing? */ + } + else + { + /* this is for NHC in the Ekin(t+dt/2) version of vv */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); + } + + update_coords(fplog, step, ir, mdatoms, state, f, fcd, + ekind, M, upd, etrtVELOCITY1, + cr, constr); + + if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. */ + { + wallcycle_stop(wcycle, ewcUPDATE); + update_constraints(fplog, step, nullptr, ir, mdatoms, + state, fr->bMolPBC, graph, f, + &top->idef, shake_vir, + cr, nrnb, wcycle, upd, constr, + TRUE, bCalcVir); + wallcycle_start(wcycle, ewcUPDATE); + } + else if (graph) + { + /* Need to unshift here if a do_force has been + called in the previous step */ + unshift_self(graph, state->box, as_rvec_array(state->x.data())); + } + /* if VV, compute the pressure and constraints */ + /* For VV2, we strictly only need this if using pressure + * control, but we really would like to have accurate pressures + * printed out. + * Think about ways around this in the future? + * For now, keep this choice in comments. + */ + /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */ + /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/ + bPres = TRUE; + bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK)); + if (bCalcEner && ir->eI == eiVVAK) + { + bSumEkinhOld = TRUE; + } + /* for vv, the first half of the integration actually corresponds to the previous step. + So we need information from the last step in the first half of the integration */ + if (bGStat || do_per_step(step-1, nstglobalcomm)) + { + wallcycle_stop(wcycle, ewcUPDATE); + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, state->box, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + (bGStat ? CGLO_GSTAT : 0) + | CGLO_ENERGY + | (bTemp ? 
CGLO_TEMPERATURE : 0) + | (bPres ? CGLO_PRESSURE : 0) + | (bPres ? CGLO_CONSTRAINT : 0) + | (bStopCM ? CGLO_STOPCM : 0) + | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0) + | CGLO_SCALEEKIN + ); + /* explanation of above: + a) We compute Ekin at the full time step + if 1) we are using the AveVel Ekin, and it's not the + initial step, or 2) if we are using AveEkin, but need the full + time step kinetic energy for the pressure (always true now, since we want accurate statistics). + b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in + EkinAveVel because it's needed for the pressure */ + checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + wallcycle_start(wcycle, ewcUPDATE); + } + /* temperature scaling and pressure scaling to produce the extended variables at t+dt */ + if (!bInitStep) + { + if (bTrotter) + { + m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2); + + /* TODO This is only needed when we're about to write + * a checkpoint, because we use it after the restart + * (in a kludge?). But what should we be doing if + * startingFromCheckpoint or bInitStep are true? */ + if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)) + { + copy_mat(shake_vir, state->svir_prev); + copy_mat(force_vir, state->fvir_prev); + } + if (inputrecNvtTrotter(ir) && ir->eI == eiVV) + { + /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */ + enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE); + enerd->term[F_EKIN] = trace(ekind->ekin); + } + } + else if (bExchanged) + { + wallcycle_stop(wcycle, ewcUPDATE); + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . */ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot, + constr, &nullSignaller, state->box, + nullptr, &bSumEkinhOld, + CGLO_GSTAT | CGLO_TEMPERATURE); + wallcycle_start(wcycle, ewcUPDATE); + } + } + /* if it's the initial step, we performed this first step just to get the constraint virial */ + if (ir->eI == eiVV && bInitStep) + { + copy_rvecn(vbuf, as_rvec_array(state->v.data()), 0, state->natoms); + sfree(vbuf); + } + wallcycle_stop(wcycle, ewcUPDATE); + } + + /* compute the conserved quantity */ + if (EI_VV(ir->eI)) + { + saved_conserved_quantity = NPT_energy(ir, state, &MassQ); + if (ir->eI == eiVV) + { + last_ekin = enerd->term[F_EKIN]; + } + if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) + { + saved_conserved_quantity -= enerd->term[F_DISPCORR]; + } + /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */ + if (ir->efep != efepNO && !bRerunMD) + { + sum_dhdl(enerd, state->lambda, ir->fepvals); + } + } + + /* ######## END FIRST UPDATE STEP ############## */ + /* ######## If doing VV, we now have v(dt) ###### */ + if (bDoExpanded) + { + /* perform extended ensemble sampling in lambda - we don't + actually move to the new state before outputting + statistics, but if performing simulated tempering, we + do update the velocities and the tau_t. 
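+               The new state index (lamnew) computed below is only copied into
+               state->fep_state after the energy and log files for this step
+               have been written, so the output of this step still corresponds
+               to the state that generated it.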
*/ + + lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, state->dfhist, step, as_rvec_array(state->v.data()), mdatoms); + /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */ + if (MASTER(cr)) + { + copy_df_history(state_global->dfhist, state->dfhist); + } + } + + /* Now we have the energies and forces corresponding to the + * coordinates at time t. We must output all of this before + * the update. + */ + do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, + ir, state, state_global, observablesHistory, + top_global, fr, + outf, mdebin, ekind, f, + &nchkpt, + bCPT, bRerunMD, bLastStep, + mdrunOptions.writeConfout, + bSumEkinhOld); + /* Check if IMD step and do IMD communication, if bIMD is TRUE. */ + bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, as_rvec_array(state->x.data()), ir, t, wcycle); + + /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */ + if (startingFromCheckpoint && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))) + { + copy_mat(state->svir_prev, shake_vir); + copy_mat(state->fvir_prev, force_vir); + } + + elapsed_time = walltime_accounting_get_current_elapsed_time(walltime_accounting); + + /* Check whether everything is still allright */ + if (((int)gmx_get_stop_condition() > handled_stop_condition) +#if GMX_THREAD_MPI + && MASTER(cr) +#endif + ) + { + int nsteps_stop = -1; + + /* this just makes signals[].sig compatible with the hack + of sending signals around by MPI_Reduce together with + other floats */ + if ((gmx_get_stop_condition() == gmx_stop_cond_next_ns) || + (mdrunOptions.reproducible && + gmx_get_stop_condition() == gmx_stop_cond_next)) + { + /* We need at least two global communication steps to pass + * around the signal. We stop at a pair-list creation step + * to allow for exact continuation, when possible. + */ + signals[eglsSTOPCOND].sig = 1; + nsteps_stop = std::max(ir->nstlist, 2*nstSignalComm); + } + else if (gmx_get_stop_condition() == gmx_stop_cond_next) + { + /* Stop directly after the next global communication step. + * This breaks exact continuation. + */ + signals[eglsSTOPCOND].sig = -1; + nsteps_stop = nstSignalComm + 1; + } + if (fplog) + { + fprintf(fplog, + "\n\nReceived the %s signal, stopping within %d steps\n\n", + gmx_get_signal_name(), nsteps_stop); + fflush(fplog); + } + fprintf(stderr, + "\n\nReceived the %s signal, stopping within %d steps\n\n", + gmx_get_signal_name(), nsteps_stop); + fflush(stderr); + handled_stop_condition = (int)gmx_get_stop_condition(); + } + else if (MASTER(cr) && (bNS || ir->nstlist <= 0) && + (max_hours > 0 && elapsed_time > max_hours*60.0*60.0*0.99) && + signals[eglsSTOPCOND].sig == 0 && signals[eglsSTOPCOND].set == 0) + { + /* Signal to terminate the run */ + signals[eglsSTOPCOND].sig = 1; + if (fplog) + { + fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); + } + fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); + } + + if (bResetCountersHalfMaxH && MASTER(cr) && + elapsed_time > max_hours*60.0*60.0*0.495) + { + /* Set flag that will communicate the signal to all ranks in the simulation */ + signals[eglsRESETCOUNTERS].sig = 1; + } + + /* In parallel we only have to check for checkpointing in steps + * where we do global communication, + * otherwise the other nodes don't know. 
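+      * For example, with mdrun -cpt 15 cpt_period is 15 minutes, so the
+      * master raises signals[eglsCHKPT].sig at the first such step where
+      * elapsed_time >= nchkpt*cpt_period*60 seconds; cpt_period == 0 requests
+      * a checkpoint at every global-communication step, and a negative
+      * cpt_period disables periodic checkpointing.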
+ */ + const real cpt_period = mdrunOptions.checkpointOptions.period; + if (MASTER(cr) && ((bGStat || !PAR(cr)) && + cpt_period >= 0 && + (cpt_period == 0 || + elapsed_time >= nchkpt*cpt_period*60.0)) && + signals[eglsCHKPT].set == 0) + { + signals[eglsCHKPT].sig = 1; + } + + /* ######### START SECOND UPDATE STEP ################# */ + + /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen controlled + in preprocessing */ + + if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */ + { + gmx_bool bIfRandomize; + bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state, upd, constr); + /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */ + if (constr && bIfRandomize) + { + update_constraints(fplog, step, nullptr, ir, mdatoms, + state, fr->bMolPBC, graph, f, + &top->idef, tmp_vir, + cr, nrnb, wcycle, upd, constr, + TRUE, bCalcVir); + } + } + /* Box is changed in update() when we do pressure coupling, + * but we should still use the old box for energy corrections and when + * writing it to the energy file, so it matches the trajectory files for + * the same timestep above. Make a copy in a separate array. + */ + copy_mat(state->box, lastbox); + + dvdl_constr = 0; + + if (!bRerunMD || rerun_fr.bV || bForceUpdate) + { + wallcycle_start(wcycle, ewcUPDATE); + /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */ + if (bTrotter) + { + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3); + /* We can only do Berendsen coupling after we have summed + * the kinetic energy or virial. Since the happens + * in global_state after update, we should only do it at + * step % nstlist = 1 with bGStatEveryStep=FALSE. + */ + } + else + { + update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); + update_pcouple_before_coordinates(fplog, step, ir, state, + parrinellorahmanMu, M, + bInitStep); + } + + if (EI_VV(ir->eI)) + { + /* velocity half-step update */ + update_coords(fplog, step, ir, mdatoms, state, f, fcd, + ekind, M, upd, etrtVELOCITY2, + cr, constr); + } + + /* Above, initialize just copies ekinh into ekin, + * it doesn't copy position (for VV), + * and entire integrator for MD. + */ + + if (ir->eI == eiVVAK) + { + /* We probably only need md->homenr, not state->natoms */ + if (state->natoms > cbuf_nalloc) + { + cbuf_nalloc = state->natoms; + srenew(cbuf, cbuf_nalloc); + } + copy_rvecn(as_rvec_array(state->x.data()), cbuf, 0, state->natoms); + } + + update_coords(fplog, step, ir, mdatoms, state, f, fcd, + ekind, M, upd, etrtPOSITION, cr, constr); + wallcycle_stop(wcycle, ewcUPDATE); + + update_constraints(fplog, step, &dvdl_constr, ir, mdatoms, state, + fr->bMolPBC, graph, f, + &top->idef, shake_vir, + cr, nrnb, wcycle, upd, constr, + FALSE, bCalcVir); + + if (ir->eI == eiVVAK) + { + /* erase F_EKIN and F_TEMP here? */ + /* just compute the kinetic energy at the half step to perform a trotter step */ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &nullSignaller, lastbox, + nullptr, &bSumEkinhOld, + (bGStat ? 
CGLO_GSTAT : 0) | CGLO_TEMPERATURE + ); + wallcycle_start(wcycle, ewcUPDATE); + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4); + /* now we know the scaling, we can compute the positions again again */ + copy_rvecn(cbuf, as_rvec_array(state->x.data()), 0, state->natoms); + + update_coords(fplog, step, ir, mdatoms, state, f, fcd, + ekind, M, upd, etrtPOSITION, cr, constr); + wallcycle_stop(wcycle, ewcUPDATE); + + /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */ + /* are the small terms in the shake_vir here due + * to numerical errors, or are they important + * physically? I'm thinking they are just errors, but not completely sure. + * For now, will call without actually constraining, constr=NULL*/ + update_constraints(fplog, step, nullptr, ir, mdatoms, + state, fr->bMolPBC, graph, f, + &top->idef, tmp_vir, + cr, nrnb, wcycle, upd, nullptr, + FALSE, bCalcVir); + } + if (EI_VV(ir->eI)) + { + /* this factor or 2 correction is necessary + because half of the constraint force is removed + in the vv step, so we have to double it. See + the Redmine issue #1255. It is not yet clear + if the factor of 2 is exact, or just a very + good approximation, and this will be + investigated. The next step is to see if this + can be done adding a dhdl contribution from the + rattle step, but this is somewhat more + complicated with the current code. Will be + investigated, hopefully for 4.6.3. However, + this current solution is much better than + having it completely wrong. + */ + enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr; + } + else + { + enerd->term[F_DVDL_CONSTR] += dvdl_constr; + } + } + else if (graph) + { + /* Need to unshift here */ + unshift_self(graph, state->box, as_rvec_array(state->x.data())); + } + + if (vsite != nullptr) + { + wallcycle_start(wcycle, ewcVSITECONSTR); + if (graph != nullptr) + { + shift_self(graph, state->box, as_rvec_array(state->x.data())); + } + construct_vsites(vsite, as_rvec_array(state->x.data()), ir->delta_t, as_rvec_array(state->v.data()), + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state->box); + + if (graph != nullptr) + { + unshift_self(graph, state->box, as_rvec_array(state->x.data())); + } + wallcycle_stop(wcycle, ewcVSITECONSTR); + } + + /* ############## IF NOT VV, Calculate globals HERE ############ */ + /* With Leap-Frog we can skip compute_globals at + * non-communication steps, but we need to calculate + * the kinetic energy one step before communication. + */ + { + // Organize to do inter-simulation signalling on steps if + // and when algorithms require it. + bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm)); + + if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal) + { + // Since we're already communicating at this step, we + // can propagate intra-simulation signals. Note that + // check_nstglobalcomm has the responsibility for + // choosing the value of nstglobalcomm that is one way + // bGStat becomes true, so we can't get into a + // situation where e.g. checkpointing can't be + // signalled. + bool doIntraSimSignal = true; + SimulationSignaller signaller(&signals, cr, doInterSimSignal, doIntraSimSignal); + + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, &signaller, + lastbox, + &totalNumberOfBondedInteractions, &bSumEkinhOld, + (bGStat ? 
CGLO_GSTAT : 0) + | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0) + | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0) + | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) + | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) + | CGLO_CONSTRAINT + | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0) + ); + checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions, + top_global, top, state, + &shouldCheckNumberOfBondedInteractions); + } + } + + /* ############# END CALC EKIN AND PRESSURE ################# */ + + /* Note: this is OK, but there are some numerical precision issues with using the convergence of + the virial that should probably be addressed eventually. state->veta has better properies, + but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could + generate the new shake_vir, but test the veta value for convergence. This will take some thought. */ + + if (ir->efep != efepNO && (!EI_VV(ir->eI) || bRerunMD)) + { + /* Sum up the foreign energy and dhdl terms for md and sd. + Currently done every step so that dhdl is correct in the .edr */ + sum_dhdl(enerd, state->lambda, ir->fepvals); + } + + update_pcouple_after_coordinates(fplog, step, ir, mdatoms, + pres, force_vir, shake_vir, + parrinellorahmanMu, + state, nrnb, upd); + + /* ################# END UPDATE STEP 2 ################# */ + /* #### We now have r(t+dt) and v(t+dt/2) ############# */ + + /* The coordinates (x) were unshifted in update */ + if (!bGStat) + { + /* We will not sum ekinh_old, + * so signal that we still have to do it. + */ + bSumEkinhOld = TRUE; + } + + if (bCalcEner) + { + /* ######### BEGIN PREPARING EDR OUTPUT ########### */ + + /* use the directly determined last velocity, not actually the averaged half steps */ + if (bTrotter && ir->eI == eiVV) + { + enerd->term[F_EKIN] = last_ekin; + } + enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN]; + + if (integratorHasConservedEnergyQuantity(ir)) + { + if (EI_VV(ir->eI)) + { + enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity; + } + else + { + enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ); + } + } + /* ######### END PREPARING EDR OUTPUT ########### */ + } + + /* Output stuff */ + if (MASTER(cr)) + { + if (fplog && do_log && bDoExpanded) + { + /* only needed if doing expanded ensemble */ + PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : nullptr, + state_global->dfhist, state->fep_state, ir->nstlog, step); + } + if (bCalcEner) + { + upd_mdebin(mdebin, bDoDHDL, bCalcEnerStep, + t, mdatoms->tmass, enerd, state, + ir->fepvals, ir->expandedvals, lastbox, + shake_vir, force_vir, total_vir, pres, + ekind, mu_tot, constr); + } + else + { + upd_mdebin_step(mdebin); + } + + gmx_bool do_dr = do_per_step(step, ir->nstdisreout); + gmx_bool do_or = do_per_step(step, ir->nstorireout); + + print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? 
fplog : nullptr, + step, t, + eprNORMAL, mdebin, fcd, groups, &(ir->opts), ir->awh); + + if (ir->bPull) + { + pull_print_output(ir->pull_work, step, t); + } + + if (do_per_step(step, ir->nstlog)) + { + if (fflush(fplog) != 0) + { + gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?"); + } + } + } + if (bDoExpanded) + { + /* Have to do this part _after_ outputting the logfile and the edr file */ + /* Gets written into the state at the beginning of next loop*/ + state->fep_state = lamnew; + } + /* Print the remaining wall clock time for the run */ + if (MULTIMASTER(cr) && + (do_verbose || gmx_got_usr_signal()) && + !bPMETunePrinting) + { + if (shellfc) + { + fprintf(stderr, "\n"); + } + print_time(stderr, walltime_accounting, step, ir, cr); + } + + /* Ion/water position swapping. + * Not done in last step since trajectory writing happens before this call + * in the MD loop and exchanges would be lost anyway. */ + bNeedRepartition = FALSE; + if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && + do_per_step(step, ir->swap->nstswap)) + { + bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle, + bRerunMD ? rerun_fr.x : as_rvec_array(state->x.data()), + bRerunMD ? rerun_fr.box : state->box, + MASTER(cr) && mdrunOptions.verbose, + bRerunMD); + + if (bNeedRepartition && DOMAINDECOMP(cr)) + { + dd_collect_state(cr->dd, state, state_global); + } + } + + /* Replica exchange */ + bExchanged = FALSE; + if (bDoReplEx) + { + bExchanged = replica_exchange(fplog, cr, repl_ex, + state_global, enerd, + state, step, t); + } + + if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) ) + { + dd_partition_system(fplog, step, cr, TRUE, 1, + state_global, top_global, ir, + state, &f, mdAtoms, top, fr, + vsite, constr, + nrnb, wcycle, FALSE); + shouldCheckNumberOfBondedInteractions = true; + update_realloc(upd, state->natoms); + } + + bFirstStep = FALSE; + bInitStep = FALSE; + startingFromCheckpoint = false; + + /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */ + /* With all integrators, except VV, we need to retain the pressure + * at the current step for coupling at the next step. + */ + if ((state->flags & (1<<estPRES_PREV)) && + (bGStatEveryStep || + (ir->nstpcouple > 0 && step % ir->nstpcouple == 0))) + { + /* Store the pressure in t_state for pressure coupling + * at the next MD step. + */ + copy_mat(pres, state->pres_prev); + } + + /* ####### END SET VARIABLES FOR NEXT ITERATION ###### */ + + if ( (membed != nullptr) && (!bLastStep) ) + { + rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data())); + } + + if (bRerunMD) + { + if (MASTER(cr)) + { + /* read next frame from input trajectory */ + bLastStep = !read_next_frame(oenv, status, &rerun_fr); + } + + if (PAR(cr)) + { + rerun_parallel_comm(cr, &rerun_fr, &bLastStep); + } + } + + cycles = wallcycle_stop(wcycle, ewcSTEP); + if (DOMAINDECOMP(cr) && wcycle) + { + dd_cycles_add(cr->dd, cycles, ddCyclStep); + } + + if (!bRerunMD || !rerun_fr.bStep) + { + /* increase the MD step number */ + step++; + step_rel++; + } + + /* TODO make a counter-reset module */ + /* If it is time to reset counters, set a flag that remains + true until counters actually get reset */ + if (step_rel == wcycle_get_reset_counters(wcycle) || + signals[eglsRESETCOUNTERS].set != 0) + { + if (pme_loadbal_is_active(pme_loadbal)) + { + /* Do not permit counter reset while PME load + * balancing is active. 
The only purpose for resetting + * counters is to measure reliable performance data, + * and that can't be done before balancing + * completes. + * + * TODO consider fixing this by delaying the reset + * until after load balancing completes, + * e.g. https://gerrit.gromacs.org/#/c/4964/2 */ + gmx_fatal(FARGS, "PME tuning was still active when attempting to " + "reset mdrun counters at step %" GMX_PRId64 ". Try " + "resetting counters later in the run, e.g. with gmx " + "mdrun -resetstep.", step); + } + reset_all_counters(fplog, mdlog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting, + use_GPU(fr->nbv) ? fr->nbv : nullptr, fr->pmedata); + wcycle_set_reset_counters(wcycle, -1); + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell our PME node to reset its counters */ + gmx_pme_send_resetcounters(cr, step); + } + /* Correct max_hours for the elapsed time */ + max_hours -= elapsed_time/(60.0*60.0); + /* If mdrun -maxh -resethway was active, it can only trigger once */ + bResetCountersHalfMaxH = FALSE; /* TODO move this to where signals[eglsRESETCOUNTERS].sig is set */ + /* Reset can only happen once, so clear the triggering flag. */ + signals[eglsRESETCOUNTERS].set = 0; + } + + /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */ + IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle); + + } + /* End of main MD loop */ + + /* Closing TNG files can include compressing data. Therefore it is good to do that + * before stopping the time measurements. */ + mdoutf_tng_close(outf); + + /* Stop measuring walltime */ + walltime_accounting_end(walltime_accounting); + + if (bRerunMD && MASTER(cr)) + { + close_trx(status); + } + + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + if (MASTER(cr)) + { + if (ir->nstcalcenergy > 0 && !bRerunMD) + { + print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t, + eprAVER, mdebin, fcd, groups, &(ir->opts), ir->awh); + } + } + // TODO Enable the next line when merging to master branch + // done_ebin(mdebin->ebin); + done_mdoutf(outf); + + if (bPMETune) + { + pme_loadbal_done(pme_loadbal, fplog, mdlog, use_GPU(fr->nbv)); + } + + done_shellfc(fplog, shellfc, step_rel); + + if (useReplicaExchange && MASTER(cr)) + { + print_replica_exchange_statistics(fplog, repl_ex); + } + + if (ir->bDoAwh) + { + delete ir->awh; + } + + // Clean up swapcoords + if (ir->eSwapCoords != eswapNO) + { + finish_swapcoords(ir->swap); + } + + /* Do essential dynamics cleanup if needed. Close .edo file */ + done_ed(&ed); + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(ir->bIMD, ir->imd); + + walltime_accounting_set_nsteps_done(walltime_accounting, step_rel); + if (step_rel >= wcycle_get_reset_counters(wcycle) && + signals[eglsRESETCOUNTERS].set == 0 && + !bResetCountersHalfMaxH) + { + walltime_accounting_set_valid_finish(walltime_accounting); + } + + return 0; +} diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/mdrun.cpp b/patches/gromacs-2018.1.diff/src/programs/mdrun/mdrun.cpp new file mode 100644 index 0000000000000000000000000000000000000000..31ce2bc9c914be0f812a121b538c5ae74137d95c --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/mdrun.cpp @@ -0,0 +1,586 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. 
+ * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \defgroup module_mdrun Implementation of mdrun + * \ingroup group_mdrun + * + * \brief This module contains code that implements mdrun. + */ +/*! \internal \file + * + * \brief This file implements mdrun + * + * \author Berk Hess <hess@kth.se> + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \author Erik Lindahl <erik@kth.se> + * \author Mark Abraham <mark.j.abraham@gmail.com> + * + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "config.h" + +#include <cstdio> +#include <cstdlib> +#include <cstring> + +#include "gromacs/commandline/filenm.h" +#include "gromacs/commandline/pargs.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/mdlib/main.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdrunutility/handlerestart.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/utility/arraysize.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/smalloc.h" + +#include "mdrun_main.h" +#include "repl_ex.h" +#include "runner.h" + +/* PLUMED */ +#include "../../../Plumed.h" +extern int plumedswitch; +extern plumed plumedmain; +extern void(*plumedcmd)(plumed,const char*,const void*); +/* END PLUMED */ + +/* PLUMED HREX */ +int plumed_hrex; +/* END PLUMED HREX */ + +/*! \brief Return whether either of the command-line parameters that + * will trigger a multi-simulation is set */ +static bool is_multisim_option_set(int argc, const char *const argv[]) +{ + for (int i = 0; i < argc; ++i) + { + if (strcmp(argv[i], "-multi") == 0 || strcmp(argv[i], "-multidir") == 0) + { + return true; + } + } + return false; +} + +//! 
Implements C-style main function for mdrun +int gmx_mdrun(int argc, char *argv[]) +{ + gmx::Mdrunner runner; + return runner.mainFunction(argc, argv); +} + +namespace gmx +{ + +int Mdrunner::mainFunction(int argc, char *argv[]) +{ + const char *desc[] = { + "[THISMODULE] is the main computational chemistry engine", + "within GROMACS. Obviously, it performs Molecular Dynamics simulations,", + "but it can also perform Stochastic Dynamics, Energy Minimization,", + "test particle insertion or (re)calculation of energies.", + "Normal mode analysis is another option. In this case [TT]mdrun[tt]", + "builds a Hessian matrix from single conformation.", + "For usual Normal Modes-like calculations, make sure that", + "the structure provided is properly energy-minimized.", + "The generated matrix can be diagonalized by [gmx-nmeig].[PAR]", + "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])", + "and distributes the topology over ranks if needed.", + "[TT]mdrun[tt] produces at least four output files.", + "A single log file ([TT]-g[tt]) is written.", + "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and", + "optionally forces.", + "The structure file ([TT]-c[tt]) contains the coordinates and", + "velocities of the last step.", + "The energy file ([TT]-e[tt]) contains energies, the temperature,", + "pressure, etc, a lot of these things are also printed in the log file.", + "Optionally coordinates can be written to a compressed trajectory file", + "([TT]-x[tt]).[PAR]", + "The option [TT]-dhdl[tt] is only used when free energy calculation is", + "turned on.[PAR]", + "Running mdrun efficiently in parallel is a complex topic topic,", + "many aspects of which are covered in the online User Guide. You", + "should look there for practical advice on using many of the options", + "available in mdrun.[PAR]", + "ED (essential dynamics) sampling and/or additional flooding potentials", + "are switched on by using the [TT]-ei[tt] flag followed by an [REF].edi[ref]", + "file. The [REF].edi[ref] file can be produced with the [TT]make_edi[tt] tool", + "or by using options in the essdyn menu of the WHAT IF program.", + "[TT]mdrun[tt] produces a [REF].xvg[ref] output file that", + "contains projections of positions, velocities and forces onto selected", + "eigenvectors.[PAR]", + "When user-defined potential functions have been selected in the", + "[REF].mdp[ref] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]", + "a formatted table with potential functions. The file is read from", + "either the current directory or from the [TT]GMXLIB[tt] directory.", + "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,", + "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with", + "normal Coulomb.", + "When pair interactions are present, a separate table for pair interaction", + "functions is read using the [TT]-tablep[tt] option.[PAR]", + "When tabulated bonded functions are present in the topology,", + "interaction functions are read using the [TT]-tableb[tt] option.", + "For each different tabulated interaction type used, a table file name must", + "be given. For the topology to work, a file name given here must match a", + "character sequence before the file extension. 
That sequence is: an underscore,", + "then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals,", + "and finally the matching table number index used in the topology.[PAR]", + "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM", + "coordinates and forces when pulling is selected", + "in the [REF].mdp[ref] file.[PAR]", + "Finally some experimental algorithms can be tested when the", + "appropriate options have been given. Currently under", + "investigation are: polarizability.", + "[PAR]", + "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed", + "a protein into a membrane. This module requires a number of settings", + "that are provided in a data file that is the argument of this option.", + "For more details in membrane embedding, see the documentation in the", + "user guide. The options [TT]-mn[tt] and [TT]-mp[tt] are used to provide", + "the index and topology files used for the embedding.", + "[PAR]", + "The option [TT]-pforce[tt] is useful when you suspect a simulation", + "crashes due to too large forces. With this option coordinates and", + "forces of atoms with a force larger than a certain value will", + "be printed to stderr. It will also terminate the run when non-finite", + "forces are present.", + "[PAR]", + "Checkpoints containing the complete state of the system are written", + "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],", + "unless option [TT]-cpt[tt] is set to -1.", + "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to", + "make sure that a recent state of the system is always available,", + "even when the simulation is terminated while writing a checkpoint.", + "With [TT]-cpnum[tt] all checkpoint files are kept and appended", + "with the step number.", + "A simulation can be continued by reading the full state from file", + "with option [TT]-cpi[tt]. This option is intelligent in the way that", + "if no checkpoint file is found, GROMACS just assumes a normal run and", + "starts from the first step of the [REF].tpr[ref] file. By default the output", + "will be appending to the existing output files. The checkpoint file", + "contains checksums of all output files, such that you will never", + "loose data when some output files are modified, corrupt or removed.", + "There are three scenarios with [TT]-cpi[tt]:[PAR]", + "[TT]*[tt] no files with matching names are present: new output files are written[PAR]", + "[TT]*[tt] all files are present with names and checksums matching those stored", + "in the checkpoint file: files are appended[PAR]", + "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]", + "With [TT]-noappend[tt] new output files are opened and the simulation", + "part number is added to all output file names.", + "Note that in all cases the checkpoint file itself is not renamed", + "and will be overwritten, unless its name does not match", + "the [TT]-cpo[tt] option.", + "[PAR]", + "With checkpointing the output is appended to previously written", + "output files, unless [TT]-noappend[tt] is used or none of the previous", + "output files are present (except for the checkpoint file).", + "The integrity of the files to be appended is verified using checksums", + "which are stored in the checkpoint file. This ensures that output can", + "not be mixed up or corrupted due to file appending. 
When only some", + "of the previous output files are present, a fatal error is generated", + "and no old output files are modified and no new output files are opened.", + "The result with appending will be the same as from a single run.", + "The contents will be binary identical, unless you use a different number", + "of ranks or dynamic load balancing or the FFT library uses optimizations", + "through timing.", + "[PAR]", + "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint", + "file is written at the first neighbor search step where the run time", + "exceeds [TT]-maxh[tt]\\*0.99 hours. This option is particularly useful in", + "combination with setting [TT]nsteps[tt] to -1 either in the mdp or using the", + "similarly named command line option. This results in an infinite run,", + "terminated only when the time limit set by [TT]-maxh[tt] is reached (if any)" + "or upon receiving a signal." + "[PAR]", + "When [TT]mdrun[tt] receives a TERM or INT signal (e.g. when ctrl+C is", + "pressed), it will stop at the next neighbor search step or at the", + "second global communication step, whichever happens later.", + "When [TT]mdrun[tt] receives a second TERM or INT signal and", + "reproducibility is not requested, it will stop at the first global", + "communication step.", + "In both cases all the usual output will be written to file and", + "a checkpoint file is written at the last step.", + "When [TT]mdrun[tt] receives an ABRT signal or the third TERM or INT signal,", + "it will abort directly without writing a new checkpoint file.", + "When running with MPI, a signal to one of the [TT]mdrun[tt] ranks", + "is sufficient, this signal should not be sent to mpirun or", + "the [TT]mdrun[tt] process that is the parent of the others.", + "[PAR]", + "Interactive molecular dynamics (IMD) can be activated by using at least one", + "of the three IMD switches: The [TT]-imdterm[tt] switch allows one to terminate", + "the simulation from the molecular viewer (e.g. VMD). With [TT]-imdwait[tt],", + "[TT]mdrun[tt] pauses whenever no IMD client is connected. Pulling from the", + "IMD remote can be turned on by [TT]-imdpull[tt].", + "The port [TT]mdrun[tt] listens to can be altered by [TT]-imdport[tt].The", + "file pointed to by [TT]-if[tt] contains atom indices and forces if IMD", + "pulling is used." + "[PAR]", + "When [TT]mdrun[tt] is started with MPI, it does not run niced by default." 
+ }; + + /* Command line options */ + rvec realddxyz = {0, 0, 0}; + const char *ddrank_opt_choices[static_cast<int>(DdRankOrder::nr)+1] = + { nullptr, "interleave", "pp_pme", "cartesian", nullptr }; + const char *dddlb_opt_choices[static_cast<int>(DlbOption::nr)+1] = + { nullptr, "auto", "no", "yes", nullptr }; + const char *thread_aff_opt_choices[threadaffNR+1] = + { nullptr, "auto", "on", "off", nullptr }; + const char *nbpu_opt_choices[] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + const char *pme_opt_choices[] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + const char *pme_fft_opt_choices[] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + gmx_bool bTryToAppendFiles = TRUE; + const char *gpuIdsAvailable = ""; + const char *userGpuTaskAssignment = ""; + + ImdOptions &imdOptions = mdrunOptions.imdOptions; + + t_pargs pa[] = { + + { "-dd", FALSE, etRVEC, {&realddxyz}, + "Domain decomposition grid, 0 is optimize" }, + { "-ddorder", FALSE, etENUM, {ddrank_opt_choices}, + "DD rank order" }, + { "-npme", FALSE, etINT, {&domdecOptions.numPmeRanks}, + "Number of separate ranks to be used for PME, -1 is guess" }, + { "-nt", FALSE, etINT, {&hw_opt.nthreads_tot}, + "Total number of threads to start (0 is guess)" }, + { "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi}, + "Number of thread-MPI ranks to start (0 is guess)" }, + { "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp}, + "Number of OpenMP threads per MPI rank to start (0 is guess)" }, + { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme}, + "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" }, + { "-pin", FALSE, etENUM, {thread_aff_opt_choices}, + "Whether mdrun should try to set thread affinities" }, + { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset}, + "The lowest logical core number to which mdrun should pin the first thread" }, + { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride}, + "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" }, + { "-gpu_id", FALSE, etSTR, {&gpuIdsAvailable}, + "List of unique GPU device IDs available to use" }, + { "-gputasks", FALSE, etSTR, {&userGpuTaskAssignment}, + "List of GPU device IDs, mapping each PP task on each node to a device" }, + { "-ddcheck", FALSE, etBOOL, {&domdecOptions.checkBondedInteractions}, + "Check for all bonded interactions with DD" }, + { "-ddbondcomm", FALSE, etBOOL, {&domdecOptions.useBondedCommunication}, + "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" }, + { "-rdd", FALSE, etREAL, {&domdecOptions.minimumCommunicationRange}, + "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" }, + { "-rcon", FALSE, etREAL, {&domdecOptions.constraintCommunicationRange}, + "Maximum distance for P-LINCS (nm), 0 is estimate" }, + { "-dlb", FALSE, etENUM, {dddlb_opt_choices}, + "Dynamic load balancing (with DD)" }, + { "-dds", FALSE, etREAL, {&domdecOptions.dlbScaling}, + "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to " + "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." }, + { "-ddcsx", FALSE, etSTR, {&domdecOptions.cellSizeX}, + "HIDDENA string containing a vector of the relative sizes in the x " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." 
}, + { "-ddcsy", FALSE, etSTR, {&domdecOptions.cellSizeY}, + "HIDDENA string containing a vector of the relative sizes in the y " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." }, + { "-ddcsz", FALSE, etSTR, {&domdecOptions.cellSizeZ}, + "HIDDENA string containing a vector of the relative sizes in the z " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." }, + { "-gcom", FALSE, etINT, {&mdrunOptions.globalCommunicationInterval}, + "Global communication frequency" }, + { "-nb", FALSE, etENUM, {nbpu_opt_choices}, + "Calculate non-bonded interactions on" }, + { "-nstlist", FALSE, etINT, {&nstlist_cmdline}, + "Set nstlist when using a Verlet buffer tolerance (0 is guess)" }, + { "-tunepme", FALSE, etBOOL, {&mdrunOptions.tunePme}, + "Optimize PME load between PP/PME ranks or GPU/CPU (only with the Verlet cut-off scheme)" }, + { "-pme", FALSE, etENUM, {pme_opt_choices}, + "Perform PME calculations on" }, + { "-pmefft", FALSE, etENUM, {pme_fft_opt_choices}, + "Perform PME FFT calculations on" }, + { "-v", FALSE, etBOOL, {&mdrunOptions.verbose}, + "Be loud and noisy" }, + { "-pforce", FALSE, etREAL, {&pforce}, + "Print all forces larger than this (kJ/mol nm)" }, + { "-reprod", FALSE, etBOOL, {&mdrunOptions.reproducible}, + "Try to avoid optimizations that affect binary reproducibility" }, + { "-cpt", FALSE, etREAL, {&mdrunOptions.checkpointOptions.period}, + "Checkpoint interval (minutes)" }, + { "-cpnum", FALSE, etBOOL, {&mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles}, + "Keep and number checkpoint files" }, + { "-append", FALSE, etBOOL, {&bTryToAppendFiles}, + "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" }, + { "-nsteps", FALSE, etINT64, {&mdrunOptions.numStepsCommandline}, + "Run this number of steps, overrides .mdp file option (-1 means infinite, -2 means use mdp option, smaller is invalid)" }, + { "-maxh", FALSE, etREAL, {&mdrunOptions.maximumHoursToRun}, + "Terminate after 0.99 times this time (hours)" }, + { "-multi", FALSE, etINT, {&nmultisim}, + "Do multiple simulations in parallel" }, + { "-replex", FALSE, etINT, {&replExParams.exchangeInterval}, + "Attempt replica exchange periodically with this period (steps)" }, + { "-nex", FALSE, etINT, {&replExParams.numExchanges}, + "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). -nex zero or not specified gives neighbor replica exchange." 
}, + { "-reseed", FALSE, etINT, {&replExParams.randomSeed}, + "Seed for replica exchange, -1 is generate a seed" }, + { "-hrex", FALSE, etBOOL, {&plumed_hrex}, /* PLUMED HREX */ + "Enable hamiltonian replica exchange" }, + { "-imdport", FALSE, etINT, {&imdOptions.port}, + "HIDDENIMD listening port" }, + { "-imdwait", FALSE, etBOOL, {&imdOptions.wait}, + "HIDDENPause the simulation while no IMD client is connected" }, + { "-imdterm", FALSE, etBOOL, {&imdOptions.terminatable}, + "HIDDENAllow termination of the simulation from IMD client" }, + { "-imdpull", FALSE, etBOOL, {&imdOptions.pull}, + "HIDDENAllow pulling in the simulation from IMD client" }, + { "-rerunvsite", FALSE, etBOOL, {&mdrunOptions.rerunConstructVsites}, + "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" }, + { "-confout", FALSE, etBOOL, {&mdrunOptions.writeConfout}, + "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" }, + { "-stepout", FALSE, etINT, {&mdrunOptions.verboseStepPrintInterval}, + "HIDDENFrequency of writing the remaining wall clock time for the run" }, + { "-resetstep", FALSE, etINT, {&mdrunOptions.timingOptions.resetStep}, + "HIDDENReset cycle counters after these many time steps" }, + { "-resethway", FALSE, etBOOL, {&mdrunOptions.timingOptions.resetHalfway}, + "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" } + }; + int rc; + char **multidir = nullptr; + + cr = init_commrec(); + + unsigned long PCA_Flags = PCA_CAN_SET_DEFFNM; + // With -multi or -multidir, the file names are going to get processed + // further (or the working directory changed), so we can't check for their + // existence during parsing. It isn't useful to do any completion based on + // file system contents, either. + if (is_multisim_option_set(argc, argv)) + { + PCA_Flags |= PCA_DISABLE_INPUT_FILE_CHECKING; + } + + /* Comment this in to do fexist calls only on master + * works not with rerun or tables at the moment + * also comment out the version of init_forcerec in md.c + * with NULL instead of opt2fn + */ + /* + if (!MASTER(cr)) + { + PCA_Flags |= PCA_NOT_READ_NODE; + } + */ + + if (!parse_common_args(&argc, argv, PCA_Flags, nfile, fnm, asize(pa), pa, + asize(desc), desc, 0, nullptr, &oenv)) + { + sfree(cr); + return 0; + } + + // Handle the options that permits the user to either declare + // which compatible GPUs are availble for use, or to select a GPU + // task assignment. Either could be in an environment variable (so + // that there is a way to customize it, when using MPI in + // heterogeneous contexts). + { + // TODO Argument parsing can't handle std::string. We should + // fix that by changing the parsing, once more of the roles of + // handling, validating and implementing defaults for user + // command-line options have been seperated. 
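+        // The assignments below first take the command-line values; the
+        // GMX_GPU_ID and GMX_GPUTASKS environment variables may then override
+        // -gpu_id and -gputasks respectively, and gmx_fatal() rejects the
+        // ambiguous cases: a flag combined with its own environment variable,
+        // or -gpu_id combined with -gputasks.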
+ hw_opt.gpuIdsAvailable = gpuIdsAvailable; + hw_opt.userGpuTaskAssignment = userGpuTaskAssignment; + + const char *env = getenv("GMX_GPU_ID"); + if (env != nullptr) + { + if (!hw_opt.gpuIdsAvailable.empty()) + { + gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time"); + } + hw_opt.gpuIdsAvailable = env; + } + + env = getenv("GMX_GPUTASKS"); + if (env != nullptr) + { + if (!hw_opt.userGpuTaskAssignment.empty()) + { + gmx_fatal(FARGS, "GMX_GPUTASKS and -gputasks can not be used at the same time"); + } + hw_opt.userGpuTaskAssignment = env; + } + + if (!hw_opt.gpuIdsAvailable.empty() && !hw_opt.userGpuTaskAssignment.empty()) + { + gmx_fatal(FARGS, "-gpu_id and -gputasks cannot be used at the same time"); + } + } + + hw_opt.thread_affinity = nenum(thread_aff_opt_choices); + + /* now check the -multi and -multidir option */ + if (opt2bSet("-multidir", nfile, fnm)) + { + if (nmultisim > 0) + { + gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive."); + } + nmultisim = opt2fns(&multidir, "-multidir", nfile, fnm); + } + + + if (replExParams.exchangeInterval != 0 && nmultisim < 2) + { + gmx_fatal(FARGS, "Need at least two replicas for replica exchange (option -multidir)"); + } + + if (replExParams.numExchanges < 0) + { + gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive"); + } + + if (nmultisim >= 1) + { +#if !GMX_THREAD_MPI + init_multisystem(cr, nmultisim, multidir, nfile, fnm); +#else + gmx_fatal(FARGS, "mdrun -multi or -multidir are not supported with the thread-MPI library. " + "Please compile GROMACS with a proper external MPI library."); +#endif + } + + if (!opt2bSet("-cpi", nfile, fnm)) + { + // If we are not starting from a checkpoint we never allow files to be appended + // to, since that has caused a ton of strange behaviour and bugs in the past. + if (opt2parg_bSet("-append", asize(pa), pa)) + { + // If the user explicitly used the -append option, explain that it is not possible. + gmx_fatal(FARGS, "GROMACS can only append to files when restarting from a checkpoint."); + } + else + { + // If the user did not say anything explicit, just disable appending. + bTryToAppendFiles = FALSE; + } + } + + ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + + continuationOptions.appendFilesOptionSet = opt2parg_bSet("-append", asize(pa), pa); + + handleRestart(cr, bTryToAppendFiles, nfile, fnm, &continuationOptions.appendFiles, &continuationOptions.startedFromCheckpoint); + + mdrunOptions.rerun = opt2bSet("-rerun", nfile, fnm); + mdrunOptions.ntompOptionIsSet = opt2parg_bSet("-ntomp", asize(pa), pa); + + /* We postpone opening the log file if we are appending, so we can + first truncate the old log file and append to the correct position + there instead. 
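+       When appending, fplog therefore stays nullptr here: mdrunner() opens
+       the log itself and also closes it, which is why gmx_log_close() below
+       is only called when not appending.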
*/ + if (MASTER(cr) && !continuationOptions.appendFiles) + { + gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, + continuationOptions.appendFiles, &fplog); + } + else + { + fplog = nullptr; + } + + domdecOptions.rankOrder = static_cast<DdRankOrder>(nenum(ddrank_opt_choices)); + domdecOptions.dlbOption = static_cast<DlbOption>(nenum(dddlb_opt_choices)); + domdecOptions.numCells[XX] = (int)(realddxyz[XX] + 0.5); + domdecOptions.numCells[YY] = (int)(realddxyz[YY] + 0.5); + domdecOptions.numCells[ZZ] = (int)(realddxyz[ZZ] + 0.5); + + nbpu_opt = nbpu_opt_choices[0]; + pme_opt = pme_opt_choices[0]; + pme_fft_opt = pme_fft_opt_choices[0]; + + /* PLUMED */ + plumedswitch=0; + if (opt2bSet("-plumed",nfile,fnm)) plumedswitch=1; + if(plumedswitch){ + plumedcmd=plumed_cmd; + int real_precision=sizeof(real); + real energyUnits=1.0; + real lengthUnits=1.0; + real timeUnits=1.0; + + if(!plumed_installed()){ + gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable."); + } + plumedmain=plumed_create(); + plumed_cmd(plumedmain,"setRealPrecision",&real_precision); + // this is not necessary for gromacs units: + plumed_cmd(plumedmain,"setMDEnergyUnits",&energyUnits); + plumed_cmd(plumedmain,"setMDLengthUnits",&lengthUnits); + plumed_cmd(plumedmain,"setMDTimeUnits",&timeUnits); + // + plumed_cmd(plumedmain,"setPlumedDat",ftp2fn(efDAT,nfile,fnm)); + plumedswitch=1; + } + /* PLUMED HREX */ + if(getenv("PLUMED_HREX")) plumed_hrex=1; + if(plumed_hrex){ + if(!plumedswitch) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) requires -plumed"); + if(replExParams.exchangeInterval==0) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) requires replica exchange (option -replex)"); + if(replExParams.numExchanges!=0) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) not compatible with -nex"); + } + /* END PLUMED HREX */ + + /* END PLUMED */ + + rc = mdrunner(); + + /* Log file has to be closed in mdrunner if we are appending to it + (fplog not set here) */ + if (MASTER(cr) && !continuationOptions.appendFiles) + { + gmx_log_close(fplog); + } + + return rc; +} + +} // namespace diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/mdrun.cpp.preplumed b/patches/gromacs-2018.1.diff/src/programs/mdrun/mdrun.cpp.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..772aabda3833f87e24f6b67a76891308ec3c7ac9 --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/mdrun.cpp.preplumed @@ -0,0 +1,540 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \defgroup module_mdrun Implementation of mdrun + * \ingroup group_mdrun + * + * \brief This module contains code that implements mdrun. + */ +/*! \internal \file + * + * \brief This file implements mdrun + * + * \author Berk Hess <hess@kth.se> + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \author Erik Lindahl <erik@kth.se> + * \author Mark Abraham <mark.j.abraham@gmail.com> + * + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "config.h" + +#include <cstdio> +#include <cstdlib> +#include <cstring> + +#include "gromacs/commandline/filenm.h" +#include "gromacs/commandline/pargs.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/mdlib/main.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdrunutility/handlerestart.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/utility/arraysize.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/smalloc.h" + +#include "mdrun_main.h" +#include "repl_ex.h" +#include "runner.h" + +/*! \brief Return whether either of the command-line parameters that + * will trigger a multi-simulation is set */ +static bool is_multisim_option_set(int argc, const char *const argv[]) +{ + for (int i = 0; i < argc; ++i) + { + if (strcmp(argv[i], "-multi") == 0 || strcmp(argv[i], "-multidir") == 0) + { + return true; + } + } + return false; +} + +//! Implements C-style main function for mdrun +int gmx_mdrun(int argc, char *argv[]) +{ + gmx::Mdrunner runner; + return runner.mainFunction(argc, argv); +} + +namespace gmx +{ + +int Mdrunner::mainFunction(int argc, char *argv[]) +{ + const char *desc[] = { + "[THISMODULE] is the main computational chemistry engine", + "within GROMACS. Obviously, it performs Molecular Dynamics simulations,", + "but it can also perform Stochastic Dynamics, Energy Minimization,", + "test particle insertion or (re)calculation of energies.", + "Normal mode analysis is another option. 
In this case [TT]mdrun[tt]", + "builds a Hessian matrix from single conformation.", + "For usual Normal Modes-like calculations, make sure that", + "the structure provided is properly energy-minimized.", + "The generated matrix can be diagonalized by [gmx-nmeig].[PAR]", + "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])", + "and distributes the topology over ranks if needed.", + "[TT]mdrun[tt] produces at least four output files.", + "A single log file ([TT]-g[tt]) is written.", + "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and", + "optionally forces.", + "The structure file ([TT]-c[tt]) contains the coordinates and", + "velocities of the last step.", + "The energy file ([TT]-e[tt]) contains energies, the temperature,", + "pressure, etc, a lot of these things are also printed in the log file.", + "Optionally coordinates can be written to a compressed trajectory file", + "([TT]-x[tt]).[PAR]", + "The option [TT]-dhdl[tt] is only used when free energy calculation is", + "turned on.[PAR]", + "Running mdrun efficiently in parallel is a complex topic topic,", + "many aspects of which are covered in the online User Guide. You", + "should look there for practical advice on using many of the options", + "available in mdrun.[PAR]", + "ED (essential dynamics) sampling and/or additional flooding potentials", + "are switched on by using the [TT]-ei[tt] flag followed by an [REF].edi[ref]", + "file. The [REF].edi[ref] file can be produced with the [TT]make_edi[tt] tool", + "or by using options in the essdyn menu of the WHAT IF program.", + "[TT]mdrun[tt] produces a [REF].xvg[ref] output file that", + "contains projections of positions, velocities and forces onto selected", + "eigenvectors.[PAR]", + "When user-defined potential functions have been selected in the", + "[REF].mdp[ref] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]", + "a formatted table with potential functions. The file is read from", + "either the current directory or from the [TT]GMXLIB[tt] directory.", + "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,", + "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with", + "normal Coulomb.", + "When pair interactions are present, a separate table for pair interaction", + "functions is read using the [TT]-tablep[tt] option.[PAR]", + "When tabulated bonded functions are present in the topology,", + "interaction functions are read using the [TT]-tableb[tt] option.", + "For each different tabulated interaction type used, a table file name must", + "be given. For the topology to work, a file name given here must match a", + "character sequence before the file extension. That sequence is: an underscore,", + "then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals,", + "and finally the matching table number index used in the topology.[PAR]", + "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM", + "coordinates and forces when pulling is selected", + "in the [REF].mdp[ref] file.[PAR]", + "Finally some experimental algorithms can be tested when the", + "appropriate options have been given. Currently under", + "investigation are: polarizability.", + "[PAR]", + "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed", + "a protein into a membrane. This module requires a number of settings", + "that are provided in a data file that is the argument of this option.", + "For more details in membrane embedding, see the documentation in the", + "user guide. 
+        "user guide. The options [TT]-mn[tt] and [TT]-mp[tt] are used to provide",
+        "the index and topology files used for the embedding.",
+        "[PAR]",
+        "The option [TT]-pforce[tt] is useful when you suspect that a simulation",
+        "crashes due to excessively large forces. With this option coordinates and",
+        "forces of atoms with a force larger than a certain value will",
+        "be printed to stderr. It will also terminate the run when non-finite",
+        "forces are present.",
+        "[PAR]",
+        "Checkpoints containing the complete state of the system are written",
+        "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],",
+        "unless option [TT]-cpt[tt] is set to -1.",
+        "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to",
+        "make sure that a recent state of the system is always available,",
+        "even when the simulation is terminated while writing a checkpoint.",
+        "With [TT]-cpnum[tt] all checkpoint files are kept and appended",
+        "with the step number.",
+        "A simulation can be continued by reading the full state from file",
+        "with option [TT]-cpi[tt]. If no checkpoint file is found, GROMACS",
+        "just assumes a normal run and starts from the first step of the",
+        "[REF].tpr[ref] file. By default the output will be appended to the",
+        "existing output files. The checkpoint file contains checksums of all",
+        "output files, such that you will never lose data when some output",
+        "files are modified, corrupt or removed.",
+        "There are three scenarios with [TT]-cpi[tt]:[PAR]",
+        "[TT]*[tt] no files with matching names are present: new output files are written[PAR]",
+        "[TT]*[tt] all files are present with names and checksums matching those stored",
+        "in the checkpoint file: files are appended[PAR]",
+        "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]",
+        "With [TT]-noappend[tt] new output files are opened and the simulation",
+        "part number is added to all output file names.",
+        "Note that in all cases the checkpoint file itself is not renamed",
+        "and will be overwritten, unless its name does not match",
+        "the [TT]-cpo[tt] option.",
+        "[PAR]",
+        "With checkpointing the output is appended to previously written",
+        "output files, unless [TT]-noappend[tt] is used or none of the previous",
+        "output files are present (except for the checkpoint file).",
+        "The integrity of the files to be appended is verified using checksums",
+        "which are stored in the checkpoint file. This ensures that output",
+        "cannot be mixed up or corrupted due to file appending. When only some",
+        "of the previous output files are present, a fatal error is generated:",
+        "no old output files are modified and no new output files are opened.",
+        "The result with appending will be the same as from a single run.",
+        "The contents will be binary identical, unless you use a different number",
+        "of ranks or dynamic load balancing or the FFT library uses optimizations",
+        "through timing.",
+        "[PAR]",
+        "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint",
+        "file is written at the first neighbor search step where the run time",
+        "exceeds [TT]-maxh[tt]\\*0.99 hours. This option is particularly useful in",
+        "combination with setting [TT]nsteps[tt] to -1 either in the mdp or using the",
+        "similarly named command line option. This results in an infinite run,",
+        "terminated only when the time limit set by [TT]-maxh[tt] is reached (if any),",
+        "or upon receiving a signal.",
+        "[PAR]",
+        "When [TT]mdrun[tt] receives a TERM or INT signal (e.g. when ctrl+C is",
+        "pressed), it will stop at the next neighbor search step or at the",
+        "second global communication step, whichever happens later.",
+        "When [TT]mdrun[tt] receives a second TERM or INT signal and",
+        "reproducibility is not requested, it will stop at the first global",
+        "communication step.",
+        "In both cases all the usual output will be written to file and",
+        "a checkpoint file is written at the last step.",
+        "When [TT]mdrun[tt] receives an ABRT signal or the third TERM or INT signal,",
+        "it will abort directly without writing a new checkpoint file.",
+        "When running with MPI, a signal to one of the [TT]mdrun[tt] ranks",
+        "is sufficient; this signal should not be sent to mpirun or",
+        "the [TT]mdrun[tt] process that is the parent of the others.",
+        "[PAR]",
+        "Interactive molecular dynamics (IMD) can be activated by using at least one",
+        "of the three IMD switches: the [TT]-imdterm[tt] switch allows one to terminate",
+        "the simulation from the molecular viewer (e.g. VMD). With [TT]-imdwait[tt],",
+        "[TT]mdrun[tt] pauses whenever no IMD client is connected. Pulling from the",
+        "IMD remote can be turned on by [TT]-imdpull[tt].",
+        "The port [TT]mdrun[tt] listens to can be altered by [TT]-imdport[tt]. The",
+        "file pointed to by [TT]-if[tt] contains atom indices and forces if IMD",
+        "pulling is used.",
+        "[PAR]",
+        "When [TT]mdrun[tt] is started with MPI, it does not run niced by default."
+    };
+
+    /* Command line options */
+    rvec              realddxyz = {0, 0, 0};
+    const char       *ddrank_opt_choices[static_cast<int>(DdRankOrder::nr)+1] =
+    { nullptr, "interleave", "pp_pme", "cartesian", nullptr };
+    const char       *dddlb_opt_choices[static_cast<int>(DlbOption::nr)+1] =
+    { nullptr, "auto", "no", "yes", nullptr };
+    const char       *thread_aff_opt_choices[threadaffNR+1] =
+    { nullptr, "auto", "on", "off", nullptr };
+    const char       *nbpu_opt_choices[] =
+    { nullptr, "auto", "cpu", "gpu", nullptr };
+    const char       *pme_opt_choices[] =
+    { nullptr, "auto", "cpu", "gpu", nullptr };
+    const char       *pme_fft_opt_choices[] =
+    { nullptr, "auto", "cpu", "gpu", nullptr };
+    gmx_bool          bTryToAppendFiles     = TRUE;
+    const char       *gpuIdsAvailable       = "";
+    const char       *userGpuTaskAssignment = "";
+
+    ImdOptions       &imdOptions = mdrunOptions.imdOptions;
+
+    t_pargs           pa[] = {
+
+        { "-dd",      FALSE, etRVEC, {&realddxyz},
+          "Domain decomposition grid, 0 is optimize" },
+        { "-ddorder", FALSE, etENUM, {ddrank_opt_choices},
+          "DD rank order" },
+        { "-npme",    FALSE, etINT, {&domdecOptions.numPmeRanks},
+          "Number of separate ranks to be used for PME, -1 is guess" },
+        { "-nt",      FALSE, etINT, {&hw_opt.nthreads_tot},
+          "Total number of threads to start (0 is guess)" },
+        { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
+          "Number of thread-MPI ranks to start (0 is guess)" },
+        { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
+          "Number of OpenMP threads per MPI rank to start (0 is guess)" },
+        { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
+          "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" },
+        { "-pin",     FALSE, etENUM, {thread_aff_opt_choices},
+          "Whether mdrun should try to set thread affinities" },
+        { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset},
+          "The lowest logical core number to which mdrun should pin the first thread" },
+        { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
+          "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
+        { "-gpu_id",  FALSE, etSTR, {&gpuIdsAvailable},
use" }, + { "-gputasks", FALSE, etSTR, {&userGpuTaskAssignment}, + "List of GPU device IDs, mapping each PP task on each node to a device" }, + { "-ddcheck", FALSE, etBOOL, {&domdecOptions.checkBondedInteractions}, + "Check for all bonded interactions with DD" }, + { "-ddbondcomm", FALSE, etBOOL, {&domdecOptions.useBondedCommunication}, + "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" }, + { "-rdd", FALSE, etREAL, {&domdecOptions.minimumCommunicationRange}, + "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" }, + { "-rcon", FALSE, etREAL, {&domdecOptions.constraintCommunicationRange}, + "Maximum distance for P-LINCS (nm), 0 is estimate" }, + { "-dlb", FALSE, etENUM, {dddlb_opt_choices}, + "Dynamic load balancing (with DD)" }, + { "-dds", FALSE, etREAL, {&domdecOptions.dlbScaling}, + "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to " + "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." }, + { "-ddcsx", FALSE, etSTR, {&domdecOptions.cellSizeX}, + "HIDDENA string containing a vector of the relative sizes in the x " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." }, + { "-ddcsy", FALSE, etSTR, {&domdecOptions.cellSizeY}, + "HIDDENA string containing a vector of the relative sizes in the y " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." }, + { "-ddcsz", FALSE, etSTR, {&domdecOptions.cellSizeZ}, + "HIDDENA string containing a vector of the relative sizes in the z " + "direction of the corresponding DD cells. Only effective with static " + "load balancing." }, + { "-gcom", FALSE, etINT, {&mdrunOptions.globalCommunicationInterval}, + "Global communication frequency" }, + { "-nb", FALSE, etENUM, {nbpu_opt_choices}, + "Calculate non-bonded interactions on" }, + { "-nstlist", FALSE, etINT, {&nstlist_cmdline}, + "Set nstlist when using a Verlet buffer tolerance (0 is guess)" }, + { "-tunepme", FALSE, etBOOL, {&mdrunOptions.tunePme}, + "Optimize PME load between PP/PME ranks or GPU/CPU (only with the Verlet cut-off scheme)" }, + { "-pme", FALSE, etENUM, {pme_opt_choices}, + "Perform PME calculations on" }, + { "-pmefft", FALSE, etENUM, {pme_fft_opt_choices}, + "Perform PME FFT calculations on" }, + { "-v", FALSE, etBOOL, {&mdrunOptions.verbose}, + "Be loud and noisy" }, + { "-pforce", FALSE, etREAL, {&pforce}, + "Print all forces larger than this (kJ/mol nm)" }, + { "-reprod", FALSE, etBOOL, {&mdrunOptions.reproducible}, + "Try to avoid optimizations that affect binary reproducibility" }, + { "-cpt", FALSE, etREAL, {&mdrunOptions.checkpointOptions.period}, + "Checkpoint interval (minutes)" }, + { "-cpnum", FALSE, etBOOL, {&mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles}, + "Keep and number checkpoint files" }, + { "-append", FALSE, etBOOL, {&bTryToAppendFiles}, + "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" }, + { "-nsteps", FALSE, etINT64, {&mdrunOptions.numStepsCommandline}, + "Run this number of steps, overrides .mdp file option (-1 means infinite, -2 means use mdp option, smaller is invalid)" }, + { "-maxh", FALSE, etREAL, {&mdrunOptions.maximumHoursToRun}, + "Terminate after 0.99 times this time (hours)" }, + { "-multi", FALSE, etINT, {&nmultisim}, + "Do multiple simulations in parallel" }, + { "-replex", FALSE, etINT, 
{&replExParams.exchangeInterval}, + "Attempt replica exchange periodically with this period (steps)" }, + { "-nex", FALSE, etINT, {&replExParams.numExchanges}, + "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). -nex zero or not specified gives neighbor replica exchange." }, + { "-reseed", FALSE, etINT, {&replExParams.randomSeed}, + "Seed for replica exchange, -1 is generate a seed" }, + { "-imdport", FALSE, etINT, {&imdOptions.port}, + "HIDDENIMD listening port" }, + { "-imdwait", FALSE, etBOOL, {&imdOptions.wait}, + "HIDDENPause the simulation while no IMD client is connected" }, + { "-imdterm", FALSE, etBOOL, {&imdOptions.terminatable}, + "HIDDENAllow termination of the simulation from IMD client" }, + { "-imdpull", FALSE, etBOOL, {&imdOptions.pull}, + "HIDDENAllow pulling in the simulation from IMD client" }, + { "-rerunvsite", FALSE, etBOOL, {&mdrunOptions.rerunConstructVsites}, + "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" }, + { "-confout", FALSE, etBOOL, {&mdrunOptions.writeConfout}, + "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" }, + { "-stepout", FALSE, etINT, {&mdrunOptions.verboseStepPrintInterval}, + "HIDDENFrequency of writing the remaining wall clock time for the run" }, + { "-resetstep", FALSE, etINT, {&mdrunOptions.timingOptions.resetStep}, + "HIDDENReset cycle counters after these many time steps" }, + { "-resethway", FALSE, etBOOL, {&mdrunOptions.timingOptions.resetHalfway}, + "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" } + }; + int rc; + char **multidir = nullptr; + + cr = init_commrec(); + + unsigned long PCA_Flags = PCA_CAN_SET_DEFFNM; + // With -multi or -multidir, the file names are going to get processed + // further (or the working directory changed), so we can't check for their + // existence during parsing. It isn't useful to do any completion based on + // file system contents, either. + if (is_multisim_option_set(argc, argv)) + { + PCA_Flags |= PCA_DISABLE_INPUT_FILE_CHECKING; + } + + /* Comment this in to do fexist calls only on master + * works not with rerun or tables at the moment + * also comment out the version of init_forcerec in md.c + * with NULL instead of opt2fn + */ + /* + if (!MASTER(cr)) + { + PCA_Flags |= PCA_NOT_READ_NODE; + } + */ + + if (!parse_common_args(&argc, argv, PCA_Flags, nfile, fnm, asize(pa), pa, + asize(desc), desc, 0, nullptr, &oenv)) + { + sfree(cr); + return 0; + } + + // Handle the options that permits the user to either declare + // which compatible GPUs are availble for use, or to select a GPU + // task assignment. Either could be in an environment variable (so + // that there is a way to customize it, when using MPI in + // heterogeneous contexts). + { + // TODO Argument parsing can't handle std::string. We should + // fix that by changing the parsing, once more of the roles of + // handling, validating and implementing defaults for user + // command-line options have been seperated. 
+ hw_opt.gpuIdsAvailable = gpuIdsAvailable; + hw_opt.userGpuTaskAssignment = userGpuTaskAssignment; + + const char *env = getenv("GMX_GPU_ID"); + if (env != nullptr) + { + if (!hw_opt.gpuIdsAvailable.empty()) + { + gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time"); + } + hw_opt.gpuIdsAvailable = env; + } + + env = getenv("GMX_GPUTASKS"); + if (env != nullptr) + { + if (!hw_opt.userGpuTaskAssignment.empty()) + { + gmx_fatal(FARGS, "GMX_GPUTASKS and -gputasks can not be used at the same time"); + } + hw_opt.userGpuTaskAssignment = env; + } + + if (!hw_opt.gpuIdsAvailable.empty() && !hw_opt.userGpuTaskAssignment.empty()) + { + gmx_fatal(FARGS, "-gpu_id and -gputasks cannot be used at the same time"); + } + } + + hw_opt.thread_affinity = nenum(thread_aff_opt_choices); + + /* now check the -multi and -multidir option */ + if (opt2bSet("-multidir", nfile, fnm)) + { + if (nmultisim > 0) + { + gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive."); + } + nmultisim = opt2fns(&multidir, "-multidir", nfile, fnm); + } + + + if (replExParams.exchangeInterval != 0 && nmultisim < 2) + { + gmx_fatal(FARGS, "Need at least two replicas for replica exchange (option -multidir)"); + } + + if (replExParams.numExchanges < 0) + { + gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive"); + } + + if (nmultisim >= 1) + { +#if !GMX_THREAD_MPI + init_multisystem(cr, nmultisim, multidir, nfile, fnm); +#else + gmx_fatal(FARGS, "mdrun -multi or -multidir are not supported with the thread-MPI library. " + "Please compile GROMACS with a proper external MPI library."); +#endif + } + + if (!opt2bSet("-cpi", nfile, fnm)) + { + // If we are not starting from a checkpoint we never allow files to be appended + // to, since that has caused a ton of strange behaviour and bugs in the past. + if (opt2parg_bSet("-append", asize(pa), pa)) + { + // If the user explicitly used the -append option, explain that it is not possible. + gmx_fatal(FARGS, "GROMACS can only append to files when restarting from a checkpoint."); + } + else + { + // If the user did not say anything explicit, just disable appending. + bTryToAppendFiles = FALSE; + } + } + + ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + + continuationOptions.appendFilesOptionSet = opt2parg_bSet("-append", asize(pa), pa); + + handleRestart(cr, bTryToAppendFiles, nfile, fnm, &continuationOptions.appendFiles, &continuationOptions.startedFromCheckpoint); + + mdrunOptions.rerun = opt2bSet("-rerun", nfile, fnm); + mdrunOptions.ntompOptionIsSet = opt2parg_bSet("-ntomp", asize(pa), pa); + + /* We postpone opening the log file if we are appending, so we can + first truncate the old log file and append to the correct position + there instead. 
*/ + if (MASTER(cr) && !continuationOptions.appendFiles) + { + gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, + continuationOptions.appendFiles, &fplog); + } + else + { + fplog = nullptr; + } + + domdecOptions.rankOrder = static_cast<DdRankOrder>(nenum(ddrank_opt_choices)); + domdecOptions.dlbOption = static_cast<DlbOption>(nenum(dddlb_opt_choices)); + domdecOptions.numCells[XX] = (int)(realddxyz[XX] + 0.5); + domdecOptions.numCells[YY] = (int)(realddxyz[YY] + 0.5); + domdecOptions.numCells[ZZ] = (int)(realddxyz[ZZ] + 0.5); + + nbpu_opt = nbpu_opt_choices[0]; + pme_opt = pme_opt_choices[0]; + pme_fft_opt = pme_fft_opt_choices[0]; + + + rc = mdrunner(); + + /* Log file has to be closed in mdrunner if we are appending to it + (fplog not set here) */ + if (MASTER(cr) && !continuationOptions.appendFiles) + { + gmx_log_close(fplog); + } + + return rc; +} + +} // namespace diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.cpp b/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d7cf1e3332a6ad3ef68ee01a1ff6256b5156f248 --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.cpp @@ -0,0 +1,1454 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. 
+ */
+
+#include "gmxpre.h"
+
+#include "repl_ex.h"
+
+#include "config.h"
+
+#include <cmath>
+
+#include <random>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/main.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/random/threefry.h"
+#include "gromacs/random/uniformintdistribution.h"
+#include "gromacs/random/uniformrealdistribution.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/smalloc.h"
+
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+/* END PLUMED */
+
+/* PLUMED HREX */
+extern int plumed_hrex;
+/* END PLUMED HREX */
+
+#define PROBABILITYCUTOFF 100
+/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
+
+//! Rank in the multisimulation
+#define MSRANK(ms, nodeid)  (nodeid)
+
+enum {
+    ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR
+};
+const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"};
+/* end_single_marker merely notes the end of single-variable replica exchange. All types higher than
+   it are multiple replica exchange methods. */
+/* Eventually, we should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?;
+   Let's wait until we feel better about the pressure control methods giving exact ensembles. Right now, we assume constant pressure. */
+
+typedef struct gmx_repl_ex
+{
+    int       repl;        /* replica ID */
+    int       nrepl;       /* total number of replicas */
+    real      temp;        /* temperature */
+    int       type;        /* replica exchange type from ere enum */
+    real    **q;           /* quantity, e.g.
temperature or lambda; first index is ere, second index is replica ID */ + gmx_bool bNPT; /* use constant pressure and temperature */ + real *pres; /* replica pressures */ + int *ind; /* replica indices */ + int *allswaps; /* used for keeping track of all the replica swaps */ + int nst; /* replica exchange interval (number of steps) */ + int nex; /* number of exchanges per interval */ + int seed; /* random seed */ + int nattempt[2]; /* number of even and odd replica change attempts */ + real *prob_sum; /* sum of probabilities */ + int **nmoves; /* number of moves between replicas i and j */ + int *nexchange; /* i-th element of the array is the number of exchanges between replica i-1 and i */ + + /* these are helper arrays for replica exchange; allocated here so they + don't have to be allocated each time */ + int *destinations; + int **cyclic; + int **order; + int *tmpswap; + gmx_bool *incycle; + gmx_bool *bEx; + + /* helper arrays to hold the quantities that are exchanged */ + real *prob; + real *Epot; + real *beta; + real *Vol; + real **de; + +} t_gmx_repl_ex; + +static gmx_bool repl_quantity(const gmx_multisim_t *ms, + struct gmx_repl_ex *re, int ere, real q) +{ + real *qall; + gmx_bool bDiff; + int s; + + snew(qall, ms->nsim); + qall[re->repl] = q; + gmx_sum_sim(ms->nsim, qall, ms); + + /* PLUMED */ + //bDiff = FALSE; + //for (s = 1; s < ms->nsim; s++) + //{ + // if (qall[s] != qall[0]) + // { + bDiff = TRUE; + // } + //} + /* END PLUMED */ + + if (bDiff) + { + /* Set the replica exchange type and quantities */ + re->type = ere; + + snew(re->q[ere], re->nrepl); + for (s = 0; s < ms->nsim; s++) + { + re->q[ere][s] = qall[s]; + } + } + sfree(qall); + return bDiff; +} + +gmx_repl_ex_t +init_replica_exchange(FILE *fplog, + const gmx_multisim_t *ms, + int numAtomsInSystem, + const t_inputrec *ir, + const ReplicaExchangeParameters &replExParams) +{ + real pres; + int i, j, k; + struct gmx_repl_ex *re; + gmx_bool bTemp; + gmx_bool bLambda = FALSE; + + fprintf(fplog, "\nInitializing Replica Exchange\n"); + + if (ms == nullptr || ms->nsim == 1) + { + gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?"); + } + if (!EI_DYNAMICS(ir->eI)) + { + gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations"); + /* Note that PAR(cr) is defined by cr->nnodes > 1, which is + * distinct from MULTISIM(cr). A multi-simulation only runs + * with real MPI parallelism, but this does not imply PAR(cr) + * is true! + * + * Since we are using a dynamical integrator, the only + * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are + * synonymous. The only way for cr->nnodes > 1 to be true is + * if we are using DD. */ + } + + snew(re, 1); + + re->repl = ms->sim; + re->nrepl = ms->nsim; + snew(re->q, ereENDSINGLE); + + fprintf(fplog, "Repl There are %d replicas:\n", re->nrepl); + + /* We only check that the number of atoms in the systms match. + * This, of course, do not guarantee that the systems are the same, + * but it does guarantee that we can perform replica exchange. 
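+     * (Illustrative addition: the check_multi_* calls that follow verify
+     * that the integrator, step counts, coupling settings and free-energy
+     * setup also agree across all replicas.)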
+ */ + check_multi_int(fplog, ms, numAtomsInSystem, "the number of atoms", FALSE); + check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE); + check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE); + const int nst = replExParams.exchangeInterval; + check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst, + "first exchange step: init_step/-replex", FALSE); + check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE); + check_multi_int(fplog, ms, ir->opts.ngtc, + "the number of temperature coupling groups", FALSE); + check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE); + check_multi_int(fplog, ms, ir->efep, "free energy", FALSE); + check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE); + + re->temp = ir->opts.ref_t[0]; + for (i = 1; (i < ir->opts.ngtc); i++) + { + if (ir->opts.ref_t[i] != re->temp) + { + fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); + fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); + } + } + + re->type = -1; + bTemp = repl_quantity(ms, re, ereTEMP, re->temp); + if (ir->efep != efepNO) + { + bLambda = repl_quantity(ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state); + } + if (re->type == -1) /* nothing was assigned */ + { + gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl); + } + if (bLambda && bTemp) + { + re->type = ereTL; + } + + if (bTemp) + { + please_cite(fplog, "Sugita1999a"); + if (ir->epc != epcNO) + { + re->bNPT = TRUE; + fprintf(fplog, "Repl Using Constant Pressure REMD.\n"); + please_cite(fplog, "Okabe2001a"); + } + if (ir->etc == etcBERENDSEN) + { + gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead", + ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE)); + } + } + if (bLambda) + { + if (ir->fepvals->delta_lambda != 0) /* check this? */ + { + gmx_fatal(FARGS, "delta_lambda is not zero"); + } + } + if (re->bNPT) + { + snew(re->pres, re->nrepl); + if (ir->epct == epctSURFACETENSION) + { + pres = ir->ref_p[ZZ][ZZ]; + } + else + { + pres = 0; + j = 0; + for (i = 0; i < DIM; i++) + { + if (ir->compress[i][i] != 0) + { + pres += ir->ref_p[i][i]; + j++; + } + } + pres /= j; + } + re->pres[re->repl] = pres; + gmx_sum_sim(re->nrepl, re->pres, ms); + } + + /* Make an index for increasing replica order */ + /* only makes sense if one or the other is varying, not both! + if both are varying, we trust the order the person gave. */ + snew(re->ind, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + re->ind[i] = i; + } + + /* PLUMED */ + // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD) + // in those cases replicas can share the same temperature. + /* + if (re->type < ereENDSINGLE) + { + + for (i = 0; i < re->nrepl; i++) + { + for (j = i+1; j < re->nrepl; j++) + { + if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) + {*/ + /* Unordered replicas are supposed to work, but there + * is still an issues somewhere. + * Note that at this point still re->ind[i]=i. 
+ */ + /* + gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", + i, j, + erename[re->type], + re->q[re->type][i], re->q[re->type][j], + erename[re->type]); + + k = re->ind[i]; + re->ind[i] = re->ind[j]; + re->ind[j] = k; + } + else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]]) + { + gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]); + } + } + } + } + */ + /* END PLUMED */ + + /* keep track of all the swaps, starting with the initial placement. */ + snew(re->allswaps, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + re->allswaps[i] = re->ind[i]; + } + + switch (re->type) + { + case ereTEMP: + fprintf(fplog, "\nReplica exchange in temperature\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]); + } + fprintf(fplog, "\n"); + break; + case ereLAMBDA: + fprintf(fplog, "\nReplica exchange in lambda\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]); + } + fprintf(fplog, "\n"); + break; + case ereTL: + fprintf(fplog, "\nReplica exchange in temperature and lambda state\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]); + } + fprintf(fplog, "\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]); + } + fprintf(fplog, "\n"); + break; + default: + gmx_incons("Unknown replica exchange quantity"); + } + if (re->bNPT) + { + fprintf(fplog, "\nRepl p"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.2f", re->pres[re->ind[i]]); + } + + for (i = 0; i < re->nrepl; i++) + { + if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]])) + { + fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); + fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); + } + } + } + re->nst = nst; + if (replExParams.randomSeed == -1) + { + if (MASTERSIM(ms)) + { + re->seed = static_cast<int>(gmx::makeRandomSeed()); + } + else + { + re->seed = 0; + } + gmx_sumi_sim(1, &(re->seed), ms); + } + else + { + re->seed = replExParams.randomSeed; + } + fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst); + fprintf(fplog, "\nReplica random seed: %d\n", re->seed); + + re->nattempt[0] = 0; + re->nattempt[1] = 0; + + snew(re->prob_sum, re->nrepl); + snew(re->nexchange, re->nrepl); + snew(re->nmoves, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->nmoves[i], re->nrepl); + } + fprintf(fplog, "Replica exchange information below: ex and x = exchange, pr = probability\n"); + + /* generate space for the helper functions so we don't have to snew each time */ + + snew(re->destinations, re->nrepl); + snew(re->incycle, re->nrepl); + snew(re->tmpswap, re->nrepl); + snew(re->cyclic, re->nrepl); + snew(re->order, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->cyclic[i], re->nrepl+1); + snew(re->order[i], re->nrepl); + } + /* allocate space for the functions storing the data for the replicas */ + /* not all of these arrays needed in all cases, but they don't take + up much space, since the max size is nrepl**2 */ + snew(re->prob, re->nrepl); + snew(re->bEx, re->nrepl); + snew(re->beta, re->nrepl); + snew(re->Vol, re->nrepl); + snew(re->Epot, re->nrepl); + snew(re->de, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->de[i], re->nrepl); + } + re->nex = replExParams.numExchanges; + return re; +} + +static void 
exchange_reals(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, real *v, int n) +{ + real *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v, n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, + buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + v[i] = buf[i]; + } + sfree(buf); + } +} + + +static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n) +{ + double *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v, n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, + buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + v[i] = buf[i]; + } + sfree(buf); + } +} + +static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n) +{ + rvec *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v[0], n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, + buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + copy_rvec(buf[i], v[i]); + } + sfree(buf); + } +} + +/* PLUMED HREX */ +void exchange_state(const gmx_multisim_t *ms, int b, t_state *state) +/* END PLUMED HREX */ +{ + /* When t_state changes, this code should be updated. */ + int ngtc, nnhpres; + ngtc = state->ngtc * state->nhchainlength; + nnhpres = state->nnhpres* state->nhchainlength; + exchange_rvecs(ms, b, state->box, DIM); + exchange_rvecs(ms, b, state->box_rel, DIM); + exchange_rvecs(ms, b, state->boxv, DIM); + exchange_reals(ms, b, &(state->veta), 1); + exchange_reals(ms, b, &(state->vol0), 1); + exchange_rvecs(ms, b, state->svir_prev, DIM); + exchange_rvecs(ms, b, state->fvir_prev, DIM); + exchange_rvecs(ms, b, state->pres_prev, DIM); + exchange_doubles(ms, b, state->nosehoover_xi.data(), ngtc); + exchange_doubles(ms, b, state->nosehoover_vxi.data(), ngtc); + exchange_doubles(ms, b, state->nhpres_xi.data(), nnhpres); + exchange_doubles(ms, b, state->nhpres_vxi.data(), nnhpres); + exchange_doubles(ms, b, state->therm_integral.data(), state->ngtc); + exchange_doubles(ms, b, &state->baros_integral, 1); + exchange_rvecs(ms, b, as_rvec_array(state->x.data()), state->natoms); + exchange_rvecs(ms, b, as_rvec_array(state->v.data()), state->natoms); +} + +/* PLUMED HREX */ +void copy_state_serial(const t_state *src, t_state *dest) +/* END PLUMED HREX */ +{ + if (dest != src) + { + /* Currently the local state is always a pointer to the global + * in serial, so we should never end up here. + * TODO: Implement a (trivial) t_state copy once converted to C++. 
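+         * (Illustrative note: a real serial copy would have to duplicate the
+         * same fields that exchange_state() communicates above, i.e. the box,
+         * the coupling degrees of freedom, and the position and velocity
+         * arrays.)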
+ */ + GMX_RELEASE_ASSERT(false, "State copying is currently not implemented in replica exchange"); + } +} + +static void scale_velocities(t_state *state, real fac) +{ + int i; + + if (as_rvec_array(state->v.data())) + { + for (i = 0; i < state->natoms; i++) + { + svmul(fac, state->v[i], state->v[i]); + } + } +} + +static void print_transition_matrix(FILE *fplog, int n, int **nmoves, int *nattempt) +{ + int i, j, ntot; + float Tprint; + + ntot = nattempt[0] + nattempt[1]; + fprintf(fplog, "\n"); + fprintf(fplog, "Repl"); + for (i = 0; i < n; i++) + { + fprintf(fplog, " "); /* put the title closer to the center */ + } + fprintf(fplog, "Empirical Transition Matrix\n"); + + fprintf(fplog, "Repl"); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%8d", (i+1)); + } + fprintf(fplog, "\n"); + + for (i = 0; i < n; i++) + { + fprintf(fplog, "Repl"); + for (j = 0; j < n; j++) + { + Tprint = 0.0; + if (nmoves[i][j] > 0) + { + Tprint = nmoves[i][j]/(2.0*ntot); + } + fprintf(fplog, "%8.4f", Tprint); + } + fprintf(fplog, "%3d\n", i); + } +} + +static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx) +{ + int i; + + fprintf(fplog, "Repl %2s %2d", leg, ind[0]); + for (i = 1; i < n; i++) + { + fprintf(fplog, " %c %2d", (bEx != nullptr && bEx[i]) ? 'x' : ' ', ind[i]); + } + fprintf(fplog, "\n"); +} + +static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap) +{ + int i; + + for (i = 0; i < n; i++) + { + tmpswap[i] = allswaps[i]; + } + for (i = 0; i < n; i++) + { + allswaps[i] = tmpswap[pind[i]]; + } + + fprintf(fplog, "\nAccepted Exchanges: "); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%d ", pind[i]); + } + fprintf(fplog, "\n"); + + /* the "Order After Exchange" is the state label corresponding to the configuration that + started in state listed in order, i.e. + + 3 0 1 2 + + means that the: + configuration starting in simulation 3 is now in simulation 0, + configuration starting in simulation 0 is now in simulation 1, + configuration starting in simulation 1 is now in simulation 2, + configuration starting in simulation 2 is now in simulation 3 + */ + fprintf(fplog, "Order After Exchange: "); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%d ", allswaps[i]); + } + fprintf(fplog, "\n\n"); +} + +static void print_prob(FILE *fplog, const char *leg, int n, real *prob) +{ + int i; + char buf[8]; + + fprintf(fplog, "Repl %2s ", leg); + for (i = 1; i < n; i++) + { + if (prob[i] >= 0) + { + sprintf(buf, "%4.2f", prob[i]); + fprintf(fplog, " %3s", buf[0] == '1' ? "1.0" : buf+1); + } + else + { + fprintf(fplog, " "); + } + } + fprintf(fplog, "\n"); +} + +static void print_count(FILE *fplog, const char *leg, int n, int *count) +{ + int i; + + fprintf(fplog, "Repl %2s ", leg); + for (i = 1; i < n; i++) + { + fprintf(fplog, " %4d", count[i]); + } + fprintf(fplog, "\n"); +} + +static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp) +{ + + real ediff, dpV, delta = 0; + real *Epot = re->Epot; + real *Vol = re->Vol; + real **de = re->de; + real *beta = re->beta; + + /* Two cases; we are permuted and not. In all cases, setting ap = a and bp = b will reduce + to the non permuted case */ + + switch (re->type) + { + case ereTEMP: + /* + * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439 + */ + ediff = Epot[b] - Epot[a]; + delta = -(beta[bp] - beta[ap])*ediff; + break; + case ereLAMBDA: + /* two cases: when we are permuted, and not. 
*/ + /* non-permuted: + ediff = E_new - E_old + = [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)] + = [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)] + = de[b][a] + de[a][b] */ + + /* permuted: + ediff = E_new - E_old + = [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)] + = [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)] + = [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)] + = [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)] + = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]) */ + /* but, in the current code implementation, we flip configurations, not indices . . . + So let's examine that. + = [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)] + = [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_pb)] + = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp] + So, if we exchange b<=> bp and a<=> ap, we return to the same result. + So the simple solution is to flip the + position of perturbed and original indices in the tests. + */ + + ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]); + delta = ediff*beta[a]; /* assume all same temperature in this case */ + break; + case ereTL: + /* not permuted: */ + /* delta = reduced E_new - reduced E_old + = [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)] + = [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)] + = [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] + + [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)] + = [beta_b dH_b(x_a) + [beta_a dH_a(x_b) + + beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b)) + = beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */ + /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */ + /* permuted (big breath!) */ + /* delta = reduced E_new - reduced E_old + = [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)] + = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] + = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] + - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a) + - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b) + = [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] + + [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))] + + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b) + = [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] + + [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))] + + beta_pb (H_a(x_a) - H_b(x_b)) - beta_ap (H_a(x_a) - H_b(x_b)) + = ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b] - beta_bp de[bp][b]) + + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b)) */ + delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]); + break; + default: + gmx_incons("Unknown replica exchange quantity"); + } + if (bPrint) + { + fprintf(fplog, "Repl %d <-> %d dE_term = %10.3e (kT)\n", a, b, delta); + } +/* PLUMED HREX */ +/* this is necessary because with plumed HREX the energy contribution is + already taken into account */ + if(plumed_hrex) delta=0.0; +/* END PLUMED HREX */ + if (re->bNPT) + { + /* revist the calculation for 5.0. Might be some improvements. 
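+         * For reference, the term added below is
+         *     dpV = (beta_ap*p_ap - beta_bp*p_bp) * (V_b - V_a) / PRESFAC,
+         * i.e. the pV part of the reduced enthalpy difference; dividing the
+         * bar*nm^3 product by PRESFAC expresses it in kJ/mol, so that the
+         * beta factors make it dimensionless (kT units).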
*/ + dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC; + if (bPrint) + { + fprintf(fplog, " dpV = %10.3e d = %10.3e\n", dpV, delta + dpV); + } + delta += dpV; + } + return delta; +} + +static void +test_for_replica_exchange(FILE *fplog, + const gmx_multisim_t *ms, + struct gmx_repl_ex *re, + const gmx_enerdata_t *enerd, + real vol, + gmx_int64_t step, + real time) +{ + int m, i, j, a, b, ap, bp, i0, i1, tmp; + real delta = 0; + gmx_bool bPrint, bMultiEx; + gmx_bool *bEx = re->bEx; + real *prob = re->prob; + int *pind = re->destinations; /* permuted index */ + gmx_bool bEpot = FALSE; + gmx_bool bDLambda = FALSE; + gmx_bool bVol = FALSE; + gmx::ThreeFry2x64<64> rng(re->seed, gmx::RandomDomain::ReplicaExchange); + gmx::UniformRealDistribution<real> uniformRealDist; + gmx::UniformIntDistribution<int> uniformNreplDist(0, re->nrepl-1); + + bMultiEx = (re->nex > 1); /* multiple exchanges at each state */ + fprintf(fplog, "Replica exchange at step %" GMX_PRId64 " time %.5f\n", step, time); + + if (re->bNPT) + { + for (i = 0; i < re->nrepl; i++) + { + re->Vol[i] = 0; + } + bVol = TRUE; + re->Vol[re->repl] = vol; + } + if ((re->type == ereTEMP || re->type == ereTL)) + { + for (i = 0; i < re->nrepl; i++) + { + re->Epot[i] = 0; + } + bEpot = TRUE; + re->Epot[re->repl] = enerd->term[F_EPOT]; + /* temperatures of different states*/ + for (i = 0; i < re->nrepl; i++) + { + re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ); + } + } + else + { + for (i = 0; i < re->nrepl; i++) + { + re->beta[i] = 1.0/(re->temp*BOLTZ); /* we have a single temperature */ + } + } + if (re->type == ereLAMBDA || re->type == ereTL) + { + bDLambda = TRUE; + /* lambda differences. */ + /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian + minus the energy of the jth simulation in the jth Hamiltonian */ + for (i = 0; i < re->nrepl; i++) + { + for (j = 0; j < re->nrepl; j++) + { + re->de[i][j] = 0; + } + } + for (i = 0; i < re->nrepl; i++) + { + re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]); + } + } + + /* now actually do the communication */ + if (bVol) + { + gmx_sum_sim(re->nrepl, re->Vol, ms); + } + if (bEpot) + { + gmx_sum_sim(re->nrepl, re->Epot, ms); + } + if (bDLambda) + { + for (i = 0; i < re->nrepl; i++) + { + gmx_sum_sim(re->nrepl, re->de[i], ms); + } + } + + /* make a duplicate set of indices for shuffling */ + for (i = 0; i < re->nrepl; i++) + { + pind[i] = re->ind[i]; + } + + rng.restart( step, 0 ); + + /* PLUMED */ + int plumed_test_exchange_pattern=0; + if(plumed_test_exchange_pattern && plumed_hrex) gmx_fatal(FARGS,"hrex not compatible with ad hoc exchange patterns"); + /* END PLUMED */ + + if (bMultiEx) + { + /* multiple random switch exchange */ + int nself = 0; + + + for (i = 0; i < re->nex + nself; i++) + { + // For now this is superfluous, but just in case we ever add more + // calls in different branches it is safer to always reset the distribution. + uniformNreplDist.reset(); + + /* randomly select a pair */ + /* in theory, could reduce this by identifying only which switches had a nonneglibible + probability of occurring (log p > -100) and only operate on those switches */ + /* find out which state it is from, and what label that state currently has. Likely + more work that useful. */ + i0 = uniformNreplDist(rng); + i1 = uniformNreplDist(rng); + if (i0 == i1) + { + nself++; + continue; /* self-exchange, back up and do it again */ + } + + a = re->ind[i0]; /* what are the indices of these states? 
*/ + b = re->ind[i1]; + ap = pind[i0]; + bp = pind[i1]; + + bPrint = FALSE; /* too noisy */ + /* calculate the energy difference */ + /* if the code changes to flip the STATES, rather than the configurations, + use the commented version of the code */ + /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */ + delta = calc_delta(fplog, bPrint, re, ap, bp, a, b); + + /* we actually only use the first space in the prob and bEx array, + since there are actually many switches between pairs. */ + + if (delta <= 0) + { + /* accepted */ + prob[0] = 1; + bEx[0] = TRUE; + } + else + { + if (delta > PROBABILITYCUTOFF) + { + prob[0] = 0; + } + else + { + prob[0] = exp(-delta); + } + // roll a number to determine if accepted. For now it is superfluous to + // reset, but just in case we ever add more calls in different branches + // it is safer to always reset the distribution. + uniformRealDist.reset(); + bEx[0] = uniformRealDist(rng) < prob[0]; + } + re->prob_sum[0] += prob[0]; + + if (bEx[0]) + { + /* swap the states */ + tmp = pind[i0]; + pind[i0] = pind[i1]; + pind[i1] = tmp; + } + } + re->nattempt[0]++; /* keep track of total permutation trials here */ + print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap); + } + else + { + /* standard nearest neighbor replica exchange */ + + m = (step / re->nst) % 2; + /* PLUMED */ + if(plumedswitch){ + int partner=re->repl; + plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern); + if(plumed_test_exchange_pattern>0){ + int *list; + snew(list,re->nrepl); + plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl)); + plumed_cmd(plumedmain,"getExchangesList",list); + for(i=0; i<re->nrepl; i++) re->ind[i]=list[i]; + sfree(list); + } + + for(i=1; i<re->nrepl; i++) { + if (i % 2 != m) continue; + a = re->ind[i-1]; + b = re->ind[i]; + if(re->repl==a) partner=b; + if(re->repl==b) partner=a; + } + plumed_cmd(plumedmain,"GREX setPartner",&partner); + plumed_cmd(plumedmain,"GREX calculate",NULL); + plumed_cmd(plumedmain,"GREX shareAllDeltaBias",NULL); + } + /* END PLUMED */ + for (i = 1; i < re->nrepl; i++) + { + a = re->ind[i-1]; + b = re->ind[i]; + + bPrint = (re->repl == a || re->repl == b); + if (i % 2 == m) + { + delta = calc_delta(fplog, bPrint, re, a, b, a, b); + /* PLUMED */ + if(plumedswitch){ + real adb,bdb,dplumed; + char buf[300]; + sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb); + sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb); + dplumed=adb*re->beta[a]+bdb*re->beta[b]; + delta+=dplumed; + if (bPrint) + fprintf(fplog,"dplumed = %10.3e dE_Term = %10.3e (kT)\n",dplumed,delta); + } + /* END PLUMED */ + if (delta <= 0) + { + /* accepted */ + prob[i] = 1; + bEx[i] = TRUE; + } + else + { + if (delta > PROBABILITYCUTOFF) + { + prob[i] = 0; + } + else + { + prob[i] = exp(-delta); + } + // roll a number to determine if accepted. For now it is superfluous to + // reset, but just in case we ever add more calls in different branches + // it is safer to always reset the distribution. 
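+                    // Metropolis acceptance (restating the branches above):
+                    // the swap is accepted with probability min(1, exp(-delta));
+                    // delta <= 0 was already accepted unconditionally, and
+                    // delta > PROBABILITYCUTOFF is treated as probability 0.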
+ uniformRealDist.reset(); + bEx[i] = uniformRealDist(rng) < prob[i]; + } + re->prob_sum[i] += prob[i]; + + if (bEx[i]) + { + /* PLUMED */ + if(!plumed_test_exchange_pattern) { + /* standard neighbour swapping */ + /* swap these two */ + tmp = pind[i-1]; + pind[i-1] = pind[i]; + pind[i] = tmp; + re->nexchange[i]++; /* statistics for back compatibility */ + } else { + /* alternative swapping patterns */ + tmp = pind[a]; + pind[a] = pind[b]; + pind[b] = tmp; + re->nexchange[i]++; /* statistics for back compatibility */ + } + /* END PLUMED */ + } + } + else + { + prob[i] = -1; + bEx[i] = FALSE; + } + } + /* print some statistics */ + print_ind(fplog, "ex", re->nrepl, re->ind, bEx); + print_prob(fplog, "pr", re->nrepl, prob); + fprintf(fplog, "\n"); + re->nattempt[m]++; + } + + /* PLUMED */ + if(plumed_test_exchange_pattern>0) { + for (i = 0; i < re->nrepl; i++) + { + re->ind[i] = i; + } + } + /* END PLUMED */ + + /* record which moves were made and accepted */ + for (i = 0; i < re->nrepl; i++) + { + re->nmoves[re->ind[i]][pind[i]] += 1; + re->nmoves[pind[i]][re->ind[i]] += 1; + } + fflush(fplog); /* make sure we can see what the last exchange was */ +} + +static void +cyclic_decomposition(const int *destinations, + int **cyclic, + gmx_bool *incycle, + const int nrepl, + int *nswap) +{ + + int i, j, c, p; + int maxlen = 1; + for (i = 0; i < nrepl; i++) + { + incycle[i] = FALSE; + } + for (i = 0; i < nrepl; i++) /* one cycle for each replica */ + { + if (incycle[i]) + { + cyclic[i][0] = -1; + continue; + } + cyclic[i][0] = i; + incycle[i] = TRUE; + c = 1; + p = i; + for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */ + { + p = destinations[p]; /* start permuting */ + if (p == i) + { + cyclic[i][c] = -1; + if (c > maxlen) + { + maxlen = c; + } + break; /* we've reached the original element, the cycle is complete, and we marked the end. */ + } + else + { + cyclic[i][c] = p; /* each permutation gives a new member of the cycle */ + incycle[p] = TRUE; + c++; + } + } + } + *nswap = maxlen - 1; + + if (debug) + { + for (i = 0; i < nrepl; i++) + { + fprintf(debug, "Cycle %d:", i); + for (j = 0; j < nrepl; j++) + { + if (cyclic[i][j] < 0) + { + break; + } + fprintf(debug, "%2d", cyclic[i][j]); + } + fprintf(debug, "\n"); + } + fflush(debug); + } +} + +static void +compute_exchange_order(int **cyclic, + int **order, + const int nrepl, + const int maxswap) +{ + int i, j; + + for (j = 0; j < maxswap; j++) + { + for (i = 0; i < nrepl; i++) + { + if (cyclic[i][j+1] >= 0) + { + order[cyclic[i][j+1]][j] = cyclic[i][j]; + order[cyclic[i][j]][j] = cyclic[i][j+1]; + } + } + for (i = 0; i < nrepl; i++) + { + if (order[i][j] < 0) + { + order[i][j] = i; /* if it's not exchanging, it should stay this round*/ + } + } + } + + if (debug) + { + fprintf(debug, "Replica Exchange Order\n"); + for (i = 0; i < nrepl; i++) + { + fprintf(debug, "Replica %d:", i); + for (j = 0; j < maxswap; j++) + { + if (order[i][j] < 0) + { + break; + } + fprintf(debug, "%2d", order[i][j]); + } + fprintf(debug, "\n"); + } + fflush(debug); + } +} + +static void +prepare_to_do_exchange(struct gmx_repl_ex *re, + const int replica_id, + int *maxswap, + gmx_bool *bThisReplicaExchanged) +{ + int i, j; + /* Hold the cyclic decomposition of the (multiple) replica + * exchange. 
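+     * Worked example (illustrative): destinations = {1, 2, 0, 3}
+     * decomposes into the 3-cycle (0 1 2) plus the fixed point 3, so
+     * maxswap = 2 rounds of pairwise swaps realize the permutation.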
*/ + gmx_bool bAnyReplicaExchanged = FALSE; + *bThisReplicaExchanged = FALSE; + + for (i = 0; i < re->nrepl; i++) + { + if (re->destinations[i] != re->ind[i]) + { + /* only mark as exchanged if the index has been shuffled */ + bAnyReplicaExchanged = TRUE; + break; + } + } + if (bAnyReplicaExchanged) + { + /* reinitialize the placeholder arrays */ + for (i = 0; i < re->nrepl; i++) + { + for (j = 0; j < re->nrepl; j++) + { + re->cyclic[i][j] = -1; + re->order[i][j] = -1; + } + } + + /* Identify the cyclic decomposition of the permutation (very + * fast if neighbor replica exchange). */ + cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap); + + /* Now translate the decomposition into a replica exchange + * order at each step. */ + compute_exchange_order(re->cyclic, re->order, re->nrepl, *maxswap); + + /* Did this replica do any exchange at any point? */ + for (j = 0; j < *maxswap; j++) + { + if (replica_id != re->order[replica_id][j]) + { + *bThisReplicaExchanged = TRUE; + break; + } + } + } +} + +gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re, + t_state *state, const gmx_enerdata_t *enerd, + t_state *state_local, gmx_int64_t step, real time) +{ + int j; + int replica_id = 0; + int exchange_partner; + int maxswap = 0; + /* Number of rounds of exchanges needed to deal with any multiple + * exchanges. */ + /* Where each replica ends up after the exchange attempt(s). */ + /* The order in which multiple exchanges will occur. */ + gmx_bool bThisReplicaExchanged = FALSE; + + /* PLUMED */ + if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",NULL); + /* END PLUMED */ + + if (MASTER(cr)) + { + replica_id = re->repl; + test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time); + prepare_to_do_exchange(re, replica_id, &maxswap, &bThisReplicaExchanged); + } + /* Do intra-simulation broadcast so all processors belonging to + * each simulation know whether they need to participate in + * collecting the state. Otherwise, they might as well get on with + * the next thing to do. */ + if (DOMAINDECOMP(cr)) + { +#if GMX_MPI + MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr), + cr->mpi_comm_mygroup); +#endif + } + + if (bThisReplicaExchanged) + { + /* Exchange the states */ + /* Collect the global state on the master node */ + if (DOMAINDECOMP(cr)) + { + dd_collect_state(cr->dd, state_local, state); + } + else + { + copy_state_serial(state_local, state); + } + + if (MASTER(cr)) + { + /* There will be only one swap cycle with standard replica + * exchange, but there may be multiple swap cycles if we + * allow multiple swaps. */ + + for (j = 0; j < maxswap; j++) + { + exchange_partner = re->order[replica_id][j]; + + if (exchange_partner != replica_id) + { + /* Exchange the global states between the master nodes */ + if (debug) + { + fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner); + } + exchange_state(cr->ms, exchange_partner, state); + } + } + /* For temperature-type replica exchange, we need to scale + * the velocities. 
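+             * The factor used below is sqrt(T_here/T_partner): the received
+             * configuration carries velocities sampled at the partner
+             * replica's temperature, and this rescaling maps them onto a
+             * Maxwell-Boltzmann distribution at the local temperature.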
*/ + if (re->type == ereTEMP || re->type == ereTL) + { + scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]])); + } + + } + + /* With domain decomposition the global state is distributed later */ + if (!DOMAINDECOMP(cr)) + { + /* Copy the global state to the local state data structure */ + copy_state_serial(state, state_local); + } + } + + return bThisReplicaExchanged; +} + +void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re) +{ + int i; + + fprintf(fplog, "\nReplica exchange statistics\n"); + + if (re->nex == 0) + { + fprintf(fplog, "Repl %d attempts, %d odd, %d even\n", + re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]); + + fprintf(fplog, "Repl average probabilities:\n"); + for (i = 1; i < re->nrepl; i++) + { + if (re->nattempt[i%2] == 0) + { + re->prob[i] = 0; + } + else + { + re->prob[i] = re->prob_sum[i]/re->nattempt[i%2]; + } + } + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_prob(fplog, "", re->nrepl, re->prob); + + fprintf(fplog, "Repl number of exchanges:\n"); + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_count(fplog, "", re->nrepl, re->nexchange); + + fprintf(fplog, "Repl average number of exchanges:\n"); + for (i = 1; i < re->nrepl; i++) + { + if (re->nattempt[i%2] == 0) + { + re->prob[i] = 0; + } + else + { + re->prob[i] = ((real)re->nexchange[i])/re->nattempt[i%2]; + } + } + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_prob(fplog, "", re->nrepl, re->prob); + + fprintf(fplog, "\n"); + } + /* print the transition matrix */ + print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt); +} + +/* PLUMED HREX */ +int replica_exchange_get_repl(const gmx_repl_ex_t re){ + return re->repl; +}; + +int replica_exchange_get_nrepl(const gmx_repl_ex_t re){ + return re->nrepl; +}; +/* END PLUMED HREX */ + + diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.cpp.preplumed b/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.cpp.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..c216526229bc66d88429507cd65f439eefc39ab6 --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.cpp.preplumed @@ -0,0 +1,1348 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#include "gmxpre.h" + +#include "repl_ex.h" + +#include "config.h" + +#include <cmath> + +#include <random> + +#include "gromacs/domdec/domdec.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/math/units.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/main.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/random/threefry.h" +#include "gromacs/random/uniformintdistribution.h" +#include "gromacs/random/uniformrealdistribution.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/pleasecite.h" +#include "gromacs/utility/smalloc.h" + + +#define PROBABILITYCUTOFF 100 +/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ + +//! Rank in the multisimulaiton +#define MSRANK(ms, nodeid) (nodeid) + +enum { + ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR +}; +const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"}; +/* end_single_marker merely notes the end of single variable replica exchange. All types higher than + it are multiple replica exchange methods */ +/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?; + Let's wait until we feel better about the pressure control methods giving exact ensembles. Right now, we assume constant pressure */ + +typedef struct gmx_repl_ex +{ + int repl; /* replica ID */ + int nrepl; /* total number of replica */ + real temp; /* temperature */ + int type; /* replica exchange type from ere enum */ + real **q; /* quantity, e.g. 
temperature or lambda; first index is ere, second index is replica ID */ + gmx_bool bNPT; /* use constant pressure and temperature */ + real *pres; /* replica pressures */ + int *ind; /* replica indices */ + int *allswaps; /* used for keeping track of all the replica swaps */ + int nst; /* replica exchange interval (number of steps) */ + int nex; /* number of exchanges per interval */ + int seed; /* random seed */ + int nattempt[2]; /* number of even and odd replica change attempts */ + real *prob_sum; /* sum of probabilities */ + int **nmoves; /* number of moves between replicas i and j */ + int *nexchange; /* i-th element of the array is the number of exchanges between replica i-1 and i */ + + /* these are helper arrays for replica exchange; allocated here so they + don't have to be allocated each time */ + int *destinations; + int **cyclic; + int **order; + int *tmpswap; + gmx_bool *incycle; + gmx_bool *bEx; + + /* helper arrays to hold the quantities that are exchanged */ + real *prob; + real *Epot; + real *beta; + real *Vol; + real **de; + +} t_gmx_repl_ex; + +static gmx_bool repl_quantity(const gmx_multisim_t *ms, + struct gmx_repl_ex *re, int ere, real q) +{ + real *qall; + gmx_bool bDiff; + int s; + + snew(qall, ms->nsim); + qall[re->repl] = q; + gmx_sum_sim(ms->nsim, qall, ms); + + bDiff = FALSE; + for (s = 1; s < ms->nsim; s++) + { + if (qall[s] != qall[0]) + { + bDiff = TRUE; + } + } + + if (bDiff) + { + /* Set the replica exchange type and quantities */ + re->type = ere; + + snew(re->q[ere], re->nrepl); + for (s = 0; s < ms->nsim; s++) + { + re->q[ere][s] = qall[s]; + } + } + sfree(qall); + return bDiff; +} + +gmx_repl_ex_t +init_replica_exchange(FILE *fplog, + const gmx_multisim_t *ms, + int numAtomsInSystem, + const t_inputrec *ir, + const ReplicaExchangeParameters &replExParams) +{ + real pres; + int i, j, k; + struct gmx_repl_ex *re; + gmx_bool bTemp; + gmx_bool bLambda = FALSE; + + fprintf(fplog, "\nInitializing Replica Exchange\n"); + + if (ms == nullptr || ms->nsim == 1) + { + gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?"); + } + if (!EI_DYNAMICS(ir->eI)) + { + gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations"); + /* Note that PAR(cr) is defined by cr->nnodes > 1, which is + * distinct from MULTISIM(cr). A multi-simulation only runs + * with real MPI parallelism, but this does not imply PAR(cr) + * is true! + * + * Since we are using a dynamical integrator, the only + * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are + * synonymous. The only way for cr->nnodes > 1 to be true is + * if we are using DD. */ + } + + snew(re, 1); + + re->repl = ms->sim; + re->nrepl = ms->nsim; + snew(re->q, ereENDSINGLE); + + fprintf(fplog, "Repl There are %d replicas:\n", re->nrepl); + + /* We only check that the number of atoms in the systms match. + * This, of course, do not guarantee that the systems are the same, + * but it does guarantee that we can perform replica exchange. 
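The block that follows relies on check_multi_int()/check_multi_int64() to verify that every replica agrees on settings such as the atom count and the integrator. A minimal standalone sketch of that idea, assuming plain MPI (check_same_across_replicas is a hypothetical name, not the GROMACS helper):

#include <mpi.h>
#include <stdio.h>

/* Hypothetical check: abort unless all replicas share the same value. */
static void check_same_across_replicas(MPI_Comm masters, int value,
                                       const char *what)
{
    int minValue, maxValue;
    MPI_Allreduce(&value, &minValue, 1, MPI_INT, MPI_MIN, masters);
    MPI_Allreduce(&value, &maxValue, 1, MPI_INT, MPI_MAX, masters);
    if (minValue != maxValue)
    {
        fprintf(stderr, "Replicas disagree on %s (%d vs %d)\n",
                what, minValue, maxValue);
        MPI_Abort(masters, 1);
    }
}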
+ */ + check_multi_int(fplog, ms, numAtomsInSystem, "the number of atoms", FALSE); + check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE); + check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE); + const int nst = replExParams.exchangeInterval; + check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst, + "first exchange step: init_step/-replex", FALSE); + check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE); + check_multi_int(fplog, ms, ir->opts.ngtc, + "the number of temperature coupling groups", FALSE); + check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE); + check_multi_int(fplog, ms, ir->efep, "free energy", FALSE); + check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE); + + re->temp = ir->opts.ref_t[0]; + for (i = 1; (i < ir->opts.ngtc); i++) + { + if (ir->opts.ref_t[i] != re->temp) + { + fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); + fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); + } + } + + re->type = -1; + bTemp = repl_quantity(ms, re, ereTEMP, re->temp); + if (ir->efep != efepNO) + { + bLambda = repl_quantity(ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state); + } + if (re->type == -1) /* nothing was assigned */ + { + gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl); + } + if (bLambda && bTemp) + { + re->type = ereTL; + } + + if (bTemp) + { + please_cite(fplog, "Sugita1999a"); + if (ir->epc != epcNO) + { + re->bNPT = TRUE; + fprintf(fplog, "Repl Using Constant Pressure REMD.\n"); + please_cite(fplog, "Okabe2001a"); + } + if (ir->etc == etcBERENDSEN) + { + gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead", + ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE)); + } + } + if (bLambda) + { + if (ir->fepvals->delta_lambda != 0) /* check this? */ + { + gmx_fatal(FARGS, "delta_lambda is not zero"); + } + } + if (re->bNPT) + { + snew(re->pres, re->nrepl); + if (ir->epct == epctSURFACETENSION) + { + pres = ir->ref_p[ZZ][ZZ]; + } + else + { + pres = 0; + j = 0; + for (i = 0; i < DIM; i++) + { + if (ir->compress[i][i] != 0) + { + pres += ir->ref_p[i][i]; + j++; + } + } + pres /= j; + } + re->pres[re->repl] = pres; + gmx_sum_sim(re->nrepl, re->pres, ms); + } + + /* Make an index for increasing replica order */ + /* only makes sense if one or the other is varying, not both! + if both are varying, we trust the order the person gave. */ + snew(re->ind, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + re->ind[i] = i; + } + + if (re->type < ereENDSINGLE) + { + + for (i = 0; i < re->nrepl; i++) + { + for (j = i+1; j < re->nrepl; j++) + { + if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) + { + /* Unordered replicas are supposed to work, but there + * is still an issues somewhere. + * Note that at this point still re->ind[i]=i. 
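The gmx_fatal() that follows fires whenever the control quantity (temperature or lambda) is not strictly increasing with replica index, so the swap statements after it are never reached. The invariant being enforced, as a sketch (hypothetical helper reusing GROMACS's real and gmx_bool types):

/* Hypothetical helper: TRUE when q[0..nrepl-1] is strictly increasing,
 * the ordering required of the replica ladder below. */
static gmx_bool ladderIsStrictlyIncreasing(const real *q, int nrepl)
{
    for (int i = 1; i < nrepl; i++)
    {
        if (q[i] <= q[i-1])   /* unordered or duplicated value */
        {
            return FALSE;
        }
    }
    return TRUE;
}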
+ */ + gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", + i, j, + erename[re->type], + re->q[re->type][i], re->q[re->type][j], + erename[re->type]); + + k = re->ind[i]; + re->ind[i] = re->ind[j]; + re->ind[j] = k; + } + else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]]) + { + gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]); + } + } + } + } + + /* keep track of all the swaps, starting with the initial placement. */ + snew(re->allswaps, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + re->allswaps[i] = re->ind[i]; + } + + switch (re->type) + { + case ereTEMP: + fprintf(fplog, "\nReplica exchange in temperature\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]); + } + fprintf(fplog, "\n"); + break; + case ereLAMBDA: + fprintf(fplog, "\nReplica exchange in lambda\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]); + } + fprintf(fplog, "\n"); + break; + case ereTL: + fprintf(fplog, "\nReplica exchange in temperature and lambda state\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]); + } + fprintf(fplog, "\n"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]); + } + fprintf(fplog, "\n"); + break; + default: + gmx_incons("Unknown replica exchange quantity"); + } + if (re->bNPT) + { + fprintf(fplog, "\nRepl p"); + for (i = 0; i < re->nrepl; i++) + { + fprintf(fplog, " %5.2f", re->pres[re->ind[i]]); + } + + for (i = 0; i < re->nrepl; i++) + { + if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]])) + { + fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); + fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); + } + } + } + re->nst = nst; + if (replExParams.randomSeed == -1) + { + if (MASTERSIM(ms)) + { + re->seed = static_cast<int>(gmx::makeRandomSeed()); + } + else + { + re->seed = 0; + } + gmx_sumi_sim(1, &(re->seed), ms); + } + else + { + re->seed = replExParams.randomSeed; + } + fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst); + fprintf(fplog, "\nReplica random seed: %d\n", re->seed); + + re->nattempt[0] = 0; + re->nattempt[1] = 0; + + snew(re->prob_sum, re->nrepl); + snew(re->nexchange, re->nrepl); + snew(re->nmoves, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->nmoves[i], re->nrepl); + } + fprintf(fplog, "Replica exchange information below: ex and x = exchange, pr = probability\n"); + + /* generate space for the helper functions so we don't have to snew each time */ + + snew(re->destinations, re->nrepl); + snew(re->incycle, re->nrepl); + snew(re->tmpswap, re->nrepl); + snew(re->cyclic, re->nrepl); + snew(re->order, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->cyclic[i], re->nrepl+1); + snew(re->order[i], re->nrepl); + } + /* allocate space for the functions storing the data for the replicas */ + /* not all of these arrays needed in all cases, but they don't take + up much space, since the max size is nrepl**2 */ + snew(re->prob, re->nrepl); + snew(re->bEx, re->nrepl); + snew(re->beta, re->nrepl); + snew(re->Vol, re->nrepl); + snew(re->Epot, re->nrepl); + snew(re->de, re->nrepl); + for (i = 0; i < re->nrepl; i++) + { + snew(re->de[i], re->nrepl); + } + re->nex = replExParams.numExchanges; + return re; +} + +static void exchange_reals(const gmx_multisim_t 
gmx_unused *ms, int gmx_unused b, real *v, int n) +{ + real *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v, n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, + buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + v[i] = buf[i]; + } + sfree(buf); + } +} + + +static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n) +{ + double *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v, n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, + buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + v[i] = buf[i]; + } + sfree(buf); + } +} + +static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n) +{ + rvec *buf; + int i; + + if (v) + { + snew(buf, n); +#if GMX_MPI + /* + MPI_Sendrecv(v[0], n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, + buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, + ms->mpi_comm_masters,MPI_STATUS_IGNORE); + */ + { + MPI_Request mpi_req; + + MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, &mpi_req); + MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, + ms->mpi_comm_masters, MPI_STATUS_IGNORE); + MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); + } +#endif + for (i = 0; i < n; i++) + { + copy_rvec(buf[i], v[i]); + } + sfree(buf); + } +} + +static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state) +{ + /* When t_state changes, this code should be updated. */ + int ngtc, nnhpres; + ngtc = state->ngtc * state->nhchainlength; + nnhpres = state->nnhpres* state->nhchainlength; + exchange_rvecs(ms, b, state->box, DIM); + exchange_rvecs(ms, b, state->box_rel, DIM); + exchange_rvecs(ms, b, state->boxv, DIM); + exchange_reals(ms, b, &(state->veta), 1); + exchange_reals(ms, b, &(state->vol0), 1); + exchange_rvecs(ms, b, state->svir_prev, DIM); + exchange_rvecs(ms, b, state->fvir_prev, DIM); + exchange_rvecs(ms, b, state->pres_prev, DIM); + exchange_doubles(ms, b, state->nosehoover_xi.data(), ngtc); + exchange_doubles(ms, b, state->nosehoover_vxi.data(), ngtc); + exchange_doubles(ms, b, state->nhpres_xi.data(), nnhpres); + exchange_doubles(ms, b, state->nhpres_vxi.data(), nnhpres); + exchange_doubles(ms, b, state->therm_integral.data(), state->ngtc); + exchange_doubles(ms, b, &state->baros_integral, 1); + exchange_rvecs(ms, b, as_rvec_array(state->x.data()), state->natoms); + exchange_rvecs(ms, b, as_rvec_array(state->v.data()), state->natoms); +} + +static void copy_state_serial(const t_state *src, t_state *dest) +{ + if (dest != src) + { + /* Currently the local state is always a pointer to the global + * in serial, so we should never end up here. + * TODO: Implement a (trivial) t_state copy once converted to C++. 
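The exchange_reals()/exchange_doubles()/exchange_rvecs() helpers above all use the same idiom: a non-blocking send of the local data, a blocking receive of the partner's copy into a scratch buffer, then an overwrite (the commented-out MPI_Sendrecv would be equivalent). A self-contained sketch of the pattern, with hypothetical names:

#include <mpi.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical helper: swap the contents of v[0..n-1] with the same-sized
 * array held by rank 'partner' in 'comm'. */
static void swap_with_partner(double *v, int n, int partner, MPI_Comm comm)
{
    double     *buf = (double *) malloc(n * sizeof(*buf));
    MPI_Request req;

    MPI_Isend(v, n, MPI_DOUBLE, partner, 0, comm, &req);
    MPI_Recv(buf, n, MPI_DOUBLE, partner, 0, comm, MPI_STATUS_IGNORE);
    MPI_Wait(&req, MPI_STATUS_IGNORE);
    memcpy(v, buf, n * sizeof(*buf));
    free(buf);
}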
+ */ + GMX_RELEASE_ASSERT(false, "State copying is currently not implemented in replica exchange"); + } +} + +static void scale_velocities(t_state *state, real fac) +{ + int i; + + if (as_rvec_array(state->v.data())) + { + for (i = 0; i < state->natoms; i++) + { + svmul(fac, state->v[i], state->v[i]); + } + } +} + +static void print_transition_matrix(FILE *fplog, int n, int **nmoves, int *nattempt) +{ + int i, j, ntot; + float Tprint; + + ntot = nattempt[0] + nattempt[1]; + fprintf(fplog, "\n"); + fprintf(fplog, "Repl"); + for (i = 0; i < n; i++) + { + fprintf(fplog, " "); /* put the title closer to the center */ + } + fprintf(fplog, "Empirical Transition Matrix\n"); + + fprintf(fplog, "Repl"); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%8d", (i+1)); + } + fprintf(fplog, "\n"); + + for (i = 0; i < n; i++) + { + fprintf(fplog, "Repl"); + for (j = 0; j < n; j++) + { + Tprint = 0.0; + if (nmoves[i][j] > 0) + { + Tprint = nmoves[i][j]/(2.0*ntot); + } + fprintf(fplog, "%8.4f", Tprint); + } + fprintf(fplog, "%3d\n", i); + } +} + +static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx) +{ + int i; + + fprintf(fplog, "Repl %2s %2d", leg, ind[0]); + for (i = 1; i < n; i++) + { + fprintf(fplog, " %c %2d", (bEx != nullptr && bEx[i]) ? 'x' : ' ', ind[i]); + } + fprintf(fplog, "\n"); +} + +static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap) +{ + int i; + + for (i = 0; i < n; i++) + { + tmpswap[i] = allswaps[i]; + } + for (i = 0; i < n; i++) + { + allswaps[i] = tmpswap[pind[i]]; + } + + fprintf(fplog, "\nAccepted Exchanges: "); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%d ", pind[i]); + } + fprintf(fplog, "\n"); + + /* the "Order After Exchange" is the state label corresponding to the configuration that + started in state listed in order, i.e. + + 3 0 1 2 + + means that the: + configuration starting in simulation 3 is now in simulation 0, + configuration starting in simulation 0 is now in simulation 1, + configuration starting in simulation 1 is now in simulation 2, + configuration starting in simulation 2 is now in simulation 3 + */ + fprintf(fplog, "Order After Exchange: "); + for (i = 0; i < n; i++) + { + fprintf(fplog, "%d ", allswaps[i]); + } + fprintf(fplog, "\n\n"); +} + +static void print_prob(FILE *fplog, const char *leg, int n, real *prob) +{ + int i; + char buf[8]; + + fprintf(fplog, "Repl %2s ", leg); + for (i = 1; i < n; i++) + { + if (prob[i] >= 0) + { + sprintf(buf, "%4.2f", prob[i]); + fprintf(fplog, " %3s", buf[0] == '1' ? "1.0" : buf+1); + } + else + { + fprintf(fplog, " "); + } + } + fprintf(fplog, "\n"); +} + +static void print_count(FILE *fplog, const char *leg, int n, int *count) +{ + int i; + + fprintf(fplog, "Repl %2s ", leg); + for (i = 1; i < n; i++) + { + fprintf(fplog, " %4d", count[i]); + } + fprintf(fplog, "\n"); +} + +static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp) +{ + + real ediff, dpV, delta = 0; + real *Epot = re->Epot; + real *Vol = re->Vol; + real **de = re->de; + real *beta = re->beta; + + /* Two cases; we are permuted and not. In all cases, setting ap = a and bp = b will reduce + to the non permuted case */ + + switch (re->type) + { + case ereTEMP: + /* + * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439 + */ + ediff = Epot[b] - Epot[a]; + delta = -(beta[bp] - beta[ap])*ediff; + break; + case ereLAMBDA: + /* two cases: when we are permuted, and not. 
*/ + /* non-permuted: + ediff = E_new - E_old + = [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)] + = [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)] + = de[b][a] + de[a][b] */ + + /* permuted: + ediff = E_new - E_old + = [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)] + = [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)] + = [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)] + = [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)] + = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]) */ + /* but, in the current code implementation, we flip configurations, not indices . . . + So let's examine that. + = [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)] + = [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_pb)] + = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp] + So, if we exchange b<=> bp and a<=> ap, we return to the same result. + So the simple solution is to flip the + position of perturbed and original indices in the tests. + */ + + ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]); + delta = ediff*beta[a]; /* assume all same temperature in this case */ + break; + case ereTL: + /* not permuted: */ + /* delta = reduced E_new - reduced E_old + = [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)] + = [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)] + = [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] + + [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)] + = [beta_b dH_b(x_a) + [beta_a dH_a(x_b) + + beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b)) + = beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */ + /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */ + /* permuted (big breath!) */ + /* delta = reduced E_new - reduced E_old + = [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)] + = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] + = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] + - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a) + - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b) + = [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] + + [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))] + + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b) + = [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] + + [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))] + + beta_pb (H_a(x_a) - H_b(x_b)) - beta_ap (H_a(x_a) - H_b(x_b)) + = ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b] - beta_bp de[bp][b]) + + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b)) */ + delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]); + break; + default: + gmx_incons("Unknown replica exchange quantity"); + } + if (bPrint) + { + fprintf(fplog, "Repl %d <-> %d dE_term = %10.3e (kT)\n", a, b, delta); + } + if (re->bNPT) + { + /* revist the calculation for 5.0. Might be some improvements. 
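The dpV line below adds the constant-pressure correction to the acceptance exponent, (beta_a p_a - beta_b p_b)(V_b - V_a), divided by PRESFAC to express the pressure-volume work in kT. The same term with explicit arguments (hypothetical helper; presfac stands in for GROMACS's PRESFAC constant):

/* Hypothetical helper: NPT correction term of the exchange criterion. */
static double remdPressureTerm(double betaA, double presA,
                               double betaB, double presB,
                               double volA,  double volB,
                               double presfac)
{
    return (betaA * presA - betaB * presB) * (volB - volA) / presfac;
}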
*/ + dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC; + if (bPrint) + { + fprintf(fplog, " dpV = %10.3e d = %10.3e\n", dpV, delta + dpV); + } + delta += dpV; + } + return delta; +} + +static void +test_for_replica_exchange(FILE *fplog, + const gmx_multisim_t *ms, + struct gmx_repl_ex *re, + const gmx_enerdata_t *enerd, + real vol, + gmx_int64_t step, + real time) +{ + int m, i, j, a, b, ap, bp, i0, i1, tmp; + real delta = 0; + gmx_bool bPrint, bMultiEx; + gmx_bool *bEx = re->bEx; + real *prob = re->prob; + int *pind = re->destinations; /* permuted index */ + gmx_bool bEpot = FALSE; + gmx_bool bDLambda = FALSE; + gmx_bool bVol = FALSE; + gmx::ThreeFry2x64<64> rng(re->seed, gmx::RandomDomain::ReplicaExchange); + gmx::UniformRealDistribution<real> uniformRealDist; + gmx::UniformIntDistribution<int> uniformNreplDist(0, re->nrepl-1); + + bMultiEx = (re->nex > 1); /* multiple exchanges at each state */ + fprintf(fplog, "Replica exchange at step %" GMX_PRId64 " time %.5f\n", step, time); + + if (re->bNPT) + { + for (i = 0; i < re->nrepl; i++) + { + re->Vol[i] = 0; + } + bVol = TRUE; + re->Vol[re->repl] = vol; + } + if ((re->type == ereTEMP || re->type == ereTL)) + { + for (i = 0; i < re->nrepl; i++) + { + re->Epot[i] = 0; + } + bEpot = TRUE; + re->Epot[re->repl] = enerd->term[F_EPOT]; + /* temperatures of different states*/ + for (i = 0; i < re->nrepl; i++) + { + re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ); + } + } + else + { + for (i = 0; i < re->nrepl; i++) + { + re->beta[i] = 1.0/(re->temp*BOLTZ); /* we have a single temperature */ + } + } + if (re->type == ereLAMBDA || re->type == ereTL) + { + bDLambda = TRUE; + /* lambda differences. */ + /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian + minus the energy of the jth simulation in the jth Hamiltonian */ + for (i = 0; i < re->nrepl; i++) + { + for (j = 0; j < re->nrepl; j++) + { + re->de[i][j] = 0; + } + } + for (i = 0; i < re->nrepl; i++) + { + re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]); + } + } + + /* now actually do the communication */ + if (bVol) + { + gmx_sum_sim(re->nrepl, re->Vol, ms); + } + if (bEpot) + { + gmx_sum_sim(re->nrepl, re->Epot, ms); + } + if (bDLambda) + { + for (i = 0; i < re->nrepl; i++) + { + gmx_sum_sim(re->nrepl, re->de[i], ms); + } + } + + /* make a duplicate set of indices for shuffling */ + for (i = 0; i < re->nrepl; i++) + { + pind[i] = re->ind[i]; + } + + rng.restart( step, 0 ); + + if (bMultiEx) + { + /* multiple random switch exchange */ + int nself = 0; + + + for (i = 0; i < re->nex + nself; i++) + { + // For now this is superfluous, but just in case we ever add more + // calls in different branches it is safer to always reset the distribution. + uniformNreplDist.reset(); + + /* randomly select a pair */ + /* in theory, could reduce this by identifying only which switches had a nonneglibible + probability of occurring (log p > -100) and only operate on those switches */ + /* find out which state it is from, and what label that state currently has. Likely + more work that useful. */ + i0 = uniformNreplDist(rng); + i1 = uniformNreplDist(rng); + if (i0 == i1) + { + nself++; + continue; /* self-exchange, back up and do it again */ + } + + a = re->ind[i0]; /* what are the indices of these states? 
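Both exchange branches below apply the same Metropolis test: accept unconditionally when delta <= 0, otherwise accept with probability exp(-delta), short-circuiting to rejection beyond PROBABILITYCUTOFF so exp() is never evaluated for hopeless moves. A standalone sketch (the production code draws from GROMACS's counter-based ThreeFry generator; rand() is used here only to keep the example self-contained):

#include <math.h>
#include <stdlib.h>

/* Hypothetical helper: Metropolis accept/reject for a reduced energy
 * difference delta, in units of kT. */
static int metropolisAccept(double delta)
{
    if (delta <= 0)
    {
        return 1;                /* downhill moves are always accepted */
    }
    if (delta > 100)             /* cf. PROBABILITYCUTOFF */
    {
        return 0;                /* exp(-delta) is numerically zero */
    }
    return (rand() / (RAND_MAX + 1.0)) < exp(-delta);
}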
*/ + b = re->ind[i1]; + ap = pind[i0]; + bp = pind[i1]; + + bPrint = FALSE; /* too noisy */ + /* calculate the energy difference */ + /* if the code changes to flip the STATES, rather than the configurations, + use the commented version of the code */ + /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */ + delta = calc_delta(fplog, bPrint, re, ap, bp, a, b); + + /* we actually only use the first space in the prob and bEx array, + since there are actually many switches between pairs. */ + + if (delta <= 0) + { + /* accepted */ + prob[0] = 1; + bEx[0] = TRUE; + } + else + { + if (delta > PROBABILITYCUTOFF) + { + prob[0] = 0; + } + else + { + prob[0] = exp(-delta); + } + // roll a number to determine if accepted. For now it is superfluous to + // reset, but just in case we ever add more calls in different branches + // it is safer to always reset the distribution. + uniformRealDist.reset(); + bEx[0] = uniformRealDist(rng) < prob[0]; + } + re->prob_sum[0] += prob[0]; + + if (bEx[0]) + { + /* swap the states */ + tmp = pind[i0]; + pind[i0] = pind[i1]; + pind[i1] = tmp; + } + } + re->nattempt[0]++; /* keep track of total permutation trials here */ + print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap); + } + else + { + /* standard nearest neighbor replica exchange */ + + m = (step / re->nst) % 2; + for (i = 1; i < re->nrepl; i++) + { + a = re->ind[i-1]; + b = re->ind[i]; + + bPrint = (re->repl == a || re->repl == b); + if (i % 2 == m) + { + delta = calc_delta(fplog, bPrint, re, a, b, a, b); + if (delta <= 0) + { + /* accepted */ + prob[i] = 1; + bEx[i] = TRUE; + } + else + { + if (delta > PROBABILITYCUTOFF) + { + prob[i] = 0; + } + else + { + prob[i] = exp(-delta); + } + // roll a number to determine if accepted. For now it is superfluous to + // reset, but just in case we ever add more calls in different branches + // it is safer to always reset the distribution. + uniformRealDist.reset(); + bEx[i] = uniformRealDist(rng) < prob[i]; + } + re->prob_sum[i] += prob[i]; + + if (bEx[i]) + { + /* swap these two */ + tmp = pind[i-1]; + pind[i-1] = pind[i]; + pind[i] = tmp; + re->nexchange[i]++; /* statistics for back compatibility */ + } + } + else + { + prob[i] = -1; + bEx[i] = FALSE; + } + } + /* print some statistics */ + print_ind(fplog, "ex", re->nrepl, re->ind, bEx); + print_prob(fplog, "pr", re->nrepl, prob); + fprintf(fplog, "\n"); + re->nattempt[m]++; + } + + /* record which moves were made and accepted */ + for (i = 0; i < re->nrepl; i++) + { + re->nmoves[re->ind[i]][pind[i]] += 1; + re->nmoves[pind[i]][re->ind[i]] += 1; + } + fflush(fplog); /* make sure we can see what the last exchange was */ +} + +static void +cyclic_decomposition(const int *destinations, + int **cyclic, + gmx_bool *incycle, + const int nrepl, + int *nswap) +{ + + int i, j, c, p; + int maxlen = 1; + for (i = 0; i < nrepl; i++) + { + incycle[i] = FALSE; + } + for (i = 0; i < nrepl; i++) /* one cycle for each replica */ + { + if (incycle[i]) + { + cyclic[i][0] = -1; + continue; + } + cyclic[i][0] = i; + incycle[i] = TRUE; + c = 1; + p = i; + for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */ + { + p = destinations[p]; /* start permuting */ + if (p == i) + { + cyclic[i][c] = -1; + if (c > maxlen) + { + maxlen = c; + } + break; /* we've reached the original element, the cycle is complete, and we marked the end. 
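A concrete example of what this routine produces (a standalone toy, not patch code): the permutation {1, 2, 0, 3} contains the 3-cycle 0 -> 1 -> 2 -> 0 plus the fixed point 3, so maxlen = 3 and *nswap = 2, i.e. realizing the 3-cycle takes two rounds of pairwise exchanges, which compute_exchange_order() lays out below.

#include <stdio.h>

/* Toy illustration: print the cycles of a small permutation. */
int main(void)
{
    int dest[4] = {1, 2, 0, 3};  /* destinations after one exchange round */
    int seen[4] = {0};

    for (int i = 0; i < 4; i++)
    {
        if (seen[i])
        {
            continue;
        }
        printf("(");
        for (int p = i; !seen[p]; p = dest[p])
        {
            seen[p] = 1;
            printf(" %d", p);
        }
        printf(" )");
    }
    printf("\n");                /* prints: ( 0 1 2 )( 3 ) */
    return 0;
}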
*/ + } + else + { + cyclic[i][c] = p; /* each permutation gives a new member of the cycle */ + incycle[p] = TRUE; + c++; + } + } + } + *nswap = maxlen - 1; + + if (debug) + { + for (i = 0; i < nrepl; i++) + { + fprintf(debug, "Cycle %d:", i); + for (j = 0; j < nrepl; j++) + { + if (cyclic[i][j] < 0) + { + break; + } + fprintf(debug, "%2d", cyclic[i][j]); + } + fprintf(debug, "\n"); + } + fflush(debug); + } +} + +static void +compute_exchange_order(int **cyclic, + int **order, + const int nrepl, + const int maxswap) +{ + int i, j; + + for (j = 0; j < maxswap; j++) + { + for (i = 0; i < nrepl; i++) + { + if (cyclic[i][j+1] >= 0) + { + order[cyclic[i][j+1]][j] = cyclic[i][j]; + order[cyclic[i][j]][j] = cyclic[i][j+1]; + } + } + for (i = 0; i < nrepl; i++) + { + if (order[i][j] < 0) + { + order[i][j] = i; /* if it's not exchanging, it should stay this round*/ + } + } + } + + if (debug) + { + fprintf(debug, "Replica Exchange Order\n"); + for (i = 0; i < nrepl; i++) + { + fprintf(debug, "Replica %d:", i); + for (j = 0; j < maxswap; j++) + { + if (order[i][j] < 0) + { + break; + } + fprintf(debug, "%2d", order[i][j]); + } + fprintf(debug, "\n"); + } + fflush(debug); + } +} + +static void +prepare_to_do_exchange(struct gmx_repl_ex *re, + const int replica_id, + int *maxswap, + gmx_bool *bThisReplicaExchanged) +{ + int i, j; + /* Hold the cyclic decomposition of the (multiple) replica + * exchange. */ + gmx_bool bAnyReplicaExchanged = FALSE; + *bThisReplicaExchanged = FALSE; + + for (i = 0; i < re->nrepl; i++) + { + if (re->destinations[i] != re->ind[i]) + { + /* only mark as exchanged if the index has been shuffled */ + bAnyReplicaExchanged = TRUE; + break; + } + } + if (bAnyReplicaExchanged) + { + /* reinitialize the placeholder arrays */ + for (i = 0; i < re->nrepl; i++) + { + for (j = 0; j < re->nrepl; j++) + { + re->cyclic[i][j] = -1; + re->order[i][j] = -1; + } + } + + /* Identify the cyclic decomposition of the permutation (very + * fast if neighbor replica exchange). */ + cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap); + + /* Now translate the decomposition into a replica exchange + * order at each step. */ + compute_exchange_order(re->cyclic, re->order, re->nrepl, *maxswap); + + /* Did this replica do any exchange at any point? */ + for (j = 0; j < *maxswap; j++) + { + if (replica_id != re->order[replica_id][j]) + { + *bThisReplicaExchanged = TRUE; + break; + } + } + } +} + +gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re, + t_state *state, const gmx_enerdata_t *enerd, + t_state *state_local, gmx_int64_t step, real time) +{ + int j; + int replica_id = 0; + int exchange_partner; + int maxswap = 0; + /* Number of rounds of exchanges needed to deal with any multiple + * exchanges. */ + /* Where each replica ends up after the exchange attempt(s). */ + /* The order in which multiple exchanges will occur. */ + gmx_bool bThisReplicaExchanged = FALSE; + + if (MASTER(cr)) + { + replica_id = re->repl; + test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time); + prepare_to_do_exchange(re, replica_id, &maxswap, &bThisReplicaExchanged); + } + /* Do intra-simulation broadcast so all processors belonging to + * each simulation know whether they need to participate in + * collecting the state. Otherwise, they might as well get on with + * the next thing to do. 
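Only the master rank ran the exchange test, so its yes/no decision must be shared over the simulation-local communicator before the other ranks can decide whether to help collect the state; that is all the MPI_Bcast below does. Reduced to its essentials (hypothetical names, plain MPI):

#include <mpi.h>

/* Hypothetical helper: share the master's exchange decision with every
 * rank of this simulation. */
static void broadcastExchangeFlag(int *flag, int masterRank, MPI_Comm simComm)
{
    MPI_Bcast(flag, 1, MPI_INT, masterRank, simComm);
}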
*/ + if (DOMAINDECOMP(cr)) + { +#if GMX_MPI + MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr), + cr->mpi_comm_mygroup); +#endif + } + + if (bThisReplicaExchanged) + { + /* Exchange the states */ + /* Collect the global state on the master node */ + if (DOMAINDECOMP(cr)) + { + dd_collect_state(cr->dd, state_local, state); + } + else + { + copy_state_serial(state_local, state); + } + + if (MASTER(cr)) + { + /* There will be only one swap cycle with standard replica + * exchange, but there may be multiple swap cycles if we + * allow multiple swaps. */ + + for (j = 0; j < maxswap; j++) + { + exchange_partner = re->order[replica_id][j]; + + if (exchange_partner != replica_id) + { + /* Exchange the global states between the master nodes */ + if (debug) + { + fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner); + } + exchange_state(cr->ms, exchange_partner, state); + } + } + /* For temperature-type replica exchange, we need to scale + * the velocities. */ + if (re->type == ereTEMP || re->type == ereTL) + { + scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]])); + } + + } + + /* With domain decomposition the global state is distributed later */ + if (!DOMAINDECOMP(cr)) + { + /* Copy the global state to the local state data structure */ + copy_state_serial(state, state_local); + } + } + + return bThisReplicaExchanged; +} + +void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re) +{ + int i; + + fprintf(fplog, "\nReplica exchange statistics\n"); + + if (re->nex == 0) + { + fprintf(fplog, "Repl %d attempts, %d odd, %d even\n", + re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]); + + fprintf(fplog, "Repl average probabilities:\n"); + for (i = 1; i < re->nrepl; i++) + { + if (re->nattempt[i%2] == 0) + { + re->prob[i] = 0; + } + else + { + re->prob[i] = re->prob_sum[i]/re->nattempt[i%2]; + } + } + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_prob(fplog, "", re->nrepl, re->prob); + + fprintf(fplog, "Repl number of exchanges:\n"); + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_count(fplog, "", re->nrepl, re->nexchange); + + fprintf(fplog, "Repl average number of exchanges:\n"); + for (i = 1; i < re->nrepl; i++) + { + if (re->nattempt[i%2] == 0) + { + re->prob[i] = 0; + } + else + { + re->prob[i] = ((real)re->nexchange[i])/re->nattempt[i%2]; + } + } + print_ind(fplog, "", re->nrepl, re->ind, nullptr); + print_prob(fplog, "", re->nrepl, re->prob); + + fprintf(fplog, "\n"); + } + /* print the transition matrix */ + print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt); +} diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.h b/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.h new file mode 100644 index 0000000000000000000000000000000000000000..7d4a68001552a89de1175a5069899d1d3c6d353c --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.h @@ -0,0 +1,103 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. 
+ * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef _repl_ex_h +#define _repl_ex_h + +#include <cstdio> + +#include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/real.h" + +struct gmx_enerdata_t; +struct gmx_multisim_t; +struct t_commrec; +struct t_inputrec; +class t_state; + +/* The parameters for the replica exchange algorithm */ +struct ReplicaExchangeParameters +{ + ReplicaExchangeParameters() : + exchangeInterval(0), + numExchanges(0), + randomSeed(-1) + { + }; + + int exchangeInterval; /* Interval in steps at which to attempt exchanges, 0 means no replica exchange */ + int numExchanges; /* The number of exchanges to attempt at an exchange step */ + int randomSeed; /* The random seed, -1 means generate a seed */ +}; + +/* Abstract type for replica exchange */ +typedef struct gmx_repl_ex *gmx_repl_ex_t; + +gmx_repl_ex_t +init_replica_exchange(FILE *fplog, + const gmx_multisim_t *ms, + int numAtomsInSystem, + const t_inputrec *ir, + const ReplicaExchangeParameters &replExParams); +/* Should only be called on the master ranks */ + +gmx_bool replica_exchange(FILE *fplog, + const t_commrec *cr, + gmx_repl_ex_t re, + t_state *state, const gmx_enerdata_t *enerd, + t_state *state_local, + gmx_int64_t step, real time); +/* Attempts replica exchange, should be called on all ranks. + * Returns TRUE if this state has been exchanged. + * When running each replica in parallel, + * this routine collects the state on the master rank before exchange. + * With domain decomposition, the global state after exchange is stored + * in state and still needs to be redistributed over the ranks. 
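For orientation, a hypothetical caller sketch of this API; names such as bDoReplEx, repl_ex, state_global, state, enerd, step and t are assumptions mirroring mdrun's MD loop, not declarations from this header:

/* Inside the MD loop, every replExParams.exchangeInterval steps: */
gmx_bool bExchanged = FALSE;
if (bDoReplEx)
{
    bExchanged = replica_exchange(fplog, cr, repl_ex,
                                  state_global, enerd, state, step, t);
}
/* With domain decomposition, a repartition must follow a successful
 * exchange so that the new global state reaches the local ranks. */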
+ */ + +void print_replica_exchange_statistics(FILE *fplog, gmx_repl_ex_t re); +/* Should only be called on the master ranks */ + +/* PLUMED HREX */ +extern int replica_exchange_get_repl(const gmx_repl_ex_t re); +extern int replica_exchange_get_nrepl(const gmx_repl_ex_t re); +extern void pd_collect_state(const t_commrec *cr, t_state *state); +extern void exchange_state(const gmx_multisim_t *ms, int b, t_state *state); +extern void copy_state_serial(const t_state *src, t_state *dest); +/* END PLUMED HREX */ + +#endif /* _repl_ex_h */ diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.h.preplumed b/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.h.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..30c75258cb94373e4a08dba8f16cda339060e599 --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/repl_ex.h.preplumed @@ -0,0 +1,95 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. 
+ */ + +#ifndef _repl_ex_h +#define _repl_ex_h + +#include <cstdio> + +#include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/real.h" + +struct gmx_enerdata_t; +struct gmx_multisim_t; +struct t_commrec; +struct t_inputrec; +class t_state; + +/* The parameters for the replica exchange algorithm */ +struct ReplicaExchangeParameters +{ + ReplicaExchangeParameters() : + exchangeInterval(0), + numExchanges(0), + randomSeed(-1) + { + }; + + int exchangeInterval; /* Interval in steps at which to attempt exchanges, 0 means no replica exchange */ + int numExchanges; /* The number of exchanges to attempt at an exchange step */ + int randomSeed; /* The random seed, -1 means generate a seed */ +}; + +/* Abstract type for replica exchange */ +typedef struct gmx_repl_ex *gmx_repl_ex_t; + +gmx_repl_ex_t +init_replica_exchange(FILE *fplog, + const gmx_multisim_t *ms, + int numAtomsInSystem, + const t_inputrec *ir, + const ReplicaExchangeParameters &replExParams); +/* Should only be called on the master ranks */ + +gmx_bool replica_exchange(FILE *fplog, + const t_commrec *cr, + gmx_repl_ex_t re, + t_state *state, const gmx_enerdata_t *enerd, + t_state *state_local, + gmx_int64_t step, real time); +/* Attempts replica exchange, should be called on all ranks. + * Returns TRUE if this state has been exchanged. + * When running each replica in parallel, + * this routine collects the state on the master rank before exchange. + * With domain decomposition, the global state after exchange is stored + * in state and still needs to be redistributed over the ranks. + */ + +void print_replica_exchange_statistics(FILE *fplog, gmx_repl_ex_t re); +/* Should only be called on the master ranks */ + +#endif /* _repl_ex_h */ diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.cpp b/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..983d71938a93a381a32d6f191ece082b389d90dd --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.cpp @@ -0,0 +1,1438 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. 
We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements the MD runner routine calling all integrators. + * + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \ingroup module_mdlib + */ +#include "gmxpre.h" + +#include "runner.h" + +#include "config.h" + +#include <cassert> +#include <csignal> +#include <cstdlib> +#include <cstring> + +#include <algorithm> + +#include "gromacs/commandline/filenm.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/ewald-utils.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/fileio/checkpoint.h" +#include "gromacs/fileio/oenv.h" +#include "gromacs/fileio/tpxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/cpuinfo.h" +#include "gromacs/hardware/detecthardware.h" +#include "gromacs/hardware/printhardware.h" +#include "gromacs/listed-forces/disre.h" +#include "gromacs/listed-forces/orires.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/utilities.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/calc_verletbuf.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/integrator.h" +#include "gromacs/mdlib/main.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/minimize.h" +#include "gromacs/mdlib/nb_verlet.h" +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" +#include "gromacs/mdlib/nbnxn_search.h" +#include "gromacs/mdlib/nbnxn_tuning.h" +#include "gromacs/mdlib/qmmm.h" +#include "gromacs/mdlib/sighandler.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/tpi.h" +#include "gromacs/mdrunutility/mdmodules.h" +#include "gromacs/mdrunutility/threadaffinity.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/observableshistory.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/pulling/pull.h" +#include "gromacs/pulling/pull_rotation.h" +#include "gromacs/taskassignment/decidegpuusage.h" +#include "gromacs/taskassignment/resourcedivision.h" +#include "gromacs/taskassignment/taskassignment.h" +#include "gromacs/taskassignment/usergpuids.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/trajectory/trajectoryframe.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/filestream.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/gmxmpi.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/loggerbuilder.h" +#include "gromacs/utility/pleasecite.h" +#include "gromacs/utility/programcontext.h" +#include "gromacs/utility/smalloc.h" +#include "gromacs/utility/stringutil.h" + +#include "deform.h" +#include "md.h" +#include "membed.h" +#include "repl_ex.h" + +/* PLUMED */ +#include 
"../../../Plumed.h" +extern int plumedswitch; +extern plumed plumedmain; +/* END PLUMED */ + +#ifdef GMX_FAHCORE +#include "corewrap.h" +#endif + +//! First step used in pressure scaling +gmx_int64_t deform_init_init_step_tpx; +//! Initial box for pressure scaling +matrix deform_init_box_tpx; +//! MPI variable for use in pressure scaling +tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER; + +namespace gmx +{ + +void Mdrunner::reinitializeOnSpawnedThread() +{ + // TODO This duplication is formally necessary if any thread might + // modify any memory in fnm or the pointers it contains. If the + // contents are ever provably const, then we can remove this + // allocation (and memory leak). + // TODO This should probably become part of a copy constructor for + // Mdrunner. + fnm = dup_tfn(nfile, fnm); + + cr = reinitialize_commrec_for_this_thread(cr); + + if (!MASTER(cr)) + { + // Only the master rank writes to the log files + fplog = nullptr; + } +} + +/*! \brief The callback used for running on spawned threads. + * + * Obtains the pointer to the master mdrunner object from the one + * argument permitted to the thread-launch API call, copies it to make + * a new runner for this thread, reinitializes necessary data, and + * proceeds to the simulation. */ +static void mdrunner_start_fn(void *arg) +{ + try + { + auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner *>(arg); + /* copy the arg list to make sure that it's thread-local. This + doesn't copy pointed-to items, of course, but those are all + const. */ + gmx::Mdrunner mdrunner = *masterMdrunner; + mdrunner.reinitializeOnSpawnedThread(); + mdrunner.mdrunner(); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; +} + + +/*! \brief Start thread-MPI threads. + * + * Called by mdrunner() to start a specific number of threads + * (including the main thread) for thread-parallel runs. This in turn + * calls mdrunner() for each thread. All options are the same as for + * mdrunner(). */ +t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) +{ + + /* first check whether we even need to start tMPI */ + if (numThreadsToLaunch < 2) + { + return cr; + } + + gmx::Mdrunner spawnedMdrunner = *this; + // TODO This duplication is formally necessary if any thread might + // modify any memory in fnm or the pointers it contains. If the + // contents are ever provably const, then we can remove this + // allocation (and memory leak). + // TODO This should probably become part of a copy constructor for + // Mdrunner. + spawnedMdrunner.fnm = dup_tfn(this->nfile, fnm); + +#if GMX_THREAD_MPI + /* now spawn new threads that start mdrunner_start_fn(), while + the main thread returns, we set thread affinity later */ + if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, + mdrunner_start_fn, static_cast<void*>(&spawnedMdrunner)) != TMPI_SUCCESS) + { + GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads")); + } +#else + GMX_UNUSED_VALUE(mdrunner_start_fn); +#endif + + return reinitialize_commrec_for_this_thread(cr); +} + +} // namespace + +/*! 
\brief Initialize variables for Verlet scheme simulation */ +static void prepare_verlet_scheme(FILE *fplog, + t_commrec *cr, + t_inputrec *ir, + int nstlist_cmdline, + const gmx_mtop_t *mtop, + const matrix box, + bool makeGpuPairList, + const gmx::CpuInfo &cpuinfo) +{ + /* For NVE simulations, we will retain the initial list buffer */ + if (EI_DYNAMICS(ir->eI) && + ir->verletbuf_tol > 0 && + !(EI_MD(ir->eI) && ir->etc == etcNO)) + { + /* Update the Verlet buffer size for the current run setup */ + + /* Here we assume SIMD-enabled kernels are being used. But as currently + * calc_verlet_buffer_size gives the same results for 4x8 and 4x4 + * and 4x2 gives a larger buffer than 4x4, this is ok. + */ + ListSetupType listType = (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported); + VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType); + + real rlist_new; + calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &listSetup, nullptr, &rlist_new); + + if (rlist_new != ir->rlist) + { + if (fplog != nullptr) + { + fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n", + ir->rlist, rlist_new, + listSetup.cluster_size_i, listSetup.cluster_size_j); + } + ir->rlist = rlist_new; + } + } + + if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0)) + { + gmx_fatal(FARGS, "Can not set nstlist without %s", + !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance"); + } + + if (EI_DYNAMICS(ir->eI)) + { + /* Set or try nstlist values */ + increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo); + } +} + +/*! \brief Override the nslist value in inputrec + * + * with value passed on the command line (if any) + */ +static void override_nsteps_cmdline(const gmx::MDLogger &mdlog, + gmx_int64_t nsteps_cmdline, + t_inputrec *ir) +{ + assert(ir); + + /* override with anything else than the default -2 */ + if (nsteps_cmdline > -2) + { + char sbuf_steps[STEPSTRSIZE]; + char sbuf_msg[STRLEN]; + + ir->nsteps = nsteps_cmdline; + if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1) + { + sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps", + gmx_step_str(nsteps_cmdline, sbuf_steps), + fabs(nsteps_cmdline*ir->delta_t)); + } + else + { + sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps", + gmx_step_str(nsteps_cmdline, sbuf_steps)); + } + + GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg); + } + else if (nsteps_cmdline < -2) + { + gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %d", + nsteps_cmdline); + } + /* Do nothing if nsteps_cmdline == -2 */ +} + +namespace gmx +{ + +/*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings. + * + * If not, and if a warning may be issued, logs a warning about + * falling back to CPU code. With thread-MPI, only the first + * call to this function should have \c issueWarning true. */ +static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger &mdlog, + const t_inputrec *ir, + bool issueWarning) +{ + if (ir->opts.ngener - ir->nwall > 1) + { + /* The GPU code does not support more than one energy group. + * If the user requested GPUs explicitly, a fatal error is given later. + */ + if (issueWarning) + { + GMX_LOG(mdlog.warning).asParagraph() + .appendText("Multiple energy groups is not implemented for GPUs, falling back to the CPU. 
" + "For better performance, run on the GPU without energy groups and then do " + "gmx mdrun -rerun option on the trajectory with an energy group .tpr file."); + } + return false; + } + return true; +} + +//! \brief Return the correct integrator function. +static integrator_t *my_integrator(unsigned int ei) +{ + switch (ei) + { + case eiMD: + case eiBD: + case eiSD1: + case eiVV: + case eiVVAK: + if (!EI_DYNAMICS(ei)) + { + GMX_THROW(APIError("do_md integrator would be called for a non-dynamical integrator")); + } + return do_md; + case eiSteep: + return do_steep; + case eiCG: + return do_cg; + case eiNM: + return do_nm; + case eiLBFGS: + return do_lbfgs; + case eiTPI: + case eiTPIC: + if (!EI_TPI(ei)) + { + GMX_THROW(APIError("do_tpi integrator would be called for a non-TPI integrator")); + } + return do_tpi; + case eiSD2_REMOVED: + GMX_THROW(NotImplementedError("SD2 integrator has been removed")); + default: + GMX_THROW(APIError("Non existing integrator selected")); + } +} + +//! Initializes the logger for mdrun. +static gmx::LoggerOwner buildLogger(FILE *fplog, const t_commrec *cr) +{ + gmx::LoggerBuilder builder; + if (fplog != nullptr) + { + builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog); + } + if (cr == nullptr || SIMMASTER(cr)) + { + builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, + &gmx::TextOutputFile::standardError()); + } + return builder.build(); +} + +//! Make a TaskTarget from an mdrun argument string. +static TaskTarget findTaskTarget(const char *optionString) +{ + TaskTarget returnValue = TaskTarget::Auto; + + if (strncmp(optionString, "auto", 3) == 0) + { + returnValue = TaskTarget::Auto; + } + else if (strncmp(optionString, "cpu", 3) == 0) + { + returnValue = TaskTarget::Cpu; + } + else if (strncmp(optionString, "gpu", 3) == 0) + { + returnValue = TaskTarget::Gpu; + } + else + { + GMX_ASSERT(false, "Option string should have been checked for sanity already"); + } + + return returnValue; +} + +int Mdrunner::mdrunner() +{ + matrix box; + gmx_ddbox_t ddbox = {0}; + int npme_major, npme_minor; + t_nrnb *nrnb; + gmx_mtop_t *mtop = nullptr; + t_forcerec *fr = nullptr; + t_fcdata *fcd = nullptr; + real ewaldcoeff_q = 0; + real ewaldcoeff_lj = 0; + gmx_vsite_t *vsite = nullptr; + gmx_constr_t constr; + int nChargePerturbed = -1, nTypePerturbed = 0; + gmx_wallcycle_t wcycle; + gmx_walltime_accounting_t walltime_accounting = nullptr; + int rc; + gmx_int64_t reset_counters; + int nthreads_pme = 1; + gmx_membed_t * membed = nullptr; + gmx_hw_info_t *hwinfo = nullptr; + + /* CAUTION: threads may be started later on in this function, so + cr doesn't reflect the final parallel state right now */ + gmx::MDModules mdModules; + t_inputrec inputrecInstance; + t_inputrec *inputrec = &inputrecInstance; + snew(mtop, 1); + + if (mdrunOptions.continuationOptions.appendFiles) + { + fplog = nullptr; + } + + bool doMembed = opt2bSet("-membed", nfile, fnm); + bool doRerun = mdrunOptions.rerun; + + // Handle task-assignment related user options. + EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ? + EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No); + std::vector<int> gpuIdsAvailable; + try + { + gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable); + // TODO We could put the GPU IDs into a std::map to find + // duplicates, but for the small numbers of IDs involved, this + // code is simple and fast. 
+ for (size_t i = 0; i != gpuIdsAvailable.size(); ++i) + { + for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j) + { + if (gpuIdsAvailable[i] == gpuIdsAvailable[j]) + { + GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str()))); + } + } + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + std::vector<int> userGpuTaskAssignment; + try + { + userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + auto nonbondedTarget = findTaskTarget(nbpu_opt); + auto pmeTarget = findTaskTarget(pme_opt); + auto pmeFftTarget = findTaskTarget(pme_fft_opt); + PmeRunMode pmeRunMode = PmeRunMode::None; + + // Here we assume that SIMMASTER(cr) does not change even after the + // threads are started. + gmx::LoggerOwner logOwner(buildLogger(fplog, cr)); + gmx::MDLogger mdlog(logOwner.logger()); + + hwinfo = gmx_detect_hardware(mdlog, cr); + + gmx_print_detected_hardware(fplog, cr, mdlog, hwinfo); + + std::vector<int> gpuIdsToUse; + auto compatibleGpus = getCompatibleGpus(hwinfo->gpu_info); + if (gpuIdsAvailable.empty()) + { + gpuIdsToUse = compatibleGpus; + } + else + { + for (const auto &availableGpuId : gpuIdsAvailable) + { + bool availableGpuIsCompatible = false; + for (const auto &compatibleGpuId : compatibleGpus) + { + if (availableGpuId == compatibleGpuId) + { + availableGpuIsCompatible = true; + break; + } + } + if (!availableGpuIsCompatible) + { + gmx_fatal(FARGS, "You limited the set of compatible GPUs to a set that included ID #%d, but that ID is not for a compatible GPU. List only compatible GPUs.", availableGpuId); + } + gpuIdsToUse.push_back(availableGpuId); + } + } + + if (fplog != nullptr) + { + /* Print references after all software/hardware printing */ + please_cite(fplog, "Abraham2015"); + please_cite(fplog, "Pall2015"); + please_cite(fplog, "Pronk2013"); + please_cite(fplog, "Hess2008b"); + please_cite(fplog, "Spoel2005a"); + please_cite(fplog, "Lindahl2001a"); + please_cite(fplog, "Berendsen95a"); + } + + std::unique_ptr<t_state> globalState; + + if (SIMMASTER(cr)) + { + /* Only the master rank has the global state */ + globalState = std::unique_ptr<t_state>(new t_state); + + /* Read (nearly) all data required for the simulation */ + read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, globalState.get(), mtop); + + if (inputrec->cutoff_scheme != ecutsVERLET) + { + if (nstlist_cmdline > 0) + { + gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme"); + } + + if (!compatibleGpus.empty()) + { + GMX_LOG(mdlog.warning).asParagraph().appendText( + "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" + " To use a GPU, set the mdp option: cutoff-scheme = Verlet"); + } + } + } + + /* Check and update the hardware options for internal consistency */ + check_and_update_hw_opt_1(&hw_opt, cr, domdecOptions.numPmeRanks); + + /* Early check for externally set process affinity. */ + gmx_check_thread_affinity_set(mdlog, cr, + &hw_opt, hwinfo->nthreads_hw_avail, FALSE); + + if (GMX_THREAD_MPI && SIMMASTER(cr)) + { + if (domdecOptions.numPmeRanks > 0 && hw_opt.nthreads_tmpi <= 0) + { + gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks"); + } + + /* Since the master knows the cut-off scheme, update hw_opt for this. + * This is done later for normal MPI and also once more with tMPI + * for all tMPI ranks. 
+ */ + check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme); + + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + try + { + // If the user specified the number of ranks, then we must + // respect that, but in default mode, we need to allow for + // the number of GPUs to choose the number of ranks. + + useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi + (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded, + inputrec->cutoff_scheme == ecutsVERLET, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, GMX_THREAD_MPI), + hw_opt.nthreads_tmpi); + auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype); + auto canUseGpuForPme = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr); + useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi + (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, + canUseGpuForPme, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks); + + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + /* Determine how many thread-MPI ranks to start. + * + * TODO Over-writing the user-supplied value here does + * prevent any possible subsequent checks from working + * correctly. */ + hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo, + &hw_opt, + gpuIdsToUse, + useGpuForNonbonded, + useGpuForPme, + inputrec, mtop, + mdlog, + doMembed); + + // Now start the threads for thread MPI. + cr = spawnThreads(hw_opt.nthreads_tmpi); + /* The main thread continues here with a new cr. We don't deallocate + the old cr because other threads may still be reading it. */ + // TODO Both master and spawned threads call dup_tfn and + // reinitialize_commrec_for_this_thread. Find a way to express + // this better. + } + /* END OF CAUTION: cr is now reliable */ + + if (PAR(cr)) + { + /* now broadcast everything to the non-master nodes/threads: */ + init_parallel(cr, inputrec, mtop); + } + + // Now each rank knows the inputrec that SIMMASTER read and used, + // and (if applicable) cr->nnodes has been assigned the number of + // thread-MPI ranks that have been chosen. The ranks can now all + // run the task-deciding functions and will agree on the result + // without needing to communicate. + // + // TODO Should we do the communication in debug mode to support + // having an assertion? + // + // Note that these variables describe only their own node. + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + try + { + // It's possible that there are different numbers of GPUs on + // different nodes, which is the user's responsibilty to + // handle. If unsuitable, we will notice that during task + // assignment. + bool gpusWereDetected = hwinfo->ngpu_compatible_tot > 0; + useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment, + emulateGpuNonbonded, inputrec->cutoff_scheme == ecutsVERLET, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, !GMX_THREAD_MPI), + gpusWereDetected); + auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype); + auto canUseGpuForPme = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr); + useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, + canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks, + gpusWereDetected); + + pmeRunMode = (useGpuForPme ? 
PmeRunMode::GPU : PmeRunMode::CPU); + if (pmeRunMode == PmeRunMode::GPU) + { + if (pmeFftTarget == TaskTarget::Cpu) + { + pmeRunMode = PmeRunMode::Mixed; + } + } + else if (pmeFftTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME on CPU you should not be using -pmefft."); + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + // TODO: Error handling + mdModules.assignOptionsToModules(*inputrec->params, nullptr); + + if (fplog != nullptr) + { + pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE); + fprintf(fplog, "\n"); + } + + if (SIMMASTER(cr)) + { + /* now make sure the state is initialized and propagated */ + set_state_entries(globalState.get(), inputrec); + } + + /* NM and TPI parallelize over force/energy calculations, not atoms, + * so we need to initialize and broadcast the global state. + */ + if (inputrec->eI == eiNM || inputrec->eI == eiTPI) + { + if (!MASTER(cr)) + { + globalState = std::unique_ptr<t_state>(new t_state); + } + broadcastStateWithoutDynamics(cr, globalState.get()); + } + + /* A parallel command line option consistency check that we can + only do after any threads have started. */ + if (!PAR(cr) && (domdecOptions.numCells[XX] > 1 || + domdecOptions.numCells[YY] > 1 || + domdecOptions.numCells[ZZ] > 1 || + domdecOptions.numPmeRanks > 0)) + { + gmx_fatal(FARGS, + "The -dd or -npme option request a parallel simulation, " +#if !GMX_MPI + "but %s was compiled without threads or MPI enabled" +#else +#if GMX_THREAD_MPI + "but the number of MPI-threads (option -ntmpi) is not set or is 1" +#else + "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec" +#endif +#endif + , output_env_get_program_display_name(oenv) + ); + } + + if (doRerun && + (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI)) + { + gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun"); + } + + if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr)) + { + gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank"); + } + + if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))) + { + if (domdecOptions.numPmeRanks > 0) + { + gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr), + "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ"); + } + + domdecOptions.numPmeRanks = 0; + } + + if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0) + { + /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can + * improve performance with many threads per GPU, since our OpenMP + * scaling is bad, but it's difficult to automate the setup. + */ + domdecOptions.numPmeRanks = 0; + } + if (useGpuForPme) + { + if (domdecOptions.numPmeRanks < 0) + { + domdecOptions.numPmeRanks = 0; + // TODO possibly print a note that one can opt-in for a separate PME GPU rank? + } + else + { + GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported"); + } + } + +#ifdef GMX_FAHCORE + if (MASTER(cr)) + { + fcRegisterSteps(inputrec->nsteps, inputrec->init_step); + } +#endif + + /* NMR restraints must be initialized before load_checkpoint, + * since with time averaging the history is added to t_state. + * For proper consistency check we therefore need to extend + * t_state here. + * So the PME-only nodes (if present) will also initialize + * the distance restraints. 
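+     * (Hence snew(fcd, 1) and the init_disres()/init_orires() calls just
+     * below run unconditionally, on every rank.)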
+ */ + snew(fcd, 1); + + /* This needs to be called before read_checkpoint to extend the state */ + init_disres(fplog, mtop, inputrec, cr, fcd, globalState.get(), replExParams.exchangeInterval > 0); + + init_orires(fplog, mtop, inputrec, cr, globalState.get(), &(fcd->orires)); + + if (inputrecDeform(inputrec)) + { + /* Store the deform reference box before reading the checkpoint */ + if (SIMMASTER(cr)) + { + copy_mat(globalState->box, box); + } + if (PAR(cr)) + { + gmx_bcast(sizeof(box), box, cr); + } + /* Because we do not have the update struct available yet + * in which the reference values should be stored, + * we store them temporarily in static variables. + * This should be thread safe, since they are only written once + * and with identical values. + */ + tMPI_Thread_mutex_lock(&deform_init_box_mutex); + deform_init_init_step_tpx = inputrec->init_step; + copy_mat(box, deform_init_box_tpx); + tMPI_Thread_mutex_unlock(&deform_init_box_mutex); + } + + ObservablesHistory observablesHistory = {}; + + ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + + if (continuationOptions.startedFromCheckpoint) + { + /* Check if checkpoint file exists before doing continuation. + * This way we can use identical input options for the first and subsequent runs... + */ + gmx_bool bReadEkin; + + load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog, + cr, domdecOptions.numCells, + inputrec, globalState.get(), + &bReadEkin, &observablesHistory, + continuationOptions.appendFiles, + continuationOptions.appendFilesOptionSet, + mdrunOptions.reproducible); + + if (bReadEkin) + { + continuationOptions.haveReadEkin = true; + } + } + + if (SIMMASTER(cr) && continuationOptions.appendFiles) + { + gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, + continuationOptions.appendFiles, &fplog); + logOwner = buildLogger(fplog, nullptr); + mdlog = logOwner.logger(); + } + + if (mdrunOptions.numStepsCommandline > -2) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -nsteps functionality is deprecated, and may be removed in a future version. " + "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field."); + } + /* override nsteps with value set on the commamdline */ + override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec); + + if (SIMMASTER(cr)) + { + copy_mat(globalState->box, box); + } + + if (PAR(cr)) + { + gmx_bcast(sizeof(box), box, cr); + } + + /* Update rlist and nstlist. */ + if (inputrec->cutoff_scheme == ecutsVERLET) + { + prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, mtop, box, + useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo); + } + + if (PAR(cr) && !(EI_TPI(inputrec->eI) || + inputrec->eI == eiNM)) + { + const rvec *xOnMaster = (SIMMASTER(cr) ? as_rvec_array(globalState->x.data()) : nullptr); + + cr->dd = init_domain_decomposition(fplog, cr, domdecOptions, mdrunOptions, + mtop, inputrec, + box, xOnMaster, + &ddbox, &npme_major, &npme_minor); + // Note that local state still does not exist yet. + } + else + { + /* PME, if used, is done on all nodes with 1D decomposition */ + cr->npmenodes = 0; + cr->duty = (DUTY_PP | DUTY_PME); + npme_major = 1; + npme_minor = 1; + + if (inputrec->ePBC == epbcSCREW) + { + gmx_fatal(FARGS, + "pbc=%s is only implemented with domain decomposition", + epbc_names[inputrec->ePBC]); + } + } + + if (PAR(cr)) + { + /* After possible communicator splitting in make_dd_communicators. + * we can set up the intra/inter node communication. 
+ */ + gmx_setup_nodecomm(fplog, cr); + } + + /* Initialize per-physical-node MPI process/thread ID and counters. */ + gmx_init_intranode_counters(cr); + if (cr->ms && cr->ms->nsim > 1 && !opt2bSet("-multidir", nfile, fnm)) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -multi flag is deprecated, and may be removed in a future version. Please " + "update your workflows to use -multidir instead."); + } +#if GMX_MPI + if (MULTISIM(cr)) + { + GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( + "This is simulation %d out of %d running as a composite GROMACS\n" + "multi-simulation job. Setup for this simulation:\n", + cr->ms->sim, cr->ms->nsim); + } + GMX_LOG(mdlog.warning).appendTextFormatted( + "Using %d MPI %s\n", + cr->nnodes, +#if GMX_THREAD_MPI + cr->nnodes == 1 ? "thread" : "threads" +#else + cr->nnodes == 1 ? "process" : "processes" +#endif + ); + fflush(stderr); +#endif + + /* Check and update hw_opt for the cut-off scheme */ + check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme); + + /* Check and update the number of OpenMP threads requested */ + checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, pmeRunMode, *mtop); + + gmx_omp_nthreads_init(mdlog, cr, + hwinfo->nthreads_hw_avail, + hw_opt.nthreads_omp, + hw_opt.nthreads_omp_pme, + !thisRankHasDuty(cr, DUTY_PP), + inputrec->cutoff_scheme == ecutsVERLET); + +#ifndef NDEBUG + if (EI_TPI(inputrec->eI) && + inputrec->cutoff_scheme == ecutsVERLET) + { + gmx_feenableexcept(); + } +#endif + + // Build a data structure that expresses which kinds of non-bonded + // task are handled by this rank. + // + // TODO Later, this might become a loop over all registered modules + // relevant to the mdp inputs, to find those that have such tasks. + // + // TODO This could move before init_domain_decomposition() as part + // of refactoring that separates the responsibility for duty + // assignment from setup for communication between tasks, and + // setup for tasks handled with a domain (ie including short-ranged + // tasks, bonded tasks, etc.). + // + // Note that in general useGpuForNonbonded, etc. can have a value + // that is inconsistent with the presence of actual GPUs on any + // rank, and that is not known to be a problem until the + // duty of the ranks on a node become node. + // + // TODO Later we might need the concept of computeTasksOnThisRank, + // from which we construct gpuTasksOnThisRank. + // + // Currently the DD code assigns duty to ranks that can + // include PP work that currently can be executed on a single + // GPU, if present and compatible. This has to be coordinated + // across PP ranks on a node, with possible multiple devices + // or sharing devices on a node, either from the user + // selection, or automatically. + auto haveGpus = !gpuIdsToUse.empty(); + std::vector<GpuTask> gpuTasksOnThisRank; + if (thisRankHasDuty(cr, DUTY_PP)) + { + if (useGpuForNonbonded) + { + if (haveGpus) + { + gpuTasksOnThisRank.push_back(GpuTask::Nonbonded); + } + else if (nonbondedTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because there is none detected."); + } + } + } + // TODO cr->duty & DUTY_PME should imply that a PME algorithm is active, but currently does not. 
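+    // Hence the explicit EEL_PME() test below: carrying DUTY_PME does not by
+    // itself guarantee that a PME algorithm is actually in use.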
+ if (EEL_PME(inputrec->coulombtype) && (thisRankHasDuty(cr, DUTY_PME))) + { + if (useGpuForPme) + { + if (haveGpus) + { + gpuTasksOnThisRank.push_back(GpuTask::Pme); + } + else if (pmeTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run PME on a GPU because there is none detected."); + } + } + } + + GpuTaskAssignment gpuTaskAssignment; + try + { + // Produce the task assignment for this rank. + gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo, + mdlog, cr, gpuTasksOnThisRank); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + /* Prevent other ranks from continuing after an issue was found + * and reported as a fatal error. + * + * TODO This function implements a barrier so that MPI runtimes + * can organize an orderly shutdown if one of the ranks has had to + * issue a fatal error in various code already run. When we have + * MPI-aware error handling and reporting, this should be + * improved. */ +#if GMX_MPI + if (PAR(cr)) + { + MPI_Barrier(cr->mpi_comm_mysim); + } + if (MULTISIM(cr)) + { + if (SIMMASTER(cr)) + { + MPI_Barrier(cr->ms->mpi_comm_masters); + } + /* We need another barrier to prevent non-master ranks from contiuing + * when an error occured in a different simulation. + */ + MPI_Barrier(cr->mpi_comm_mysim); + } +#endif + + /* Now that we know the setup is consistent, check for efficiency */ + check_resource_division_efficiency(hwinfo, !gpuTaskAssignment.empty(), mdrunOptions.ntompOptionIsSet, + cr, mdlog); + + gmx_device_info_t *nonbondedDeviceInfo = nullptr; + + if (thisRankHasDuty(cr, DUTY_PP)) + { + // This works because only one task of each type is currently permitted. + auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), + hasTaskType<GpuTask::Nonbonded>); + if (nbGpuTaskMapping != gpuTaskAssignment.end()) + { + int nonbondedDeviceId = nbGpuTaskMapping->deviceId_; + nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId); + init_gpu(mdlog, nonbondedDeviceInfo); + + if (DOMAINDECOMP(cr)) + { + /* When we share GPUs over ranks, we need to know this for the DLB */ + dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId); + } + + } + } + + gmx_device_info_t *pmeDeviceInfo = nullptr; + // This works because only one task of each type is currently permitted. + auto pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>); + if (pmeGpuTaskMapping != gpuTaskAssignment.end()) + { + pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_); + init_gpu(mdlog, pmeDeviceInfo); + } + + /* getting number of PP/PME threads + PME: env variable should be read only on one node to make sure it is + identical everywhere; + */ + nthreads_pme = gmx_omp_nthreads_get(emntPME); + + int numThreadsOnThisRank; + /* threads on this MPI process or TMPI thread */ + if (thisRankHasDuty(cr, DUTY_PP)) + { + numThreadsOnThisRank = gmx_omp_nthreads_get(emntNonbonded); + } + else + { + numThreadsOnThisRank = nthreads_pme; + } + + checkHardwareOversubscription(numThreadsOnThisRank, + *hwinfo->hardwareTopology, + cr, mdlog); + + if (hw_opt.thread_affinity != threadaffOFF) + { + /* Before setting affinity, check whether the affinity has changed + * - which indicates that probably the OpenMP library has changed it + * since we first checked). 
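+         * (A first pass with the last argument FALSE ran earlier, before the
+         * thread-MPI ranks were spawned; this second pass with TRUE re-checks
+         * on the final set of ranks, just before the affinity is set below.)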
+ */ + gmx_check_thread_affinity_set(mdlog, cr, + &hw_opt, hwinfo->nthreads_hw_avail, TRUE); + + /* Set the CPU affinity */ + gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology, + numThreadsOnThisRank, nullptr); + } + + if (mdrunOptions.timingOptions.resetStep > -1) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -resetstep functionality is deprecated, and may be removed in a future version."); + } + wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr); + + if (PAR(cr)) + { + /* Master synchronizes its value of reset_counters with all nodes + * including PME only nodes */ + reset_counters = wcycle_get_reset_counters(wcycle); + gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr); + wcycle_set_reset_counters(wcycle, reset_counters); + } + + // Membrane embedding must be initialized before we call init_forcerec() + if (doMembed) + { + if (MASTER(cr)) + { + fprintf(stderr, "Initializing membed"); + } + /* Note that membed cannot work in parallel because mtop is + * changed here. Fix this if we ever want to make it run with + * multiple ranks. */ + membed = init_membed(fplog, nfile, fnm, mtop, inputrec, globalState.get(), cr, &mdrunOptions.checkpointOptions.period); + } + + std::unique_ptr<MDAtoms> mdAtoms; + + snew(nrnb, 1); + if (thisRankHasDuty(cr, DUTY_PP)) + { + /* Initiate forcerecord */ + fr = mk_forcerec(); + fr->forceProviders = mdModules.initForceProviders(); + init_forcerec(fplog, mdlog, fr, fcd, + inputrec, mtop, cr, box, + opt2fn("-table", nfile, fnm), + opt2fn("-tablep", nfile, fnm), + getFilenm("-tableb", nfile, fnm), + *hwinfo, nonbondedDeviceInfo, + FALSE, + pforce); + + /* Initialize QM-MM */ + if (fr->bQMMM) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("Large parts of the QM/MM support is deprecated, and may be removed in a future " + "version. Please get in touch with the developers if you find the support useful, " + "as help is needed if the functionality is to continue to be available."); + init_QMMMrec(cr, mtop, inputrec, fr); + } + + /* Initialize the mdAtoms structure. + * mdAtoms is not filled with atom data, + * as this can not be done now with domain decomposition. + */ + const bool useGpuForPme = (pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed); + mdAtoms = makeMDAtoms(fplog, *mtop, *inputrec, useGpuForPme && thisRankHasDuty(cr, DUTY_PME)); + if (globalState) + { + // The pinning of coordinates in the global state object works, because we only use + // PME on GPU without DD or on a separate PME rank, and because the local state pointer + // points to the global state object without DD. + // FIXME: MD and EM separately set up the local state - this should happen in the same function, + // which should also perform the pinning. + changePinningPolicy(&globalState->x, useGpuForPme ? PinningPolicy::CanBePinned : PinningPolicy::CannotBePinned); + } + + /* Initialize the virtual site communication */ + vsite = initVsite(*mtop, cr); + + calc_shifts(box, fr->shift_vec); + + /* With periodic molecules the charge groups should be whole at start up + * and the virtual sites should not be far from their proper positions. 
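+         * (On the master rank, do_pbc_first_mtop() below makes the molecules
+         * whole again, and constructVsitesGlobal() then rebuilds the
+         * virtual-site positions.)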
+ */ + if (!inputrec->bContinuation && MASTER(cr) && + !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols)) + { + /* Make molecules whole at start of run */ + if (fr->ePBC != epbcNONE) + { + rvec *xGlobal = as_rvec_array(globalState->x.data()); + do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, xGlobal); + } + if (vsite) + { + /* Correct initial vsite positions are required + * for the initial distribution in the domain decomposition + * and for the initial shell prediction. + */ + constructVsitesGlobal(*mtop, globalState->x); + } + } + + if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) + { + ewaldcoeff_q = fr->ic->ewaldcoeff_q; + ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; + } + } + else + { + /* This is a PME only node */ + + GMX_ASSERT(globalState == nullptr, "We don't need the state on a PME only rank and expect it to be unitialized"); + + ewaldcoeff_q = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol); + ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj); + } + + gmx_pme_t *sepPmeData = nullptr; + // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks + GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr), "Double-checking that only PME-only ranks have no forcerec"); + gmx_pme_t * &pmedata = fr ? fr->pmedata : sepPmeData; + + /* Initiate PME if necessary, + * either on all nodes or on dedicated PME nodes only. */ + if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)) + { + if (mdAtoms && mdAtoms->mdatoms()) + { + nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed; + if (EVDW_PME(inputrec->vdwtype)) + { + nTypePerturbed = mdAtoms->mdatoms()->nTypePerturbed; + } + } + if (cr->npmenodes > 0) + { + /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/ + gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr); + gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr); + } + + if (thisRankHasDuty(cr, DUTY_PME)) + { + try + { + pmedata = gmx_pme_init(cr, npme_major, npme_minor, inputrec, + mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed, + mdrunOptions.reproducible, + ewaldcoeff_q, ewaldcoeff_lj, + nthreads_pme, + pmeRunMode, nullptr, pmeDeviceInfo, mdlog); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + } + + + if (EI_DYNAMICS(inputrec->eI)) + { + /* Turn on signal handling on all nodes */ + /* + * (A user signal from the PME nodes (if any) + * is communicated to the PP nodes. + */ + signal_handler_install(); + } + + if (thisRankHasDuty(cr, DUTY_PP)) + { + /* Assumes uniform use of the number of OpenMP threads */ + walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault)); + + if (inputrec->bPull) + { + /* Initialize pull code */ + inputrec->pull_work = + init_pull(fplog, inputrec->pull, inputrec, nfile, fnm, + mtop, cr, oenv, inputrec->fepvals->init_lambda, + EI_DYNAMICS(inputrec->eI) && MASTER(cr), + continuationOptions); + } + + if (inputrec->bRot) + { + /* Initialize enforced rotation code */ + init_rot(fplog, inputrec, nfile, fnm, cr, globalState.get(), mtop, oenv, mdrunOptions); + } + + /* Let init_constraints know whether we have essential dynamics constraints. 
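+         * (Below, doEdsam is derived from the -ei file option or from an
+         * edsamHistory entry restored from the checkpoint.)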
+ * TODO: inputrec should tell us whether we use an algorithm, not a file option or the checkpoint + */ + bool doEdsam = (opt2fn_null("-ei", nfile, fnm) != nullptr || observablesHistory.edsamHistory); + + constr = init_constraints(fplog, mtop, inputrec, doEdsam, cr); + + if (DOMAINDECOMP(cr)) + { + GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP"); + /* This call is not included in init_domain_decomposition mainly + * because fr->cginfo_mb is set later. + */ + dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec, + domdecOptions.checkBondedInteractions, + fr->cginfo_mb); + } + + /* Now do whatever the user wants us to do (how flexible...) */ + my_integrator(inputrec->eI) (fplog, cr, mdlog, nfile, fnm, + oenv, + mdrunOptions, + vsite, constr, + mdModules.outputProvider(), + inputrec, mtop, + fcd, + globalState.get(), + &observablesHistory, + mdAtoms.get(), nrnb, wcycle, fr, + replExParams, + membed, + walltime_accounting); + + if (inputrec->bRot) + { + finish_rot(inputrec->rot); + } + + if (inputrec->bPull) + { + finish_pull(inputrec->pull_work); + } + + } + else + { + GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP"); + /* do PME only */ + walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME)); + gmx_pmeonly(pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode); + } + + wallcycle_stop(wcycle, ewcRUN); + + /* Finish up, write some stuff + * if rerunMD, don't write last frame again + */ + finish_run(fplog, mdlog, cr, + inputrec, nrnb, wcycle, walltime_accounting, + fr ? fr->nbv : nullptr, + pmedata, + EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr)); + + // Free PME data + if (pmedata) + { + gmx_pme_destroy(pmedata); + pmedata = nullptr; + } + + // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x, + // before we destroy the GPU context(s) in free_gpu_resources(). + // Pinned buffers are associated with contexts in CUDA. + // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go. + mdAtoms.reset(nullptr); + globalState.reset(nullptr); + + /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */ + free_gpu_resources(fr, cr); + free_gpu(nonbondedDeviceInfo); + free_gpu(pmeDeviceInfo); + + if (doMembed) + { + free_membed(membed); + } + + gmx_hardware_info_free(); + + /* Does what it says */ + print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime()); + walltime_accounting_destroy(walltime_accounting); + + /* PLUMED */ + if(plumedswitch){ + plumed_finalize(plumedmain); + } + /* END PLUMED */ + + /* Close logfile already here if we were appending to it */ + if (MASTER(cr) && continuationOptions.appendFiles) + { + gmx_log_close(fplog); + } + + rc = (int)gmx_get_stop_condition(); + +#if GMX_THREAD_MPI + /* we need to join all threads. The sub-threads join when they + exit this function, but the master thread needs to be told to + wait for that. */ + if (PAR(cr) && MASTER(cr)) + { + tMPI_Finalize(); + } +#endif + + return rc; +} + +} // namespace gmx diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.cpp.preplumed b/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.cpp.preplumed new file mode 100644 index 0000000000000000000000000000000000000000..ea058f65c702da0d5178639c4f4fa562ca6b24d1 --- /dev/null +++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.cpp.preplumed @@ -0,0 +1,1426 @@ +/* + * This file is part of the GROMACS molecular simulation package. 
+ * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements the MD runner routine calling all integrators. 
+ * + * \author David van der Spoel <david.vanderspoel@icm.uu.se> + * \ingroup module_mdlib + */ +#include "gmxpre.h" + +#include "runner.h" + +#include "config.h" + +#include <cassert> +#include <csignal> +#include <cstdlib> +#include <cstring> + +#include <algorithm> + +#include "gromacs/commandline/filenm.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/ewald-utils.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/fileio/checkpoint.h" +#include "gromacs/fileio/oenv.h" +#include "gromacs/fileio/tpxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/cpuinfo.h" +#include "gromacs/hardware/detecthardware.h" +#include "gromacs/hardware/printhardware.h" +#include "gromacs/listed-forces/disre.h" +#include "gromacs/listed-forces/orires.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/utilities.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/calc_verletbuf.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/integrator.h" +#include "gromacs/mdlib/main.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/minimize.h" +#include "gromacs/mdlib/nb_verlet.h" +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" +#include "gromacs/mdlib/nbnxn_search.h" +#include "gromacs/mdlib/nbnxn_tuning.h" +#include "gromacs/mdlib/qmmm.h" +#include "gromacs/mdlib/sighandler.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/tpi.h" +#include "gromacs/mdrunutility/mdmodules.h" +#include "gromacs/mdrunutility/threadaffinity.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/observableshistory.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/pulling/pull.h" +#include "gromacs/pulling/pull_rotation.h" +#include "gromacs/taskassignment/decidegpuusage.h" +#include "gromacs/taskassignment/resourcedivision.h" +#include "gromacs/taskassignment/taskassignment.h" +#include "gromacs/taskassignment/usergpuids.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/trajectory/trajectoryframe.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/filestream.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/gmxmpi.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/loggerbuilder.h" +#include "gromacs/utility/pleasecite.h" +#include "gromacs/utility/programcontext.h" +#include "gromacs/utility/smalloc.h" +#include "gromacs/utility/stringutil.h" + +#include "deform.h" +#include "md.h" +#include "membed.h" +#include "repl_ex.h" + +#ifdef GMX_FAHCORE +#include "corewrap.h" +#endif + +//! First step used in pressure scaling +gmx_int64_t deform_init_init_step_tpx; +//! Initial box for pressure scaling +matrix deform_init_box_tpx; +//! MPI variable for use in pressure scaling +tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER; + +namespace gmx +{ + +void Mdrunner::reinitializeOnSpawnedThread() +{ + // TODO This duplication is formally necessary if any thread might + // modify any memory in fnm or the pointers it contains. 
If the + // contents are ever provably const, then we can remove this + // allocation (and memory leak). + // TODO This should probably become part of a copy constructor for + // Mdrunner. + fnm = dup_tfn(nfile, fnm); + + cr = reinitialize_commrec_for_this_thread(cr); + + if (!MASTER(cr)) + { + // Only the master rank writes to the log files + fplog = nullptr; + } +} + +/*! \brief The callback used for running on spawned threads. + * + * Obtains the pointer to the master mdrunner object from the one + * argument permitted to the thread-launch API call, copies it to make + * a new runner for this thread, reinitializes necessary data, and + * proceeds to the simulation. */ +static void mdrunner_start_fn(void *arg) +{ + try + { + auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner *>(arg); + /* copy the arg list to make sure that it's thread-local. This + doesn't copy pointed-to items, of course, but those are all + const. */ + gmx::Mdrunner mdrunner = *masterMdrunner; + mdrunner.reinitializeOnSpawnedThread(); + mdrunner.mdrunner(); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; +} + + +/*! \brief Start thread-MPI threads. + * + * Called by mdrunner() to start a specific number of threads + * (including the main thread) for thread-parallel runs. This in turn + * calls mdrunner() for each thread. All options are the same as for + * mdrunner(). */ +t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) +{ + + /* first check whether we even need to start tMPI */ + if (numThreadsToLaunch < 2) + { + return cr; + } + + gmx::Mdrunner spawnedMdrunner = *this; + // TODO This duplication is formally necessary if any thread might + // modify any memory in fnm or the pointers it contains. If the + // contents are ever provably const, then we can remove this + // allocation (and memory leak). + // TODO This should probably become part of a copy constructor for + // Mdrunner. + spawnedMdrunner.fnm = dup_tfn(this->nfile, fnm); + +#if GMX_THREAD_MPI + /* now spawn new threads that start mdrunner_start_fn(), while + the main thread returns, we set thread affinity later */ + if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, + mdrunner_start_fn, static_cast<void*>(&spawnedMdrunner)) != TMPI_SUCCESS) + { + GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads")); + } +#else + GMX_UNUSED_VALUE(mdrunner_start_fn); +#endif + + return reinitialize_commrec_for_this_thread(cr); +} + +} // namespace + +/*! \brief Initialize variables for Verlet scheme simulation */ +static void prepare_verlet_scheme(FILE *fplog, + t_commrec *cr, + t_inputrec *ir, + int nstlist_cmdline, + const gmx_mtop_t *mtop, + const matrix box, + bool makeGpuPairList, + const gmx::CpuInfo &cpuinfo) +{ + /* For NVE simulations, we will retain the initial list buffer */ + if (EI_DYNAMICS(ir->eI) && + ir->verletbuf_tol > 0 && + !(EI_MD(ir->eI) && ir->etc == etcNO)) + { + /* Update the Verlet buffer size for the current run setup */ + + /* Here we assume SIMD-enabled kernels are being used. But as currently + * calc_verlet_buffer_size gives the same results for 4x8 and 4x4 + * and 4x2 gives a larger buffer than 4x4, this is ok. + */ + ListSetupType listType = (makeGpuPairList ? 
ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported); + VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType); + + real rlist_new; + calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &listSetup, nullptr, &rlist_new); + + if (rlist_new != ir->rlist) + { + if (fplog != nullptr) + { + fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n", + ir->rlist, rlist_new, + listSetup.cluster_size_i, listSetup.cluster_size_j); + } + ir->rlist = rlist_new; + } + } + + if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0)) + { + gmx_fatal(FARGS, "Can not set nstlist without %s", + !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance"); + } + + if (EI_DYNAMICS(ir->eI)) + { + /* Set or try nstlist values */ + increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo); + } +} + +/*! \brief Override the nslist value in inputrec + * + * with value passed on the command line (if any) + */ +static void override_nsteps_cmdline(const gmx::MDLogger &mdlog, + gmx_int64_t nsteps_cmdline, + t_inputrec *ir) +{ + assert(ir); + + /* override with anything else than the default -2 */ + if (nsteps_cmdline > -2) + { + char sbuf_steps[STEPSTRSIZE]; + char sbuf_msg[STRLEN]; + + ir->nsteps = nsteps_cmdline; + if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1) + { + sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps", + gmx_step_str(nsteps_cmdline, sbuf_steps), + fabs(nsteps_cmdline*ir->delta_t)); + } + else + { + sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps", + gmx_step_str(nsteps_cmdline, sbuf_steps)); + } + + GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg); + } + else if (nsteps_cmdline < -2) + { + gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %d", + nsteps_cmdline); + } + /* Do nothing if nsteps_cmdline == -2 */ +} + +namespace gmx +{ + +/*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings. + * + * If not, and if a warning may be issued, logs a warning about + * falling back to CPU code. With thread-MPI, only the first + * call to this function should have \c issueWarning true. */ +static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger &mdlog, + const t_inputrec *ir, + bool issueWarning) +{ + if (ir->opts.ngener - ir->nwall > 1) + { + /* The GPU code does not support more than one energy group. + * If the user requested GPUs explicitly, a fatal error is given later. + */ + if (issueWarning) + { + GMX_LOG(mdlog.warning).asParagraph() + .appendText("Multiple energy groups is not implemented for GPUs, falling back to the CPU. " + "For better performance, run on the GPU without energy groups and then do " + "gmx mdrun -rerun option on the trajectory with an energy group .tpr file."); + } + return false; + } + return true; +} + +//! \brief Return the correct integrator function. 
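+//!
+//! Callers dispatch through the returned pointer; abridged from the call in
+//! Mdrunner::mdrunner() further down:
+//!
+//!     my_integrator(inputrec->eI) (fplog, cr, mdlog, nfile, fnm, oenv, ...);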
+static integrator_t *my_integrator(unsigned int ei) +{ + switch (ei) + { + case eiMD: + case eiBD: + case eiSD1: + case eiVV: + case eiVVAK: + if (!EI_DYNAMICS(ei)) + { + GMX_THROW(APIError("do_md integrator would be called for a non-dynamical integrator")); + } + return do_md; + case eiSteep: + return do_steep; + case eiCG: + return do_cg; + case eiNM: + return do_nm; + case eiLBFGS: + return do_lbfgs; + case eiTPI: + case eiTPIC: + if (!EI_TPI(ei)) + { + GMX_THROW(APIError("do_tpi integrator would be called for a non-TPI integrator")); + } + return do_tpi; + case eiSD2_REMOVED: + GMX_THROW(NotImplementedError("SD2 integrator has been removed")); + default: + GMX_THROW(APIError("Non existing integrator selected")); + } +} + +//! Initializes the logger for mdrun. +static gmx::LoggerOwner buildLogger(FILE *fplog, const t_commrec *cr) +{ + gmx::LoggerBuilder builder; + if (fplog != nullptr) + { + builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog); + } + if (cr == nullptr || SIMMASTER(cr)) + { + builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, + &gmx::TextOutputFile::standardError()); + } + return builder.build(); +} + +//! Make a TaskTarget from an mdrun argument string. +static TaskTarget findTaskTarget(const char *optionString) +{ + TaskTarget returnValue = TaskTarget::Auto; + + if (strncmp(optionString, "auto", 3) == 0) + { + returnValue = TaskTarget::Auto; + } + else if (strncmp(optionString, "cpu", 3) == 0) + { + returnValue = TaskTarget::Cpu; + } + else if (strncmp(optionString, "gpu", 3) == 0) + { + returnValue = TaskTarget::Gpu; + } + else + { + GMX_ASSERT(false, "Option string should have been checked for sanity already"); + } + + return returnValue; +} + +int Mdrunner::mdrunner() +{ + matrix box; + gmx_ddbox_t ddbox = {0}; + int npme_major, npme_minor; + t_nrnb *nrnb; + gmx_mtop_t *mtop = nullptr; + t_forcerec *fr = nullptr; + t_fcdata *fcd = nullptr; + real ewaldcoeff_q = 0; + real ewaldcoeff_lj = 0; + gmx_vsite_t *vsite = nullptr; + gmx_constr_t constr; + int nChargePerturbed = -1, nTypePerturbed = 0; + gmx_wallcycle_t wcycle; + gmx_walltime_accounting_t walltime_accounting = nullptr; + int rc; + gmx_int64_t reset_counters; + int nthreads_pme = 1; + gmx_membed_t * membed = nullptr; + gmx_hw_info_t *hwinfo = nullptr; + + /* CAUTION: threads may be started later on in this function, so + cr doesn't reflect the final parallel state right now */ + gmx::MDModules mdModules; + t_inputrec inputrecInstance; + t_inputrec *inputrec = &inputrecInstance; + snew(mtop, 1); + + if (mdrunOptions.continuationOptions.appendFiles) + { + fplog = nullptr; + } + + bool doMembed = opt2bSet("-membed", nfile, fnm); + bool doRerun = mdrunOptions.rerun; + + // Handle task-assignment related user options. + EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ? + EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No); + std::vector<int> gpuIdsAvailable; + try + { + gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable); + // TODO We could put the GPU IDs into a std::map to find + // duplicates, but for the small numbers of IDs involved, this + // code is simple and fast. 
+ for (size_t i = 0; i != gpuIdsAvailable.size(); ++i) + { + for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j) + { + if (gpuIdsAvailable[i] == gpuIdsAvailable[j]) + { + GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str()))); + } + } + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + std::vector<int> userGpuTaskAssignment; + try + { + userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + auto nonbondedTarget = findTaskTarget(nbpu_opt); + auto pmeTarget = findTaskTarget(pme_opt); + auto pmeFftTarget = findTaskTarget(pme_fft_opt); + PmeRunMode pmeRunMode = PmeRunMode::None; + + // Here we assume that SIMMASTER(cr) does not change even after the + // threads are started. + gmx::LoggerOwner logOwner(buildLogger(fplog, cr)); + gmx::MDLogger mdlog(logOwner.logger()); + + hwinfo = gmx_detect_hardware(mdlog, cr); + + gmx_print_detected_hardware(fplog, cr, mdlog, hwinfo); + + std::vector<int> gpuIdsToUse; + auto compatibleGpus = getCompatibleGpus(hwinfo->gpu_info); + if (gpuIdsAvailable.empty()) + { + gpuIdsToUse = compatibleGpus; + } + else + { + for (const auto &availableGpuId : gpuIdsAvailable) + { + bool availableGpuIsCompatible = false; + for (const auto &compatibleGpuId : compatibleGpus) + { + if (availableGpuId == compatibleGpuId) + { + availableGpuIsCompatible = true; + break; + } + } + if (!availableGpuIsCompatible) + { + gmx_fatal(FARGS, "You limited the set of compatible GPUs to a set that included ID #%d, but that ID is not for a compatible GPU. List only compatible GPUs.", availableGpuId); + } + gpuIdsToUse.push_back(availableGpuId); + } + } + + if (fplog != nullptr) + { + /* Print references after all software/hardware printing */ + please_cite(fplog, "Abraham2015"); + please_cite(fplog, "Pall2015"); + please_cite(fplog, "Pronk2013"); + please_cite(fplog, "Hess2008b"); + please_cite(fplog, "Spoel2005a"); + please_cite(fplog, "Lindahl2001a"); + please_cite(fplog, "Berendsen95a"); + } + + std::unique_ptr<t_state> globalState; + + if (SIMMASTER(cr)) + { + /* Only the master rank has the global state */ + globalState = std::unique_ptr<t_state>(new t_state); + + /* Read (nearly) all data required for the simulation */ + read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, globalState.get(), mtop); + + if (inputrec->cutoff_scheme != ecutsVERLET) + { + if (nstlist_cmdline > 0) + { + gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme"); + } + + if (!compatibleGpus.empty()) + { + GMX_LOG(mdlog.warning).asParagraph().appendText( + "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" + " To use a GPU, set the mdp option: cutoff-scheme = Verlet"); + } + } + } + + /* Check and update the hardware options for internal consistency */ + check_and_update_hw_opt_1(&hw_opt, cr, domdecOptions.numPmeRanks); + + /* Early check for externally set process affinity. */ + gmx_check_thread_affinity_set(mdlog, cr, + &hw_opt, hwinfo->nthreads_hw_avail, FALSE); + + if (GMX_THREAD_MPI && SIMMASTER(cr)) + { + if (domdecOptions.numPmeRanks > 0 && hw_opt.nthreads_tmpi <= 0) + { + gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks"); + } + + /* Since the master knows the cut-off scheme, update hw_opt for this. + * This is done later for normal MPI and also once more with tMPI + * for all tMPI ranks. 
+ */ + check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme); + + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + try + { + // If the user specified the number of ranks, then we must + // respect that, but in default mode, we need to allow for + // the number of GPUs to choose the number of ranks. + + useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi + (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded, + inputrec->cutoff_scheme == ecutsVERLET, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, GMX_THREAD_MPI), + hw_opt.nthreads_tmpi); + auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype); + auto canUseGpuForPme = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr); + useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi + (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, + canUseGpuForPme, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks); + + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + /* Determine how many thread-MPI ranks to start. + * + * TODO Over-writing the user-supplied value here does + * prevent any possible subsequent checks from working + * correctly. */ + hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo, + &hw_opt, + gpuIdsToUse, + useGpuForNonbonded, + useGpuForPme, + inputrec, mtop, + mdlog, + doMembed); + + // Now start the threads for thread MPI. + cr = spawnThreads(hw_opt.nthreads_tmpi); + /* The main thread continues here with a new cr. We don't deallocate + the old cr because other threads may still be reading it. */ + // TODO Both master and spawned threads call dup_tfn and + // reinitialize_commrec_for_this_thread. Find a way to express + // this better. + } + /* END OF CAUTION: cr is now reliable */ + + if (PAR(cr)) + { + /* now broadcast everything to the non-master nodes/threads: */ + init_parallel(cr, inputrec, mtop); + } + + // Now each rank knows the inputrec that SIMMASTER read and used, + // and (if applicable) cr->nnodes has been assigned the number of + // thread-MPI ranks that have been chosen. The ranks can now all + // run the task-deciding functions and will agree on the result + // without needing to communicate. + // + // TODO Should we do the communication in debug mode to support + // having an assertion? + // + // Note that these variables describe only their own node. + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + try + { + // It's possible that there are different numbers of GPUs on + // different nodes, which is the user's responsibilty to + // handle. If unsuitable, we will notice that during task + // assignment. + bool gpusWereDetected = hwinfo->ngpu_compatible_tot > 0; + useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment, + emulateGpuNonbonded, inputrec->cutoff_scheme == ecutsVERLET, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, !GMX_THREAD_MPI), + gpusWereDetected); + auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype); + auto canUseGpuForPme = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr); + useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, + canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks, + gpusWereDetected); + + pmeRunMode = (useGpuForPme ? 
PmeRunMode::GPU : PmeRunMode::CPU); + if (pmeRunMode == PmeRunMode::GPU) + { + if (pmeFftTarget == TaskTarget::Cpu) + { + pmeRunMode = PmeRunMode::Mixed; + } + } + else if (pmeFftTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME on CPU you should not be using -pmefft."); + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + // TODO: Error handling + mdModules.assignOptionsToModules(*inputrec->params, nullptr); + + if (fplog != nullptr) + { + pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE); + fprintf(fplog, "\n"); + } + + if (SIMMASTER(cr)) + { + /* now make sure the state is initialized and propagated */ + set_state_entries(globalState.get(), inputrec); + } + + /* NM and TPI parallelize over force/energy calculations, not atoms, + * so we need to initialize and broadcast the global state. + */ + if (inputrec->eI == eiNM || inputrec->eI == eiTPI) + { + if (!MASTER(cr)) + { + globalState = std::unique_ptr<t_state>(new t_state); + } + broadcastStateWithoutDynamics(cr, globalState.get()); + } + + /* A parallel command line option consistency check that we can + only do after any threads have started. */ + if (!PAR(cr) && (domdecOptions.numCells[XX] > 1 || + domdecOptions.numCells[YY] > 1 || + domdecOptions.numCells[ZZ] > 1 || + domdecOptions.numPmeRanks > 0)) + { + gmx_fatal(FARGS, + "The -dd or -npme option request a parallel simulation, " +#if !GMX_MPI + "but %s was compiled without threads or MPI enabled" +#else +#if GMX_THREAD_MPI + "but the number of MPI-threads (option -ntmpi) is not set or is 1" +#else + "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec" +#endif +#endif + , output_env_get_program_display_name(oenv) + ); + } + + if (doRerun && + (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI)) + { + gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun"); + } + + if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr)) + { + gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank"); + } + + if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))) + { + if (domdecOptions.numPmeRanks > 0) + { + gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr), + "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ"); + } + + domdecOptions.numPmeRanks = 0; + } + + if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0) + { + /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can + * improve performance with many threads per GPU, since our OpenMP + * scaling is bad, but it's difficult to automate the setup. + */ + domdecOptions.numPmeRanks = 0; + } + if (useGpuForPme) + { + if (domdecOptions.numPmeRanks < 0) + { + domdecOptions.numPmeRanks = 0; + // TODO possibly print a note that one can opt-in for a separate PME GPU rank? + } + else + { + GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported"); + } + } + +#ifdef GMX_FAHCORE + if (MASTER(cr)) + { + fcRegisterSteps(inputrec->nsteps, inputrec->init_step); + } +#endif + + /* NMR restraints must be initialized before load_checkpoint, + * since with time averaging the history is added to t_state. + * For proper consistency check we therefore need to extend + * t_state here. + * So the PME-only nodes (if present) will also initialize + * the distance restraints. 
+ */ + snew(fcd, 1); + + /* This needs to be called before read_checkpoint to extend the state */ + init_disres(fplog, mtop, inputrec, cr, fcd, globalState.get(), replExParams.exchangeInterval > 0); + + init_orires(fplog, mtop, inputrec, cr, globalState.get(), &(fcd->orires)); + + if (inputrecDeform(inputrec)) + { + /* Store the deform reference box before reading the checkpoint */ + if (SIMMASTER(cr)) + { + copy_mat(globalState->box, box); + } + if (PAR(cr)) + { + gmx_bcast(sizeof(box), box, cr); + } + /* Because we do not have the update struct available yet + * in which the reference values should be stored, + * we store them temporarily in static variables. + * This should be thread safe, since they are only written once + * and with identical values. + */ + tMPI_Thread_mutex_lock(&deform_init_box_mutex); + deform_init_init_step_tpx = inputrec->init_step; + copy_mat(box, deform_init_box_tpx); + tMPI_Thread_mutex_unlock(&deform_init_box_mutex); + } + + ObservablesHistory observablesHistory = {}; + + ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + + if (continuationOptions.startedFromCheckpoint) + { + /* Check if checkpoint file exists before doing continuation. + * This way we can use identical input options for the first and subsequent runs... + */ + gmx_bool bReadEkin; + + load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog, + cr, domdecOptions.numCells, + inputrec, globalState.get(), + &bReadEkin, &observablesHistory, + continuationOptions.appendFiles, + continuationOptions.appendFilesOptionSet, + mdrunOptions.reproducible); + + if (bReadEkin) + { + continuationOptions.haveReadEkin = true; + } + } + + if (SIMMASTER(cr) && continuationOptions.appendFiles) + { + gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, + continuationOptions.appendFiles, &fplog); + logOwner = buildLogger(fplog, nullptr); + mdlog = logOwner.logger(); + } + + if (mdrunOptions.numStepsCommandline > -2) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -nsteps functionality is deprecated, and may be removed in a future version. " + "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field."); + } + /* override nsteps with value set on the commamdline */ + override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec); + + if (SIMMASTER(cr)) + { + copy_mat(globalState->box, box); + } + + if (PAR(cr)) + { + gmx_bcast(sizeof(box), box, cr); + } + + /* Update rlist and nstlist. */ + if (inputrec->cutoff_scheme == ecutsVERLET) + { + prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, mtop, box, + useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo); + } + + if (PAR(cr) && !(EI_TPI(inputrec->eI) || + inputrec->eI == eiNM)) + { + const rvec *xOnMaster = (SIMMASTER(cr) ? as_rvec_array(globalState->x.data()) : nullptr); + + cr->dd = init_domain_decomposition(fplog, cr, domdecOptions, mdrunOptions, + mtop, inputrec, + box, xOnMaster, + &ddbox, &npme_major, &npme_minor); + // Note that local state still does not exist yet. + } + else + { + /* PME, if used, is done on all nodes with 1D decomposition */ + cr->npmenodes = 0; + cr->duty = (DUTY_PP | DUTY_PME); + npme_major = 1; + npme_minor = 1; + + if (inputrec->ePBC == epbcSCREW) + { + gmx_fatal(FARGS, + "pbc=%s is only implemented with domain decomposition", + epbc_names[inputrec->ePBC]); + } + } + + if (PAR(cr)) + { + /* After possible communicator splitting in make_dd_communicators. + * we can set up the intra/inter node communication. 
+    ObservablesHistory observablesHistory = {};
+
+    ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions;
+
+    if (continuationOptions.startedFromCheckpoint)
+    {
+        /* Check if checkpoint file exists before doing continuation.
+         * This way we can use identical input options for the first and subsequent runs...
+         */
+        gmx_bool bReadEkin;
+
+        load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+                        cr, domdecOptions.numCells,
+                        inputrec, globalState.get(),
+                        &bReadEkin, &observablesHistory,
+                        continuationOptions.appendFiles,
+                        continuationOptions.appendFilesOptionSet,
+                        mdrunOptions.reproducible);
+
+        if (bReadEkin)
+        {
+            continuationOptions.haveReadEkin = true;
+        }
+    }
+
+    if (SIMMASTER(cr) && continuationOptions.appendFiles)
+    {
+        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr,
+                     continuationOptions.appendFiles, &fplog);
+        logOwner = buildLogger(fplog, nullptr);
+        mdlog    = logOwner.logger();
+    }
+
+    if (mdrunOptions.numStepsCommandline > -2)
+    {
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -nsteps functionality is deprecated, and may be removed in a future version. "
+                       "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field.");
+    }
+    /* override nsteps with value set on the command line */
+    override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec);
+
+    if (SIMMASTER(cr))
+    {
+        copy_mat(globalState->box, box);
+    }
+
+    if (PAR(cr))
+    {
+        gmx_bcast(sizeof(box), box, cr);
+    }
+
+    /* Update rlist and nstlist. */
+    if (inputrec->cutoff_scheme == ecutsVERLET)
+    {
+        prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, mtop, box,
+                              useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo);
+    }
+
+    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
+                     inputrec->eI == eiNM))
+    {
+        const rvec *xOnMaster = (SIMMASTER(cr) ? as_rvec_array(globalState->x.data()) : nullptr);
+
+        cr->dd = init_domain_decomposition(fplog, cr, domdecOptions, mdrunOptions,
+                                           mtop, inputrec,
+                                           box, xOnMaster,
+                                           &ddbox, &npme_major, &npme_minor);
+        // Note that local state still does not exist yet.
+    }
+    else
+    {
+        /* PME, if used, is done on all nodes with 1D decomposition */
+        cr->npmenodes = 0;
+        cr->duty      = (DUTY_PP | DUTY_PME);
+        npme_major    = 1;
+        npme_minor    = 1;
+
+        if (inputrec->ePBC == epbcSCREW)
+        {
+            gmx_fatal(FARGS,
+                      "pbc=%s is only implemented with domain decomposition",
+                      epbc_names[inputrec->ePBC]);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        /* After possible communicator splitting in make_dd_communicators,
+         * we can set up the intra/inter node communication.
+         */
+        gmx_setup_nodecomm(fplog, cr);
+    }
+
+    /* Initialize per-physical-node MPI process/thread ID and counters. */
+    gmx_init_intranode_counters(cr);
+    if (cr->ms && cr->ms->nsim > 1 && !opt2bSet("-multidir", nfile, fnm))
+    {
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -multi flag is deprecated, and may be removed in a future version. Please "
+                       "update your workflows to use -multidir instead.");
+    }
+#if GMX_MPI
+    if (MULTISIM(cr))
+    {
+        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                "This is simulation %d out of %d running as a composite GROMACS\n"
+                "multi-simulation job. Setup for this simulation:\n",
+                cr->ms->sim, cr->ms->nsim);
+    }
+    GMX_LOG(mdlog.warning).appendTextFormatted(
+            "Using %d MPI %s\n",
+            cr->nnodes,
+#if GMX_THREAD_MPI
+            cr->nnodes == 1 ? "thread" : "threads"
+#else
+            cr->nnodes == 1 ? "process" : "processes"
+#endif
+            );
+    fflush(stderr);
+#endif
+
+    /* Check and update hw_opt for the cut-off scheme */
+    check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme);
+
+    /* Check and update the number of OpenMP threads requested */
+    checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, pmeRunMode, *mtop);
+
+    gmx_omp_nthreads_init(mdlog, cr,
+                          hwinfo->nthreads_hw_avail,
+                          hw_opt.nthreads_omp,
+                          hw_opt.nthreads_omp_pme,
+                          !thisRankHasDuty(cr, DUTY_PP),
+                          inputrec->cutoff_scheme == ecutsVERLET);
+
+#ifndef NDEBUG
+    if (EI_TPI(inputrec->eI) &&
+        inputrec->cutoff_scheme == ecutsVERLET)
+    {
+        gmx_feenableexcept();
+    }
+#endif
+
+    // Build a data structure that expresses which kinds of non-bonded
+    // task are handled by this rank.
+    //
+    // TODO Later, this might become a loop over all registered modules
+    // relevant to the mdp inputs, to find those that have such tasks.
+    //
+    // TODO This could move before init_domain_decomposition() as part
+    // of refactoring that separates the responsibility for duty
+    // assignment from setup for communication between tasks, and
+    // setup for tasks handled with a domain (ie including short-ranged
+    // tasks, bonded tasks, etc.).
+    //
+    // Note that in general useGpuForNonbonded, etc. can have a value
+    // that is inconsistent with the presence of actual GPUs on any
+    // rank, and that is not known to be a problem until the
+    // duty of the ranks on a node becomes known.
+    //
+    // TODO Later we might need the concept of computeTasksOnThisRank,
+    // from which we construct gpuTasksOnThisRank.
+    //
+    // Currently the DD code assigns duty to ranks that can
+    // include PP work that currently can be executed on a single
+    // GPU, if present and compatible. This has to be coordinated
+    // across PP ranks on a node, with possible multiple devices
+    // or sharing devices on a node, either from the user
+    // selection, or automatically.
+    auto                 haveGpus = !gpuIdsToUse.empty();
+    std::vector<GpuTask> gpuTasksOnThisRank;
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        if (useGpuForNonbonded)
+        {
+            if (haveGpus)
+            {
+                gpuTasksOnThisRank.push_back(GpuTask::Nonbonded);
+            }
+            else if (nonbondedTarget == TaskTarget::Gpu)
+            {
+                gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because there is none detected.");
+            }
+        }
+    }
+    // TODO cr->duty & DUTY_PME should imply that a PME algorithm is active, but currently does not.
+    if (EEL_PME(inputrec->coulombtype) && (thisRankHasDuty(cr, DUTY_PME)))
+    {
+        if (useGpuForPme)
+        {
+            if (haveGpus)
+            {
+                gpuTasksOnThisRank.push_back(GpuTask::Pme);
+            }
+            else if (pmeTarget == TaskTarget::Gpu)
+            {
+                gmx_fatal(FARGS, "Cannot run PME on a GPU because there is none detected.");
+            }
+        }
+    }
+
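+    /* [Editor's note: illustrative, not part of the upstream patch. After
+     *  the two blocks above, a rank that carries both PP and PME duty and
+     *  was started with -nb gpu -pme gpu holds the task list
+     *
+     *      std::vector<GpuTask> gpuTasksOnThisRank = { GpuTask::Nonbonded,
+     *                                                  GpuTask::Pme };
+     *
+     *  while a PME-only rank would carry just { GpuTask::Pme }. This list is
+     *  what runTaskAssignment() below maps onto concrete device ids.
+     */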
+    GpuTaskAssignment gpuTaskAssignment;
+    try
+    {
+        // Produce the task assignment for this rank.
+        gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
+                                              mdlog, cr, gpuTasksOnThisRank);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+    /* Prevent other ranks from continuing after an issue was found
+     * and reported as a fatal error.
+     *
+     * TODO This function implements a barrier so that MPI runtimes
+     * can organize an orderly shutdown if one of the ranks has had to
+     * issue a fatal error in various code already run. When we have
+     * MPI-aware error handling and reporting, this should be
+     * improved. */
+#if GMX_MPI
+    if (PAR(cr))
+    {
+        MPI_Barrier(cr->mpi_comm_mysim);
+    }
+    if (MULTISIM(cr))
+    {
+        if (SIMMASTER(cr))
+        {
+            MPI_Barrier(cr->ms->mpi_comm_masters);
+        }
+        /* We need another barrier to prevent non-master ranks from continuing
+         * when an error occurred in a different simulation.
+         */
+        MPI_Barrier(cr->mpi_comm_mysim);
+    }
+#endif
+
+    /* Now that we know the setup is consistent, check for efficiency */
+    check_resource_division_efficiency(hwinfo, !gpuTaskAssignment.empty(), mdrunOptions.ntompOptionIsSet,
+                                       cr, mdlog);
+
+    gmx_device_info_t *nonbondedDeviceInfo = nullptr;
+
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        // This works because only one task of each type is currently permitted.
+        auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
+                                             hasTaskType<GpuTask::Nonbonded>);
+        if (nbGpuTaskMapping != gpuTaskAssignment.end())
+        {
+            int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
+            nonbondedDeviceInfo   = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
+            init_gpu(mdlog, nonbondedDeviceInfo);
+
+            if (DOMAINDECOMP(cr))
+            {
+                /* When we share GPUs over ranks, we need to know this for the DLB */
+                dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+            }
+        }
+    }
+
+    gmx_device_info_t *pmeDeviceInfo = nullptr;
+    // This works because only one task of each type is currently permitted.
+    auto pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
+    if (pmeGpuTaskMapping != gpuTaskAssignment.end())
+    {
+        pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
+        init_gpu(mdlog, pmeDeviceInfo);
+    }
+
+    /* getting number of PP/PME threads
+       PME: env variable should be read only on one node to make sure it is
+       identical everywhere;
+     */
+    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+    int numThreadsOnThisRank;
+    /* threads on this MPI process or TMPI thread */
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        numThreadsOnThisRank = gmx_omp_nthreads_get(emntNonbonded);
+    }
+    else
+    {
+        numThreadsOnThisRank = nthreads_pme;
+    }
+
+    checkHardwareOversubscription(numThreadsOnThisRank,
+                                  *hwinfo->hardwareTopology,
+                                  cr, mdlog);
+
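+    /* [Editor's note: illustrative, not part of the upstream patch. The two
+     *  std::find_if() searches above rely on there being at most one mapping
+     *  per task type, so "find the first match" is the same as "find the
+     *  mapping". Written with a lambda instead of the hasTaskType<> helper,
+     *  and assuming the mapping type exposes a task_ field next to the
+     *  deviceId_ field used above (task_ is an assumption here), the idiom is:
+     *
+     *      auto it = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
+     *                             [](const GpuTaskMapping &m)
+     *                             { return m.task_ == GpuTask::Pme; });
+     *      if (it != gpuTaskAssignment.end())
+     *      {
+     *          // it->deviceId_ is the device to initialize
+     *      }
+     */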
+    if (hw_opt.thread_affinity != threadaffOFF)
+    {
+        /* Before setting affinity, check whether the affinity has changed,
+         * which indicates that probably the OpenMP library has changed it
+         * since we first checked.
+         */
+        gmx_check_thread_affinity_set(mdlog, cr,
+                                      &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+        /* Set the CPU affinity */
+        gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology,
+                                numThreadsOnThisRank, nullptr);
+    }
+
+    if (mdrunOptions.timingOptions.resetStep > -1)
+    {
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -resetstep functionality is deprecated, and may be removed in a future version.");
+    }
+    wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
+
+    if (PAR(cr))
+    {
+        /* Master synchronizes its value of reset_counters with all nodes
+         * including PME only nodes */
+        reset_counters = wcycle_get_reset_counters(wcycle);
+        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+        wcycle_set_reset_counters(wcycle, reset_counters);
+    }
+
+    // Membrane embedding must be initialized before we call init_forcerec()
+    if (doMembed)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Initializing membed");
+        }
+        /* Note that membed cannot work in parallel because mtop is
+         * changed here. Fix this if we ever want to make it run with
+         * multiple ranks. */
+        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, globalState.get(), cr, &mdrunOptions.checkpointOptions.period);
+    }
+
+    std::unique_ptr<MDAtoms> mdAtoms;
+
+    snew(nrnb, 1);
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        /* Initiate forcerecord */
+        fr                 = mk_forcerec();
+        fr->forceProviders = mdModules.initForceProviders();
+        init_forcerec(fplog, mdlog, fr, fcd,
+                      inputrec, mtop, cr, box,
+                      opt2fn("-table", nfile, fnm),
+                      opt2fn("-tablep", nfile, fnm),
+                      getFilenm("-tableb", nfile, fnm),
+                      *hwinfo, nonbondedDeviceInfo,
+                      FALSE,
+                      pforce);
+
+        /* Initialize QM-MM */
+        if (fr->bQMMM)
+        {
+            GMX_LOG(mdlog.info).asParagraph().
+                appendText("Large parts of the QM/MM support are deprecated, and may be removed in a future "
+                           "version. Please get in touch with the developers if you find the support useful, "
+                           "as help is needed if the functionality is to continue to be available.");
+            init_QMMMrec(cr, mtop, inputrec, fr);
+        }
+
+        /* Initialize the mdAtoms structure.
+         * mdAtoms is not filled with atom data,
+         * as this can not be done now with domain decomposition.
+         */
+        const bool useGpuForPme = (pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed);
+        mdAtoms = makeMDAtoms(fplog, *mtop, *inputrec, useGpuForPme && thisRankHasDuty(cr, DUTY_PME));
+        if (globalState)
+        {
+            // The pinning of coordinates in the global state object works, because we only use
+            // PME on GPU without DD or on a separate PME rank, and because the local state pointer
+            // points to the global state object without DD.
+            // FIXME: MD and EM separately set up the local state - this should happen in the same function,
+            // which should also perform the pinning.
+            changePinningPolicy(&globalState->x, useGpuForPme ? PinningPolicy::CanBePinned : PinningPolicy::CannotBePinned);
+        }
+
+        /* Initialize the virtual site communication */
+        vsite = initVsite(*mtop, cr);
+
+        calc_shifts(box, fr->shift_vec);
+
+        /* With periodic molecules the charge groups should be whole at start up
+         * and the virtual sites should not be far from their proper positions.
+         */
+        if (!inputrec->bContinuation && MASTER(cr) &&
+            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        {
+            /* Make molecules whole at start of run */
+            if (fr->ePBC != epbcNONE)
+            {
+                rvec *xGlobal = as_rvec_array(globalState->x.data());
+                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, xGlobal);
+            }
+            if (vsite)
+            {
+                /* Correct initial vsite positions are required
+                 * for the initial distribution in the domain decomposition
+                 * and for the initial shell prediction.
+                 */
+                constructVsitesGlobal(*mtop, globalState->x);
+            }
+        }
+
+        if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
+        {
+            ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
+            ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
+        }
+    }
+    else
+    {
+        /* This is a PME only node */
+
+        GMX_ASSERT(globalState == nullptr, "We don't need the state on a PME only rank and expect it to be uninitialized");
+
+        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
+        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
+    }
+
+    gmx_pme_t *sepPmeData = nullptr;
+    // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
+    GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr), "Double-checking that only PME-only ranks have no forcerec");
+    gmx_pme_t * &pmedata = fr ? fr->pmedata : sepPmeData;
+
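+    /* [Editor's note: illustrative, not part of the upstream patch. The
+     *  reference just above decides once which object owns the PME data, so
+     *  the rest of the function can use a single name for both cases. The
+     *  idiom in miniature (all names invented for the sketch):
+     *
+     *      int  fallbackStorage = 0;
+     *      int *preferred       = maybeGetStorage();     // may be nullptr
+     *      int &storage         = preferred ? *preferred : fallbackStorage;
+     *      storage = 42;   // writes through to whichever object is live
+     */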
+    /* Initiate PME if necessary,
+     * either on all nodes or on dedicated PME nodes only. */
+    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
+    {
+        if (mdAtoms && mdAtoms->mdatoms())
+        {
+            nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
+            if (EVDW_PME(inputrec->vdwtype))
+            {
+                nTypePerturbed = mdAtoms->mdatoms()->nTypePerturbed;
+            }
+        }
+        if (cr->npmenodes > 0)
+        {
+            /* The PME only nodes need to know nChargePerturbed (FEP on Q) and nTypePerturbed (FEP on LJ) */
+            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+        }
+
+        if (thisRankHasDuty(cr, DUTY_PME))
+        {
+            try
+            {
+                pmedata = gmx_pme_init(cr, npme_major, npme_minor, inputrec,
+                                       mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
+                                       mdrunOptions.reproducible,
+                                       ewaldcoeff_q, ewaldcoeff_lj,
+                                       nthreads_pme,
+                                       pmeRunMode, nullptr, pmeDeviceInfo, mdlog);
+            }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+        }
+    }
+
+
+    if (EI_DYNAMICS(inputrec->eI))
+    {
+        /* Turn on signal handling on all nodes */
+        /*
+         * A user signal from the PME nodes (if any)
+         * is communicated to the PP nodes.
+         */
+        signal_handler_install();
+    }
+
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        /* Assumes uniform use of the number of OpenMP threads */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+        if (inputrec->bPull)
+        {
+            /* Initialize pull code */
+            inputrec->pull_work =
+                init_pull(fplog, inputrec->pull, inputrec, nfile, fnm,
+                          mtop, cr, oenv, inputrec->fepvals->init_lambda,
+                          EI_DYNAMICS(inputrec->eI) && MASTER(cr),
+                          continuationOptions);
+        }
+
+        if (inputrec->bRot)
+        {
+            /* Initialize enforced rotation code */
+            init_rot(fplog, inputrec, nfile, fnm, cr, globalState.get(), mtop, oenv, mdrunOptions);
+        }
+
+        /* Let init_constraints know whether we have essential dynamics constraints.
+         * TODO: inputrec should tell us whether we use an algorithm, not a file option or the checkpoint
+         */
+        bool doEdsam = (opt2fn_null("-ei", nfile, fnm) != nullptr || observablesHistory.edsamHistory);
+
+        constr = init_constraints(fplog, mtop, inputrec, doEdsam, cr);
+
+        if (DOMAINDECOMP(cr))
+        {
+            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
+            /* This call is not included in init_domain_decomposition mainly
+             * because fr->cginfo_mb is set later.
+             */
+            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
+                            domdecOptions.checkBondedInteractions,
+                            fr->cginfo_mb);
+        }
+
+        /* Now do whatever the user wants us to do (how flexible...) */
+        my_integrator(inputrec->eI) (fplog, cr, mdlog, nfile, fnm,
+                                     oenv,
+                                     mdrunOptions,
+                                     vsite, constr,
+                                     mdModules.outputProvider(),
+                                     inputrec, mtop,
+                                     fcd,
+                                     globalState.get(),
+                                     &observablesHistory,
+                                     mdAtoms.get(), nrnb, wcycle, fr,
+                                     replExParams,
+                                     membed,
+                                     walltime_accounting);
+
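+        /* [Editor's note: illustrative sketch, not part of the upstream
+         *  patch. my_integrator() maps the .mdp integrator enum to a
+         *  function with the common integrator_t signature, so the call
+         *  above is plain function-pointer dispatch, roughly (abbreviated;
+         *  the exact set of cases lives in the helper itself):
+         *
+         *      integrator_t *func = nullptr;
+         *      switch (inputrec->eI)
+         *      {
+         *          case eiMD: func = do_md; break;   // dynamics and friends
+         *          case eiNM: func = do_nm; break;   // normal-mode analysis
+         *          default:   func = do_md; break;
+         *      }
+         *      func(fplog, cr, mdlog, nfile, fnm, ...);  // same argument list as above
+         */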
+        if (inputrec->bRot)
+        {
+            finish_rot(inputrec->rot);
+        }
+
+        if (inputrec->bPull)
+        {
+            finish_pull(inputrec->pull_work);
+        }
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
+        /* do PME only */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+        gmx_pmeonly(pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+
+    /* Finish up, write some stuff
+     * if rerunMD, don't write last frame again
+     */
+    finish_run(fplog, mdlog, cr,
+               inputrec, nrnb, wcycle, walltime_accounting,
+               fr ? fr->nbv : nullptr,
+               pmedata,
+               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+
+    // Free PME data
+    if (pmedata)
+    {
+        gmx_pme_destroy(pmedata);
+        pmedata = nullptr;
+    }
+
+    // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
+    // before we destroy the GPU context(s) in free_gpu_resources().
+    // Pinned buffers are associated with contexts in CUDA.
+    // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
+    mdAtoms.reset(nullptr);
+    globalState.reset(nullptr);
+
+    /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
+    free_gpu_resources(fr, cr);
+    free_gpu(nonbondedDeviceInfo);
+    free_gpu(pmeDeviceInfo);
+
+    if (doMembed)
+    {
+        free_membed(membed);
+    }
+
+    gmx_hardware_info_free();
+
+    /* Does what it says */
+    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
+    walltime_accounting_destroy(walltime_accounting);
+
+    /* Close logfile already here if we were appending to it */
+    if (MASTER(cr) && continuationOptions.appendFiles)
+    {
+        gmx_log_close(fplog);
+    }
+
+    rc = (int)gmx_get_stop_condition();
+
+#if GMX_THREAD_MPI
+    /* we need to join all threads. The sub-threads join when they
+       exit this function, but the master thread needs to be told to
+       wait for that. */
+    if (PAR(cr) && MASTER(cr))
+    {
+        tMPI_Finalize();
+    }
+#endif
+
+    return rc;
+}
+
+} // namespace gmx
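[Editor's note on the header below: this is where the patch hooks PLUMED into mdrun's command line. The t_filenm table in runner.h grows from 34 to 35 entries, adding an optional data file registered as { efDAT, "-plumed", "plumed", ffOPTRD } and marked with a /* PLUMED */ comment; compare the pristine runner.h.preplumed copy further down, which lacks that entry. As a minimal sketch of how such an entry is consumed, using only the opt2bSet/opt2fn helpers from gromacs/commandline/filenm.h that already appear in this diff (the actual PLUMED wiring lives in the patched mdrun sources, not here):

    // Illustrative only, not part of this patch.
    if (opt2bSet("-plumed", nfile, fnm))                         // was -plumed given?
    {
        const char *plumedFile = opt2fn("-plumed", nfile, fnm);  // defaults to "plumed.dat"
        // ... hand plumedFile to the PLUMED library ...
    }
]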
diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.h b/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fac7631024391a8e9a5e5a4b77c6e2b8160ea8e
--- /dev/null
+++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.h
@@ -0,0 +1,184 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2015,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief Declares the routine running the integrators.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \ingroup module_mdlib
+ */
+#ifndef GMX_MDLIB_RUNNER_H
+#define GMX_MDLIB_RUNNER_H
+
+#include <cstdio>
+
+#include <array>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/real.h"
+
+#include "repl_ex.h"
+
+struct gmx_output_env_t;
+struct ReplicaExchangeParameters;
+struct t_commrec;
+
+namespace gmx
+{
+
+/*! \libinternal \brief Runner object for supporting setup and execution of mdrun.
+ *
+ * This class has responsibility for the lifetime of data structures
+ * that exist for the life of the simulation, e.g. for logging and
+ * communication.
+ *
+ * \todo Most of the attributes should be declared by specific modules
+ * as command-line options. Accordingly, they do not conform to the
+ * naming scheme, because that would make for a lot of noise in the
+ * diff, only to have it change again when the options move to their
+ * modules.
+ *
+ * \todo Preparing logging and MPI contexts could probably be a
+ * higher-level responsibility, so that an Mdrunner would get made
+ * without needing to re-initialize these components (as currently
+ * happens always for the master rank, and differently for the spawned
+ * ranks with thread-MPI).
+ */
+class Mdrunner
+{
+    private:
+        //! Parallelism-related user options.
+        gmx_hw_opt_t hw_opt;
+        //! Filenames and properties from command-line argument values.
+        std::array<t_filenm, 35> filenames =
+        {{{ efTPR, nullptr, nullptr, ffREAD },
+          { efTRN, "-o", nullptr, ffWRITE },
+          { efCOMPRESSED, "-x", nullptr, ffOPTWR },
+          { efCPT, "-cpi", nullptr, ffOPTRD | ffALLOW_MISSING },
+          { efCPT, "-cpo", nullptr, ffOPTWR },
+          { efSTO, "-c", "confout", ffWRITE },
+          { efEDR, "-e", "ener", ffWRITE },
+          { efLOG, "-g", "md", ffWRITE },
+          { efXVG, "-dhdl", "dhdl", ffOPTWR },
+          { efXVG, "-field", "field", ffOPTWR },
+          { efXVG, "-table", "table", ffOPTRD },
+          { efXVG, "-tablep", "tablep", ffOPTRD },
+          { efXVG, "-tableb", "table", ffOPTRDMULT },
+          { efTRX, "-rerun", "rerun", ffOPTRD },
+          { efXVG, "-tpi", "tpi", ffOPTWR },
+          { efXVG, "-tpid", "tpidist", ffOPTWR },
+          { efEDI, "-ei", "sam", ffOPTRD },
+          { efXVG, "-eo", "edsam", ffOPTWR },
+          { efXVG, "-devout", "deviatie", ffOPTWR },
+          { efXVG, "-runav", "runaver", ffOPTWR },
+          { efXVG, "-px", "pullx", ffOPTWR },
+          { efXVG, "-pf", "pullf", ffOPTWR },
+          { efXVG, "-ro", "rotation", ffOPTWR },
+          { efLOG, "-ra", "rotangles", ffOPTWR },
+          { efLOG, "-rs", "rotslabs", ffOPTWR },
+          { efLOG, "-rt", "rottorque", ffOPTWR },
+          { efMTX, "-mtx", "nm", ffOPTWR },
+          { efRND, "-multidir", nullptr, ffOPTRDMULT },
+          { efDAT, "-plumed", "plumed", ffOPTRD }, /* PLUMED */
+          { efXVG, "-awh", "awhinit", ffOPTRD },
+          { efDAT, "-membed", "membed", ffOPTRD },
+          { efTOP, "-mp", "membed", ffOPTRD },
+          { efNDX, "-mn", "membed", ffOPTRD },
+          { efXVG, "-if", "imdforces", ffOPTWR },
+          { efXVG, "-swap", "swapions", ffOPTWR }}};
+        /*! \brief Filename arguments.
+         *
+         * Provided for compatibility with old C-style code accessing
+         * command-line arguments that are file names. */
+        t_filenm *fnm = filenames.data();
+        /*! \brief Number of filename argument values.
+         *
+         * Provided for compatibility with old C-style code accessing
+         * command-line arguments that are file names. */
+        int nfile = filenames.size();
+        //! Output context for writing text files
+        gmx_output_env_t *oenv = nullptr;
+        //! Ongoing collection of mdrun options
+        MdrunOptions mdrunOptions;
+        //! Options for the domain decomposition.
+        DomdecOptions domdecOptions;
+        //! Target short-range interactions for "cpu", "gpu", or "auto". Default is "auto".
+        const char *nbpu_opt = nullptr;
+        //! Target long-range interactions for "cpu", "gpu", or "auto". Default is "auto".
+        const char *pme_opt = nullptr;
+        //! Target long-range interactions FFT/solve stages for "cpu", "gpu", or "auto". Default is "auto".
+        const char *pme_fft_opt = nullptr;
+        //! Command-line override for the duration of a neighbor list with the Verlet scheme.
+        int nstlist_cmdline = 0;
+        //! Number of simulations in multi-simulation set.
+        int nmultisim = 0;
+        //! Parameters for replica-exchange simulations.
+        ReplicaExchangeParameters replExParams;
+        //! Print a warning if any force is larger than this (in kJ/mol nm).
+        real pforce = -1;
+        //! Handle to file used for logging.
+        FILE *fplog;
+        //! Handle to communication data structure.
+        t_commrec *cr;
+
+    public:
+        /*! \brief Defaulted constructor.
+         *
+         * Note that when member variables are not present in the constructor
+         * member initialization list (which is true for the default constructor),
+         * then they are initialized with any default member initializer specified
+         * when they were declared, or default initialized. */
+        Mdrunner() = default;
+        //! Start running mdrun by calling its C-style main function.
+        int mainFunction(int argc, char *argv[]);
+        /*! \brief Driver routine, that calls the different simulation methods. */
+        int mdrunner();
+        //! Called when thread-MPI spawns threads.
+        t_commrec *spawnThreads(int numThreadsToLaunch);
+        /*! \brief Re-initializes the object after threads spawn.
+         *
+         * \todo Can this be refactored so that the Mdrunner on a spawned thread is
+         * constructed ready to use? */
+        void reinitializeOnSpawnedThread();
+};
+
+} // namespace gmx
+
+#endif // GMX_MDLIB_RUNNER_H
diff --git a/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.h.preplumed b/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.h.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..51c84e447181a2a65a47deb85bf3e70479a8f198
--- /dev/null
+++ b/patches/gromacs-2018.1.diff/src/programs/mdrun/runner.h.preplumed
@@ -0,0 +1,183 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2015,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief Declares the routine running the integrators.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \ingroup module_mdlib
+ */
+#ifndef GMX_MDLIB_RUNNER_H
+#define GMX_MDLIB_RUNNER_H
+
+#include <cstdio>
+
+#include <array>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/real.h"
+
+#include "repl_ex.h"
+
+struct gmx_output_env_t;
+struct ReplicaExchangeParameters;
+struct t_commrec;
+
+namespace gmx
+{
+
+/*! \libinternal \brief Runner object for supporting setup and execution of mdrun.
+ *
+ * This class has responsibility for the lifetime of data structures
+ * that exist for the life of the simulation, e.g. for logging and
+ * communication.
+ *
+ * \todo Most of the attributes should be declared by specific modules
+ * as command-line options. Accordingly, they do not conform to the
+ * naming scheme, because that would make for a lot of noise in the
+ * diff, only to have it change again when the options move to their
+ * modules.
+ *
+ * \todo Preparing logging and MPI contexts could probably be a
+ * higher-level responsibility, so that an Mdrunner would get made
+ * without needing to re-initialize these components (as currently
+ * happens always for the master rank, and differently for the spawned
+ * ranks with thread-MPI).
+ */
+class Mdrunner
+{
+    private:
+        //! Parallelism-related user options.
+        gmx_hw_opt_t hw_opt;
+        //! Filenames and properties from command-line argument values.
+        std::array<t_filenm, 34> filenames =
+        {{{ efTPR, nullptr, nullptr, ffREAD },
+          { efTRN, "-o", nullptr, ffWRITE },
+          { efCOMPRESSED, "-x", nullptr, ffOPTWR },
+          { efCPT, "-cpi", nullptr, ffOPTRD | ffALLOW_MISSING },
+          { efCPT, "-cpo", nullptr, ffOPTWR },
+          { efSTO, "-c", "confout", ffWRITE },
+          { efEDR, "-e", "ener", ffWRITE },
+          { efLOG, "-g", "md", ffWRITE },
+          { efXVG, "-dhdl", "dhdl", ffOPTWR },
+          { efXVG, "-field", "field", ffOPTWR },
+          { efXVG, "-table", "table", ffOPTRD },
+          { efXVG, "-tablep", "tablep", ffOPTRD },
+          { efXVG, "-tableb", "table", ffOPTRDMULT },
+          { efTRX, "-rerun", "rerun", ffOPTRD },
+          { efXVG, "-tpi", "tpi", ffOPTWR },
+          { efXVG, "-tpid", "tpidist", ffOPTWR },
+          { efEDI, "-ei", "sam", ffOPTRD },
+          { efXVG, "-eo", "edsam", ffOPTWR },
+          { efXVG, "-devout", "deviatie", ffOPTWR },
+          { efXVG, "-runav", "runaver", ffOPTWR },
+          { efXVG, "-px", "pullx", ffOPTWR },
+          { efXVG, "-pf", "pullf", ffOPTWR },
+          { efXVG, "-ro", "rotation", ffOPTWR },
+          { efLOG, "-ra", "rotangles", ffOPTWR },
+          { efLOG, "-rs", "rotslabs", ffOPTWR },
+          { efLOG, "-rt", "rottorque", ffOPTWR },
+          { efMTX, "-mtx", "nm", ffOPTWR },
+          { efRND, "-multidir", nullptr, ffOPTRDMULT },
+          { efXVG, "-awh", "awhinit", ffOPTRD },
+          { efDAT, "-membed", "membed", ffOPTRD },
+          { efTOP, "-mp", "membed", ffOPTRD },
+          { efNDX, "-mn", "membed", ffOPTRD },
+          { efXVG, "-if", "imdforces", ffOPTWR },
+          { efXVG, "-swap", "swapions", ffOPTWR }}};
+        /*! \brief Filename arguments.
+         *
+         * Provided for compatibility with old C-style code accessing
+         * command-line arguments that are file names. */
+        t_filenm *fnm = filenames.data();
+        /*! \brief Number of filename argument values.
+         *
+         * Provided for compatibility with old C-style code accessing
+         * command-line arguments that are file names. */
+        int nfile = filenames.size();
+        //! Output context for writing text files
+        gmx_output_env_t *oenv = nullptr;
+        //! Ongoing collection of mdrun options
+        MdrunOptions mdrunOptions;
+        //! Options for the domain decomposition.
+        DomdecOptions domdecOptions;
+        //! Target short-range interactions for "cpu", "gpu", or "auto". Default is "auto".
+        const char *nbpu_opt = nullptr;
+        //! Target long-range interactions for "cpu", "gpu", or "auto". Default is "auto".
+        const char *pme_opt = nullptr;
+        //! Target long-range interactions FFT/solve stages for "cpu", "gpu", or "auto". Default is "auto".
+        const char *pme_fft_opt = nullptr;
+        //! Command-line override for the duration of a neighbor list with the Verlet scheme.
+        int nstlist_cmdline = 0;
+        //! Number of simulations in multi-simulation set.
+        int nmultisim = 0;
+        //! Parameters for replica-exchange simulations.
+        ReplicaExchangeParameters replExParams;
+        //! Print a warning if any force is larger than this (in kJ/mol nm).
+        real pforce = -1;
+        //! Handle to file used for logging.
+        FILE *fplog;
+        //! Handle to communication data structure.
+        t_commrec *cr;
+
+    public:
+        /*! \brief Defaulted constructor.
+         *
+         * Note that when member variables are not present in the constructor
+         * member initialization list (which is true for the default constructor),
+         * then they are initialized with any default member initializer specified
+         * when they were declared, or default initialized. */
+        Mdrunner() = default;
+        //! Start running mdrun by calling its C-style main function.
+        int mainFunction(int argc, char *argv[]);
+        /*! \brief Driver routine, that calls the different simulation methods. */
+        int mdrunner();
+        //! Called when thread-MPI spawns threads.
+        t_commrec *spawnThreads(int numThreadsToLaunch);
+        /*! \brief Re-initializes the object after threads spawn.
+         *
+         * \todo Can this be refactored so that the Mdrunner on a spawned thread is
+         * constructed ready to use? */
+        void reinitializeOnSpawnedThread();
+};
+
+} // namespace gmx
+
+#endif // GMX_MDLIB_RUNNER_H