diff --git a/CHANGES/Unreleased.txt b/CHANGES/Unreleased.txt
index 6737c1bec938ab8e6dff39aaa4adde1407b446e9..42830a971e78fd615d06f9c09c9fca78290dc2e8 100644
--- a/CHANGES/Unreleased.txt
+++ b/CHANGES/Unreleased.txt
@@ -32,6 +32,7 @@ Changes from version 2.2 which are relevant for users:
   - \ref ROWSUMS
   - \ref COLUMNSUMS
   - \ref UPDATE_IF
+  - \ref PCA
 - New features in MD patches (require repatch):
   - Patch for amber 14 now passes charges with appropriate units (fixes \issue{165}). Notice that
     the patch is still backward compatible with older PLUMED version, but the charges will only be passed
@@ -45,5 +46,7 @@ Changes from version 2.2 which are relevant for users:
   - \ref COMMITTOR can now be used to define multiple basins
   - The number of atoms admitted in \ref BRIDGE has been significantly increased, see \issue{185}.
   - \ref driver now allows --trajectory-stride to be set to zero when reading with --ixtc/--itrr. In this case, step number is read from the trajectory file.
+  - \ref METAD and \ref PBMETAD can now be restarted from a GRID
+  - Added keywords TARGET and DAMPFACTOR to \ref METAD
 
 */
diff --git a/CHANGES/v2.2.txt b/CHANGES/v2.2.txt
index 92f1e248a067511d1ada7cc3a4e615f12f921202..11ff0c90c20d4ddf92e0fd696f9ed264a3095730 100644
--- a/CHANGES/v2.2.txt
+++ b/CHANGES/v2.2.txt
@@ -5,7 +5,14 @@
 Version 2.2 (Oct 13, 2015)
 ----------------------------
 
-Include all fixes in branch 2.1 indicated in \ref CHANGES-2-1 .
+Version 2.2 contains several improvements with respect to 2.1. Users currently working with 2.1
+should have a look at the section "Changes leading to incompatible behavior" below, as they
+may need small adjustments to their input files. In 2.2 we restored more of the features of 1.3
+that were missing in 2.1, so users still working with 1.3 may now want to upgrade.
+A tutorial explaining how to move from PLUMED 1 to PLUMED 2 is available (see \ref moving).
+
+Below you will find a list of all the changes with respect to version 2.1.
+Notice that version 2.2 already includes all the fixes in branch 2.1 up to 2.1.4, as indicated in \ref CHANGES-2-1 .
 
 Changes from version 2.1 which are relevant for users:
 - Changes leading to incompatible behavior:
@@ -105,7 +112,7 @@ For developers:
 - ./configure is automatically relaunched when changing ./configure or Makefile.conf. This makes it more robust
   to switch between branches.
 
-Unreleased changes (will be included in 2.2.2)
+Version 2.2.2 (Apr 13, 2016)
 ----------------------------------------------
 
 For users:
@@ -119,7 +126,8 @@ For users:
 - Fixed a bug in the normalization of kernel functions (relevant for \ref HISTOGRAM).
 - Fixed a regression introduced in v2.2 that was making \ref METAD with non-MPI multiple walkers crash
   if reading frequently. See \issue{190}
-- Updated patch for gromacs 5.1
+- Updated patch for gromacs 5.x. Patches for gromacs 5.0 and 5.1 have been fixed to allow
+  patching in runtime mode.
 - Possibility to control manual generation (including pdf) from ./configure. Pdf manual is now off
   by default. Notice that on travis CI it is still generated.
 
@@ -127,6 +135,19 @@ For developers:
 - Fixed a bug in the interpretation of cmd strings. Namely, an erroneous string was not triggering an error.
   This is harmless for properly patched MD codes, but could have introduced problems in MD codes with typos
   in cmd strings.
+- ./configure is no longer automatically relaunched when running `make clean`.
+
+Unreleased changes (will be included in 2.2.3)
+----------------------------------------------
+
+For users:
+- Updated patches for gromacs 5.1.x and 5.0.x to fix a problem where plumed was trying to write to an already
+  closed gromacs log file.
+- When a value outside the GRID is requested, the error message now includes the name of the responsible
+  collective variable.
+- Numerical check in LatticeReduction made less strict.
+
+For developers:
 
 See branch \branch{v2.2} on git repository.
 
diff --git a/Makefile b/Makefile
index 3505a21a70afaaf90a4f07cc46fa4ccf75f8f3f9..4c2be18b962e61753df2b9dcd8f677a5104b9912 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,6 @@
--include Makefile.conf
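+# Skip Makefile.conf when cleaning, so that 'make clean' does not relaunch ./configure.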
+ifneq ($(MAKECMDGOALS),clean)
+ -include Makefile.conf
+endif
 
 
 SRCDIRS := src test
diff --git a/developer-doc/Makefile b/developer-doc/Makefile
index da31eaa8b19dd25e539fe596f2748fbece96c3cd..f83e0c889bce44e5c2855d0ef65b9bfb847f8971 100644
--- a/developer-doc/Makefile
+++ b/developer-doc/Makefile
@@ -1,5 +1,7 @@
 # include the machine dependent configuration
--include ../Makefile.conf
+ifneq ($(MAKECMDGOALS),clean)
+  -include ../Makefile.conf
+endif
 
 .PHONY: all clean
 
diff --git a/patches/gromacs-5.0.4.config b/patches/gromacs-2016-beta1.config
similarity index 100%
rename from patches/gromacs-5.0.4.config
rename to patches/gromacs-2016-beta1.config
diff --git a/patches/gromacs-2016-beta1.diff/src/gromacs/CMakeLists.txt b/patches/gromacs-2016-beta1.diff/src/gromacs/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2f8a59296e79cbc4c9f1b47034389ac8c2730816
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/gromacs/CMakeLists.txt
@@ -0,0 +1,261 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
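+# PLUMED: Plumed.cmake is put in place by the patching procedure and defines the
+# PLUMED_LOAD variable used below to link libgromacs against PLUMED.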
+include(${CMAKE_SOURCE_DIR}/Plumed.cmake)
+
+set(LIBGROMACS_SOURCES)
+
+set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
+
+function (_gmx_add_files_to_property PROPERTY)
+    foreach (_file ${ARGN})
+        if (IS_ABSOLUTE "${_file}")
+            set_property(GLOBAL APPEND PROPERTY ${PROPERTY} ${_file})
+        else()
+            set_property(GLOBAL APPEND PROPERTY ${PROPERTY}
+                         ${CMAKE_CURRENT_LIST_DIR}/${_file})
+        endif()
+    endforeach()
+endfunction ()
+
+function (gmx_add_libgromacs_sources)
+    _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN})
+endfunction ()
+
+function (gmx_install_headers)
+    if (NOT GMX_BUILD_MDRUN_ONLY)
+        file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR})
+        install(FILES       ${ARGN}
+                DESTINATION "${INCL_INSTALL_DIR}/${_dest}"
+                COMPONENT   development)
+    endif()
+    _gmx_add_files_to_property(GMX_INSTALLED_HEADERS ${ARGN})
+endfunction ()
+
+function (gmx_write_installed_header_list)
+    get_property(_list GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
+    string(REPLACE ";" "\n" _list "${_list}")
+    # TODO: Make this only update the file timestamp if the contents actually change.
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/installed-headers.txt "${_list}")
+endfunction()
+
+if(GMX_USE_TNG)
+    option(GMX_EXTERNAL_TNG "Use external TNG instead of compiling the version shipped with GROMACS."
+           OFF)
+    # Detect TNG if GMX_EXTERNAL_TNG is explicitly ON
+    if(GMX_EXTERNAL_TNG)
+        find_package(TNG_IO 1.6.0)
+        if(NOT TNG_IO_FOUND)
+            message(FATAL_ERROR
+                "TNG >= 1.6.0 not found. "
+                "You can set GMX_EXTERNAL_TNG=OFF to compile TNG.")
+        endif()
+        include_directories(SYSTEM ${TNG_IO_INCLUDE_DIRS})
+    endif()
+    if(NOT GMX_EXTERNAL_TNG)
+        include(${CMAKE_SOURCE_DIR}/src/external/tng_io/BuildTNG.cmake)
+        tng_get_source_list(TNG_SOURCES TNG_IO_DEFINITIONS)
+        list(APPEND LIBGROMACS_SOURCES ${TNG_SOURCES})
+        tng_set_source_properties(WITH_ZLIB ${HAVE_ZLIB})
+
+        if (HAVE_ZLIB)
+            list(APPEND GMX_EXTRA_LIBRARIES ${ZLIB_LIBRARIES})
+            include_directories(SYSTEM ${ZLIB_INCLUDE_DIRS})
+        endif()
+    endif()
+else()
+    # We still need to get tng/tng_io_fwd.h from somewhere!
+    include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/tng_io/include)
+endif()
+
+add_subdirectory(gmxlib)
+add_subdirectory(mdlib)
+add_subdirectory(listed-forces)
+add_subdirectory(commandline)
+add_subdirectory(domdec)
+add_subdirectory(ewald)
+add_subdirectory(fft)
+add_subdirectory(gpu_utils)
+add_subdirectory(hardware)
+add_subdirectory(linearalgebra)
+add_subdirectory(math)
+add_subdirectory(mdrunutility)
+add_subdirectory(mdtypes)
+add_subdirectory(onlinehelp)
+add_subdirectory(options)
+add_subdirectory(pbcutil)
+add_subdirectory(random)
+add_subdirectory(tables)
+add_subdirectory(timing)
+add_subdirectory(topology)
+add_subdirectory(trajectory)
+add_subdirectory(utility)
+add_subdirectory(fileio)
+add_subdirectory(swap)
+add_subdirectory(essentialdynamics)
+add_subdirectory(pulling)
+add_subdirectory(simd)
+add_subdirectory(imd)
+if (NOT GMX_BUILD_MDRUN_ONLY)
+    add_subdirectory(gmxana)
+    add_subdirectory(gmxpreprocess)
+    add_subdirectory(correlationfunctions)
+    add_subdirectory(statistics)
+    add_subdirectory(analysisdata)
+    add_subdirectory(selection)
+    add_subdirectory(trajectoryanalysis)
+    add_subdirectory(tools)
+endif()
+
+get_property(PROPERTY_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES} ${PROPERTY_SOURCES})
+
+# This would be the standard way to include thread_mpi, but
+# we want libgromacs to link the functions directly
+#if(GMX_THREAD_MPI)
+#    add_subdirectory(thread_mpi)
+#endif()
+#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB})
+
+tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src)
+list(APPEND LIBGROMACS_SOURCES ${THREAD_MPI_SOURCES})
+
+configure_file(version.h.cmakein version.h)
+gmx_install_headers(
+    analysisdata.h
+    commandline.h
+    options.h
+    random.h
+    selection.h
+    trajectoryanalysis.h
+    utility.h
+    ${CMAKE_CURRENT_BINARY_DIR}/version.h
+    )
+
+# This code is here instead of utility/CMakeLists.txt, because CMake
+# custom commands and source file properties can only be set in the directory
+# that contains the target that uses them.
+# TODO: Generate a header instead that can be included from baseversion.c.
+# That probably simplifies things somewhat.
+set(GENERATED_VERSION_FILE utility/baseversion-gen.c)
+gmx_configure_version_file(
+    utility/baseversion-gen.c.cmakein ${GENERATED_VERSION_FILE}
+    REMOTE_HASH SOURCE_FILE)
+list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
+
+if (GMX_USE_CUDA)
+    cuda_add_library(libgromacs ${LIBGROMACS_SOURCES})
+else()
+    add_library(libgromacs ${LIBGROMACS_SOURCES})
+endif()
+
+# Recent versions of gcc and clang give warnings on scanner.cpp, which
+# is a generated source file. These are awkward to suppress inline, so
+# we do it in the compilation command (after testing that the compiler
+# supports the suppressions).
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag(-Wno-unused-parameter HAS_NO_UNUSED_PARAMETER)
+if (HAS_NO_UNUSED_PARAMETER)
+    set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-unused-parameter")
+endif()
+check_cxx_compiler_flag(-Wno-deprecated-register HAS_NO_DEPRECATED_REGISTER)
+if (HAS_NO_DEPRECATED_REGISTER)
+    set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-deprecated-register")
+else()
+    check_cxx_compiler_flag(-Wno-deprecated HAS_NO_DEPRECATED)
+    if (HAS_NO_DEPRECATED)
+        set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-deprecated")
+    endif()
+endif()
+set_source_files_properties(selection/scanner.cpp PROPERTIES COMPILE_FLAGS "${_scanner_cpp_compiler_flags}")
+
+target_link_libraries(libgromacs ${PLUMED_LOAD})
+
+target_link_libraries(libgromacs
+                      ${EXTRAE_LIBRARIES}
+                      ${GMX_EXTRA_LIBRARIES}
+                      ${TNG_IO_LIBRARIES}
+                      ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
+                      ${XML_LIBRARIES}
+                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${OPENCL_LIBRARIES}
+                      ${GMX_STDLIB_LIBRARIES})
+set_target_properties(libgromacs PROPERTIES
+                      OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
+                      SOVERSION ${LIBRARY_SOVERSION_MAJOR}
+                      VERSION ${LIBRARY_VERSION}
+                      COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+
+gmx_write_installed_header_list()
+
+# Only install the library in mdrun-only mode if it is actually necessary
+# for the binary
+if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS)
+    install(TARGETS libgromacs
+            EXPORT libgromacs
+            LIBRARY DESTINATION ${LIB_INSTALL_DIR}
+            RUNTIME DESTINATION ${BIN_INSTALL_DIR}
+            ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
+            COMPONENT libraries)
+endif()
+
+if (NOT GMX_BUILD_MDRUN_ONLY)
+    include(InstallLibInfo.cmake)
+endif()
+
+# Technically, the user could want to do this for an OpenCL build
+# using the CUDA runtime, but currently there's no reason to want to
+# do that.
+if (INSTALL_CUDART_LIB) #can be set manual by user
+    if (GMX_USE_CUDA)
+        foreach(CUDA_LIB ${CUDA_LIBRARIES})
+            string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
+            if(IS_CUDART) #libcuda should not be installed
+                #install also name-links (linker uses those)
+                file(GLOB CUDA_LIBS ${CUDA_LIB}*)
+                install(FILES ${CUDA_LIBS} DESTINATION
+                    ${LIB_INSTALL_DIR} COMPONENT libraries)
+            endif()
+        endforeach()
+    else()
+        message(WARNING "INSTALL_CUDART_LIB only makes sense when configuring for CUDA support")
+    endif()
+endif()
+
+if(GMX_USE_OPENCL)
+    set(OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS})
+
+    install(FILES ${OPENCL_KERNELS} DESTINATION
+        ${OCL_INSTALL_DIR} COMPONENT libraries)
+endif()
diff --git a/patches/gromacs-2016-beta1.diff/src/gromacs/CMakeLists.txt.preplumed b/patches/gromacs-2016-beta1.diff/src/gromacs/CMakeLists.txt.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..edc051fe8328dcb0f2ff5604048c6e1a07bc66b1
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/gromacs/CMakeLists.txt.preplumed
@@ -0,0 +1,257 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+set(LIBGROMACS_SOURCES)
+
+set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
+
+function (_gmx_add_files_to_property PROPERTY)
+    foreach (_file ${ARGN})
+        if (IS_ABSOLUTE "${_file}")
+            set_property(GLOBAL APPEND PROPERTY ${PROPERTY} ${_file})
+        else()
+            set_property(GLOBAL APPEND PROPERTY ${PROPERTY}
+                         ${CMAKE_CURRENT_LIST_DIR}/${_file})
+        endif()
+    endforeach()
+endfunction ()
+
+function (gmx_add_libgromacs_sources)
+    _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN})
+endfunction ()
+
+function (gmx_install_headers)
+    if (NOT GMX_BUILD_MDRUN_ONLY)
+        file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR})
+        install(FILES       ${ARGN}
+                DESTINATION "${INCL_INSTALL_DIR}/${_dest}"
+                COMPONENT   development)
+    endif()
+    _gmx_add_files_to_property(GMX_INSTALLED_HEADERS ${ARGN})
+endfunction ()
+
+function (gmx_write_installed_header_list)
+    get_property(_list GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
+    string(REPLACE ";" "\n" _list "${_list}")
+    # TODO: Make this only update the file timestamp if the contents actually change.
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/installed-headers.txt "${_list}")
+endfunction()
+
+if(GMX_USE_TNG)
+    option(GMX_EXTERNAL_TNG "Use external TNG instead of compiling the version shipped with GROMACS."
+           OFF)
+    # Detect TNG if GMX_EXTERNAL_TNG is explicitly ON
+    if(GMX_EXTERNAL_TNG)
+        find_package(TNG_IO 1.6.0)
+        if(NOT TNG_IO_FOUND)
+            message(FATAL_ERROR
+                "TNG >= 1.6.0 not found. "
+                "You can set GMX_EXTERNAL_TNG=OFF to compile TNG.")
+        endif()
+        include_directories(SYSTEM ${TNG_IO_INCLUDE_DIRS})
+    endif()
+    if(NOT GMX_EXTERNAL_TNG)
+        include(${CMAKE_SOURCE_DIR}/src/external/tng_io/BuildTNG.cmake)
+        tng_get_source_list(TNG_SOURCES TNG_IO_DEFINITIONS)
+        list(APPEND LIBGROMACS_SOURCES ${TNG_SOURCES})
+        tng_set_source_properties(WITH_ZLIB ${HAVE_ZLIB})
+
+        if (HAVE_ZLIB)
+            list(APPEND GMX_EXTRA_LIBRARIES ${ZLIB_LIBRARIES})
+            include_directories(SYSTEM ${ZLIB_INCLUDE_DIRS})
+        endif()
+    endif()
+else()
+    # We still need to get tng/tng_io_fwd.h from somewhere!
+    include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/tng_io/include)
+endif()
+
+add_subdirectory(gmxlib)
+add_subdirectory(mdlib)
+add_subdirectory(listed-forces)
+add_subdirectory(commandline)
+add_subdirectory(domdec)
+add_subdirectory(ewald)
+add_subdirectory(fft)
+add_subdirectory(gpu_utils)
+add_subdirectory(hardware)
+add_subdirectory(linearalgebra)
+add_subdirectory(math)
+add_subdirectory(mdrunutility)
+add_subdirectory(mdtypes)
+add_subdirectory(onlinehelp)
+add_subdirectory(options)
+add_subdirectory(pbcutil)
+add_subdirectory(random)
+add_subdirectory(tables)
+add_subdirectory(timing)
+add_subdirectory(topology)
+add_subdirectory(trajectory)
+add_subdirectory(utility)
+add_subdirectory(fileio)
+add_subdirectory(swap)
+add_subdirectory(essentialdynamics)
+add_subdirectory(pulling)
+add_subdirectory(simd)
+add_subdirectory(imd)
+if (NOT GMX_BUILD_MDRUN_ONLY)
+    add_subdirectory(gmxana)
+    add_subdirectory(gmxpreprocess)
+    add_subdirectory(correlationfunctions)
+    add_subdirectory(statistics)
+    add_subdirectory(analysisdata)
+    add_subdirectory(selection)
+    add_subdirectory(trajectoryanalysis)
+    add_subdirectory(tools)
+endif()
+
+get_property(PROPERTY_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES} ${PROPERTY_SOURCES})
+
+# This would be the standard way to include thread_mpi, but
+# we want libgromacs to link the functions directly
+#if(GMX_THREAD_MPI)
+#    add_subdirectory(thread_mpi)
+#endif()
+#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB})
+
+tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src)
+list(APPEND LIBGROMACS_SOURCES ${THREAD_MPI_SOURCES})
+
+configure_file(version.h.cmakein version.h)
+gmx_install_headers(
+    analysisdata.h
+    commandline.h
+    options.h
+    random.h
+    selection.h
+    trajectoryanalysis.h
+    utility.h
+    ${CMAKE_CURRENT_BINARY_DIR}/version.h
+    )
+
+# This code is here instead of utility/CMakeLists.txt, because CMake
+# custom commands and source file properties can only be set in the directory
+# that contains the target that uses them.
+# TODO: Generate a header instead that can be included from baseversion.c.
+# That probably simplifies things somewhat.
+set(GENERATED_VERSION_FILE utility/baseversion-gen.c)
+gmx_configure_version_file(
+    utility/baseversion-gen.c.cmakein ${GENERATED_VERSION_FILE}
+    REMOTE_HASH SOURCE_FILE)
+list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
+
+if (GMX_USE_CUDA)
+    cuda_add_library(libgromacs ${LIBGROMACS_SOURCES})
+else()
+    add_library(libgromacs ${LIBGROMACS_SOURCES})
+endif()
+
+# Recent versions of gcc and clang give warnings on scanner.cpp, which
+# is a generated source file. These are awkward to suppress inline, so
+# we do it in the compilation command (after testing that the compiler
+# supports the suppressions).
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag(-Wno-unused-parameter HAS_NO_UNUSED_PARAMETER)
+if (HAS_NO_UNUSED_PARAMETER)
+    set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-unused-parameter")
+endif()
+check_cxx_compiler_flag(-Wno-deprecated-register HAS_NO_DEPRECATED_REGISTER)
+if (HAS_NO_DEPRECATED_REGISTER)
+    set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-deprecated-register")
+else()
+    check_cxx_compiler_flag(-Wno-deprecated HAS_NO_DEPRECATED)
+    if (HAS_NO_DEPRECATED)
+        set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-deprecated")
+    endif()
+endif()
+set_source_files_properties(selection/scanner.cpp PROPERTIES COMPILE_FLAGS "${_scanner_cpp_compiler_flags}")
+
+target_link_libraries(libgromacs
+                      ${EXTRAE_LIBRARIES}
+                      ${GMX_EXTRA_LIBRARIES}
+                      ${TNG_IO_LIBRARIES}
+                      ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
+                      ${XML_LIBRARIES}
+                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${OPENCL_LIBRARIES}
+                      ${GMX_STDLIB_LIBRARIES})
+set_target_properties(libgromacs PROPERTIES
+                      OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
+                      SOVERSION ${LIBRARY_SOVERSION_MAJOR}
+                      VERSION ${LIBRARY_VERSION}
+                      COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+
+gmx_write_installed_header_list()
+
+# Only install the library in mdrun-only mode if it is actually necessary
+# for the binary
+if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS)
+    install(TARGETS libgromacs
+            EXPORT libgromacs
+            LIBRARY DESTINATION ${LIB_INSTALL_DIR}
+            RUNTIME DESTINATION ${BIN_INSTALL_DIR}
+            ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
+            COMPONENT libraries)
+endif()
+
+if (NOT GMX_BUILD_MDRUN_ONLY)
+    include(InstallLibInfo.cmake)
+endif()
+
+# Technically, the user could want to do this for an OpenCL build
+# using the CUDA runtime, but currently there's no reason to want to
+# do that.
+if (INSTALL_CUDART_LIB) #can be set manual by user
+    if (GMX_USE_CUDA)
+        foreach(CUDA_LIB ${CUDA_LIBRARIES})
+            string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
+            if(IS_CUDART) #libcuda should not be installed
+                #install also name-links (linker uses those)
+                file(GLOB CUDA_LIBS ${CUDA_LIB}*)
+                install(FILES ${CUDA_LIBS} DESTINATION
+                    ${LIB_INSTALL_DIR} COMPONENT libraries)
+            endif()
+        endforeach()
+    else()
+        message(WARNING "INSTALL_CUDART_LIB only makes sense when configuring for CUDA support")
+    endif()
+endif()
+
+if(GMX_USE_OPENCL)
+    set(OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS})
+
+    install(FILES ${OPENCL_KERNELS} DESTINATION
+        ${OCL_INSTALL_DIR} COMPONENT libraries)
+endif()
diff --git a/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/force.cpp b/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/force.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3bee45971a7e763451ebb77a43f40c0c2f2cb3eb
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/force.cpp
@@ -0,0 +1,913 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "force.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/ewald/ewald.h"
+#include "gromacs/ewald/long-range-correction.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gmxlib/nonbonded/nonbonded.h"
+#include "gromacs/listed-forces/listed-forces.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vecdump.h"
+#include "gromacs/mdlib/forcerec-threading.h"
+#include "gromacs/mdlib/genborn.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/qmmm.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+/* PLUMED */
+#include "../../../Plumed.h"
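+/* Global PLUMED state shared with the rest of the patched code: plumedswitch
+   is set when PLUMED is enabled for the run, plumedmain is the PLUMED handle,
+   and plumedcmd points to the plumed_cmd() wrapper used to send commands. */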
+int    plumedswitch=0;
+plumed plumedmain;
+void(*plumedcmd)(plumed,const char*,const void*)=NULL;
+/* END PLUMED */
+
+
+void ns(FILE              *fp,
+        t_forcerec        *fr,
+        matrix             box,
+        gmx_groups_t      *groups,
+        gmx_localtop_t    *top,
+        t_mdatoms         *md,
+        t_commrec         *cr,
+        t_nrnb            *nrnb,
+        gmx_bool           bFillGrid)
+{
+    int     nsearch;
+
+
+    if (!fr->ns->nblist_initialized)
+    {
+        init_neighbor_list(fp, fr, md->homenr);
+    }
+
+    nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md,
+                                bFillGrid);
+    if (debug)
+    {
+        fprintf(debug, "nsearch = %d\n", nsearch);
+    }
+
+    /* Check whether we have to do dynamic load balancing */
+    /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0))
+       count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr,
+       &(top->idef),opts->ngener);
+     */
+    if (fr->ns->dump_nl > 0)
+    {
+        dump_nblist(fp, cr, fr, fr->ns->dump_nl);
+    }
+}
+
+static void reduce_thread_energies(tensor vir_q, tensor vir_lj,
+                                   real *Vcorr_q, real *Vcorr_lj,
+                                   real *dvdl_q, real *dvdl_lj,
+                                   int nthreads,
+                                   ewald_corr_thread_t *ewc_t)
+{
+    int t;
+
+    for (t = 1; t < nthreads; t++)
+    {
+        *Vcorr_q  += ewc_t[t].Vcorr_q;
+        *Vcorr_lj += ewc_t[t].Vcorr_lj;
+        *dvdl_q   += ewc_t[t].dvdl[efptCOUL];
+        *dvdl_lj  += ewc_t[t].dvdl[efptVDW];
+        m_add(vir_q, ewc_t[t].vir_q, vir_q);
+        m_add(vir_lj, ewc_t[t].vir_lj, vir_lj);
+    }
+}
+
+void do_force_lowlevel(t_forcerec *fr,      t_inputrec *ir,
+                       t_idef     *idef,    t_commrec  *cr,
+                       t_nrnb     *nrnb,    gmx_wallcycle_t wcycle,
+                       t_mdatoms  *md,
+                       rvec       x[],      history_t  *hist,
+                       rvec       f[],
+                       gmx_enerdata_t *enerd,
+                       t_fcdata   *fcd,
+                       gmx_localtop_t *top,
+                       gmx_genborn_t *born,
+                       gmx_bool       bBornRadii,
+                       matrix     box,
+                       t_lambda   *fepvals,
+                       real       *lambda,
+                       t_graph    *graph,
+                       t_blocka   *excl,
+                       rvec       mu_tot[],
+                       int        flags,
+                       float      *cycles_pme)
+{
+    int         i, j;
+    int         donb_flags;
+    gmx_bool    bSB;
+    int         pme_flags;
+    matrix      boxs;
+    rvec        box_size;
+    t_pbc       pbc;
+    real        dvdl_dum[efptNR], dvdl_nb[efptNR];
+
+#if GMX_MPI
+    double  t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */
+#endif
+
+    set_pbc(&pbc, fr->ePBC, box);
+
+    /* reset free energy components */
+    for (i = 0; i < efptNR; i++)
+    {
+        dvdl_nb[i]  = 0;
+        dvdl_dum[i] = 0;
+    }
+
+    /* Reset box */
+    for (i = 0; (i < DIM); i++)
+    {
+        box_size[i] = box[i][i];
+    }
+
+    /* do QMMM first if requested */
+    if (fr->bQMMM)
+    {
+        enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr);
+    }
+
+    /* Call the short range functions all in one go. */
+
+#if GMX_MPI
+    /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/
+#define TAKETIME FALSE
+    if (TAKETIME)
+    {
+        MPI_Barrier(cr->mpi_comm_mygroup);
+        t0 = MPI_Wtime();
+    }
+#endif
+
+    if (ir->nwall)
+    {
+        /* foreign lambda component for walls */
+        real dvdl_walls = do_walls(ir, fr, box, md, x, f, lambda[efptVDW],
+                                   enerd->grpp.ener[egLJSR], nrnb);
+        enerd->dvdl_lin[efptVDW] += dvdl_walls;
+    }
+
+    /* If doing GB, reset dvda and calculate the Born radii */
+    if (ir->implicit_solvent)
+    {
+        wallcycle_sub_start(wcycle, ewcsNONBONDED);
+
+        for (i = 0; i < born->nr; i++)
+        {
+            fr->dvda[i] = 0;
+        }
+
+        if (bBornRadii)
+        {
+            calc_gb_rad(cr, fr, ir, top, x, fr->gblist, born, md, nrnb);
+        }
+
+        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+    }
+
+    where();
+    /* We only do non-bonded calculation with group scheme here, the verlet
+     * calls are done from do_force_cutsVERLET(). */
+    if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED))
+    {
+        donb_flags = 0;
+        /* Add short-range interactions */
+        donb_flags |= GMX_NONBONDED_DO_SR;
+
+        /* Currently all group scheme kernels always calculate (shift-)forces */
+        if (flags & GMX_FORCE_FORCES)
+        {
+            donb_flags |= GMX_NONBONDED_DO_FORCE;
+        }
+        if (flags & GMX_FORCE_VIRIAL)
+        {
+            donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
+        }
+        if (flags & GMX_FORCE_ENERGY)
+        {
+            donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
+        }
+
+        wallcycle_sub_start(wcycle, ewcsNONBONDED);
+        do_nonbonded(fr, x, f, md, excl,
+                     &enerd->grpp, nrnb,
+                     lambda, dvdl_nb, -1, -1, donb_flags);
+
+        /* If we do foreign lambda and we have soft-core interactions
+         * we have to recalculate the (non-linear) energies contributions.
+         */
+        if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
+        {
+            for (i = 0; i < enerd->n_lambda; i++)
+            {
+                real lam_i[efptNR];
+
+                for (j = 0; j < efptNR; j++)
+                {
+                    lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
+                }
+                reset_foreign_enerdata(enerd);
+                do_nonbonded(fr, x, f, md, excl,
+                             &(enerd->foreign_grpp), nrnb,
+                             lam_i, dvdl_dum, -1, -1,
+                             (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA);
+                sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
+                enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
+            }
+        }
+        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+        where();
+    }
+
+    /* If we are doing GB, calculate bonded forces and apply corrections
+     * to the solvation forces */
+    /* MRS: Eventually, many need to include free energy contribution here! */
+    if (ir->implicit_solvent)
+    {
+        wallcycle_sub_start(wcycle, ewcsLISTED);
+        calc_gb_forces(cr, md, born, top, x, f, fr, idef,
+                       ir->gb_algorithm, ir->sa_algorithm, nrnb, &pbc, graph, enerd);
+        wallcycle_sub_stop(wcycle, ewcsLISTED);
+    }
+
+#if GMX_MPI
+    if (TAKETIME)
+    {
+        t1          = MPI_Wtime();
+        fr->t_fnbf += t1-t0;
+    }
+#endif
+
+    if (fepvals->sc_alpha != 0)
+    {
+        enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
+    }
+    else
+    {
+        enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
+    }
+
+    if (fepvals->sc_alpha != 0)
+
+    /* even though coulomb part is linear, we already added it, beacuse we
+       need to go through the vdw calculation anyway */
+    {
+        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+    else
+    {
+        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+
+    if (debug)
+    {
+        pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS);
+    }
+
+    /* Shift the coordinates. Must be done before listed forces and PPPM,
+     * but is also necessary for SHAKE and update, therefore it can NOT
+     * go when no listed forces have to be evaluated.
+     *
+     * The shifting and PBC code is deliberately not timed, since with
+     * the Verlet scheme it only takes non-zero time with triclinic
+     * boxes, and even then the time is around a factor of 100 less
+     * than the next smallest counter.
+     */
+
+
+    /* Here sometimes we would not need to shift with NBFonly,
+     * but we do so anyhow for consistency of the returned coordinates.
+     */
+    if (graph)
+    {
+        shift_self(graph, box, x);
+        if (TRICLINIC(box))
+        {
+            inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
+        }
+        else
+        {
+            inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
+        }
+    }
+    /* Check whether we need to do listed interactions or correct for exclusions */
+    if (fr->bMolPBC &&
+        ((flags & GMX_FORCE_LISTED)
+         || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)))
+    {
+        /* TODO There are no electrostatics methods that require this
+           transformation, when using the Verlet scheme, so update the
+           above conditional. */
+        /* Since all atoms are in the rectangular or triclinic unit-cell,
+         * only single box vector shifts (2 in x) are required.
+         */
+        set_pbc_dd(&pbc, fr->ePBC, DOMAINDECOMP(cr) ? cr->dd->nc : nullptr,
+                   TRUE, box);
+    }
+
+    do_force_listed(wcycle, box, ir->fepvals, cr->ms,
+                    idef, (const rvec *) x, hist, f, fr,
+                    &pbc, graph, enerd, nrnb, lambda, md, fcd,
+                    DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL,
+                    flags);
+
+    where();
+
+    *cycles_pme = 0;
+    clear_mat(fr->vir_el_recip);
+    clear_mat(fr->vir_lj_recip);
+
+    /* Do long-range electrostatics and/or LJ-PME, including related short-range
+     * corrections.
+     */
+    if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype))
+    {
+        int  status            = 0;
+        real Vlr_q             = 0, Vlr_lj = 0, Vcorr_q = 0, Vcorr_lj = 0;
+        real dvdl_long_range_q = 0, dvdl_long_range_lj = 0;
+
+        bSB = (ir->nwall == 2);
+        if (bSB)
+        {
+            copy_mat(box, boxs);
+            svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
+            box_size[ZZ] *= ir->wall_ewald_zfac;
+        }
+
+        if (EEL_PME_EWALD(fr->eeltype) || EVDW_PME(fr->vdwtype))
+        {
+            real dvdl_long_range_correction_q   = 0;
+            real dvdl_long_range_correction_lj  = 0;
+            /* With the Verlet scheme exclusion forces are calculated
+             * in the non-bonded kernel.
+             */
+            /* The TPI molecule does not have exclusions with the rest
+             * of the system and no intra-molecular PME grid
+             * contributions will be calculated in
+             * gmx_pme_calc_energy.
+             */
+            if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) ||
+                ir->ewald_geometry != eewg3D ||
+                ir->epsilon_surface != 0)
+            {
+                int nthreads, t;
+
+                wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION);
+
+                if (fr->n_tpi > 0)
+                {
+                    gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions");
+                }
+
+                nthreads = fr->nthread_ewc;
+#pragma omp parallel for num_threads(nthreads) schedule(static)
+                for (t = 0; t < nthreads; t++)
+                {
+                    try
+                    {
+                        tensor *vir_q, *vir_lj;
+                        real   *Vcorrt_q, *Vcorrt_lj, *dvdlt_q, *dvdlt_lj;
+                        if (t == 0)
+                        {
+                            vir_q     = &fr->vir_el_recip;
+                            vir_lj    = &fr->vir_lj_recip;
+                            Vcorrt_q  = &Vcorr_q;
+                            Vcorrt_lj = &Vcorr_lj;
+                            dvdlt_q   = &dvdl_long_range_correction_q;
+                            dvdlt_lj  = &dvdl_long_range_correction_lj;
+                        }
+                        else
+                        {
+                            vir_q     = &fr->ewc_t[t].vir_q;
+                            vir_lj    = &fr->ewc_t[t].vir_lj;
+                            Vcorrt_q  = &fr->ewc_t[t].Vcorr_q;
+                            Vcorrt_lj = &fr->ewc_t[t].Vcorr_lj;
+                            dvdlt_q   = &fr->ewc_t[t].dvdl[efptCOUL];
+                            dvdlt_lj  = &fr->ewc_t[t].dvdl[efptVDW];
+                            clear_mat(*vir_q);
+                            clear_mat(*vir_lj);
+                        }
+                        *dvdlt_q  = 0;
+                        *dvdlt_lj = 0;
+
+                        /* Threading is only supported with the Verlet cut-off
+                         * scheme and then only single particle forces (no
+                         * exclusion forces) are calculated, so we can store
+                         * the forces in the normal, single fr->f_novirsum array.
+                         */
+                        ewald_LRcorrection(fr->excl_load[t], fr->excl_load[t+1],
+                                           cr, t, fr,
+                                           md->chargeA, md->chargeB,
+                                           md->sqrt_c6A, md->sqrt_c6B,
+                                           md->sigmaA, md->sigmaB,
+                                           md->sigma3A, md->sigma3B,
+                                           md->nChargePerturbed || md->nTypePerturbed,
+                                           ir->cutoff_scheme != ecutsVERLET,
+                                           excl, x, bSB ? boxs : box, mu_tot,
+                                           ir->ewald_geometry,
+                                           ir->epsilon_surface,
+                                           fr->f_novirsum, *vir_q, *vir_lj,
+                                           Vcorrt_q, Vcorrt_lj,
+                                           lambda[efptCOUL], lambda[efptVDW],
+                                           dvdlt_q, dvdlt_lj);
+                    }
+                    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+                }
+                if (nthreads > 1)
+                {
+                    reduce_thread_energies(fr->vir_el_recip, fr->vir_lj_recip,
+                                           &Vcorr_q, &Vcorr_lj,
+                                           &dvdl_long_range_correction_q,
+                                           &dvdl_long_range_correction_lj,
+                                           nthreads, fr->ewc_t);
+                }
+                wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
+            }
+
+            if (EEL_PME_EWALD(fr->eeltype) && fr->n_tpi == 0)
+            {
+                /* This is not in a subcounter because it takes a
+                   negligible and constant-sized amount of time */
+                Vcorr_q += ewald_charge_correction(cr, fr, lambda[efptCOUL], box,
+                                                   &dvdl_long_range_correction_q,
+                                                   fr->vir_el_recip);
+            }
+
+            enerd->dvdl_lin[efptCOUL] += dvdl_long_range_correction_q;
+            enerd->dvdl_lin[efptVDW]  += dvdl_long_range_correction_lj;
+
+            if ((EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype)) && (cr->duty & DUTY_PME))
+            {
+                /* Do reciprocal PME for Coulomb and/or LJ. */
+                assert(fr->n_tpi >= 0);
+                if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED))
+                {
+                    pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE;
+                    if (EEL_PME(fr->eeltype))
+                    {
+                        pme_flags     |= GMX_PME_DO_COULOMB;
+                    }
+                    if (EVDW_PME(fr->vdwtype))
+                    {
+                        pme_flags |= GMX_PME_DO_LJ;
+                    }
+                    if (flags & GMX_FORCE_FORCES)
+                    {
+                        pme_flags |= GMX_PME_CALC_F;
+                    }
+                    if (flags & GMX_FORCE_VIRIAL)
+                    {
+                        pme_flags |= GMX_PME_CALC_ENER_VIR;
+                    }
+                    if (fr->n_tpi > 0)
+                    {
+                        /* We don't calculate f, but we do want the potential */
+                        pme_flags |= GMX_PME_CALC_POT;
+                    }
+                    wallcycle_start(wcycle, ewcPMEMESH);
+                    status = gmx_pme_do(fr->pmedata,
+                                        0, md->homenr - fr->n_tpi,
+                                        x, fr->f_novirsum,
+                                        md->chargeA, md->chargeB,
+                                        md->sqrt_c6A, md->sqrt_c6B,
+                                        md->sigmaA, md->sigmaB,
+                                        bSB ? boxs : box, cr,
+                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0,
+                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0,
+                                        nrnb, wcycle,
+                                        fr->vir_el_recip, fr->ewaldcoeff_q,
+                                        fr->vir_lj_recip, fr->ewaldcoeff_lj,
+                                        &Vlr_q, &Vlr_lj,
+                                        lambda[efptCOUL], lambda[efptVDW],
+                                        &dvdl_long_range_q, &dvdl_long_range_lj, pme_flags);
+                    *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH);
+                    if (status != 0)
+                    {
+                        gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status);
+                    }
+                    /* We should try to do as little computation after
+                     * this as possible, because parallel PME synchronizes
+                     * the nodes, so we want all load imbalance of the
+                     * rest of the force calculation to be before the PME
+                     * call.  DD load balancing is done on the whole time
+                     * of the force call (without PME).
+                     */
+                }
+                if (fr->n_tpi > 0)
+                {
+                    if (EVDW_PME(ir->vdwtype))
+                    {
+
+                        gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME");
+                    }
+                    /* Determine the PME grid energy of the test molecule
+                     * with the PME grid potential of the other charges.
+                     */
+                    gmx_pme_calc_energy(fr->pmedata, fr->n_tpi,
+                                        x + md->homenr - fr->n_tpi,
+                                        md->chargeA + md->homenr - fr->n_tpi,
+                                        &Vlr_q);
+                }
+            }
+        }
+
+        if (!EEL_PME(fr->eeltype) && EEL_PME_EWALD(fr->eeltype))
+        {
+            Vlr_q = do_ewald(ir, x, fr->f_novirsum,
+                             md->chargeA, md->chargeB,
+                             box_size, cr, md->homenr,
+                             fr->vir_el_recip, fr->ewaldcoeff_q,
+                             lambda[efptCOUL], &dvdl_long_range_q, fr->ewald_table);
+        }
+
+        /* Note that with separate PME nodes we get the real energies later */
+        enerd->dvdl_lin[efptCOUL] += dvdl_long_range_q;
+        enerd->dvdl_lin[efptVDW]  += dvdl_long_range_lj;
+        enerd->term[F_COUL_RECIP]  = Vlr_q + Vcorr_q;
+        enerd->term[F_LJ_RECIP]    = Vlr_lj + Vcorr_lj;
+        if (debug)
+        {
+            fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n",
+                    Vlr_q, Vcorr_q, enerd->term[F_COUL_RECIP]);
+            pr_rvecs(debug, 0, "vir_el_recip after corr", fr->vir_el_recip, DIM);
+            pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS);
+            fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n",
+                    Vlr_lj, Vcorr_lj, enerd->term[F_LJ_RECIP]);
+            pr_rvecs(debug, 0, "vir_lj_recip after corr", fr->vir_lj_recip, DIM);
+        }
+    }
+    else
+    {
+        /* Is there a reaction-field exclusion correction needed?
+         * With the Verlet scheme, exclusion forces are calculated
+         * in the non-bonded kernel.
+         */
+        if (ir->cutoff_scheme != ecutsVERLET && EEL_RF(fr->eeltype))
+        {
+            real dvdl_rf_excl      = 0;
+            enerd->term[F_RF_EXCL] =
+                RF_excl_correction(fr, graph, md, excl, x, f,
+                                   fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl);
+
+            enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl;
+        }
+    }
+    where();
+
+    if (debug)
+    {
+        print_nrnb(debug, nrnb);
+    }
+
+#if GMX_MPI
+    if (TAKETIME)
+    {
+        t2 = MPI_Wtime();
+        MPI_Barrier(cr->mpi_comm_mygroup);
+        t3          = MPI_Wtime();
+        fr->t_wait += t3-t2;
+        if (fr->timesteps == 11)
+        {
+            char buf[22];
+            fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n",
+                    cr->nodeid, gmx_step_str(fr->timesteps, buf),
+                    100*fr->t_wait/(fr->t_wait+fr->t_fnbf),
+                    (fr->t_fnbf+fr->t_wait)/fr->t_fnbf);
+        }
+        fr->timesteps++;
+    }
+#endif
+
+    if (debug)
+    {
+        pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS);
+    }
+
+    /* PLUMED */
+    if(plumedswitch){
+      int plumedNeedsEnergy;
+      (*plumedcmd)(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
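+      /* If the bias does not need the total potential energy, the PLUMED
+         calculation can be performed right away; otherwise it is expected to
+         be triggered later in the patched code, once the energy is available. */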
+      if(!plumedNeedsEnergy) (*plumedcmd)(plumedmain,"performCalc",NULL);
+    }
+    /* END PLUMED */
+}
+
+void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd)
+{
+    int i, n2;
+
+    for (i = 0; i < F_NRE; i++)
+    {
+        enerd->term[i]         = 0;
+        enerd->foreign_term[i] = 0;
+    }
+
+
+    for (i = 0; i < efptNR; i++)
+    {
+        enerd->dvdl_lin[i]     = 0;
+        enerd->dvdl_nonlin[i]  = 0;
+    }
+
+    n2 = ngener*ngener;
+    if (debug)
+    {
+        fprintf(debug, "Creating %d sized group matrix for energies\n", n2);
+    }
+    enerd->grpp.nener         = n2;
+    enerd->foreign_grpp.nener = n2;
+    for (i = 0; (i < egNR); i++)
+    {
+        snew(enerd->grpp.ener[i], n2);
+        snew(enerd->foreign_grpp.ener[i], n2);
+    }
+
+    if (n_lambda)
+    {
+        enerd->n_lambda = 1 + n_lambda;
+        snew(enerd->enerpart_lambda, enerd->n_lambda);
+    }
+    else
+    {
+        enerd->n_lambda = 0;
+    }
+}
+
+void destroy_enerdata(gmx_enerdata_t *enerd)
+{
+    int i;
+
+    for (i = 0; (i < egNR); i++)
+    {
+        sfree(enerd->grpp.ener[i]);
+    }
+
+    for (i = 0; (i < egNR); i++)
+    {
+        sfree(enerd->foreign_grpp.ener[i]);
+    }
+
+    if (enerd->n_lambda)
+    {
+        sfree(enerd->enerpart_lambda);
+    }
+}
+
+static real sum_v(int n, real v[])
+{
+    real t;
+    int  i;
+
+    t = 0.0;
+    for (i = 0; (i < n); i++)
+    {
+        t = t + v[i];
+    }
+
+    return t;
+}
+
+void sum_epot(gmx_grppairener_t *grpp, real *epot)
+{
+    int i;
+
+    /* Accumulate energies */
+    epot[F_COUL_SR]  = sum_v(grpp->nener, grpp->ener[egCOULSR]);
+    epot[F_LJ]       = sum_v(grpp->nener, grpp->ener[egLJSR]);
+    epot[F_LJ14]     = sum_v(grpp->nener, grpp->ener[egLJ14]);
+    epot[F_COUL14]   = sum_v(grpp->nener, grpp->ener[egCOUL14]);
+    /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */
+    epot[F_GBPOL]   += sum_v(grpp->nener, grpp->ener[egGB]);
+
+/* lattice part of LR doesnt belong to any group
+ * and has been added earlier
+ */
+    epot[F_BHAM]     = sum_v(grpp->nener, grpp->ener[egBHAMSR]);
+
+    epot[F_EPOT] = 0;
+    for (i = 0; (i < F_EPOT); i++)
+    {
+        if (i != F_DISRESVIOL && i != F_ORIRESDEV)
+        {
+            epot[F_EPOT] += epot[i];
+        }
+    }
+}
+
+void sum_dhdl(gmx_enerdata_t *enerd, real *lambda, t_lambda *fepvals)
+{
+    int    i, j, index;
+    double dlam;
+
+    enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW];  /* include dispersion correction */
+    enerd->term[F_DVDL]       = 0.0;
+    for (i = 0; i < efptNR; i++)
+    {
+        if (fepvals->separate_dvdl[i])
+        {
+            /* could this be done more readably/compactly? */
+            switch (i)
+            {
+                case (efptMASS):
+                    index = F_DKDL;
+                    break;
+                case (efptCOUL):
+                    index = F_DVDL_COUL;
+                    break;
+                case (efptVDW):
+                    index = F_DVDL_VDW;
+                    break;
+                case (efptBONDED):
+                    index = F_DVDL_BONDED;
+                    break;
+                case (efptRESTRAINT):
+                    index = F_DVDL_RESTRAINT;
+                    break;
+                default:
+                    index = F_DVDL;
+                    break;
+            }
+            enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
+            if (debug)
+            {
+                fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n",
+                        efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]);
+            }
+        }
+        else
+        {
+            enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
+            if (debug)
+            {
+                fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n",
+                        efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]);
+            }
+        }
+    }
+
+    /* Notes on the foreign lambda free energy difference evaluation:
+     * Adding the potential and ekin terms that depend linearly on lambda
+     * as delta lam * dvdl to the energy differences is exact.
+     * For the constraints this is not exact, but we have no other option
+     * without literally changing the lengths and reevaluating the energies at each step.
+     * (try to remedy this post 4.6 - MRS)
+     */
+    if (fepvals->separate_dvdl[efptBONDED])
+    {
+        enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR];
+    }
+    else
+    {
+        enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR];
+    }
+    enerd->term[F_DVDL_CONSTR] = 0;
+
+    for (i = 0; i < fepvals->n_lambda; i++)
+    {
+        /* note we are iterating over fepvals here!
+           For the current lam, dlam = 0 automatically,
+           so we don't need to add anything to the
+           enerd->enerpart_lambda[0] */
+
+        /* we don't need to worry about dvdl_lin contributions to dE at
+           current lambda, because the contributions to the current
+           lambda are automatically zeroed */
+
+        for (j = 0; j < efptNR; j++)
+        {
+            /* Note that this loop is over all dhdl components, not just the separated ones */
+            dlam = (fepvals->all_lambda[j][i]-lambda[j]);
+            enerd->enerpart_lambda[i+1] += dlam*enerd->dvdl_lin[j];
+            if (debug)
+            {
+                fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n",
+                        fepvals->all_lambda[j][i], efpt_names[j],
+                        (enerd->enerpart_lambda[i+1] - enerd->enerpart_lambda[0]),
+                        dlam, enerd->dvdl_lin[j]);
+            }
+        }
+    }
+}
+
+
+void reset_foreign_enerdata(gmx_enerdata_t *enerd)
+{
+    int  i, j;
+
+    /* First reset all foreign energy components. Foreign energies are always
+       calculated on neighbor search steps */
+    for (i = 0; (i < egNR); i++)
+    {
+        for (j = 0; (j < enerd->grpp.nener); j++)
+        {
+            enerd->foreign_grpp.ener[i][j] = 0.0;
+        }
+    }
+
+    /* potential energy components */
+    for (i = 0; (i <= F_EPOT); i++)
+    {
+        enerd->foreign_term[i] = 0.0;
+    }
+}
+
+void reset_enerdata(gmx_enerdata_t *enerd)
+{
+    int      i, j;
+
+    /* First reset all energy components. */
+    for (i = 0; (i < egNR); i++)
+    {
+        for (j = 0; (j < enerd->grpp.nener); j++)
+        {
+            enerd->grpp.ener[i][j] = 0.0;
+        }
+    }
+    for (i = 0; i < efptNR; i++)
+    {
+        enerd->dvdl_lin[i]    = 0.0;
+        enerd->dvdl_nonlin[i] = 0.0;
+    }
+
+    /* Normal potential energy components */
+    for (i = 0; (i <= F_EPOT); i++)
+    {
+        enerd->term[i] = 0.0;
+    }
+    enerd->term[F_DVDL]            = 0.0;
+    enerd->term[F_DVDL_COUL]       = 0.0;
+    enerd->term[F_DVDL_VDW]        = 0.0;
+    enerd->term[F_DVDL_BONDED]     = 0.0;
+    enerd->term[F_DVDL_RESTRAINT]  = 0.0;
+    enerd->term[F_DKDL]            = 0.0;
+    if (enerd->n_lambda > 0)
+    {
+        for (i = 0; i < enerd->n_lambda; i++)
+        {
+            enerd->enerpart_lambda[i] = 0.0;
+        }
+    }
+    /* reset foreign energy data - separate function since we also call it elsewhere */
+    reset_foreign_enerdata(enerd);
+}
diff --git a/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/force.cpp.preplumed b/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/force.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..107e00a8e7b3393ec0eaf42a07f584d29c6ea4eb
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/force.cpp.preplumed
@@ -0,0 +1,899 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "force.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/ewald/ewald.h"
+#include "gromacs/ewald/long-range-correction.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gmxlib/nonbonded/nonbonded.h"
+#include "gromacs/listed-forces/listed-forces.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vecdump.h"
+#include "gromacs/mdlib/forcerec-threading.h"
+#include "gromacs/mdlib/genborn.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/qmmm.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+void ns(FILE              *fp,
+        t_forcerec        *fr,
+        matrix             box,
+        gmx_groups_t      *groups,
+        gmx_localtop_t    *top,
+        t_mdatoms         *md,
+        t_commrec         *cr,
+        t_nrnb            *nrnb,
+        gmx_bool           bFillGrid)
+{
+    int     nsearch;
+
+
+    if (!fr->ns->nblist_initialized)
+    {
+        init_neighbor_list(fp, fr, md->homenr);
+    }
+
+    nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md,
+                                bFillGrid);
+    if (debug)
+    {
+        fprintf(debug, "nsearch = %d\n", nsearch);
+    }
+
+    /* Check whether we have to do dynamic load balancing */
+    /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0))
+       count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr,
+       &(top->idef),opts->ngener);
+     */
+    if (fr->ns->dump_nl > 0)
+    {
+        dump_nblist(fp, cr, fr, fr->ns->dump_nl);
+    }
+}
+
+static void reduce_thread_energies(tensor vir_q, tensor vir_lj,
+                                   real *Vcorr_q, real *Vcorr_lj,
+                                   real *dvdl_q, real *dvdl_lj,
+                                   int nthreads,
+                                   ewald_corr_thread_t *ewc_t)
+{
+    int t;
+
+    for (t = 1; t < nthreads; t++)
+    {
+        *Vcorr_q  += ewc_t[t].Vcorr_q;
+        *Vcorr_lj += ewc_t[t].Vcorr_lj;
+        *dvdl_q   += ewc_t[t].dvdl[efptCOUL];
+        *dvdl_lj  += ewc_t[t].dvdl[efptVDW];
+        m_add(vir_q, ewc_t[t].vir_q, vir_q);
+        m_add(vir_lj, ewc_t[t].vir_lj, vir_lj);
+    }
+}
+
+void do_force_lowlevel(t_forcerec *fr,      t_inputrec *ir,
+                       t_idef     *idef,    t_commrec  *cr,
+                       t_nrnb     *nrnb,    gmx_wallcycle_t wcycle,
+                       t_mdatoms  *md,
+                       rvec       x[],      history_t  *hist,
+                       rvec       f[],
+                       gmx_enerdata_t *enerd,
+                       t_fcdata   *fcd,
+                       gmx_localtop_t *top,
+                       gmx_genborn_t *born,
+                       gmx_bool       bBornRadii,
+                       matrix     box,
+                       t_lambda   *fepvals,
+                       real       *lambda,
+                       t_graph    *graph,
+                       t_blocka   *excl,
+                       rvec       mu_tot[],
+                       int        flags,
+                       float      *cycles_pme)
+{
+    int         i, j;
+    int         donb_flags;
+    gmx_bool    bSB;
+    int         pme_flags;
+    matrix      boxs;
+    rvec        box_size;
+    t_pbc       pbc;
+    real        dvdl_dum[efptNR], dvdl_nb[efptNR];
+
+#if GMX_MPI
+    double  t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */
+#endif
+
+    set_pbc(&pbc, fr->ePBC, box);
+
+    /* reset free energy components */
+    for (i = 0; i < efptNR; i++)
+    {
+        dvdl_nb[i]  = 0;
+        dvdl_dum[i] = 0;
+    }
+
+    /* Reset box */
+    for (i = 0; (i < DIM); i++)
+    {
+        box_size[i] = box[i][i];
+    }
+
+    /* do QMMM first if requested */
+    if (fr->bQMMM)
+    {
+        enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr);
+    }
+
+    /* Call the short range functions all in one go. */
+
+#if GMX_MPI
+    /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/
+#define TAKETIME FALSE
+    if (TAKETIME)
+    {
+        MPI_Barrier(cr->mpi_comm_mygroup);
+        t0 = MPI_Wtime();
+    }
+#endif
+
+    if (ir->nwall)
+    {
+        /* foreign lambda component for walls */
+        real dvdl_walls = do_walls(ir, fr, box, md, x, f, lambda[efptVDW],
+                                   enerd->grpp.ener[egLJSR], nrnb);
+        enerd->dvdl_lin[efptVDW] += dvdl_walls;
+    }
+
+    /* If doing GB, reset dvda and calculate the Born radii */
+    if (ir->implicit_solvent)
+    {
+        wallcycle_sub_start(wcycle, ewcsNONBONDED);
+
+        for (i = 0; i < born->nr; i++)
+        {
+            fr->dvda[i] = 0;
+        }
+
+        if (bBornRadii)
+        {
+            calc_gb_rad(cr, fr, ir, top, x, fr->gblist, born, md, nrnb);
+        }
+
+        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+    }
+
+    where();
+    /* We only do the non-bonded calculation with the group scheme here; the
+     * Verlet calls are done from do_force_cutsVERLET(). */
+    if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED))
+    {
+        donb_flags = 0;
+        /* Add short-range interactions */
+        donb_flags |= GMX_NONBONDED_DO_SR;
+
+        /* Currently all group scheme kernels always calculate (shift-)forces */
+        if (flags & GMX_FORCE_FORCES)
+        {
+            donb_flags |= GMX_NONBONDED_DO_FORCE;
+        }
+        if (flags & GMX_FORCE_VIRIAL)
+        {
+            donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
+        }
+        if (flags & GMX_FORCE_ENERGY)
+        {
+            donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
+        }
+
+        wallcycle_sub_start(wcycle, ewcsNONBONDED);
+        do_nonbonded(fr, x, f, md, excl,
+                     &enerd->grpp, nrnb,
+                     lambda, dvdl_nb, -1, -1, donb_flags);
+
+        /* If we do foreign lambda and we have soft-core interactions
+         * we have to recalculate the (non-linear) energies contributions.
+         */
+        if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
+        {
+            for (i = 0; i < enerd->n_lambda; i++)
+            {
+                real lam_i[efptNR];
+
+                for (j = 0; j < efptNR; j++)
+                {
+                    lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
+                }
+                reset_foreign_enerdata(enerd);
+                do_nonbonded(fr, x, f, md, excl,
+                             &(enerd->foreign_grpp), nrnb,
+                             lam_i, dvdl_dum, -1, -1,
+                             (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA);
+                sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
+                enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
+            }
+        }
+        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+        where();
+    }
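+    /* Note on the soft-core branch above: with sc_alpha != 0 the non-bonded
+     * energies are a non-linear function of lambda, so the foreign-lambda
+     * energies cannot be reconstructed later as dlam*dvdl; the kernel is
+     * therefore re-run at every foreign lambda here, while the purely linear
+     * contributions are still extrapolated in sum_dhdl().
+     */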
+
+    /* If we are doing GB, calculate bonded forces and apply corrections
+     * to the solvation forces */
+    /* MRS: Eventually, may need to include free energy contribution here! */
+    if (ir->implicit_solvent)
+    {
+        wallcycle_sub_start(wcycle, ewcsLISTED);
+        calc_gb_forces(cr, md, born, top, x, f, fr, idef,
+                       ir->gb_algorithm, ir->sa_algorithm, nrnb, &pbc, graph, enerd);
+        wallcycle_sub_stop(wcycle, ewcsLISTED);
+    }
+
+#if GMX_MPI
+    if (TAKETIME)
+    {
+        t1          = MPI_Wtime();
+        fr->t_fnbf += t1-t0;
+    }
+#endif
+
+    if (fepvals->sc_alpha != 0)
+    {
+        enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
+    }
+    else
+    {
+        enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
+    }
+
+    if (fepvals->sc_alpha != 0)
+
+    /* even though the coulomb part is linear, we already added it, because we
+       need to go through the vdw calculation anyway */
+    {
+        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+    else
+    {
+        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+
+    if (debug)
+    {
+        pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS);
+    }
+
+    /* Shift the coordinates. Must be done before listed forces and PPPM,
+     * but is also necessary for SHAKE and update, therefore it can NOT
+     * go when no listed forces have to be evaluated.
+     *
+     * The shifting and PBC code is deliberately not timed, since with
+     * the Verlet scheme it only takes non-zero time with triclinic
+     * boxes, and even then the time is around a factor of 100 less
+     * than the next smallest counter.
+     */
+
+
+    /* Here sometimes we would not need to shift with NBFonly,
+     * but we do so anyhow for consistency of the returned coordinates.
+     */
+    if (graph)
+    {
+        shift_self(graph, box, x);
+        if (TRICLINIC(box))
+        {
+            inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
+        }
+        else
+        {
+            inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
+        }
+    }
+    /* Check whether we need to do listed interactions or correct for exclusions */
+    if (fr->bMolPBC &&
+        ((flags & GMX_FORCE_LISTED)
+         || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)))
+    {
+        /* TODO There are no electrostatics methods that require this
+           transformation, when using the Verlet scheme, so update the
+           above conditional. */
+        /* Since all atoms are in the rectangular or triclinic unit-cell,
+         * only single box vector shifts (2 in x) are required.
+         */
+        set_pbc_dd(&pbc, fr->ePBC, DOMAINDECOMP(cr) ? cr->dd->nc : nullptr,
+                   TRUE, box);
+    }
+
+    do_force_listed(wcycle, box, ir->fepvals, cr->ms,
+                    idef, (const rvec *) x, hist, f, fr,
+                    &pbc, graph, enerd, nrnb, lambda, md, fcd,
+                    DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL,
+                    flags);
+
+    where();
+
+    *cycles_pme = 0;
+    clear_mat(fr->vir_el_recip);
+    clear_mat(fr->vir_lj_recip);
+
+    /* Do long-range electrostatics and/or LJ-PME, including related short-range
+     * corrections.
+     */
+    if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype))
+    {
+        int  status            = 0;
+        real Vlr_q             = 0, Vlr_lj = 0, Vcorr_q = 0, Vcorr_lj = 0;
+        real dvdl_long_range_q = 0, dvdl_long_range_lj = 0;
+
+        bSB = (ir->nwall == 2);
+        if (bSB)
+        {
+            copy_mat(box, boxs);
+            svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
+            box_size[ZZ] *= ir->wall_ewald_zfac;
+        }
+
+        if (EEL_PME_EWALD(fr->eeltype) || EVDW_PME(fr->vdwtype))
+        {
+            real dvdl_long_range_correction_q   = 0;
+            real dvdl_long_range_correction_lj  = 0;
+            /* With the Verlet scheme exclusion forces are calculated
+             * in the non-bonded kernel.
+             */
+            /* The TPI molecule does not have exclusions with the rest
+             * of the system and no intra-molecular PME grid
+             * contributions will be calculated in
+             * gmx_pme_calc_energy.
+             */
+            if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) ||
+                ir->ewald_geometry != eewg3D ||
+                ir->epsilon_surface != 0)
+            {
+                int nthreads, t;
+
+                wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION);
+
+                if (fr->n_tpi > 0)
+                {
+                    gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions");
+                }
+
+                nthreads = fr->nthread_ewc;
+#pragma omp parallel for num_threads(nthreads) schedule(static)
+                for (t = 0; t < nthreads; t++)
+                {
+                    try
+                    {
+                        tensor *vir_q, *vir_lj;
+                        real   *Vcorrt_q, *Vcorrt_lj, *dvdlt_q, *dvdlt_lj;
+                        if (t == 0)
+                        {
+                            vir_q     = &fr->vir_el_recip;
+                            vir_lj    = &fr->vir_lj_recip;
+                            Vcorrt_q  = &Vcorr_q;
+                            Vcorrt_lj = &Vcorr_lj;
+                            dvdlt_q   = &dvdl_long_range_correction_q;
+                            dvdlt_lj  = &dvdl_long_range_correction_lj;
+                        }
+                        else
+                        {
+                            vir_q     = &fr->ewc_t[t].vir_q;
+                            vir_lj    = &fr->ewc_t[t].vir_lj;
+                            Vcorrt_q  = &fr->ewc_t[t].Vcorr_q;
+                            Vcorrt_lj = &fr->ewc_t[t].Vcorr_lj;
+                            dvdlt_q   = &fr->ewc_t[t].dvdl[efptCOUL];
+                            dvdlt_lj  = &fr->ewc_t[t].dvdl[efptVDW];
+                            clear_mat(*vir_q);
+                            clear_mat(*vir_lj);
+                        }
+                        *dvdlt_q  = 0;
+                        *dvdlt_lj = 0;
+
+                        /* Threading is only supported with the Verlet cut-off
+                         * scheme and then only single particle forces (no
+                         * exclusion forces) are calculated, so we can store
+                         * the forces in the normal, single fr->f_novirsum array.
+                         */
+                        ewald_LRcorrection(fr->excl_load[t], fr->excl_load[t+1],
+                                           cr, t, fr,
+                                           md->chargeA, md->chargeB,
+                                           md->sqrt_c6A, md->sqrt_c6B,
+                                           md->sigmaA, md->sigmaB,
+                                           md->sigma3A, md->sigma3B,
+                                           md->nChargePerturbed || md->nTypePerturbed,
+                                           ir->cutoff_scheme != ecutsVERLET,
+                                           excl, x, bSB ? boxs : box, mu_tot,
+                                           ir->ewald_geometry,
+                                           ir->epsilon_surface,
+                                           fr->f_novirsum, *vir_q, *vir_lj,
+                                           Vcorrt_q, Vcorrt_lj,
+                                           lambda[efptCOUL], lambda[efptVDW],
+                                           dvdlt_q, dvdlt_lj);
+                    }
+                    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+                }
+                if (nthreads > 1)
+                {
+                    reduce_thread_energies(fr->vir_el_recip, fr->vir_lj_recip,
+                                           &Vcorr_q, &Vcorr_lj,
+                                           &dvdl_long_range_correction_q,
+                                           &dvdl_long_range_correction_lj,
+                                           nthreads, fr->ewc_t);
+                }
+                wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
+            }
+
+            if (EEL_PME_EWALD(fr->eeltype) && fr->n_tpi == 0)
+            {
+                /* This is not in a subcounter because it takes a
+                   negligible and constant-sized amount of time */
+                Vcorr_q += ewald_charge_correction(cr, fr, lambda[efptCOUL], box,
+                                                   &dvdl_long_range_correction_q,
+                                                   fr->vir_el_recip);
+            }
+
+            enerd->dvdl_lin[efptCOUL] += dvdl_long_range_correction_q;
+            enerd->dvdl_lin[efptVDW]  += dvdl_long_range_correction_lj;
+
+            if ((EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype)) && (cr->duty & DUTY_PME))
+            {
+                /* Do reciprocal PME for Coulomb and/or LJ. */
+                assert(fr->n_tpi >= 0);
+                if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED))
+                {
+                    pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE;
+                    if (EEL_PME(fr->eeltype))
+                    {
+                        pme_flags     |= GMX_PME_DO_COULOMB;
+                    }
+                    if (EVDW_PME(fr->vdwtype))
+                    {
+                        pme_flags |= GMX_PME_DO_LJ;
+                    }
+                    if (flags & GMX_FORCE_FORCES)
+                    {
+                        pme_flags |= GMX_PME_CALC_F;
+                    }
+                    if (flags & GMX_FORCE_VIRIAL)
+                    {
+                        pme_flags |= GMX_PME_CALC_ENER_VIR;
+                    }
+                    if (fr->n_tpi > 0)
+                    {
+                        /* We don't calculate f, but we do want the potential */
+                        pme_flags |= GMX_PME_CALC_POT;
+                    }
+                    wallcycle_start(wcycle, ewcPMEMESH);
+                    status = gmx_pme_do(fr->pmedata,
+                                        0, md->homenr - fr->n_tpi,
+                                        x, fr->f_novirsum,
+                                        md->chargeA, md->chargeB,
+                                        md->sqrt_c6A, md->sqrt_c6B,
+                                        md->sigmaA, md->sigmaB,
+                                        bSB ? boxs : box, cr,
+                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0,
+                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0,
+                                        nrnb, wcycle,
+                                        fr->vir_el_recip, fr->ewaldcoeff_q,
+                                        fr->vir_lj_recip, fr->ewaldcoeff_lj,
+                                        &Vlr_q, &Vlr_lj,
+                                        lambda[efptCOUL], lambda[efptVDW],
+                                        &dvdl_long_range_q, &dvdl_long_range_lj, pme_flags);
+                    *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH);
+                    if (status != 0)
+                    {
+                        gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status);
+                    }
+                    /* We should try to do as little computation after
+                     * this as possible, because parallel PME synchronizes
+                     * the nodes, so we want all load imbalance of the
+                     * rest of the force calculation to be before the PME
+                     * call.  DD load balancing is done on the whole time
+                     * of the force call (without PME).
+                     */
+                }
+                if (fr->n_tpi > 0)
+                {
+                    if (EVDW_PME(ir->vdwtype))
+                    {
+
+                        gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME");
+                    }
+                    /* Determine the PME grid energy of the test molecule
+                     * with the PME grid potential of the other charges.
+                     */
+                    gmx_pme_calc_energy(fr->pmedata, fr->n_tpi,
+                                        x + md->homenr - fr->n_tpi,
+                                        md->chargeA + md->homenr - fr->n_tpi,
+                                        &Vlr_q);
+                }
+            }
+        }
+
+        if (!EEL_PME(fr->eeltype) && EEL_PME_EWALD(fr->eeltype))
+        {
+            Vlr_q = do_ewald(ir, x, fr->f_novirsum,
+                             md->chargeA, md->chargeB,
+                             box_size, cr, md->homenr,
+                             fr->vir_el_recip, fr->ewaldcoeff_q,
+                             lambda[efptCOUL], &dvdl_long_range_q, fr->ewald_table);
+        }
+
+        /* Note that with separate PME nodes we get the real energies later */
+        enerd->dvdl_lin[efptCOUL] += dvdl_long_range_q;
+        enerd->dvdl_lin[efptVDW]  += dvdl_long_range_lj;
+        enerd->term[F_COUL_RECIP]  = Vlr_q + Vcorr_q;
+        enerd->term[F_LJ_RECIP]    = Vlr_lj + Vcorr_lj;
+        if (debug)
+        {
+            fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n",
+                    Vlr_q, Vcorr_q, enerd->term[F_COUL_RECIP]);
+            pr_rvecs(debug, 0, "vir_el_recip after corr", fr->vir_el_recip, DIM);
+            pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS);
+            fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n",
+                    Vlr_lj, Vcorr_lj, enerd->term[F_LJ_RECIP]);
+            pr_rvecs(debug, 0, "vir_lj_recip after corr", fr->vir_lj_recip, DIM);
+        }
+    }
+    else
+    {
+        /* Is there a reaction-field exclusion correction needed?
+         * With the Verlet scheme, exclusion forces are calculated
+         * in the non-bonded kernel.
+         */
+        if (ir->cutoff_scheme != ecutsVERLET && EEL_RF(fr->eeltype))
+        {
+            real dvdl_rf_excl      = 0;
+            enerd->term[F_RF_EXCL] =
+                RF_excl_correction(fr, graph, md, excl, x, f,
+                                   fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl);
+
+            enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl;
+        }
+    }
+    where();
+
+    if (debug)
+    {
+        print_nrnb(debug, nrnb);
+    }
+
+#if GMX_MPI
+    if (TAKETIME)
+    {
+        t2 = MPI_Wtime();
+        MPI_Barrier(cr->mpi_comm_mygroup);
+        t3          = MPI_Wtime();
+        fr->t_wait += t3-t2;
+        if (fr->timesteps == 11)
+        {
+            char buf[22];
+            fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n",
+                    cr->nodeid, gmx_step_str(fr->timesteps, buf),
+                    100*fr->t_wait/(fr->t_wait+fr->t_fnbf),
+                    (fr->t_fnbf+fr->t_wait)/fr->t_fnbf);
+        }
+        fr->timesteps++;
+    }
+#endif
+
+    if (debug)
+    {
+        pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS);
+    }
+
+}
+
+void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd)
+{
+    int i, n2;
+
+    for (i = 0; i < F_NRE; i++)
+    {
+        enerd->term[i]         = 0;
+        enerd->foreign_term[i] = 0;
+    }
+
+
+    for (i = 0; i < efptNR; i++)
+    {
+        enerd->dvdl_lin[i]     = 0;
+        enerd->dvdl_nonlin[i]  = 0;
+    }
+
+    n2 = ngener*ngener;
+    if (debug)
+    {
+        fprintf(debug, "Creating %d sized group matrix for energies\n", n2);
+    }
+    enerd->grpp.nener         = n2;
+    enerd->foreign_grpp.nener = n2;
+    for (i = 0; (i < egNR); i++)
+    {
+        snew(enerd->grpp.ener[i], n2);
+        snew(enerd->foreign_grpp.ener[i], n2);
+    }
+
+    if (n_lambda)
+    {
+        enerd->n_lambda = 1 + n_lambda;
+        snew(enerd->enerpart_lambda, enerd->n_lambda);
+    }
+    else
+    {
+        enerd->n_lambda = 0;
+    }
+}
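+/* Layout note: when n_lambda > 0, enerpart_lambda holds 1 + n_lambda entries;
+ * entry 0 accumulates the potential at the current lambda and entry i+1 the
+ * potential at the i-th foreign lambda, so a foreign-lambda energy difference
+ * is enerpart_lambda[i+1] - enerpart_lambda[0] (cf. the debug output in
+ * sum_dhdl()).
+ */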
+
+void destroy_enerdata(gmx_enerdata_t *enerd)
+{
+    int i;
+
+    for (i = 0; (i < egNR); i++)
+    {
+        sfree(enerd->grpp.ener[i]);
+    }
+
+    for (i = 0; (i < egNR); i++)
+    {
+        sfree(enerd->foreign_grpp.ener[i]);
+    }
+
+    if (enerd->n_lambda)
+    {
+        sfree(enerd->enerpart_lambda);
+    }
+}
+
+static real sum_v(int n, real v[])
+{
+    real t;
+    int  i;
+
+    t = 0.0;
+    for (i = 0; (i < n); i++)
+    {
+        t = t + v[i];
+    }
+
+    return t;
+}
+
+void sum_epot(gmx_grppairener_t *grpp, real *epot)
+{
+    int i;
+
+    /* Accumulate energies */
+    epot[F_COUL_SR]  = sum_v(grpp->nener, grpp->ener[egCOULSR]);
+    epot[F_LJ]       = sum_v(grpp->nener, grpp->ener[egLJSR]);
+    epot[F_LJ14]     = sum_v(grpp->nener, grpp->ener[egLJ14]);
+    epot[F_COUL14]   = sum_v(grpp->nener, grpp->ener[egCOUL14]);
+    /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */
+    epot[F_GBPOL]   += sum_v(grpp->nener, grpp->ener[egGB]);
+
+/* lattice part of LR doesn't belong to any group
+ * and has been added earlier
+ */
+    epot[F_BHAM]     = sum_v(grpp->nener, grpp->ener[egBHAMSR]);
+
+    epot[F_EPOT] = 0;
+    for (i = 0; (i < F_EPOT); i++)
+    {
+        if (i != F_DISRESVIOL && i != F_ORIRESDEV)
+        {
+            epot[F_EPOT] += epot[i];
+        }
+    }
+}
+
+void sum_dhdl(gmx_enerdata_t *enerd, real *lambda, t_lambda *fepvals)
+{
+    int    i, j, index;
+    double dlam;
+
+    enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW];  /* include dispersion correction */
+    enerd->term[F_DVDL]       = 0.0;
+    for (i = 0; i < efptNR; i++)
+    {
+        if (fepvals->separate_dvdl[i])
+        {
+            /* could this be done more readably/compactly? */
+            switch (i)
+            {
+                case (efptMASS):
+                    index = F_DKDL;
+                    break;
+                case (efptCOUL):
+                    index = F_DVDL_COUL;
+                    break;
+                case (efptVDW):
+                    index = F_DVDL_VDW;
+                    break;
+                case (efptBONDED):
+                    index = F_DVDL_BONDED;
+                    break;
+                case (efptRESTRAINT):
+                    index = F_DVDL_RESTRAINT;
+                    break;
+                default:
+                    index = F_DVDL;
+                    break;
+            }
+            enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
+            if (debug)
+            {
+                fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n",
+                        efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]);
+            }
+        }
+        else
+        {
+            enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
+            if (debug)
+            {
+                fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n",
+                        efpt_names[i], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]);
+            }
+        }
+    }
+
+    /* Notes on the foreign lambda free energy difference evaluation:
+     * Adding the potential and ekin terms that depend linearly on lambda
+     * as delta lam * dvdl to the energy differences is exact.
+     * For the constraints this is not exact, but we have no other option
+     * without literally changing the lengths and reevaluating the energies at each step.
+     * (try to remedy this post 4.6 - MRS)
+     */
+    if (fepvals->separate_dvdl[efptBONDED])
+    {
+        enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR];
+    }
+    else
+    {
+        enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR];
+    }
+    enerd->term[F_DVDL_CONSTR] = 0;
+
+    for (i = 0; i < fepvals->n_lambda; i++)
+    {
+        /* note we are iterating over fepvals here!
+           For the current lam, dlam = 0 automatically,
+           so we don't need to add anything to the
+           enerd->enerpart_lambda[0] */
+
+        /* we don't need to worry about dvdl_lin contributions to dE at
+           current lambda, because the contributions to the current
+           lambda are automatically zeroed */
+
+        for (j = 0; j < efptNR; j++)
+        {
+            /* Note that this loop is over all dhdl components, not just the separated ones */
+            dlam = (fepvals->all_lambda[j][i]-lambda[j]);
+            enerd->enerpart_lambda[i+1] += dlam*enerd->dvdl_lin[j];
+            if (debug)
+            {
+                fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n",
+                        fepvals->all_lambda[j][i], efpt_names[j],
+                        (enerd->enerpart_lambda[i+1] - enerd->enerpart_lambda[0]),
+                        dlam, enerd->dvdl_lin[j]);
+            }
+        }
+    }
+}
+
+
+void reset_foreign_enerdata(gmx_enerdata_t *enerd)
+{
+    int  i, j;
+
+    /* First reset all foreign energy components. Foreign energies are always
+       calculated on neighbor search steps */
+    for (i = 0; (i < egNR); i++)
+    {
+        for (j = 0; (j < enerd->grpp.nener); j++)
+        {
+            enerd->foreign_grpp.ener[i][j] = 0.0;
+        }
+    }
+
+    /* potential energy components */
+    for (i = 0; (i <= F_EPOT); i++)
+    {
+        enerd->foreign_term[i] = 0.0;
+    }
+}
+
+void reset_enerdata(gmx_enerdata_t *enerd)
+{
+    int      i, j;
+
+    /* First reset all energy components. */
+    for (i = 0; (i < egNR); i++)
+    {
+        for (j = 0; (j < enerd->grpp.nener); j++)
+        {
+            enerd->grpp.ener[i][j] = 0.0;
+        }
+    }
+    for (i = 0; i < efptNR; i++)
+    {
+        enerd->dvdl_lin[i]    = 0.0;
+        enerd->dvdl_nonlin[i] = 0.0;
+    }
+
+    /* Normal potential energy components */
+    for (i = 0; (i <= F_EPOT); i++)
+    {
+        enerd->term[i] = 0.0;
+    }
+    enerd->term[F_DVDL]            = 0.0;
+    enerd->term[F_DVDL_COUL]       = 0.0;
+    enerd->term[F_DVDL_VDW]        = 0.0;
+    enerd->term[F_DVDL_BONDED]     = 0.0;
+    enerd->term[F_DVDL_RESTRAINT]  = 0.0;
+    enerd->term[F_DKDL]            = 0.0;
+    if (enerd->n_lambda > 0)
+    {
+        for (i = 0; i < enerd->n_lambda; i++)
+        {
+            enerd->enerpart_lambda[i] = 0.0;
+        }
+    }
+    /* reset foreign energy data - separate function since we also call it elsewhere */
+    reset_foreign_enerdata(enerd);
+}
diff --git a/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/minimize.cpp b/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/minimize.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3ce7832812fc7b72d446ab0c9985e23832d66fda
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/minimize.cpp
@@ -0,0 +1,3128 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file defines integrators for energy minimization
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "minimize.h"
+
+#include "config.h"
+
+#include <cmath>
+#include <cstring>
+#include <ctime>
+
+#include <algorithm>
+#include <vector>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/fileio/confio.h"
+#include "gromacs/fileio/mtxio.h"
+#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/linearalgebra/sparsematrix.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdebin.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/shellfc.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/tgroup.h"
+#include "gromacs/mdlib/trajectory_writing.h"
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+extern void(*plumedcmd)(plumed,const char*,const void*);
+/* END PLUMED */
+
+#include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+//! Utility structure for manipulating states during EM
+typedef struct {
+    //! Copy of the global state
+    t_state  s;
+    //! Force array
+    rvec    *f;
+    //! Potential energy
+    real     epot;
+    //! Norm of the force
+    real     fnorm;
+    //! Maximum force
+    real     fmax;
+    //! Atom index of the maximum force
+    int      a_fmax;
+} em_state_t;
+
+//! Initiate em_state_t structure and return pointer to it
+static em_state_t *init_em_state()
+{
+    em_state_t *ems;
+
+    snew(ems, 1);
+
+    /* does this need to be here?  Should the array be declared differently (statically) in the state definition? */
+    snew(ems->s.lambda, efptNR);
+
+    return ems;
+}
+
+//! Print the EM starting conditions
+static void print_em_start(FILE                     *fplog,
+                           t_commrec                *cr,
+                           gmx_walltime_accounting_t walltime_accounting,
+                           gmx_wallcycle_t           wcycle,
+                           const char               *name)
+{
+    walltime_accounting_start(walltime_accounting);
+    wallcycle_start(wcycle, ewcRUN);
+    print_start(fplog, cr, walltime_accounting, name);
+}
+
+//! Stop counting time for EM
+static void em_time_end(gmx_walltime_accounting_t walltime_accounting,
+                        gmx_wallcycle_t           wcycle)
+{
+    wallcycle_stop(wcycle, ewcRUN);
+
+    walltime_accounting_end(walltime_accounting);
+}
+
+//! Print a log file and console header
+static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps)
+{
+    fprintf(out, "\n");
+    fprintf(out, "%s:\n", minimizer);
+    fprintf(out, "   Tolerance (Fmax)   = %12.5e\n", ftol);
+    fprintf(out, "   Number of steps    = %12d\n", nsteps);
+}
+
+//! Print warning message
+static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain)
+{
+    char buffer[2048];
+    if (bLastStep)
+    {
+        sprintf(buffer,
+                "\nEnergy minimization reached the maximum number "
+                "of steps before the forces reached the requested "
+                "precision Fmax < %g.\n", ftol);
+    }
+    else
+    {
+        sprintf(buffer,
+                "\nEnergy minimization has stopped, but the forces have "
+                "not converged to the requested precision Fmax < %g (which "
+                "may not be possible for your system). It stopped "
+                "because the algorithm tried to make a new step whose size "
+                "was too small, or there was no change in the energy since "
+                "last step. Either way, we regard the minimization as "
+                "converged to within the available machine precision, "
+                "given your starting configuration and EM parameters.\n%s%s",
+                ftol,
+                sizeof(real) < sizeof(double) ?
+                "\nDouble precision normally gives you higher accuracy, but "
+                "this is often not needed for preparing to run molecular "
+                "dynamics.\n" :
+                "",
+                bConstrain ?
+                "You might need to increase your constraint accuracy, or turn\n"
+                "off constraints altogether (set constraints = none in mdp file)\n" :
+                "");
+    }
+    fputs(wrap_lines(buffer, 78, 0, FALSE), fp);
+}
+
+//! Print message about convergence of the EM
+static void print_converged(FILE *fp, const char *alg, real ftol,
+                            gmx_int64_t count, gmx_bool bDone, gmx_int64_t nsteps,
+                            real epot, real fmax, int nfmax, real fnorm)
+{
+    char buf[STEPSTRSIZE];
+
+    if (bDone)
+    {
+        fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n",
+                alg, ftol, gmx_step_str(count, buf));
+    }
+    else if (count < nsteps)
+    {
+        fprintf(fp, "\n%s converged to machine precision in %s steps,\n"
+                "but did not reach the requested Fmax < %g.\n",
+                alg, gmx_step_str(count, buf), ftol);
+    }
+    else
+    {
+        fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n",
+                alg, ftol, gmx_step_str(count, buf));
+    }
+
+#if GMX_DOUBLE
+    fprintf(fp, "Potential Energy  = %21.14e\n", epot);
+    fprintf(fp, "Maximum force     = %21.14e on atom %d\n", fmax, nfmax+1);
+    fprintf(fp, "Norm of force     = %21.14e\n", fnorm);
+#else
+    fprintf(fp, "Potential Energy  = %14.7e\n", epot);
+    fprintf(fp, "Maximum force     = %14.7e on atom %d\n", fmax, nfmax+1);
+    fprintf(fp, "Norm of force     = %14.7e\n", fnorm);
+#endif
+}
+
+//! Compute the norm and max of the force array in parallel
+static void get_f_norm_max(t_commrec *cr,
+                           t_grpopts *opts, t_mdatoms *mdatoms, rvec *f,
+                           real *fnorm, real *fmax, int *a_fmax)
+{
+    double fnorm2, *sum;
+    real   fmax2, fam;
+    int    la_max, a_max, start, end, i, m, gf;
+
+    /* This routine finds the largest force and returns it.
+     * On parallel machines the global max is taken.
+     */
+    fnorm2 = 0;
+    fmax2  = 0;
+    la_max = -1;
+    start  = 0;
+    end    = mdatoms->homenr;
+    if (mdatoms->cFREEZE)
+    {
+        for (i = start; i < end; i++)
+        {
+            gf  = mdatoms->cFREEZE[i];
+            fam = 0;
+            for (m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    fam += gmx::square(f[i][m]);
+                }
+            }
+            fnorm2 += fam;
+            if (fam > fmax2)
+            {
+                fmax2  = fam;
+                la_max = i;
+            }
+        }
+    }
+    else
+    {
+        for (i = start; i < end; i++)
+        {
+            fam     = norm2(f[i]);
+            fnorm2 += fam;
+            if (fam > fmax2)
+            {
+                fmax2  = fam;
+                la_max = i;
+            }
+        }
+    }
+
+    if (la_max >= 0 && DOMAINDECOMP(cr))
+    {
+        a_max = cr->dd->gatindex[la_max];
+    }
+    else
+    {
+        a_max = la_max;
+    }
+    if (PAR(cr))
+    {
+        snew(sum, 2*cr->nnodes+1);
+        sum[2*cr->nodeid]   = fmax2;
+        sum[2*cr->nodeid+1] = a_max;
+        sum[2*cr->nnodes]   = fnorm2;
+        gmx_sumd(2*cr->nnodes+1, sum, cr);
+        fnorm2 = sum[2*cr->nnodes];
+        /* Determine the global maximum */
+        for (i = 0; i < cr->nnodes; i++)
+        {
+            if (sum[2*i] > fmax2)
+            {
+                fmax2 = sum[2*i];
+                a_max = (int)(sum[2*i+1] + 0.5);
+            }
+        }
+        sfree(sum);
+    }
+
+    if (fnorm)
+    {
+        *fnorm = sqrt(fnorm2);
+    }
+    if (fmax)
+    {
+        *fmax  = sqrt(fmax2);
+    }
+    if (a_fmax)
+    {
+        *a_fmax = a_max;
+    }
+}
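+/* Sketch of the reduction used above: each rank packs its local values into
+ * one buffer so a single gmx_sumd() call suffices; with two ranks the summed
+ * buffer is { fmax2_rank0, a_max_rank0, fmax2_rank1, a_max_rank1, fnorm2_total },
+ * after which every rank scans the fmax2 entries to find the global maximum.
+ */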
+
+//! Compute the norm and max of the force for an EM state
+static void get_state_f_norm_max(t_commrec *cr,
+                                 t_grpopts *opts, t_mdatoms *mdatoms,
+                                 em_state_t *ems)
+{
+    get_f_norm_max(cr, opts, mdatoms, ems->f, &ems->fnorm, &ems->fmax, &ems->a_fmax);
+}
+
+//! Initialize the energy minimization
+void init_em(FILE *fplog, const char *title,
+             t_commrec *cr, t_inputrec *ir,
+             t_state *state_global, gmx_mtop_t *top_global,
+             em_state_t *ems, gmx_localtop_t **top,
+             rvec **f,
+             t_nrnb *nrnb, rvec mu_tot,
+             t_forcerec *fr, gmx_enerdata_t **enerd,
+             t_graph **graph, t_mdatoms *mdatoms, gmx_global_stat_t *gstat,
+             gmx_vsite_t *vsite, gmx_constr_t constr,
+             int nfile, const t_filenm fnm[],
+             gmx_mdoutf_t *outf, t_mdebin **mdebin,
+             int imdport, unsigned long gmx_unused Flags,
+             gmx_wallcycle_t wcycle)
+{
+    int  i;
+    real dvdl_constr;
+
+    if (fplog)
+    {
+        fprintf(fplog, "Initiating %s\n", title);
+    }
+
+    state_global->ngtc = 0;
+
+    /* Initialize lambda variables */
+    initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, NULL);
+
+    init_nrnb(nrnb);
+
+    /* Interactive molecular dynamics */
+    init_IMD(ir, cr, top_global, fplog, 1, state_global->x,
+             nfile, fnm, NULL, imdport, Flags);
+
+    if (DOMAINDECOMP(cr))
+    {
+        *top = dd_init_local_top(top_global);
+
+        dd_init_local_state(cr->dd, state_global, &ems->s);
+
+        *f = NULL;
+
+        /* Distribute the charge groups over the nodes from the master node */
+        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
+                            state_global, top_global, ir,
+                            &ems->s, &ems->f, mdatoms, *top,
+                            fr, vsite, constr,
+                            nrnb, NULL, FALSE);
+        dd_store_state(cr->dd, &ems->s);
+
+        *graph = NULL;
+    }
+    else
+    {
+        snew(*f, top_global->natoms);
+
+        /* Just copy the state */
+        ems->s = *state_global;
+        /* We need to allocate one element extra, since we might use
+         * (unaligned) 4-wide SIMD loads to access rvec entries.
+         */
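+        /* Concretely: an rvec holds 3 reals, so a 4-wide load that starts at
+         * the last rvec reads one real past the end of the array; the extra
+         * element allocated below provides that padding.
+         */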
+        snew(ems->s.x, ems->s.nalloc + 1);
+        snew(ems->f, ems->s.nalloc+1);
+        snew(ems->s.v, ems->s.nalloc+1);
+        for (i = 0; i < state_global->natoms; i++)
+        {
+            copy_rvec(state_global->x[i], ems->s.x[i]);
+        }
+        copy_mat(state_global->box, ems->s.box);
+
+        *top      = gmx_mtop_generate_local_top(top_global, ir->efep != efepNO);
+
+        forcerec_set_excl_load(fr, *top);
+
+        setup_bonded_threading(fr, &(*top)->idef);
+
+        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
+        {
+            *graph = mk_graph(fplog, &((*top)->idef), 0, top_global->natoms, FALSE, FALSE);
+        }
+        else
+        {
+            *graph = NULL;
+        }
+
+        atoms2md(top_global, ir, 0, NULL, top_global->natoms, mdatoms);
+        update_mdatoms(mdatoms, state_global->lambda[efptFEP]);
+
+        if (vsite)
+        {
+            set_vsite_top(vsite, *top, mdatoms, cr);
+        }
+    }
+
+    if (constr)
+    {
+        if (ir->eConstrAlg == econtSHAKE &&
+            gmx_mtop_ftype_count(top_global, F_CONSTR) > 0)
+        {
+            gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n",
+                      econstr_names[econtSHAKE], econstr_names[econtLINCS]);
+        }
+
+        if (!DOMAINDECOMP(cr))
+        {
+            set_constraints(constr, *top, ir, mdatoms, cr);
+        }
+
+        if (!ir->bContinuation)
+        {
+            /* Constrain the starting coordinates */
+            dvdl_constr = 0;
+            constrain(PAR(cr) ? NULL : fplog, TRUE, TRUE, constr, &(*top)->idef,
+                      ir, cr, -1, 0, 1.0, mdatoms,
+                      ems->s.x, ems->s.x, NULL, fr->bMolPBC, ems->s.box,
+                      ems->s.lambda[efptFEP], &dvdl_constr,
+                      NULL, NULL, nrnb, econqCoord);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        *gstat = global_stat_init(ir);
+    }
+    else
+    {
+        *gstat = NULL;
+    }
+
+    *outf = init_mdoutf(fplog, nfile, fnm, 0, cr, ir, top_global, NULL, wcycle);
+
+    snew(*enerd, 1);
+    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+                  *enerd);
+
+    if (mdebin != NULL)
+    {
+        /* Init bin for energy stuff */
+        *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, NULL);
+    }
+
+    clear_rvec(mu_tot);
+    calc_shifts(ems->s.box, fr->shift_vec);
+
+    /* PLUMED */
+    if(plumedswitch){
+      if(cr->ms && cr->ms->nsim>1) {
+        if(MASTER(cr)) (*plumedcmd) (plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters);
+        if(PAR(cr)){
+          if(DOMAINDECOMP(cr)) {
+            (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all);
+          }else{
+            (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim);
+          }
+        }
+        (*plumedcmd) (plumedmain,"GREX init",NULL);
+      }
+      if(PAR(cr)){
+        if(DOMAINDECOMP(cr)) {
+          (*plumedcmd) (plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
+        }else{
+          (*plumedcmd) (plumedmain,"setMPIComm",&cr->mpi_comm_mysim);
+        }
+      }
+      (*plumedcmd) (plumedmain,"setNatoms",&top_global->natoms);
+      (*plumedcmd) (plumedmain,"setMDEngine","gromacs");
+      (*plumedcmd) (plumedmain,"setLog",fplog);
+      real real_delta_t;
+      real_delta_t=ir->delta_t;
+      (*plumedcmd) (plumedmain,"setTimestep",&real_delta_t);
+      (*plumedcmd) (plumedmain,"init",NULL);
+
+      if(PAR(cr)){
+        if(DOMAINDECOMP(cr)) {
+          (*plumedcmd) (plumedmain,"setAtomsNlocal",&cr->dd->nat_home);
+          (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->gatindex);
+        }
+      }
+    }
+    /* END PLUMED */
+}
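+/* The PLUMED block above drives the kernel through its command interface:
+ * plumedcmd has the plumed_cmd() signature (plumed instance, key string,
+ * pointer to the value), so each setting is passed by key, for example
+ *
+ *   (*plumedcmd) (plumedmain, "setNatoms", &top_global->natoms);
+ *   (*plumedcmd) (plumedmain, "init", NULL);
+ *
+ * A NULL pointer is passed for commands that carry no payload.
+ */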
+
+//! Finalize the minimization
+static void finish_em(t_commrec *cr, gmx_mdoutf_t outf,
+                      gmx_walltime_accounting_t walltime_accounting,
+                      gmx_wallcycle_t wcycle)
+{
+    if (!(cr->duty & DUTY_PME))
+    {
+        /* Tell the PME only node to finish */
+        gmx_pme_send_finish(cr);
+    }
+
+    done_mdoutf(outf);
+
+    em_time_end(walltime_accounting, wcycle);
+}
+
+//! Swap two different EM states during minimization
+static void swap_em_state(em_state_t *ems1, em_state_t *ems2)
+{
+    em_state_t tmp;
+
+    tmp   = *ems1;
+    *ems1 = *ems2;
+    *ems2 = tmp;
+}
+
+//! Copy coordinates from an EM state to a "normal" state structure
+static void copy_em_coords(em_state_t *ems, t_state *state)
+{
+    int i;
+
+    for (i = 0; (i < state->natoms); i++)
+    {
+        copy_rvec(ems->s.x[i], state->x[i]);
+    }
+}
+
+//! Save the EM trajectory
+static void write_em_traj(FILE *fplog, t_commrec *cr,
+                          gmx_mdoutf_t outf,
+                          gmx_bool bX, gmx_bool bF, const char *confout,
+                          gmx_mtop_t *top_global,
+                          t_inputrec *ir, gmx_int64_t step,
+                          em_state_t *state,
+                          t_state *state_global)
+{
+    int      mdof_flags;
+    gmx_bool bIMDout = FALSE;
+
+
+    /* Shall we do IMD output? */
+    if (ir->bIMD)
+    {
+        bIMDout = do_per_step(step, IMD_get_step(ir->imd->setup));
+    }
+
+    if ((bX || bF || bIMDout || confout != NULL) && !DOMAINDECOMP(cr))
+    {
+        copy_em_coords(state, state_global);
+    }
+
+    mdof_flags = 0;
+    if (bX)
+    {
+        mdof_flags |= MDOF_X;
+    }
+    if (bF)
+    {
+        mdof_flags |= MDOF_F;
+    }
+
+    /* If we want IMD output, set appropriate MDOF flag */
+    if (ir->bIMD)
+    {
+        mdof_flags |= MDOF_IMD;
+    }
+
+    mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags,
+                                     top_global, step, (double)step,
+                                     &state->s, state_global, state->f);
+
+    if (confout != NULL && MASTER(cr))
+    {
+        if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr))
+        {
+            /* Make molecules whole only for confout writing */
+            do_pbc_mtop(fplog, ir->ePBC, state_global->box, top_global,
+                        state_global->x);
+        }
+
+        write_sto_conf_mtop(confout,
+                            *top_global->name, top_global,
+                            state_global->x, NULL, ir->ePBC, state_global->box);
+    }
+}
+
+//! Do one minimization step
+static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md,
+                       gmx_bool bMolPBC,
+                       em_state_t *ems1, real a, rvec *f, em_state_t *ems2,
+                       gmx_constr_t constr, gmx_localtop_t *top,
+                       t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                       gmx_int64_t count)
+
+{
+    t_state *s1, *s2;
+    int      i;
+    int      start, end;
+    rvec    *x1, *x2;
+    real     dvdl_constr;
+    int      nthreads gmx_unused;
+
+    s1 = &ems1->s;
+    s2 = &ems2->s;
+
+    if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count)
+    {
+        gmx_incons("state mismatch in do_em_step");
+    }
+
+    s2->flags = s1->flags;
+
+    if (s2->nalloc != s1->nalloc)
+    {
+        s2->nalloc = s1->nalloc;
+        /* We need to allocate one element extra, since we might use
+         * (unaligned) 4-wide SIMD loads to access rvec entries.
+         */
+        srenew(s2->x, s1->nalloc + 1);
+        srenew(ems2->f,  s1->nalloc);
+        if (s2->flags & (1<<estCGP))
+        {
+            srenew(s2->cg_p,  s1->nalloc + 1);
+        }
+    }
+
+    s2->natoms = s1->natoms;
+    copy_mat(s1->box, s2->box);
+    /* Copy free energy state */
+    for (i = 0; i < efptNR; i++)
+    {
+        s2->lambda[i] = s1->lambda[i];
+    }
+    copy_mat(s1->box, s2->box);
+
+    start = 0;
+    end   = md->homenr;
+
+    x1 = s1->x;
+    x2 = s2->x;
+
+    // cppcheck-suppress unreadVariable
+    nthreads = gmx_omp_nthreads_get(emntUpdate);
+#pragma omp parallel num_threads(nthreads)
+    {
+        int gf, i, m;
+
+        gf = 0;
+#pragma omp for schedule(static) nowait
+        for (i = start; i < end; i++)
+        {
+            try
+            {
+                if (md->cFREEZE)
+                {
+                    gf = md->cFREEZE[i];
+                }
+                for (m = 0; m < DIM; m++)
+                {
+                    if (ir->opts.nFreeze[gf][m])
+                    {
+                        x2[i][m] = x1[i][m];
+                    }
+                    else
+                    {
+                        x2[i][m] = x1[i][m] + a*f[i][m];
+                    }
+                }
+            }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+        }
+
+        if (s2->flags & (1<<estCGP))
+        {
+            /* Copy the CG p vector */
+            x1 = s1->cg_p;
+            x2 = s2->cg_p;
+#pragma omp for schedule(static) nowait
+            for (i = start; i < end; i++)
+            {
+                // Trivial OpenMP block that does not throw
+                copy_rvec(x1[i], x2[i]);
+            }
+        }
+
+        if (DOMAINDECOMP(cr))
+        {
+            s2->ddp_count = s1->ddp_count;
+            if (s2->cg_gl_nalloc < s1->cg_gl_nalloc)
+            {
+#pragma omp barrier
+                s2->cg_gl_nalloc = s1->cg_gl_nalloc;
+                try
+                {
+                    /* We need to allocate one element extra, since we might use
+                     * (unaligned) 4-wide SIMD loads to access rvec entries.
+                     */
+                    srenew(s2->cg_gl, s2->cg_gl_nalloc + 1);
+                }
+                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+#pragma omp barrier
+            }
+            s2->ncg_gl = s1->ncg_gl;
+#pragma omp for schedule(static) nowait
+            for (i = 0; i < s2->ncg_gl; i++)
+            {
+                s2->cg_gl[i] = s1->cg_gl[i];
+            }
+            s2->ddp_count_cg_gl = s1->ddp_count_cg_gl;
+        }
+    }
+
+    if (constr)
+    {
+        wallcycle_start(wcycle, ewcCONSTR);
+        dvdl_constr = 0;
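+        /* Constrain the new trial coordinates s2->x against the reference
+         * coordinates s1->x (econqCoord), so that the step stays on the
+         * constraint manifold.
+         */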
+        constrain(NULL, TRUE, TRUE, constr, &top->idef,
+                  ir, cr, count, 0, 1.0, md,
+                  s1->x, s2->x, NULL, bMolPBC, s2->box,
+                  s2->lambda[efptBONDED], &dvdl_constr,
+                  NULL, NULL, nrnb, econqCoord);
+        wallcycle_stop(wcycle, ewcCONSTR);
+    }
+}
+
+//! Prepare EM for using domain decomposition parallelization
+static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr,
+                                   gmx_mtop_t *top_global, t_inputrec *ir,
+                                   em_state_t *ems, gmx_localtop_t *top,
+                                   t_mdatoms *mdatoms, t_forcerec *fr,
+                                   gmx_vsite_t *vsite, gmx_constr_t constr,
+                                   t_nrnb *nrnb, gmx_wallcycle_t wcycle)
+{
+    /* Repartition the domain decomposition */
+    dd_partition_system(fplog, step, cr, FALSE, 1,
+                        NULL, top_global, ir,
+                        &ems->s, &ems->f,
+                        mdatoms, top, fr, vsite, constr,
+                        nrnb, wcycle, FALSE);
+    dd_store_state(cr->dd, &ems->s);
+}
+
+//! Do one energy evaluation
+static void evaluate_energy(FILE *fplog, t_commrec *cr,
+                            gmx_mtop_t *top_global,
+                            em_state_t *ems, gmx_localtop_t *top,
+                            t_inputrec *inputrec,
+                            t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                            gmx_global_stat_t gstat,
+                            gmx_vsite_t *vsite, gmx_constr_t constr,
+                            t_fcdata *fcd,
+                            t_graph *graph, t_mdatoms *mdatoms,
+                            t_forcerec *fr, rvec mu_tot,
+                            gmx_enerdata_t *enerd, tensor vir, tensor pres,
+                            gmx_int64_t count, gmx_bool bFirst)
+{
+    real     t;
+    gmx_bool bNS;
+    tensor   force_vir, shake_vir, ekin;
+    real     dvdl_constr, prescorr, enercorr, dvdlcorr;
+    real     terminate = 0;
+
+    /* Set the time to the initial time, the time does not change during EM */
+    t = inputrec->init_t;
+
+    if (bFirst ||
+        (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count))
+    {
+        /* This is the first state or an old state used before the last ns */
+        bNS = TRUE;
+    }
+    else
+    {
+        bNS = FALSE;
+        if (inputrec->nstlist > 0)
+        {
+            bNS = TRUE;
+        }
+    }
+
+    if (vsite)
+    {
+        construct_vsites(vsite, ems->s.x, 1, NULL,
+                         top->idef.iparams, top->idef.il,
+                         fr->ePBC, fr->bMolPBC, cr, ems->s.box);
+    }
+
+    if (DOMAINDECOMP(cr) && bNS)
+    {
+        /* Repartition the domain decomposition */
+        em_dd_partition_system(fplog, count, cr, top_global, inputrec,
+                               ems, top, mdatoms, fr, vsite, constr,
+                               nrnb, wcycle);
+    }
+
+    /* Calc force & energy on new trial position  */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole in congrad.c
+     */
+    /* PLUMED */
+    int plumedNeedsEnergy=0;
+    matrix plumed_vir;
+    if(plumedswitch){
+      long int lstep=count; (*plumedcmd)(plumedmain,"setStepLong",&lstep);
+      (*plumedcmd) (plumedmain,"setPositions",&ems->s.x[0][0]);
+      (*plumedcmd) (plumedmain,"setMasses",&mdatoms->massT[0]);
+      (*plumedcmd) (plumedmain,"setCharges",&mdatoms->chargeA[0]);
+      (*plumedcmd) (plumedmain,"setBox",&ems->s.box[0][0]);
+      (*plumedcmd) (plumedmain,"prepareCalc",NULL);
+      (*plumedcmd) (plumedmain,"setForces",&ems->f[0][0]);
+      (*plumedcmd) (plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
+      clear_mat(plumed_vir);
+      (*plumedcmd) (plumedmain,"setVirial",&plumed_vir[0][0]);
+    }
+    /* END PLUMED */
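+    /* At this point PLUMED has been given the step number, positions, masses,
+     * charges, box, the force buffer and plumed_vir for its virial
+     * contribution. plumedNeedsEnergy tells us whether the bias depends on
+     * the total potential energy, in which case performCalc is deferred
+     * until after do_force below, when the energy is available. */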
+
+    do_force(fplog, cr, inputrec,
+             count, nrnb, wcycle, top, &top_global->groups,
+             ems->s.box, ems->s.x, &ems->s.hist,
+             ems->f, force_vir, mdatoms, enerd, fcd,
+             ems->s.lambda, graph, fr, vsite, mu_tot, t, NULL, NULL, TRUE,
+             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES |
+             GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY |
+             (bNS ? GMX_FORCE_NS : 0));
+    /* PLUMED */
+    if(plumedswitch){
+      if(plumedNeedsEnergy) {
+        msmul(force_vir,2.0,plumed_vir);
+        (*plumedcmd) (plumedmain,"setEnergy",&enerd->term[F_EPOT]);
+        (*plumedcmd) (plumedmain,"performCalc",NULL);
+        msmul(plumed_vir,0.5,force_vir);
+      } else {
+        msmul(plumed_vir,0.5,plumed_vir);
+        m_add(force_vir,plumed_vir,force_vir);
+      }
+    }
+    /* END PLUMED */
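+    /* Virial bookkeeping for the energy-dependent branch above: plumed_vir is
+     * seeded with twice force_vir (GROMACS works with half the virial),
+     * PLUMED adds its full-virial contribution during performCalc, and the
+     * result is halved back into force_vir. In the other branch the
+     * contribution already accumulated in plumed_vir is halved and added. */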
+
+    /* Clear the unused shake virial and pressure */
+    clear_mat(shake_vir);
+    clear_mat(pres);
+
+    /* Communicate stuff when parallel */
+    if (PAR(cr) && inputrec->eI != eiNM)
+    {
+        wallcycle_start(wcycle, ewcMoveE);
+
+        global_stat(gstat, cr, enerd, force_vir, shake_vir, mu_tot,
+                    inputrec, NULL, NULL, NULL, 1, &terminate,
+                    NULL, FALSE,
+                    CGLO_ENERGY |
+                    CGLO_PRESSURE |
+                    CGLO_CONSTRAINT);
+
+        wallcycle_stop(wcycle, ewcMoveE);
+    }
+
+    /* Calculate long range corrections to pressure and energy */
+    calc_dispcorr(inputrec, fr, ems->s.box, ems->s.lambda[efptVDW],
+                  pres, force_vir, &prescorr, &enercorr, &dvdlcorr);
+    enerd->term[F_DISPCORR] = enercorr;
+    enerd->term[F_EPOT]    += enercorr;
+    enerd->term[F_PRES]    += prescorr;
+    enerd->term[F_DVDL]    += dvdlcorr;
+
+    ems->epot = enerd->term[F_EPOT];
+
+    if (constr)
+    {
+        /* Project out the constraint components of the force */
+        wallcycle_start(wcycle, ewcCONSTR);
+        dvdl_constr = 0;
+        constrain(NULL, FALSE, FALSE, constr, &top->idef,
+                  inputrec, cr, count, 0, 1.0, mdatoms,
+                  ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box,
+                  ems->s.lambda[efptBONDED], &dvdl_constr,
+                  NULL, &shake_vir, nrnb, econqForceDispl);
+        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+        m_add(force_vir, shake_vir, vir);
+        wallcycle_stop(wcycle, ewcCONSTR);
+    }
+    else
+    {
+        copy_mat(force_vir, vir);
+    }
+
+    clear_mat(ekin);
+    enerd->term[F_PRES] =
+        calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres);
+
+    sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals);
+
+    if (EI_ENERGY_MINIMIZATION(inputrec->eI))
+    {
+        get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, ems);
+    }
+}
+
+//! Parallel utility that reorders forces across domains and computes the Polak-Ribiere partial sum
+static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
+                              gmx_mtop_t *top_global,
+                              em_state_t *s_min, em_state_t *s_b)
+{
+    rvec          *fm, *fb, *fmg;
+    t_block       *cgs_gl;
+    int            ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m;
+    double         partsum;
+    unsigned char *grpnrFREEZE;
+
+    if (debug)
+    {
+        fprintf(debug, "Doing reorder_partsum\n");
+    }
+
+    fm = s_min->f;
+    fb = s_b->f;
+
+    cgs_gl = dd_charge_groups_global(cr->dd);
+    index  = cgs_gl->index;
+
+    /* Collect fm in a global vector fmg.
+     * This conflicts with the spirit of domain decomposition,
+     * but to fully optimize this a much more complicated algorithm is required.
+     */
+    snew(fmg, top_global->natoms);
+
+    ncg   = s_min->s.ncg_gl;
+    cg_gl = s_min->s.cg_gl;
+    i     = 0;
+    for (c = 0; c < ncg; c++)
+    {
+        cg = cg_gl[c];
+        a0 = index[cg];
+        a1 = index[cg+1];
+        for (a = a0; a < a1; a++)
+        {
+            copy_rvec(fm[i], fmg[a]);
+            i++;
+        }
+    }
+    gmx_sum(top_global->natoms*3, fmg[0], cr);
+
+    /* Now we will determine the part of the sum for the cgs in state s_b */
+    ncg         = s_b->s.ncg_gl;
+    cg_gl       = s_b->s.cg_gl;
+    partsum     = 0;
+    i           = 0;
+    gf          = 0;
+    grpnrFREEZE = top_global->groups.grpnr[egcFREEZE];
+    for (c = 0; c < ncg; c++)
+    {
+        cg = cg_gl[c];
+        a0 = index[cg];
+        a1 = index[cg+1];
+        for (a = a0; a < a1; a++)
+        {
+            if (mdatoms->cFREEZE && grpnrFREEZE)
+            {
+                gf = grpnrFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    partsum += (fb[i][m] - fmg[a][m])*fb[i][m];
+                }
+            }
+            i++;
+        }
+    }
+
+    sfree(fmg);
+
+    return partsum;
+}
+
+//! Compute the beta coefficient for the Polak-Ribiere conjugate-gradient update
+static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
+                    gmx_mtop_t *top_global,
+                    em_state_t *s_min, em_state_t *s_b)
+{
+    rvec  *fm, *fb;
+    double sum;
+    int    gf, i, m;
+
+    /* This is just the classical Polak-Ribiere calculation of beta;
+     * it looks a bit complicated since we take freeze groups into account,
+     * and might have to sum it in parallel runs.
+     */
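+    /* In formula form: beta_PR = (f_b . (f_b - f_min)) / |f_min|^2 with
+     * f = -grad(E), where f_min belongs to the current minimum state and
+     * f_b to the new trial state; only non-frozen dimensions contribute.
+     */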
+
+    if (!DOMAINDECOMP(cr) ||
+        (s_min->s.ddp_count == cr->dd->ddp_count &&
+         s_b->s.ddp_count   == cr->dd->ddp_count))
+    {
+        fm  = s_min->f;
+        fb  = s_b->f;
+        sum = 0;
+        gf  = 0;
+        /* This part of code can be incorrect with DD,
+         * since the atom ordering in s_b and s_min might differ.
+         */
+        for (i = 0; i < mdatoms->homenr; i++)
+        {
+            if (mdatoms->cFREEZE)
+            {
+                gf = mdatoms->cFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    sum += (fb[i][m] - fm[i][m])*fb[i][m];
+                }
+            }
+        }
+    }
+    else
+    {
+        /* We need to reorder cgs while summing */
+        sum = reorder_partsum(cr, opts, mdatoms, top_global, s_min, s_b);
+    }
+    if (PAR(cr))
+    {
+        gmx_sumd(1, &sum, cr);
+    }
+
+    return sum/gmx::square(s_min->fnorm);
+}
+
+namespace gmx
+{
+
+/*! \brief Do conjugate gradients minimization
+    \copydoc integrator_t (FILE *fplog, t_commrec *cr,
+                           int nfile, const t_filenm fnm[],
+                           const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                           int nstglobalcomm,
+                           gmx_vsite_t *vsite, gmx_constr_t constr,
+                           int stepout,
+                           t_inputrec *inputrec,
+                           gmx_mtop_t *top_global, t_fcdata *fcd,
+                           t_state *state_global,
+                           t_mdatoms *mdatoms,
+                           t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                           gmx_edsam_t ed,
+                           t_forcerec *fr,
+                           int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                           real cpt_period, real max_hours,
+                           int imdport,
+                           unsigned long Flags,
+                           gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_cg(FILE *fplog, t_commrec *cr,
+             int nfile, const t_filenm fnm[],
+             const gmx_output_env_t gmx_unused *oenv, gmx_bool bVerbose,
+             int gmx_unused nstglobalcomm,
+             gmx_vsite_t *vsite, gmx_constr_t constr,
+             int gmx_unused stepout,
+             t_inputrec *inputrec,
+             gmx_mtop_t *top_global, t_fcdata *fcd,
+             t_state *state_global,
+             t_mdatoms *mdatoms,
+             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+             gmx_edsam_t gmx_unused ed,
+             t_forcerec *fr,
+             int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
+             real gmx_unused cpt_period, real gmx_unused max_hours,
+             int imdport,
+             unsigned long gmx_unused Flags,
+             gmx_walltime_accounting_t walltime_accounting)
+{
+    const char       *CG = "Polak-Ribiere Conjugate Gradients";
+
+    em_state_t       *s_min, *s_a, *s_b, *s_c;
+    gmx_localtop_t   *top;
+    gmx_enerdata_t   *enerd;
+    rvec             *f;
+    gmx_global_stat_t gstat;
+    t_graph          *graph;
+    rvec             *p, *sf;
+    double            gpa, gpb, gpc, tmp, minstep;
+    real              fnormn;
+    real              stepsize;
+    real              a, b, c, beta = 0.0;
+    real              epot_repl = 0;
+    real              pnorm;
+    t_mdebin         *mdebin;
+    gmx_bool          converged, foundlower;
+    rvec              mu_tot;
+    gmx_bool          do_log = FALSE, do_ene = FALSE, do_x, do_f;
+    tensor            vir, pres;
+    int               number_steps, neval = 0, nstcg = inputrec->nstcgsteep;
+    gmx_mdoutf_t      outf;
+    int               i, m, gf, step, nminstep;
+
+    step = 0;
+
+    s_min = init_em_state();
+    s_a   = init_em_state();
+    s_b   = init_em_state();
+    s_c   = init_em_state();
+
+    /* Init em and store the local state in s_min */
+    init_em(fplog, CG, cr, inputrec,
+            state_global, top_global, s_min, &top, &f,
+            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
+
+    /* Print to log file */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, CG);
+
+    /* Max number of steps */
+    number_steps = inputrec->nsteps;
+
+    if (MASTER(cr))
+    {
+        sp_header(stderr, CG, inputrec->em_tol, number_steps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, CG, inputrec->em_tol, number_steps);
+    }
+
+    /* Call the force routine and some auxiliary (neighboursearching etc.) */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole in congrad.c
+     */
+    evaluate_energy(fplog, cr,
+                    top_global, s_min, top,
+                    inputrec, nrnb, wcycle, gstat,
+                    vsite, constr, fcd, graph, mdatoms, fr,
+                    mu_tot, enerd, vir, pres, -1, TRUE);
+    where();
+
+    if (MASTER(cr))
+    {
+        /* Copy stuff to the energy bin for easy printing etc. */
+        upd_mdebin(mdebin, FALSE, FALSE, (double)step,
+                   mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box,
+                   NULL, NULL, vir, pres, NULL, mu_tot, constr);
+
+        print_ebin_header(fplog, step, step);
+        print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+    }
+    where();
+
+    /* Estimate/guess the initial stepsize */
+    stepsize = inputrec->em_stepsize/s_min->fnorm;
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fprintf(stderr, "   F-max             = %12.5e on atom %d\n",
+                s_min->fmax, s_min->a_fmax+1);
+        fprintf(stderr, "   F-Norm            = %12.5e\n",
+                s_min->fnorm/sqrtNumAtoms);
+        fprintf(stderr, "\n");
+        /* and copy to the log file too... */
+        fprintf(fplog, "   F-max             = %12.5e on atom %d\n",
+                s_min->fmax, s_min->a_fmax+1);
+        fprintf(fplog, "   F-Norm            = %12.5e\n",
+                s_min->fnorm/sqrtNumAtoms);
+        fprintf(fplog, "\n");
+    }
+    /* Start the loop over CG steps.
+     * Each successful step is counted, and we continue until
+     * we either converge or reach the max number of steps.
+     */
+    converged = FALSE;
+    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
+    {
+
+        /* start taking steps in a new direction
+         * First time we enter the routine, beta=0, and the direction is
+         * simply the negative gradient.
+         */
+
+        /* Calculate the new direction in p, and the gradient in this direction, gpa */
+        p   = s_min->s.cg_p;
+        sf  = s_min->f;
+        gpa = 0;
+        gf  = 0;
+        for (i = 0; i < mdatoms->homenr; i++)
+        {
+            if (mdatoms->cFREEZE)
+            {
+                gf = mdatoms->cFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!inputrec->opts.nFreeze[gf][m])
+                {
+                    p[i][m] = sf[i][m] + beta*p[i][m];
+                    gpa    -= p[i][m]*sf[i][m];
+                    /* f is negative gradient, thus the sign */
+                }
+                else
+                {
+                    p[i][m] = 0;
+                }
+            }
+        }
+
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpa, cr);
+        }
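+        /* gpa is now the directional derivative of the potential along p,
+         * i.e. p . grad(E) summed over all ranks, since f holds -grad(E).
+         */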
+
+        /* Calculate the norm of the search vector */
+        get_f_norm_max(cr, &(inputrec->opts), mdatoms, p, &pnorm, NULL, NULL);
+
+        /* Just in case stepsize reaches zero due to numerical precision... */
+        if (stepsize <= 0)
+        {
+            stepsize = inputrec->em_stepsize/pnorm;
+        }
+
+        /*
+         * Double check the value of the derivative in the search direction.
+         * If it is positive it must be due to the old information in the
+         * CG formula, so just remove that and start over with beta=0.
+         * This corresponds to a steepest descent step.
+         */
+        if (gpa > 0)
+        {
+            beta = 0;
+            step--;   /* Don't count this step since we are restarting */
+            continue; /* Go back to the beginning of the big for-loop */
+        }
+
+        /* Calculate minimum allowed stepsize, before the average (norm)
+         * relative change in coordinate is smaller than precision
+         */
+        minstep = 0;
+        for (i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                tmp = fabs(s_min->s.x[i][m]);
+                if (tmp < 1.0)
+                {
+                    tmp = 1.0;
+                }
+                tmp      = p[i][m]/tmp;
+                minstep += tmp*tmp;
+            }
+        }
+        /* Add up from all CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &minstep, cr);
+        }
+
+        minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms));
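+        /* minstep is the step length at which the RMS relative coordinate
+         * change would reach machine precision; below it the line search
+         * cannot make progress anymore, so we declare convergence.
+         */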
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        write_em_traj(fplog, cr, outf, do_x, do_f, NULL,
+                      top_global, inputrec, step,
+                      s_min, state_global);
+
+        /* Take a step downhill.
+         * In theory, we should minimize the function along this direction.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we dont really need to find the exact minimum -
+         * it is much better to start a new CG step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize but
+         * then continue straight to the next CG step without trying to find any minimum.
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to even accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
+        s_a->epot = s_min->epot;
+        a         = 0.0;
+        c         = a + stepsize; /* reference position along line is zero */
+
+        if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count)
+        {
+            em_dd_partition_system(fplog, step, cr, top_global, inputrec,
+                                   s_min, top, mdatoms, fr, vsite, constr,
+                                   nrnb, wcycle);
+        }
+
+        /* Take a trial step (new coords in s_c) */
+        do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, s_min->s.cg_p, s_c,
+                   constr, top, nrnb, wcycle, -1);
+
+        neval++;
+        /* Calculate energy for the trial step */
+        evaluate_energy(fplog, cr,
+                        top_global, s_c, top,
+                        inputrec, nrnb, wcycle, gstat,
+                        vsite, constr, fcd, graph, mdatoms, fr,
+                        mu_tot, enerd, vir, pres, -1, FALSE);
+
+        /* Calc derivative along line */
+        p   = s_c->s.cg_p;
+        sf  = s_c->f;
+        gpc = 0;
+        for (i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                gpc -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */
+            }
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        /* This is the max amount of increase in energy we tolerate */
+        tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot);
+
+        /* Accept the step if the energy is lower, or if it is not significantly higher
+         * and the line derivative is still negative.
+         */
+        if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp)))
+        {
+            foundlower = TRUE;
+            /* Great, we found a better energy. Increase step for next iteration
+             * if we are still going down, decrease it otherwise
+             */
+            if (gpc < 0)
+            {
+                stepsize *= 1.618034; /* The golden section */
+            }
+            else
+            {
+                stepsize *= 0.618034; /* 1/golden section */
+            }
+        }
+        else
+        {
+            /* New energy is the same or higher. We will have to do some work
+             * to find a smaller value in the interval. Take smaller step next time!
+             */
+            foundlower = FALSE;
+            stepsize  *= 0.618034;
+        }
+
+        /* OK, if we didn't find a lower value we will have to locate one now - there must
+         * be one in the interval [a=0,c].
+         * The same thing is valid here, though: Don't spend dozens of iterations to find
+         * the line minimum. We try to interpolate based on the derivative at the endpoints,
+         * and only continue until we find a lower value. In most cases this means 1-2 iterations.
+         *
+         * I also have a safeguard for potentially really pathological functions so we never
+         * take more than 20 steps before we give up ...
+         *
+         * If we already found a lower value we just skip this step and continue to the update.
+         */
+        if (!foundlower)
+        {
+            nminstep = 0;
+
+            do
+            {
+                /* Select a new trial point.
+                 * If the derivatives at points a & c have different sign we interpolate to zero,
+                 * otherwise just do a bisection.
+                 */
+                if (gpa < 0 && gpc > 0)
+                {
+                    b = a + gpa*(a-c)/(gpc-gpa);
+                }
+                else
+                {
+                    b = 0.5*(a+c);
+                }
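+                /* With gpa < 0 < gpc, b is the secant estimate of where the
+                 * line derivative crosses zero inside [a,c].
+                 */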
+
+                /* safeguard if interpolation close to machine accuracy causes errors:
+                 * never go outside the interval
+                 */
+                if (b <= a || b >= c)
+                {
+                    b = 0.5*(a+c);
+                }
+
+                if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
+                {
+                    /* Reload the old state */
+                    em_dd_partition_system(fplog, -1, cr, top_global, inputrec,
+                                           s_min, top, mdatoms, fr, vsite, constr,
+                                           nrnb, wcycle);
+                }
+
+                /* Take a trial step to this new point - new coords in s_b */
+                do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, s_min->s.cg_p, s_b,
+                           constr, top, nrnb, wcycle, -1);
+
+                neval++;
+                /* Calculate energy for the trial step */
+                evaluate_energy(fplog, cr,
+                                top_global, s_b, top,
+                                inputrec, nrnb, wcycle, gstat,
+                                vsite, constr, fcd, graph, mdatoms, fr,
+                                mu_tot, enerd, vir, pres, -1, FALSE);
+
+                /* p does not change within a step, but since the domain decomposition
+                 * might change, we have to use cg_p of s_b here.
+                 */
+                p   = s_b->s.cg_p;
+                sf  = s_b->f;
+                gpb = 0;
+                for (i = 0; i < mdatoms->homenr; i++)
+                {
+                    for (m = 0; m < DIM; m++)
+                    {
+                        gpb -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */
+                    }
+                }
+                /* Sum the gradient along the line across CPUs */
+                if (PAR(cr))
+                {
+                    gmx_sumd(1, &gpb, cr);
+                }
+
+                if (debug)
+                {
+                    fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n",
+                            s_a->epot, s_b->epot, s_c->epot, gpb);
+                }
+
+                epot_repl = s_b->epot;
+
+                /* Keep one of the intervals based on the value of the derivative at the new point */
+                if (gpb > 0)
+                {
+                    /* Replace c endpoint with b */
+                    swap_em_state(s_b, s_c);
+                    c   = b;
+                    gpc = gpb;
+                }
+                else
+                {
+                    /* Replace a endpoint with b */
+                    swap_em_state(s_b, s_a);
+                    a   = b;
+                    gpa = gpb;
+                }
+
+                /*
+                 * Stop search as soon as we find a value smaller than the endpoints.
+                 * Never run more than 20 steps, no matter what.
+                 */
+                nminstep++;
+            }
+            while ((epot_repl > s_a->epot || epot_repl > s_c->epot) &&
+                   (nminstep < 20));
+
+            if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS ||
+                nminstep >= 20)
+            {
+                /* OK. We couldn't find a significantly lower energy.
+                 * If beta==0 this was steepest descent, and then we give up.
+                 * If not, set beta=0 and restart with steepest descent before quitting.
+                 */
+                if (beta == 0.0)
+                {
+                    /* Converged */
+                    converged = TRUE;
+                    break;
+                }
+                else
+                {
+                    /* Reset memory before giving up */
+                    beta = 0.0;
+                    continue;
+                }
+            }
+
+            /* Select min energy state of A & C, put the best in B.
+             */
+            if (s_c->epot < s_a->epot)
+            {
+                if (debug)
+                {
+                    fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n",
+                            s_c->epot, s_a->epot);
+                }
+                swap_em_state(s_b, s_c);
+                gpb = gpc;
+            }
+            else
+            {
+                if (debug)
+                {
+                    fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n",
+                            s_a->epot, s_c->epot);
+                }
+                swap_em_state(s_b, s_a);
+                gpb = gpa;
+            }
+
+        }
+        else
+        {
+            if (debug)
+            {
+                fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n",
+                        s_c->epot);
+            }
+            swap_em_state(s_b, s_c);
+            gpb = gpc;
+        }
+
+        /* new search direction */
+        /* beta = 0 means forget all memory and restart with steepest descents. */
+        if (nstcg && ((step % nstcg) == 0))
+        {
+            beta = 0.0;
+        }
+        else
+        {
+            /* s_min->fnorm cannot be zero, because then we would have converged
+             * and broken out.
+             */
+
+            /* Polak-Ribiere update.
+             * Change to fnorm2/fnorm2_old for Fletcher-Reeves
+             */
+            beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b);
+        }
+        /* Limit beta to prevent oscillations */
+        if (fabs(beta) > 5.0)
+        {
+            beta = 0.0;
+        }
+
+        /* update positions */
+        swap_em_state(s_min, s_b);
+        gpa = gpb;
+
+        /* Print it if necessary */
+        if (MASTER(cr))
+        {
+            if (bVerbose)
+            {
+                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
+                        step, s_min->epot, s_min->fnorm/sqrtNumAtoms,
+                        s_min->fmax, s_min->a_fmax+1);
+                fflush(stderr);
+            }
+            /* Store the new (lower) energies */
+            upd_mdebin(mdebin, FALSE, FALSE, (double)step,
+                       mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box,
+                       NULL, NULL, vir, pres, NULL, mu_tot, constr);
+
+            do_log = do_per_step(step, inputrec->nstlog);
+            do_ene = do_per_step(step, inputrec->nstenergy);
+
+            /* Prepare IMD energy record, if bIMD is TRUE. */
+            IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE);
+
+            if (do_log)
+            {
+                print_ebin_header(fplog, step, step);
+            }
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
+                       do_log ? fplog : NULL, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+        }
+
+        /* Send energies and positions to the IMD client if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        /* Stop when the maximum force lies below tolerance.
+         * If we have reached machine precision, converged is already set to true.
+         */
+        converged = converged || (s_min->fmax < inputrec->em_tol);
+
+    }   /* End of the loop */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (s_min->fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE);
+            warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    if (MASTER(cr))
+    {
+        /* If we printed energy and/or logfile last step (which was the last step)
+         * we don't have to do it again, but otherwise print the final values.
+         */
+        if (!do_log)
+        {
+            /* Write final value to log since we didn't do anything the last step */
+            print_ebin_header(fplog, step, step);
+        }
+        if (!do_ene || !do_log)
+        {
+            /* Write final energy file entries */
+            print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                       !do_log ? fplog : NULL, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+        }
+    }
+
+    /* Print some stuff... */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout));
+
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  s_min, state_global);
+
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fnormn = s_min->fnorm/sqrtNumAtoms;
+        print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps,
+                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
+        print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps,
+                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+
+    return 0;
+}   /* That's all folks */
+
+
+/*! \brief Do L-BFGS minimization
+    \copydoc integrator_t (FILE *fplog, t_commrec *cr,
+                           int nfile, const t_filenm fnm[],
+                           const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                           int nstglobalcomm,
+                           gmx_vsite_t *vsite, gmx_constr_t constr,
+                           int stepout,
+                           t_inputrec *inputrec,
+                           gmx_mtop_t *top_global, t_fcdata *fcd,
+                           t_state *state_global,
+                           t_mdatoms *mdatoms,
+                           t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                           gmx_edsam_t ed,
+                           t_forcerec *fr,
+                           int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                           real cpt_period, real max_hours,
+                           int imdport,
+                           unsigned long Flags,
+                           gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_lbfgs(FILE *fplog, t_commrec *cr,
+                int nfile, const t_filenm fnm[],
+                const gmx_output_env_t gmx_unused *oenv, gmx_bool bVerbose,
+                int gmx_unused nstglobalcomm,
+                gmx_vsite_t *vsite, gmx_constr_t constr,
+                int gmx_unused stepout,
+                t_inputrec *inputrec,
+                gmx_mtop_t *top_global, t_fcdata *fcd,
+                t_state *state_global,
+                t_mdatoms *mdatoms,
+                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                gmx_edsam_t gmx_unused ed,
+                t_forcerec *fr,
+                int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
+                real gmx_unused cpt_period, real gmx_unused max_hours,
+                int imdport,
+                unsigned long gmx_unused Flags,
+                gmx_walltime_accounting_t walltime_accounting)
+{
+    static const char *LBFGS = "Low-Memory BFGS Minimizer";
+    em_state_t         ems;
+    gmx_localtop_t    *top;
+    gmx_enerdata_t    *enerd;
+    rvec              *f;
+    gmx_global_stat_t  gstat;
+    t_graph           *graph;
+    int                ncorr, nmaxcorr, point, cp, neval, nminstep;
+    double             stepsize, step_taken, gpa, gpb, gpc, tmp, minstep;
+    real              *rho, *alpha, *ff, *xx, *p, *s, *lastx, *lastf, **dx, **dg;
+    real              *xa, *xb, *xc, *fa, *fb, *fc, *xtmp, *ftmp;
+    real               a, b, c, maxdelta, delta;
+    real               diag, Epot0, Epot, EpotA, EpotB, EpotC;
+    real               dgdx, dgdg, sq, yr, beta;
+    t_mdebin          *mdebin;
+    gmx_bool           converged;
+    rvec               mu_tot;
+    real               fnorm, fmax;
+    gmx_bool           do_log, do_ene, do_x, do_f, foundlower, *frozen;
+    tensor             vir, pres;
+    int                start, end, number_steps;
+    gmx_mdoutf_t       outf;
+    int                i, k, m, n, nfmax, gf, step;
+    int                mdof_flags;
+
+    if (PAR(cr))
+    {
+        gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n");
+    }
+
+    if (NULL != constr)
+    {
+        gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent).");
+    }
+
+    n        = 3*state_global->natoms;
+    nmaxcorr = inputrec->nbfgscorr;
+
+    /* Allocate memory */
+    /* Use pointers to real so we don't have to loop over both atoms and
+     * dimensions all the time...
+     * x/f are allocated as rvec *, so make new x0/f0 pointers-to-real
+     * that point to the same memory.
+     */
+    snew(xa, n);
+    snew(xb, n);
+    snew(xc, n);
+    snew(fa, n);
+    snew(fb, n);
+    snew(fc, n);
+    snew(frozen, n);
+
+    snew(p, n);
+    snew(lastx, n);
+    snew(lastf, n);
+    snew(rho, nmaxcorr);
+    snew(alpha, nmaxcorr);
+
+    snew(dx, nmaxcorr);
+    for (i = 0; i < nmaxcorr; i++)
+    {
+        snew(dx[i], n);
+    }
+
+    snew(dg, nmaxcorr);
+    for (i = 0; i < nmaxcorr; i++)
+    {
+        snew(dg[i], n);
+    }
+
+    step  = 0;
+    neval = 0;
+
+    /* Init em */
+    init_em(fplog, LBFGS, cr, inputrec,
+            state_global, top_global, &ems, &top, &f,
+            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
+    /* Do_lbfgs is not completely updated like do_steep and do_cg,
+     * so we free some memory again.
+     */
+    sfree(ems.s.x);
+    sfree(ems.f);
+
+    xx = (real *)state_global->x;
+    ff = (real *)f;
+
+    start = 0;
+    end   = mdatoms->homenr;
+
+    /* Print to log file */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS);
+
+    do_log = do_ene = do_x = do_f = TRUE;
+
+    /* Max number of steps */
+    number_steps = inputrec->nsteps;
+
+    /* Create a 3*natoms index to tell whether each degree of freedom is frozen */
+    gf = 0;
+    for (i = start; i < end; i++)
+    {
+        if (mdatoms->cFREEZE)
+        {
+            gf = mdatoms->cFREEZE[i];
+        }
+        for (m = 0; m < DIM; m++)
+        {
+            frozen[3*i+m] = inputrec->opts.nFreeze[gf][m];
+        }
+    }
+    if (MASTER(cr))
+    {
+        sp_header(stderr, LBFGS, inputrec->em_tol, number_steps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, LBFGS, inputrec->em_tol, number_steps);
+    }
+
+    if (vsite)
+    {
+        construct_vsites(vsite, state_global->x, 1, NULL,
+                         top->idef.iparams, top->idef.il,
+                         fr->ePBC, fr->bMolPBC, cr, state_global->box);
+    }
+
+    /* Call the force routine and some auxiliary (neighboursearching etc.) */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole
+     */
+    neval++;
+    ems.s.x = state_global->x;
+    ems.f   = f;
+    evaluate_energy(fplog, cr,
+                    top_global, &ems, top,
+                    inputrec, nrnb, wcycle, gstat,
+                    vsite, constr, fcd, graph, mdatoms, fr,
+                    mu_tot, enerd, vir, pres, -1, TRUE);
+    where();
+
+    if (MASTER(cr))
+    {
+        /* Copy stuff to the energy bin for easy printing etc. */
+        upd_mdebin(mdebin, FALSE, FALSE, (double)step,
+                   mdatoms->tmass, enerd, state_global, inputrec->fepvals, inputrec->expandedvals, state_global->box,
+                   NULL, NULL, vir, pres, NULL, mu_tot, constr);
+
+        print_ebin_header(fplog, step, step);
+        print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+    }
+    where();
+
+    /* This is the starting energy */
+    Epot = enerd->term[F_EPOT];
+
+    fnorm = ems.fnorm;
+    fmax  = ems.fmax;
+    nfmax = ems.a_fmax;
+
+    /* Set the initial step.
+     * since it will be multiplied by the non-normalized search direction
+     * vector (force vector the first time), we scale it by the
+     * norm of the force.
+     */
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr);
+        fprintf(stderr, "   F-max             = %12.5e on atom %d\n", fmax, nfmax+1);
+        fprintf(stderr, "   F-Norm            = %12.5e\n", fnorm/sqrtNumAtoms);
+        fprintf(stderr, "\n");
+        /* and copy to the log file too... */
+        fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr);
+        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", fmax, nfmax+1);
+        fprintf(fplog, "   F-Norm            = %12.5e\n", fnorm/sqrtNumAtoms);
+        fprintf(fplog, "\n");
+    }
+
+    // Point is an index to the memory of search directions, where 0 is the first one.
+    point = 0;
+
+    // Set initial search direction to the force (-gradient), or 0 for frozen particles.
+    for (i = 0; i < n; i++)
+    {
+        if (!frozen[i])
+        {
+            dx[point][i] = ff[i]; /* Initial search direction */
+        }
+        else
+        {
+            dx[point][i] = 0;
+        }
+    }
+
+    // Stepsize will be modified during the search, and actually it is not critical
+    // (the main efficiency in the algorithm comes from changing directions), but
+    // we still need an initial value, so estimate it as the inverse of the norm
+    // so we take small steps where the potential fluctuates a lot.
+    stepsize  = 1.0/fnorm;
+
+    /* Start the loop over BFGS steps.
+     * Each successful step is counted, and we continue until
+     * we either converge or reach the max number of steps.
+     */
+
+    ncorr = 0;
+
+    /* Set the gradient from the force */
+    converged = FALSE;
+    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
+    {
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        mdof_flags = 0;
+        if (do_x)
+        {
+            mdof_flags |= MDOF_X;
+        }
+
+        if (do_f)
+        {
+            mdof_flags |= MDOF_F;
+        }
+
+        if (inputrec->bIMD)
+        {
+            mdof_flags |= MDOF_IMD;
+        }
+
+        mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags,
+                                         top_global, step, (real)step, state_global, state_global, f);
+
+        /* Do the linesearching in the direction dx[point][0..(n-1)] */
+
+        /* make s a pointer to current search direction - point=0 first time we get here */
+        s = dx[point];
+
+        // calculate line gradient in position A
+        for (gpa = 0, i = 0; i < n; i++)
+        {
+            gpa -= s[i]*ff[i];
+        }
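+        // gpa is the directional derivative of the potential along s,
+        // i.e. s . grad(E), since ff holds the forces (-grad(E)).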
+
+        /* Calculate minimum allowed stepsize along the line, before the average (norm)
+         * relative change in coordinate is smaller than precision
+         */
+        for (minstep = 0, i = 0; i < n; i++)
+        {
+            tmp = fabs(xx[i]);
+            if (tmp < 1.0)
+            {
+                tmp = 1.0;
+            }
+            tmp      = s[i]/tmp;
+            minstep += tmp*tmp;
+        }
+        minstep = GMX_REAL_EPS/sqrt(minstep/n);
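+        // As in do_cg: minstep is the step length at which the RMS relative
+        // coordinate change reaches machine precision, so a smaller stepsize
+        // means we can no longer make progress and have converged.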
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        // Before taking any steps along the line, store the old position
+        for (i = 0; i < n; i++)
+        {
+            lastx[i] = xx[i];
+            lastf[i] = ff[i];
+        }
+        Epot0 = Epot;
+
+        for (i = 0; i < n; i++)
+        {
+            xa[i] = xx[i];
+        }
+
+        /* Take a step downhill.
+         * In theory, we should find the actual minimum of the function in this
+         * direction, somewhere along the line.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we don't really need to find the exact minimum -
+         * it is much better to start a new BFGS step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize but
+         * continue straight to the next BFGS step without trying to find any minimum,
+         * i.e. we change the search direction too. If the line was smooth, it is
+         * likely we are in a smooth region, and then it makes sense to take longer
+         * steps in the modified search direction too.
+         *
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one. Then we need to start by finding a lower
+         * value before we change search direction. Since the energy was apparently
+         * quite rough, we need to decrease the step size.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
+
+        // State "A" is the first position along the line.
+        // reference position along line is initially zero
+        EpotA      = Epot0;
+        a          = 0.0;
+
+        // Check stepsize first. We do not allow displacements
+        // larger than emstep.
+        //
+        do
+        {
+            // Pick a new position C by adding stepsize to A.
+            c        = a + stepsize;
+
+            // Calculate what the largest change in any individual coordinate
+            // would be (step length along the line * search-direction component)
+            maxdelta = 0;
+            for (i = 0; i < n; i++)
+            {
+                delta = c*s[i];
+                if (delta > maxdelta)
+                {
+                    maxdelta = delta;
+                }
+            }
+            // If any displacement is larger than the stepsize limit, reduce the step
+            if (maxdelta > inputrec->em_stepsize)
+            {
+                stepsize *= 0.1;
+            }
+        }
+        while (maxdelta > inputrec->em_stepsize);
+
+        // Take a trial step and move the coordinate array xc[] to position C
+        for (i = 0; i < n; i++)
+        {
+            xc[i] = lastx[i] + c*s[i];
+        }
+
+        neval++;
+        // Calculate energy for the trial step in position C
+        ems.s.x = (rvec *)xc;
+        ems.f   = (rvec *)fc;
+        evaluate_energy(fplog, cr,
+                        top_global, &ems, top,
+                        inputrec, nrnb, wcycle, gstat,
+                        vsite, constr, fcd, graph, mdatoms, fr,
+                        mu_tot, enerd, vir, pres, step, FALSE);
+        EpotC = ems.epot;
+
+        // Calc line gradient in position C
+        for (gpc = 0, i = 0; i < n; i++)
+        {
+            gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        // This is the max amount of increase in energy we tolerate.
+        // By allowing VERY small changes (close to numerical precision) we
+        // frequently find even better (lower) final energies.
+        tmp = sqrt(GMX_REAL_EPS)*fabs(EpotA);
+
+        // Accept the step if the energy is lower in the new position C (compared to A),
+        // or if it is not significantly higher and the line derivative is still negative.
+        if (EpotC < EpotA || (gpc < 0 && EpotC < (EpotA+tmp)))
+        {
+            // Great, we found a better energy. We no longer try to alter the
+            // stepsize, but simply accept this new better position. Then we select a new
+            // search direction instead, which will be much more efficient than continuing
+            // to take smaller steps along a line. Set fnorm based on the new C position,
+            // which will be used to update the stepsize to 1/fnorm further down.
+            foundlower = TRUE;
+            fnorm      = ems.fnorm;
+        }
+        else
+        {
+            // If we got here, the energy is NOT lower in point C, i.e. it will be the same
+            // or higher than in point A. In this case it is pointless to move to point C,
+            // so we will have to do more iterations along the same line to find a smaller
+            // value in the interval [A=0.0,C].
+            // Here, A is still 0.0, but that will change when we do a search in the interval
+            // [0.0,C] below. That search we will do by interpolation or bisection rather
+            // than with the stepsize, so no need to modify it. For the next search direction
+            // it will be reset to 1/fnorm anyway.
+            foundlower = FALSE;
+        }
+
+        if (!foundlower)
+        {
+            // OK, if we didn't find a lower value we will have to locate one now - there must
+            // be one in the interval [a,c].
+            // The same thing is valid here, though: Don't spend dozens of iterations to find
+            // the line minimum. We try to interpolate based on the derivative at the endpoints,
+            // and only continue until we find a lower value. In most cases this means 1-2 iterations.
+            // I also have a safeguard for potentially really pathological functions so we never
+            // take more than 20 steps before we give up.
+            // If we already found a lower value we just skip this step and continue to the update.
+            nminstep = 0;
+            do
+            {
+                // Select a new trial point B in the interval [A,C].
+                // If the derivatives at points a & c have different sign we interpolate to zero,
+                // otherwise just do a bisection since there might be multiple minima/maxima
+                // inside the interval.
+                if (gpa < 0 && gpc > 0)
+                {
+                    b = a + gpa*(a-c)/(gpc-gpa);
+                }
+                else
+                {
+                    b = 0.5*(a+c);
+                }
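+                // With gpa < 0 < gpc this is the secant estimate of the zero
+                // of the line derivative inside [a,c].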
+
+                /* safeguard if interpolation close to machine accuracy causes errors:
+                 * never go outside the interval
+                 */
+                if (b <= a || b >= c)
+                {
+                    b = 0.5*(a+c);
+                }
+
+                // Take a trial step to point B
+                for (i = 0; i < n; i++)
+                {
+                    xb[i] = lastx[i] + b*s[i];
+                }
+
+                neval++;
+                // Calculate energy for the trial step in point B
+                ems.s.x = (rvec *)xb;
+                ems.f   = (rvec *)fb;
+                evaluate_energy(fplog, cr,
+                                top_global, &ems, top,
+                                inputrec, nrnb, wcycle, gstat,
+                                vsite, constr, fcd, graph, mdatoms, fr,
+                                mu_tot, enerd, vir, pres, step, FALSE);
+                EpotB = ems.epot;
+                fnorm = ems.fnorm;
+
+                // Calculate gradient in point B
+                for (gpb = 0, i = 0; i < n; i++)
+                {
+                    gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */
+
+                }
+                /* Sum the gradient along the line across CPUs */
+                if (PAR(cr))
+                {
+                    gmx_sumd(1, &gpb, cr);
+                }
+
+                // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative
+                // at the new point B, and rename the endpoints of this new interval A and C.
+                if (gpb > 0)
+                {
+                    /* Replace c endpoint with b */
+                    EpotC = EpotB;
+                    c     = b;
+                    gpc   = gpb;
+                    /* swap coord pointers b/c */
+                    xtmp = xb;
+                    ftmp = fb;
+                    xb   = xc;
+                    fb   = fc;
+                    xc   = xtmp;
+                    fc   = ftmp;
+                }
+                else
+                {
+                    /* Replace a endpoint with b */
+                    EpotA = EpotB;
+                    a     = b;
+                    gpa   = gpb;
+                    /* swap coord pointers a/b */
+                    xtmp = xb;
+                    ftmp = fb;
+                    xb   = xa;
+                    fb   = fa;
+                    xa   = xtmp;
+                    fa   = ftmp;
+                }
+
+                /*
+                 * Stop search as soon as we find a value smaller than the endpoints,
+                 * or if the tolerance is below machine precision.
+                 * Never run more than 20 steps, no matter what.
+                 */
+                nminstep++;
+            }
+            while ((EpotB > EpotA || EpotB > EpotC) && (nminstep < 20));
+
+            if (fabs(EpotB-Epot0) < GMX_REAL_EPS || nminstep >= 20)
+            {
+                /* OK. We couldn't find a significantly lower energy.
+                 * If ncorr==0 this was steepest descent, and then we give up.
+                 * If not, reset memory to restart as steepest descent before quitting.
+                 */
+                if (ncorr == 0)
+                {
+                    /* Converged */
+                    converged = TRUE;
+                    break;
+                }
+                else
+                {
+                    /* Reset memory */
+                    ncorr = 0;
+                    /* Search in gradient direction */
+                    for (i = 0; i < n; i++)
+                    {
+                        dx[point][i] = ff[i];
+                    }
+                    /* Reset stepsize */
+                    stepsize = 1.0/fnorm;
+                    continue;
+                }
+            }
+
+        /* Select the minimum-energy state of A and C, and put the best in xx/ff/Epot */
+            if (EpotC < EpotA)
+            {
+                Epot = EpotC;
+                /* Use state C */
+                for (i = 0; i < n; i++)
+                {
+                    xx[i] = xc[i];
+                    ff[i] = fc[i];
+                }
+                step_taken = c;
+            }
+            else
+            {
+                Epot = EpotA;
+                /* Use state A */
+                for (i = 0; i < n; i++)
+                {
+                    xx[i] = xa[i];
+                    ff[i] = fa[i];
+                }
+                step_taken = a;
+            }
+
+        }
+        else
+        {
+            /* found lower */
+            Epot = EpotC;
+            /* Use state C */
+            for (i = 0; i < n; i++)
+            {
+                xx[i] = xc[i];
+                ff[i] = fc[i];
+            }
+            step_taken = c;
+        }
+
+        /* Update the memory information, and calculate a new
+         * approximation of the inverse hessian
+         */
+
+        /* Have new data in Epot, xx, ff */
+        if (ncorr < nmaxcorr)
+        {
+            ncorr++;
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            dg[point][i]  = lastf[i]-ff[i];
+            dx[point][i] *= step_taken;
+        }
+
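+        /* dg now holds the change in gradient (f is the negative gradient) and
+         * dx the step actually taken. Below, diag = (s.y)/(y.y) scales the
+         * initial inverse-Hessian estimate and rho = 1/(s.y) enters the
+         * two-loop recursion.
+         */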
+        dgdg = 0;
+        dgdx = 0;
+        for (i = 0; i < n; i++)
+        {
+            dgdg += dg[point][i]*dg[point][i];
+            dgdx += dg[point][i]*dx[point][i];
+        }
+
+        diag = dgdx/dgdg;
+
+        rho[point] = 1.0/dgdx;
+        point++;
+
+        if (point >= nmaxcorr)
+        {
+            point = 0;
+        }
+
+        /* Update */
+        for (i = 0; i < n; i++)
+        {
+            p[i] = ff[i];
+        }
+
+        cp = point;
+
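+        /* Standard L-BFGS two-loop recursion: starting from the current
+         * (negative) gradient in p, apply the stored (dx, dg) pairs so that p
+         * ends up approximating the inverse Hessian times the negative
+         * gradient, i.e. the next search direction.
+         */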
+        /* Recursive update. First go back over the memory points */
+        for (k = 0; k < ncorr; k++)
+        {
+            cp--;
+            if (cp < 0)
+            {
+                cp = ncorr-1;
+            }
+
+            sq = 0;
+            for (i = 0; i < n; i++)
+            {
+                sq += dx[cp][i]*p[i];
+            }
+
+            alpha[cp] = rho[cp]*sq;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] -= alpha[cp]*dg[cp][i];
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            p[i] *= diag;
+        }
+
+        /* And then go forward again */
+        for (k = 0; k < ncorr; k++)
+        {
+            yr = 0;
+            for (i = 0; i < n; i++)
+            {
+                yr += p[i]*dg[cp][i];
+            }
+
+            beta = rho[cp]*yr;
+            beta = alpha[cp]-beta;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] += beta*dx[cp][i];
+            }
+
+            cp++;
+            if (cp >= ncorr)
+            {
+                cp = 0;
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            if (!frozen[i])
+            {
+                dx[point][i] = p[i];
+            }
+            else
+            {
+                dx[point][i] = 0;
+            }
+        }
+
+        /* Test whether the convergence criterion is met */
+        get_f_norm_max(cr, &(inputrec->opts), mdatoms, f, &fnorm, &fmax, &nfmax);
+
+        /* Print it if necessary */
+        if (MASTER(cr))
+        {
+            if (bVerbose)
+            {
+                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
+                        step, Epot, fnorm/sqrtNumAtoms, fmax, nfmax+1);
+                fflush(stderr);
+            }
+            /* Store the new (lower) energies */
+            upd_mdebin(mdebin, FALSE, FALSE, (double)step,
+                       mdatoms->tmass, enerd, state_global, inputrec->fepvals, inputrec->expandedvals, state_global->box,
+                       NULL, NULL, vir, pres, NULL, mu_tot, constr);
+            do_log = do_per_step(step, inputrec->nstlog);
+            do_ene = do_per_step(step, inputrec->nstenergy);
+            if (do_log)
+            {
+                print_ebin_header(fplog, step, step);
+            }
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
+                       do_log ? fplog : NULL, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+        }
+
+        /* Send x and E to IMD client, if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        // Reset stepsize in case we are doing more iterations
+        stepsize = 1.0/fnorm;
+
+        /* Stop when the maximum force lies below tolerance.
+         * If we have reached machine precision, converged is already set to true.
+         */
+        converged = converged || (fmax < inputrec->em_tol);
+
+    }   /* End of the loop */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE);
+            warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    /* If we printed energy and/or logfile last step (which was the last step)
+     * we don't have to do it again, but otherwise print the final values.
+     */
+    if (!do_log) /* Write final value to log since we didn't do anything last step */
+    {
+        print_ebin_header(fplog, step, step);
+    }
+    if (!do_ene || !do_log) /* Write final energy file entries */
+    {
+        print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                   !do_log ? fplog : NULL, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+    }
+
+    /* Report that we are about to write the lowest-energy coordinates */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = !do_per_step(step, inputrec->nstfout);
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  &ems, state_global);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, Epot, fmax, nfmax, fnorm/sqrtNumAtoms);
+        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, Epot, fmax, nfmax, fnorm/sqrtNumAtoms);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+
+    return 0;
+}   /* That's all folks */
+
+/*! \brief Do steepest descents minimization
+    \copydoc integrator_t (FILE *fplog, t_commrec *cr,
+                           int nfile, const t_filenm fnm[],
+                           const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                           int nstglobalcomm,
+                           gmx_vsite_t *vsite, gmx_constr_t constr,
+                           int stepout,
+                           t_inputrec *inputrec,
+                           gmx_mtop_t *top_global, t_fcdata *fcd,
+                           t_state *state_global,
+                           t_mdatoms *mdatoms,
+                           t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                           gmx_edsam_t ed,
+                           t_forcerec *fr,
+                           int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                           real cpt_period, real max_hours,
+                           int imdport,
+                           unsigned long Flags,
+                           gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_steep(FILE *fplog, t_commrec *cr,
+                int nfile, const t_filenm fnm[],
+                const gmx_output_env_t gmx_unused *oenv, gmx_bool bVerbose,
+                int gmx_unused nstglobalcomm,
+                gmx_vsite_t *vsite, gmx_constr_t constr,
+                int gmx_unused stepout,
+                t_inputrec *inputrec,
+                gmx_mtop_t *top_global, t_fcdata *fcd,
+                t_state *state_global,
+                t_mdatoms *mdatoms,
+                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                gmx_edsam_t gmx_unused  ed,
+                t_forcerec *fr,
+                int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
+                real gmx_unused cpt_period, real gmx_unused max_hours,
+                int imdport,
+                unsigned long gmx_unused Flags,
+                gmx_walltime_accounting_t walltime_accounting)
+{
+    const char       *SD = "Steepest Descents";
+    em_state_t       *s_min, *s_try;
+    gmx_localtop_t   *top;
+    gmx_enerdata_t   *enerd;
+    rvec             *f;
+    gmx_global_stat_t gstat;
+    t_graph          *graph;
+    real              stepsize;
+    real              ustep, fnormn;
+    gmx_mdoutf_t      outf;
+    t_mdebin         *mdebin;
+    gmx_bool          bDone, bAbort, do_x, do_f;
+    tensor            vir, pres;
+    rvec              mu_tot;
+    int               nsteps;
+    int               count          = 0;
+    int               steps_accepted = 0;
+
+    s_min = init_em_state();
+    s_try = init_em_state();
+
+    /* Init em and store the local state in s_try */
+    init_em(fplog, SD, cr, inputrec,
+            state_global, top_global, s_try, &top, &f,
+            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
+
+    /* Print to log file  */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, SD);
+
+    /* Set variables for stepsize (in nm). This is the largest
+     * step that we are going to make in any direction.
+     */
+    ustep    = inputrec->em_stepsize;
+    stepsize = 0;
+
+    /* Max number of steps  */
+    nsteps = inputrec->nsteps;
+
+    if (MASTER(cr))
+    {
+        /* Print to the screen  */
+        sp_header(stderr, SD, inputrec->em_tol, nsteps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, SD, inputrec->em_tol, nsteps);
+    }
+
+    /**** HERE STARTS THE LOOP ****
+     * count is the counter for the number of steps
+     * bDone will be TRUE when the minimization has converged
+     * bAbort will be TRUE when nsteps steps have been performed or when
+     * the stepsize becomes smaller than is reasonable for machine precision
+     */
+    count  = 0;
+    bDone  = FALSE;
+    bAbort = FALSE;
+    while (!bDone && !bAbort)
+    {
+        bAbort = (nsteps >= 0) && (count == nsteps);
+
+        /* set new coordinates, except for first step */
+        if (count > 0)
+        {
+            do_em_step(cr, inputrec, mdatoms, fr->bMolPBC,
+                       s_min, stepsize, s_min->f, s_try,
+                       constr, top, nrnb, wcycle, count);
+        }
+
+        evaluate_energy(fplog, cr,
+                        top_global, s_try, top,
+                        inputrec, nrnb, wcycle, gstat,
+                        vsite, constr, fcd, graph, mdatoms, fr,
+                        mu_tot, enerd, vir, pres, count, count == 0);
+
+        if (MASTER(cr))
+        {
+            print_ebin_header(fplog, count, count);
+        }
+
+        if (count == 0)
+        {
+            s_min->epot = s_try->epot;
+        }
+
+        /* Print it if necessary  */
+        if (MASTER(cr))
+        {
+            if (bVerbose)
+            {
+                fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c",
+                        count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1,
+                        ( (count == 0) || (s_try->epot < s_min->epot) ) ? '\n' : '\r');
+                fflush(stderr);
+            }
+
+            if ( (count == 0) || (s_try->epot < s_min->epot) )
+            {
+                /* Store the new (lower) energies  */
+                upd_mdebin(mdebin, FALSE, FALSE, (double)count,
+                           mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals,
+                           s_try->s.box, NULL, NULL, vir, pres, NULL, mu_tot, constr);
+
+                /* Prepare IMD energy record, if bIMD is TRUE. */
+                IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE);
+
+                print_ebin(mdoutf_get_fp_ene(outf), TRUE,
+                           do_per_step(steps_accepted, inputrec->nstdisreout),
+                           do_per_step(steps_accepted, inputrec->nstorireout),
+                           fplog, count, count, eprNORMAL,
+                           mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+                fflush(fplog);
+            }
+        }
+
+        /* Accept the step if the new energy is smaller than the previous one,
+         * or if this is the first step.
+         */
+
+        if ( (count == 0) || (s_try->epot < s_min->epot) )
+        {
+            steps_accepted++;
+
+            /* Test whether the convergence criterion is met...  */
+            bDone = (s_try->fmax < inputrec->em_tol);
+
+            /* Copy the arrays for force, positions and energy  */
+            /* The 'Min' array always holds the coords and forces of the minimal
+               sampled energy  */
+            swap_em_state(s_min, s_try);
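+            /* After an accepted step (except the very first), increase the
+             * maximum step size by 20%.
+             */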
+            if (count > 0)
+            {
+                ustep *= 1.2;
+            }
+
+            /* Write to trn, if necessary */
+            do_x = do_per_step(steps_accepted, inputrec->nstxout);
+            do_f = do_per_step(steps_accepted, inputrec->nstfout);
+            write_em_traj(fplog, cr, outf, do_x, do_f, NULL,
+                          top_global, inputrec, count,
+                          s_min, state_global);
+        }
+        else
+        {
+            /* If energy is not smaller make the step smaller...  */
+            ustep *= 0.5;
+
+            if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
+            {
+                /* Reload the old state */
+                em_dd_partition_system(fplog, count, cr, top_global, inputrec,
+                                       s_min, top, mdatoms, fr, vsite, constr,
+                                       nrnb, wcycle);
+            }
+        }
+
+        /* Determine new step  */
+        stepsize = ustep/s_min->fmax;
+
+        /* Check if stepsize is too small, with 1 nm as a characteristic length */
+#if GMX_DOUBLE
+        if (count == nsteps || ustep < 1e-12)
+#else
+        if (count == nsteps || ustep < 1e-6)
+#endif
+        {
+            if (MASTER(cr))
+            {
+                warn_step(stderr, inputrec->em_tol, count == nsteps, constr != NULL);
+                warn_step(fplog, inputrec->em_tol, count == nsteps, constr != NULL);
+            }
+            bAbort = TRUE;
+        }
+
+        /* Send IMD energies and positions, if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        count++;
+    }   /* End of the loop  */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    /* Report the final write of the lowest-energy coordinates */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+    write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, count,
+                  s_min, state_global);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fnormn = s_min->fnorm/sqrtNumAtoms;
+
+        print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps,
+                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
+        print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps,
+                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    inputrec->nsteps = count;
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, count);
+
+    return 0;
+}   /* That's all folks */
+
+/*! \brief Do normal modes analysis
+    \copydoc integrator_t (FILE *fplog, t_commrec *cr,
+                           int nfile, const t_filenm fnm[],
+                           const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                           int nstglobalcomm,
+                           gmx_vsite_t *vsite, gmx_constr_t constr,
+                           int stepout,
+                           t_inputrec *inputrec,
+                           gmx_mtop_t *top_global, t_fcdata *fcd,
+                           t_state *state_global,
+                           t_mdatoms *mdatoms,
+                           t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                           gmx_edsam_t ed,
+                           t_forcerec *fr,
+                           int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                           real cpt_period, real max_hours,
+                           int imdport,
+                           unsigned long Flags,
+                           gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_nm(FILE *fplog, t_commrec *cr,
+             int nfile, const t_filenm fnm[],
+             const gmx_output_env_t gmx_unused *oenv, gmx_bool bVerbose,
+             int gmx_unused nstglobalcomm,
+             gmx_vsite_t *vsite, gmx_constr_t constr,
+             int gmx_unused stepout,
+             t_inputrec *inputrec,
+             gmx_mtop_t *top_global, t_fcdata *fcd,
+             t_state *state_global,
+             t_mdatoms *mdatoms,
+             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+             gmx_edsam_t  gmx_unused ed,
+             t_forcerec *fr,
+             int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
+             real gmx_unused cpt_period, real gmx_unused max_hours,
+             int imdport,
+             unsigned long gmx_unused Flags,
+             gmx_walltime_accounting_t walltime_accounting)
+{
+    const char          *NM = "Normal Mode Analysis";
+    gmx_mdoutf_t         outf;
+    int                  nnodes, node;
+    gmx_localtop_t      *top;
+    gmx_enerdata_t      *enerd;
+    rvec                *f;
+    gmx_global_stat_t    gstat;
+    t_graph             *graph;
+    tensor               vir, pres;
+    rvec                 mu_tot;
+    rvec                *fneg, *dfdx;
+    gmx_bool             bSparse; /* use sparse matrix storage format */
+    size_t               sz;
+    gmx_sparsematrix_t * sparse_matrix           = NULL;
+    real           *     full_matrix             = NULL;
+    em_state_t       *   state_work;
+
+    /* added with respect to mdrun */
+    int                       row, col;
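+    /* Finite-difference displacement (in nm) used to build the numerical Hessian */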
+    real                      der_range = 10.0*sqrt(GMX_REAL_EPS);
+    real                      x_min;
+    bool                      bIsMaster = MASTER(cr);
+
+    if (constr != NULL)
+    {
+        gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported");
+    }
+
+    state_work = init_em_state();
+
+    /* Init em and store the local state in state_work */
+    init_em(fplog, NM, cr, inputrec,
+            state_global, top_global, state_work, &top,
+            &f,
+            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
+            nfile, fnm, &outf, NULL, imdport, Flags, wcycle);
+
+    gmx_shellfc_t *shellfc = init_shell_flexcon(stdout,
+                                                top_global,
+                                                n_flexible_constraints(constr),
+                                                inputrec->nstcalcenergy,
+                                                DOMAINDECOMP(cr));
+
+    if (shellfc)
+    {
+        make_local_shells(cr, mdatoms, shellfc);
+    }
+    std::vector<size_t> atom_index = get_atom_index(top_global);
+    snew(fneg, atom_index.size());
+    snew(dfdx, atom_index.size());
+
+#if !GMX_DOUBLE
+    if (bIsMaster)
+    {
+        fprintf(stderr,
+                "NOTE: This version of GROMACS has been compiled in single precision,\n"
+                "      which MIGHT not be accurate enough for normal mode analysis.\n"
+                "      GROMACS now uses sparse matrix storage, so the memory requirements\n"
+                "      are fairly modest even if you recompile in double precision.\n\n");
+    }
+#endif
+
+    /* Check if we can/should use sparse storage format.
+     *
+     * Sparse format is only useful when the Hessian itself is sparse, which it
+     * will be when we use a cutoff.
+     * For small systems (n<1000) it is easier to always use full matrix format, though.
+     */
+    if (EEL_FULL(fr->eeltype) || fr->rlist == 0.0)
+    {
+        md_print_info(cr, fplog, "Non-cutoff electrostatics used, forcing full Hessian format.\n");
+        bSparse = FALSE;
+    }
+    else if (atom_index.size() < 1000)
+    {
+        md_print_info(cr, fplog, "Small system size (N=%d), using full Hessian format.\n", static_cast<int>(atom_index.size()));
+        bSparse = FALSE;
+    }
+    else
+    {
+        md_print_info(cr, fplog, "Using compressed symmetric sparse Hessian format.\n");
+        bSparse = TRUE;
+    }
+
+    /* Number of dimensions, based on real atoms, i.e. excluding vsites and shells */
+    sz = DIM*atom_index.size();
+
+    fprintf(stderr, "Allocating Hessian memory...\n\n");
+
+    if (bSparse)
+    {
+        sparse_matrix = gmx_sparsematrix_init(sz);
+        sparse_matrix->compressed_symmetric = TRUE;
+    }
+    else
+    {
+        snew(full_matrix, sz*sz);
+    }
+
+    init_nrnb(nrnb);
+
+    where();
+
+    /* Write start time and temperature */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, NM);
+
+    /* Fudge the number of steps to twice the number of atoms */
+    inputrec->nsteps = atom_index.size()*2;
+
+    if (bIsMaster)
+    {
+        fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n",
+                *(top_global->name), (int)inputrec->nsteps);
+    }
+
+    nnodes = cr->nnodes;
+
+    /* Make evaluate_energy do a single node force calculation */
+    cr->nnodes = 1;
+    evaluate_energy(fplog, cr,
+                    top_global, state_work, top,
+                    inputrec, nrnb, wcycle, gstat,
+                    vsite, constr, fcd, graph, mdatoms, fr,
+                    mu_tot, enerd, vir, pres, -1, TRUE);
+    cr->nnodes = nnodes;
+
+    /* if forces are not small, warn user */
+    get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, state_work);
+
+    md_print_info(cr, fplog, "Maximum force:%12.5e\n", state_work->fmax);
+    if (state_work->fmax > 1.0e-3)
+    {
+        md_print_info(cr, fplog,
+                      "The force is probably not small enough to "
+                      "ensure that you are at a minimum.\n"
+                      "Be aware that negative eigenvalues may occur\n"
+                      "when the resulting matrix is diagonalized.\n\n");
+    }
+
+    /***********************************************************
+     *
+     *      Loop over all pairs in matrix
+     *
+     *      do_force called twice. Once with positive and
+     *      once with negative displacement
+     *
+     ************************************************************/
+
+    /* Steps are divided one by one over the nodes */
+    bool bNS = true;
+    for (unsigned int aid = cr->nodeid; aid < atom_index.size(); aid += nnodes)
+    {
+        size_t atom = atom_index[aid];
+        for (size_t d = 0; d < DIM; d++)
+        {
+            gmx_bool    bBornRadii  = FALSE;
+            gmx_int64_t step        = 0;
+            int         force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES;
+            double      t           = 0;
+
+            x_min = state_work->s.x[atom][d];
+
+            for (unsigned int dx = 0; (dx < 2); dx++)
+            {
+                if (dx == 0)
+                {
+                    state_work->s.x[atom][d] = x_min - der_range;
+                }
+                else
+                {
+                    state_work->s.x[atom][d] = x_min + der_range;
+                }
+
+                /* Make evaluate_energy do a single node force calculation */
+                cr->nnodes = 1;
+                if (shellfc)
+                {
+                    /* Now is the time to relax the shells */
+                    (void) relax_shell_flexcon(fplog, cr, bVerbose, step,
+                                               inputrec, bNS, force_flags,
+                                               top,
+                                               constr, enerd, fcd,
+                                               &state_work->s, state_work->f, vir, mdatoms,
+                                               nrnb, wcycle, graph, &top_global->groups,
+                                               shellfc, fr, bBornRadii, t, mu_tot,
+                                               vsite, NULL);
+                    bNS = false;
+                    step++;
+                }
+                else
+                {
+                    evaluate_energy(fplog, cr,
+                                    top_global, state_work, top,
+                                    inputrec, nrnb, wcycle, gstat,
+                                    vsite, constr, fcd, graph, mdatoms, fr,
+                                    mu_tot, enerd, vir, pres, atom*2+dx, FALSE);
+                }
+
+                cr->nnodes = nnodes;
+
+                if (dx == 0)
+                {
+                    for (size_t i = 0; i < atom_index.size(); i++)
+                    {
+                        copy_rvec(state_work->f[atom_index[i]], fneg[i]);
+                    }
+                }
+            }
+
+            /* x is restored to original */
+            state_work->s.x[atom][d] = x_min;
+
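+            /* One row of the Hessian by central differences of the forces:
+             * H ~ -(f(x+h) - f(x-h)) / (2h), since the force is minus the gradient.
+             */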
+            for (size_t j = 0; j < atom_index.size(); j++)
+            {
+                for (size_t k = 0; (k < DIM); k++)
+                {
+                    dfdx[j][k] =
+                        -(state_work->f[atom_index[j]][k] - fneg[j][k])/(2*der_range);
+                }
+            }
+
+            if (!bIsMaster)
+            {
+#if GMX_MPI
+#define mpi_type GMX_MPI_REAL
+                MPI_Send(dfdx[0], atom_index.size()*DIM, mpi_type, MASTER(cr),
+                         cr->nodeid, cr->mpi_comm_mygroup);
+#endif
+            }
+            else
+            {
+                for (node = 0; (node < nnodes && atom+node < atom_index.size()); node++)
+                {
+                    if (node > 0)
+                    {
+#if GMX_MPI
+                        MPI_Status stat;
+                        MPI_Recv(dfdx[0], atom_index.size()*DIM, mpi_type, node, node,
+                                 cr->mpi_comm_mygroup, &stat);
+#undef mpi_type
+#endif
+                    }
+
+                    row = (atom + node)*DIM + d;
+
+                    for (size_t j = 0; j < atom_index.size(); j++)
+                    {
+                        for (size_t k = 0; k < DIM; k++)
+                        {
+                            col = j*DIM + k;
+
+                            if (bSparse)
+                            {
+                                if (col >= row && dfdx[j][k] != 0.0)
+                                {
+                                    gmx_sparsematrix_increment_value(sparse_matrix,
+                                                                     row, col, dfdx[j][k]);
+                                }
+                            }
+                            else
+                            {
+                                full_matrix[row*sz+col] = dfdx[j][k];
+                            }
+                        }
+                    }
+                }
+            }
+
+            if (bVerbose && fplog)
+            {
+                fflush(fplog);
+            }
+        }
+        /* write progress */
+        if (bIsMaster && bVerbose)
+        {
+            fprintf(stderr, "\rFinished step %d out of %d",
+                    static_cast<int>(std::min(atom+nnodes, atom_index.size())),
+                    static_cast<int>(atom_index.size()));
+            fflush(stderr);
+        }
+    }
+
+    if (bIsMaster)
+    {
+        fprintf(stderr, "\n\nWriting Hessian...\n");
+        gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size()*2);
+
+    return 0;
+}
+
+} // namespace gmx
diff --git a/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/minimize.cpp.preplumed b/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/minimize.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..5a205694090705e66fb3b3a07da9748f185c8bd7
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/gromacs/mdlib/minimize.cpp.preplumed
@@ -0,0 +1,3054 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file defines integrators for energy minimization
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "minimize.h"
+
+#include "config.h"
+
+#include <cmath>
+#include <cstring>
+#include <ctime>
+
+#include <algorithm>
+#include <vector>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/fileio/confio.h"
+#include "gromacs/fileio/mtxio.h"
+#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/linearalgebra/sparsematrix.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdebin.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/shellfc.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/tgroup.h"
+#include "gromacs/mdlib/trajectory_writing.h"
+#include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+//! Utility structure for manipulating states during EM
+typedef struct {
+    //! Copy of the global state
+    t_state  s;
+    //! Force array
+    rvec    *f;
+    //! Potential energy
+    real     epot;
+    //! Norm of the force
+    real     fnorm;
+    //! Maximum force
+    real     fmax;
+    //! Atom with the maximum force
+    int      a_fmax;
+} em_state_t;
+
+//! Initiate em_state_t structure and return pointer to it
+static em_state_t *init_em_state()
+{
+    em_state_t *ems;
+
+    snew(ems, 1);
+
+    /* does this need to be here?  Should the array be declared differently (statically) in the state definition? */
+    snew(ems->s.lambda, efptNR);
+
+    return ems;
+}
+
+//! Print the EM starting conditions
+static void print_em_start(FILE                     *fplog,
+                           t_commrec                *cr,
+                           gmx_walltime_accounting_t walltime_accounting,
+                           gmx_wallcycle_t           wcycle,
+                           const char               *name)
+{
+    walltime_accounting_start(walltime_accounting);
+    wallcycle_start(wcycle, ewcRUN);
+    print_start(fplog, cr, walltime_accounting, name);
+}
+
+//! Stop counting time for EM
+static void em_time_end(gmx_walltime_accounting_t walltime_accounting,
+                        gmx_wallcycle_t           wcycle)
+{
+    wallcycle_stop(wcycle, ewcRUN);
+
+    walltime_accounting_end(walltime_accounting);
+}
+
+//! Printing a log file and console header
+static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps)
+{
+    fprintf(out, "\n");
+    fprintf(out, "%s:\n", minimizer);
+    fprintf(out, "   Tolerance (Fmax)   = %12.5e\n", ftol);
+    fprintf(out, "   Number of steps    = %12d\n", nsteps);
+}
+
+//! Print warning message
+static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain)
+{
+    char buffer[2048];
+    if (bLastStep)
+    {
+        sprintf(buffer,
+                "\nEnergy minimization reached the maximum number "
+                "of steps before the forces reached the requested "
+                "precision Fmax < %g.\n", ftol);
+    }
+    else
+    {
+        sprintf(buffer,
+                "\nEnergy minimization has stopped, but the forces have "
+                "not converged to the requested precision Fmax < %g (which "
+                "may not be possible for your system). It stopped "
+                "because the algorithm tried to make a new step whose size "
+                "was too small, or there was no change in the energy since "
+                "last step. Either way, we regard the minimization as "
+                "converged to within the available machine precision, "
+                "given your starting configuration and EM parameters.\n%s%s",
+                ftol,
+                sizeof(real) < sizeof(double) ?
+                "\nDouble precision normally gives you higher accuracy, but "
+                "this is often not needed for preparing to run molecular "
+                "dynamics.\n" :
+                "",
+                bConstrain ?
+                "You might need to increase your constraint accuracy, or turn\n"
+                "off constraints altogether (set constraints = none in mdp file)\n" :
+                "");
+    }
+    fputs(wrap_lines(buffer, 78, 0, FALSE), fp);
+}
+
+//! Print message about convergence of the EM
+static void print_converged(FILE *fp, const char *alg, real ftol,
+                            gmx_int64_t count, gmx_bool bDone, gmx_int64_t nsteps,
+                            real epot, real fmax, int nfmax, real fnorm)
+{
+    char buf[STEPSTRSIZE];
+
+    if (bDone)
+    {
+        fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n",
+                alg, ftol, gmx_step_str(count, buf));
+    }
+    else if (count < nsteps)
+    {
+        fprintf(fp, "\n%s converged to machine precision in %s steps,\n"
+                "but did not reach the requested Fmax < %g.\n",
+                alg, gmx_step_str(count, buf), ftol);
+    }
+    else
+    {
+        fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n",
+                alg, ftol, gmx_step_str(count, buf));
+    }
+
+#if GMX_DOUBLE
+    fprintf(fp, "Potential Energy  = %21.14e\n", epot);
+    fprintf(fp, "Maximum force     = %21.14e on atom %d\n", fmax, nfmax+1);
+    fprintf(fp, "Norm of force     = %21.14e\n", fnorm);
+#else
+    fprintf(fp, "Potential Energy  = %14.7e\n", epot);
+    fprintf(fp, "Maximum force     = %14.7e on atom %d\n", fmax, nfmax+1);
+    fprintf(fp, "Norm of force     = %14.7e\n", fnorm);
+#endif
+}
+
+//! Compute the norm and max of the force array in parallel
+static void get_f_norm_max(t_commrec *cr,
+                           t_grpopts *opts, t_mdatoms *mdatoms, rvec *f,
+                           real *fnorm, real *fmax, int *a_fmax)
+{
+    double fnorm2, *sum;
+    real   fmax2, fam;
+    int    la_max, a_max, start, end, i, m, gf;
+
+    /* This routine finds the largest force and returns it.
+     * On parallel machines the global max is taken.
+     */
+    fnorm2 = 0;
+    fmax2  = 0;
+    la_max = -1;
+    start  = 0;
+    end    = mdatoms->homenr;
+    if (mdatoms->cFREEZE)
+    {
+        for (i = start; i < end; i++)
+        {
+            gf  = mdatoms->cFREEZE[i];
+            fam = 0;
+            for (m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    fam += gmx::square(f[i][m]);
+                }
+            }
+            fnorm2 += fam;
+            if (fam > fmax2)
+            {
+                fmax2  = fam;
+                la_max = i;
+            }
+        }
+    }
+    else
+    {
+        for (i = start; i < end; i++)
+        {
+            fam     = norm2(f[i]);
+            fnorm2 += fam;
+            if (fam > fmax2)
+            {
+                fmax2  = fam;
+                la_max = i;
+            }
+        }
+    }
+
+    if (la_max >= 0 && DOMAINDECOMP(cr))
+    {
+        a_max = cr->dd->gatindex[la_max];
+    }
+    else
+    {
+        a_max = la_max;
+    }
+    if (PAR(cr))
+    {
+        snew(sum, 2*cr->nnodes+1);
+        sum[2*cr->nodeid]   = fmax2;
+        sum[2*cr->nodeid+1] = a_max;
+        sum[2*cr->nnodes]   = fnorm2;
+        gmx_sumd(2*cr->nnodes+1, sum, cr);
+        fnorm2 = sum[2*cr->nnodes];
+        /* Determine the global maximum */
+        for (i = 0; i < cr->nnodes; i++)
+        {
+            if (sum[2*i] > fmax2)
+            {
+                fmax2 = sum[2*i];
+                a_max = (int)(sum[2*i+1] + 0.5);
+            }
+        }
+        sfree(sum);
+    }
+
+    if (fnorm)
+    {
+        *fnorm = sqrt(fnorm2);
+    }
+    if (fmax)
+    {
+        *fmax  = sqrt(fmax2);
+    }
+    if (a_fmax)
+    {
+        *a_fmax = a_max;
+    }
+}
+
+//! Compute the norm of the force
+static void get_state_f_norm_max(t_commrec *cr,
+                                 t_grpopts *opts, t_mdatoms *mdatoms,
+                                 em_state_t *ems)
+{
+    get_f_norm_max(cr, opts, mdatoms, ems->f, &ems->fnorm, &ems->fmax, &ems->a_fmax);
+}
+
+//! Initialize the energy minimization
+void init_em(FILE *fplog, const char *title,
+             t_commrec *cr, t_inputrec *ir,
+             t_state *state_global, gmx_mtop_t *top_global,
+             em_state_t *ems, gmx_localtop_t **top,
+             rvec **f,
+             t_nrnb *nrnb, rvec mu_tot,
+             t_forcerec *fr, gmx_enerdata_t **enerd,
+             t_graph **graph, t_mdatoms *mdatoms, gmx_global_stat_t *gstat,
+             gmx_vsite_t *vsite, gmx_constr_t constr,
+             int nfile, const t_filenm fnm[],
+             gmx_mdoutf_t *outf, t_mdebin **mdebin,
+             int imdport, unsigned long gmx_unused Flags,
+             gmx_wallcycle_t wcycle)
+{
+    int  i;
+    real dvdl_constr;
+
+    if (fplog)
+    {
+        fprintf(fplog, "Initiating %s\n", title);
+    }
+
+    state_global->ngtc = 0;
+
+    /* Initialize lambda variables */
+    initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, NULL);
+
+    init_nrnb(nrnb);
+
+    /* Interactive molecular dynamics */
+    init_IMD(ir, cr, top_global, fplog, 1, state_global->x,
+             nfile, fnm, NULL, imdport, Flags);
+
+    if (DOMAINDECOMP(cr))
+    {
+        *top = dd_init_local_top(top_global);
+
+        dd_init_local_state(cr->dd, state_global, &ems->s);
+
+        *f = NULL;
+
+        /* Distribute the charge groups over the nodes from the master node */
+        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
+                            state_global, top_global, ir,
+                            &ems->s, &ems->f, mdatoms, *top,
+                            fr, vsite, constr,
+                            nrnb, NULL, FALSE);
+        dd_store_state(cr->dd, &ems->s);
+
+        *graph = NULL;
+    }
+    else
+    {
+        snew(*f, top_global->natoms);
+
+        /* Just copy the state */
+        ems->s = *state_global;
+        /* We need to allocate one element extra, since we might use
+         * (unaligned) 4-wide SIMD loads to access rvec entries.
+         */
+        snew(ems->s.x, ems->s.nalloc + 1);
+        snew(ems->f, ems->s.nalloc+1);
+        snew(ems->s.v, ems->s.nalloc+1);
+        for (i = 0; i < state_global->natoms; i++)
+        {
+            copy_rvec(state_global->x[i], ems->s.x[i]);
+        }
+        copy_mat(state_global->box, ems->s.box);
+
+        *top      = gmx_mtop_generate_local_top(top_global, ir->efep != efepNO);
+
+        forcerec_set_excl_load(fr, *top);
+
+        setup_bonded_threading(fr, &(*top)->idef);
+
+        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
+        {
+            *graph = mk_graph(fplog, &((*top)->idef), 0, top_global->natoms, FALSE, FALSE);
+        }
+        else
+        {
+            *graph = NULL;
+        }
+
+        atoms2md(top_global, ir, 0, NULL, top_global->natoms, mdatoms);
+        update_mdatoms(mdatoms, state_global->lambda[efptFEP]);
+
+        if (vsite)
+        {
+            set_vsite_top(vsite, *top, mdatoms, cr);
+        }
+    }
+
+    if (constr)
+    {
+        if (ir->eConstrAlg == econtSHAKE &&
+            gmx_mtop_ftype_count(top_global, F_CONSTR) > 0)
+        {
+            gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n",
+                      econstr_names[econtSHAKE], econstr_names[econtLINCS]);
+        }
+
+        if (!DOMAINDECOMP(cr))
+        {
+            set_constraints(constr, *top, ir, mdatoms, cr);
+        }
+
+        if (!ir->bContinuation)
+        {
+            /* Constrain the starting coordinates */
+            dvdl_constr = 0;
+            constrain(PAR(cr) ? NULL : fplog, TRUE, TRUE, constr, &(*top)->idef,
+                      ir, cr, -1, 0, 1.0, mdatoms,
+                      ems->s.x, ems->s.x, NULL, fr->bMolPBC, ems->s.box,
+                      ems->s.lambda[efptFEP], &dvdl_constr,
+                      NULL, NULL, nrnb, econqCoord);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        *gstat = global_stat_init(ir);
+    }
+    else
+    {
+        *gstat = NULL;
+    }
+
+    *outf = init_mdoutf(fplog, nfile, fnm, 0, cr, ir, top_global, NULL, wcycle);
+
+    snew(*enerd, 1);
+    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+                  *enerd);
+
+    if (mdebin != NULL)
+    {
+        /* Init bin for energy stuff */
+        *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, NULL);
+    }
+
+    clear_rvec(mu_tot);
+    calc_shifts(ems->s.box, fr->shift_vec);
+}
+
+//! Finalize the minimization
+static void finish_em(t_commrec *cr, gmx_mdoutf_t outf,
+                      gmx_walltime_accounting_t walltime_accounting,
+                      gmx_wallcycle_t wcycle)
+{
+    if (!(cr->duty & DUTY_PME))
+    {
+        /* Tell the PME only node to finish */
+        gmx_pme_send_finish(cr);
+    }
+
+    done_mdoutf(outf);
+
+    em_time_end(walltime_accounting, wcycle);
+}
+
+//! Swap two different EM states during minimization
+static void swap_em_state(em_state_t *ems1, em_state_t *ems2)
+{
+    em_state_t tmp;
+
+    tmp   = *ems1;
+    *ems1 = *ems2;
+    *ems2 = tmp;
+}
+
+//! Copy coordinates from an EM state to a "normal" state structure
+static void copy_em_coords(em_state_t *ems, t_state *state)
+{
+    int i;
+
+    for (i = 0; (i < state->natoms); i++)
+    {
+        copy_rvec(ems->s.x[i], state->x[i]);
+    }
+}
+
+//! Save the EM trajectory
+static void write_em_traj(FILE *fplog, t_commrec *cr,
+                          gmx_mdoutf_t outf,
+                          gmx_bool bX, gmx_bool bF, const char *confout,
+                          gmx_mtop_t *top_global,
+                          t_inputrec *ir, gmx_int64_t step,
+                          em_state_t *state,
+                          t_state *state_global)
+{
+    int      mdof_flags;
+    gmx_bool bIMDout = FALSE;
+
+
+    /* Shall we do IMD output? */
+    if (ir->bIMD)
+    {
+        bIMDout = do_per_step(step, IMD_get_step(ir->imd->setup));
+    }
+
+    if ((bX || bF || bIMDout || confout != NULL) && !DOMAINDECOMP(cr))
+    {
+        copy_em_coords(state, state_global);
+    }
+
+    mdof_flags = 0;
+    if (bX)
+    {
+        mdof_flags |= MDOF_X;
+    }
+    if (bF)
+    {
+        mdof_flags |= MDOF_F;
+    }
+
+    /* If we want IMD output, set appropriate MDOF flag */
+    if (ir->bIMD)
+    {
+        mdof_flags |= MDOF_IMD;
+    }
+
+    mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags,
+                                     top_global, step, (double)step,
+                                     &state->s, state_global, state->f);
+
+    if (confout != NULL && MASTER(cr))
+    {
+        if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr))
+        {
+            /* Make molecules whole only for confout writing */
+            do_pbc_mtop(fplog, ir->ePBC, state_global->box, top_global,
+                        state_global->x);
+        }
+
+        write_sto_conf_mtop(confout,
+                            *top_global->name, top_global,
+                            state_global->x, NULL, ir->ePBC, state_global->box);
+    }
+}
+
+//! Do one minimization step
+static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md,
+                       gmx_bool bMolPBC,
+                       em_state_t *ems1, real a, rvec *f, em_state_t *ems2,
+                       gmx_constr_t constr, gmx_localtop_t *top,
+                       t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                       gmx_int64_t count)
+
+{
+    t_state *s1, *s2;
+    int      i;
+    int      start, end;
+    rvec    *x1, *x2;
+    real     dvdl_constr;
+    int      nthreads gmx_unused;
+
+    s1 = &ems1->s;
+    s2 = &ems2->s;
+
+    if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count)
+    {
+        gmx_incons("state mismatch in do_em_step");
+    }
+
+    s2->flags = s1->flags;
+
+    if (s2->nalloc != s1->nalloc)
+    {
+        s2->nalloc = s1->nalloc;
+        /* We need to allocate one element extra, since we might use
+         * (unaligned) 4-wide SIMD loads to access rvec entries.
+         */
+        srenew(s2->x, s1->nalloc + 1);
+        srenew(ems2->f,  s1->nalloc);
+        if (s2->flags & (1<<estCGP))
+        {
+            srenew(s2->cg_p,  s1->nalloc + 1);
+        }
+    }
+
+    s2->natoms = s1->natoms;
+    copy_mat(s1->box, s2->box);
+    /* Copy free energy state */
+    for (i = 0; i < efptNR; i++)
+    {
+        s2->lambda[i] = s1->lambda[i];
+    }
+    copy_mat(s1->box, s2->box);
+
+    start = 0;
+    end   = md->homenr;
+
+    x1 = s1->x;
+    x2 = s2->x;
+
+    // cppcheck-suppress unreadVariable
+    nthreads = gmx_omp_nthreads_get(emntUpdate);
+#pragma omp parallel num_threads(nthreads)
+    {
+        int gf, i, m;
+
+        gf = 0;
+#pragma omp for schedule(static) nowait
+        for (i = start; i < end; i++)
+        {
+            try
+            {
+                if (md->cFREEZE)
+                {
+                    gf = md->cFREEZE[i];
+                }
+                for (m = 0; m < DIM; m++)
+                {
+                    if (ir->opts.nFreeze[gf][m])
+                    {
+                        x2[i][m] = x1[i][m];
+                    }
+                    else
+                    {
+                        x2[i][m] = x1[i][m] + a*f[i][m];
+                    }
+                }
+            }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+        }
+
+        if (s2->flags & (1<<estCGP))
+        {
+            /* Copy the CG p vector */
+            x1 = s1->cg_p;
+            x2 = s2->cg_p;
+#pragma omp for schedule(static) nowait
+            for (i = start; i < end; i++)
+            {
+                // Trivial OpenMP block that does not throw
+                copy_rvec(x1[i], x2[i]);
+            }
+        }
+
+        if (DOMAINDECOMP(cr))
+        {
+            s2->ddp_count = s1->ddp_count;
+            if (s2->cg_gl_nalloc < s1->cg_gl_nalloc)
+            {
+#pragma omp barrier
+                s2->cg_gl_nalloc = s1->cg_gl_nalloc;
+                try
+                {
+                    /* We need to allocate one element extra, since we might use
+                     * (unaligned) 4-wide SIMD loads to access rvec entries.
+                     */
+                    srenew(s2->cg_gl, s2->cg_gl_nalloc + 1);
+                }
+                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+#pragma omp barrier
+            }
+            s2->ncg_gl = s1->ncg_gl;
+#pragma omp for schedule(static) nowait
+            for (i = 0; i < s2->ncg_gl; i++)
+            {
+                s2->cg_gl[i] = s1->cg_gl[i];
+            }
+            s2->ddp_count_cg_gl = s1->ddp_count_cg_gl;
+        }
+    }
+
+    if (constr)
+    {
+        wallcycle_start(wcycle, ewcCONSTR);
+        dvdl_constr = 0;
+        constrain(NULL, TRUE, TRUE, constr, &top->idef,
+                  ir, cr, count, 0, 1.0, md,
+                  s1->x, s2->x, NULL, bMolPBC, s2->box,
+                  s2->lambda[efptBONDED], &dvdl_constr,
+                  NULL, NULL, nrnb, econqCoord);
+        wallcycle_stop(wcycle, ewcCONSTR);
+    }
+}
+
+//! Prepare EM for using domain decomposition parallelization
+static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr,
+                                   gmx_mtop_t *top_global, t_inputrec *ir,
+                                   em_state_t *ems, gmx_localtop_t *top,
+                                   t_mdatoms *mdatoms, t_forcerec *fr,
+                                   gmx_vsite_t *vsite, gmx_constr_t constr,
+                                   t_nrnb *nrnb, gmx_wallcycle_t wcycle)
+{
+    /* Repartition the domain decomposition */
+    dd_partition_system(fplog, step, cr, FALSE, 1,
+                        NULL, top_global, ir,
+                        &ems->s, &ems->f,
+                        mdatoms, top, fr, vsite, constr,
+                        nrnb, wcycle, FALSE);
+    dd_store_state(cr->dd, &ems->s);
+}
+
+//! Do one energy evaluation
+static void evaluate_energy(FILE *fplog, t_commrec *cr,
+                            gmx_mtop_t *top_global,
+                            em_state_t *ems, gmx_localtop_t *top,
+                            t_inputrec *inputrec,
+                            t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                            gmx_global_stat_t gstat,
+                            gmx_vsite_t *vsite, gmx_constr_t constr,
+                            t_fcdata *fcd,
+                            t_graph *graph, t_mdatoms *mdatoms,
+                            t_forcerec *fr, rvec mu_tot,
+                            gmx_enerdata_t *enerd, tensor vir, tensor pres,
+                            gmx_int64_t count, gmx_bool bFirst)
+{
+    real     t;
+    gmx_bool bNS;
+    tensor   force_vir, shake_vir, ekin;
+    real     dvdl_constr, prescorr, enercorr, dvdlcorr;
+    real     terminate = 0;
+
+    /* Set the time to the initial time, the time does not change during EM */
+    t = inputrec->init_t;
+
+    if (bFirst ||
+        (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count))
+    {
+        /* This is the first state or an old state used before the last ns */
+        bNS = TRUE;
+    }
+    else
+    {
+        bNS = FALSE;
+        if (inputrec->nstlist > 0)
+        {
+            bNS = TRUE;
+        }
+    }
+
+    if (vsite)
+    {
+        construct_vsites(vsite, ems->s.x, 1, NULL,
+                         top->idef.iparams, top->idef.il,
+                         fr->ePBC, fr->bMolPBC, cr, ems->s.box);
+    }
+
+    if (DOMAINDECOMP(cr) && bNS)
+    {
+        /* Repartition the domain decomposition */
+        em_dd_partition_system(fplog, count, cr, top_global, inputrec,
+                               ems, top, mdatoms, fr, vsite, constr,
+                               nrnb, wcycle);
+    }
+
+    /* Calc force & energy on new trial position  */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole in congrad.c
+     */
+    do_force(fplog, cr, inputrec,
+             count, nrnb, wcycle, top, &top_global->groups,
+             ems->s.box, ems->s.x, &ems->s.hist,
+             ems->f, force_vir, mdatoms, enerd, fcd,
+             ems->s.lambda, graph, fr, vsite, mu_tot, t, NULL, NULL, TRUE,
+             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES |
+             GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY |
+             (bNS ? GMX_FORCE_NS : 0));
+
+    /* Clear the unused shake virial and pressure */
+    clear_mat(shake_vir);
+    clear_mat(pres);
+
+    /* Communicate stuff when parallel */
+    if (PAR(cr) && inputrec->eI != eiNM)
+    {
+        wallcycle_start(wcycle, ewcMoveE);
+
+        global_stat(gstat, cr, enerd, force_vir, shake_vir, mu_tot,
+                    inputrec, NULL, NULL, NULL, 1, &terminate,
+                    NULL, FALSE,
+                    CGLO_ENERGY |
+                    CGLO_PRESSURE |
+                    CGLO_CONSTRAINT);
+
+        wallcycle_stop(wcycle, ewcMoveE);
+    }
+
+    /* Calculate long range corrections to pressure and energy */
+    calc_dispcorr(inputrec, fr, ems->s.box, ems->s.lambda[efptVDW],
+                  pres, force_vir, &prescorr, &enercorr, &dvdlcorr);
+    enerd->term[F_DISPCORR] = enercorr;
+    enerd->term[F_EPOT]    += enercorr;
+    enerd->term[F_PRES]    += prescorr;
+    enerd->term[F_DVDL]    += dvdlcorr;
+
+    ems->epot = enerd->term[F_EPOT];
+
+    if (constr)
+    {
+        /* Project out the constraint components of the force */
+        wallcycle_start(wcycle, ewcCONSTR);
+        dvdl_constr = 0;
+        constrain(NULL, FALSE, FALSE, constr, &top->idef,
+                  inputrec, cr, count, 0, 1.0, mdatoms,
+                  ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box,
+                  ems->s.lambda[efptBONDED], &dvdl_constr,
+                  NULL, &shake_vir, nrnb, econqForceDispl);
+        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+        m_add(force_vir, shake_vir, vir);
+        wallcycle_stop(wcycle, ewcCONSTR);
+    }
+    else
+    {
+        copy_mat(force_vir, vir);
+    }
+
+    clear_mat(ekin);
+    enerd->term[F_PRES] =
+        calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres);
+
+    sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals);
+
+    if (EI_ENERGY_MINIMIZATION(inputrec->eI))
+    {
+        get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, ems);
+    }
+}
+
+//! Parallel utility computing the Polak-Ribiere sum when charge-group ordering differs between states
+static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
+                              gmx_mtop_t *top_global,
+                              em_state_t *s_min, em_state_t *s_b)
+{
+    rvec          *fm, *fb, *fmg;
+    t_block       *cgs_gl;
+    int            ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m;
+    double         partsum;
+    unsigned char *grpnrFREEZE;
+
+    if (debug)
+    {
+        fprintf(debug, "Doing reorder_partsum\n");
+    }
+
+    fm = s_min->f;
+    fb = s_b->f;
+
+    cgs_gl = dd_charge_groups_global(cr->dd);
+    index  = cgs_gl->index;
+
+    /* Collect fm in a global vector fmg.
+     * This conflicts with the spirit of domain decomposition,
+     * but to fully optimize this a much more complicated algorithm is required.
+     */
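+    /* fm is stored in the charge-group order of s_min while fb below follows
+     * the ordering of s_b; gathering fm into the globally indexed fmg lets the
+     * two force sets be combined atom by atom despite the different local orderings.
+     */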
+    snew(fmg, top_global->natoms);
+
+    ncg   = s_min->s.ncg_gl;
+    cg_gl = s_min->s.cg_gl;
+    i     = 0;
+    for (c = 0; c < ncg; c++)
+    {
+        cg = cg_gl[c];
+        a0 = index[cg];
+        a1 = index[cg+1];
+        for (a = a0; a < a1; a++)
+        {
+            copy_rvec(fm[i], fmg[a]);
+            i++;
+        }
+    }
+    gmx_sum(top_global->natoms*3, fmg[0], cr);
+
+    /* Now we will determine the part of the sum for the cgs in state s_b */
+    ncg         = s_b->s.ncg_gl;
+    cg_gl       = s_b->s.cg_gl;
+    partsum     = 0;
+    i           = 0;
+    gf          = 0;
+    grpnrFREEZE = top_global->groups.grpnr[egcFREEZE];
+    for (c = 0; c < ncg; c++)
+    {
+        cg = cg_gl[c];
+        a0 = index[cg];
+        a1 = index[cg+1];
+        for (a = a0; a < a1; a++)
+        {
+            if (mdatoms->cFREEZE && grpnrFREEZE)
+            {
+                gf = grpnrFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    partsum += (fb[i][m] - fmg[a][m])*fb[i][m];
+                }
+            }
+            i++;
+        }
+    }
+
+    sfree(fmg);
+
+    return partsum;
+}
+
+//! Compute the Polak-Ribiere beta used to update the conjugate-gradient search direction
+static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
+                    gmx_mtop_t *top_global,
+                    em_state_t *s_min, em_state_t *s_b)
+{
+    rvec  *fm, *fb;
+    double sum;
+    int    gf, i, m;
+
+    /* This is just the classical Polak-Ribiere calculation of beta;
+     * it looks a bit complicated since we take freeze groups into account,
+     * and might have to sum it in parallel runs.
+     */
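+    /* In this notation beta_PR = sum_i (f_b,i - f_min,i) . f_b,i / |f_min|^2 with
+     * f = -grad E, i.e. the usual Polak-Ribiere expression
+     * grad E_new . (grad E_new - grad E_old) / |grad E_old|^2.
+     */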
+
+    if (!DOMAINDECOMP(cr) ||
+        (s_min->s.ddp_count == cr->dd->ddp_count &&
+         s_b->s.ddp_count   == cr->dd->ddp_count))
+    {
+        fm  = s_min->f;
+        fb  = s_b->f;
+        sum = 0;
+        gf  = 0;
+        /* This part of code can be incorrect with DD,
+         * since the atom ordering in s_b and s_min might differ.
+         */
+        for (i = 0; i < mdatoms->homenr; i++)
+        {
+            if (mdatoms->cFREEZE)
+            {
+                gf = mdatoms->cFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!opts->nFreeze[gf][m])
+                {
+                    sum += (fb[i][m] - fm[i][m])*fb[i][m];
+                }
+            }
+        }
+    }
+    else
+    {
+        /* We need to reorder cgs while summing */
+        sum = reorder_partsum(cr, opts, mdatoms, top_global, s_min, s_b);
+    }
+    if (PAR(cr))
+    {
+        gmx_sumd(1, &sum, cr);
+    }
+
+    return sum/gmx::square(s_min->fnorm);
+}
+
+namespace gmx
+{
+
+/*! \brief Do conjugate gradients minimization
+    \copydoc integrator_t (FILE *fplog, t_commrec *cr,
+                           int nfile, const t_filenm fnm[],
+                           const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                           int nstglobalcomm,
+                           gmx_vsite_t *vsite, gmx_constr_t constr,
+                           int stepout,
+                           t_inputrec *inputrec,
+                           gmx_mtop_t *top_global, t_fcdata *fcd,
+                           t_state *state_global,
+                           t_mdatoms *mdatoms,
+                           t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                           gmx_edsam_t ed,
+                           t_forcerec *fr,
+                           int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                           real cpt_period, real max_hours,
+                           int imdport,
+                           unsigned long Flags,
+                           gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_cg(FILE *fplog, t_commrec *cr,
+             int nfile, const t_filenm fnm[],
+             const gmx_output_env_t gmx_unused *oenv, gmx_bool bVerbose,
+             int gmx_unused nstglobalcomm,
+             gmx_vsite_t *vsite, gmx_constr_t constr,
+             int gmx_unused stepout,
+             t_inputrec *inputrec,
+             gmx_mtop_t *top_global, t_fcdata *fcd,
+             t_state *state_global,
+             t_mdatoms *mdatoms,
+             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+             gmx_edsam_t gmx_unused ed,
+             t_forcerec *fr,
+             int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
+             real gmx_unused cpt_period, real gmx_unused max_hours,
+             int imdport,
+             unsigned long gmx_unused Flags,
+             gmx_walltime_accounting_t walltime_accounting)
+{
+    const char       *CG = "Polak-Ribiere Conjugate Gradients";
+
+    em_state_t       *s_min, *s_a, *s_b, *s_c;
+    gmx_localtop_t   *top;
+    gmx_enerdata_t   *enerd;
+    rvec             *f;
+    gmx_global_stat_t gstat;
+    t_graph          *graph;
+    rvec             *p, *sf;
+    double            gpa, gpb, gpc, tmp, minstep;
+    real              fnormn;
+    real              stepsize;
+    real              a, b, c, beta = 0.0;
+    real              epot_repl = 0;
+    real              pnorm;
+    t_mdebin         *mdebin;
+    gmx_bool          converged, foundlower;
+    rvec              mu_tot;
+    gmx_bool          do_log = FALSE, do_ene = FALSE, do_x, do_f;
+    tensor            vir, pres;
+    int               number_steps, neval = 0, nstcg = inputrec->nstcgsteep;
+    gmx_mdoutf_t      outf;
+    int               i, m, gf, step, nminstep;
+
+    step = 0;
+
+    s_min = init_em_state();
+    s_a   = init_em_state();
+    s_b   = init_em_state();
+    s_c   = init_em_state();
+
+    /* Init em and store the local state in s_min */
+    init_em(fplog, CG, cr, inputrec,
+            state_global, top_global, s_min, &top, &f,
+            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
+
+    /* Print to log file */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, CG);
+
+    /* Max number of steps */
+    number_steps = inputrec->nsteps;
+
+    if (MASTER(cr))
+    {
+        sp_header(stderr, CG, inputrec->em_tol, number_steps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, CG, inputrec->em_tol, number_steps);
+    }
+
+    /* Call the force routine and some auxiliary (neighboursearching etc.) */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole in congrad.c
+     */
+    evaluate_energy(fplog, cr,
+                    top_global, s_min, top,
+                    inputrec, nrnb, wcycle, gstat,
+                    vsite, constr, fcd, graph, mdatoms, fr,
+                    mu_tot, enerd, vir, pres, -1, TRUE);
+    where();
+
+    if (MASTER(cr))
+    {
+        /* Copy stuff to the energy bin for easy printing etc. */
+        upd_mdebin(mdebin, FALSE, FALSE, (double)step,
+                   mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box,
+                   NULL, NULL, vir, pres, NULL, mu_tot, constr);
+
+        print_ebin_header(fplog, step, step);
+        print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+    }
+    where();
+
+    /* Estimate/guess the initial stepsize */
+    stepsize = inputrec->em_stepsize/s_min->fnorm;
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fprintf(stderr, "   F-max             = %12.5e on atom %d\n",
+                s_min->fmax, s_min->a_fmax+1);
+        fprintf(stderr, "   F-Norm            = %12.5e\n",
+                s_min->fnorm/sqrtNumAtoms);
+        fprintf(stderr, "\n");
+        /* and copy to the log file too... */
+        fprintf(fplog, "   F-max             = %12.5e on atom %d\n",
+                s_min->fmax, s_min->a_fmax+1);
+        fprintf(fplog, "   F-Norm            = %12.5e\n",
+                s_min->fnorm/sqrtNumAtoms);
+        fprintf(fplog, "\n");
+    }
+    /* Start the loop over CG steps.
+     * Each successful step is counted, and we continue until
+     * we either converge or reach the max number of steps.
+     */
+    converged = FALSE;
+    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
+    {
+
+        /* start taking steps in a new direction
+         * First time we enter the routine, beta=0, and the direction is
+         * simply the negative gradient.
+         */
+
+        /* Calculate the new direction in p, and the gradient in this direction, gpa */
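+        /* With f = -grad E this is p_new = f + beta*p_old, and
+         * gpa = -sum p.f is the directional derivative dE/dlambda along p.
+         */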
+        p   = s_min->s.cg_p;
+        sf  = s_min->f;
+        gpa = 0;
+        gf  = 0;
+        for (i = 0; i < mdatoms->homenr; i++)
+        {
+            if (mdatoms->cFREEZE)
+            {
+                gf = mdatoms->cFREEZE[i];
+            }
+            for (m = 0; m < DIM; m++)
+            {
+                if (!inputrec->opts.nFreeze[gf][m])
+                {
+                    p[i][m] = sf[i][m] + beta*p[i][m];
+                    gpa    -= p[i][m]*sf[i][m];
+                    /* f is negative gradient, thus the sign */
+                }
+                else
+                {
+                    p[i][m] = 0;
+                }
+            }
+        }
+
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpa, cr);
+        }
+
+        /* Calculate the norm of the search vector */
+        get_f_norm_max(cr, &(inputrec->opts), mdatoms, p, &pnorm, NULL, NULL);
+
+        /* Just in case stepsize reaches zero due to numerical precision... */
+        if (stepsize <= 0)
+        {
+            stepsize = inputrec->em_stepsize/pnorm;
+        }
+
+        /*
+         * Double check the value of the derivative in the search direction.
+         * If it is positive it must be due to the old information in the
+         * CG formula, so just remove that and start over with beta=0.
+         * This corresponds to a steepest descent step.
+         */
+        if (gpa > 0)
+        {
+            beta = 0;
+            step--;   /* Don't count this step since we are restarting */
+            continue; /* Go back to the beginning of the big for-loop */
+        }
+
+        /* Calculate minimum allowed stepsize, before the average (norm)
+         * relative change in coordinate is smaller than precision
+         */
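+        /* minstep accumulates sum_i (p_i/max(|x_i|,1))^2, so the limit computed
+         * below, GMX_REAL_EPS/sqrt(minstep/(3*natoms)), is the step length at
+         * which the RMS relative coordinate change reaches machine precision.
+         */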
+        minstep = 0;
+        for (i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                tmp = fabs(s_min->s.x[i][m]);
+                if (tmp < 1.0)
+                {
+                    tmp = 1.0;
+                }
+                tmp      = p[i][m]/tmp;
+                minstep += tmp*tmp;
+            }
+        }
+        /* Add up from all CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &minstep, cr);
+        }
+
+        minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms));
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        write_em_traj(fplog, cr, outf, do_x, do_f, NULL,
+                      top_global, inputrec, step,
+                      s_min, state_global);
+
+        /* Take a step downhill.
+         * In theory, we should minimize the function along this direction.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we don't really need to find the exact minimum -
+         * it is much better to start a new CG step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize but
+         * then continue straight to the next CG step without trying to find any minimum.
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to even accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
+        s_a->epot = s_min->epot;
+        a         = 0.0;
+        c         = a + stepsize; /* reference position along line is zero */
+
+        if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count)
+        {
+            em_dd_partition_system(fplog, step, cr, top_global, inputrec,
+                                   s_min, top, mdatoms, fr, vsite, constr,
+                                   nrnb, wcycle);
+        }
+
+        /* Take a trial step (new coords in s_c) */
+        do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, s_min->s.cg_p, s_c,
+                   constr, top, nrnb, wcycle, -1);
+
+        neval++;
+        /* Calculate energy for the trial step */
+        evaluate_energy(fplog, cr,
+                        top_global, s_c, top,
+                        inputrec, nrnb, wcycle, gstat,
+                        vsite, constr, fcd, graph, mdatoms, fr,
+                        mu_tot, enerd, vir, pres, -1, FALSE);
+
+        /* Calc derivative along line */
+        p   = s_c->s.cg_p;
+        sf  = s_c->f;
+        gpc = 0;
+        for (i = 0; i < mdatoms->homenr; i++)
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                gpc -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */
+            }
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        /* This is the max amount of increase in energy we tolerate */
+        tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot);
+
+        /* Accept the step if the energy is lower, or if it is not significantly higher
+         * and the line derivative is still negative.
+         */
+        if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp)))
+        {
+            foundlower = TRUE;
+            /* Great, we found a better energy. Increase step for next iteration
+             * if we are still going down, decrease it otherwise
+             */
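+            /* 1.618034 is the golden ratio (1+sqrt(5))/2 and 0.618034 is its
+             * inverse, the classic factors for growing and shrinking the step.
+             */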
+            if (gpc < 0)
+            {
+                stepsize *= 1.618034; /* The golden section */
+            }
+            else
+            {
+                stepsize *= 0.618034; /* 1/golden section */
+            }
+        }
+        else
+        {
+            /* New energy is the same or higher. We will have to do some work
+             * to find a smaller value in the interval. Take smaller step next time!
+             */
+            foundlower = FALSE;
+            stepsize  *= 0.618034;
+        }
+
+
+
+
+        /* OK, if we didn't find a lower value we will have to locate one now - there must
+         * be one in the interval [a=0,c].
+         * The same thing is valid here, though: Don't spend dozens of iterations to find
+         * the line minimum. We try to interpolate based on the derivative at the endpoints,
+         * and only continue until we find a lower value. In most cases this means 1-2 iterations.
+         *
+         * I also have a safeguard for potentially really pathological functions so we never
+         * take more than 20 steps before we give up ...
+         *
+         * If we already found a lower value we just skip this step and continue to the update.
+         */
+        if (!foundlower)
+        {
+            nminstep = 0;
+
+            do
+            {
+                /* Select a new trial point.
+                 * If the derivatives at points a & c have different sign we interpolate to zero,
+                 * otherwise just do a bisection.
+                 */
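+                /* The interpolation is the secant estimate of the zero of the
+                 * line derivative: solving gpa + (b-a)*(gpc-gpa)/(c-a) = 0
+                 * gives b = a + gpa*(a-c)/(gpc-gpa).
+                 */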
+                if (gpa < 0 && gpc > 0)
+                {
+                    b = a + gpa*(a-c)/(gpc-gpa);
+                }
+                else
+                {
+                    b = 0.5*(a+c);
+                }
+
+                /* safeguard if interpolation close to machine accuracy causes errors:
+                 * never go outside the interval
+                 */
+                if (b <= a || b >= c)
+                {
+                    b = 0.5*(a+c);
+                }
+
+                if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
+                {
+                    /* Reload the old state */
+                    em_dd_partition_system(fplog, -1, cr, top_global, inputrec,
+                                           s_min, top, mdatoms, fr, vsite, constr,
+                                           nrnb, wcycle);
+                }
+
+                /* Take a trial step to this new point - new coords in s_b */
+                do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, s_min->s.cg_p, s_b,
+                           constr, top, nrnb, wcycle, -1);
+
+                neval++;
+                /* Calculate energy for the trial step */
+                evaluate_energy(fplog, cr,
+                                top_global, s_b, top,
+                                inputrec, nrnb, wcycle, gstat,
+                                vsite, constr, fcd, graph, mdatoms, fr,
+                                mu_tot, enerd, vir, pres, -1, FALSE);
+
+                /* p does not change within a step, but since the domain decomposition
+                 * might change, we have to use cg_p of s_b here.
+                 */
+                p   = s_b->s.cg_p;
+                sf  = s_b->f;
+                gpb = 0;
+                for (i = 0; i < mdatoms->homenr; i++)
+                {
+                    for (m = 0; m < DIM; m++)
+                    {
+                        gpb -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */
+                    }
+                }
+                /* Sum the gradient along the line across CPUs */
+                if (PAR(cr))
+                {
+                    gmx_sumd(1, &gpb, cr);
+                }
+
+                if (debug)
+                {
+                    fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n",
+                            s_a->epot, s_b->epot, s_c->epot, gpb);
+                }
+
+                epot_repl = s_b->epot;
+
+                /* Keep one of the intervals based on the value of the derivative at the new point */
+                if (gpb > 0)
+                {
+                    /* Replace c endpoint with b */
+                    swap_em_state(s_b, s_c);
+                    c   = b;
+                    gpc = gpb;
+                }
+                else
+                {
+                    /* Replace a endpoint with b */
+                    swap_em_state(s_b, s_a);
+                    a   = b;
+                    gpa = gpb;
+                }
+
+                /*
+                 * Stop search as soon as we find a value smaller than the endpoints.
+                 * Never run more than 20 steps, no matter what.
+                 */
+                nminstep++;
+            }
+            while ((epot_repl > s_a->epot || epot_repl > s_c->epot) &&
+                   (nminstep < 20));
+
+            if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS ||
+                nminstep >= 20)
+            {
+                /* OK. We couldn't find a significantly lower energy.
+                 * If beta==0 this was steepest descent, and then we give up.
+                 * If not, set beta=0 and restart with steepest descent before quitting.
+                 */
+                if (beta == 0.0)
+                {
+                    /* Converged */
+                    converged = TRUE;
+                    break;
+                }
+                else
+                {
+                    /* Reset memory before giving up */
+                    beta = 0.0;
+                    continue;
+                }
+            }
+
+            /* Select min energy state of A & C, put the best in B.
+             */
+            if (s_c->epot < s_a->epot)
+            {
+                if (debug)
+                {
+                    fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n",
+                            s_c->epot, s_a->epot);
+                }
+                swap_em_state(s_b, s_c);
+                gpb = gpc;
+            }
+            else
+            {
+                if (debug)
+                {
+                    fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n",
+                            s_a->epot, s_c->epot);
+                }
+                swap_em_state(s_b, s_a);
+                gpb = gpa;
+            }
+
+        }
+        else
+        {
+            if (debug)
+            {
+                fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n",
+                        s_c->epot);
+            }
+            swap_em_state(s_b, s_c);
+            gpb = gpc;
+        }
+
+        /* new search direction */
+        /* beta = 0 means forget all memory and restart with steepest descents. */
+        if (nstcg && ((step % nstcg) == 0))
+        {
+            beta = 0.0;
+        }
+        else
+        {
+            /* s_min->fnorm cannot be zero, because then we would have converged
+             * and broken out.
+             */
+
+            /* Polak-Ribiere update.
+             * Change to fnorm2/fnorm2_old for Fletcher-Reeves
+             */
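+            /* i.e. beta_PR = f_b.(f_b - f_min)/|f_min|^2 as computed by pr_beta(),
+             * whereas Fletcher-Reeves would use beta_FR = |f_b|^2/|f_min|^2.
+             */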
+            beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b);
+        }
+        /* Limit beta to prevent oscillations */
+        if (fabs(beta) > 5.0)
+        {
+            beta = 0.0;
+        }
+
+
+        /* update positions */
+        swap_em_state(s_min, s_b);
+        gpa = gpb;
+
+        /* Print it if necessary */
+        if (MASTER(cr))
+        {
+            if (bVerbose)
+            {
+                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
+                        step, s_min->epot, s_min->fnorm/sqrtNumAtoms,
+                        s_min->fmax, s_min->a_fmax+1);
+                fflush(stderr);
+            }
+            /* Store the new (lower) energies */
+            upd_mdebin(mdebin, FALSE, FALSE, (double)step,
+                       mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box,
+                       NULL, NULL, vir, pres, NULL, mu_tot, constr);
+
+            do_log = do_per_step(step, inputrec->nstlog);
+            do_ene = do_per_step(step, inputrec->nstenergy);
+
+            /* Prepare IMD energy record, if bIMD is TRUE. */
+            IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE);
+
+            if (do_log)
+            {
+                print_ebin_header(fplog, step, step);
+            }
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
+                       do_log ? fplog : NULL, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+        }
+
+        /* Send energies and positions to the IMD client if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        /* Stop when the maximum force lies below tolerance.
+         * If we have reached machine precision, converged is already set to true.
+         */
+        converged = converged || (s_min->fmax < inputrec->em_tol);
+
+    }   /* End of the loop */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (s_min->fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE);
+            warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    if (MASTER(cr))
+    {
+        /* If we printed energy and/or logfile last step (which was the last step)
+         * we don't have to do it again, but otherwise print the final values.
+         */
+        if (!do_log)
+        {
+            /* Write final value to log since we didn't do anything the last step */
+            print_ebin_header(fplog, step, step);
+        }
+        if (!do_ene || !do_log)
+        {
+            /* Write final energy file entries */
+            print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                       !do_log ? fplog : NULL, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+        }
+    }
+
+    /* Print some stuff... */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout));
+
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  s_min, state_global);
+
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fnormn = s_min->fnorm/sqrtNumAtoms;
+        print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps,
+                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
+        print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps,
+                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+
+    return 0;
+}   /* That's all folks */
+
+
+/*! \brief Do L-BFGS minimization
+    \copydoc integrator_t (FILE *fplog, t_commrec *cr,
+                           int nfile, const t_filenm fnm[],
+                           const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                           int nstglobalcomm,
+                           gmx_vsite_t *vsite, gmx_constr_t constr,
+                           int stepout,
+                           t_inputrec *inputrec,
+                           gmx_mtop_t *top_global, t_fcdata *fcd,
+                           t_state *state_global,
+                           t_mdatoms *mdatoms,
+                           t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                           gmx_edsam_t ed,
+                           t_forcerec *fr,
+                           int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                           real cpt_period, real max_hours,
+                           int imdport,
+                           unsigned long Flags,
+                           gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_lbfgs(FILE *fplog, t_commrec *cr,
+                int nfile, const t_filenm fnm[],
+                const gmx_output_env_t gmx_unused *oenv, gmx_bool bVerbose,
+                int gmx_unused nstglobalcomm,
+                gmx_vsite_t *vsite, gmx_constr_t constr,
+                int gmx_unused stepout,
+                t_inputrec *inputrec,
+                gmx_mtop_t *top_global, t_fcdata *fcd,
+                t_state *state_global,
+                t_mdatoms *mdatoms,
+                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                gmx_edsam_t gmx_unused ed,
+                t_forcerec *fr,
+                int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
+                real gmx_unused cpt_period, real gmx_unused max_hours,
+                int imdport,
+                unsigned long gmx_unused Flags,
+                gmx_walltime_accounting_t walltime_accounting)
+{
+    static const char *LBFGS = "Low-Memory BFGS Minimizer";
+    em_state_t         ems;
+    gmx_localtop_t    *top;
+    gmx_enerdata_t    *enerd;
+    rvec              *f;
+    gmx_global_stat_t  gstat;
+    t_graph           *graph;
+    int                ncorr, nmaxcorr, point, cp, neval, nminstep;
+    double             stepsize, step_taken, gpa, gpb, gpc, tmp, minstep;
+    real              *rho, *alpha, *ff, *xx, *p, *s, *lastx, *lastf, **dx, **dg;
+    real              *xa, *xb, *xc, *fa, *fb, *fc, *xtmp, *ftmp;
+    real               a, b, c, maxdelta, delta;
+    real               diag, Epot0, Epot, EpotA, EpotB, EpotC;
+    real               dgdx, dgdg, sq, yr, beta;
+    t_mdebin          *mdebin;
+    gmx_bool           converged;
+    rvec               mu_tot;
+    real               fnorm, fmax;
+    gmx_bool           do_log, do_ene, do_x, do_f, foundlower, *frozen;
+    tensor             vir, pres;
+    int                start, end, number_steps;
+    gmx_mdoutf_t       outf;
+    int                i, k, m, n, nfmax, gf, step;
+    int                mdof_flags;
+
+    if (PAR(cr))
+    {
+        gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n");
+    }
+
+    if (NULL != constr)
+    {
+        gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent).");
+    }
+
+    n        = 3*state_global->natoms;
+    nmaxcorr = inputrec->nbfgscorr;
+
+    /* Allocate memory */
+    /* Use pointers to real so we don't have to loop over both atoms and
+     * dimensions all the time...
+     * x/f are allocated as rvec *, so make new xx/ff pointers-to-real
+     * that point to the same memory.
+     */
+    snew(xa, n);
+    snew(xb, n);
+    snew(xc, n);
+    snew(fa, n);
+    snew(fb, n);
+    snew(fc, n);
+    snew(frozen, n);
+
+    snew(p, n);
+    snew(lastx, n);
+    snew(lastf, n);
+    snew(rho, nmaxcorr);
+    snew(alpha, nmaxcorr);
+
+    snew(dx, nmaxcorr);
+    for (i = 0; i < nmaxcorr; i++)
+    {
+        snew(dx[i], n);
+    }
+
+    snew(dg, nmaxcorr);
+    for (i = 0; i < nmaxcorr; i++)
+    {
+        snew(dg[i], n);
+    }
+
+    step  = 0;
+    neval = 0;
+
+    /* Init em */
+    init_em(fplog, LBFGS, cr, inputrec,
+            state_global, top_global, &ems, &top, &f,
+            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
+    /* Do_lbfgs is not completely updated like do_steep and do_cg,
+     * so we free some memory again.
+     */
+    sfree(ems.s.x);
+    sfree(ems.f);
+
+    xx = (real *)state_global->x;
+    ff = (real *)f;
+
+    start = 0;
+    end   = mdatoms->homenr;
+
+    /* Print to log file */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS);
+
+    do_log = do_ene = do_x = do_f = TRUE;
+
+    /* Max number of steps */
+    number_steps = inputrec->nsteps;
+
+    /* Create a 3*natoms index to tell whether each degree of freedom is frozen */
+    gf = 0;
+    for (i = start; i < end; i++)
+    {
+        if (mdatoms->cFREEZE)
+        {
+            gf = mdatoms->cFREEZE[i];
+        }
+        for (m = 0; m < DIM; m++)
+        {
+            frozen[3*i+m] = inputrec->opts.nFreeze[gf][m];
+        }
+    }
+    if (MASTER(cr))
+    {
+        sp_header(stderr, LBFGS, inputrec->em_tol, number_steps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, LBFGS, inputrec->em_tol, number_steps);
+    }
+
+    if (vsite)
+    {
+        construct_vsites(vsite, state_global->x, 1, NULL,
+                         top->idef.iparams, top->idef.il,
+                         fr->ePBC, fr->bMolPBC, cr, state_global->box);
+    }
+
+    /* Call the force routine and some auxiliary (neighboursearching etc.) */
+    /* do_force always puts the charge groups in the box and shifts again
+     * We do not unshift, so molecules are always whole
+     */
+    neval++;
+    ems.s.x = state_global->x;
+    ems.f   = f;
+    evaluate_energy(fplog, cr,
+                    top_global, &ems, top,
+                    inputrec, nrnb, wcycle, gstat,
+                    vsite, constr, fcd, graph, mdatoms, fr,
+                    mu_tot, enerd, vir, pres, -1, TRUE);
+    where();
+
+    if (MASTER(cr))
+    {
+        /* Copy stuff to the energy bin for easy printing etc. */
+        upd_mdebin(mdebin, FALSE, FALSE, (double)step,
+                   mdatoms->tmass, enerd, state_global, inputrec->fepvals, inputrec->expandedvals, state_global->box,
+                   NULL, NULL, vir, pres, NULL, mu_tot, constr);
+
+        print_ebin_header(fplog, step, step);
+        print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+    }
+    where();
+
+    /* This is the starting energy */
+    Epot = enerd->term[F_EPOT];
+
+    fnorm = ems.fnorm;
+    fmax  = ems.fmax;
+    nfmax = ems.a_fmax;
+
+    /* Set the initial step.
+     * since it will be multiplied by the non-normalized search direction
+     * vector (force vector the first time), we scale it by the
+     * norm of the force.
+     */
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr);
+        fprintf(stderr, "   F-max             = %12.5e on atom %d\n", fmax, nfmax+1);
+        fprintf(stderr, "   F-Norm            = %12.5e\n", fnorm/sqrtNumAtoms);
+        fprintf(stderr, "\n");
+        /* and copy to the log file too... */
+        fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr);
+        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", fmax, nfmax+1);
+        fprintf(fplog, "   F-Norm            = %12.5e\n", fnorm/sqrtNumAtoms);
+        fprintf(fplog, "\n");
+    }
+
+    // Point is an index to the memory of search directions, where 0 is the first one.
+    point = 0;
+
+    // Set initial search direction to the force (-gradient), or 0 for frozen particles.
+    for (i = 0; i < n; i++)
+    {
+        if (!frozen[i])
+        {
+            dx[point][i] = ff[i]; /* Initial search direction */
+        }
+        else
+        {
+            dx[point][i] = 0;
+        }
+    }
+
+    // Stepsize will be modified during the search, and actually it is not critical
+    // (the main efficiency in the algorithm comes from changing directions), but
+    // we still need an initial value, so estimate it as the inverse of the norm
+    // so we take small steps where the potential fluctuates a lot.
+    stepsize  = 1.0/fnorm;
+
+    /* Start the loop over BFGS steps.
+     * Each successful step is counted, and we continue until
+     * we either converge or reach the max number of steps.
+     */
+
+    ncorr = 0;
+
+    /* Set the gradient from the force */
+    converged = FALSE;
+    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
+    {
+
+        /* Write coordinates if necessary */
+        do_x = do_per_step(step, inputrec->nstxout);
+        do_f = do_per_step(step, inputrec->nstfout);
+
+        mdof_flags = 0;
+        if (do_x)
+        {
+            mdof_flags |= MDOF_X;
+        }
+
+        if (do_f)
+        {
+            mdof_flags |= MDOF_F;
+        }
+
+        if (inputrec->bIMD)
+        {
+            mdof_flags |= MDOF_IMD;
+        }
+
+        mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags,
+                                         top_global, step, (real)step, state_global, state_global, f);
+
+        /* Do the linesearching in the direction dx[point][0..(n-1)] */
+
+        /* make s a pointer to current search direction - point=0 first time we get here */
+        s = dx[point];
+
+        // calculate line gradient in position A
+        for (gpa = 0, i = 0; i < n; i++)
+        {
+            gpa -= s[i]*ff[i];
+        }
+
+        /* Calculate minimum allowed stepsize along the line, before the average (norm)
+         * relative change in coordinate is smaller than precision
+         */
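+        /* Same criterion as in do_cg: the loop accumulates sum_i (s_i/max(|x_i|,1))^2,
+         * and GMX_REAL_EPS/sqrt(minstep/n) below is the step at which the RMS
+         * relative displacement falls to machine precision.
+         */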
+        for (minstep = 0, i = 0; i < n; i++)
+        {
+            tmp = fabs(xx[i]);
+            if (tmp < 1.0)
+            {
+                tmp = 1.0;
+            }
+            tmp      = s[i]/tmp;
+            minstep += tmp*tmp;
+        }
+        minstep = GMX_REAL_EPS/sqrt(minstep/n);
+
+        if (stepsize < minstep)
+        {
+            converged = TRUE;
+            break;
+        }
+
+        // Before taking any steps along the line, store the old position
+        for (i = 0; i < n; i++)
+        {
+            lastx[i] = xx[i];
+            lastf[i] = ff[i];
+        }
+        Epot0 = Epot;
+
+        for (i = 0; i < n; i++)
+        {
+            xa[i] = xx[i];
+        }
+
+        /* Take a step downhill.
+         * In theory, we should find the actual minimum of the function in this
+         * direction, somewhere along the line.
+         * That is quite possible, but it turns out to take 5-10 function evaluations
+         * for each line. However, we don't really need to find the exact minimum -
+         * it is much better to start a new BFGS step in a modified direction as soon
+         * as we are close to it. This will save a lot of energy evaluations.
+         *
+         * In practice, we just try to take a single step.
+         * If it worked (i.e. lowered the energy), we increase the stepsize but
+         * continue straight to the next BFGS step without trying to find any minimum,
+         * i.e. we change the search direction too. If the line was smooth, it is
+         * likely we are in a smooth region, and then it makes sense to take longer
+         * steps in the modified search direction too.
+         *
+         * If it didn't work (higher energy), there must be a minimum somewhere between
+         * the old position and the new one. Then we need to start by finding a lower
+         * value before we change search direction. Since the energy was apparently
+         * quite rough, we need to decrease the step size.
+         *
+         * Due to the finite numerical accuracy, it turns out that it is a good idea
+         * to accept a SMALL increase in energy, if the derivative is still downhill.
+         * This leads to lower final energies in the tests I've done. / Erik
+         */
+
+        // State "A" is the first position along the line.
+        // reference position along line is initially zero
+        EpotA      = Epot0;
+        a          = 0.0;
+
+        // Check stepsize first. We do not allow displacements
+        // larger than emstep.
+        //
+        do
+        {
+            // Pick a new position C by adding stepsize to A.
+            c        = a + stepsize;
+
+            // Calculate what the largest change in any individual coordinate
+            // would be (translation along line * gradient along line)
+            maxdelta = 0;
+            for (i = 0; i < n; i++)
+            {
+                delta = c*s[i];
+                if (delta > maxdelta)
+                {
+                    maxdelta = delta;
+                }
+            }
+            // If any displacement is larger than the stepsize limit, reduce the step
+            if (maxdelta > inputrec->em_stepsize)
+            {
+                stepsize *= 0.1;
+            }
+        }
+        while (maxdelta > inputrec->em_stepsize);
+
+        // Take a trial step and move the coordinate array xc[] to position C
+        for (i = 0; i < n; i++)
+        {
+            xc[i] = lastx[i] + c*s[i];
+        }
+
+        neval++;
+        // Calculate energy for the trial step in position C
+        ems.s.x = (rvec *)xc;
+        ems.f   = (rvec *)fc;
+        evaluate_energy(fplog, cr,
+                        top_global, &ems, top,
+                        inputrec, nrnb, wcycle, gstat,
+                        vsite, constr, fcd, graph, mdatoms, fr,
+                        mu_tot, enerd, vir, pres, step, FALSE);
+        EpotC = ems.epot;
+
+        // Calc line gradient in position C
+        for (gpc = 0, i = 0; i < n; i++)
+        {
+            gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */
+        }
+        /* Sum the gradient along the line across CPUs */
+        if (PAR(cr))
+        {
+            gmx_sumd(1, &gpc, cr);
+        }
+
+        // This is the max amount of increase in energy we tolerate.
+        // By allowing VERY small changes (close to numerical precision) we
+        // frequently find even better (lower) final energies.
+        tmp = sqrt(GMX_REAL_EPS)*fabs(EpotA);
+
+        // Accept the step if the energy is lower in the new position C (compared to A),
+        // or if it is not significantly higher and the line derivative is still negative.
+        if (EpotC < EpotA || (gpc < 0 && EpotC < (EpotA+tmp)))
+        {
+            // Great, we found a better energy. We no longer try to alter the
+            // stepsize, but simply accept this new better position. Then we select a new
+            // search direction instead, which will be much more efficient than continuing
+            // to take smaller steps along a line. Set fnorm based on the new C position,
+            // which will be used to update the stepsize to 1/fnorm further down.
+            foundlower = TRUE;
+            fnorm      = ems.fnorm;
+        }
+        else
+        {
+            // If we got here, the energy is NOT lower in point C, i.e. it will be the same
+            // or higher than in point A. In this case it is pointless to move to point C,
+            // so we will have to do more iterations along the same line to find a smaller
+            // value in the interval [A=0.0,C].
+            // Here, A is still 0.0, but that will change when we do a search in the interval
+            // [0.0,C] below. That search we will do by interpolation or bisection rather
+            // than with the stepsize, so no need to modify it. For the next search direction
+            // it will be reset to 1/fnorm anyway.
+            foundlower = FALSE;
+        }
+
+        if (!foundlower)
+        {
+            // OK, if we didn't find a lower value we will have to locate one now - there must
+            // be one in the interval [a,c].
+            // The same thing is valid here, though: Don't spend dozens of iterations to find
+            // the line minimum. We try to interpolate based on the derivative at the endpoints,
+            // and only continue until we find a lower value. In most cases this means 1-2 iterations.
+            // I also have a safeguard for potentially really pathological functions so we never
+            // take more than 20 steps before we give up.
+            // If we already found a lower value we just skip this step and continue to the update.
+            nminstep = 0;
+            do
+            {
+                // Select a new trial point B in the interval [A,C].
+                // If the derivatives at points a & c have different sign we interpolate to zero,
+                // otherwise just do a bisection since there might be multiple minima/maxima
+                // inside the interval.
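+                // As in the CG line search above, the interpolation is the secant
+                // estimate of the zero of the line derivative: b = a + gpa*(a-c)/(gpc-gpa).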
+                if (gpa < 0 && gpc > 0)
+                {
+                    b = a + gpa*(a-c)/(gpc-gpa);
+                }
+                else
+                {
+                    b = 0.5*(a+c);
+                }
+
+                /* safeguard if interpolation close to machine accuracy causes errors:
+                 * never go outside the interval
+                 */
+                if (b <= a || b >= c)
+                {
+                    b = 0.5*(a+c);
+                }
+
+                // Take a trial step to point B
+                for (i = 0; i < n; i++)
+                {
+                    xb[i] = lastx[i] + b*s[i];
+                }
+
+                neval++;
+                // Calculate energy for the trial step in point B
+                ems.s.x = (rvec *)xb;
+                ems.f   = (rvec *)fb;
+                evaluate_energy(fplog, cr,
+                                top_global, &ems, top,
+                                inputrec, nrnb, wcycle, gstat,
+                                vsite, constr, fcd, graph, mdatoms, fr,
+                                mu_tot, enerd, vir, pres, step, FALSE);
+                EpotB = ems.epot;
+                fnorm = ems.fnorm;
+
+                // Calculate gradient in point B
+                for (gpb = 0, i = 0; i < n; i++)
+                {
+                    gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */
+
+                }
+                /* Sum the gradient along the line across CPUs */
+                if (PAR(cr))
+                {
+                    gmx_sumd(1, &gpb, cr);
+                }
+
+                // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative
+                // at the new point B, and rename the endpoints of this new interval A and C.
+                if (gpb > 0)
+                {
+                    /* Replace c endpoint with b */
+                    EpotC = EpotB;
+                    c     = b;
+                    gpc   = gpb;
+                    /* swap coord pointers b/c */
+                    xtmp = xb;
+                    ftmp = fb;
+                    xb   = xc;
+                    fb   = fc;
+                    xc   = xtmp;
+                    fc   = ftmp;
+                }
+                else
+                {
+                    /* Replace a endpoint with b */
+                    EpotA = EpotB;
+                    a     = b;
+                    gpa   = gpb;
+                    /* swap coord pointers a/b */
+                    xtmp = xb;
+                    ftmp = fb;
+                    xb   = xa;
+                    fb   = fa;
+                    xa   = xtmp;
+                    fa   = ftmp;
+                }
+
+                /*
+                 * Stop search as soon as we find a value smaller than the endpoints,
+                 * or if the tolerance is below machine precision.
+                 * Never run more than 20 steps, no matter what.
+                 */
+                nminstep++;
+            }
+            while ((EpotB > EpotA || EpotB > EpotC) && (nminstep < 20));
+
+            if (fabs(EpotB-Epot0) < GMX_REAL_EPS || nminstep >= 20)
+            {
+                /* OK. We couldn't find a significantly lower energy.
+                 * If ncorr==0 this was steepest descent, and then we give up.
+                 * If not, reset memory to restart as steepest descent before quitting.
+                 */
+                if (ncorr == 0)
+                {
+                    /* Converged */
+                    converged = TRUE;
+                    break;
+                }
+                else
+                {
+                    /* Reset memory */
+                    ncorr = 0;
+                    /* Search in gradient direction */
+                    for (i = 0; i < n; i++)
+                    {
+                        dx[point][i] = ff[i];
+                    }
+                    /* Reset stepsize */
+                    stepsize = 1.0/fnorm;
+                    continue;
+                }
+            }
+
+            /* Select min energy state of A & C, put the best in xx/ff/Epot
+             */
+            if (EpotC < EpotA)
+            {
+                Epot = EpotC;
+                /* Use state C */
+                for (i = 0; i < n; i++)
+                {
+                    xx[i] = xc[i];
+                    ff[i] = fc[i];
+                }
+                step_taken = c;
+            }
+            else
+            {
+                Epot = EpotA;
+                /* Use state A */
+                for (i = 0; i < n; i++)
+                {
+                    xx[i] = xa[i];
+                    ff[i] = fa[i];
+                }
+                step_taken = a;
+            }
+
+        }
+        else
+        {
+            /* found lower */
+            Epot = EpotC;
+            /* Use state C */
+            for (i = 0; i < n; i++)
+            {
+                xx[i] = xc[i];
+                ff[i] = fc[i];
+            }
+            step_taken = c;
+        }
+
+        /* Update the memory information, and calculate a new
+         * approximation of the inverse hessian
+         */
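+        /* In standard L-BFGS notation: dx[point] becomes s_k = x_new - x_old,
+         * dg[point] = lastf - ff equals y_k = grad E_new - grad E_old (f = -grad E),
+         * rho[point] = 1/(y_k . s_k), and diag = (y_k . s_k)/(y_k . y_k) scales
+         * the initial inverse-Hessian estimate.
+         */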
+
+        /* Have new data in Epot, xx, ff */
+        if (ncorr < nmaxcorr)
+        {
+            ncorr++;
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            dg[point][i]  = lastf[i]-ff[i];
+            dx[point][i] *= step_taken;
+        }
+
+        dgdg = 0;
+        dgdx = 0;
+        for (i = 0; i < n; i++)
+        {
+            dgdg += dg[point][i]*dg[point][i];
+            dgdx += dg[point][i]*dx[point][i];
+        }
+
+        diag = dgdx/dgdg;
+
+        rho[point] = 1.0/dgdx;
+        point++;
+
+        if (point >= nmaxcorr)
+        {
+            point = 0;
+        }
+
+        /* Update */
+        for (i = 0; i < n; i++)
+        {
+            p[i] = ff[i];
+        }
+
+        cp = point;
+
+        /* Recursive update. First go back over the memory points */
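+        /* Standard two-loop recursion: the backward loop computes
+         * alpha_i = rho_i*(s_i . q) and subtracts alpha_i*y_i from q, then q is
+         * scaled by diag, and the forward loop adds (alpha_i - rho_i*(y_i . q))*s_i.
+         * Since p starts from the force (-grad E), the result approximates
+         * -(inverse Hessian)*grad E, a descent direction.
+         */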
+        for (k = 0; k < ncorr; k++)
+        {
+            cp--;
+            if (cp < 0)
+            {
+                cp = ncorr-1;
+            }
+
+            sq = 0;
+            for (i = 0; i < n; i++)
+            {
+                sq += dx[cp][i]*p[i];
+            }
+
+            alpha[cp] = rho[cp]*sq;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] -= alpha[cp]*dg[cp][i];
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            p[i] *= diag;
+        }
+
+        /* And then go forward again */
+        for (k = 0; k < ncorr; k++)
+        {
+            yr = 0;
+            for (i = 0; i < n; i++)
+            {
+                yr += p[i]*dg[cp][i];
+            }
+
+            beta = rho[cp]*yr;
+            beta = alpha[cp]-beta;
+
+            for (i = 0; i < n; i++)
+            {
+                p[i] += beta*dx[cp][i];
+            }
+
+            cp++;
+            if (cp >= ncorr)
+            {
+                cp = 0;
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+            if (!frozen[i])
+            {
+                dx[point][i] = p[i];
+            }
+            else
+            {
+                dx[point][i] = 0;
+            }
+        }
+
+        /* Test whether the convergence criterion is met */
+        get_f_norm_max(cr, &(inputrec->opts), mdatoms, f, &fnorm, &fmax, &nfmax);
+
+        /* Print it if necessary */
+        if (MASTER(cr))
+        {
+            if (bVerbose)
+            {
+                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
+                        step, Epot, fnorm/sqrtNumAtoms, fmax, nfmax+1);
+                fflush(stderr);
+            }
+            /* Store the new (lower) energies */
+            upd_mdebin(mdebin, FALSE, FALSE, (double)step,
+                       mdatoms->tmass, enerd, state_global, inputrec->fepvals, inputrec->expandedvals, state_global->box,
+                       NULL, NULL, vir, pres, NULL, mu_tot, constr);
+            do_log = do_per_step(step, inputrec->nstlog);
+            do_ene = do_per_step(step, inputrec->nstenergy);
+            if (do_log)
+            {
+                print_ebin_header(fplog, step, step);
+            }
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
+                       do_log ? fplog : NULL, step, step, eprNORMAL,
+                       mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+        }
+
+        /* Send x and E to IMD client, if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        // Reset stepsize in case we are doing more iterations
+        stepsize = 1.0/fnorm;
+
+        /* Stop when the maximum force lies below tolerance.
+         * If we have reached machine precision, converged is already set to true.
+         */
+        converged = converged || (fmax < inputrec->em_tol);
+
+    }   /* End of the loop */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    if (converged)
+    {
+        step--; /* we never took that last step in this case */
+
+    }
+    if (fmax > inputrec->em_tol)
+    {
+        if (MASTER(cr))
+        {
+            warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE);
+            warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE);
+        }
+        converged = FALSE;
+    }
+
+    /* If we printed energy and/or logfile last step (which was the last step)
+     * we don't have to do it again, but otherwise print the final values.
+     */
+    if (!do_log) /* Write final value to log since we didn't do anything last step */
+    {
+        print_ebin_header(fplog, step, step);
+    }
+    if (!do_ene || !do_log) /* Write final energy file entries */
+    {
+        print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
+                   !do_log ? fplog : NULL, step, step, eprNORMAL,
+                   mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+    }
+
+    /* Tell the user that the lowest-energy coordinates will be written */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+
+    /* IMPORTANT!
+     * For accurate normal mode calculation it is imperative that we
+     * store the last conformation into the full precision binary trajectory.
+     *
+     * However, we should only do it if we did NOT already write this step
+     * above (which we did if do_x or do_f was true).
+     */
+    do_x = !do_per_step(step, inputrec->nstxout);
+    do_f = !do_per_step(step, inputrec->nstfout);
+    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, step,
+                  &ems, state_global);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, Epot, fmax, nfmax, fnorm/sqrtNumAtoms);
+        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged,
+                        number_steps, Epot, fmax, nfmax, fnorm/sqrtNumAtoms);
+
+        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
+
+    return 0;
+}   /* That's all folks */
+
+/*! \brief Do steepest descents minimization
+    \copydoc integrator_t (FILE *fplog, t_commrec *cr,
+                           int nfile, const t_filenm fnm[],
+                           const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                           int nstglobalcomm,
+                           gmx_vsite_t *vsite, gmx_constr_t constr,
+                           int stepout,
+                           t_inputrec *inputrec,
+                           gmx_mtop_t *top_global, t_fcdata *fcd,
+                           t_state *state_global,
+                           t_mdatoms *mdatoms,
+                           t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                           gmx_edsam_t ed,
+                           t_forcerec *fr,
+                           int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                           real cpt_period, real max_hours,
+                           int imdport,
+                           unsigned long Flags,
+                           gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_steep(FILE *fplog, t_commrec *cr,
+                int nfile, const t_filenm fnm[],
+                const gmx_output_env_t gmx_unused *oenv, gmx_bool bVerbose,
+                int gmx_unused nstglobalcomm,
+                gmx_vsite_t *vsite, gmx_constr_t constr,
+                int gmx_unused stepout,
+                t_inputrec *inputrec,
+                gmx_mtop_t *top_global, t_fcdata *fcd,
+                t_state *state_global,
+                t_mdatoms *mdatoms,
+                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                gmx_edsam_t gmx_unused  ed,
+                t_forcerec *fr,
+                int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
+                real gmx_unused cpt_period, real gmx_unused max_hours,
+                int imdport,
+                unsigned long gmx_unused Flags,
+                gmx_walltime_accounting_t walltime_accounting)
+{
+    const char       *SD = "Steepest Descents";
+    em_state_t       *s_min, *s_try;
+    gmx_localtop_t   *top;
+    gmx_enerdata_t   *enerd;
+    rvec             *f;
+    gmx_global_stat_t gstat;
+    t_graph          *graph;
+    real              stepsize;
+    real              ustep, fnormn;
+    gmx_mdoutf_t      outf;
+    t_mdebin         *mdebin;
+    gmx_bool          bDone, bAbort, do_x, do_f;
+    tensor            vir, pres;
+    rvec              mu_tot;
+    int               nsteps;
+    int               count          = 0;
+    int               steps_accepted = 0;
+
+    s_min = init_em_state();
+    s_try = init_em_state();
+
+    /* Init em and store the local state in s_try */
+    init_em(fplog, SD, cr, inputrec,
+            state_global, top_global, s_try, &top, &f,
+            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
+
+    /* Print to log file  */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, SD);
+
+    /* Set variables for stepsize (in nm). This is the largest
+     * step that we are going to make in any direction.
+     */
+    ustep    = inputrec->em_stepsize;
+    stepsize = 0;
+
+    /* Max number of steps  */
+    nsteps = inputrec->nsteps;
+
+    if (MASTER(cr))
+    {
+        /* Print to the screen  */
+        sp_header(stderr, SD, inputrec->em_tol, nsteps);
+    }
+    if (fplog)
+    {
+        sp_header(fplog, SD, inputrec->em_tol, nsteps);
+    }
+
+    /**** HERE STARTS THE LOOP ****
+     * count is the counter for the number of steps
+     * bDone will be TRUE when the minimization has converged
+     * bAbort will be TRUE when nsteps steps have been performed or when
+     * the stepsize becomes smaller than is reasonable for machine precision
+     */
+    count  = 0;
+    bDone  = FALSE;
+    bAbort = FALSE;
+    while (!bDone && !bAbort)
+    {
+        bAbort = (nsteps >= 0) && (count == nsteps);
+
+        /* set new coordinates, except for first step */
+        if (count > 0)
+        {
+            do_em_step(cr, inputrec, mdatoms, fr->bMolPBC,
+                       s_min, stepsize, s_min->f, s_try,
+                       constr, top, nrnb, wcycle, count);
+        }
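+        /* do_em_step generated the trial state s_try by displacing the atoms
+         * of s_min along the previous forces by stepsize (subject to freeze
+         * groups and constraints), so the most strongly forced atom moves by
+         * at most ustep, since stepsize = ustep/s_min->fmax (set below).
+         */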
+
+        evaluate_energy(fplog, cr,
+                        top_global, s_try, top,
+                        inputrec, nrnb, wcycle, gstat,
+                        vsite, constr, fcd, graph, mdatoms, fr,
+                        mu_tot, enerd, vir, pres, count, count == 0);
+
+        if (MASTER(cr))
+        {
+            print_ebin_header(fplog, count, count);
+        }
+
+        if (count == 0)
+        {
+            s_min->epot = s_try->epot;
+        }
+
+        /* Print it if necessary  */
+        if (MASTER(cr))
+        {
+            if (bVerbose)
+            {
+                fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c",
+                        count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1,
+                        ( (count == 0) || (s_try->epot < s_min->epot) ) ? '\n' : '\r');
+                fflush(stderr);
+            }
+
+            if ( (count == 0) || (s_try->epot < s_min->epot) )
+            {
+                /* Store the new (lower) energies  */
+                upd_mdebin(mdebin, FALSE, FALSE, (double)count,
+                           mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals,
+                           s_try->s.box, NULL, NULL, vir, pres, NULL, mu_tot, constr);
+
+                /* Prepare IMD energy record, if bIMD is TRUE. */
+                IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE);
+
+                print_ebin(mdoutf_get_fp_ene(outf), TRUE,
+                           do_per_step(steps_accepted, inputrec->nstdisreout),
+                           do_per_step(steps_accepted, inputrec->nstorireout),
+                           fplog, count, count, eprNORMAL,
+                           mdebin, fcd, &(top_global->groups), &(inputrec->opts));
+                fflush(fplog);
+            }
+        }
+
+        /* Accept the step if the new energy is lower than the previous one,
+         * or if this is the first step.
+         */
+
+        if ( (count == 0) || (s_try->epot < s_min->epot) )
+        {
+            steps_accepted++;
+
+            /* Test whether the convergence criterion is met...  */
+            bDone = (s_try->fmax < inputrec->em_tol);
+
+            /* Copy the arrays for force, positions and energy  */
+            /* The 'Min' array always holds the coords and forces of the minimal
+               sampled energy  */
+            swap_em_state(s_min, s_try);
+            if (count > 0)
+            {
+                ustep *= 1.2;
+            }
+
+            /* Write to trn, if necessary */
+            do_x = do_per_step(steps_accepted, inputrec->nstxout);
+            do_f = do_per_step(steps_accepted, inputrec->nstfout);
+            write_em_traj(fplog, cr, outf, do_x, do_f, NULL,
+                          top_global, inputrec, count,
+                          s_min, state_global);
+        }
+        else
+        {
+            /* If energy is not smaller make the step smaller...  */
+            ustep *= 0.5;
+
+            if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
+            {
+                /* Reload the old state */
+                em_dd_partition_system(fplog, count, cr, top_global, inputrec,
+                                       s_min, top, mdatoms, fr, vsite, constr,
+                                       nrnb, wcycle);
+            }
+        }
+
+        /* Determine new step  */
+        stepsize = ustep/s_min->fmax;
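+        /* Scaling by 1/fmax means the next trial step moves the most strongly
+         * forced atom by exactly ustep (in nm), whatever the force magnitude.
+         */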
+
+        /* Check if stepsize is too small, with 1 nm as a characteristic length */
+#if GMX_DOUBLE
+        if (count == nsteps || ustep < 1e-12)
+#else
+        if (count == nsteps || ustep < 1e-6)
+#endif
+        {
+            if (MASTER(cr))
+            {
+                warn_step(stderr, inputrec->em_tol, count == nsteps, constr != NULL);
+                warn_step(fplog, inputrec->em_tol, count == nsteps, constr != NULL);
+            }
+            bAbort = TRUE;
+        }
+
+        /* Send IMD energies and positions, if bIMD is TRUE. */
+        if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr))
+        {
+            IMD_send_positions(inputrec->imd);
+        }
+
+        count++;
+    }   /* End of the loop  */
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(inputrec->bIMD, inputrec->imd);
+
+    /* Tell the user that the lowest-energy coordinates will be written */
+    if (MASTER(cr))
+    {
+        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
+    }
+    write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm),
+                  top_global, inputrec, count,
+                  s_min, state_global);
+
+    if (MASTER(cr))
+    {
+        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
+        fnormn = s_min->fnorm/sqrtNumAtoms;
+
+        print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps,
+                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
+        print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps,
+                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    /* To print the actual number of steps we needed somewhere */
+    inputrec->nsteps = count;
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, count);
+
+    return 0;
+}   /* That's all folks */
+
+/*! \brief Do normal modes analysis
+    \copydoc integrator_t (FILE *fplog, t_commrec *cr,
+                           int nfile, const t_filenm fnm[],
+                           const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                           int nstglobalcomm,
+                           gmx_vsite_t *vsite, gmx_constr_t constr,
+                           int stepout,
+                           t_inputrec *inputrec,
+                           gmx_mtop_t *top_global, t_fcdata *fcd,
+                           t_state *state_global,
+                           t_mdatoms *mdatoms,
+                           t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                           gmx_edsam_t ed,
+                           t_forcerec *fr,
+                           int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                           real cpt_period, real max_hours,
+                           int imdport,
+                           unsigned long Flags,
+                           gmx_walltime_accounting_t walltime_accounting)
+ */
+double do_nm(FILE *fplog, t_commrec *cr,
+             int nfile, const t_filenm fnm[],
+             const gmx_output_env_t gmx_unused *oenv, gmx_bool bVerbose,
+             int gmx_unused nstglobalcomm,
+             gmx_vsite_t *vsite, gmx_constr_t constr,
+             int gmx_unused stepout,
+             t_inputrec *inputrec,
+             gmx_mtop_t *top_global, t_fcdata *fcd,
+             t_state *state_global,
+             t_mdatoms *mdatoms,
+             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+             gmx_edsam_t  gmx_unused ed,
+             t_forcerec *fr,
+             int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
+             real gmx_unused cpt_period, real gmx_unused max_hours,
+             int imdport,
+             unsigned long gmx_unused Flags,
+             gmx_walltime_accounting_t walltime_accounting)
+{
+    const char          *NM = "Normal Mode Analysis";
+    gmx_mdoutf_t         outf;
+    int                  nnodes, node;
+    gmx_localtop_t      *top;
+    gmx_enerdata_t      *enerd;
+    rvec                *f;
+    gmx_global_stat_t    gstat;
+    t_graph             *graph;
+    tensor               vir, pres;
+    rvec                 mu_tot;
+    rvec                *fneg, *dfdx;
+    gmx_bool             bSparse; /* use sparse matrix storage format */
+    size_t               sz;
+    gmx_sparsematrix_t * sparse_matrix           = NULL;
+    real           *     full_matrix             = NULL;
+    em_state_t       *   state_work;
+
+    /* added with respect to mdrun */
+    int                       row, col;
+    real                      der_range = 10.0*sqrt(GMX_REAL_EPS);
+    real                      x_min;
+    bool                      bIsMaster = MASTER(cr);
+
+    if (constr != NULL)
+    {
+        gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported");
+    }
+
+    state_work = init_em_state();
+
+    /* Init em and store the local state in state_minimum */
+    init_em(fplog, NM, cr, inputrec,
+            state_global, top_global, state_work, &top,
+            &f,
+            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
+            nfile, fnm, &outf, NULL, imdport, Flags, wcycle);
+
+    gmx_shellfc_t *shellfc = init_shell_flexcon(stdout,
+                                                top_global,
+                                                n_flexible_constraints(constr),
+                                                inputrec->nstcalcenergy,
+                                                DOMAINDECOMP(cr));
+
+    if (shellfc)
+    {
+        make_local_shells(cr, mdatoms, shellfc);
+    }
+    std::vector<size_t> atom_index = get_atom_index(top_global);
+    snew(fneg, atom_index.size());
+    snew(dfdx, atom_index.size());
+
+#if !GMX_DOUBLE
+    if (bIsMaster)
+    {
+        fprintf(stderr,
+                "NOTE: This version of GROMACS has been compiled in single precision,\n"
+                "      which MIGHT not be accurate enough for normal mode analysis.\n"
+                "      GROMACS now uses sparse matrix storage, so the memory requirements\n"
+                "      are fairly modest even if you recompile in double precision.\n\n");
+    }
+#endif
+
+    /* Check if we can/should use sparse storage format.
+     *
+     * Sparse format is only useful when the Hessian itself is sparse, which it
+     * will be when we use a cutoff.
+     * For small systems (n<1000) it is easier to always use full matrix format, though.
+     */
+    if (EEL_FULL(fr->eeltype) || fr->rlist == 0.0)
+    {
+        md_print_info(cr, fplog, "Non-cutoff electrostatics used, forcing full Hessian format.\n");
+        bSparse = FALSE;
+    }
+    else if (atom_index.size() < 1000)
+    {
+        md_print_info(cr, fplog, "Small system size (N=%d), using full Hessian format.\n", static_cast<int>(atom_index.size()));
+        bSparse = FALSE;
+    }
+    else
+    {
+        md_print_info(cr, fplog, "Using compressed symmetric sparse Hessian format.\n");
+        bSparse = TRUE;
+    }
+
+    /* Number of dimensions, based on real atoms, that is, excluding vsites and shells */
+    sz = DIM*atom_index.size();
+
+    fprintf(stderr, "Allocating Hessian memory...\n\n");
+
+    if (bSparse)
+    {
+        sparse_matrix = gmx_sparsematrix_init(sz);
+        sparse_matrix->compressed_symmetric = TRUE;
+    }
+    else
+    {
+        snew(full_matrix, sz*sz);
+    }
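+    /* For reference: the full matrix holds sz*sz = (3N)^2 reals, i.e. roughly
+     * 9e6 reals (~36 MB in single precision) at the N=1000 crossover used
+     * above, which is why larger cutoff-based systems use the sparse format.
+     */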
+
+    init_nrnb(nrnb);
+
+    where();
+
+    /* Write start time and temperature */
+    print_em_start(fplog, cr, walltime_accounting, wcycle, NM);
+
+    /* fudge nr of steps to nr of atoms */
+    inputrec->nsteps = atom_index.size()*2;
+
+    if (bIsMaster)
+    {
+        fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n",
+                *(top_global->name), (int)inputrec->nsteps);
+    }
+
+    nnodes = cr->nnodes;
+
+    /* Make evaluate_energy do a single node force calculation */
+    cr->nnodes = 1;
+    evaluate_energy(fplog, cr,
+                    top_global, state_work, top,
+                    inputrec, nrnb, wcycle, gstat,
+                    vsite, constr, fcd, graph, mdatoms, fr,
+                    mu_tot, enerd, vir, pres, -1, TRUE);
+    cr->nnodes = nnodes;
+
+    /* if forces are not small, warn user */
+    get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, state_work);
+
+    md_print_info(cr, fplog, "Maximum force:%12.5e\n", state_work->fmax);
+    if (state_work->fmax > 1.0e-3)
+    {
+        md_print_info(cr, fplog,
+                      "The force is probably not small enough to "
+                      "ensure that you are at a minimum.\n"
+                      "Be aware that negative eigenvalues may occur\n"
+                      "when the resulting matrix is diagonalized.\n\n");
+    }
+
+    /***********************************************************
+     *
+     *      Loop over all pairs in matrix
+     *
+     *      do_force called twice. Once with positive and
+     *      once with negative displacement
+     *
+     ************************************************************/
+
+    /* Steps are divided one by one over the nodes */
+    bool bNS = true;
+    for (unsigned int aid = cr->nodeid; aid < atom_index.size(); aid += nnodes)
+    {
+        size_t atom = atom_index[aid];
+        for (size_t d = 0; d < DIM; d++)
+        {
+            gmx_bool    bBornRadii  = FALSE;
+            gmx_int64_t step        = 0;
+            int         force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES;
+            double      t           = 0;
+
+            x_min = state_work->s.x[atom][d];
+
+            for (unsigned int dx = 0; (dx < 2); dx++)
+            {
+                if (dx == 0)
+                {
+                    state_work->s.x[atom][d] = x_min - der_range;
+                }
+                else
+                {
+                    state_work->s.x[atom][d] = x_min + der_range;
+                }
+
+                /* Make evaluate_energy do a single node force calculation */
+                cr->nnodes = 1;
+                if (shellfc)
+                {
+                    /* Now is the time to relax the shells */
+                    (void) relax_shell_flexcon(fplog, cr, bVerbose, step,
+                                               inputrec, bNS, force_flags,
+                                               top,
+                                               constr, enerd, fcd,
+                                               &state_work->s, state_work->f, vir, mdatoms,
+                                               nrnb, wcycle, graph, &top_global->groups,
+                                               shellfc, fr, bBornRadii, t, mu_tot,
+                                               vsite, NULL);
+                    bNS = false;
+                    step++;
+                }
+                else
+                {
+                    evaluate_energy(fplog, cr,
+                                    top_global, state_work, top,
+                                    inputrec, nrnb, wcycle, gstat,
+                                    vsite, constr, fcd, graph, mdatoms, fr,
+                                    mu_tot, enerd, vir, pres, atom*2+dx, FALSE);
+                }
+
+                cr->nnodes = nnodes;
+
+                if (dx == 0)
+                {
+                    for (size_t i = 0; i < atom_index.size(); i++)
+                    {
+                        copy_rvec(state_work->f[atom_index[i]], fneg[i]);
+                    }
+                }
+            }
+
+            /* x is restored to original */
+            state_work->s.x[atom][d] = x_min;
+
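+            /* Build one row block of the Hessian by central differences:
+             * d2E/(dx_row dx_col) = -dF_col/dx_row ~ -(F(x+h) - F(x-h))/(2h),
+             * with h = der_range and F(x-h) saved in fneg in the dx == 0 pass.
+             */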
+            for (size_t j = 0; j < atom_index.size(); j++)
+            {
+                for (size_t k = 0; (k < DIM); k++)
+                {
+                    dfdx[j][k] =
+                        -(state_work->f[atom_index[j]][k] - fneg[j][k])/(2*der_range);
+                }
+            }
+
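+            /* Each non-master rank sends its derivative row to the master,
+             * which stores its own row and the received ones into the
+             * (sparse or full) Hessian below.
+             */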
+            if (!bIsMaster)
+            {
+#if GMX_MPI
+#define mpi_type GMX_MPI_REAL
+                MPI_Send(dfdx[0], atom_index.size()*DIM, mpi_type, MASTER(cr),
+                         cr->nodeid, cr->mpi_comm_mygroup);
+#endif
+            }
+            else
+            {
+                for (node = 0; (node < nnodes && atom+node < atom_index.size()); node++)
+                {
+                    if (node > 0)
+                    {
+#if GMX_MPI
+                        MPI_Status stat;
+                        MPI_Recv(dfdx[0], atom_index.size()*DIM, mpi_type, node, node,
+                                 cr->mpi_comm_mygroup, &stat);
+#undef mpi_type
+#endif
+                    }
+
+                    row = (atom + node)*DIM + d;
+
+                    for (size_t j = 0; j < atom_index.size(); j++)
+                    {
+                        for (size_t k = 0; k < DIM; k++)
+                        {
+                            col = j*DIM + k;
+
+                            if (bSparse)
+                            {
+                                if (col >= row && dfdx[j][k] != 0.0)
+                                {
+                                    gmx_sparsematrix_increment_value(sparse_matrix,
+                                                                     row, col, dfdx[j][k]);
+                                }
+                            }
+                            else
+                            {
+                                full_matrix[row*sz+col] = dfdx[j][k];
+                            }
+                        }
+                    }
+                }
+            }
+
+            if (bVerbose && fplog)
+            {
+                fflush(fplog);
+            }
+        }
+        /* write progress */
+        if (bIsMaster && bVerbose)
+        {
+            fprintf(stderr, "\rFinished step %d out of %d",
+                    static_cast<int>(std::min(atom+nnodes, atom_index.size())),
+                    static_cast<int>(atom_index.size()));
+            fflush(stderr);
+        }
+    }
+
+    if (bIsMaster)
+    {
+        fprintf(stderr, "\n\nWriting Hessian...\n");
+        gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix);
+    }
+
+    finish_em(cr, outf, walltime_accounting, wcycle);
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size()*2);
+
+    return 0;
+}
+
+} // namespace gmx
diff --git a/patches/gromacs-2016-beta1.diff/src/programs/mdrun/md.cpp b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/md.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2bd872565f10b34eaa07f80101ae3691a30be7ca
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/md.cpp
@@ -0,0 +1,1940 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "md.h"
+
+#include "config.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+
+#include "thread_mpi/threads.h"
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_network.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-load-balancing.h"
+#include "gromacs/fileio/trxio.h"
+#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdlib/compute_io.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/ebin.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdebin.h"
+#include "gromacs/mdlib/mdoutf.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/mdrun_signalling.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/shellfc.h"
+#include "gromacs/mdlib/sighandler.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/tgroup.h"
+#include "gromacs/mdlib/trajectory_writing.h"
+#include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vcm.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/df_history.h"
+#include "gromacs/mdtypes/energyhistory.h"
+#include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/forcerec.h"
+#include "gromacs/mdtypes/group.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/atoms.h"
+#include "gromacs/topology/idef.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/real.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "deform.h"
+#include "membed.h"
+#include "repl_ex.h"
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+/* END PLUMED */
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+/*! \brief Check whether bonded interactions are missing, if appropriate
+ *
+ * \param[in]    fplog                                  Log file pointer
+ * \param[in]    cr                                     Communication object
+ * \param[in]    totalNumberOfBondedInteractions        Result of the global reduction over the number of bonds treated in each domain
+ * \param[in]    top_global                             Global topology for the error message
+ * \param[in]    top_local                              Local topology for the error message
+ * \param[in]    state                                  Global state for the error message
+ * \param[inout] shouldCheckNumberOfBondedInteractions  Whether we should do the check.
+ *
+ * \return Nothing, except that shouldCheckNumberOfBondedInteractions
+ * is always set to false after exit.
+ */
+static void checkNumberOfBondedInteractions(FILE *fplog, t_commrec *cr, int totalNumberOfBondedInteractions,
+                                            gmx_mtop_t *top_global, gmx_localtop_t *top_local, t_state *state,
+                                            bool *shouldCheckNumberOfBondedInteractions)
+{
+    if (*shouldCheckNumberOfBondedInteractions)
+    {
+        if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
+        {
+            dd_print_missing_interactions(fplog, cr, totalNumberOfBondedInteractions, top_global, top_local, state); // Does not return
+        }
+        *shouldCheckNumberOfBondedInteractions = false;
+    }
+}
+
+static void reset_all_counters(FILE *fplog, t_commrec *cr,
+                               gmx_int64_t step,
+                               gmx_int64_t *step_rel, t_inputrec *ir,
+                               gmx_wallcycle_t wcycle, t_nrnb *nrnb,
+                               gmx_walltime_accounting_t walltime_accounting,
+                               struct nonbonded_verlet_t *nbv)
+{
+    char sbuf[STEPSTRSIZE];
+
+    /* Reset all the counters related to performance over the run */
+    md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
+                  gmx_step_str(step, sbuf));
+
+    if (use_GPU(nbv))
+    {
+        nbnxn_gpu_reset_timings(nbv);
+        resetGpuProfiler();
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+    wallcycle_reset_all(wcycle);
+    if (DOMAINDECOMP(cr))
+    {
+        reset_dd_statistics_counters(cr->dd);
+    }
+    init_nrnb(nrnb);
+    ir->init_step += *step_rel;
+    ir->nsteps    -= *step_rel;
+    *step_rel      = 0;
+    wallcycle_start(wcycle, ewcRUN);
+    walltime_accounting_start(walltime_accounting);
+    print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime());
+}
+
+/*! \libinternal
+    \copydoc integrator_t (FILE *fplog, t_commrec *cr,
+                           int nfile, const t_filenm fnm[],
+                           const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                           int nstglobalcomm,
+                           gmx_vsite_t *vsite, gmx_constr_t constr,
+                           int stepout,
+                           t_inputrec *inputrec,
+                           gmx_mtop_t *top_global, t_fcdata *fcd,
+                           t_state *state_global,
+                           t_mdatoms *mdatoms,
+                           t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                           gmx_edsam_t ed,
+                           t_forcerec *fr,
+                           int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                           real cpt_period, real max_hours,
+                           int imdport,
+                           unsigned long Flags,
+                           gmx_walltime_accounting_t walltime_accounting)
+ */
+double gmx::do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
+                  const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                  int nstglobalcomm,
+                  gmx_vsite_t *vsite, gmx_constr_t constr,
+                  int stepout, t_inputrec *ir,
+                  gmx_mtop_t *top_global,
+                  t_fcdata *fcd,
+                  t_state *state_global,
+                  t_mdatoms *mdatoms,
+                  t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                  gmx_edsam_t ed, t_forcerec *fr,
+                  int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                  real cpt_period, real max_hours,
+                  int imdport,
+                  unsigned long Flags,
+                  gmx_walltime_accounting_t walltime_accounting)
+{
+    gmx_mdoutf_t    outf = NULL;
+    gmx_int64_t     step, step_rel;
+    double          elapsed_time;
+    double          t, t0, lam0[efptNR];
+    gmx_bool        bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
+    gmx_bool        bNS, bNStList, bSimAnn, bStopCM, bRerunMD,
+                    bFirstStep, startingFromCheckpoint, bInitStep, bLastStep = FALSE,
+                    bBornRadii;
+    gmx_bool          bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
+    gmx_bool          do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE,
+                      bForceUpdate = FALSE, bCPT;
+    gmx_bool          bMasterState;
+    int               force_flags, cglo_flags;
+    tensor            force_vir, shake_vir, total_vir, tmp_vir, pres;
+    int               i, m;
+    t_trxstatus      *status;
+    rvec              mu_tot;
+    t_vcm            *vcm;
+    matrix            pcoupl_mu, M;
+    t_trxframe        rerun_fr;
+    gmx_repl_ex_t     repl_ex = NULL;
+    int               nchkpt  = 1;
+    gmx_localtop_t   *top;
+    t_mdebin         *mdebin   = NULL;
+    t_state          *state    = NULL;
+    gmx_enerdata_t   *enerd;
+    rvec             *f = NULL;
+    gmx_global_stat_t gstat;
+    gmx_update_t     *upd   = NULL;
+    t_graph          *graph = NULL;
+    gmx_signalling_t  gs;
+    gmx_groups_t     *groups;
+    gmx_ekindata_t   *ekind;
+    gmx_shellfc_t    *shellfc;
+    gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
+    gmx_bool          bResetCountersHalfMaxH = FALSE;
+    gmx_bool          bTemp, bPres, bTrotter;
+    real              dvdl_constr;
+    rvec             *cbuf        = NULL;
+    int               cbuf_nalloc = 0;
+    matrix            lastbox;
+    int               lamnew  = 0;
+    /* for FEP */
+    int               nstfep = 0;
+    double            cycles;
+    real              saved_conserved_quantity = 0;
+    real              last_ekin                = 0;
+    t_extmass         MassQ;
+    int             **trotter_seq;
+    char              sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
+    int               handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/
+    gmx_int64_t       multisim_nsteps        = -1;                 /* number of steps to do  before first multisim
+                                                                          simulation stops. If equal to zero, don't
+                                                                          communicate any more between multisims.*/
+    /* PME load balancing data for GPU kernels */
+    pme_load_balancing_t *pme_loadbal      = NULL;
+    gmx_bool              bPMETune         = FALSE;
+    gmx_bool              bPMETunePrinting = FALSE;
+
+    /* Interactive MD */
+    gmx_bool          bIMDstep = FALSE;
+    gmx_membed_t     *membed   = NULL;
+
+    /* PLUMED */
+    int plumedNeedsEnergy=0;
+    int plumedWantsToStop=0;
+    matrix plumed_vir;
+    /* END PLUMED */
+
+#ifdef GMX_FAHCORE
+    /* Temporary addition for FAHCORE checkpointing */
+    int chkpt_ret;
+#endif
+    /* Domain decomposition could incorrectly miss a bonded
+       interaction, but checking for that requires a global
+       communication stage, which does not otherwise happen in DD
+       code. So we do that alongside the first global energy reduction
+       after a new DD is made. These variables handle whether the
+       check happens, and the result it returns. */
+    bool shouldCheckNumberOfBondedInteractions = false;
+    int  totalNumberOfBondedInteractions       = -1;
+
+    /* Check for special mdrun options */
+    bRerunMD = (Flags & MD_RERUN);
+    if (Flags & MD_RESETCOUNTERSHALFWAY)
+    {
+        if (ir->nsteps > 0)
+        {
+            /* Signal to reset the counters after half of the simulation steps. */
+            wcycle_set_reset_counters(wcycle, ir->nsteps/2);
+        }
+        /* Signal to reset the counters halfway through the simulation time. */
+        bResetCountersHalfMaxH = (max_hours > 0);
+    }
+
+    /* md-vv uses averaged full step velocities for T-control
+       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+    bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
+
+    if (bRerunMD)
+    {
+        /* Since we don't know if the frames read are related in any way,
+         * rebuild the neighborlist at every step.
+         */
+        ir->nstlist       = 1;
+        ir->nstcalcenergy = 1;
+        nstglobalcomm     = 1;
+    }
+
+    nstglobalcomm   = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
+    bGStatEveryStep = (nstglobalcomm == 1);
+
+    if (bRerunMD)
+    {
+        ir->nstxout_compressed = 0;
+    }
+    groups = &top_global->groups;
+
+    if (opt2bSet("-membed", nfile, fnm))
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Initializing membed");
+        }
+        /* Note that membed cannot work in parallel because mtop is
+         * changed here. Fix this if we ever want to make it run with
+         * multiple ranks. */
+        membed = init_membed(fplog, nfile, fnm, top_global, ir, state_global, cr, &cpt_period);
+    }
+
+    if (ir->eSwapCoords != eswapNO)
+    {
+        /* Initialize ion swapping code */
+        init_swapcoords(fplog, bVerbose, ir, opt2fn_master("-swap", nfile, fnm, cr),
+                        top_global, state_global->x, state_global->box, &state_global->swapstate, cr, oenv, Flags);
+    }
+
+    /* Initial values */
+    init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda,
+            &(state_global->fep_state), lam0,
+            nrnb, top_global, &upd,
+            nfile, fnm, &outf, &mdebin,
+            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags, wcycle);
+
+    clear_mat(total_vir);
+    clear_mat(pres);
+    /* Energy terms and groups */
+    snew(enerd, 1);
+    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+                  enerd);
+    if (DOMAINDECOMP(cr))
+    {
+        f = NULL;
+    }
+    else
+    {
+        snew(f, top_global->natoms);
+    }
+
+    /* Kinetic energy data */
+    snew(ekind, 1);
+    init_ekindata(fplog, top_global, &(ir->opts), ekind);
+    /* Copy the cos acceleration to the groups struct */
+    ekind->cosacc.cos_accel = ir->cos_accel;
+
+    gstat = global_stat_init(ir);
+
+    /* Check for polarizable models and flexible constraints */
+    shellfc = init_shell_flexcon(fplog,
+                                 top_global, n_flexible_constraints(constr),
+                                 ir->nstcalcenergy, DOMAINDECOMP(cr));
+
+    if (shellfc && ir->nstcalcenergy != 1)
+    {
+        gmx_fatal(FARGS, "You have nstcalcenergy set to a value (%d) that is different from 1.\nThis is not supported in combinations with shell particles.\nPlease make a new tpr file.", ir->nstcalcenergy);
+    }
+    if (shellfc && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "Shell particles are not implemented with domain decomposition, use a single rank");
+    }
+
+    if (inputrecDeform(ir))
+    {
+        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+        set_deform_reference_box(upd,
+                                 deform_init_init_step_tpx,
+                                 deform_init_box_tpx);
+        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+    }
+
+    {
+        double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+        if ((io > 2000) && MASTER(cr))
+        {
+            fprintf(stderr,
+                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+                    io);
+        }
+    }
+
+    if (DOMAINDECOMP(cr))
+    {
+        top = dd_init_local_top(top_global);
+
+        snew(state, 1);
+        dd_init_local_state(cr->dd, state_global, state);
+    }
+    else
+    {
+        top = gmx_mtop_generate_local_top(top_global, ir->efep != efepNO);
+
+        forcerec_set_excl_load(fr, top);
+
+        state    = serial_init_local_state(state_global);
+
+        atoms2md(top_global, ir, 0, NULL, top_global->natoms, mdatoms);
+
+        if (vsite)
+        {
+            set_vsite_top(vsite, top, mdatoms, cr);
+        }
+
+        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
+        {
+            graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE);
+        }
+
+        if (shellfc)
+        {
+            make_local_shells(cr, mdatoms, shellfc);
+        }
+
+        setup_bonded_threading(fr, &top->idef);
+
+        update_realloc(upd, state->nalloc);
+    }
+
+    /* Set up interactive MD (IMD) */
+    init_IMD(ir, cr, top_global, fplog, ir->nstcalcenergy, state_global->x,
+             nfile, fnm, oenv, imdport, Flags);
+
+    if (DOMAINDECOMP(cr))
+    {
+        /* Distribute the charge groups over the nodes from the master node */
+        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
+                            state_global, top_global, ir,
+                            state, &f, mdatoms, top, fr,
+                            vsite, constr,
+                            nrnb, NULL, FALSE);
+        shouldCheckNumberOfBondedInteractions = true;
+        update_realloc(upd, state->nalloc);
+    }
+
+    update_mdatoms(mdatoms, state->lambda[efptMASS]);
+
+    startingFromCheckpoint = Flags & MD_STARTFROMCPT;
+
+    if (ir->bExpanded)
+    {
+        init_expanded_ensemble(startingFromCheckpoint, ir, &state->dfhist);
+    }
+
+    if (MASTER(cr))
+    {
+        if (startingFromCheckpoint)
+        {
+            /* Update mdebin with energy history if appending to output files */
+            if (Flags & MD_APPENDFILES)
+            {
+                restore_energyhistory_from_state(mdebin, state_global->enerhist);
+            }
+            else
+            {
+                /* We might have read an energy history from checkpoint,
+                 * free the allocated memory and reset the counts.
+                 */
+                done_energyhistory(state_global->enerhist);
+                init_energyhistory(state_global->enerhist);
+            }
+        }
+        /* Set the initial energy history in state by updating once */
+        update_energyhistory(state_global->enerhist, mdebin);
+    }
+
+    /* Initialize constraints */
+    if (constr && !DOMAINDECOMP(cr))
+    {
+        set_constraints(constr, top, ir, mdatoms, cr);
+    }
+
+    if (repl_ex_nst > 0 && MASTER(cr))
+    {
+        repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
+                                        repl_ex_nst, repl_ex_nex, repl_ex_seed);
+    }
+
+    /* PME tuning is only supported with PME for Coulomb. It is not supported
+     * with only LJ PME, or for reruns.
+     */
+    bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD &&
+                !(Flags & MD_REPRODUCIBLE));
+    if (bPMETune)
+    {
+        pme_loadbal_init(&pme_loadbal, cr, fplog, ir, state->box,
+                         fr->ic, fr->pmedata, use_GPU(fr->nbv),
+                         &bPMETunePrinting);
+    }
+
+    if (!ir->bContinuation && !bRerunMD)
+    {
+        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
+        {
+            /* Set the velocities of frozen particles to zero */
+            for (i = 0; i < mdatoms->homenr; i++)
+            {
+                for (m = 0; m < DIM; m++)
+                {
+                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+                    {
+                        state->v[i][m] = 0;
+                    }
+                }
+            }
+        }
+
+        if (constr)
+        {
+            /* Constrain the initial coordinates and velocities */
+            do_constrain_first(fplog, constr, ir, mdatoms, state,
+                               cr, nrnb, fr, top);
+        }
+        if (vsite)
+        {
+            /* Construct the virtual sites for the initial configuration */
+            construct_vsites(vsite, state->x, ir->delta_t, NULL,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
+        }
+    }
+
+    if (ir->efep != efepNO)
+    {
+        /* Set free energy calculation frequency as the greatest common
+         * denominator of nstdhdl and repl_ex_nst. */
+        nstfep = ir->fepvals->nstdhdl;
+        if (ir->bExpanded)
+        {
+            nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
+        }
+        if (repl_ex_nst > 0)
+        {
+            nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep);
+        }
+    }
+
+    /* Be REALLY careful about what flags you set here. You CANNOT assume
+     * this is the first step, since we might be restarting from a checkpoint,
+     * and in that case we should not do any modifications to the state.
+     */
+    bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
+
+    if (Flags & MD_READ_EKIN)
+    {
+        restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
+    }
+
+    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
+                  | (bStopCM ? CGLO_STOPCM : 0)
+                  | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
+                  | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0)
+                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0));
+
+    bSumEkinhOld = FALSE;
+    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                    NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                    constr, NULL, FALSE, state->box,
+                    &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags
+                    | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0));
+    checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions,
+                                    top_global, top, state,
+                                    &shouldCheckNumberOfBondedInteractions);
+    if (ir->eI == eiVVAK)
+    {
+        /* a second call to get the half step temperature initialized as well */
+        /* we do the same call as above, but turn the pressure off -- internally to
+           compute_globals, this is recognized as a velocity verlet half-step
+           kinetic energy calculation.  This minimizes excess variables, but
+           perhaps loses some logic? */
+
+        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                        NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                        constr, NULL, FALSE, state->box,
+                        NULL, &bSumEkinhOld,
+                        cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE));
+    }
+
+    /* Calculate the initial half step temperature, and save the ekinh_old */
+    if (!(Flags & MD_STARTFROMCPT))
+    {
+        for (i = 0; (i < ir->opts.ngtc); i++)
+        {
+            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+        }
+    }
+    if (ir->eI != eiVV)
+    {
+        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
+                                     and there is no previous step */
+    }
+
+    /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
+       temperature control */
+    trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
+
+    if (MASTER(cr))
+    {
+        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
+        {
+            fprintf(fplog,
+                    "RMS relative constraint deviation after constraining: %.2e\n",
+                    constr_rmsd(constr));
+        }
+        if (EI_STATE_VELOCITY(ir->eI))
+        {
+            fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]);
+        }
+        if (bRerunMD)
+        {
+            fprintf(stderr, "starting md rerun '%s', reading coordinates from"
+                    " input trajectory '%s'\n\n",
+                    *(top_global->name), opt2fn("-rerun", nfile, fnm));
+            if (bVerbose)
+            {
+                fprintf(stderr, "Calculated time to finish depends on nsteps from "
+                        "run input file,\nwhich may not correspond to the time "
+                        "needed to process input trajectory.\n\n");
+            }
+        }
+        else
+        {
+            char tbuf[20];
+            fprintf(stderr, "starting mdrun '%s'\n",
+                    *(top_global->name));
+            if (ir->nsteps >= 0)
+            {
+                sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
+            }
+            else
+            {
+                sprintf(tbuf, "%s", "infinite");
+            }
+            if (ir->init_step > 0)
+            {
+                fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+                        gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
+                        gmx_step_str(ir->init_step, sbuf2),
+                        ir->init_step*ir->delta_t);
+            }
+            else
+            {
+                fprintf(stderr, "%s steps, %s ps.\n",
+                        gmx_step_str(ir->nsteps, sbuf), tbuf);
+            }
+        }
+        fprintf(fplog, "\n");
+    }
+
+    /* PLUMED */
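+    /* Before entering the MD loop, pass the run parameters to PLUMED:
+     * API-dependent options (kBT, restart), the communicators needed for
+     * multi-simulation (GREX) and domain-decomposition setups, the number of
+     * atoms, the MD engine name and the time step, and finally call "init".
+     */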
+    if(plumedswitch){
+      /* detect plumed API version */
+      int pversion=0;
+      plumed_cmd(plumedmain,"getApiVersion",&pversion);
+      /* setting kbT is only implemented with api>1) */
+      real kbT=ir->opts.ref_t[0]*BOLTZ;
+      if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT);
+      if(pversion>2){
+        int res=1;
+        if( (Flags & MD_STARTFROMCPT) ) plumed_cmd(plumedmain,"setRestart",&res);
+      }
+
+      if(cr->ms && cr->ms->nsim>1) {
+        if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters);
+        if(PAR(cr)){
+          if(DOMAINDECOMP(cr)) {
+            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all);
+          }else{
+            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim);
+          }
+        }
+        plumed_cmd(plumedmain,"GREX init",NULL);
+      }
+      if(PAR(cr)){
+        if(DOMAINDECOMP(cr)) {
+          plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
+        }
+      }
+      plumed_cmd(plumedmain,"setNatoms",&top_global->natoms);
+      plumed_cmd(plumedmain,"setMDEngine","gromacs");
+      plumed_cmd(plumedmain,"setLog",fplog);
+      real real_delta_t;
+      real_delta_t=ir->delta_t;
+      plumed_cmd(plumedmain,"setTimestep",&real_delta_t);
+      plumed_cmd(plumedmain,"init",NULL);
+
+      if(PAR(cr)){
+        if(DOMAINDECOMP(cr)) {
+          plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home);
+          plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex);
+        }
+      }
+    }
+    /* END PLUMED */
+
+    walltime_accounting_start(walltime_accounting);
+    wallcycle_start(wcycle, ewcRUN);
+    print_start(fplog, cr, walltime_accounting, "mdrun");
+
+    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
+#ifdef GMX_FAHCORE
+    chkpt_ret = fcCheckPointParallel( cr->nodeid,
+                                      NULL, 0);
+    if (chkpt_ret == 0)
+    {
+        gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
+    }
+#endif
+
+    /***********************************************************
+     *
+     *             Loop over MD steps
+     *
+     ************************************************************/
+
+    /* if rerunMD then read coordinates and velocities from input trajectory */
+    if (bRerunMD)
+    {
+        if (getenv("GMX_FORCE_UPDATE"))
+        {
+            bForceUpdate = TRUE;
+        }
+
+        rerun_fr.natoms = 0;
+        if (MASTER(cr))
+        {
+            bLastStep = !read_first_frame(oenv, &status,
+                                          opt2fn("-rerun", nfile, fnm),
+                                          &rerun_fr, TRX_NEED_X | TRX_READ_V);
+            if (rerun_fr.natoms != top_global->natoms)
+            {
+                gmx_fatal(FARGS,
+                          "Number of atoms in trajectory (%d) does not match the "
+                          "run input file (%d)\n",
+                          rerun_fr.natoms, top_global->natoms);
+            }
+            if (ir->ePBC != epbcNONE)
+            {
+                if (!rerun_fr.bBox)
+                {
+                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time);
+                }
+                if (max_cutoff2(ir->ePBC, rerun_fr.box) < gmx::square(fr->rlist))
+                {
+                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time);
+                }
+            }
+        }
+
+        if (PAR(cr))
+        {
+            rerun_parallel_comm(cr, &rerun_fr, &bLastStep);
+        }
+
+        if (ir->ePBC != epbcNONE)
+        {
+            /* Set the shift vectors.
+             * Necessary here when we have a static box different from the tpr box.
+             */
+            calc_shifts(rerun_fr.box, fr->shift_vec);
+        }
+    }
+
+    /* loop over MD steps or if rerunMD to end of input trajectory */
+    bFirstStep = TRUE;
+    /* Skip the first Nose-Hoover integration when we get the state from tpx */
+    bInitStep        = !startingFromCheckpoint || EI_VV(ir->eI);
+    bSumEkinhOld     = FALSE;
+    bExchanged       = FALSE;
+    bNeedRepartition = FALSE;
+
+    init_global_signals(&gs, cr, ir, repl_ex_nst);
+
+    step     = ir->init_step;
+    step_rel = 0;
+
+    if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
+    {
+        /* check how many steps are left in other sims */
+        multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps);
+    }
+
+
+    /* and stop now if we should */
+    bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
+                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
+    while (!bLastStep)
+    {
+
+        /* Determine if this is a neighbor search step */
+        bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+
+        if (bPMETune && bNStList)
+        {
+            /* PME grid + cut-off optimization with GPUs or PME nodes */
+            pme_loadbal_do(pme_loadbal, cr,
+                           (bVerbose && MASTER(cr)) ? stderr : NULL,
+                           fplog,
+                           ir, fr, state,
+                           wcycle,
+                           step, step_rel,
+                           &bPMETunePrinting);
+        }
+
+        wallcycle_start(wcycle, ewcSTEP);
+
+        if (bRerunMD)
+        {
+            if (rerun_fr.bStep)
+            {
+                step     = rerun_fr.step;
+                step_rel = step - ir->init_step;
+            }
+            if (rerun_fr.bTime)
+            {
+                t = rerun_fr.time;
+            }
+            else
+            {
+                t = step;
+            }
+        }
+        else
+        {
+            bLastStep = (step_rel == ir->nsteps);
+            t         = t0 + step*ir->delta_t;
+        }
+
+        if (ir->efep != efepNO || ir->bSimTemp)
+        {
+            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
+               requiring different logic. */
+
+            set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
+            bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
+            bDoFEP       = ((ir->efep != efepNO) && do_per_step(step, nstfep));
+            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
+                            && (ir->bExpanded) && (step > 0) && (!startingFromCheckpoint));
+        }
+
+        bDoReplEx = ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+                     do_per_step(step, repl_ex_nst));
+
+        if (bSimAnn)
+        {
+            update_annealing_target_temp(ir, t, upd);
+        }
+
+        if (bRerunMD)
+        {
+            if (!DOMAINDECOMP(cr) || MASTER(cr))
+            {
+                for (i = 0; i < state_global->natoms; i++)
+                {
+                    copy_rvec(rerun_fr.x[i], state_global->x[i]);
+                }
+                if (rerun_fr.bV)
+                {
+                    for (i = 0; i < state_global->natoms; i++)
+                    {
+                        copy_rvec(rerun_fr.v[i], state_global->v[i]);
+                    }
+                }
+                else
+                {
+                    for (i = 0; i < state_global->natoms; i++)
+                    {
+                        clear_rvec(state_global->v[i]);
+                    }
+                    if (bRerunWarnNoV)
+                    {
+                        fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n"
+                                "         Ekin, temperature and pressure are incorrect,\n"
+                                "         the virial will be incorrect when constraints are present.\n"
+                                "\n");
+                        bRerunWarnNoV = FALSE;
+                    }
+                }
+            }
+            copy_mat(rerun_fr.box, state_global->box);
+            copy_mat(state_global->box, state->box);
+
+            if (vsite && (Flags & MD_RERUN_VSITE))
+            {
+                if (DOMAINDECOMP(cr))
+                {
+                    gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented with domain decomposition, use a single rank");
+                }
+                if (graph)
+                {
+                    /* Following is necessary because the graph may get out of sync
+                     * with the coordinates if we only have every N'th coordinate set
+                     */
+                    mk_mshift(fplog, graph, fr->ePBC, state->box, state->x);
+                    shift_self(graph, state->box, state->x);
+                }
+                construct_vsites(vsite, state->x, ir->delta_t, state->v,
+                                 top->idef.iparams, top->idef.il,
+                                 fr->ePBC, fr->bMolPBC, cr, state->box);
+                if (graph)
+                {
+                    unshift_self(graph, state->box, state->x);
+                }
+            }
+        }
+
+        /* Stop Center of Mass motion */
+        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
+
+        if (bRerunMD)
+        {
+            /* for rerun MD always do Neighbour Searching */
+            bNS      = (bFirstStep || ir->nstlist != 0);
+            bNStList = bNS;
+        }
+        else
+        {
+            /* Determine whether or not to do Neighbour Searching */
+            bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
+        }
+
+        /* check whether we should stop because another simulation has
+           stopped. */
+        if (MULTISIM(cr))
+        {
+            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&
+                 (multisim_nsteps != ir->nsteps) )
+            {
+                if (bNS)
+                {
+                    if (MASTER(cr))
+                    {
+                        fprintf(stderr,
+                                "Stopping simulation %d because another one has finished\n",
+                                cr->ms->sim);
+                    }
+                    bLastStep         = TRUE;
+                    gs.sig[eglsCHKPT] = 1;
+                }
+            }
+        }
+
+        /* < 0 means stop after this step, > 0 means stop at next NS step */
+        if ( (gs.set[eglsSTOPCOND] < 0) ||
+             ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) )
+        {
+            bLastStep = TRUE;
+        }
+
+        /* Determine whether or not to update the Born radii if doing GB */
+        bBornRadii = bFirstStep;
+        if (ir->implicit_solvent && (step % ir->nstgbradii == 0))
+        {
+            bBornRadii = TRUE;
+        }
+
+        /* do_log triggers energy and virial calculation. Because this leads
+         * to different code paths, forces can be different. Thus for exact
+         * continuation we should avoid extra log output.
+         * Note that the || bLastStep can result in non-exact continuation
+         * beyond the last step. But we don't consider that to be an issue.
+         */
+        do_log     = do_per_step(step, ir->nstlog) || (bFirstStep && !startingFromCheckpoint) || bLastStep || bRerunMD;
+        do_verbose = bVerbose &&
+            (step % stepout == 0 || bFirstStep || bLastStep || bRerunMD);
+
+        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
+        {
+            if (bRerunMD)
+            {
+                bMasterState = TRUE;
+            }
+            else
+            {
+                bMasterState = FALSE;
+                /* Correct the new box if it is too skewed */
+                if (inputrecDynamicBox(ir))
+                {
+                    if (correct_box(fplog, step, state->box, graph))
+                    {
+                        bMasterState = TRUE;
+                    }
+                }
+                if (DOMAINDECOMP(cr) && bMasterState)
+                {
+                    dd_collect_state(cr->dd, state, state_global);
+                }
+            }
+
+            if (DOMAINDECOMP(cr))
+            {
+                /* Repartition the domain decomposition */
+                dd_partition_system(fplog, step, cr,
+                                    bMasterState, nstglobalcomm,
+                                    state_global, top_global, ir,
+                                    state, &f, mdatoms, top, fr,
+                                    vsite, constr,
+                                    nrnb, wcycle,
+                                    do_verbose && !bPMETunePrinting);
+                shouldCheckNumberOfBondedInteractions = true;
+                update_realloc(upd, state->nalloc);
+
+                /* PLUMED */
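+                /* After repartitioning, the set of atoms owned by this rank has changed,
+                 * so PLUMED is given (below) the new local atom count and the
+                 * local-to-global index map before the next force evaluation. */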
+                if(plumedswitch){
+                  plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home);
+                  plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex);
+                }
+                /* END PLUMED */
+            }
+        }
+
+        if (MASTER(cr) && do_log)
+        {
+            print_ebin_header(fplog, step, t); /* can we improve the information printed here? */
+        }
+
+        if (ir->efep != efepNO)
+        {
+            update_mdatoms(mdatoms, state->lambda[efptMASS]);
+        }
+
+        if ((bRerunMD && rerun_fr.bV) || bExchanged)
+        {
+
+            /* We need the kinetic energy at minus the half step for determining
+             * the full step kinetic energy and possibly for T-coupling.*/
+            /* This may not be quite working correctly yet . . . . */
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                            constr, NULL, FALSE, state->box,
+                            &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                            CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
+            checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions,
+                                            top_global, top, state,
+                                            &shouldCheckNumberOfBondedInteractions);
+        }
+        clear_mat(force_vir);
+
+        /* We write a checkpoint at this MD step when:
+         * either at an NS step when we signalled through gs,
+         * or at the last step (but not when we do not want confout),
+         * but never at the first step or with rerun.
+         */
+        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
+                 (bLastStep && (Flags & MD_CONFOUT))) &&
+                step > ir->init_step && !bRerunMD);
+        if (bCPT)
+        {
+            gs.set[eglsCHKPT] = 0;
+        }
+
+        /* Determine the energy and pressure:
+         * at nstcalcenergy steps and at energy output steps (set below).
+         */
+        if (EI_VV(ir->eI) && (!bInitStep))
+        {
+            /* for vv, the first half of the integration actually corresponds
+               to the previous step.  bCalcEner is only required to be evaluated on the 'next' step,
+               but the virial needs to be calculated on both the current step and the 'next' step. Future
+               reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
+
+            /* TODO: This is probably not what we want, we will write to energy file one step after nstcalcenergy steps. */
+            bCalcEnerStep = do_per_step(step - 1, ir->nstcalcenergy);
+            bCalcVir      = bCalcEnerStep ||
+                (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+        }
+        else
+        {
+            bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
+            bCalcVir      = bCalcEnerStep ||
+                (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+        }
+        bCalcEner = bCalcEnerStep;
+
+        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep || bRerunMD);
+
+        if (do_ene || do_log || bDoReplEx)
+        {
+            bCalcVir  = TRUE;
+            bCalcEner = TRUE;
+        }
+
+        /* Do we need global communication ? */
+        bGStat = (bCalcVir || bCalcEner || bStopCM ||
+                  do_per_step(step, nstglobalcomm) ||
+                  (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm)));
+
+        force_flags = (GMX_FORCE_STATECHANGED |
+                       ((inputrecDynamicBox(ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
+                       GMX_FORCE_ALLFORCES |
+                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+                       (bDoFEP ? GMX_FORCE_DHDL : 0)
+                       );
+
+        if (shellfc)
+        {
+            /* Now is the time to relax the shells */
+            relax_shell_flexcon(fplog, cr, bVerbose, step,
+                                ir, bNS, force_flags, top,
+                                constr, enerd, fcd,
+                                state, f, force_vir, mdatoms,
+                                nrnb, wcycle, graph, groups,
+                                shellfc, fr, bBornRadii, t, mu_tot,
+                                vsite, mdoutf_get_fp_field(outf));
+        }
+        else
+        {
+            /* The coordinates (x) are shifted (to get whole molecules)
+             * in do_force.
+             * This is parallellized as well, and does communication too.
+             * Check comments in sim_util.c
+             */
+
+            /* PLUMED */
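+            /* Before the force call, PLUMED is handed pointers to the current step
+             * number, positions, masses, charges, box, force array and a scratch
+             * virial; after prepareCalc, the isEnergyNeeded query reports whether
+             * the potential energy has to be collected on this step. */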
+            plumedNeedsEnergy=0;
+            if(plumedswitch){
+              long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep);
+              plumed_cmd(plumedmain,"setPositions",&state->x[0][0]);
+              plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[0]);
+              plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[0]);
+              plumed_cmd(plumedmain,"setBox",&state->box[0][0]);
+              plumed_cmd(plumedmain,"prepareCalc",NULL);
+              plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop);
+              plumed_cmd(plumedmain,"setForces",&f[0][0]);
+              plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
+              clear_mat(plumed_vir);
+              plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]);
+            }
+            /* END PLUMED */
+            do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups,
+                     state->box, state->x, &state->hist,
+                     f, force_vir, mdatoms, enerd, fcd,
+                     state->lambda, graph,
+                     fr, vsite, mu_tot, t, mdoutf_get_fp_field(outf), ed, bBornRadii,
+                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
+            /* PLUMED */
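+            /* Fold the PLUMED contribution into the GROMACS virial. When PLUMED
+             * needs the energy, the scratch virial is pre-loaded with twice
+             * force_vir, the potential energy is passed, and force_vir is rebuilt
+             * as half of the buffer after performCalc; otherwise the PLUMED virial
+             * is halved and added directly. The factor of two presumably reflects
+             * the half-virial convention used internally by GROMACS. On replica
+             * exchange steps the positions are saved for GREX, and if PLUMED
+             * raised its stop flag the total number of steps is reduced so that
+             * the MD loop terminates. */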
+            if(plumedswitch){
+              if(plumedNeedsEnergy){
+                msmul(force_vir,2.0,plumed_vir);
+                plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]);
+                plumed_cmd(plumedmain,"performCalc",NULL);
+                msmul(plumed_vir,0.5,force_vir);
+              } else {
+                msmul(plumed_vir,0.5,plumed_vir);
+                m_add(force_vir,plumed_vir,force_vir);
+              }
+              if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+                 do_per_step(step,repl_ex_nst)) plumed_cmd(plumedmain,"GREX savePositions",NULL);
+              if(plumedWantsToStop) ir->nsteps=step_rel+1;
+            }
+            /* END PLUMED */
+        }
+
+        if (EI_VV(ir->eI) && !startingFromCheckpoint && !bRerunMD)
+        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
+        {
+            rvec *vbuf = NULL;
+
+            wallcycle_start(wcycle, ewcUPDATE);
+            if (ir->eI == eiVV && bInitStep)
+            {
+                /* if using velocity verlet with full time step Ekin,
+                 * take the first half step only to compute the
+                 * virial for the first step. From there,
+                 * revert back to the initial coordinates
+                 * so that the input is actually the initial step.
+                 */
+                snew(vbuf, state->natoms);
+                copy_rvecn(state->v, vbuf, 0, state->natoms); /* should make this better for parallelizing? */
+            }
+            else
+            {
+                /* this is for NHC in the Ekin(t+dt/2) version of vv */
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
+            }
+
+            update_coords(fplog, step, ir, mdatoms, state, f, fcd,
+                          ekind, M, upd, etrtVELOCITY1,
+                          cr, constr);
+
+            if (!bRerunMD || rerun_fr.bV || bForceUpdate)         /* Why is rerun_fr.bV here?  Unclear. */
+            {
+                wallcycle_stop(wcycle, ewcUPDATE);
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, shake_vir,
+                                   cr, nrnb, wcycle, upd, constr,
+                                   TRUE, bCalcVir);
+                wallcycle_start(wcycle, ewcUPDATE);
+            }
+            else if (graph)
+            {
+                /* Need to unshift here if a do_force has been
+                   called in the previous step */
+                unshift_self(graph, state->box, state->x);
+            }
+            /* if VV, compute the pressure and constraints */
+            /* For VV2, we strictly only need this if using pressure
+             * control, but we really would like to have accurate pressures
+             * printed out.
+             * Think about ways around this in the future?
+             * For now, keep this choice in comments.
+             */
+            /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
+            /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
+            bPres = TRUE;
+            bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+            if (bCalcEner && ir->eI == eiVVAK)
+            {
+                bSumEkinhOld = TRUE;
+            }
+            /* for vv, the first half of the integration actually corresponds to the previous step.
+               So we need information from the last step in the first half of the integration */
+            if (bGStat || do_per_step(step-1, nstglobalcomm))
+            {
+                wallcycle_stop(wcycle, ewcUPDATE);
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, NULL, FALSE, state->box,
+                                &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                                (bGStat ? CGLO_GSTAT : 0)
+                                | CGLO_ENERGY
+                                | (bTemp ? CGLO_TEMPERATURE : 0)
+                                | (bPres ? CGLO_PRESSURE : 0)
+                                | (bPres ? CGLO_CONSTRAINT : 0)
+                                | (bStopCM ? CGLO_STOPCM : 0)
+                                | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
+                                | CGLO_SCALEEKIN
+                                );
+                /* explanation of above:
+                   a) We compute Ekin at the full time step
+                   if 1) we are using the AveVel Ekin, and it's not the
+                   initial step, or 2) if we are using AveEkin, but need the full
+                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+                   EkinAveVel because it's needed for the pressure */
+                checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions,
+                                                top_global, top, state,
+                                                &shouldCheckNumberOfBondedInteractions);
+                wallcycle_start(wcycle, ewcUPDATE);
+            }
+            /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+            if (!bInitStep)
+            {
+                if (bTrotter)
+                {
+                    m_add(force_vir, shake_vir, total_vir);     /* we need the un-dispersion corrected total vir here */
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
+
+                    copy_mat(shake_vir, state->svir_prev);
+                    copy_mat(force_vir, state->fvir_prev);
+                    if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
+                    {
+                        /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+                        enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE);
+                        enerd->term[F_EKIN] = trace(ekind->ekin);
+                    }
+                }
+                else if (bExchanged)
+                {
+                    wallcycle_stop(wcycle, ewcUPDATE);
+                    /* We need the kinetic energy at minus the half step for determining
+                     * the full step kinetic energy and possibly for T-coupling.*/
+                    /* This may not be quite working correctly yet . . . . */
+                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                    wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                                    constr, NULL, FALSE, state->box,
+                                    NULL, &bSumEkinhOld,
+                                    CGLO_GSTAT | CGLO_TEMPERATURE);
+                    wallcycle_start(wcycle, ewcUPDATE);
+                }
+            }
+            /* if it's the initial step, we performed this first step just to get the constraint virial */
+            if (ir->eI == eiVV && bInitStep)
+            {
+                copy_rvecn(vbuf, state->v, 0, state->natoms);
+                sfree(vbuf);
+            }
+            wallcycle_stop(wcycle, ewcUPDATE);
+        }
+
+        /* compute the conserved quantity */
+        if (EI_VV(ir->eI))
+        {
+            saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
+            if (ir->eI == eiVV)
+            {
+                last_ekin = enerd->term[F_EKIN];
+            }
+            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+            {
+                saved_conserved_quantity -= enerd->term[F_DISPCORR];
+            }
+            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
+            if (ir->efep != efepNO && !bRerunMD)
+            {
+                sum_dhdl(enerd, state->lambda, ir->fepvals);
+            }
+        }
+
+        /* ########  END FIRST UPDATE STEP  ############## */
+        /* ########  If doing VV, we now have v(dt) ###### */
+        if (bDoExpanded)
+        {
+            /* perform extended ensemble sampling in lambda - we don't
+               actually move to the new state before outputting
+               statistics, but if performing simulated tempering, we
+               do update the velocities and the tau_t. */
+
+            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, state->v, mdatoms);
+            /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
+            copy_df_history(&state_global->dfhist, &state->dfhist);
+        }
+
+        /* Now we have the energies and forces corresponding to the
+         * coordinates at time t. We must output all of this before
+         * the update.
+         */
+        do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
+                                 ir, state, state_global, top_global, fr,
+                                 outf, mdebin, ekind, f,
+                                 &nchkpt,
+                                 bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT),
+                                 bSumEkinhOld);
+        /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
+        bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, state->x, ir, t, wcycle);
+
+        /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
+        if (startingFromCheckpoint && bTrotter)
+        {
+            copy_mat(state->svir_prev, shake_vir);
+            copy_mat(state->fvir_prev, force_vir);
+        }
+
+        elapsed_time = walltime_accounting_get_current_elapsed_time(walltime_accounting);
+
+        /* Check whether everything is still allright */
+        if (((int)gmx_get_stop_condition() > handled_stop_condition)
+#if GMX_THREAD_MPI
+            && MASTER(cr)
+#endif
+            )
+        {
+            int nsteps_stop = -1;
+
+            /* this is just to make gs.sig compatible with the hack
+               of sending signals around by MPI_Reduce together with
+               other floats */
+            if (gmx_get_stop_condition() == gmx_stop_cond_next_ns)
+            {
+                gs.sig[eglsSTOPCOND] = 1;
+                nsteps_stop          = std::max(ir->nstlist, 2*nstglobalcomm);
+            }
+            if (gmx_get_stop_condition() == gmx_stop_cond_next)
+            {
+                gs.sig[eglsSTOPCOND] = -1;
+                nsteps_stop          = nstglobalcomm + 1;
+            }
+            if (fplog)
+            {
+                fprintf(fplog,
+                        "\n\nReceived the %s signal, stopping within %d steps\n\n",
+                        gmx_get_signal_name(), nsteps_stop);
+                fflush(fplog);
+            }
+            fprintf(stderr,
+                    "\n\nReceived the %s signal, stopping within %d steps\n\n",
+                    gmx_get_signal_name(), nsteps_stop);
+            fflush(stderr);
+            handled_stop_condition = (int)gmx_get_stop_condition();
+        }
+        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
+                 (max_hours > 0 && elapsed_time > max_hours*60.0*60.0*0.99) &&
+                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
+        {
+            /* Signal to terminate the run */
+            gs.sig[eglsSTOPCOND] = 1;
+            if (fplog)
+            {
+                fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+            }
+            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+        }
+
+        if (bResetCountersHalfMaxH && MASTER(cr) &&
+            elapsed_time > max_hours*60.0*60.0*0.495)
+        {
+            /* Set flag that will communicate the signal to all ranks in the simulation */
+            gs.sig[eglsRESETCOUNTERS] = 1;
+        }
+
+        /* In parallel we only have to check for checkpointing in steps
+         * where we do global communication,
+         *  otherwise the other nodes don't know.
+         */
+        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
+                           cpt_period >= 0 &&
+                           (cpt_period == 0 ||
+                            elapsed_time >= nchkpt*cpt_period*60.0)) &&
+            gs.set[eglsCHKPT] == 0)
+        {
+            gs.sig[eglsCHKPT] = 1;
+        }
+
+        /* #########   START SECOND UPDATE STEP ################# */
+
+        /* at the start of step, randomize or scale the velocities (if vv). Restriction of Andersen controlled
+           in preprocessing */
+
+        if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+        {
+            gmx_bool bIfRandomize;
+            bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state, upd, constr);
+            /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+            if (constr && bIfRandomize)
+            {
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, tmp_vir,
+                                   cr, nrnb, wcycle, upd, constr,
+                                   TRUE, bCalcVir);
+            }
+        }
+        /* Box is changed in update() when we do pressure coupling,
+         * but we should still use the old box for energy corrections and when
+         * writing it to the energy file, so it matches the trajectory files for
+         * the same timestep above. Make a copy in a separate array.
+         */
+        copy_mat(state->box, lastbox);
+
+        dvdl_constr = 0;
+
+        if (!bRerunMD || rerun_fr.bV || bForceUpdate)
+        {
+            wallcycle_start(wcycle, ewcUPDATE);
+            /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+            if (bTrotter)
+            {
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+                /* We can only do Berendsen coupling after we have summed
+                 * the kinetic energy or virial. Since this happens
+                 * in global_state after update, we should only do it at
+                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
+                 */
+            }
+            else
+            {
+                update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
+                update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep);
+            }
+
+            if (EI_VV(ir->eI))
+            {
+                /* velocity half-step update */
+                update_coords(fplog, step, ir, mdatoms, state, f, fcd,
+                              ekind, M, upd, etrtVELOCITY2,
+                              cr, constr);
+            }
+
+            /* Above, initialize just copies ekinh into ekin,
+             * it doesn't copy position (for VV),
+             * and entire integrator for MD.
+             */
+
+            if (ir->eI == eiVVAK)
+            {
+                /* We probably only need md->homenr, not state->natoms */
+                if (state->natoms > cbuf_nalloc)
+                {
+                    cbuf_nalloc = state->natoms;
+                    srenew(cbuf, cbuf_nalloc);
+                }
+                copy_rvecn(state->x, cbuf, 0, state->natoms);
+            }
+
+            update_coords(fplog, step, ir, mdatoms, state, f, fcd,
+                          ekind, M, upd, etrtPOSITION, cr, constr);
+            wallcycle_stop(wcycle, ewcUPDATE);
+
+            update_constraints(fplog, step, &dvdl_constr, ir, mdatoms, state,
+                               fr->bMolPBC, graph, f,
+                               &top->idef, shake_vir,
+                               cr, nrnb, wcycle, upd, constr,
+                               FALSE, bCalcVir);
+
+            if (ir->eI == eiVVAK)
+            {
+                /* erase F_EKIN and F_TEMP here? */
+                /* just compute the kinetic energy at the half step to perform a trotter step */
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, NULL, FALSE, lastbox,
+                                NULL, &bSumEkinhOld,
+                                (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE
+                                );
+                wallcycle_start(wcycle, ewcUPDATE);
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+                /* now we know the scaling, we can compute the positions again */
+                copy_rvecn(cbuf, state->x, 0, state->natoms);
+
+                update_coords(fplog, step, ir, mdatoms, state, f, fcd,
+                              ekind, M, upd, etrtPOSITION, cr, constr);
+                wallcycle_stop(wcycle, ewcUPDATE);
+
+                /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+                /* are the small terms in the shake_vir here due
+                 * to numerical errors, or are they important
+                 * physically? I'm thinking they are just errors, but not completely sure.
+                 * For now, will call without actually constraining, constr=NULL*/
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, tmp_vir,
+                                   cr, nrnb, wcycle, upd, NULL,
+                                   FALSE, bCalcVir);
+            }
+            if (EI_VV(ir->eI))
+            {
+                /* this factor of 2 correction is necessary
+                   because half of the constraint force is removed
+                   in the vv step, so we have to double it.  See
+                   the Redmine issue #1255.  It is not yet clear
+                   if the factor of 2 is exact, or just a very
+                   good approximation, and this will be
+                   investigated.  The next step is to see if this
+                   can be done adding a dhdl contribution from the
+                   rattle step, but this is somewhat more
+                   complicated with the current code. Will be
+                   investigated, hopefully for 4.6.3. However,
+                   this current solution is much better than
+                   having it completely wrong.
+                 */
+                enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+            }
+            else
+            {
+                enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+            }
+        }
+        else if (graph)
+        {
+            /* Need to unshift here */
+            unshift_self(graph, state->box, state->x);
+        }
+
+        if (vsite != NULL)
+        {
+            wallcycle_start(wcycle, ewcVSITECONSTR);
+            if (graph != NULL)
+            {
+                shift_self(graph, state->box, state->x);
+            }
+            construct_vsites(vsite, state->x, ir->delta_t, state->v,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
+
+            if (graph != NULL)
+            {
+                unshift_self(graph, state->box, state->x);
+            }
+            wallcycle_stop(wcycle, ewcVSITECONSTR);
+        }
+
+        /* ############## IF NOT VV, Calculate globals HERE  ############ */
+        /* With Leap-Frog we can skip compute_globals at
+         * non-communication steps, but we need to calculate
+         * the kinetic energy one step before communication.
+         */
+        if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+        {
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                            wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                            constr, &gs,
+                            (step_rel % gs.nstms == 0) &&
+                            (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+                            lastbox,
+                            &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                            (bGStat ? CGLO_GSTAT : 0)
+                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+                            | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+                            | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+                            | CGLO_CONSTRAINT
+                            | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
+                            );
+            checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions,
+                                            top_global, top, state,
+                                            &shouldCheckNumberOfBondedInteractions);
+        }
+
+        /* #############  END CALC EKIN AND PRESSURE ################# */
+
+        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+           the virial that should probably be addressed eventually. state->veta has better properties,
+           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+
+        if (ir->efep != efepNO && (!EI_VV(ir->eI) || bRerunMD))
+        {
+            /* Sum up the foreign energy and dhdl terms for md and sd.
+               Currently done every step so that dhdl is correct in the .edr */
+            sum_dhdl(enerd, state->lambda, ir->fepvals);
+        }
+        update_box(fplog, step, ir, mdatoms, state, f,
+                   pcoupl_mu, nrnb, upd);
+
+        /* ################# END UPDATE STEP 2 ################# */
+        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
+
+        /* The coordinates (x) were unshifted in update */
+        if (!bGStat)
+        {
+            /* We will not sum ekinh_old,
+             * so signal that we still have to do it.
+             */
+            bSumEkinhOld = TRUE;
+        }
+
+        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
+
+        /* use the directly determined last velocity, not actually the averaged half steps */
+        if (bTrotter && ir->eI == eiVV)
+        {
+            enerd->term[F_EKIN] = last_ekin;
+        }
+        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+        if (EI_VV(ir->eI))
+        {
+            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+        }
+        else
+        {
+            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ);
+        }
+        /* #########  END PREPARING EDR OUTPUT  ###########  */
+
+        /* Output stuff */
+        if (MASTER(cr))
+        {
+            if (fplog && do_log && bDoExpanded)
+            {
+                /* only needed if doing expanded ensemble */
+                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
+                                          &state_global->dfhist, state->fep_state, ir->nstlog, step);
+            }
+            if (bCalcEner)
+            {
+                upd_mdebin(mdebin, bDoDHDL, bCalcEnerStep,
+                           t, mdatoms->tmass, enerd, state,
+                           ir->fepvals, ir->expandedvals, lastbox,
+                           shake_vir, force_vir, total_vir, pres,
+                           ekind, mu_tot, constr);
+            }
+            else
+            {
+                upd_mdebin_step(mdebin);
+            }
+
+            gmx_bool do_dr  = do_per_step(step, ir->nstdisreout);
+            gmx_bool do_or  = do_per_step(step, ir->nstorireout);
+
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : NULL,
+                       step, t,
+                       eprNORMAL, mdebin, fcd, groups, &(ir->opts));
+
+            if (ir->bPull)
+            {
+                pull_print_output(ir->pull_work, step, t);
+            }
+
+            if (do_per_step(step, ir->nstlog))
+            {
+                if (fflush(fplog) != 0)
+                {
+                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+                }
+            }
+        }
+        if (bDoExpanded)
+        {
+            /* Have to do this part _after_ outputting the logfile and the edr file */
+            /* Gets written into the state at the beginning of next loop*/
+            state->fep_state = lamnew;
+        }
+        /* Print the remaining wall clock time for the run */
+        if (MULTIMASTER(cr) &&
+            (do_verbose || gmx_got_usr_signal()) &&
+            !bPMETunePrinting)
+        {
+            if (shellfc)
+            {
+                fprintf(stderr, "\n");
+            }
+            print_time(stderr, walltime_accounting, step, ir, cr);
+        }
+
+        /* Ion/water position swapping.
+         * Not done in last step since trajectory writing happens before this call
+         * in the MD loop and exchanges would be lost anyway. */
+        bNeedRepartition = FALSE;
+        if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep &&
+            do_per_step(step, ir->swap->nstswap))
+        {
+            bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle,
+                                             bRerunMD ? rerun_fr.x   : state->x,
+                                             bRerunMD ? rerun_fr.box : state->box,
+                                             top_global, MASTER(cr) && bVerbose, bRerunMD);
+
+            if (bNeedRepartition && DOMAINDECOMP(cr))
+            {
+                dd_collect_state(cr->dd, state, state_global);
+            }
+        }
+
+        /* Replica exchange */
+        bExchanged = FALSE;
+        if (bDoReplEx)
+        {
+            bExchanged = replica_exchange(fplog, cr, repl_ex,
+                                          state_global, enerd,
+                                          state, step, t);
+        }
+
+        if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) )
+        {
+            dd_partition_system(fplog, step, cr, TRUE, 1,
+                                state_global, top_global, ir,
+                                state, &f, mdatoms, top, fr,
+                                vsite, constr,
+                                nrnb, wcycle, FALSE);
+            shouldCheckNumberOfBondedInteractions = true;
+            update_realloc(upd, state->nalloc);
+        }
+
+        bFirstStep             = FALSE;
+        bInitStep              = FALSE;
+        startingFromCheckpoint = FALSE;
+
+        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+        /* With all integrators, except VV, we need to retain the pressure
+         * at the current step for coupling at the next step.
+         */
+        if ((state->flags & (1<<estPRES_PREV)) &&
+            (bGStatEveryStep ||
+             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+        {
+            /* Store the pressure in t_state for pressure coupling
+             * at the next MD step.
+             */
+            copy_mat(pres, state->pres_prev);
+        }
+
+        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+
+        if ( (membed != NULL) && (!bLastStep) )
+        {
+            rescale_membed(step_rel, membed, state_global->x);
+        }
+
+        if (bRerunMD)
+        {
+            if (MASTER(cr))
+            {
+                /* read next frame from input trajectory */
+                bLastStep = !read_next_frame(oenv, status, &rerun_fr);
+            }
+
+            if (PAR(cr))
+            {
+                rerun_parallel_comm(cr, &rerun_fr, &bLastStep);
+            }
+        }
+
+        cycles = wallcycle_stop(wcycle, ewcSTEP);
+        if (DOMAINDECOMP(cr) && wcycle)
+        {
+            dd_cycles_add(cr->dd, cycles, ddCyclStep);
+        }
+
+        if (!bRerunMD || !rerun_fr.bStep)
+        {
+            /* increase the MD step number */
+            step++;
+            step_rel++;
+        }
+
+        /* TODO make a counter-reset module */
+        /* If it is time to reset counters, set a flag that remains
+           true until counters actually get reset */
+        if (step_rel == wcycle_get_reset_counters(wcycle) ||
+            gs.set[eglsRESETCOUNTERS] != 0)
+        {
+            if (pme_loadbal_is_active(pme_loadbal))
+            {
+                /* Do not permit counter reset while PME load
+                 * balancing is active. The only purpose for resetting
+                 * counters is to measure reliable performance data,
+                 * and that can't be done before balancing
+                 * completes.
+                 *
+                 * TODO consider fixing this by delaying the reset
+                 * until after load balancing completes,
+                 * e.g. https://gerrit.gromacs.org/#/c/4964/2 */
+                gmx_fatal(FARGS, "PME tuning was still active when attempting to "
+                          "reset mdrun counters at step %" GMX_PRId64 ". Try "
+                          "resetting counters later in the run, e.g. with gmx "
+                          "mdrun -resetstep.", step);
+            }
+            reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting,
+                               use_GPU(fr->nbv) ? fr->nbv : NULL);
+            wcycle_set_reset_counters(wcycle, -1);
+            if (!(cr->duty & DUTY_PME))
+            {
+                /* Tell our PME node to reset its counters */
+                gmx_pme_send_resetcounters(cr, step);
+            }
+            /* Correct max_hours for the elapsed time */
+            max_hours                -= elapsed_time/(60.0*60.0);
+            /* If mdrun -maxh -resethway was active, it can only trigger once */
+            bResetCountersHalfMaxH    = FALSE; /* TODO move this to where gs.sig[eglsRESETCOUNTERS] is set */
+            /* Reset can only happen once, so clear the triggering flag. */
+            gs.set[eglsRESETCOUNTERS] = 0;
+        }
+
+        /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
+        IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle);
+
+    }
+    /* End of main MD loop */
+
+    /* Closing TNG files can include compressing data. Therefore it is good to do that
+     * before stopping the time measurements. */
+    mdoutf_tng_close(outf);
+
+    /* Stop measuring walltime */
+    walltime_accounting_end(walltime_accounting);
+
+    if (bRerunMD && MASTER(cr))
+    {
+        close_trj(status);
+    }
+
+    if (!(cr->duty & DUTY_PME))
+    {
+        /* Tell the PME only node to finish */
+        gmx_pme_send_finish(cr);
+    }
+
+    if (MASTER(cr))
+    {
+        if (ir->nstcalcenergy > 0 && !bRerunMD)
+        {
+            print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t,
+                       eprAVER, mdebin, fcd, groups, &(ir->opts));
+        }
+    }
+
+    done_mdoutf(outf);
+
+    if (bPMETune)
+    {
+        pme_loadbal_done(pme_loadbal, cr, fplog, use_GPU(fr->nbv));
+    }
+
+    done_shellfc(fplog, shellfc, step_rel);
+
+    if (repl_ex_nst > 0 && MASTER(cr))
+    {
+        print_replica_exchange_statistics(fplog, repl_ex);
+    }
+
+    // Clean up swapcoords
+    if (ir->eSwapCoords != eswapNO)
+    {
+        finish_swapcoords(ir->swap);
+    }
+
+    if (membed != nullptr)
+    {
+        free_membed(membed);
+    }
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(ir->bIMD, ir->imd);
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
+
+    return 0;
+}
diff --git a/patches/gromacs-2016-beta1.diff/src/programs/mdrun/md.cpp.preplumed b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/md.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..92fa911eb42e08119a3cb688cadea70fab76148f
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/md.cpp.preplumed
@@ -0,0 +1,1841 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "md.h"
+
+#include "config.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+
+#include "thread_mpi/threads.h"
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_network.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-load-balancing.h"
+#include "gromacs/fileio/trxio.h"
+#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdlib/compute_io.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/ebin.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdebin.h"
+#include "gromacs/mdlib/mdoutf.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/mdrun_signalling.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/ns.h"
+#include "gromacs/mdlib/shellfc.h"
+#include "gromacs/mdlib/sighandler.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/tgroup.h"
+#include "gromacs/mdlib/trajectory_writing.h"
+#include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vcm.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/df_history.h"
+#include "gromacs/mdtypes/energyhistory.h"
+#include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/forcerec.h"
+#include "gromacs/mdtypes/group.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/atoms.h"
+#include "gromacs/topology/idef.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/real.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "deform.h"
+#include "membed.h"
+#include "repl_ex.h"
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+/*! \brief Check whether bonded interactions are missing, if appropriate
+ *
+ * \param[in]    fplog                                  Log file pointer
+ * \param[in]    cr                                     Communication object
+ * \param[in]    totalNumberOfBondedInteractions        Result of the global reduction over the number of bonds treated in each domain
+ * \param[in]    top_global                             Global topology for the error message
+ * \param[in]    top_local                              Local topology for the error message
+ * \param[in]    state                                  Global state for the error message
+ * \param[inout] shouldCheckNumberOfBondedInteractions  Whether we should do the check.
+ *
+ * \return Nothing, except that shouldCheckNumberOfBondedInteractions
+ * is always set to false after exit.
+ */
+static void checkNumberOfBondedInteractions(FILE *fplog, t_commrec *cr, int totalNumberOfBondedInteractions,
+                                            gmx_mtop_t *top_global, gmx_localtop_t *top_local, t_state *state,
+                                            bool *shouldCheckNumberOfBondedInteractions)
+{
+    if (*shouldCheckNumberOfBondedInteractions)
+    {
+        if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
+        {
+            dd_print_missing_interactions(fplog, cr, totalNumberOfBondedInteractions, top_global, top_local, state); // Does not return
+        }
+        *shouldCheckNumberOfBondedInteractions = false;
+    }
+}
+
+static void reset_all_counters(FILE *fplog, t_commrec *cr,
+                               gmx_int64_t step,
+                               gmx_int64_t *step_rel, t_inputrec *ir,
+                               gmx_wallcycle_t wcycle, t_nrnb *nrnb,
+                               gmx_walltime_accounting_t walltime_accounting,
+                               struct nonbonded_verlet_t *nbv)
+{
+    char sbuf[STEPSTRSIZE];
+
+    /* Reset all the counters related to performance over the run */
+    md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
+                  gmx_step_str(step, sbuf));
+
+    if (use_GPU(nbv))
+    {
+        nbnxn_gpu_reset_timings(nbv);
+        resetGpuProfiler();
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+    wallcycle_reset_all(wcycle);
+    if (DOMAINDECOMP(cr))
+    {
+        reset_dd_statistics_counters(cr->dd);
+    }
+    init_nrnb(nrnb);
+    ir->init_step += *step_rel;
+    ir->nsteps    -= *step_rel;
+    *step_rel      = 0;
+    wallcycle_start(wcycle, ewcRUN);
+    walltime_accounting_start(walltime_accounting);
+    print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime());
+}
+
+/*! \libinternal
+    \copydoc integrator_t (FILE *fplog, t_commrec *cr,
+                           int nfile, const t_filenm fnm[],
+                           const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                           int nstglobalcomm,
+                           gmx_vsite_t *vsite, gmx_constr_t constr,
+                           int stepout,
+                           t_inputrec *inputrec,
+                           gmx_mtop_t *top_global, t_fcdata *fcd,
+                           t_state *state_global,
+                           t_mdatoms *mdatoms,
+                           t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                           gmx_edsam_t ed,
+                           t_forcerec *fr,
+                           int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                           real cpt_period, real max_hours,
+                           int imdport,
+                           unsigned long Flags,
+                           gmx_walltime_accounting_t walltime_accounting)
+ */
+double gmx::do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
+                  const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                  int nstglobalcomm,
+                  gmx_vsite_t *vsite, gmx_constr_t constr,
+                  int stepout, t_inputrec *ir,
+                  gmx_mtop_t *top_global,
+                  t_fcdata *fcd,
+                  t_state *state_global,
+                  t_mdatoms *mdatoms,
+                  t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                  gmx_edsam_t ed, t_forcerec *fr,
+                  int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                  real cpt_period, real max_hours,
+                  int imdport,
+                  unsigned long Flags,
+                  gmx_walltime_accounting_t walltime_accounting)
+{
+    gmx_mdoutf_t    outf = NULL;
+    gmx_int64_t     step, step_rel;
+    double          elapsed_time;
+    double          t, t0, lam0[efptNR];
+    gmx_bool        bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
+    gmx_bool        bNS, bNStList, bSimAnn, bStopCM, bRerunMD,
+                    bFirstStep, startingFromCheckpoint, bInitStep, bLastStep = FALSE,
+                    bBornRadii;
+    gmx_bool          bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
+    gmx_bool          do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE,
+                      bForceUpdate = FALSE, bCPT;
+    gmx_bool          bMasterState;
+    int               force_flags, cglo_flags;
+    tensor            force_vir, shake_vir, total_vir, tmp_vir, pres;
+    int               i, m;
+    t_trxstatus      *status;
+    rvec              mu_tot;
+    t_vcm            *vcm;
+    matrix            pcoupl_mu, M;
+    t_trxframe        rerun_fr;
+    gmx_repl_ex_t     repl_ex = NULL;
+    int               nchkpt  = 1;
+    gmx_localtop_t   *top;
+    t_mdebin         *mdebin   = NULL;
+    t_state          *state    = NULL;
+    gmx_enerdata_t   *enerd;
+    rvec             *f = NULL;
+    gmx_global_stat_t gstat;
+    gmx_update_t     *upd   = NULL;
+    t_graph          *graph = NULL;
+    gmx_signalling_t  gs;
+    gmx_groups_t     *groups;
+    gmx_ekindata_t   *ekind;
+    gmx_shellfc_t    *shellfc;
+    gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
+    gmx_bool          bResetCountersHalfMaxH = FALSE;
+    gmx_bool          bTemp, bPres, bTrotter;
+    real              dvdl_constr;
+    rvec             *cbuf        = NULL;
+    int               cbuf_nalloc = 0;
+    matrix            lastbox;
+    int               lamnew  = 0;
+    /* for FEP */
+    int               nstfep = 0;
+    double            cycles;
+    real              saved_conserved_quantity = 0;
+    real              last_ekin                = 0;
+    t_extmass         MassQ;
+    int             **trotter_seq;
+    char              sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
+    int               handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/
+    gmx_int64_t       multisim_nsteps        = -1;                 /* number of steps to do  before first multisim
+                                                                          simulation stops. If equal to zero, don't
+                                                                          communicate any more between multisims.*/
+    /* PME load balancing data for GPU kernels */
+    pme_load_balancing_t *pme_loadbal      = NULL;
+    gmx_bool              bPMETune         = FALSE;
+    gmx_bool              bPMETunePrinting = FALSE;
+
+    /* Interactive MD */
+    gmx_bool          bIMDstep = FALSE;
+    gmx_membed_t     *membed   = NULL;
+
+#ifdef GMX_FAHCORE
+    /* Temporary addition for FAHCORE checkpointing */
+    int chkpt_ret;
+#endif
+    /* Domain decomposition could incorrectly miss a bonded
+       interaction, but checking for that requires a global
+       communication stage, which does not otherwise happen in DD
+       code. So we do that alongside the first global energy reduction
+       after a new DD is made. These variables handle whether the
+       check happens, and the result it returns. */
+    bool shouldCheckNumberOfBondedInteractions = false;
+    int  totalNumberOfBondedInteractions       = -1;
+
+    /* Check for special mdrun options */
+    bRerunMD = (Flags & MD_RERUN);
+    if (Flags & MD_RESETCOUNTERSHALFWAY)
+    {
+        if (ir->nsteps > 0)
+        {
+            /* Signal to reset the counters after half of the simulation steps. */
+            wcycle_set_reset_counters(wcycle, ir->nsteps/2);
+        }
+        /* Signal to reset the counters halfway through the simulation time. */
+        bResetCountersHalfMaxH = (max_hours > 0);
+    }
+
+    /* md-vv uses averaged full step velocities for T-control
+       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+    bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
+
+    if (bRerunMD)
+    {
+        /* Since we don't know if the frames read are related in any way,
+         * rebuild the neighborlist at every step.
+         */
+        ir->nstlist       = 1;
+        ir->nstcalcenergy = 1;
+        nstglobalcomm     = 1;
+    }
+
+    nstglobalcomm   = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
+    bGStatEveryStep = (nstglobalcomm == 1);
+
+    if (bRerunMD)
+    {
+        ir->nstxout_compressed = 0;
+    }
+    groups = &top_global->groups;
+
+    if (opt2bSet("-membed", nfile, fnm))
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Initializing membed");
+        }
+        /* Note that membed cannot work in parallel because mtop is
+         * changed here. Fix this if we ever want to make it run with
+         * multiple ranks. */
+        membed = init_membed(fplog, nfile, fnm, top_global, ir, state_global, cr, &cpt_period);
+    }
+
+    if (ir->eSwapCoords != eswapNO)
+    {
+        /* Initialize ion swapping code */
+        init_swapcoords(fplog, bVerbose, ir, opt2fn_master("-swap", nfile, fnm, cr),
+                        top_global, state_global->x, state_global->box, &state_global->swapstate, cr, oenv, Flags);
+    }
+
+    /* Initial values */
+    init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda,
+            &(state_global->fep_state), lam0,
+            nrnb, top_global, &upd,
+            nfile, fnm, &outf, &mdebin,
+            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags, wcycle);
+
+    clear_mat(total_vir);
+    clear_mat(pres);
+    /* Energy terms and groups */
+    snew(enerd, 1);
+    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+                  enerd);
+    if (DOMAINDECOMP(cr))
+    {
+        f = NULL;
+    }
+    else
+    {
+        snew(f, top_global->natoms);
+    }
+
+    /* Kinetic energy data */
+    snew(ekind, 1);
+    init_ekindata(fplog, top_global, &(ir->opts), ekind);
+    /* Copy the cos acceleration to the groups struct */
+    ekind->cosacc.cos_accel = ir->cos_accel;
+
+    gstat = global_stat_init(ir);
+
+    /* Check for polarizable models and flexible constraints */
+    shellfc = init_shell_flexcon(fplog,
+                                 top_global, n_flexible_constraints(constr),
+                                 ir->nstcalcenergy, DOMAINDECOMP(cr));
+
+    if (shellfc && ir->nstcalcenergy != 1)
+    {
+        gmx_fatal(FARGS, "You have nstcalcenergy set to a value (%d) that is different from 1.\nThis is not supported in combinations with shell particles.\nPlease make a new tpr file.", ir->nstcalcenergy);
+    }
+    if (shellfc && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "Shell particles are not implemented with domain decomposition, use a single rank");
+    }
+
+    if (inputrecDeform(ir))
+    {
+        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+        set_deform_reference_box(upd,
+                                 deform_init_init_step_tpx,
+                                 deform_init_box_tpx);
+        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+    }
+
+    {
+        double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+        if ((io > 2000) && MASTER(cr))
+        {
+            fprintf(stderr,
+                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+                    io);
+        }
+    }
+
+    if (DOMAINDECOMP(cr))
+    {
+        top = dd_init_local_top(top_global);
+
+        snew(state, 1);
+        dd_init_local_state(cr->dd, state_global, state);
+    }
+    else
+    {
+        top = gmx_mtop_generate_local_top(top_global, ir->efep != efepNO);
+
+        forcerec_set_excl_load(fr, top);
+
+        state    = serial_init_local_state(state_global);
+
+        atoms2md(top_global, ir, 0, NULL, top_global->natoms, mdatoms);
+
+        if (vsite)
+        {
+            set_vsite_top(vsite, top, mdatoms, cr);
+        }
+
+        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
+        {
+            graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE);
+        }
+
+        if (shellfc)
+        {
+            make_local_shells(cr, mdatoms, shellfc);
+        }
+
+        setup_bonded_threading(fr, &top->idef);
+
+        update_realloc(upd, state->nalloc);
+    }
+
+    /* Set up interactive MD (IMD) */
+    init_IMD(ir, cr, top_global, fplog, ir->nstcalcenergy, state_global->x,
+             nfile, fnm, oenv, imdport, Flags);
+
+    if (DOMAINDECOMP(cr))
+    {
+        /* Distribute the charge groups over the nodes from the master node */
+        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
+                            state_global, top_global, ir,
+                            state, &f, mdatoms, top, fr,
+                            vsite, constr,
+                            nrnb, NULL, FALSE);
+        shouldCheckNumberOfBondedInteractions = true;
+        update_realloc(upd, state->nalloc);
+    }
+
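+    /* Update the mass-dependent atom data (inverse masses etc.) for the current mass-perturbation lambda */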
+    update_mdatoms(mdatoms, state->lambda[efptMASS]);
+
+    startingFromCheckpoint = Flags & MD_STARTFROMCPT;
+
+    if (ir->bExpanded)
+    {
+        init_expanded_ensemble(startingFromCheckpoint, ir, &state->dfhist);
+    }
+
+    if (MASTER(cr))
+    {
+        if (startingFromCheckpoint)
+        {
+            /* Update mdebin with energy history if appending to output files */
+            if (Flags & MD_APPENDFILES)
+            {
+                restore_energyhistory_from_state(mdebin, state_global->enerhist);
+            }
+            else
+            {
+                /* We might have read an energy history from checkpoint,
+                 * free the allocated memory and reset the counts.
+                 */
+                done_energyhistory(state_global->enerhist);
+                init_energyhistory(state_global->enerhist);
+            }
+        }
+        /* Set the initial energy history in state by updating once */
+        update_energyhistory(state_global->enerhist, mdebin);
+    }
+
+    /* Initialize constraints */
+    if (constr && !DOMAINDECOMP(cr))
+    {
+        set_constraints(constr, top, ir, mdatoms, cr);
+    }
+
+    if (repl_ex_nst > 0 && MASTER(cr))
+    {
+        repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
+                                        repl_ex_nst, repl_ex_nex, repl_ex_seed);
+    }
+
+    /* PME tuning is only supported with PME for Coulomb. It is not supported
+     * with only LJ PME, or for reruns.
+     */
+    bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD &&
+                !(Flags & MD_REPRODUCIBLE));
+    if (bPMETune)
+    {
+        pme_loadbal_init(&pme_loadbal, cr, fplog, ir, state->box,
+                         fr->ic, fr->pmedata, use_GPU(fr->nbv),
+                         &bPMETunePrinting);
+    }
+
+    if (!ir->bContinuation && !bRerunMD)
+    {
+        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
+        {
+            /* Set the velocities of frozen particles to zero */
+            for (i = 0; i < mdatoms->homenr; i++)
+            {
+                for (m = 0; m < DIM; m++)
+                {
+                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+                    {
+                        state->v[i][m] = 0;
+                    }
+                }
+            }
+        }
+
+        if (constr)
+        {
+            /* Constrain the initial coordinates and velocities */
+            do_constrain_first(fplog, constr, ir, mdatoms, state,
+                               cr, nrnb, fr, top);
+        }
+        if (vsite)
+        {
+            /* Construct the virtual sites for the initial configuration */
+            construct_vsites(vsite, state->x, ir->delta_t, NULL,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
+        }
+    }
+
+    if (ir->efep != efepNO)
+    {
+        /* Set the free energy calculation frequency as the greatest common
+         * divisor of nstdhdl, nstexpanded (if used) and repl_ex_nst. */
+        nstfep = ir->fepvals->nstdhdl;
+        if (ir->bExpanded)
+        {
+            nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
+        }
+        if (repl_ex_nst > 0)
+        {
+            nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep);
+        }
+    }
+
+    /* Be REALLY careful about what flags you set here. You CANNOT assume
+     * this is the first step, since we might be restarting from a checkpoint,
+     * and in that case we should not do any modifications to the state.
+     */
+    bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
+
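+    /* Restore the kinetic energy state from the checkpoint when mdrun was asked to read it */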
+    if (Flags & MD_READ_EKIN)
+    {
+        restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
+    }
+
+    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
+                  | (bStopCM ? CGLO_STOPCM : 0)
+                  | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
+                  | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0)
+                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0));
+
+    bSumEkinhOld = FALSE;
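+    /* Compute the initial global observables: kinetic energy, temperature and, for VV, pressure and constraint virial */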
+    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                    NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                    constr, NULL, FALSE, state->box,
+                    &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags
+                    | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0));
+    checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions,
+                                    top_global, top, state,
+                                    &shouldCheckNumberOfBondedInteractions);
+    if (ir->eI == eiVVAK)
+    {
+        /* a second call to get the half step temperature initialized as well */
+        /* we do the same call as above, but turn the pressure off -- internally to
+           compute_globals, this is recognized as a velocity verlet half-step
+               kinetic energy calculation.  This minimizes excess variables, but
+               perhaps loses some logic? */
+
+        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                        NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                        constr, NULL, FALSE, state->box,
+                        NULL, &bSumEkinhOld,
+                        cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE));
+    }
+
+    /* Calculate the initial half step temperature, and save the ekinh_old */
+    if (!(Flags & MD_STARTFROMCPT))
+    {
+        for (i = 0; (i < ir->opts.ngtc); i++)
+        {
+            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+        }
+    }
+    if (ir->eI != eiVV)
+    {
+        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
+                                     and there is no previous step */
+    }
+
+    /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
+       temperature control */
+    trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
+
+    if (MASTER(cr))
+    {
+        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
+        {
+            fprintf(fplog,
+                    "RMS relative constraint deviation after constraining: %.2e\n",
+                    constr_rmsd(constr));
+        }
+        if (EI_STATE_VELOCITY(ir->eI))
+        {
+            fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]);
+        }
+        if (bRerunMD)
+        {
+            fprintf(stderr, "starting md rerun '%s', reading coordinates from"
+                    " input trajectory '%s'\n\n",
+                    *(top_global->name), opt2fn("-rerun", nfile, fnm));
+            if (bVerbose)
+            {
+                fprintf(stderr, "Calculated time to finish depends on nsteps from "
+                        "run input file,\nwhich may not correspond to the time "
+                        "needed to process input trajectory.\n\n");
+            }
+        }
+        else
+        {
+            char tbuf[20];
+            fprintf(stderr, "starting mdrun '%s'\n",
+                    *(top_global->name));
+            if (ir->nsteps >= 0)
+            {
+                sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
+            }
+            else
+            {
+                sprintf(tbuf, "%s", "infinite");
+            }
+            if (ir->init_step > 0)
+            {
+                fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+                        gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
+                        gmx_step_str(ir->init_step, sbuf2),
+                        ir->init_step*ir->delta_t);
+            }
+            else
+            {
+                fprintf(stderr, "%s steps, %s ps.\n",
+                        gmx_step_str(ir->nsteps, sbuf), tbuf);
+            }
+        }
+        fprintf(fplog, "\n");
+    }
+
+    walltime_accounting_start(walltime_accounting);
+    wallcycle_start(wcycle, ewcRUN);
+    print_start(fplog, cr, walltime_accounting, "mdrun");
+
+    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
+#ifdef GMX_FAHCORE
+    chkpt_ret = fcCheckPointParallel( cr->nodeid,
+                                      NULL, 0);
+    if (chkpt_ret == 0)
+    {
+        gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
+    }
+#endif
+
+    /***********************************************************
+     *
+     *             Loop over MD steps
+     *
+     ************************************************************/
+
+    /* if rerunMD then read coordinates and velocities from input trajectory */
+    if (bRerunMD)
+    {
+        if (getenv("GMX_FORCE_UPDATE"))
+        {
+            bForceUpdate = TRUE;
+        }
+
+        rerun_fr.natoms = 0;
+        if (MASTER(cr))
+        {
+            bLastStep = !read_first_frame(oenv, &status,
+                                          opt2fn("-rerun", nfile, fnm),
+                                          &rerun_fr, TRX_NEED_X | TRX_READ_V);
+            if (rerun_fr.natoms != top_global->natoms)
+            {
+                gmx_fatal(FARGS,
+                          "Number of atoms in trajectory (%d) does not match the "
+                          "run input file (%d)\n",
+                          rerun_fr.natoms, top_global->natoms);
+            }
+            if (ir->ePBC != epbcNONE)
+            {
+                if (!rerun_fr.bBox)
+                {
+                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time);
+                }
+                if (max_cutoff2(ir->ePBC, rerun_fr.box) < gmx::square(fr->rlist))
+                {
+                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time);
+                }
+            }
+        }
+
+        if (PAR(cr))
+        {
+            rerun_parallel_comm(cr, &rerun_fr, &bLastStep);
+        }
+
+        if (ir->ePBC != epbcNONE)
+        {
+            /* Set the shift vectors.
+             * Necessary here when we have a static box different from the tpr box.
+             */
+            calc_shifts(rerun_fr.box, fr->shift_vec);
+        }
+    }
+
+    /* loop over MD steps or if rerunMD to end of input trajectory */
+    bFirstStep = TRUE;
+    /* Skip the first Nose-Hoover integration when we get the state from tpx */
+    bInitStep        = !startingFromCheckpoint || EI_VV(ir->eI);
+    bSumEkinhOld     = FALSE;
+    bExchanged       = FALSE;
+    bNeedRepartition = FALSE;
+
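+    /* Set up the inter-rank signalling used to communicate stop, checkpoint and counter-reset requests */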
+    init_global_signals(&gs, cr, ir, repl_ex_nst);
+
+    step     = ir->init_step;
+    step_rel = 0;
+
+    if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
+    {
+        /* check how many steps are left in other sims */
+        multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps);
+    }
+
+
+    /* and stop now if we should */
+    bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
+                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
+    while (!bLastStep)
+    {
+
+        /* Determine if this is a neighbor search step */
+        bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+
+        if (bPMETune && bNStList)
+        {
+            /* PME grid + cut-off optimization with GPUs or PME nodes */
+            pme_loadbal_do(pme_loadbal, cr,
+                           (bVerbose && MASTER(cr)) ? stderr : NULL,
+                           fplog,
+                           ir, fr, state,
+                           wcycle,
+                           step, step_rel,
+                           &bPMETunePrinting);
+        }
+
+        wallcycle_start(wcycle, ewcSTEP);
+
+        if (bRerunMD)
+        {
+            if (rerun_fr.bStep)
+            {
+                step     = rerun_fr.step;
+                step_rel = step - ir->init_step;
+            }
+            if (rerun_fr.bTime)
+            {
+                t = rerun_fr.time;
+            }
+            else
+            {
+                t = step;
+            }
+        }
+        else
+        {
+            bLastStep = (step_rel == ir->nsteps);
+            t         = t0 + step*ir->delta_t;
+        }
+
+        if (ir->efep != efepNO || ir->bSimTemp)
+        {
+            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
+               requiring different logic. */
+
+            set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
+            bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
+            bDoFEP       = ((ir->efep != efepNO) && do_per_step(step, nstfep));
+            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
+                            && (ir->bExpanded) && (step > 0) && (!startingFromCheckpoint));
+        }
+
+        bDoReplEx = ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+                     do_per_step(step, repl_ex_nst));
+
+        if (bSimAnn)
+        {
+            update_annealing_target_temp(ir, t, upd);
+        }
+
+        if (bRerunMD)
+        {
+            if (!DOMAINDECOMP(cr) || MASTER(cr))
+            {
+                for (i = 0; i < state_global->natoms; i++)
+                {
+                    copy_rvec(rerun_fr.x[i], state_global->x[i]);
+                }
+                if (rerun_fr.bV)
+                {
+                    for (i = 0; i < state_global->natoms; i++)
+                    {
+                        copy_rvec(rerun_fr.v[i], state_global->v[i]);
+                    }
+                }
+                else
+                {
+                    for (i = 0; i < state_global->natoms; i++)
+                    {
+                        clear_rvec(state_global->v[i]);
+                    }
+                    if (bRerunWarnNoV)
+                    {
+                        fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n"
+                                "         Ekin, temperature and pressure are incorrect,\n"
+                                "         the virial will be incorrect when constraints are present.\n"
+                                "\n");
+                        bRerunWarnNoV = FALSE;
+                    }
+                }
+            }
+            copy_mat(rerun_fr.box, state_global->box);
+            copy_mat(state_global->box, state->box);
+
+            if (vsite && (Flags & MD_RERUN_VSITE))
+            {
+                if (DOMAINDECOMP(cr))
+                {
+                    gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented with domain decomposition, use a single rank");
+                }
+                if (graph)
+                {
+                    /* Following is necessary because the graph may get out of sync
+                     * with the coordinates if we only have every N'th coordinate set
+                     */
+                    mk_mshift(fplog, graph, fr->ePBC, state->box, state->x);
+                    shift_self(graph, state->box, state->x);
+                }
+                construct_vsites(vsite, state->x, ir->delta_t, state->v,
+                                 top->idef.iparams, top->idef.il,
+                                 fr->ePBC, fr->bMolPBC, cr, state->box);
+                if (graph)
+                {
+                    unshift_self(graph, state->box, state->x);
+                }
+            }
+        }
+
+        /* Stop Center of Mass motion */
+        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
+
+        if (bRerunMD)
+        {
+            /* for rerun MD always do Neighbour Searching */
+            bNS      = (bFirstStep || ir->nstlist != 0);
+            bNStList = bNS;
+        }
+        else
+        {
+            /* Determine whether or not to do Neighbour Searching */
+            bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
+        }
+
+        /* check whether we should stop because another simulation has
+           stopped. */
+        if (MULTISIM(cr))
+        {
+            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&
+                 (multisim_nsteps != ir->nsteps) )
+            {
+                if (bNS)
+                {
+                    if (MASTER(cr))
+                    {
+                        fprintf(stderr,
+                                "Stopping simulation %d because another one has finished\n",
+                                cr->ms->sim);
+                    }
+                    bLastStep         = TRUE;
+                    gs.sig[eglsCHKPT] = 1;
+                }
+            }
+        }
+
+        /* < 0 means stop after this step, > 0 means stop at next NS step */
+        if ( (gs.set[eglsSTOPCOND] < 0) ||
+             ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) )
+        {
+            bLastStep = TRUE;
+        }
+
+        /* Determine whether or not to update the Born radii if doing GB */
+        bBornRadii = bFirstStep;
+        if (ir->implicit_solvent && (step % ir->nstgbradii == 0))
+        {
+            bBornRadii = TRUE;
+        }
+
+        /* do_log triggers energy and virial calculation. Because this leads
+         * to different code paths, forces can be different. Thus for exact
+         * continuation we should avoid extra log output.
+         * Note that the || bLastStep can result in non-exact continuation
+         * beyond the last step. But we don't consider that to be an issue.
+         */
+        do_log     = do_per_step(step, ir->nstlog) || (bFirstStep && !startingFromCheckpoint) || bLastStep || bRerunMD;
+        do_verbose = bVerbose &&
+            (step % stepout == 0 || bFirstStep || bLastStep || bRerunMD);
+
+        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
+        {
+            if (bRerunMD)
+            {
+                bMasterState = TRUE;
+            }
+            else
+            {
+                bMasterState = FALSE;
+                /* Correct the new box if it is too skewed */
+                if (inputrecDynamicBox(ir))
+                {
+                    if (correct_box(fplog, step, state->box, graph))
+                    {
+                        bMasterState = TRUE;
+                    }
+                }
+                if (DOMAINDECOMP(cr) && bMasterState)
+                {
+                    dd_collect_state(cr->dd, state, state_global);
+                }
+            }
+
+            if (DOMAINDECOMP(cr))
+            {
+                /* Repartition the domain decomposition */
+                dd_partition_system(fplog, step, cr,
+                                    bMasterState, nstglobalcomm,
+                                    state_global, top_global, ir,
+                                    state, &f, mdatoms, top, fr,
+                                    vsite, constr,
+                                    nrnb, wcycle,
+                                    do_verbose && !bPMETunePrinting);
+                shouldCheckNumberOfBondedInteractions = true;
+                update_realloc(upd, state->nalloc);
+            }
+        }
+
+        if (MASTER(cr) && do_log)
+        {
+            print_ebin_header(fplog, step, t); /* can we improve the information printed here? */
+        }
+
+        if (ir->efep != efepNO)
+        {
+            update_mdatoms(mdatoms, state->lambda[efptMASS]);
+        }
+
+        if ((bRerunMD && rerun_fr.bV) || bExchanged)
+        {
+
+            /* We need the kinetic energy at minus the half step for determining
+             * the full step kinetic energy and possibly for T-coupling.*/
+            /* This may not be quite working correctly yet . . . . */
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                            constr, NULL, FALSE, state->box,
+                            &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                            CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
+            checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions,
+                                            top_global, top, state,
+                                            &shouldCheckNumberOfBondedInteractions);
+        }
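+        /* Reset the force virial accumulator before this step's force calculation */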
+        clear_mat(force_vir);
+
+        /* We write a checkpoint at this MD step when:
+         * either at an NS step when we signalled through gs,
+         * or at the last step (but not when we do not want confout),
+         * but never at the first step or with rerun.
+         */
+        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
+                 (bLastStep && (Flags & MD_CONFOUT))) &&
+                step > ir->init_step && !bRerunMD);
+        if (bCPT)
+        {
+            gs.set[eglsCHKPT] = 0;
+        }
+
+        /* Determine the energy and pressure:
+         * at nstcalcenergy steps and at energy output steps (set below).
+         */
+        if (EI_VV(ir->eI) && (!bInitStep))
+        {
+            /* for vv, the first half of the integration actually corresponds
+               to the previous step.  bCalcEner is only required to be evaluated on the 'next' step,
+               but the virial needs to be calculated on both the current step and the 'next' step. Future
+               reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
+
+            /* TODO: This is probably not what we want, we will write to energy file one step after nstcalcenergy steps. */
+            bCalcEnerStep = do_per_step(step - 1, ir->nstcalcenergy);
+            bCalcVir      = bCalcEnerStep ||
+                (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+        }
+        else
+        {
+            bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
+            bCalcVir      = bCalcEnerStep ||
+                (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+        }
+        bCalcEner = bCalcEnerStep;
+
+        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep || bRerunMD);
+
+        if (do_ene || do_log || bDoReplEx)
+        {
+            bCalcVir  = TRUE;
+            bCalcEner = TRUE;
+        }
+
+        /* Do we need global communication ? */
+        bGStat = (bCalcVir || bCalcEner || bStopCM ||
+                  do_per_step(step, nstglobalcomm) ||
+                  (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm)));
+
+        force_flags = (GMX_FORCE_STATECHANGED |
+                       ((inputrecDynamicBox(ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
+                       GMX_FORCE_ALLFORCES |
+                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+                       (bDoFEP ? GMX_FORCE_DHDL : 0)
+                       );
+
+        if (shellfc)
+        {
+            /* Now is the time to relax the shells */
+            relax_shell_flexcon(fplog, cr, bVerbose, step,
+                                ir, bNS, force_flags, top,
+                                constr, enerd, fcd,
+                                state, f, force_vir, mdatoms,
+                                nrnb, wcycle, graph, groups,
+                                shellfc, fr, bBornRadii, t, mu_tot,
+                                vsite, mdoutf_get_fp_field(outf));
+        }
+        else
+        {
+            /* The coordinates (x) are shifted (to get whole molecules)
+             * in do_force.
+             * This is parallelized as well, and does communication too.
+             * Check comments in sim_util.c
+             */
+            do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups,
+                     state->box, state->x, &state->hist,
+                     f, force_vir, mdatoms, enerd, fcd,
+                     state->lambda, graph,
+                     fr, vsite, mu_tot, t, mdoutf_get_fp_field(outf), ed, bBornRadii,
+                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
+        }
+
+        if (EI_VV(ir->eI) && !startingFromCheckpoint && !bRerunMD)
+        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
+        {
+            rvec *vbuf = NULL;
+
+            wallcycle_start(wcycle, ewcUPDATE);
+            if (ir->eI == eiVV && bInitStep)
+            {
+                /* if using velocity verlet with full time step Ekin,
+                 * take the first half step only to compute the
+                 * virial for the first step. From there,
+                 * revert back to the initial coordinates
+                 * so that the input is actually the initial step.
+                 */
+                snew(vbuf, state->natoms);
+                copy_rvecn(state->v, vbuf, 0, state->natoms); /* should make this better for parallelizing? */
+            }
+            else
+            {
+                /* this is for NHC in the Ekin(t+dt/2) version of vv */
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
+            }
+
+            update_coords(fplog, step, ir, mdatoms, state, f, fcd,
+                          ekind, M, upd, etrtVELOCITY1,
+                          cr, constr);
+
+            if (!bRerunMD || rerun_fr.bV || bForceUpdate)         /* Why is rerun_fr.bV here?  Unclear. */
+            {
+                wallcycle_stop(wcycle, ewcUPDATE);
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, shake_vir,
+                                   cr, nrnb, wcycle, upd, constr,
+                                   TRUE, bCalcVir);
+                wallcycle_start(wcycle, ewcUPDATE);
+            }
+            else if (graph)
+            {
+                /* Need to unshift here if a do_force has been
+                   called in the previous step */
+                unshift_self(graph, state->box, state->x);
+            }
+            /* if VV, compute the pressure and constraints */
+            /* For VV2, we strictly only need this if using pressure
+             * control, but we really would like to have accurate pressures
+             * printed out.
+             * Think about ways around this in the future?
+             * For now, keep this choice in comments.
+             */
+            /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
+            /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
+            bPres = TRUE;
+            bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+            if (bCalcEner && ir->eI == eiVVAK)
+            {
+                bSumEkinhOld = TRUE;
+            }
+            /* for vv, the first half of the integration actually corresponds to the previous step.
+               So we need information from the last step in the first half of the integration */
+            if (bGStat || do_per_step(step-1, nstglobalcomm))
+            {
+                wallcycle_stop(wcycle, ewcUPDATE);
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, NULL, FALSE, state->box,
+                                &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                                (bGStat ? CGLO_GSTAT : 0)
+                                | CGLO_ENERGY
+                                | (bTemp ? CGLO_TEMPERATURE : 0)
+                                | (bPres ? CGLO_PRESSURE : 0)
+                                | (bPres ? CGLO_CONSTRAINT : 0)
+                                | (bStopCM ? CGLO_STOPCM : 0)
+                                | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
+                                | CGLO_SCALEEKIN
+                                );
+                /* explanation of above:
+                   a) We compute Ekin at the full time step
+                   if 1) we are using the AveVel Ekin, and it's not the
+                   initial step, or 2) if we are using AveEkin, but need the full
+                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+                   EkinAveVel because it's needed for the pressure */
+                checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions,
+                                                top_global, top, state,
+                                                &shouldCheckNumberOfBondedInteractions);
+                wallcycle_start(wcycle, ewcUPDATE);
+            }
+            /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+            if (!bInitStep)
+            {
+                if (bTrotter)
+                {
+                    m_add(force_vir, shake_vir, total_vir);     /* we need the un-dispersion corrected total vir here */
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
+
+                    copy_mat(shake_vir, state->svir_prev);
+                    copy_mat(force_vir, state->fvir_prev);
+                    if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
+                    {
+                        /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+                        enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE);
+                        enerd->term[F_EKIN] = trace(ekind->ekin);
+                    }
+                }
+                else if (bExchanged)
+                {
+                    wallcycle_stop(wcycle, ewcUPDATE);
+                    /* We need the kinetic energy at minus the half step for determining
+                     * the full step kinetic energy and possibly for T-coupling.*/
+                    /* This may not be quite working correctly yet . . . . */
+                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                    wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                                    constr, NULL, FALSE, state->box,
+                                    NULL, &bSumEkinhOld,
+                                    CGLO_GSTAT | CGLO_TEMPERATURE);
+                    wallcycle_start(wcycle, ewcUPDATE);
+                }
+            }
+            /* if it's the initial step, we performed this first step just to get the constraint virial */
+            if (ir->eI == eiVV && bInitStep)
+            {
+                copy_rvecn(vbuf, state->v, 0, state->natoms);
+                sfree(vbuf);
+            }
+            wallcycle_stop(wcycle, ewcUPDATE);
+        }
+
+        /* compute the conserved quantity */
+        if (EI_VV(ir->eI))
+        {
+            saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
+            if (ir->eI == eiVV)
+            {
+                last_ekin = enerd->term[F_EKIN];
+            }
+            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+            {
+                saved_conserved_quantity -= enerd->term[F_DISPCORR];
+            }
+            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
+            if (ir->efep != efepNO && !bRerunMD)
+            {
+                sum_dhdl(enerd, state->lambda, ir->fepvals);
+            }
+        }
+
+        /* ########  END FIRST UPDATE STEP  ############## */
+        /* ########  If doing VV, we now have v(dt) ###### */
+        if (bDoExpanded)
+        {
+            /* perform extended ensemble sampling in lambda - we don't
+               actually move to the new state before outputting
+               statistics, but if performing simulated tempering, we
+               do update the velocities and the tau_t. */
+
+            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, state->v, mdatoms);
+            /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
+            copy_df_history(&state_global->dfhist, &state->dfhist);
+        }
+
+        /* Now we have the energies and forces corresponding to the
+         * coordinates at time t. We must output all of this before
+         * the update.
+         */
+        do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
+                                 ir, state, state_global, top_global, fr,
+                                 outf, mdebin, ekind, f,
+                                 &nchkpt,
+                                 bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT),
+                                 bSumEkinhOld);
+        /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
+        bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, state->x, ir, t, wcycle);
+
+        /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
+        if (startingFromCheckpoint && bTrotter)
+        {
+            copy_mat(state->svir_prev, shake_vir);
+            copy_mat(state->fvir_prev, force_vir);
+        }
+
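+        /* Elapsed wall-clock time, used below for the max_hours stop and counter-reset checks */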
+        elapsed_time = walltime_accounting_get_current_elapsed_time(walltime_accounting);
+
+        /* Check whether everything is still allright */
+        if (((int)gmx_get_stop_condition() > handled_stop_condition)
+#if GMX_THREAD_MPI
+            && MASTER(cr)
+#endif
+            )
+        {
+            int nsteps_stop = -1;
+
+            /* this is just to make gs.sig compatible with the hack
+               of sending signals around by MPI_Reduce together with
+               other floats */
+            if (gmx_get_stop_condition() == gmx_stop_cond_next_ns)
+            {
+                gs.sig[eglsSTOPCOND] = 1;
+                nsteps_stop          = std::max(ir->nstlist, 2*nstglobalcomm);
+            }
+            if (gmx_get_stop_condition() == gmx_stop_cond_next)
+            {
+                gs.sig[eglsSTOPCOND] = -1;
+                nsteps_stop          = nstglobalcomm + 1;
+            }
+            if (fplog)
+            {
+                fprintf(fplog,
+                        "\n\nReceived the %s signal, stopping within %d steps\n\n",
+                        gmx_get_signal_name(), nsteps_stop);
+                fflush(fplog);
+            }
+            fprintf(stderr,
+                    "\n\nReceived the %s signal, stopping within %d steps\n\n",
+                    gmx_get_signal_name(), nsteps_stop);
+            fflush(stderr);
+            handled_stop_condition = (int)gmx_get_stop_condition();
+        }
+        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
+                 (max_hours > 0 && elapsed_time > max_hours*60.0*60.0*0.99) &&
+                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
+        {
+            /* Signal to terminate the run */
+            gs.sig[eglsSTOPCOND] = 1;
+            if (fplog)
+            {
+                fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+            }
+            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+        }
+
+        if (bResetCountersHalfMaxH && MASTER(cr) &&
+            elapsed_time > max_hours*60.0*60.0*0.495)
+        {
+            /* Set flag that will communicate the signal to all ranks in the simulation */
+            gs.sig[eglsRESETCOUNTERS] = 1;
+        }
+
+        /* In parallel we only have to check for checkpointing in steps
+         * where we do global communication,
+         *  otherwise the other nodes don't know.
+         */
+        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
+                           cpt_period >= 0 &&
+                           (cpt_period == 0 ||
+                            elapsed_time >= nchkpt*cpt_period*60.0)) &&
+            gs.set[eglsCHKPT] == 0)
+        {
+            gs.sig[eglsCHKPT] = 1;
+        }
+
+        /* #########   START SECOND UPDATE STEP ################# */
+
+        /* at the start of step, randomize or scale the velocities (if vv). Restrictions on the Andersen thermostat
+           are enforced in preprocessing */
+
+        if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+        {
+            gmx_bool bIfRandomize;
+            bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state, upd, constr);
+            /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+            if (constr && bIfRandomize)
+            {
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, tmp_vir,
+                                   cr, nrnb, wcycle, upd, constr,
+                                   TRUE, bCalcVir);
+            }
+        }
+        /* Box is changed in update() when we do pressure coupling,
+         * but we should still use the old box for energy corrections and when
+         * writing it to the energy file, so it matches the trajectory files for
+         * the same timestep above. Make a copy in a separate array.
+         */
+        copy_mat(state->box, lastbox);
+
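+        /* Accumulates this step's constraint contribution to dH/dlambda */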
+        dvdl_constr = 0;
+
+        if (!bRerunMD || rerun_fr.bV || bForceUpdate)
+        {
+            wallcycle_start(wcycle, ewcUPDATE);
+            /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+            if (bTrotter)
+            {
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+                /* We can only do Berendsen coupling after we have summed
+                 * the kinetic energy or virial. Since this happens
+                 * in global_state after update, we should only do it at
+                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
+                 */
+            }
+            else
+            {
+                update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
+                update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep);
+            }
+
+            if (EI_VV(ir->eI))
+            {
+                /* velocity half-step update */
+                update_coords(fplog, step, ir, mdatoms, state, f, fcd,
+                              ekind, M, upd, etrtVELOCITY2,
+                              cr, constr);
+            }
+
+            /* Above, initialization just copies ekinh into ekin;
+             * it doesn't copy the positions (for VV),
+             * nor the entire integrator state (for MD).
+             */
+
+            if (ir->eI == eiVVAK)
+            {
+                /* We probably only need md->homenr, not state->natoms */
+                if (state->natoms > cbuf_nalloc)
+                {
+                    cbuf_nalloc = state->natoms;
+                    srenew(cbuf, cbuf_nalloc);
+                }
+                copy_rvecn(state->x, cbuf, 0, state->natoms);
+            }
+
+            update_coords(fplog, step, ir, mdatoms, state, f, fcd,
+                          ekind, M, upd, etrtPOSITION, cr, constr);
+            wallcycle_stop(wcycle, ewcUPDATE);
+
+            update_constraints(fplog, step, &dvdl_constr, ir, mdatoms, state,
+                               fr->bMolPBC, graph, f,
+                               &top->idef, shake_vir,
+                               cr, nrnb, wcycle, upd, constr,
+                               FALSE, bCalcVir);
+
+            if (ir->eI == eiVVAK)
+            {
+                /* erase F_EKIN and F_TEMP here? */
+                /* just compute the kinetic energy at the half step to perform a trotter step */
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, NULL, FALSE, lastbox,
+                                NULL, &bSumEkinhOld,
+                                (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE
+                                );
+                wallcycle_start(wcycle, ewcUPDATE);
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+                /* now we know the scaling, we can compute the positions again */
+                copy_rvecn(cbuf, state->x, 0, state->natoms);
+
+                update_coords(fplog, step, ir, mdatoms, state, f, fcd,
+                              ekind, M, upd, etrtPOSITION, cr, constr);
+                wallcycle_stop(wcycle, ewcUPDATE);
+
+                /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+                /* are the small terms in the shake_vir here due
+                 * to numerical errors, or are they important
+                 * physically? I'm thinking they are just errors, but not completely sure.
+                 * For now, will call without actually constraining, constr=NULL*/
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, tmp_vir,
+                                   cr, nrnb, wcycle, upd, NULL,
+                                   FALSE, bCalcVir);
+            }
+            if (EI_VV(ir->eI))
+            {
+                /* this factor of 2 correction is necessary
+                   because half of the constraint force is removed
+                   in the vv step, so we have to double it.  See
+                   the Redmine issue #1255.  It is not yet clear
+                   if the factor of 2 is exact, or just a very
+                   good approximation, and this will be
+                   investigated.  The next step is to see if this
+                   can be done adding a dhdl contribution from the
+                   rattle step, but this is somewhat more
+                   complicated with the current code. Will be
+                   investigated, hopefully for 4.6.3. However,
+                   this current solution is much better than
+                   having it completely wrong.
+                 */
+                enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+            }
+            else
+            {
+                enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+            }
+        }
+        else if (graph)
+        {
+            /* Need to unshift here */
+            unshift_self(graph, state->box, state->x);
+        }
+
+        if (vsite != NULL)
+        {
+            wallcycle_start(wcycle, ewcVSITECONSTR);
+            if (graph != NULL)
+            {
+                shift_self(graph, state->box, state->x);
+            }
+            construct_vsites(vsite, state->x, ir->delta_t, state->v,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
+
+            if (graph != NULL)
+            {
+                unshift_self(graph, state->box, state->x);
+            }
+            wallcycle_stop(wcycle, ewcVSITECONSTR);
+        }
+
+        /* ############## IF NOT VV, Calculate globals HERE  ############ */
+        /* With Leap-Frog we can skip compute_globals at
+         * non-communication steps, but we need to calculate
+         * the kinetic energy one step before communication.
+         */
+        if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+        {
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, mdatoms, nrnb, vcm,
+                            wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                            constr, &gs,
+                            (step_rel % gs.nstms == 0) &&
+                            (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+                            lastbox,
+                            &totalNumberOfBondedInteractions, &bSumEkinhOld,
+                            (bGStat ? CGLO_GSTAT : 0)
+                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+                            | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+                            | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+                            | CGLO_CONSTRAINT
+                            | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
+                            );
+            checkNumberOfBondedInteractions(fplog, cr, totalNumberOfBondedInteractions,
+                                            top_global, top, state,
+                                            &shouldCheckNumberOfBondedInteractions);
+        }
+
+        /* #############  END CALC EKIN AND PRESSURE ################# */
+
+        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+           the virial that should probably be addressed eventually. state->veta has better properties,
+           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+
+        if (ir->efep != efepNO && (!EI_VV(ir->eI) || bRerunMD))
+        {
+            /* Sum up the foreign energy and dhdl terms for md and sd.
+               Currently done every step so that dhdl is correct in the .edr */
+            sum_dhdl(enerd, state->lambda, ir->fepvals);
+        }
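+        /* Update the box for pressure coupling, using the scaling computed above (pcoupl_mu) */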
+        update_box(fplog, step, ir, mdatoms, state, f,
+                   pcoupl_mu, nrnb, upd);
+
+        /* ################# END UPDATE STEP 2 ################# */
+        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
+
+        /* The coordinates (x) were unshifted in update */
+        if (!bGStat)
+        {
+            /* We will not sum ekinh_old,
+             * so signal that we still have to do it.
+             */
+            bSumEkinhOld = TRUE;
+        }
+
+        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
+
+        /* use the directly determined last velocity, not actually the averaged half steps */
+        if (bTrotter && ir->eI == eiVV)
+        {
+            enerd->term[F_EKIN] = last_ekin;
+        }
+        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+        if (EI_VV(ir->eI))
+        {
+            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+        }
+        else
+        {
+            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ);
+        }
+        /* #########  END PREPARING EDR OUTPUT  ###########  */
+
+        /* Output stuff */
+        if (MASTER(cr))
+        {
+            if (fplog && do_log && bDoExpanded)
+            {
+                /* only needed if doing expanded ensemble */
+                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
+                                          &state_global->dfhist, state->fep_state, ir->nstlog, step);
+            }
+            if (bCalcEner)
+            {
+                upd_mdebin(mdebin, bDoDHDL, bCalcEnerStep,
+                           t, mdatoms->tmass, enerd, state,
+                           ir->fepvals, ir->expandedvals, lastbox,
+                           shake_vir, force_vir, total_vir, pres,
+                           ekind, mu_tot, constr);
+            }
+            else
+            {
+                upd_mdebin_step(mdebin);
+            }
+
+            gmx_bool do_dr  = do_per_step(step, ir->nstdisreout);
+            gmx_bool do_or  = do_per_step(step, ir->nstorireout);
+
+            print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : NULL,
+                       step, t,
+                       eprNORMAL, mdebin, fcd, groups, &(ir->opts));
+
+            if (ir->bPull)
+            {
+                pull_print_output(ir->pull_work, step, t);
+            }
+
+            if (do_per_step(step, ir->nstlog))
+            {
+                if (fflush(fplog) != 0)
+                {
+                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+                }
+            }
+        }
+        if (bDoExpanded)
+        {
+            /* Have to do this part _after_ outputting the logfile and the edr file */
+            /* Gets written into the state at the beginning of the next loop */
+            state->fep_state = lamnew;
+        }
+        /* Print the remaining wall clock time for the run */
+        if (MULTIMASTER(cr) &&
+            (do_verbose || gmx_got_usr_signal()) &&
+            !bPMETunePrinting)
+        {
+            if (shellfc)
+            {
+                fprintf(stderr, "\n");
+            }
+            print_time(stderr, walltime_accounting, step, ir, cr);
+        }
+
+        /* Ion/water position swapping.
+         * Not done in last step since trajectory writing happens before this call
+         * in the MD loop and exchanges would be lost anyway. */
+        bNeedRepartition = FALSE;
+        if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep &&
+            do_per_step(step, ir->swap->nstswap))
+        {
+            bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle,
+                                             bRerunMD ? rerun_fr.x   : state->x,
+                                             bRerunMD ? rerun_fr.box : state->box,
+                                             top_global, MASTER(cr) && bVerbose, bRerunMD);
+
+            if (bNeedRepartition && DOMAINDECOMP(cr))
+            {
+                dd_collect_state(cr->dd, state, state_global);
+            }
+        }
+
+        /* Replica exchange */
+        bExchanged = FALSE;
+        if (bDoReplEx)
+        {
+            bExchanged = replica_exchange(fplog, cr, repl_ex,
+                                          state_global, enerd,
+                                          state, step, t);
+        }
+
+        if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) )
+        {
+            dd_partition_system(fplog, step, cr, TRUE, 1,
+                                state_global, top_global, ir,
+                                state, &f, mdatoms, top, fr,
+                                vsite, constr,
+                                nrnb, wcycle, FALSE);
+            shouldCheckNumberOfBondedInteractions = true;
+            update_realloc(upd, state->nalloc);
+        }
+
+        bFirstStep             = FALSE;
+        bInitStep              = FALSE;
+        startingFromCheckpoint = FALSE;
+
+        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+        /* With all integrators, except VV, we need to retain the pressure
+         * at the current step for coupling at the next step.
+         */
+        if ((state->flags & (1<<estPRES_PREV)) &&
+            (bGStatEveryStep ||
+             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+        {
+            /* Store the pressure in t_state for pressure coupling
+             * at the next MD step.
+             */
+            copy_mat(pres, state->pres_prev);
+        }
+
+        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+
+        if ( (membed != NULL) && (!bLastStep) )
+        {
+            rescale_membed(step_rel, membed, state_global->x);
+        }
+
+        if (bRerunMD)
+        {
+            if (MASTER(cr))
+            {
+                /* read next frame from input trajectory */
+                bLastStep = !read_next_frame(oenv, status, &rerun_fr);
+            }
+
+            if (PAR(cr))
+            {
+                rerun_parallel_comm(cr, &rerun_fr, &bLastStep);
+            }
+        }
+
+        cycles = wallcycle_stop(wcycle, ewcSTEP);
+        if (DOMAINDECOMP(cr) && wcycle)
+        {
+            dd_cycles_add(cr->dd, cycles, ddCyclStep);
+        }
+
+        if (!bRerunMD || !rerun_fr.bStep)
+        {
+            /* increase the MD step number */
+            step++;
+            step_rel++;
+        }
+
+        /* TODO make a counter-reset module */
+        /* If it is time to reset counters, set a flag that remains
+           true until counters actually get reset */
+        if (step_rel == wcycle_get_reset_counters(wcycle) ||
+            gs.set[eglsRESETCOUNTERS] != 0)
+        {
+            if (pme_loadbal_is_active(pme_loadbal))
+            {
+                /* Do not permit counter reset while PME load
+                 * balancing is active. The only purpose for resetting
+                 * counters is to measure reliable performance data,
+                 * and that can't be done before balancing
+                 * completes.
+                 *
+                 * TODO consider fixing this by delaying the reset
+                 * until after load balancing completes,
+                 * e.g. https://gerrit.gromacs.org/#/c/4964/2 */
+                gmx_fatal(FARGS, "PME tuning was still active when attempting to "
+                          "reset mdrun counters at step %" GMX_PRId64 ". Try "
+                          "resetting counters later in the run, e.g. with gmx "
+                          "mdrun -resetstep.", step);
+            }
+            reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting,
+                               use_GPU(fr->nbv) ? fr->nbv : NULL);
+            wcycle_set_reset_counters(wcycle, -1);
+            if (!(cr->duty & DUTY_PME))
+            {
+                /* Tell our PME node to reset its counters */
+                gmx_pme_send_resetcounters(cr, step);
+            }
+            /* Correct max_hours for the elapsed time */
+            max_hours                -= elapsed_time/(60.0*60.0);
+            /* If mdrun -maxh -resethway was active, it can only trigger once */
+            bResetCountersHalfMaxH    = FALSE; /* TODO move this to where gs.sig[eglsRESETCOUNTERS] is set */
+            /* Reset can only happen once, so clear the triggering flag. */
+            gs.set[eglsRESETCOUNTERS] = 0;
+        }
+
+        /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
+        IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle);
+
+    }
+    /* End of main MD loop */
+
+    /* Closing TNG files can include compressing data. Therefore it is good to do that
+     * before stopping the time measurements. */
+    mdoutf_tng_close(outf);
+
+    /* Stop measuring walltime */
+    walltime_accounting_end(walltime_accounting);
+
+    if (bRerunMD && MASTER(cr))
+    {
+        close_trj(status);
+    }
+
+    if (!(cr->duty & DUTY_PME))
+    {
+        /* Tell the PME only node to finish */
+        gmx_pme_send_finish(cr);
+    }
+
+    if (MASTER(cr))
+    {
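+        /* Write the averaged energy terms accumulated over the run to the log file. */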
+        if (ir->nstcalcenergy > 0 && !bRerunMD)
+        {
+            print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t,
+                       eprAVER, mdebin, fcd, groups, &(ir->opts));
+        }
+    }
+
+    done_mdoutf(outf);
+
+    if (bPMETune)
+    {
+        pme_loadbal_done(pme_loadbal, cr, fplog, use_GPU(fr->nbv));
+    }
+
+    done_shellfc(fplog, shellfc, step_rel);
+
+    if (repl_ex_nst > 0 && MASTER(cr))
+    {
+        print_replica_exchange_statistics(fplog, repl_ex);
+    }
+
+    // Clean up swapcoords
+    if (ir->eSwapCoords != eswapNO)
+    {
+        finish_swapcoords(ir->swap);
+    }
+
+    if (membed != nullptr)
+    {
+        free_membed(membed);
+    }
+
+    /* IMD cleanup, if bIMD is TRUE. */
+    IMD_finalize(ir->bIMD, ir->imd);
+
+    walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
+
+    return 0;
+}
diff --git a/patches/gromacs-2016-beta1.diff/src/programs/mdrun/mdrun.cpp b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/mdrun.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..55f3529faaba83f8ee5e66744004b578c9bce6d5
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/mdrun.cpp
@@ -0,0 +1,577 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \defgroup module_mdrun Implementation of mdrun
+ * \ingroup group_mdrun
+ *
+ * \brief This module contains code that implements mdrun.
+ */
+/*! \internal \file
+ *
+ * \brief This file implements mdrun
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/commandline/pargs.h"
+#include "gromacs/fileio/readinp.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/mdlib/main.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdrunutility/handlerestart.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/utility/arraysize.h"
+#include "gromacs/utility/fatalerror.h"
+
+#include "mdrun_main.h"
+#include "runner.h"
+
+/* PLUMED */
+#include "../../../Plumed.h"
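+/* These globals are defined elsewhere in the patched mdrun sources; they give
+   this file access to the shared PLUMED handle and command wrapper. */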
+extern int    plumedswitch;
+extern plumed plumedmain; 
+extern void(*plumedcmd)(plumed,const char*,const void*);
+/* END PLUMED */
+
+/*! \brief Return whether either of the command-line parameters that
+ *  will trigger a multi-simulation is set */
+static bool is_multisim_option_set(int argc, const char *const argv[])
+{
+    for (int i = 0; i < argc; ++i)
+    {
+        if (strcmp(argv[i], "-multi") == 0 || strcmp(argv[i], "-multidir") == 0)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+//! Implements C-style main function for mdrun
+int gmx_mdrun(int argc, char *argv[])
+{
+    const char   *desc[] = {
+        "[THISMODULE] is the main computational chemistry engine",
+        "within GROMACS. Obviously, it performs Molecular Dynamics simulations,",
+        "but it can also perform Stochastic Dynamics, Energy Minimization,",
+        "test particle insertion or (re)calculation of energies.",
+        "Normal mode analysis is another option. In this case [TT]mdrun[tt]",
+        "builds a Hessian matrix from single conformation.",
+        "For usual Normal Modes-like calculations, make sure that",
+        "the structure provided is properly energy-minimized.",
+        "The generated matrix can be diagonalized by [gmx-nmeig].[PAR]",
+        "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])",
+        "and distributes the topology over ranks if needed.",
+        "[TT]mdrun[tt] produces at least four output files.",
+        "A single log file ([TT]-g[tt]) is written.",
+        "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and",
+        "optionally forces.",
+        "The structure file ([TT]-c[tt]) contains the coordinates and",
+        "velocities of the last step.",
+        "The energy file ([TT]-e[tt]) contains energies, the temperature,",
+        "pressure, etc, a lot of these things are also printed in the log file.",
+        "Optionally coordinates can be written to a compressed trajectory file",
+        "([TT]-x[tt]).[PAR]",
+        "The option [TT]-dhdl[tt] is only used when free energy calculation is",
+        "turned on.[PAR]",
+        "Running mdrun efficiently in parallel is a complex topic topic,",
+        "many aspects of which are covered in the online User Guide. You",
+        "should look there for practical advice on using many of the options",
+        "available in mdrun.[PAR]",
+        "ED (essential dynamics) sampling and/or additional flooding potentials",
+        "are switched on by using the [TT]-ei[tt] flag followed by an [REF].edi[ref]",
+        "file. The [REF].edi[ref] file can be produced with the [TT]make_edi[tt] tool",
+        "or by using options in the essdyn menu of the WHAT IF program.",
+        "[TT]mdrun[tt] produces a [REF].xvg[ref] output file that",
+        "contains projections of positions, velocities and forces onto selected",
+        "eigenvectors.[PAR]",
+        "When user-defined potential functions have been selected in the",
+        "[REF].mdp[ref] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]",
+        "a formatted table with potential functions. The file is read from",
+        "either the current directory or from the [TT]GMXLIB[tt] directory.",
+        "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,",
+        "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with",
+        "normal Coulomb.",
+        "When pair interactions are present, a separate table for pair interaction",
+        "functions is read using the [TT]-tablep[tt] option.[PAR]",
+        "When tabulated bonded functions are present in the topology,",
+        "interaction functions are read using the [TT]-tableb[tt] option.",
+        "For each different tabulated interaction type the table file name is",
+        "modified in a different way: before the file extension an underscore is",
+        "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals",
+        "and finally the table number of the interaction type.[PAR]",
+        "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM",
+        "coordinates and forces when pulling is selected",
+        "in the [REF].mdp[ref] file.[PAR]",
+        "Finally some experimental algorithms can be tested when the",
+        "appropriate options have been given. Currently under",
+        "investigation are: polarizability.",
+        "[PAR]",
+        "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed",
+        "a protein into a membrane. This module requires a number of settings",
+        "that are provided in a data file that is the argument of this option.",
+        "For more details in membrane embedding, see the documentation in the",
+        "user guide. The options [TT]-mn[tt] and [TT]-mp[tt] are used to provide",
+        "the index and topology files used for the embedding.",
+        "[PAR]",
+        "The option [TT]-pforce[tt] is useful when you suspect a simulation",
+        "crashes due to too large forces. With this option coordinates and",
+        "forces of atoms with a force larger than a certain value will",
+        "be printed to stderr.",
+        "[PAR]",
+        "Checkpoints containing the complete state of the system are written",
+        "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],",
+        "unless option [TT]-cpt[tt] is set to -1.",
+        "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to",
+        "make sure that a recent state of the system is always available,",
+        "even when the simulation is terminated while writing a checkpoint.",
+        "With [TT]-cpnum[tt] all checkpoint files are kept and appended",
+        "with the step number.",
+        "A simulation can be continued by reading the full state from file",
+        "with option [TT]-cpi[tt]. This option is intelligent in the way that",
+        "if no checkpoint file is found, GROMACS just assumes a normal run and",
+        "starts from the first step of the [REF].tpr[ref] file. By default the output",
+        "will be appending to the existing output files. The checkpoint file",
+        "contains checksums of all output files, such that you will never",
+        "loose data when some output files are modified, corrupt or removed.",
+        "There are three scenarios with [TT]-cpi[tt]:[PAR]",
+        "[TT]*[tt] no files with matching names are present: new output files are written[PAR]",
+        "[TT]*[tt] all files are present with names and checksums matching those stored",
+        "in the checkpoint file: files are appended[PAR]",
+        "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]",
+        "With [TT]-noappend[tt] new output files are opened and the simulation",
+        "part number is added to all output file names.",
+        "Note that in all cases the checkpoint file itself is not renamed",
+        "and will be overwritten, unless its name does not match",
+        "the [TT]-cpo[tt] option.",
+        "[PAR]",
+        "With checkpointing the output is appended to previously written",
+        "output files, unless [TT]-noappend[tt] is used or none of the previous",
+        "output files are present (except for the checkpoint file).",
+        "The integrity of the files to be appended is verified using checksums",
+        "which are stored in the checkpoint file. This ensures that output can",
+        "not be mixed up or corrupted due to file appending. When only some",
+        "of the previous output files are present, a fatal error is generated",
+        "and no old output files are modified and no new output files are opened.",
+        "The result with appending will be the same as from a single run.",
+        "The contents will be binary identical, unless you use a different number",
+        "of ranks or dynamic load balancing or the FFT library uses optimizations",
+        "through timing.",
+        "[PAR]",
+        "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint",
+        "file is written at the first neighbor search step where the run time",
+        "exceeds [TT]-maxh[tt]\\*0.99 hours. This option is particularly useful in",
+        "combination with setting [TT]nsteps[tt] to -1 either in the mdp or using the",
+        "similarly named command line option. This results in an infinite run,",
+        "terminated only when the time limit set by [TT]-maxh[tt] is reached (if any)"
+        "or upon receiving a signal."
+        "[PAR]",
+        "When [TT]mdrun[tt] receives a TERM signal, it will stop as soon as",
+        "checkpoint file can be written, i.e. after the next global communication step.",
+        "When [TT]mdrun[tt] receives an INT signal (e.g. when ctrl+C is",
+        "pressed), it will stop at the next neighbor search step or at the",
+        "second global communication step, whichever happens later.",
+        "In both cases all the usual output will be written to file.",
+        "When running with MPI, a signal to one of the [TT]mdrun[tt] ranks",
+        "is sufficient, this signal should not be sent to mpirun or",
+        "the [TT]mdrun[tt] process that is the parent of the others.",
+        "[PAR]",
+        "Interactive molecular dynamics (IMD) can be activated by using at least one",
+        "of the three IMD switches: The [TT]-imdterm[tt] switch allows one to terminate",
+        "the simulation from the molecular viewer (e.g. VMD). With [TT]-imdwait[tt],",
+        "[TT]mdrun[tt] pauses whenever no IMD client is connected. Pulling from the",
+        "IMD remote can be turned on by [TT]-imdpull[tt].",
+        "The port [TT]mdrun[tt] listens to can be altered by [TT]-imdport[tt].The",
+        "file pointed to by [TT]-if[tt] contains atom indices and forces if IMD",
+        "pulling is used."
+        "[PAR]",
+        "When [TT]mdrun[tt] is started with MPI, it does not run niced by default."
+    };
+    t_commrec    *cr;
+    t_filenm      fnm[] = {
+        { efTPR, NULL,      NULL,       ffREAD },
+        { efTRN, "-o",      NULL,       ffWRITE },
+        { efCOMPRESSED, "-x", NULL,     ffOPTWR },
+        { efCPT, "-cpi",    NULL,       ffOPTRD | ffALLOW_MISSING },
+        { efCPT, "-cpo",    NULL,       ffOPTWR },
+        { efSTO, "-c",      "confout",  ffWRITE },
+        { efEDR, "-e",      "ener",     ffWRITE },
+        { efLOG, "-g",      "md",       ffWRITE },
+        { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
+        { efXVG, "-field",  "field",    ffOPTWR },
+        { efXVG, "-table",  "table",    ffOPTRD },
+        { efXVG, "-tablep", "tablep",   ffOPTRD },
+        { efXVG, "-tableb", "table",    ffOPTRD },
+        { efTRX, "-rerun",  "rerun",    ffOPTRD },
+        { efXVG, "-tpi",    "tpi",      ffOPTWR },
+        { efXVG, "-tpid",   "tpidist",  ffOPTWR },
+        { efEDI, "-ei",     "sam",      ffOPTRD },
+        { efXVG, "-eo",     "edsam",    ffOPTWR },
+        { efXVG, "-devout", "deviatie", ffOPTWR },
+        { efXVG, "-runav",  "runaver",  ffOPTWR },
+        { efXVG, "-px",     "pullx",    ffOPTWR },
+        { efXVG, "-pf",     "pullf",    ffOPTWR },
+        { efXVG, "-ro",     "rotation", ffOPTWR },
+        { efLOG, "-ra",     "rotangles", ffOPTWR },
+        { efLOG, "-rs",     "rotslabs", ffOPTWR },
+        { efLOG, "-rt",     "rottorque", ffOPTWR },
+        { efMTX, "-mtx",    "nm",       ffOPTWR },
+        { efRND, "-multidir", NULL,      ffOPTRDMULT},
+        { efDAT, "-plumed", "plumed",   ffOPTRD },   /* PLUMED */
+        { efDAT, "-membed", "membed",   ffOPTRD },
+        { efTOP, "-mp",     "membed",   ffOPTRD },
+        { efNDX, "-mn",     "membed",   ffOPTRD },
+        { efXVG, "-if",     "imdforces", ffOPTWR },
+        { efXVG, "-swap",   "swapions", ffOPTWR }
+    };
+    const int     NFILE = asize(fnm);
+
+    /* Command line options ! */
+    gmx_bool          bDDBondCheck  = TRUE;
+    gmx_bool          bDDBondComm   = TRUE;
+    gmx_bool          bTunePME      = TRUE;
+    gmx_bool          bVerbose      = FALSE;
+    gmx_bool          bRerunVSite   = FALSE;
+    gmx_bool          bConfout      = TRUE;
+    gmx_bool          bReproducible = FALSE;
+    gmx_bool          bIMDwait      = FALSE;
+    gmx_bool          bIMDterm      = FALSE;
+    gmx_bool          bIMDpull      = FALSE;
+
+    int               npme          = -1;
+    int               nstlist       = 0;
+    int               nmultisim     = 0;
+    int               nstglobalcomm = -1;
+    int               repl_ex_nst   = 0;
+    int               repl_ex_seed  = -1;
+    int               repl_ex_nex   = 0;
+    int               nstepout      = 100;
+    int               resetstep     = -1;
+    gmx_int64_t       nsteps        = -2;   /* the value -2 means that the mdp option will be used */
+    int               imdport       = 8888; /* can be almost anything, 8888 is easy to remember */
+
+    rvec              realddxyz                   = {0, 0, 0};
+    const char       *ddrank_opt[ddrankorderNR+1] =
+    { NULL, "interleave", "pp_pme", "cartesian", NULL };
+    const char       *dddlb_opt[] =
+    { NULL, "auto", "no", "yes", NULL };
+    const char       *thread_aff_opt[threadaffNR+1] =
+    { NULL, "auto", "on", "off", NULL };
+    const char       *nbpu_opt[] =
+    { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL };
+    real              rdd                   = 0.0, rconstr = 0.0, dlb_scale = 0.8, pforce = -1;
+    char             *ddcsx                 = NULL, *ddcsy = NULL, *ddcsz = NULL;
+    real              cpt_period            = 15.0, max_hours = -1;
+    gmx_bool          bTryToAppendFiles     = TRUE;
+    gmx_bool          bKeepAndNumCPT        = FALSE;
+    gmx_bool          bResetCountersHalfWay = FALSE;
+    gmx_output_env_t *oenv                  = NULL;
+
+    /* Non transparent initialization of a complex gmx_hw_opt_t struct.
+     * But unfortunately we are not allowed to call a function here,
+     * since declarations follow below.
+     */
+    gmx_hw_opt_t    hw_opt = {
+        0, 0, 0, 0, threadaffSEL, 0, 0,
+        { NULL, FALSE, 0, NULL }
+    };
+
+    t_pargs         pa[] = {
+
+        { "-dd",      FALSE, etRVEC, {&realddxyz},
+          "Domain decomposition grid, 0 is optimize" },
+        { "-ddorder", FALSE, etENUM, {ddrank_opt},
+          "DD rank order" },
+        { "-npme",    FALSE, etINT, {&npme},
+          "Number of separate ranks to be used for PME, -1 is guess" },
+        { "-nt",      FALSE, etINT, {&hw_opt.nthreads_tot},
+          "Total number of threads to start (0 is guess)" },
+        { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
+          "Number of thread-MPI threads to start (0 is guess)" },
+        { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
+          "Number of OpenMP threads per MPI rank to start (0 is guess)" },
+        { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
+          "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" },
+        { "-pin",     FALSE, etENUM, {thread_aff_opt},
+          "Whether mdrun should try to set thread affinities" },
+        { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset},
+          "The lowest logical core number to which mdrun should pin the first thread" },
+        { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
+          "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
+        { "-gpu_id",  FALSE, etSTR, {&hw_opt.gpu_opt.gpu_id},
+          "List of GPU device id-s to use, specifies the per-node PP rank to GPU mapping" },
+        { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
+          "Check for all bonded interactions with DD" },
+        { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm},
+          "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
+        { "-rdd",     FALSE, etREAL, {&rdd},
+          "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" },
+        { "-rcon",    FALSE, etREAL, {&rconstr},
+          "Maximum distance for P-LINCS (nm), 0 is estimate" },
+        { "-dlb",     FALSE, etENUM, {dddlb_opt},
+          "Dynamic load balancing (with DD)" },
+        { "-dds",     FALSE, etREAL, {&dlb_scale},
+          "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to "
+          "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." },
+        { "-ddcsx",   FALSE, etSTR, {&ddcsx},
+          "HIDDENA string containing a vector of the relative sizes in the x "
+          "direction of the corresponding DD cells. Only effective with static "
+          "load balancing." },
+        { "-ddcsy",   FALSE, etSTR, {&ddcsy},
+          "HIDDENA string containing a vector of the relative sizes in the y "
+          "direction of the corresponding DD cells. Only effective with static "
+          "load balancing." },
+        { "-ddcsz",   FALSE, etSTR, {&ddcsz},
+          "HIDDENA string containing a vector of the relative sizes in the z "
+          "direction of the corresponding DD cells. Only effective with static "
+          "load balancing." },
+        { "-gcom",    FALSE, etINT, {&nstglobalcomm},
+          "Global communication frequency" },
+        { "-nb",      FALSE, etENUM, {&nbpu_opt},
+          "Calculate non-bonded interactions on" },
+        { "-nstlist", FALSE, etINT, {&nstlist},
+          "Set nstlist when using a Verlet buffer tolerance (0 is guess)" },
+        { "-tunepme", FALSE, etBOOL, {&bTunePME},
+          "Optimize PME load between PP/PME ranks or GPU/CPU" },
+        { "-v",       FALSE, etBOOL, {&bVerbose},
+          "Be loud and noisy" },
+        { "-pforce",  FALSE, etREAL, {&pforce},
+          "Print all forces larger than this (kJ/mol nm)" },
+        { "-reprod",  FALSE, etBOOL, {&bReproducible},
+          "Try to avoid optimizations that affect binary reproducibility" },
+        { "-cpt",     FALSE, etREAL, {&cpt_period},
+          "Checkpoint interval (minutes)" },
+        { "-cpnum",   FALSE, etBOOL, {&bKeepAndNumCPT},
+          "Keep and number checkpoint files" },
+        { "-append",  FALSE, etBOOL, {&bTryToAppendFiles},
+          "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
+        { "-nsteps",  FALSE, etINT64, {&nsteps},
+          "Run this number of steps, overrides .mdp file option (-1 means infinite, -2 means use mdp option, smaller is invalid)" },
+        { "-maxh",   FALSE, etREAL, {&max_hours},
+          "Terminate after 0.99 times this time (hours)" },
+        { "-multi",   FALSE, etINT, {&nmultisim},
+          "Do multiple simulations in parallel" },
+        { "-replex",  FALSE, etINT, {&repl_ex_nst},
+          "Attempt replica exchange periodically with this period (steps)" },
+        { "-nex",  FALSE, etINT, {&repl_ex_nex},
+          "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion).  -nex zero or not specified gives neighbor replica exchange." },
+        { "-reseed",  FALSE, etINT, {&repl_ex_seed},
+          "Seed for replica exchange, -1 is generate a seed" },
+        { "-imdport",    FALSE, etINT, {&imdport},
+          "HIDDENIMD listening port" },
+        { "-imdwait",  FALSE, etBOOL, {&bIMDwait},
+          "HIDDENPause the simulation while no IMD client is connected" },
+        { "-imdterm",  FALSE, etBOOL, {&bIMDterm},
+          "HIDDENAllow termination of the simulation from IMD client" },
+        { "-imdpull",  FALSE, etBOOL, {&bIMDpull},
+          "HIDDENAllow pulling in the simulation from IMD client" },
+        { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite},
+          "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
+        { "-confout", FALSE, etBOOL, {&bConfout},
+          "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" },
+        { "-stepout", FALSE, etINT, {&nstepout},
+          "HIDDENFrequency of writing the remaining wall clock time for the run" },
+        { "-resetstep", FALSE, etINT, {&resetstep},
+          "HIDDENReset cycle counters after these many time steps" },
+        { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
+          "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" }
+    };
+    unsigned long   Flags;
+    ivec            ddxyz;
+    int             dd_rank_order;
+    gmx_bool        bDoAppendFiles, bStartFromCpt;
+    FILE           *fplog;
+    int             rc;
+    char          **multidir = NULL;
+
+    cr = init_commrec();
+
+    unsigned long PCA_Flags = PCA_CAN_SET_DEFFNM;
+    // With -multi or -multidir, the file names are going to get processed
+    // further (or the working directory changed), so we can't check for their
+    // existence during parsing.  It isn't useful to do any completion based on
+    // file system contents, either.
+    if (is_multisim_option_set(argc, argv))
+    {
+        PCA_Flags |= PCA_DISABLE_INPUT_FILE_CHECKING;
+    }
+
+    /* Comment this in to do fexist calls only on master
+     * works not with rerun or tables at the moment
+     * also comment out the version of init_forcerec in md.c
+     * with NULL instead of opt2fn
+     */
+    /*
+       if (!MASTER(cr))
+       {
+       PCA_Flags |= PCA_NOT_READ_NODE;
+       }
+     */
+
+    if (!parse_common_args(&argc, argv, PCA_Flags, NFILE, fnm, asize(pa), pa,
+                           asize(desc), desc, 0, NULL, &oenv))
+    {
+        return 0;
+    }
+
+
+    dd_rank_order = nenum(ddrank_opt);
+
+    hw_opt.thread_affinity = nenum(thread_aff_opt);
+
+    /* now check the -multi and -multidir option */
+    if (opt2bSet("-multidir", NFILE, fnm))
+    {
+        if (nmultisim > 0)
+        {
+            gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive.");
+        }
+        nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm);
+    }
+
+
+    if (repl_ex_nst != 0 && nmultisim < 2)
+    {
+        gmx_fatal(FARGS, "Need at least two replicas for replica exchange (option -multi)");
+    }
+
+    if (repl_ex_nex < 0)
+    {
+        gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
+    }
+
+    if (nmultisim >= 1)
+    {
+#if !GMX_THREAD_MPI
+        gmx_bool bParFn = (multidir == NULL);
+        init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
+#else
+        gmx_fatal(FARGS, "mdrun -multi or -multidir are not supported with the thread-MPI library. "
+                  "Please compile GROMACS with a proper external MPI library.");
+#endif
+    }
+
+    handleRestart(cr, bTryToAppendFiles, NFILE, fnm,
+                  &bDoAppendFiles, &bStartFromCpt);
+
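+    /* Collect the parsed command-line options into the Flags bitmask passed to the runner. */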
+    Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0;
+    Flags = Flags | (bDDBondCheck  ? MD_DDBONDCHECK  : 0);
+    Flags = Flags | (bDDBondComm   ? MD_DDBONDCOMM   : 0);
+    Flags = Flags | (bTunePME      ? MD_TUNEPME      : 0);
+    Flags = Flags | (bConfout      ? MD_CONFOUT      : 0);
+    Flags = Flags | (bRerunVSite   ? MD_RERUN_VSITE  : 0);
+    Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
+    Flags = Flags | (bDoAppendFiles ? MD_APPENDFILES  : 0);
+    Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0);
+    Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0);
+    Flags = Flags | (bStartFromCpt ? MD_STARTFROMCPT : 0);
+    Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0);
+    Flags = Flags | (opt2parg_bSet("-ntomp", asize(pa), pa) ? MD_NTOMPSET : 0);
+    Flags = Flags | (bIMDwait      ? MD_IMDWAIT      : 0);
+    Flags = Flags | (bIMDterm      ? MD_IMDTERM      : 0);
+    Flags = Flags | (bIMDpull      ? MD_IMDPULL      : 0);
+
+    /* We postpone opening the log file if we are appending, so we can
+       first truncate the old log file and append to the correct position
+       there instead.  */
+    if (MASTER(cr) && !bDoAppendFiles)
+    {
+        gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr,
+                     Flags & MD_APPENDFILES, &fplog);
+    }
+    else
+    {
+        fplog = NULL;
+    }
+
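+    /* Round the real-valued -dd grid specification to integer cell counts. */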
+    ddxyz[XX] = (int)(realddxyz[XX] + 0.5);
+    ddxyz[YY] = (int)(realddxyz[YY] + 0.5);
+    ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5);
+
+    /* PLUMED */
+    plumedswitch=0;
+    if (opt2bSet("-plumed",NFILE,fnm)) plumedswitch=1;
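+    /* When -plumed is given, create the PLUMED object, set the real precision
+       and MD units, and pass it the name of the PLUMED input file. */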
+    if(plumedswitch){
+      plumedcmd=plumed_cmd;
+      int real_precision=sizeof(real);
+      real energyUnits=1.0;
+      real lengthUnits=1.0;
+      real timeUnits=1.0;
+  
+      if(!plumed_installed()){
+        gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable.");
+      }
+      plumedmain=plumed_create();
+      plumed_cmd(plumedmain,"setRealPrecision",&real_precision);
+      // this is not necessary for gromacs units:
+      plumed_cmd(plumedmain,"setMDEnergyUnits",&energyUnits);
+      plumed_cmd(plumedmain,"setMDLengthUnits",&lengthUnits);
+      plumed_cmd(plumedmain,"setMDTimeUnits",&timeUnits);
+      //
+      plumed_cmd(plumedmain,"setPlumedDat",ftp2fn(efDAT,NFILE,fnm));
+      plumedswitch=1;
+    }
+    /* END PLUMED */
+
+    rc = gmx::mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose,
+                       nstglobalcomm, ddxyz, dd_rank_order, npme, rdd, rconstr,
+                       dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz,
+                       nbpu_opt[0], nstlist,
+                       nsteps, nstepout, resetstep,
+                       nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                       pforce, cpt_period, max_hours, imdport, Flags);
+
+    /* Log file has to be closed in mdrunner if we are appending to it
+       (fplog not set here) */
+    if (MASTER(cr) && !bDoAppendFiles)
+    {
+        gmx_log_close(fplog);
+    }
+
+    return rc;
+}
diff --git a/patches/gromacs-2016-beta1.diff/src/programs/mdrun/mdrun.cpp.preplumed b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/mdrun.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..9458dfbf91803983e33b4ae66f8661057fb2d4db
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/mdrun.cpp.preplumed
@@ -0,0 +1,544 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \defgroup module_mdrun Implementation of mdrun
+ * \ingroup group_mdrun
+ *
+ * \brief This module contains code that implements mdrun.
+ */
+/*! \internal \file
+ *
+ * \brief This file implements mdrun
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \author Erik Lindahl <erik@kth.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_mdrun
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/commandline/pargs.h"
+#include "gromacs/fileio/readinp.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/mdlib/main.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdrunutility/handlerestart.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/utility/arraysize.h"
+#include "gromacs/utility/fatalerror.h"
+
+#include "mdrun_main.h"
+#include "runner.h"
+
+/*! \brief Return whether either of the command-line parameters that
+ *  will trigger a multi-simulation is set */
+static bool is_multisim_option_set(int argc, const char *const argv[])
+{
+    for (int i = 0; i < argc; ++i)
+    {
+        if (strcmp(argv[i], "-multi") == 0 || strcmp(argv[i], "-multidir") == 0)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+//! Implements C-style main function for mdrun
+int gmx_mdrun(int argc, char *argv[])
+{
+    const char   *desc[] = {
+        "[THISMODULE] is the main computational chemistry engine",
+        "within GROMACS. Obviously, it performs Molecular Dynamics simulations,",
+        "but it can also perform Stochastic Dynamics, Energy Minimization,",
+        "test particle insertion or (re)calculation of energies.",
+        "Normal mode analysis is another option. In this case [TT]mdrun[tt]",
+        "builds a Hessian matrix from single conformation.",
+        "For usual Normal Modes-like calculations, make sure that",
+        "the structure provided is properly energy-minimized.",
+        "The generated matrix can be diagonalized by [gmx-nmeig].[PAR]",
+        "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])",
+        "and distributes the topology over ranks if needed.",
+        "[TT]mdrun[tt] produces at least four output files.",
+        "A single log file ([TT]-g[tt]) is written.",
+        "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and",
+        "optionally forces.",
+        "The structure file ([TT]-c[tt]) contains the coordinates and",
+        "velocities of the last step.",
+        "The energy file ([TT]-e[tt]) contains energies, the temperature,",
+        "pressure, etc, a lot of these things are also printed in the log file.",
+        "Optionally coordinates can be written to a compressed trajectory file",
+        "([TT]-x[tt]).[PAR]",
+        "The option [TT]-dhdl[tt] is only used when free energy calculation is",
+        "turned on.[PAR]",
+        "Running mdrun efficiently in parallel is a complex topic topic,",
+        "many aspects of which are covered in the online User Guide. You",
+        "should look there for practical advice on using many of the options",
+        "available in mdrun.[PAR]",
+        "ED (essential dynamics) sampling and/or additional flooding potentials",
+        "are switched on by using the [TT]-ei[tt] flag followed by an [REF].edi[ref]",
+        "file. The [REF].edi[ref] file can be produced with the [TT]make_edi[tt] tool",
+        "or by using options in the essdyn menu of the WHAT IF program.",
+        "[TT]mdrun[tt] produces a [REF].xvg[ref] output file that",
+        "contains projections of positions, velocities and forces onto selected",
+        "eigenvectors.[PAR]",
+        "When user-defined potential functions have been selected in the",
+        "[REF].mdp[ref] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]",
+        "a formatted table with potential functions. The file is read from",
+        "either the current directory or from the [TT]GMXLIB[tt] directory.",
+        "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,",
+        "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with",
+        "normal Coulomb.",
+        "When pair interactions are present, a separate table for pair interaction",
+        "functions is read using the [TT]-tablep[tt] option.[PAR]",
+        "When tabulated bonded functions are present in the topology,",
+        "interaction functions are read using the [TT]-tableb[tt] option.",
+        "For each different tabulated interaction type the table file name is",
+        "modified in a different way: before the file extension an underscore is",
+        "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals",
+        "and finally the table number of the interaction type.[PAR]",
+        "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM",
+        "coordinates and forces when pulling is selected",
+        "in the [REF].mdp[ref] file.[PAR]",
+        "Finally some experimental algorithms can be tested when the",
+        "appropriate options have been given. Currently under",
+        "investigation are: polarizability.",
+        "[PAR]",
+        "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed",
+        "a protein into a membrane. This module requires a number of settings",
+        "that are provided in a data file that is the argument of this option.",
+        "For more details in membrane embedding, see the documentation in the",
+        "user guide. The options [TT]-mn[tt] and [TT]-mp[tt] are used to provide",
+        "the index and topology files used for the embedding.",
+        "[PAR]",
+        "The option [TT]-pforce[tt] is useful when you suspect a simulation",
+        "crashes due to too large forces. With this option coordinates and",
+        "forces of atoms with a force larger than a certain value will",
+        "be printed to stderr.",
+        "[PAR]",
+        "Checkpoints containing the complete state of the system are written",
+        "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],",
+        "unless option [TT]-cpt[tt] is set to -1.",
+        "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to",
+        "make sure that a recent state of the system is always available,",
+        "even when the simulation is terminated while writing a checkpoint.",
+        "With [TT]-cpnum[tt] all checkpoint files are kept and appended",
+        "with the step number.",
+        "A simulation can be continued by reading the full state from file",
+        "with option [TT]-cpi[tt]. This option is intelligent in the way that",
+        "if no checkpoint file is found, GROMACS just assumes a normal run and",
+        "starts from the first step of the [REF].tpr[ref] file. By default the output",
+        "will be appending to the existing output files. The checkpoint file",
+        "contains checksums of all output files, such that you will never",
+        "loose data when some output files are modified, corrupt or removed.",
+        "There are three scenarios with [TT]-cpi[tt]:[PAR]",
+        "[TT]*[tt] no files with matching names are present: new output files are written[PAR]",
+        "[TT]*[tt] all files are present with names and checksums matching those stored",
+        "in the checkpoint file: files are appended[PAR]",
+        "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]",
+        "With [TT]-noappend[tt] new output files are opened and the simulation",
+        "part number is added to all output file names.",
+        "Note that in all cases the checkpoint file itself is not renamed",
+        "and will be overwritten, unless its name does not match",
+        "the [TT]-cpo[tt] option.",
+        "[PAR]",
+        "With checkpointing the output is appended to previously written",
+        "output files, unless [TT]-noappend[tt] is used or none of the previous",
+        "output files are present (except for the checkpoint file).",
+        "The integrity of the files to be appended is verified using checksums",
+        "which are stored in the checkpoint file. This ensures that output can",
+        "not be mixed up or corrupted due to file appending. When only some",
+        "of the previous output files are present, a fatal error is generated",
+        "and no old output files are modified and no new output files are opened.",
+        "The result with appending will be the same as from a single run.",
+        "The contents will be binary identical, unless you use a different number",
+        "of ranks or dynamic load balancing or the FFT library uses optimizations",
+        "through timing.",
+        "[PAR]",
+        "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint",
+        "file is written at the first neighbor search step where the run time",
+        "exceeds [TT]-maxh[tt]\\*0.99 hours. This option is particularly useful in",
+        "combination with setting [TT]nsteps[tt] to -1 either in the mdp or using the",
+        "similarly named command line option. This results in an infinite run,",
+        "terminated only when the time limit set by [TT]-maxh[tt] is reached (if any)"
+        "or upon receiving a signal."
+        "[PAR]",
+        "When [TT]mdrun[tt] receives a TERM signal, it will stop as soon as",
+        "checkpoint file can be written, i.e. after the next global communication step.",
+        "When [TT]mdrun[tt] receives an INT signal (e.g. when ctrl+C is",
+        "pressed), it will stop at the next neighbor search step or at the",
+        "second global communication step, whichever happens later.",
+        "In both cases all the usual output will be written to file.",
+        "When running with MPI, a signal to one of the [TT]mdrun[tt] ranks",
+        "is sufficient, this signal should not be sent to mpirun or",
+        "the [TT]mdrun[tt] process that is the parent of the others.",
+        "[PAR]",
+        "Interactive molecular dynamics (IMD) can be activated by using at least one",
+        "of the three IMD switches: The [TT]-imdterm[tt] switch allows one to terminate",
+        "the simulation from the molecular viewer (e.g. VMD). With [TT]-imdwait[tt],",
+        "[TT]mdrun[tt] pauses whenever no IMD client is connected. Pulling from the",
+        "IMD remote can be turned on by [TT]-imdpull[tt].",
+        "The port [TT]mdrun[tt] listens to can be altered by [TT]-imdport[tt].The",
+        "file pointed to by [TT]-if[tt] contains atom indices and forces if IMD",
+        "pulling is used."
+        "[PAR]",
+        "When [TT]mdrun[tt] is started with MPI, it does not run niced by default."
+    };
+    t_commrec    *cr;
+    t_filenm      fnm[] = {
+        { efTPR, NULL,      NULL,       ffREAD },
+        { efTRN, "-o",      NULL,       ffWRITE },
+        { efCOMPRESSED, "-x", NULL,     ffOPTWR },
+        { efCPT, "-cpi",    NULL,       ffOPTRD | ffALLOW_MISSING },
+        { efCPT, "-cpo",    NULL,       ffOPTWR },
+        { efSTO, "-c",      "confout",  ffWRITE },
+        { efEDR, "-e",      "ener",     ffWRITE },
+        { efLOG, "-g",      "md",       ffWRITE },
+        { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
+        { efXVG, "-field",  "field",    ffOPTWR },
+        { efXVG, "-table",  "table",    ffOPTRD },
+        { efXVG, "-tablep", "tablep",   ffOPTRD },
+        { efXVG, "-tableb", "table",    ffOPTRD },
+        { efTRX, "-rerun",  "rerun",    ffOPTRD },
+        { efXVG, "-tpi",    "tpi",      ffOPTWR },
+        { efXVG, "-tpid",   "tpidist",  ffOPTWR },
+        { efEDI, "-ei",     "sam",      ffOPTRD },
+        { efXVG, "-eo",     "edsam",    ffOPTWR },
+        { efXVG, "-devout", "deviatie", ffOPTWR },
+        { efXVG, "-runav",  "runaver",  ffOPTWR },
+        { efXVG, "-px",     "pullx",    ffOPTWR },
+        { efXVG, "-pf",     "pullf",    ffOPTWR },
+        { efXVG, "-ro",     "rotation", ffOPTWR },
+        { efLOG, "-ra",     "rotangles", ffOPTWR },
+        { efLOG, "-rs",     "rotslabs", ffOPTWR },
+        { efLOG, "-rt",     "rottorque", ffOPTWR },
+        { efMTX, "-mtx",    "nm",       ffOPTWR },
+        { efRND, "-multidir", NULL,      ffOPTRDMULT},
+        { efDAT, "-membed", "membed",   ffOPTRD },
+        { efTOP, "-mp",     "membed",   ffOPTRD },
+        { efNDX, "-mn",     "membed",   ffOPTRD },
+        { efXVG, "-if",     "imdforces", ffOPTWR },
+        { efXVG, "-swap",   "swapions", ffOPTWR }
+    };
+    const int     NFILE = asize(fnm);
+
+    /* Command line options ! */
+    gmx_bool          bDDBondCheck  = TRUE;
+    gmx_bool          bDDBondComm   = TRUE;
+    gmx_bool          bTunePME      = TRUE;
+    gmx_bool          bVerbose      = FALSE;
+    gmx_bool          bRerunVSite   = FALSE;
+    gmx_bool          bConfout      = TRUE;
+    gmx_bool          bReproducible = FALSE;
+    gmx_bool          bIMDwait      = FALSE;
+    gmx_bool          bIMDterm      = FALSE;
+    gmx_bool          bIMDpull      = FALSE;
+
+    int               npme          = -1;
+    int               nstlist       = 0;
+    int               nmultisim     = 0;
+    int               nstglobalcomm = -1;
+    int               repl_ex_nst   = 0;
+    int               repl_ex_seed  = -1;
+    int               repl_ex_nex   = 0;
+    int               nstepout      = 100;
+    int               resetstep     = -1;
+    gmx_int64_t       nsteps        = -2;   /* the value -2 means that the mdp option will be used */
+    int               imdport       = 8888; /* can be almost anything, 8888 is easy to remember */
+
+    rvec              realddxyz                   = {0, 0, 0};
+    const char       *ddrank_opt[ddrankorderNR+1] =
+    { NULL, "interleave", "pp_pme", "cartesian", NULL };
+    const char       *dddlb_opt[] =
+    { NULL, "auto", "no", "yes", NULL };
+    const char       *thread_aff_opt[threadaffNR+1] =
+    { NULL, "auto", "on", "off", NULL };
+    const char       *nbpu_opt[] =
+    { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL };
+    real              rdd                   = 0.0, rconstr = 0.0, dlb_scale = 0.8, pforce = -1;
+    char             *ddcsx                 = NULL, *ddcsy = NULL, *ddcsz = NULL;
+    real              cpt_period            = 15.0, max_hours = -1;
+    gmx_bool          bTryToAppendFiles     = TRUE;
+    gmx_bool          bKeepAndNumCPT        = FALSE;
+    gmx_bool          bResetCountersHalfWay = FALSE;
+    gmx_output_env_t *oenv                  = NULL;
+
+    /* Non transparent initialization of a complex gmx_hw_opt_t struct.
+     * But unfortunately we are not allowed to call a function here,
+     * since declarations follow below.
+     */
+    gmx_hw_opt_t    hw_opt = {
+        0, 0, 0, 0, threadaffSEL, 0, 0,
+        { NULL, FALSE, 0, NULL }
+    };
+
+    t_pargs         pa[] = {
+
+        { "-dd",      FALSE, etRVEC, {&realddxyz},
+          "Domain decomposition grid, 0 is optimize" },
+        { "-ddorder", FALSE, etENUM, {ddrank_opt},
+          "DD rank order" },
+        { "-npme",    FALSE, etINT, {&npme},
+          "Number of separate ranks to be used for PME, -1 is guess" },
+        { "-nt",      FALSE, etINT, {&hw_opt.nthreads_tot},
+          "Total number of threads to start (0 is guess)" },
+        { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
+          "Number of thread-MPI threads to start (0 is guess)" },
+        { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
+          "Number of OpenMP threads per MPI rank to start (0 is guess)" },
+        { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
+          "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" },
+        { "-pin",     FALSE, etENUM, {thread_aff_opt},
+          "Whether mdrun should try to set thread affinities" },
+        { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset},
+          "The lowest logical core number to which mdrun should pin the first thread" },
+        { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
+          "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
+        { "-gpu_id",  FALSE, etSTR, {&hw_opt.gpu_opt.gpu_id},
+          "List of GPU device id-s to use, specifies the per-node PP rank to GPU mapping" },
+        { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
+          "Check for all bonded interactions with DD" },
+        { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm},
+          "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
+        { "-rdd",     FALSE, etREAL, {&rdd},
+          "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" },
+        { "-rcon",    FALSE, etREAL, {&rconstr},
+          "Maximum distance for P-LINCS (nm), 0 is estimate" },
+        { "-dlb",     FALSE, etENUM, {dddlb_opt},
+          "Dynamic load balancing (with DD)" },
+        { "-dds",     FALSE, etREAL, {&dlb_scale},
+          "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to "
+          "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." },
+        { "-ddcsx",   FALSE, etSTR, {&ddcsx},
+          "HIDDENA string containing a vector of the relative sizes in the x "
+          "direction of the corresponding DD cells. Only effective with static "
+          "load balancing." },
+        { "-ddcsy",   FALSE, etSTR, {&ddcsy},
+          "HIDDENA string containing a vector of the relative sizes in the y "
+          "direction of the corresponding DD cells. Only effective with static "
+          "load balancing." },
+        { "-ddcsz",   FALSE, etSTR, {&ddcsz},
+          "HIDDENA string containing a vector of the relative sizes in the z "
+          "direction of the corresponding DD cells. Only effective with static "
+          "load balancing." },
+        { "-gcom",    FALSE, etINT, {&nstglobalcomm},
+          "Global communication frequency" },
+        { "-nb",      FALSE, etENUM, {&nbpu_opt},
+          "Calculate non-bonded interactions on" },
+        { "-nstlist", FALSE, etINT, {&nstlist},
+          "Set nstlist when using a Verlet buffer tolerance (0 is guess)" },
+        { "-tunepme", FALSE, etBOOL, {&bTunePME},
+          "Optimize PME load between PP/PME ranks or GPU/CPU" },
+        { "-v",       FALSE, etBOOL, {&bVerbose},
+          "Be loud and noisy" },
+        { "-pforce",  FALSE, etREAL, {&pforce},
+          "Print all forces larger than this (kJ/mol nm)" },
+        { "-reprod",  FALSE, etBOOL, {&bReproducible},
+          "Try to avoid optimizations that affect binary reproducibility" },
+        { "-cpt",     FALSE, etREAL, {&cpt_period},
+          "Checkpoint interval (minutes)" },
+        { "-cpnum",   FALSE, etBOOL, {&bKeepAndNumCPT},
+          "Keep and number checkpoint files" },
+        { "-append",  FALSE, etBOOL, {&bTryToAppendFiles},
+          "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
+        { "-nsteps",  FALSE, etINT64, {&nsteps},
+          "Run this number of steps, overrides .mdp file option (-1 means infinite, -2 means use mdp option, smaller is invalid)" },
+        { "-maxh",   FALSE, etREAL, {&max_hours},
+          "Terminate after 0.99 times this time (hours)" },
+        { "-multi",   FALSE, etINT, {&nmultisim},
+          "Do multiple simulations in parallel" },
+        { "-replex",  FALSE, etINT, {&repl_ex_nst},
+          "Attempt replica exchange periodically with this period (steps)" },
+        { "-nex",  FALSE, etINT, {&repl_ex_nex},
+          "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion).  -nex zero or not specified gives neighbor replica exchange." },
+        { "-reseed",  FALSE, etINT, {&repl_ex_seed},
+          "Seed for replica exchange, -1 is generate a seed" },
+        { "-imdport",    FALSE, etINT, {&imdport},
+          "HIDDENIMD listening port" },
+        { "-imdwait",  FALSE, etBOOL, {&bIMDwait},
+          "HIDDENPause the simulation while no IMD client is connected" },
+        { "-imdterm",  FALSE, etBOOL, {&bIMDterm},
+          "HIDDENAllow termination of the simulation from IMD client" },
+        { "-imdpull",  FALSE, etBOOL, {&bIMDpull},
+          "HIDDENAllow pulling in the simulation from IMD client" },
+        { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite},
+          "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
+        { "-confout", FALSE, etBOOL, {&bConfout},
+          "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" },
+        { "-stepout", FALSE, etINT, {&nstepout},
+          "HIDDENFrequency of writing the remaining wall clock time for the run" },
+        { "-resetstep", FALSE, etINT, {&resetstep},
+          "HIDDENReset cycle counters after these many time steps" },
+        { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
+          "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" }
+    };
+    unsigned long   Flags;
+    ivec            ddxyz;
+    int             dd_rank_order;
+    gmx_bool        bDoAppendFiles, bStartFromCpt;
+    FILE           *fplog;
+    int             rc;
+    char          **multidir = NULL;
+
+    cr = init_commrec();
+
+    unsigned long PCA_Flags = PCA_CAN_SET_DEFFNM;
+    // With -multi or -multidir, the file names are going to get processed
+    // further (or the working directory changed), so we can't check for their
+    // existence during parsing.  It isn't useful to do any completion based on
+    // file system contents, either.
+    if (is_multisim_option_set(argc, argv))
+    {
+        PCA_Flags |= PCA_DISABLE_INPUT_FILE_CHECKING;
+    }
+
+    /* Comment this in to do fexist calls only on master.
+     * It does not work with rerun or tables at the moment;
+     * also comment out the version of init_forcerec in md.c
+     * with NULL instead of opt2fn.
+     */
+    /*
+       if (!MASTER(cr))
+       {
+       PCA_Flags |= PCA_NOT_READ_NODE;
+       }
+     */
+
+    if (!parse_common_args(&argc, argv, PCA_Flags, NFILE, fnm, asize(pa), pa,
+                           asize(desc), desc, 0, NULL, &oenv))
+    {
+        return 0;
+    }
+
+
+    dd_rank_order = nenum(ddrank_opt);
+
+    hw_opt.thread_affinity = nenum(thread_aff_opt);
+
+    /* now check the -multi and -multidir option */
+    if (opt2bSet("-multidir", NFILE, fnm))
+    {
+        if (nmultisim > 0)
+        {
+            gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive.");
+        }
+        nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm);
+    }
+
+
+    if (repl_ex_nst != 0 && nmultisim < 2)
+    {
+        gmx_fatal(FARGS, "Need at least two replicas for replica exchange (option -multi)");
+    }
+
+    if (repl_ex_nex < 0)
+    {
+        gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
+    }
+
+    if (nmultisim >= 1)
+    {
+#if !GMX_THREAD_MPI
+        gmx_bool bParFn = (multidir == NULL);
+        init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
+#else
+        gmx_fatal(FARGS, "mdrun -multi or -multidir are not supported with the thread-MPI library. "
+                  "Please compile GROMACS with a proper external MPI library.");
+#endif
+    }
+
+    handleRestart(cr, bTryToAppendFiles, NFILE, fnm,
+                  &bDoAppendFiles, &bStartFromCpt);
+
+    Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0;
+    Flags = Flags | (bDDBondCheck  ? MD_DDBONDCHECK  : 0);
+    Flags = Flags | (bDDBondComm   ? MD_DDBONDCOMM   : 0);
+    Flags = Flags | (bTunePME      ? MD_TUNEPME      : 0);
+    Flags = Flags | (bConfout      ? MD_CONFOUT      : 0);
+    Flags = Flags | (bRerunVSite   ? MD_RERUN_VSITE  : 0);
+    Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
+    Flags = Flags | (bDoAppendFiles ? MD_APPENDFILES  : 0);
+    Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0);
+    Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0);
+    Flags = Flags | (bStartFromCpt ? MD_STARTFROMCPT : 0);
+    Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0);
+    Flags = Flags | (opt2parg_bSet("-ntomp", asize(pa), pa) ? MD_NTOMPSET : 0);
+    Flags = Flags | (bIMDwait      ? MD_IMDWAIT      : 0);
+    Flags = Flags | (bIMDterm      ? MD_IMDTERM      : 0);
+    Flags = Flags | (bIMDpull      ? MD_IMDPULL      : 0);
+
+    /* We postpone opening the log file if we are appending, so we can
+       first truncate the old log file and append to the correct position
+       there instead.  */
+    if (MASTER(cr) && !bDoAppendFiles)
+    {
+        gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr,
+                     Flags & MD_APPENDFILES, &fplog);
+    }
+    else
+    {
+        fplog = NULL;
+    }
+
+    ddxyz[XX] = (int)(realddxyz[XX] + 0.5);
+    ddxyz[YY] = (int)(realddxyz[YY] + 0.5);
+    ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5);
+
+    rc = gmx::mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose,
+                       nstglobalcomm, ddxyz, dd_rank_order, npme, rdd, rconstr,
+                       dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz,
+                       nbpu_opt[0], nstlist,
+                       nsteps, nstepout, resetstep,
+                       nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                       pforce, cpt_period, max_hours, imdport, Flags);
+
+    /* Log file has to be closed in mdrunner if we are appending to it
+       (fplog not set here) */
+    if (MASTER(cr) && !bDoAppendFiles)
+    {
+        gmx_log_close(fplog);
+    }
+
+    return rc;
+}
diff --git a/patches/gromacs-2016-beta1.diff/src/programs/mdrun/repl_ex.cpp b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/repl_ex.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2b870b96d1ec09aa25e998809568da441d706738
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/repl_ex.cpp
@@ -0,0 +1,1479 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "gmxpre.h"
+
+#include "repl_ex.h"
+
+#include "config.h"
+
+#include <math.h>
+
+#include <random>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/main.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/random/threefry.h"
+#include "gromacs/random/uniformintdistribution.h"
+#include "gromacs/random/uniformrealdistribution.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/smalloc.h"
+
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+/* END PLUMED */
+
+#define PROBABILITYCUTOFF 100
+/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
+
+//! Rank in the multisimulation
+#define MSRANK(ms, nodeid)  (nodeid)
+
+enum {
+    ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR
+};
+const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"};
+/* end_single_marker merely notes the end of single variable replica exchange. All types higher than
+   it are multiple replica exchange methods */
+/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?;
+   Let's wait until we feel better about the pressure control methods giving exact ensembles.  Right now, we assume constant pressure  */
+
+typedef struct gmx_repl_ex
+{
+    int       repl;        /* replica ID */
+    int       nrepl;       /* total number of replicas */
+    real      temp;        /* temperature */
+    int       type;        /* replica exchange type from ere enum */
+    real    **q;           /* quantity, e.g. temperature or lambda; first index is ere, second index is replica ID */
+    gmx_bool  bNPT;        /* use constant pressure and temperature */
+    real     *pres;        /* replica pressures */
+    int      *ind;         /* replica indices */
+    int      *allswaps;    /* used for keeping track of all the replica swaps */
+    int       nst;         /* replica exchange interval (number of steps) */
+    int       nex;         /* number of exchanges per interval */
+    int       seed;        /* random seed */
+    int       nattempt[2]; /* number of even and odd replica change attempts */
+    real     *prob_sum;    /* sum of probabilities */
+    int     **nmoves;      /* number of moves between replicas i and j */
+    int      *nexchange;   /* i-th element of the array is the number of exchanges between replica i-1 and i */
+
+    /* these are helper arrays for replica exchange; allocated here so they
+       don't have to be allocated each time */
+    int      *destinations;
+    int     **cyclic;
+    int     **order;
+    int      *tmpswap;
+    gmx_bool *incycle;
+    gmx_bool *bEx;
+
+    /* helper arrays to hold the quantities that are exchanged */
+    real  *prob;
+    real  *Epot;
+    real  *beta;
+    real  *Vol;
+    real **de;
+
+} t_gmx_repl_ex;
+
+static gmx_bool repl_quantity(const gmx_multisim_t *ms,
+                              struct gmx_repl_ex *re, int ere, real q)
+{
+    real    *qall;
+    gmx_bool bDiff;
+    int      s;
+
+    snew(qall, ms->nsim);
+    qall[re->repl] = q;
+    gmx_sum_sim(ms->nsim, qall, ms);
+
+    /* PLUMED */
+    //bDiff = FALSE;
+    //for (s = 1; s < ms->nsim; s++)
+    //{
+    //    if (qall[s] != qall[0])
+    //    {
+            bDiff = TRUE;
+    //    }
+    //}
+    /* END PLUMED */
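+    /* With the comparison above disabled by the PLUMED patch, bDiff is always
+     * TRUE, so the exchanged quantity is registered even when all replicas
+     * share the same value (e.g. bias-exchange metadynamics, where every
+     * replica may run at the same temperature). */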
+
+    if (bDiff)
+    {
+        /* Set the replica exchange type and quantities */
+        re->type = ere;
+
+        snew(re->q[ere], re->nrepl);
+        for (s = 0; s < ms->nsim; s++)
+        {
+            re->q[ere][s] = qall[s];
+        }
+    }
+    sfree(qall);
+    return bDiff;
+}
+
+gmx_repl_ex_t init_replica_exchange(FILE *fplog,
+                                    const gmx_multisim_t *ms,
+                                    const t_state *state,
+                                    const t_inputrec *ir,
+                                    int nst, int nex, int init_seed)
+{
+    real                pres;
+    int                 i, j, k;
+    struct gmx_repl_ex *re;
+    gmx_bool            bTemp;
+    gmx_bool            bLambda = FALSE;
+
+    fprintf(fplog, "\nInitializing Replica Exchange\n");
+
+    if (ms == NULL || ms->nsim == 1)
+    {
+        gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?");
+    }
+    if (!EI_DYNAMICS(ir->eI))
+    {
+        gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations");
+        /* Note that PAR(cr) is defined by cr->nnodes > 1, which is
+         * distinct from MULTISIM(cr). A multi-simulation only runs
+         * with real MPI parallelism, but this does not imply PAR(cr)
+         * is true!
+         *
+         * Since we are using a dynamical integrator, the only
+         * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are
+         * synonymous. The only way for cr->nnodes > 1 to be true is
+         * if we are using DD. */
+    }
+
+    snew(re, 1);
+
+    re->repl     = ms->sim;
+    re->nrepl    = ms->nsim;
+    snew(re->q, ereENDSINGLE);
+
+    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
+
+    check_multi_int(fplog, ms, state->natoms, "the number of atoms", FALSE);
+    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
+    check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE);
+    check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst,
+                      "first exchange step: init_step/-replex", FALSE);
+    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
+    check_multi_int(fplog, ms, ir->opts.ngtc,
+                    "the number of temperature coupling groups", FALSE);
+    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
+    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
+    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
+
+    re->temp = ir->opts.ref_t[0];
+    for (i = 1; (i < ir->opts.ngtc); i++)
+    {
+        if (ir->opts.ref_t[i] != re->temp)
+        {
+            fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+            fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+        }
+    }
+
+    re->type = -1;
+    bTemp    = repl_quantity(ms, re, ereTEMP, re->temp);
+    if (ir->efep != efepNO)
+    {
+        bLambda = repl_quantity(ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state);
+    }
+    if (re->type == -1)  /* nothing was assigned */
+    {
+        gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl);
+    }
+    if (bLambda && bTemp)
+    {
+        re->type = ereTL;
+    }
+
+    if (bTemp)
+    {
+        please_cite(fplog, "Sugita1999a");
+        if (ir->epc != epcNO)
+        {
+            re->bNPT = TRUE;
+            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
+            please_cite(fplog, "Okabe2001a");
+        }
+        if (ir->etc == etcBERENDSEN)
+        {
+            gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
+                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
+        }
+    }
+    if (bLambda)
+    {
+        if (ir->fepvals->delta_lambda != 0)   /* check this? */
+        {
+            gmx_fatal(FARGS, "delta_lambda is not zero");
+        }
+    }
+    if (re->bNPT)
+    {
+        snew(re->pres, re->nrepl);
+        if (ir->epct == epctSURFACETENSION)
+        {
+            pres = ir->ref_p[ZZ][ZZ];
+        }
+        else
+        {
+            pres = 0;
+            j    = 0;
+            for (i = 0; i < DIM; i++)
+            {
+                if (ir->compress[i][i] != 0)
+                {
+                    pres += ir->ref_p[i][i];
+                    j++;
+                }
+            }
+            pres /= j;
+        }
+        re->pres[re->repl] = pres;
+        gmx_sum_sim(re->nrepl, re->pres, ms);
+    }
+
+    /* Make an index for increasing replica order */
+    /* only makes sense if one or the other is varying, not both!
+       if both are varying, we trust the order the person gave. */
+    snew(re->ind, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->ind[i] = i;
+    }
+
+    /* PLUMED */
+    // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD)
+    // in those cases replicas can share the same temperature.
+    /*
+    if (re->type < ereENDSINGLE)
+    {
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = i+1; j < re->nrepl; j++)
+            {
+                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
+                {*/
+                    /* Unordered replicas are supposed to work, but there
+                     * is still an issue somewhere.
+                     * Note that at this point still re->ind[i]=i.
+                     */
+                 /*
+                    gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s",
+                              i, j,
+                              erename[re->type],
+                              re->q[re->type][i], re->q[re->type][j],
+                              erename[re->type]);
+
+                    k          = re->ind[i];
+                    re->ind[i] = re->ind[j];
+                    re->ind[j] = k;
+                }
+                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
+                {
+                    gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
+                }
+            }
+        }
+    }
+    */
+    /* END PLUMED */
+
+    /* keep track of all the swaps, starting with the initial placement. */
+    snew(re->allswaps, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->allswaps[i] = re->ind[i];
+    }
+
+    switch (re->type)
+    {
+        case ereTEMP:
+            fprintf(fplog, "\nReplica exchange in temperature\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereLAMBDA:
+            fprintf(fplog, "\nReplica exchange in lambda\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereTL:
+            fprintf(fplog, "\nReplica exchange in temperature and lambda state\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (re->bNPT)
+    {
+        fprintf(fplog, "\nRepl  p");
+        for (i = 0; i < re->nrepl; i++)
+        {
+            fprintf(fplog, " %5.2f", re->pres[re->ind[i]]);
+        }
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]]))
+            {
+                fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+                fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+            }
+        }
+    }
+    re->nst = nst;
+    if (init_seed == -1)
+    {
+        if (MASTERSIM(ms))
+        {
+            re->seed = static_cast<int>(gmx::makeRandomSeed());
+        }
+        else
+        {
+            re->seed = 0;
+        }
+        gmx_sumi_sim(1, &(re->seed), ms);
+    }
+    else
+    {
+        re->seed = init_seed;
+    }
+    fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst);
+    fprintf(fplog, "\nReplica random seed: %d\n", re->seed);
+
+    re->nattempt[0] = 0;
+    re->nattempt[1] = 0;
+
+    snew(re->prob_sum, re->nrepl);
+    snew(re->nexchange, re->nrepl);
+    snew(re->nmoves, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->nmoves[i], re->nrepl);
+    }
+    fprintf(fplog, "Replica exchange information below: ex and x = exchange, pr = probability\n");
+
+    /* generate space for the helper arrays so we don't have to snew each time */
+
+    snew(re->destinations, re->nrepl);
+    snew(re->incycle, re->nrepl);
+    snew(re->tmpswap, re->nrepl);
+    snew(re->cyclic, re->nrepl);
+    snew(re->order, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->cyclic[i], re->nrepl+1);
+        snew(re->order[i], re->nrepl);
+    }
+    /* allocate space for the arrays storing the data for the replicas */
+    /* not all of these arrays are needed in all cases, but they don't take
+       up much space, since the max size is nrepl**2 */
+    snew(re->prob, re->nrepl);
+    snew(re->bEx, re->nrepl);
+    snew(re->beta, re->nrepl);
+    snew(re->Vol, re->nrepl);
+    snew(re->Epot, re->nrepl);
+    snew(re->de, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->de[i], re->nrepl);
+    }
+    re->nex = nex;
+    return re;
+}
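+/* Note on the seed handling in init_replica_exchange() above: when init_seed
+ * is -1 only the master simulation draws a random seed and gmx_sumi_sim()
+ * spreads it (all other replicas contribute 0), so every replica ends up with
+ * the same re->seed and therefore produces identical ThreeFry random streams
+ * when deciding exchanges in test_for_replica_exchange(). */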
+
+static void exchange_reals(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, real *v, int n)
+{
+    real *buf;
+    int   i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v,  n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+           buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            v[i] = buf[i];
+        }
+        sfree(buf);
+    }
+}
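+/* The Isend/Recv/Wait sequence above is functionally equivalent to the
+ * commented-out MPI_Sendrecv(): each of the two exchanging master ranks posts
+ * a non-blocking send of its own buffer, blocks on the receive of the
+ * partner's data, and then completes its own send before overwriting v. */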
+
+
+static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n)
+{
+    double *buf;
+    int     i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v,  n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+           buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            v[i] = buf[i];
+        }
+        sfree(buf);
+    }
+}
+
+static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n)
+{
+    rvec *buf;
+    int   i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v[0],  n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+           buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            copy_rvec(buf[i], v[i]);
+        }
+        sfree(buf);
+    }
+}
+
+static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state)
+{
+    /* When t_state changes, this code should be updated. */
+    int ngtc, nnhpres;
+    ngtc    = state->ngtc * state->nhchainlength;
+    nnhpres = state->nnhpres* state->nhchainlength;
+    exchange_rvecs(ms, b, state->box, DIM);
+    exchange_rvecs(ms, b, state->box_rel, DIM);
+    exchange_rvecs(ms, b, state->boxv, DIM);
+    exchange_reals(ms, b, &(state->veta), 1);
+    exchange_reals(ms, b, &(state->vol0), 1);
+    exchange_rvecs(ms, b, state->svir_prev, DIM);
+    exchange_rvecs(ms, b, state->fvir_prev, DIM);
+    exchange_rvecs(ms, b, state->pres_prev, DIM);
+    exchange_doubles(ms, b, state->nosehoover_xi, ngtc);
+    exchange_doubles(ms, b, state->nosehoover_vxi, ngtc);
+    exchange_doubles(ms, b, state->nhpres_xi, nnhpres);
+    exchange_doubles(ms, b, state->nhpres_vxi, nnhpres);
+    exchange_doubles(ms, b, state->therm_integral, state->ngtc);
+    exchange_rvecs(ms, b, state->x, state->natoms);
+    exchange_rvecs(ms, b, state->v, state->natoms);
+}
+
+static void copy_rvecs(rvec *s, rvec *d, int n)
+{
+    int i;
+
+    if (d != NULL)
+    {
+        for (i = 0; i < n; i++)
+        {
+            copy_rvec(s[i], d[i]);
+        }
+    }
+}
+
+static void copy_doubles(const double *s, double *d, int n)
+{
+    int i;
+
+    if (d != NULL)
+    {
+        for (i = 0; i < n; i++)
+        {
+            d[i] = s[i];
+        }
+    }
+}
+
+static void copy_reals(const real *s, real *d, int n)
+{
+    int i;
+
+    if (d != NULL)
+    {
+        for (i = 0; i < n; i++)
+        {
+            d[i] = s[i];
+        }
+    }
+}
+
+static void copy_ints(const int *s, int *d, int n)
+{
+    int i;
+
+    if (d != NULL)
+    {
+        for (i = 0; i < n; i++)
+        {
+            d[i] = s[i];
+        }
+    }
+}
+
+#define scopy_rvecs(v, n)   copy_rvecs(state->v, state_local->v, n);
+#define scopy_doubles(v, n) copy_doubles(state->v, state_local->v, n);
+#define scopy_reals(v, n) copy_reals(state->v, state_local->v, n);
+#define scopy_ints(v, n)   copy_ints(state->v, state_local->v, n);
+
+static void copy_state_nonatomdata(t_state *state, t_state *state_local)
+{
+    /* When t_state changes, this code should be updated. */
+    int ngtc, nnhpres;
+    ngtc    = state->ngtc * state->nhchainlength;
+    nnhpres = state->nnhpres* state->nhchainlength;
+    scopy_rvecs(box, DIM);
+    scopy_rvecs(box_rel, DIM);
+    scopy_rvecs(boxv, DIM);
+    state_local->veta = state->veta;
+    state_local->vol0 = state->vol0;
+    scopy_rvecs(svir_prev, DIM);
+    scopy_rvecs(fvir_prev, DIM);
+    scopy_rvecs(pres_prev, DIM);
+    scopy_doubles(nosehoover_xi, ngtc);
+    scopy_doubles(nosehoover_vxi, ngtc);
+    scopy_doubles(nhpres_xi, nnhpres);
+    scopy_doubles(nhpres_vxi, nnhpres);
+    scopy_doubles(therm_integral, state->ngtc);
+    scopy_rvecs(x, state->natoms);
+    scopy_rvecs(v, state->natoms);
+    copy_ints(&(state->fep_state), &(state_local->fep_state), 1);
+    scopy_reals(lambda, efptNR);
+}
+
+static void scale_velocities(t_state *state, real fac)
+{
+    int i;
+
+    if (state->v)
+    {
+        for (i = 0; i < state->natoms; i++)
+        {
+            svmul(fac, state->v[i], state->v[i]);
+        }
+    }
+}
+
+static void print_transition_matrix(FILE *fplog, int n, int **nmoves, int *nattempt)
+{
+    int   i, j, ntot;
+    float Tprint;
+
+    ntot = nattempt[0] + nattempt[1];
+    fprintf(fplog, "\n");
+    fprintf(fplog, "Repl");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "    ");  /* put the title closer to the center */
+    }
+    fprintf(fplog, "Empirical Transition Matrix\n");
+
+    fprintf(fplog, "Repl");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%8d", (i+1));
+    }
+    fprintf(fplog, "\n");
+
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "Repl");
+        for (j = 0; j < n; j++)
+        {
+            Tprint = 0.0;
+            if (nmoves[i][j] > 0)
+            {
+                Tprint = nmoves[i][j]/(2.0*ntot);
+            }
+            fprintf(fplog, "%8.4f", Tprint);
+        }
+        fprintf(fplog, "%3d\n", i);
+    }
+}
+
+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx)
+{
+    int i;
+
+    fprintf(fplog, "Repl %2s %2d", leg, ind[0]);
+    for (i = 1; i < n; i++)
+    {
+        fprintf(fplog, " %c %2d", (bEx != 0 && bEx[i]) ? 'x' : ' ', ind[i]);
+    }
+    fprintf(fplog, "\n");
+}
+
+static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap)
+{
+    int i;
+
+    for (i = 0; i < n; i++)
+    {
+        tmpswap[i] = allswaps[i];
+    }
+    for (i = 0; i < n; i++)
+    {
+        allswaps[i] = tmpswap[pind[i]];
+    }
+
+    fprintf(fplog, "\nAccepted Exchanges:   ");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%d ", pind[i]);
+    }
+    fprintf(fplog, "\n");
+
+    /* the "Order After Exchange" is the state label corresponding to the configuration that
+       started in state listed in order, i.e.
+
+       3 0 1 2
+
+       means that the:
+       configuration starting in simulation 3 is now in simulation 0,
+       configuration starting in simulation 0 is now in simulation 1,
+       configuration starting in simulation 1 is now in simulation 2,
+       configuration starting in simulation 2 is now in simulation 3
+     */
+    fprintf(fplog, "Order After Exchange: ");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%d ", allswaps[i]);
+    }
+    fprintf(fplog, "\n\n");
+}
+
+static void print_prob(FILE *fplog, const char *leg, int n, real *prob)
+{
+    int  i;
+    char buf[8];
+
+    fprintf(fplog, "Repl %2s ", leg);
+    for (i = 1; i < n; i++)
+    {
+        if (prob[i] >= 0)
+        {
+            sprintf(buf, "%4.2f", prob[i]);
+            fprintf(fplog, "  %3s", buf[0] == '1' ? "1.0" : buf+1);
+        }
+        else
+        {
+            fprintf(fplog, "     ");
+        }
+    }
+    fprintf(fplog, "\n");
+}
+
+static void print_count(FILE *fplog, const char *leg, int n, int *count)
+{
+    int i;
+
+    fprintf(fplog, "Repl %2s ", leg);
+    for (i = 1; i < n; i++)
+    {
+        fprintf(fplog, " %4d", count[i]);
+    }
+    fprintf(fplog, "\n");
+}
+
+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp)
+{
+
+    real   ediff, dpV, delta = 0;
+    real  *Epot = re->Epot;
+    real  *Vol  = re->Vol;
+    real **de   = re->de;
+    real  *beta = re->beta;
+
+    /* Two cases; we are permuted and not.  In all cases, setting ap = a and bp = b will reduce
+       to the non permuted case */
+
+    switch (re->type)
+    {
+        case ereTEMP:
+            /*
+             * Okabe et al. Chem. Phys. Lett. 335 (2001) 435-439
+             */
+            ediff = Epot[b] - Epot[a];
+            delta = -(beta[bp] - beta[ap])*ediff;
+            break;
+        case ereLAMBDA:
+            /* two cases:  when we are permuted, and not.  */
+            /* non-permuted:
+               ediff =  E_new - E_old
+                     =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
+                     =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
+                     =  de[b][a] + de[a][b] */
+
+            /* permuted:
+               ediff =  E_new - E_old
+                     =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
+                     =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - [H_bp(x_b) - H_b(x_b)]
+                     =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
+            /* but, in the current code implementation, we flip configurations, not indices . . .
+               So let's examine that.
+                     =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - [H_b(x_bp) - H_b(x_b)]
+                     =  [H_b(x_ap) - H_a(x_ap)]  + [H_a(x_bp) - H_b(x_bp)]
+                     = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
+                     So, if we exchange b<=> bp and a<=> ap, we return to the same result.
+                     So the simple solution is to flip the
+                     position of perturbed and original indices in the tests.
+             */
+
+            ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
+            delta = ediff*beta[a]; /* assume all same temperature in this case */
+            break;
+        case ereTL:
+            /* not permuted:  */
+            /* delta =  reduced E_new - reduced E_old
+                     =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
+                     =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
+                     =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
+                        [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) +
+                        beta_b (H_a(x_a) - H_b(x_b)) - beta_a (H_a(x_a) - H_b(x_b))
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a)) */
+            /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]); */
+            /* permuted (big breath!) */
+            /*   delta =  reduced E_new - reduced E_old
+                     =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                        - beta_bp H_a(x_a) + beta_ap H_a(x_a) + beta_bp H_a(x_a) - beta_ap H_a(x_a)
+                        - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                     =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
+                        [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
+                        + beta_bp H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                     =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
+                        [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
+                        + beta_bp (H_a(x_a) - H_b(x_b)) - beta_ap (H_a(x_a) - H_b(x_b))
+                     =  (beta_bp de[bp][a] - beta_ap de[ap][a]) + (beta_ap de[ap][b] - beta_bp de[bp][b])
+                        + (beta_bp - beta_ap)(H_a(x_a) - H_b(x_b))  */
+            delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]);
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (bPrint)
+    {
+        fprintf(fplog, "Repl %d <-> %d  dE_term = %10.3e (kT)\n", a, b, delta);
+    }
+    if (re->bNPT)
+    {
+        /* revisit the calculation for 5.0; there might be some improvements. */
+        dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
+        if (bPrint)
+        {
+            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
+        }
+        delta += dpV;
+    }
+    return delta;
+}
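+/* The delta returned by calc_delta() feeds a standard Metropolis test in
+ * test_for_replica_exchange() below:
+ *   delta <= 0                  -> accept, prob = 1
+ *   delta >  PROBABILITYCUTOFF  -> reject, prob = 0
+ *   otherwise                   -> accept with prob = exp(-delta)
+ * so for plain temperature exchange the acceptance probability is
+ * min(1, exp((beta_b - beta_a)*(Epot_b - Epot_a))). */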
+
+static void
+test_for_replica_exchange(FILE                 *fplog,
+                          const gmx_multisim_t *ms,
+                          struct gmx_repl_ex   *re,
+                          gmx_enerdata_t       *enerd,
+                          real                  vol,
+                          gmx_int64_t           step,
+                          real                  time)
+{
+    int                                  m, i, j, a, b, ap, bp, i0, i1, tmp;
+    real                                 delta = 0;
+    gmx_bool                             bPrint, bMultiEx;
+    gmx_bool                            *bEx      = re->bEx;
+    real                                *prob     = re->prob;
+    int                                 *pind     = re->destinations; /* permuted index */
+    gmx_bool                             bEpot    = FALSE;
+    gmx_bool                             bDLambda = FALSE;
+    gmx_bool                             bVol     = FALSE;
+    gmx::ThreeFry2x64<64>                rng(re->seed, gmx::RandomDomain::ReplicaExchange);
+    gmx::UniformRealDistribution<real>   uniformRealDist;
+    gmx::UniformIntDistribution<int>     uniformNreplDist(0, re->nrepl-1);
+
+    bMultiEx = (re->nex > 1);  /* multiple exchanges at each state */
+    fprintf(fplog, "Replica exchange at step %" GMX_PRId64 " time %.5f\n", step, time);
+
+    if (re->bNPT)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Vol[i] = 0;
+        }
+        bVol               = TRUE;
+        re->Vol[re->repl]  = vol;
+    }
+    if ((re->type == ereTEMP || re->type == ereTL))
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Epot[i] = 0;
+        }
+        bEpot              = TRUE;
+        re->Epot[re->repl] = enerd->term[F_EPOT];
+        /* temperatures of different states */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ);
+        }
+    }
+    else
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->temp*BOLTZ);  /* we have a single temperature */
+        }
+    }
+    if (re->type == ereLAMBDA || re->type == ereTL)
+    {
+        bDLambda = TRUE;
+        /* lambda differences. */
+        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
+           minus the energy of the jth simulation in the jth Hamiltonian */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->de[i][j] = 0;
+            }
+        }
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]);
+        }
+    }
+
+    /* now actually do the communication */
+    if (bVol)
+    {
+        gmx_sum_sim(re->nrepl, re->Vol, ms);
+    }
+    if (bEpot)
+    {
+        gmx_sum_sim(re->nrepl, re->Epot, ms);
+    }
+    if (bDLambda)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            gmx_sum_sim(re->nrepl, re->de[i], ms);
+        }
+    }
+
+    /* make a duplicate set of indices for shuffling */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        pind[i] = re->ind[i];
+    }
+
+    /* PLUMED */
+    int plumed_test_exchange_pattern=0;
+    /* END PLUMED */
+
+    if (bMultiEx)
+    {
+        /* multiple random switch exchange */
+        int nself = 0;
+
+        rng.restart( step, 0 );
+
+        for (i = 0; i < re->nex + nself; i++)
+        {
+            /* randomly select a pair  */
+            /* in theory, could reduce this by identifying only which switches have a non-negligible
+               probability of occurring (log p > -100) and only operating on those switches */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work than useful. */
+            i0 = uniformNreplDist(rng);
+            i1 = uniformNreplDist(rng);
+            if (i0 == i1)
+            {
+                nself++;
+                continue;  /* self-exchange, back up and do it again */
+            }
+
+            a  = re->ind[i0]; /* what are the indices of these states? */
+            b  = re->ind[i1];
+            ap = pind[i0];
+            bp = pind[i1];
+
+            bPrint = FALSE; /* too noisy */
+            /* calculate the energy difference */
+            /* if the code changes to flip the STATES, rather than the configurations,
+               use the commented version of the code */
+            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+            delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
+
+            /* we actually only use the first space in the prob and bEx array,
+               since there are actually many switches between pairs. */
+
+            if (delta <= 0)
+            {
+                /* accepted */
+                prob[0] = 1;
+                bEx[0]  = TRUE;
+            }
+            else
+            {
+                if (delta > PROBABILITYCUTOFF)
+                {
+                    prob[0] = 0;
+                }
+                else
+                {
+                    prob[0] = exp(-delta);
+                }
+                /* roll a number to determine if accepted */
+                bEx[0] = uniformRealDist(rng) < prob[0];
+            }
+            re->prob_sum[0] += prob[0];
+
+            if (bEx[0])
+            {
+                /* swap the states */
+                tmp      = pind[i0];
+                pind[i0] = pind[i1];
+                pind[i1] = tmp;
+            }
+        }
+        re->nattempt[0]++;  /* keep track of total permutation trials here */
+        print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap);
+    }
+    else
+    {
+        /* standard nearest neighbor replica exchange */
+
+        m = (step / re->nst) % 2;
+        /* PLUMED */
+        if(plumedswitch){
+          int partner=re->repl;
+          plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern);
+          if(plumed_test_exchange_pattern>0){
+            int *list;
+            snew(list,re->nrepl);
+            plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl));
+            plumed_cmd(plumedmain,"getExchangesList",list);
+            for(i=0; i<re->nrepl; i++) re->ind[i]=list[i];
+            sfree(list);
+          }
+
+          for(i=1; i<re->nrepl; i++) {
+            if (i % 2 != m) continue;
+            a = re->ind[i-1];
+            b = re->ind[i];
+            if(re->repl==a) partner=b;
+            if(re->repl==b) partner=a;
+          }
+          plumed_cmd(plumedmain,"GREX setPartner",&partner);
+          plumed_cmd(plumedmain,"GREX calculate",NULL);
+          plumed_cmd(plumedmain,"GREX shareAllDeltaBias",NULL);
+        }
+        /* END PLUMED */
+        for (i = 1; i < re->nrepl; i++)
+        {
+            a = re->ind[i-1];
+            b = re->ind[i];
+
+            bPrint = (re->repl == a || re->repl == b);
+            if (i % 2 == m)
+            {
+                delta = calc_delta(fplog, bPrint, re, a, b, a, b);
+                /* PLUMED */
+                if(plumedswitch){
+                  real adb,bdb,dplumed;
+                  char buf[300];
+                  sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb);
+                  sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb);
+                  dplumed=adb*re->beta[a]+bdb*re->beta[b];
+                  delta+=dplumed;
+                  if (bPrint)
+                    fprintf(fplog,"dplumed = %10.3e  dE_Term = %10.3e (kT)\n",dplumed,delta);
+                }
+                /* END PLUMED */
+                if (delta <= 0)
+                {
+                    /* accepted */
+                    prob[i] = 1;
+                    bEx[i]  = TRUE;
+                }
+                else
+                {
+                    if (delta > PROBABILITYCUTOFF)
+                    {
+                        prob[i] = 0;
+                    }
+                    else
+                    {
+                        prob[i] = exp(-delta);
+                    }
+                    /* roll a number to determine if accepted */
+                    bEx[i] = uniformRealDist(rng) < prob[i];
+                }
+                re->prob_sum[i] += prob[i];
+
+                if (bEx[i])
+                {
+                  /* PLUMED */
+                  if(!plumed_test_exchange_pattern) {
+                    /* standard neighbour swapping */
+                    /* swap these two */
+                    tmp       = pind[i-1];
+                    pind[i-1] = pind[i];
+                    pind[i]   = tmp;
+                    re->nexchange[i]++;  /* statistics for back compatibility */
+                  } else {
+                    /* alternative swapping patterns */
+                    tmp       = pind[a];
+                    pind[a]   = pind[b];
+                    pind[b]   = tmp;
+                    re->nexchange[i]++;  /* statistics for back compatibility */
+                  }
+                  /* END PLUMED */
+                }
+            }
+            else
+            {
+                prob[i] = -1;
+                bEx[i]  = FALSE;
+            }
+        }
+        /* print some statistics */
+        print_ind(fplog, "ex", re->nrepl, re->ind, bEx);
+        print_prob(fplog, "pr", re->nrepl, prob);
+        fprintf(fplog, "\n");
+        re->nattempt[m]++;
+    }
+
+    /* PLUMED */
+    if(plumed_test_exchange_pattern>0) {
+      for (i = 0; i < re->nrepl; i++)
+      {
+          re->ind[i] = i;
+      }
+    }
+    /* END PLUMED */
+
+    /* record which moves were made and accepted */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->nmoves[re->ind[i]][pind[i]] += 1;
+        re->nmoves[pind[i]][re->ind[i]] += 1;
+    }
+    fflush(fplog); /* make sure we can see what the last exchange was */
+}
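+/* Summary of the PLUMED additions in test_for_replica_exchange() above: before
+ * the neighbor loop, GREX can replace re->ind with an alternative exchange
+ * pattern provided by PLUMED (getExchangesFlag/getExchangesList, used e.g. by
+ * bias-exchange metadynamics) and shares the bias differences across replicas;
+ * inside the loop the bias term dplumed = adb*beta[a] + bdb*beta[b] is added
+ * to delta before the Metropolis test. */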
+
+static void
+cyclic_decomposition(const int *destinations,
+                     int      **cyclic,
+                     gmx_bool  *incycle,
+                     const int  nrepl,
+                     int       *nswap)
+{
+
+    int i, j, c, p;
+    int maxlen = 1;
+    for (i = 0; i < nrepl; i++)
+    {
+        incycle[i] = FALSE;
+    }
+    for (i = 0; i < nrepl; i++)  /* one cycle for each replica */
+    {
+        if (incycle[i])
+        {
+            cyclic[i][0] = -1;
+            continue;
+        }
+        cyclic[i][0] = i;
+        incycle[i]   = TRUE;
+        c            = 1;
+        p            = i;
+        for (j = 0; j < nrepl; j++) /* a cycle could in principle contain all replicas, but we break out as soon as it closes */
+        {
+            p = destinations[p];    /* start permuting */
+            if (p == i)
+            {
+                cyclic[i][c] = -1;
+                if (c > maxlen)
+                {
+                    maxlen = c;
+                }
+                break; /* we've reached the original element, the cycle is complete, and we marked the end. */
+            }
+            else
+            {
+                cyclic[i][c] = p;  /* each permutation gives a new member of the cycle */
+                incycle[p]   = TRUE;
+                c++;
+            }
+        }
+    }
+    *nswap = maxlen - 1;
+
+    if (debug)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Cycle %d:", i);
+            for (j = 0; j < nrepl; j++)
+            {
+                if (cyclic[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", cyclic[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
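+/* Example: with destinations = {1, 0, 3, 2} (two accepted neighbor swaps), the
+ * decomposition above yields the two 2-cycles (0 1) and (2 3), i.e.
+ * cyclic[0] = {0, 1, -1} and cyclic[2] = {2, 3, -1}, and *nswap = 1: a single
+ * round of pairwise exchanges realizes the whole permutation. */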
+
+static void
+compute_exchange_order(int     **cyclic,
+                       int     **order,
+                       const int nrepl,
+                       const int maxswap)
+{
+    int i, j;
+
+    for (j = 0; j < maxswap; j++)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            if (cyclic[i][j+1] >= 0)
+            {
+                order[cyclic[i][j+1]][j] = cyclic[i][j];
+                order[cyclic[i][j]][j]   = cyclic[i][j+1];
+            }
+        }
+        for (i = 0; i < nrepl; i++)
+        {
+            if (order[i][j] < 0)
+            {
+                order[i][j] = i; /* if it's not exchanging, it should stay this round*/
+            }
+        }
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "Replica Exchange Order\n");
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Replica %d:", i);
+            for (j = 0; j < maxswap; j++)
+            {
+                if (order[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", order[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
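+/* Continuing the example above, compute_exchange_order() turns those cycles
+ * into per-round exchange partners: order[0][0] = 1, order[1][0] = 0,
+ * order[2][0] = 3, order[3][0] = 2, so in the single round replica 0 swaps
+ * its state with replica 1 and replica 2 swaps with replica 3. */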
+
+static void
+prepare_to_do_exchange(struct gmx_repl_ex *re,
+                       const int           replica_id,
+                       int                *maxswap,
+                       gmx_bool           *bThisReplicaExchanged)
+{
+    int i, j;
+    /* Hold the cyclic decomposition of the (multiple) replica
+     * exchange. */
+    gmx_bool bAnyReplicaExchanged = FALSE;
+    *bThisReplicaExchanged = FALSE;
+
+    for (i = 0; i < re->nrepl; i++)
+    {
+        if (re->destinations[i] != re->ind[i])
+        {
+            /* only mark as exchanged if the index has been shuffled */
+            bAnyReplicaExchanged = TRUE;
+            break;
+        }
+    }
+    if (bAnyReplicaExchanged)
+    {
+        /* reinitialize the placeholder arrays */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->cyclic[i][j] = -1;
+                re->order[i][j]  = -1;
+            }
+        }
+
+        /* Identify the cyclic decomposition of the permutation (very
+         * fast if neighbor replica exchange). */
+        cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap);
+
+        /* Now translate the decomposition into a replica exchange
+         * order at each step. */
+        compute_exchange_order(re->cyclic, re->order, re->nrepl, *maxswap);
+
+        /* Did this replica do any exchange at any point? */
+        for (j = 0; j < *maxswap; j++)
+        {
+            if (replica_id != re->order[replica_id][j])
+            {
+                *bThisReplicaExchanged = TRUE;
+                break;
+            }
+        }
+    }
+}
+
+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re,
+                          t_state *state, gmx_enerdata_t *enerd,
+                          t_state *state_local, gmx_int64_t step, real time)
+{
+    int j;
+    int replica_id = 0;
+    int exchange_partner;
+    int maxswap = 0;
+    /* Number of rounds of exchanges needed to deal with any multiple
+     * exchanges. */
+    /* Where each replica ends up after the exchange attempt(s). */
+    /* The order in which multiple exchanges will occur. */
+    gmx_bool bThisReplicaExchanged = FALSE;
+
+    /* PLUMED */
+    if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",NULL);
+    /* END PLUMED */
+
+    if (MASTER(cr))
+    {
+        replica_id  = re->repl;
+        test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time);
+        prepare_to_do_exchange(re, replica_id, &maxswap, &bThisReplicaExchanged);
+    }
+    /* Do intra-simulation broadcast so all processors belonging to
+     * each simulation know whether they need to participate in
+     * collecting the state. Otherwise, they might as well get on with
+     * the next thing to do. */
+    if (DOMAINDECOMP(cr))
+    {
+#if GMX_MPI
+        MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr),
+                  cr->mpi_comm_mygroup);
+#endif
+    }
+
+    if (bThisReplicaExchanged)
+    {
+        /* Exchange the states */
+        /* Collect the global state on the master node */
+        if (DOMAINDECOMP(cr))
+        {
+            dd_collect_state(cr->dd, state_local, state);
+        }
+        else
+        {
+            copy_state_nonatomdata(state_local, state);
+        }
+
+        if (MASTER(cr))
+        {
+            /* There will be only one swap cycle with standard replica
+             * exchange, but there may be multiple swap cycles if we
+             * allow multiple swaps. */
+
+            for (j = 0; j < maxswap; j++)
+            {
+                exchange_partner = re->order[replica_id][j];
+
+                if (exchange_partner != replica_id)
+                {
+                    /* Exchange the global states between the master nodes */
+                    if (debug)
+                    {
+                        fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
+                    }
+                    exchange_state(cr->ms, exchange_partner, state);
+                }
+            }
+            /* For temperature-type replica exchange, we need to scale
+             * the velocities. */
+            if (re->type == ereTEMP || re->type == ereTL)
+            {
+                scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]]));
+            }
+
+        }
+
+        /* With domain decomposition the global state is distributed later */
+        if (!DOMAINDECOMP(cr))
+        {
+            /* Copy the global state to the local state data structure */
+            copy_state_nonatomdata(state, state_local);
+        }
+    }
+
+    return bThisReplicaExchanged;
+}
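+
+/* Editor's illustrative note (not part of the original GROMACS sources): for
+ * temperature-type exchanges the velocities carried along with an exchanged
+ * configuration are rescaled by the square root of a temperature ratio (see
+ * the scale_velocities() call above) so that the kinetic energy is consistent
+ * with the ensemble the configuration now samples.  As a rough numeric
+ * example, for neighboring temperatures of 300 K and 310 K the factor is
+ * sqrt(300/310) ~= 0.984 in one direction and sqrt(310/300) ~= 1.016 in the
+ * other. */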
+
+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re)
+{
+    int  i;
+
+    fprintf(fplog, "\nReplica exchange statistics\n");
+
+    if (re->nex == 0)
+    {
+        fprintf(fplog, "Repl  %d attempts, %d odd, %d even\n",
+                re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]);
+
+        fprintf(fplog, "Repl  average probabilities:\n");
+        for (i = 1; i < re->nrepl; i++)
+        {
+            if (re->nattempt[i%2] == 0)
+            {
+                re->prob[i] = 0;
+            }
+            else
+            {
+                re->prob[i] =  re->prob_sum[i]/re->nattempt[i%2];
+            }
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        fprintf(fplog, "Repl  number of exchanges:\n");
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_count(fplog, "", re->nrepl, re->nexchange);
+
+        fprintf(fplog, "Repl  average number of exchanges:\n");
+        for (i = 1; i < re->nrepl; i++)
+        {
+            if (re->nattempt[i%2] == 0)
+            {
+                re->prob[i] = 0;
+            }
+            else
+            {
+                re->prob[i] =  ((real)re->nexchange[i])/re->nattempt[i%2];
+            }
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        fprintf(fplog, "\n");
+    }
+    /* print the transition matrix */
+    print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt);
+}
diff --git a/patches/gromacs-2016-beta1.diff/src/programs/mdrun/repl_ex.cpp.preplumed b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/repl_ex.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..ab12fc4f1789ddbc42674ed56b6268069fdc369a
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/repl_ex.cpp.preplumed
@@ -0,0 +1,1399 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "gmxpre.h"
+
+#include "repl_ex.h"
+
+#include "config.h"
+
+#include <math.h>
+
+#include <random>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/main.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/random/threefry.h"
+#include "gromacs/random/uniformintdistribution.h"
+#include "gromacs/random/uniformrealdistribution.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/smalloc.h"
+
+
+#define PROBABILITYCUTOFF 100
+/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
+
+//! Rank in the multisimulation
+#define MSRANK(ms, nodeid)  (nodeid)
+
+enum {
+    ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR
+};
+const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"};
+/* end_single_marker merely notes the end of single variable replica exchange. All types higher than
+   it are multiple replica exchange methods */
+/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?;
+   Let's wait until we feel better about the pressure control methods giving exact ensembles.  Right now, we assume constant pressure  */
+
+typedef struct gmx_repl_ex
+{
+    int       repl;        /* replica ID */
+    int       nrepl;       /* total number of replicas */
+    real      temp;        /* temperature */
+    int       type;        /* replica exchange type from ere enum */
+    real    **q;           /* quantity, e.g. temperature or lambda; first index is ere, second index is replica ID */
+    gmx_bool  bNPT;        /* use constant pressure and temperature */
+    real     *pres;        /* replica pressures */
+    int      *ind;         /* replica indices */
+    int      *allswaps;    /* used for keeping track of all the replica swaps */
+    int       nst;         /* replica exchange interval (number of steps) */
+    int       nex;         /* number of exchanges per interval */
+    int       seed;        /* random seed */
+    int       nattempt[2]; /* number of even and odd replica change attempts */
+    real     *prob_sum;    /* sum of probabilities */
+    int     **nmoves;      /* number of moves between replicas i and j */
+    int      *nexchange;   /* i-th element of the array is the number of exchanges between replica i-1 and i */
+
+    /* these are helper arrays for replica exchange; allocated here so they
+       don't have to be allocated each time */
+    int      *destinations;
+    int     **cyclic;
+    int     **order;
+    int      *tmpswap;
+    gmx_bool *incycle;
+    gmx_bool *bEx;
+
+    /* helper arrays to hold the quantities that are exchanged */
+    real  *prob;
+    real  *Epot;
+    real  *beta;
+    real  *Vol;
+    real **de;
+
+} t_gmx_repl_ex;
+
+static gmx_bool repl_quantity(const gmx_multisim_t *ms,
+                              struct gmx_repl_ex *re, int ere, real q)
+{
+    real    *qall;
+    gmx_bool bDiff;
+    int      s;
+
+    snew(qall, ms->nsim);
+    qall[re->repl] = q;
+    gmx_sum_sim(ms->nsim, qall, ms);
+
+    bDiff = FALSE;
+    for (s = 1; s < ms->nsim; s++)
+    {
+        if (qall[s] != qall[0])
+        {
+            bDiff = TRUE;
+        }
+    }
+
+    if (bDiff)
+    {
+        /* Set the replica exchange type and quantities */
+        re->type = ere;
+
+        snew(re->q[ere], re->nrepl);
+        for (s = 0; s < ms->nsim; s++)
+        {
+            re->q[ere][s] = qall[s];
+        }
+    }
+    sfree(qall);
+    return bDiff;
+}
+
+gmx_repl_ex_t init_replica_exchange(FILE *fplog,
+                                    const gmx_multisim_t *ms,
+                                    const t_state *state,
+                                    const t_inputrec *ir,
+                                    int nst, int nex, int init_seed)
+{
+    real                pres;
+    int                 i, j, k;
+    struct gmx_repl_ex *re;
+    gmx_bool            bTemp;
+    gmx_bool            bLambda = FALSE;
+
+    fprintf(fplog, "\nInitializing Replica Exchange\n");
+
+    if (ms == NULL || ms->nsim == 1)
+    {
+        gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?");
+    }
+    if (!EI_DYNAMICS(ir->eI))
+    {
+        gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations");
+        /* Note that PAR(cr) is defined by cr->nnodes > 1, which is
+         * distinct from MULTISIM(cr). A multi-simulation only runs
+         * with real MPI parallelism, but this does not imply PAR(cr)
+         * is true!
+         *
+         * Since we are using a dynamical integrator, the only
+         * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are
+         * synonymous. The only way for cr->nnodes > 1 to be true is
+         * if we are using DD. */
+    }
+
+    snew(re, 1);
+
+    re->repl     = ms->sim;
+    re->nrepl    = ms->nsim;
+    snew(re->q, ereENDSINGLE);
+
+    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
+
+    check_multi_int(fplog, ms, state->natoms, "the number of atoms", FALSE);
+    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
+    check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE);
+    check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst,
+                      "first exchange step: init_step/-replex", FALSE);
+    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
+    check_multi_int(fplog, ms, ir->opts.ngtc,
+                    "the number of temperature coupling groups", FALSE);
+    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
+    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
+    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
+
+    re->temp = ir->opts.ref_t[0];
+    for (i = 1; (i < ir->opts.ngtc); i++)
+    {
+        if (ir->opts.ref_t[i] != re->temp)
+        {
+            fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+            fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+        }
+    }
+
+    re->type = -1;
+    bTemp    = repl_quantity(ms, re, ereTEMP, re->temp);
+    if (ir->efep != efepNO)
+    {
+        bLambda = repl_quantity(ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state);
+    }
+    if (re->type == -1)  /* nothing was assigned */
+    {
+        gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl);
+    }
+    if (bLambda && bTemp)
+    {
+        re->type = ereTL;
+    }
+
+    if (bTemp)
+    {
+        please_cite(fplog, "Sugita1999a");
+        if (ir->epc != epcNO)
+        {
+            re->bNPT = TRUE;
+            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
+            please_cite(fplog, "Okabe2001a");
+        }
+        if (ir->etc == etcBERENDSEN)
+        {
+            gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
+                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
+        }
+    }
+    if (bLambda)
+    {
+        if (ir->fepvals->delta_lambda != 0)   /* check this? */
+        {
+            gmx_fatal(FARGS, "delta_lambda is not zero");
+        }
+    }
+    if (re->bNPT)
+    {
+        snew(re->pres, re->nrepl);
+        if (ir->epct == epctSURFACETENSION)
+        {
+            pres = ir->ref_p[ZZ][ZZ];
+        }
+        else
+        {
+            pres = 0;
+            j    = 0;
+            for (i = 0; i < DIM; i++)
+            {
+                if (ir->compress[i][i] != 0)
+                {
+                    pres += ir->ref_p[i][i];
+                    j++;
+                }
+            }
+            pres /= j;
+        }
+        re->pres[re->repl] = pres;
+        gmx_sum_sim(re->nrepl, re->pres, ms);
+    }
+
+    /* Make an index for increasing replica order */
+    /* only makes sense if one or the other is varying, not both!
+       if both are varying, we trust the order the person gave. */
+    snew(re->ind, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->ind[i] = i;
+    }
+
+    if (re->type < ereENDSINGLE)
+    {
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = i+1; j < re->nrepl; j++)
+            {
+                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
+                {
+                    /* Unordered replicas are supposed to work, but there
+                     * is still an issue somewhere.
+                     * Note that at this point we still have re->ind[i]=i.
+                     */
+                    gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s",
+                              i, j,
+                              erename[re->type],
+                              re->q[re->type][i], re->q[re->type][j],
+                              erename[re->type]);
+
+                    k          = re->ind[i];
+                    re->ind[i] = re->ind[j];
+                    re->ind[j] = k;
+                }
+                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
+                {
+                    gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
+                }
+            }
+        }
+    }
+
+    /* keep track of all the swaps, starting with the initial placement. */
+    snew(re->allswaps, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->allswaps[i] = re->ind[i];
+    }
+
+    switch (re->type)
+    {
+        case ereTEMP:
+            fprintf(fplog, "\nReplica exchange in temperature\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereLAMBDA:
+            fprintf(fplog, "\nReplica exchange in lambda\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereTL:
+            fprintf(fplog, "\nReplica exchange in temperature and lambda state\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (re->bNPT)
+    {
+        fprintf(fplog, "\nRepl  p");
+        for (i = 0; i < re->nrepl; i++)
+        {
+            fprintf(fplog, " %5.2f", re->pres[re->ind[i]]);
+        }
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]]))
+            {
+                fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+                fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+            }
+        }
+    }
+    re->nst = nst;
+    if (init_seed == -1)
+    {
+        if (MASTERSIM(ms))
+        {
+            re->seed = static_cast<int>(gmx::makeRandomSeed());
+        }
+        else
+        {
+            re->seed = 0;
+        }
+        gmx_sumi_sim(1, &(re->seed), ms);
+    }
+    else
+    {
+        re->seed = init_seed;
+    }
+    fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst);
+    fprintf(fplog, "\nReplica random seed: %d\n", re->seed);
+
+    re->nattempt[0] = 0;
+    re->nattempt[1] = 0;
+
+    snew(re->prob_sum, re->nrepl);
+    snew(re->nexchange, re->nrepl);
+    snew(re->nmoves, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->nmoves[i], re->nrepl);
+    }
+    fprintf(fplog, "Replica exchange information below: ex and x = exchange, pr = probability\n");
+
+    /* generate space for the helper arrays so we don't have to snew each time */
+
+    snew(re->destinations, re->nrepl);
+    snew(re->incycle, re->nrepl);
+    snew(re->tmpswap, re->nrepl);
+    snew(re->cyclic, re->nrepl);
+    snew(re->order, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->cyclic[i], re->nrepl+1);
+        snew(re->order[i], re->nrepl);
+    }
+    /* allocate space for the arrays storing the data for the replicas */
+    /* not all of these arrays needed in all cases, but they don't take
+       up much space, since the max size is nrepl**2 */
+    snew(re->prob, re->nrepl);
+    snew(re->bEx, re->nrepl);
+    snew(re->beta, re->nrepl);
+    snew(re->Vol, re->nrepl);
+    snew(re->Epot, re->nrepl);
+    snew(re->de, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->de[i], re->nrepl);
+    }
+    re->nex = nex;
+    return re;
+}
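+
+/* Editor's illustrative note (not part of the original GROMACS sources): when
+ * init_seed == -1 above, only the master simulation draws a random seed and
+ * gmx_sumi_sim() sums the values across simulations (all others contribute 0),
+ * so every replica ends up holding the same seed.  Because the random engine
+ * in test_for_replica_exchange() is re-created from this seed on every
+ * exchange attempt (and restarted from the step number for multiple-exchange
+ * moves), all simulation masters draw identical random numbers and therefore
+ * agree on which exchanges are accepted without extra communication. */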
+
+static void exchange_reals(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, real *v, int n)
+{
+    real *buf;
+    int   i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v,  n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+           buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            v[i] = buf[i];
+        }
+        sfree(buf);
+    }
+}
+
+
+static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n)
+{
+    double *buf;
+    int     i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v,  n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+           buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            v[i] = buf[i];
+        }
+        sfree(buf);
+    }
+}
+
+static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n)
+{
+    rvec *buf;
+    int   i;
+
+    if (v)
+    {
+        snew(buf, n);
+#if GMX_MPI
+        /*
+           MPI_Sendrecv(v[0],  n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+           buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+         */
+        {
+            MPI_Request mpi_req;
+
+            MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+                      ms->mpi_comm_masters, &mpi_req);
+            MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+        }
+#endif
+        for (i = 0; i < n; i++)
+        {
+            copy_rvec(buf[i], v[i]);
+        }
+        sfree(buf);
+    }
+}
+
+static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state)
+{
+    /* When t_state changes, this code should be updated. */
+    int ngtc, nnhpres;
+    ngtc    = state->ngtc * state->nhchainlength;
+    nnhpres = state->nnhpres* state->nhchainlength;
+    exchange_rvecs(ms, b, state->box, DIM);
+    exchange_rvecs(ms, b, state->box_rel, DIM);
+    exchange_rvecs(ms, b, state->boxv, DIM);
+    exchange_reals(ms, b, &(state->veta), 1);
+    exchange_reals(ms, b, &(state->vol0), 1);
+    exchange_rvecs(ms, b, state->svir_prev, DIM);
+    exchange_rvecs(ms, b, state->fvir_prev, DIM);
+    exchange_rvecs(ms, b, state->pres_prev, DIM);
+    exchange_doubles(ms, b, state->nosehoover_xi, ngtc);
+    exchange_doubles(ms, b, state->nosehoover_vxi, ngtc);
+    exchange_doubles(ms, b, state->nhpres_xi, nnhpres);
+    exchange_doubles(ms, b, state->nhpres_vxi, nnhpres);
+    exchange_doubles(ms, b, state->therm_integral, state->ngtc);
+    exchange_rvecs(ms, b, state->x, state->natoms);
+    exchange_rvecs(ms, b, state->v, state->natoms);
+}
+
+static void copy_rvecs(rvec *s, rvec *d, int n)
+{
+    int i;
+
+    if (d != NULL)
+    {
+        for (i = 0; i < n; i++)
+        {
+            copy_rvec(s[i], d[i]);
+        }
+    }
+}
+
+static void copy_doubles(const double *s, double *d, int n)
+{
+    int i;
+
+    if (d != NULL)
+    {
+        for (i = 0; i < n; i++)
+        {
+            d[i] = s[i];
+        }
+    }
+}
+
+static void copy_reals(const real *s, real *d, int n)
+{
+    int i;
+
+    if (d != NULL)
+    {
+        for (i = 0; i < n; i++)
+        {
+            d[i] = s[i];
+        }
+    }
+}
+
+static void copy_ints(const int *s, int *d, int n)
+{
+    int i;
+
+    if (d != NULL)
+    {
+        for (i = 0; i < n; i++)
+        {
+            d[i] = s[i];
+        }
+    }
+}
+
+#define scopy_rvecs(v, n)   copy_rvecs(state->v, state_local->v, n);
+#define scopy_doubles(v, n) copy_doubles(state->v, state_local->v, n);
+#define scopy_reals(v, n) copy_reals(state->v, state_local->v, n);
+#define scopy_ints(v, n)   copy_ints(state->v, state_local->v, n);
+
+static void copy_state_nonatomdata(t_state *state, t_state *state_local)
+{
+    /* When t_state changes, this code should be updated. */
+    int ngtc, nnhpres;
+    ngtc    = state->ngtc * state->nhchainlength;
+    nnhpres = state->nnhpres* state->nhchainlength;
+    scopy_rvecs(box, DIM);
+    scopy_rvecs(box_rel, DIM);
+    scopy_rvecs(boxv, DIM);
+    state_local->veta = state->veta;
+    state_local->vol0 = state->vol0;
+    scopy_rvecs(svir_prev, DIM);
+    scopy_rvecs(fvir_prev, DIM);
+    scopy_rvecs(pres_prev, DIM);
+    scopy_doubles(nosehoover_xi, ngtc);
+    scopy_doubles(nosehoover_vxi, ngtc);
+    scopy_doubles(nhpres_xi, nnhpres);
+    scopy_doubles(nhpres_vxi, nnhpres);
+    scopy_doubles(therm_integral, state->ngtc);
+    scopy_rvecs(x, state->natoms);
+    scopy_rvecs(v, state->natoms);
+    copy_ints(&(state->fep_state), &(state_local->fep_state), 1);
+    scopy_reals(lambda, efptNR);
+}
+
+static void scale_velocities(t_state *state, real fac)
+{
+    int i;
+
+    if (state->v)
+    {
+        for (i = 0; i < state->natoms; i++)
+        {
+            svmul(fac, state->v[i], state->v[i]);
+        }
+    }
+}
+
+static void print_transition_matrix(FILE *fplog, int n, int **nmoves, int *nattempt)
+{
+    int   i, j, ntot;
+    float Tprint;
+
+    ntot = nattempt[0] + nattempt[1];
+    fprintf(fplog, "\n");
+    fprintf(fplog, "Repl");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "    ");  /* put the title closer to the center */
+    }
+    fprintf(fplog, "Empirical Transition Matrix\n");
+
+    fprintf(fplog, "Repl");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%8d", (i+1));
+    }
+    fprintf(fplog, "\n");
+
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "Repl");
+        for (j = 0; j < n; j++)
+        {
+            Tprint = 0.0;
+            if (nmoves[i][j] > 0)
+            {
+                Tprint = nmoves[i][j]/(2.0*ntot);
+            }
+            fprintf(fplog, "%8.4f", Tprint);
+        }
+        fprintf(fplog, "%3d\n", i);
+    }
+}
+
+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx)
+{
+    int i;
+
+    fprintf(fplog, "Repl %2s %2d", leg, ind[0]);
+    for (i = 1; i < n; i++)
+    {
+        fprintf(fplog, " %c %2d", (bEx != 0 && bEx[i]) ? 'x' : ' ', ind[i]);
+    }
+    fprintf(fplog, "\n");
+}
+
+static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap)
+{
+    int i;
+
+    for (i = 0; i < n; i++)
+    {
+        tmpswap[i] = allswaps[i];
+    }
+    for (i = 0; i < n; i++)
+    {
+        allswaps[i] = tmpswap[pind[i]];
+    }
+
+    fprintf(fplog, "\nAccepted Exchanges:   ");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%d ", pind[i]);
+    }
+    fprintf(fplog, "\n");
+
+    /* the "Order After Exchange" is the state label corresponding to the configuration that
+       started in state listed in order, i.e.
+
+       3 0 1 2
+
+       means that the:
+       configuration starting in simulation 3 is now in simulation 0,
+       configuration starting in simulation 0 is now in simulation 1,
+       configuration starting in simulation 1 is now in simulation 2,
+       configuration starting in simulation 2 is now in simulation 3
+     */
+    fprintf(fplog, "Order After Exchange: ");
+    for (i = 0; i < n; i++)
+    {
+        fprintf(fplog, "%d ", allswaps[i]);
+    }
+    fprintf(fplog, "\n\n");
+}
+
+static void print_prob(FILE *fplog, const char *leg, int n, real *prob)
+{
+    int  i;
+    char buf[8];
+
+    fprintf(fplog, "Repl %2s ", leg);
+    for (i = 1; i < n; i++)
+    {
+        if (prob[i] >= 0)
+        {
+            sprintf(buf, "%4.2f", prob[i]);
+            fprintf(fplog, "  %3s", buf[0] == '1' ? "1.0" : buf+1);
+        }
+        else
+        {
+            fprintf(fplog, "     ");
+        }
+    }
+    fprintf(fplog, "\n");
+}
+
+static void print_count(FILE *fplog, const char *leg, int n, int *count)
+{
+    int i;
+
+    fprintf(fplog, "Repl %2s ", leg);
+    for (i = 1; i < n; i++)
+    {
+        fprintf(fplog, " %4d", count[i]);
+    }
+    fprintf(fplog, "\n");
+}
+
+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp)
+{
+
+    real   ediff, dpV, delta = 0;
+    real  *Epot = re->Epot;
+    real  *Vol  = re->Vol;
+    real **de   = re->de;
+    real  *beta = re->beta;
+
+    /* Two cases; we are permuted and not.  In all cases, setting ap = a and bp = b will reduce
+       to the non permuted case */
+
+    switch (re->type)
+    {
+        case ereTEMP:
+            /*
+             * Okabe et al. Chem. Phys. Lett. 335 (2001) 435-439
+             */
+            ediff = Epot[b] - Epot[a];
+            delta = -(beta[bp] - beta[ap])*ediff;
+            break;
+        case ereLAMBDA:
+            /* two cases:  when we are permuted, and not.  */
+            /* non-permuted:
+               ediff =  E_new - E_old
+                     =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
+                     =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
+                     =  de[b][a] + de[a][b] */
+
+            /* permuted:
+               ediff =  E_new - E_old
+                     =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
+                     =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)]
+                     =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
+            /* but, in the current code implementation, we flip configurations, not indices . . .
+               So let's examine that.
+                     =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)]
+                     =  [H_b(x_ap) - H_a(x_ap)]  + [H_a(x_bp) - H_b(x_bp)]
+                     = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
+                     So, if we exchange b<=> bp and a<=> ap, we return to the same result.
+                     So the simple solution is to flip the
+                     position of perturbed and original indices in the tests.
+             */
+
+            ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
+            delta = ediff*beta[a]; /* assume all same temperature in this case */
+            break;
+        case ereTL:
+            /* not permuted:  */
+            /* delta =  reduced E_new - reduced E_old
+                     =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
+                     =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
+                     =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
+                        [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
+                     =  [beta_b dH_b(x_a) + [beta_a dH_a(x_b) +
+                        beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b))
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */
+            /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */
+            /* permuted (big breath!) */
+            /*   delta =  reduced E_new - reduced E_old
+                     =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                        - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a)
+                        - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                     =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
+                        [(beta_ap H_ap(x_b)  - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
+             + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                     =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
+                        [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
+             + beta_pb (H_a(x_a) - H_b(x_b))  - beta_ap (H_a(x_a) - H_b(x_b))
+                     =  ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b]  - beta_bp de[bp][b])
+             + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b))  */
+            delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]);
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (bPrint)
+    {
+        fprintf(fplog, "Repl %d <-> %d  dE_term = %10.3e (kT)\n", a, b, delta);
+    }
+    if (re->bNPT)
+    {
+        /* revisit the calculation for 5.0.  Might be some improvements. */
+        dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
+        if (bPrint)
+        {
+            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
+        }
+        delta += dpV;
+    }
+    return delta;
+}
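+
+#if 0
+/* Editor's illustrative sketch (not part of the original GROMACS sources, and
+ * never compiled): a minimal, self-contained version of the temperature
+ * replica exchange acceptance test performed by calc_delta() and its caller.
+ * The temperatures and potential energies are made-up example inputs, and
+ * kB approximates the BOLTZ constant used above. */
+static int remd_accept_example(void)
+{
+    const double kB     = 0.0083144621;           /* kJ/(mol K)                 */
+    const double T_a    = 300.0, T_b = 310.0;     /* neighboring temperatures   */
+    const double E_a    = -5.0e4, E_b = -4.9e4;   /* potential energies, kJ/mol */
+    const double beta_a = 1.0/(kB*T_a), beta_b = 1.0/(kB*T_b);
+    /* same expression as the ereTEMP branch above */
+    const double delta  = -(beta_b - beta_a)*(E_b - E_a);
+    const double prob   = (delta <= 0) ? 1.0 : exp(-delta);
+    /* the swap is accepted when a uniform random number in [0,1) is < prob */
+    return prob >= 1.0;
+}
+#endif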
+
+static void
+test_for_replica_exchange(FILE                 *fplog,
+                          const gmx_multisim_t *ms,
+                          struct gmx_repl_ex   *re,
+                          gmx_enerdata_t       *enerd,
+                          real                  vol,
+                          gmx_int64_t           step,
+                          real                  time)
+{
+    int                                  m, i, j, a, b, ap, bp, i0, i1, tmp;
+    real                                 delta = 0;
+    gmx_bool                             bPrint, bMultiEx;
+    gmx_bool                            *bEx      = re->bEx;
+    real                                *prob     = re->prob;
+    int                                 *pind     = re->destinations; /* permuted index */
+    gmx_bool                             bEpot    = FALSE;
+    gmx_bool                             bDLambda = FALSE;
+    gmx_bool                             bVol     = FALSE;
+    gmx::ThreeFry2x64<64>                rng(re->seed, gmx::RandomDomain::ReplicaExchange);
+    gmx::UniformRealDistribution<real>   uniformRealDist;
+    gmx::UniformIntDistribution<int>     uniformNreplDist(0, re->nrepl-1);
+
+    bMultiEx = (re->nex > 1);  /* multiple exchanges at each state */
+    fprintf(fplog, "Replica exchange at step %" GMX_PRId64 " time %.5f\n", step, time);
+
+    if (re->bNPT)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Vol[i] = 0;
+        }
+        bVol               = TRUE;
+        re->Vol[re->repl]  = vol;
+    }
+    if ((re->type == ereTEMP || re->type == ereTL))
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Epot[i] = 0;
+        }
+        bEpot              = TRUE;
+        re->Epot[re->repl] = enerd->term[F_EPOT];
+        /* temperatures of different states*/
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ);
+        }
+    }
+    else
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->temp*BOLTZ);  /* we have a single temperature */
+        }
+    }
+    if (re->type == ereLAMBDA || re->type == ereTL)
+    {
+        bDLambda = TRUE;
+        /* lambda differences. */
+        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
+           minus the energy of the jth simulation in the jth Hamiltonian */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->de[i][j] = 0;
+            }
+        }
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]);
+        }
+    }
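+
+    /* Editor's illustrative note (not part of the original GROMACS sources):
+     * with, say, three lambda states and replica 1 sitting at lambda state 1,
+     * the column filled in above is de[i][1] = H_i(x_1) - H_1(x_1) for
+     * i = 0, 1, 2, i.e. this replica's configuration re-evaluated in every
+     * Hamiltonian, relative to its own; the gmx_sum_sim() calls below then
+     * make the full matrix available on every simulation master. */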
+
+    /* now actually do the communication */
+    if (bVol)
+    {
+        gmx_sum_sim(re->nrepl, re->Vol, ms);
+    }
+    if (bEpot)
+    {
+        gmx_sum_sim(re->nrepl, re->Epot, ms);
+    }
+    if (bDLambda)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            gmx_sum_sim(re->nrepl, re->de[i], ms);
+        }
+    }
+
+    /* make a duplicate set of indices for shuffling */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        pind[i] = re->ind[i];
+    }
+
+    if (bMultiEx)
+    {
+        /* multiple random switch exchange */
+        int nself = 0;
+
+        rng.restart( step, 0 );
+
+        for (i = 0; i < re->nex + nself; i++)
+        {
+            /* randomly select a pair  */
+            /* in theory, could reduce this by identifying only which switches had a non-negligible
+               probability of occurring (log p > -100) and only operate on those switches */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work than useful. */
+            i0 = uniformNreplDist(rng);
+            i1 = uniformNreplDist(rng);
+            if (i0 == i1)
+            {
+                nself++;
+                continue;  /* self-exchange, back up and do it again */
+            }
+
+            a  = re->ind[i0]; /* what are the indices of these states? */
+            b  = re->ind[i1];
+            ap = pind[i0];
+            bp = pind[i1];
+
+            bPrint = FALSE; /* too noisy */
+            /* calculate the energy difference */
+            /* if the code changes to flip the STATES, rather than the configurations,
+               use the commented version of the code */
+            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+            delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
+
+            /* we actually only use the first space in the prob and bEx array,
+               since there are actually many switches between pairs. */
+
+            if (delta <= 0)
+            {
+                /* accepted */
+                prob[0] = 1;
+                bEx[0]  = TRUE;
+            }
+            else
+            {
+                if (delta > PROBABILITYCUTOFF)
+                {
+                    prob[0] = 0;
+                }
+                else
+                {
+                    prob[0] = exp(-delta);
+                }
+                /* roll a number to determine if accepted */
+                bEx[0] = uniformRealDist(rng) < prob[0];
+            }
+            re->prob_sum[0] += prob[0];
+
+            if (bEx[0])
+            {
+                /* swap the states */
+                tmp      = pind[i0];
+                pind[i0] = pind[i1];
+                pind[i1] = tmp;
+            }
+        }
+        re->nattempt[0]++;  /* keep track of total permutation trials here */
+        print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap);
+    }
+    else
+    {
+        /* standard nearest neighbor replica exchange */
+
+        m = (step / re->nst) % 2;
+        for (i = 1; i < re->nrepl; i++)
+        {
+            a = re->ind[i-1];
+            b = re->ind[i];
+
+            bPrint = (re->repl == a || re->repl == b);
+            if (i % 2 == m)
+            {
+                delta = calc_delta(fplog, bPrint, re, a, b, a, b);
+                if (delta <= 0)
+                {
+                    /* accepted */
+                    prob[i] = 1;
+                    bEx[i]  = TRUE;
+                }
+                else
+                {
+                    if (delta > PROBABILITYCUTOFF)
+                    {
+                        prob[i] = 0;
+                    }
+                    else
+                    {
+                        prob[i] = exp(-delta);
+                    }
+                    /* roll a number to determine if accepted */
+                    bEx[i] = uniformRealDist(rng) < prob[i];
+                }
+                re->prob_sum[i] += prob[i];
+
+                if (bEx[i])
+                {
+                    /* swap these two */
+                    tmp       = pind[i-1];
+                    pind[i-1] = pind[i];
+                    pind[i]   = tmp;
+                    re->nexchange[i]++;  /* statistics for back compatibility */
+                }
+            }
+            else
+            {
+                prob[i] = -1;
+                bEx[i]  = FALSE;
+            }
+        }
+        /* print some statistics */
+        print_ind(fplog, "ex", re->nrepl, re->ind, bEx);
+        print_prob(fplog, "pr", re->nrepl, prob);
+        fprintf(fplog, "\n");
+        re->nattempt[m]++;
+    }
+
+    /* record which moves were made and accepted */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->nmoves[re->ind[i]][pind[i]] += 1;
+        re->nmoves[pind[i]][re->ind[i]] += 1;
+    }
+    fflush(fplog); /* make sure we can see what the last exchange was */
+}
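+
+/* Editor's illustrative note (not part of the original GROMACS sources): in
+ * the standard nearest-neighbor branch above, m = (step/re->nst) % 2
+ * alternates between the two sets of neighboring pairs on successive exchange
+ * attempts.  With four replicas ordered 0-1-2-3, one attempt tries the pairs
+ * (0,1) and (2,3), the next tries only (1,2), and so on, so every neighboring
+ * pair is attempted every other exchange interval. */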
+
+static void
+cyclic_decomposition(const int *destinations,
+                     int      **cyclic,
+                     gmx_bool  *incycle,
+                     const int  nrepl,
+                     int       *nswap)
+{
+
+    int i, j, c, p;
+    int maxlen = 1;
+    for (i = 0; i < nrepl; i++)
+    {
+        incycle[i] = FALSE;
+    }
+    for (i = 0; i < nrepl; i++)  /* one cycle for each replica */
+    {
+        if (incycle[i])
+        {
+            cyclic[i][0] = -1;
+            continue;
+        }
+        cyclic[i][0] = i;
+        incycle[i]   = TRUE;
+        c            = 1;
+        p            = i;
+        for (j = 0; j < nrepl; j++) /* potentially all replicas are part of this cycle, but we will break out first */
+        {
+            p = destinations[p];    /* start permuting */
+            if (p == i)
+            {
+                cyclic[i][c] = -1;
+                if (c > maxlen)
+                {
+                    maxlen = c;
+                }
+                break; /* we've reached the original element, the cycle is complete, and we marked the end. */
+            }
+            else
+            {
+                cyclic[i][c] = p;  /* each permutation gives a new member of the cycle */
+                incycle[p]   = TRUE;
+                c++;
+            }
+        }
+    }
+    *nswap = maxlen - 1;
+
+    if (debug)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Cycle %d:", i);
+            for (j = 0; j < nrepl; j++)
+            {
+                if (cyclic[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", cyclic[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
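+
+/* Editor's illustrative note (not part of the original GROMACS sources): for
+ * destinations = {1, 0, 3, 2} with nrepl = 4, the routine above finds the two
+ * 2-cycles (0 1) and (2 3), i.e. cyclic[0] = {0, 1, -1, ...} and
+ * cyclic[2] = {2, 3, -1, ...}, while the rows for replicas 1 and 3 only get a
+ * -1 terminator because they are already part of a cycle.  The longest cycle
+ * has length 2, so *nswap = 1 and a single round of pairwise swaps suffices. */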
+
+static void
+compute_exchange_order(int     **cyclic,
+                       int     **order,
+                       const int nrepl,
+                       const int maxswap)
+{
+    int i, j;
+
+    for (j = 0; j < maxswap; j++)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            if (cyclic[i][j+1] >= 0)
+            {
+                order[cyclic[i][j+1]][j] = cyclic[i][j];
+                order[cyclic[i][j]][j]   = cyclic[i][j+1];
+            }
+        }
+        for (i = 0; i < nrepl; i++)
+        {
+            if (order[i][j] < 0)
+            {
+                order[i][j] = i; /* if it's not exchanging, it should stay this round */
+            }
+        }
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "Replica Exchange Order\n");
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Replica %d:", i);
+            for (j = 0; j < maxswap; j++)
+            {
+                if (order[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", order[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
+
+static void
+prepare_to_do_exchange(struct gmx_repl_ex *re,
+                       const int           replica_id,
+                       int                *maxswap,
+                       gmx_bool           *bThisReplicaExchanged)
+{
+    int i, j;
+    /* Hold the cyclic decomposition of the (multiple) replica
+     * exchange. */
+    gmx_bool bAnyReplicaExchanged = FALSE;
+    *bThisReplicaExchanged = FALSE;
+
+    for (i = 0; i < re->nrepl; i++)
+    {
+        if (re->destinations[i] != re->ind[i])
+        {
+            /* only mark as exchanged if the index has been shuffled */
+            bAnyReplicaExchanged = TRUE;
+            break;
+        }
+    }
+    if (bAnyReplicaExchanged)
+    {
+        /* reinitialize the placeholder arrays */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->cyclic[i][j] = -1;
+                re->order[i][j]  = -1;
+            }
+        }
+
+        /* Identify the cyclic decomposition of the permutation (very
+         * fast if neighbor replica exchange). */
+        cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap);
+
+        /* Now translate the decomposition into a replica exchange
+         * order at each step. */
+        compute_exchange_order(re->cyclic, re->order, re->nrepl, *maxswap);
+
+        /* Did this replica do any exchange at any point? */
+        for (j = 0; j < *maxswap; j++)
+        {
+            if (replica_id != re->order[replica_id][j])
+            {
+                *bThisReplicaExchanged = TRUE;
+                break;
+            }
+        }
+    }
+}
+
+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re,
+                          t_state *state, gmx_enerdata_t *enerd,
+                          t_state *state_local, gmx_int64_t step, real time)
+{
+    int j;
+    int replica_id = 0;
+    int exchange_partner;
+    int maxswap = 0;
+    /* Number of rounds of exchanges needed to deal with any multiple
+     * exchanges. */
+    /* Where each replica ends up after the exchange attempt(s). */
+    /* The order in which multiple exchanges will occur. */
+    gmx_bool bThisReplicaExchanged = FALSE;
+
+    if (MASTER(cr))
+    {
+        replica_id  = re->repl;
+        test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time);
+        prepare_to_do_exchange(re, replica_id, &maxswap, &bThisReplicaExchanged);
+    }
+    /* Do intra-simulation broadcast so all processors belonging to
+     * each simulation know whether they need to participate in
+     * collecting the state. Otherwise, they might as well get on with
+     * the next thing to do. */
+    if (DOMAINDECOMP(cr))
+    {
+#if GMX_MPI
+        MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr),
+                  cr->mpi_comm_mygroup);
+#endif
+    }
+
+    if (bThisReplicaExchanged)
+    {
+        /* Exchange the states */
+        /* Collect the global state on the master node */
+        if (DOMAINDECOMP(cr))
+        {
+            dd_collect_state(cr->dd, state_local, state);
+        }
+        else
+        {
+            copy_state_nonatomdata(state_local, state);
+        }
+
+        if (MASTER(cr))
+        {
+            /* There will be only one swap cycle with standard replica
+             * exchange, but there may be multiple swap cycles if we
+             * allow multiple swaps. */
+
+            for (j = 0; j < maxswap; j++)
+            {
+                exchange_partner = re->order[replica_id][j];
+
+                if (exchange_partner != replica_id)
+                {
+                    /* Exchange the global states between the master nodes */
+                    if (debug)
+                    {
+                        fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
+                    }
+                    exchange_state(cr->ms, exchange_partner, state);
+                }
+            }
+            /* For temperature-type replica exchange, we need to scale
+             * the velocities. */
+            if (re->type == ereTEMP || re->type == ereTL)
+            {
+                scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]]));
+            }
+
+        }
+
+        /* With domain decomposition the global state is distributed later */
+        if (!DOMAINDECOMP(cr))
+        {
+            /* Copy the global state to the local state data structure */
+            copy_state_nonatomdata(state, state_local);
+        }
+    }
+
+    return bThisReplicaExchanged;
+}
+
+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re)
+{
+    int  i;
+
+    fprintf(fplog, "\nReplica exchange statistics\n");
+
+    if (re->nex == 0)
+    {
+        fprintf(fplog, "Repl  %d attempts, %d odd, %d even\n",
+                re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]);
+
+        fprintf(fplog, "Repl  average probabilities:\n");
+        for (i = 1; i < re->nrepl; i++)
+        {
+            if (re->nattempt[i%2] == 0)
+            {
+                re->prob[i] = 0;
+            }
+            else
+            {
+                re->prob[i] =  re->prob_sum[i]/re->nattempt[i%2];
+            }
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        fprintf(fplog, "Repl  number of exchanges:\n");
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_count(fplog, "", re->nrepl, re->nexchange);
+
+        fprintf(fplog, "Repl  average number of exchanges:\n");
+        for (i = 1; i < re->nrepl; i++)
+        {
+            if (re->nattempt[i%2] == 0)
+            {
+                re->prob[i] = 0;
+            }
+            else
+            {
+                re->prob[i] =  ((real)re->nexchange[i])/re->nattempt[i%2];
+            }
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        fprintf(fplog, "\n");
+    }
+    /* print the transition matrix */
+    print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt);
+}
diff --git a/patches/gromacs-2016-beta1.diff/src/programs/mdrun/runner.cpp b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/runner.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d642a42e20b80cc6b6432ed38f566d72b16012ca
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/runner.cpp
@@ -0,0 +1,1398 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the MD runner routine calling all integrators.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "runner.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/fileio/checkpoint.h"
+#include "gromacs/fileio/oenv.h"
+#include "gromacs/fileio/tpxio.h"
+#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/listed-forces/disre.h"
+#include "gromacs/listed-forces/orires.h"
+#include "gromacs/math/calculate-ewald-splitting-coefficient.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/integrator.h"
+#include "gromacs/mdlib/main.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/minimize.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/qmmm.h"
+#include "gromacs/mdlib/sighandler.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/tpi.h"
+#include "gromacs/mdrunutility/threadaffinity.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "deform.h"
+#include "md.h"
+#include "repl_ex.h"
+#include "resource-division.h"
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+/* END PLUMED */
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+//! First step used in pressure scaling
+gmx_int64_t         deform_init_init_step_tpx;
+//! Initial box for pressure scaling
+matrix              deform_init_box_tpx;
+//! MPI variable for use in pressure scaling
+tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+
+#if GMX_THREAD_MPI
+/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
+ * the number of threads will get lowered.
+ */
+#define MIN_ATOMS_PER_MPI_THREAD    90
+#define MIN_ATOMS_PER_GPU           900
+
+struct mdrunner_arglist
+{
+    gmx_hw_opt_t            hw_opt;
+    FILE                   *fplog;
+    t_commrec              *cr;
+    int                     nfile;
+    const t_filenm         *fnm;
+    const gmx_output_env_t *oenv;
+    gmx_bool                bVerbose;
+    int                     nstglobalcomm;
+    ivec                    ddxyz;
+    int                     dd_rank_order;
+    int                     npme;
+    real                    rdd;
+    real                    rconstr;
+    const char             *dddlb_opt;
+    real                    dlb_scale;
+    const char             *ddcsx;
+    const char             *ddcsy;
+    const char             *ddcsz;
+    const char             *nbpu_opt;
+    int                     nstlist_cmdline;
+    gmx_int64_t             nsteps_cmdline;
+    int                     nstepout;
+    int                     resetstep;
+    int                     nmultisim;
+    int                     repl_ex_nst;
+    int                     repl_ex_nex;
+    int                     repl_ex_seed;
+    real                    pforce;
+    real                    cpt_period;
+    real                    max_hours;
+    int                     imdport;
+    unsigned long           Flags;
+};
+
+
+/* The function used for spawning threads. Extracts the mdrunner()
+   arguments from its one argument and calls mdrunner(), after making
+   a commrec. */
+static void mdrunner_start_fn(void *arg)
+{
+    try
+    {
+        struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
+        struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
+                                                that it's thread-local. This doesn't
+                                                copy pointed-to items, of course,
+                                                but those are all const. */
+        t_commrec *cr;                       /* we need a local version of this */
+        FILE      *fplog = NULL;
+        t_filenm  *fnm;
+
+        fnm = dup_tfn(mc.nfile, mc.fnm);
+
+        cr = reinitialize_commrec_for_this_thread(mc.cr);
+
+        if (MASTER(cr))
+        {
+            fplog = mc.fplog;
+        }
+
+        gmx::mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+                      mc.bVerbose, mc.nstglobalcomm,
+                      mc.ddxyz, mc.dd_rank_order, mc.npme, mc.rdd,
+                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
+                      mc.ddcsx, mc.ddcsy, mc.ddcsz,
+                      mc.nbpu_opt, mc.nstlist_cmdline,
+                      mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
+                      mc.cpt_period, mc.max_hours, mc.imdport, mc.Flags);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+}
+
+
+/* called by mdrunner() to start a specific number of threads (including
+   the main thread) for thread-parallel runs. This in turn calls mdrunner()
+   for each thread.
+   All options besides nthreads are the same as for mdrunner(). */
+static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+                                         FILE *fplog, t_commrec *cr, int nfile,
+                                         const t_filenm fnm[], const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                                         int nstglobalcomm,
+                                         ivec ddxyz, int dd_rank_order, int npme,
+                                         real rdd, real rconstr,
+                                         const char *dddlb_opt, real dlb_scale,
+                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
+                                         const char *nbpu_opt, int nstlist_cmdline,
+                                         gmx_int64_t nsteps_cmdline,
+                                         int nstepout, int resetstep,
+                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                                         real pforce, real cpt_period, real max_hours,
+                                         unsigned long Flags)
+{
+    int                      ret;
+    struct mdrunner_arglist *mda;
+    t_commrec               *crn; /* the new commrec */
+    t_filenm                *fnmn;
+
+    /* first check whether we even need to start tMPI */
+    if (hw_opt->nthreads_tmpi < 2)
+    {
+        return cr;
+    }
+
+    /* a few small, one-time, almost unavoidable memory leaks: */
+    snew(mda, 1);
+    fnmn = dup_tfn(nfile, fnm);
+
+    /* fill the data structure to pass as void pointer to thread start fn */
+    /* hw_opt contains pointers, which should all be NULL at this stage */
+    mda->hw_opt          = *hw_opt;
+    mda->fplog           = fplog;
+    mda->cr              = cr;
+    mda->nfile           = nfile;
+    mda->fnm             = fnmn;
+    mda->oenv            = oenv;
+    mda->bVerbose        = bVerbose;
+    mda->nstglobalcomm   = nstglobalcomm;
+    mda->ddxyz[XX]       = ddxyz[XX];
+    mda->ddxyz[YY]       = ddxyz[YY];
+    mda->ddxyz[ZZ]       = ddxyz[ZZ];
+    mda->dd_rank_order   = dd_rank_order;
+    mda->npme            = npme;
+    mda->rdd             = rdd;
+    mda->rconstr         = rconstr;
+    mda->dddlb_opt       = dddlb_opt;
+    mda->dlb_scale       = dlb_scale;
+    mda->ddcsx           = ddcsx;
+    mda->ddcsy           = ddcsy;
+    mda->ddcsz           = ddcsz;
+    mda->nbpu_opt        = nbpu_opt;
+    mda->nstlist_cmdline = nstlist_cmdline;
+    mda->nsteps_cmdline  = nsteps_cmdline;
+    mda->nstepout        = nstepout;
+    mda->resetstep       = resetstep;
+    mda->nmultisim       = nmultisim;
+    mda->repl_ex_nst     = repl_ex_nst;
+    mda->repl_ex_nex     = repl_ex_nex;
+    mda->repl_ex_seed    = repl_ex_seed;
+    mda->pforce          = pforce;
+    mda->cpt_period      = cpt_period;
+    mda->max_hours       = max_hours;
+    mda->Flags           = Flags;
+
+    /* now spawn new threads that start mdrunner_start_fn(), while
+       the main thread returns, we set thread affinity later */
+    ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
+                       mdrunner_start_fn, (void*)(mda) );
+    if (ret != TMPI_SUCCESS)
+    {
+        return NULL;
+    }
+
+    crn = reinitialize_commrec_for_this_thread(cr);
+    return crn;
+}
+
+#endif /* GMX_THREAD_MPI */
+
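+/* Flow of the thread-MPI setup above (summary, no new behaviour): gmx::mdrunner()
+ * on the master thread calls mdrunner_start_threads(), which copies its arguments
+ * into an mdrunner_arglist, spawns the extra ranks via tMPI_Init_fn() and returns a
+ * fresh t_commrec for the master; every spawned thread then re-enters gmx::mdrunner()
+ * through mdrunner_start_fn() with its own commrec. */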
+
+/*! \brief Cost of non-bonded kernels
+ *
+ * We determine the extra cost of the non-bonded kernels compared to
+ * a reference nstlist value of 10 (which is the default in grompp).
+ */
+static const int    nbnxnReferenceNstlist = 10;
+//! The values to try when switching
+const int           nstlist_try[] = { 20, 25, 40 };
+//! Number of elements in the neighborsearch list trials.
+#define NNSTL  sizeof(nstlist_try)/sizeof(nstlist_try[0])
+/* Increase nstlist until the non-bonded cost increases more than listfac_ok,
+ * but never more than listfac_max.
+ * A standard (protein+)water system at 300K with PME ewald_rtol=1e-5
+ * needs 1.28 at rcoulomb=0.9 and 1.24 at rcoulomb=1.0 to get to nstlist=40.
+ * Note that both CPU and GPU factors are conservative. Performance should
+ * not go down due to this tuning, except with a relatively slow GPU.
+ * On the other hand, at medium/high parallelization or with fast GPUs
+ * nstlist will not be increased enough to reach optimal performance.
+ */
+/* CPU: pair-search is about a factor 1.5 slower than the non-bonded kernel */
+//! Max OK performance ratio between force calc and neighbor searching
+static const float  nbnxn_cpu_listfac_ok    = 1.05;
+//! Too high performance ratio between force calc and neighbor searching
+static const float  nbnxn_cpu_listfac_max   = 1.09;
+/* CPU: pair-search is about a factor 2-3 slower than the non-bonded kernel */
+//! Max OK performance ratio between force calc and neighbor searching
+static const float  nbnxn_knl_listfac_ok    = 1.22;
+//! Too high performance ratio between force calc and neighbor searching
+static const float  nbnxn_knl_listfac_max   = 1.3;
+/* GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel */
+//! Max OK performance ratio between force calc and neighbor searching
+static const float  nbnxn_gpu_listfac_ok    = 1.20;
+//! Too high performance ratio between force calc and neighbor searching
+static const float  nbnxn_gpu_listfac_max   = 1.30;
+
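+/* Worked example of how the factors above are used in increase_nstlist() below
+ * (numbers are illustrative only): the pair-list cost grows roughly with the list
+ * volume, i.e. with the cube of the effective list radius, so a tolerated cost
+ * factor listfac translates into a radius growth of cbrt(listfac).  With the GPU
+ * values and an effective radius of 1.0 nm at the reference nstlist=10 this gives
+ *   rlist_ok  ~ 1.0 * cbrt(1.20) ~ 1.063 nm
+ *   rlist_max ~ 1.0 * cbrt(1.30) ~ 1.091 nm
+ * which mirrors the rlist_ok/rlist_max construction applied below.
+ */
+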
+/*! \brief Try to increase nstlist when using the Verlet cut-off scheme */
+static void increase_nstlist(FILE *fp, t_commrec *cr,
+                             t_inputrec *ir, int nstlist_cmdline,
+                             const gmx_mtop_t *mtop, matrix box,
+                             gmx_bool bGPU, const gmx::CpuInfo &cpuinfo)
+{
+    float                  listfac_ok, listfac_max;
+    int                    nstlist_orig, nstlist_prev;
+    verletbuf_list_setup_t ls;
+    real                   rlistWithReferenceNstlist, rlist_inc, rlist_ok, rlist_max;
+    real                   rlist_new, rlist_prev;
+    size_t                 nstlist_ind = 0;
+    t_state                state_tmp;
+    gmx_bool               bBox, bDD, bCont;
+    const char            *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+    const char            *nve_err  = "Can not increase nstlist because an NVE ensemble is used";
+    const char            *vbd_err  = "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
+    const char            *box_err  = "Can not increase nstlist because the box is too small";
+    const char            *dd_err   = "Can not increase nstlist because of domain decomposition limitations";
+    char                   buf[STRLEN];
+
+    if (nstlist_cmdline <= 0)
+    {
+        if (ir->nstlist == 1)
+        {
+            /* The user probably set nstlist=1 for a reason,
+             * don't mess with the settings.
+             */
+            return;
+        }
+
+        if (fp != NULL && bGPU && ir->nstlist < nstlist_try[0])
+        {
+            fprintf(fp, nstl_gpu, ir->nstlist);
+        }
+        nstlist_ind = 0;
+        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
+        {
+            nstlist_ind++;
+        }
+        if (nstlist_ind == NNSTL)
+        {
+            /* There is no larger nstlist value to try */
+            return;
+        }
+    }
+
+    if (EI_MD(ir->eI) && ir->etc == etcNO)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", nve_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", nve_err);
+        }
+
+        return;
+    }
+
+    if (ir->verletbuf_tol == 0 && bGPU)
+    {
+        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+    }
+
+    if (ir->verletbuf_tol < 0)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", vbd_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", vbd_err);
+        }
+
+        return;
+    }
+
+    if (bGPU)
+    {
+        listfac_ok  = nbnxn_gpu_listfac_ok;
+        listfac_max = nbnxn_gpu_listfac_max;
+    }
+    else if (cpuinfo.feature(gmx::CpuInfo::Feature::X86_Avx512ER))
+    {
+        listfac_ok  = nbnxn_knl_listfac_ok;
+        listfac_max = nbnxn_knl_listfac_max;
+    }
+    else
+    {
+        listfac_ok  = nbnxn_cpu_listfac_ok;
+        listfac_max = nbnxn_cpu_listfac_max;
+    }
+
+    nstlist_orig = ir->nstlist;
+    if (nstlist_cmdline > 0)
+    {
+        if (fp)
+        {
+            sprintf(buf, "Getting nstlist=%d from command line option",
+                    nstlist_cmdline);
+        }
+        ir->nstlist = nstlist_cmdline;
+    }
+
+    verletbuf_get_list_setup(TRUE, bGPU, &ls);
+
+    /* Allow rlist to make the list a given factor larger than the list
+     * would be with the reference value for nstlist (10).
+     */
+    nstlist_prev = ir->nstlist;
+    ir->nstlist  = nbnxnReferenceNstlist;
+    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL,
+                            &rlistWithReferenceNstlist);
+    ir->nstlist  = nstlist_prev;
+
+    /* Determine the pair list size increase due to zero interactions */
+    rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
+                                              mtop->natoms/det(box));
+    rlist_ok  = (rlistWithReferenceNstlist + rlist_inc)*std::cbrt(listfac_ok) - rlist_inc;
+    rlist_max = (rlistWithReferenceNstlist + rlist_inc)*std::cbrt(listfac_max) - rlist_inc;
+    if (debug)
+    {
+        fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
+                rlist_inc, rlist_ok, rlist_max);
+    }
+
+    nstlist_prev = nstlist_orig;
+    rlist_prev   = ir->rlist;
+    do
+    {
+        if (nstlist_cmdline <= 0)
+        {
+            ir->nstlist = nstlist_try[nstlist_ind];
+        }
+
+        /* Set the pair-list buffer size in ir */
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        /* Does rlist fit in the box? */
+        bBox = (gmx::square(rlist_new) < max_cutoff2(ir->ePBC, box));
+        bDD  = TRUE;
+        if (bBox && DOMAINDECOMP(cr))
+        {
+            /* Check if rlist fits in the domain decomposition */
+            if (inputrec2nboundeddim(ir) < DIM)
+            {
+                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+            }
+            copy_mat(box, state_tmp.box);
+            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
+        }
+
+        if (debug)
+        {
+            fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
+                    ir->nstlist, rlist_new, bBox, bDD);
+        }
+
+        bCont = FALSE;
+
+        if (nstlist_cmdline <= 0)
+        {
+            if (bBox && bDD && rlist_new <= rlist_max)
+            {
+                /* Increase nstlist */
+                nstlist_prev = ir->nstlist;
+                rlist_prev   = rlist_new;
+                bCont        = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
+            }
+            else
+            {
+                /* Stick with the previous nstlist */
+                ir->nstlist = nstlist_prev;
+                rlist_new   = rlist_prev;
+                bBox        = TRUE;
+                bDD         = TRUE;
+            }
+        }
+
+        nstlist_ind++;
+    }
+    while (bCont);
+
+    if (!bBox || !bDD)
+    {
+        gmx_warning(!bBox ? box_err : dd_err);
+        if (fp != NULL)
+        {
+            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
+        }
+        ir->nstlist = nstlist_orig;
+    }
+    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+    {
+        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
+                nstlist_orig, ir->nstlist,
+                ir->rlist, rlist_new);
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n\n", buf);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n\n", buf);
+        }
+        ir->rlist     = rlist_new;
+    }
+}
+
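+/* Summary of the behaviour above (no new logic): without -nstlist on the command
+ * line the values 20, 25 and 40 are tried in turn, stopping when the resulting
+ * buffer exceeds rlist_ok or no longer fits the box/domain decomposition; with
+ * e.g. `mdrun -nstlist 25` the requested value is used directly and only checked
+ * against those same box and domain-decomposition limits. */
+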
+/*! \brief Initialize variables for Verlet scheme simulation */
+static void prepare_verlet_scheme(FILE                           *fplog,
+                                  t_commrec                      *cr,
+                                  t_inputrec                     *ir,
+                                  int                             nstlist_cmdline,
+                                  const gmx_mtop_t               *mtop,
+                                  matrix                          box,
+                                  gmx_bool                        bUseGPU,
+                                  const gmx::CpuInfo             &cpuinfo)
+{
+    /* For NVE simulations, we will retain the initial list buffer */
+    if (EI_DYNAMICS(ir->eI) &&
+        ir->verletbuf_tol > 0 &&
+        !(EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        /* Update the Verlet buffer size for the current run setup */
+        verletbuf_list_setup_t ls;
+        real                   rlist_new;
+
+        /* Here we assume SIMD-enabled kernels are being used. But as currently
+         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+         * and 4x2 gives a larger buffer than 4x4, this is ok.
+         */
+        verletbuf_get_list_setup(TRUE, bUseGPU, &ls);
+
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        if (rlist_new != ir->rlist)
+        {
+            if (fplog != NULL)
+            {
+                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+                        ir->rlist, rlist_new,
+                        ls.cluster_size_i, ls.cluster_size_j);
+            }
+            ir->rlist     = rlist_new;
+        }
+    }
+
+    if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
+    {
+        gmx_fatal(FARGS, "Can not set nstlist without %s",
+                  !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
+    }
+
+    if (EI_DYNAMICS(ir->eI))
+    {
+        /* Set or try nstlist values */
+        increase_nstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, bUseGPU, cpuinfo);
+    }
+}
+
+/*! \brief Override the nsteps value in inputrec
+ *
+ * with value passed on the command line (if any)
+ */
+static void override_nsteps_cmdline(FILE            *fplog,
+                                    gmx_int64_t      nsteps_cmdline,
+                                    t_inputrec      *ir,
+                                    const t_commrec *cr)
+{
+    assert(ir);
+    assert(cr);
+
+    /* override with anything else than the default -2 */
+    if (nsteps_cmdline > -2)
+    {
+        char sbuf_steps[STEPSTRSIZE];
+        char sbuf_msg[STRLEN];
+
+        ir->nsteps = nsteps_cmdline;
+        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps),
+                    fabs(nsteps_cmdline*ir->delta_t));
+        }
+        else
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps));
+        }
+
+        md_print_warn(cr, fplog, "%s\n", sbuf_msg);
+    }
+    else if (nsteps_cmdline < -2)
+    {
+        gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %d",
+                  nsteps_cmdline);
+    }
+    /* Do nothing if nsteps_cmdline == -2 */
+}
+
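+/* Example of the override semantics above: `mdrun -nsteps 500000` replaces the
+ * nsteps value from the .tpr and, for dynamical integrators, logs the corresponding
+ * simulation time; the default of -2 leaves the .tpr value untouched, and anything
+ * below -2 aborts with a fatal error. */
+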
+namespace gmx
+{
+
+//! \brief Return the correct integrator function.
+static integrator_t *my_integrator(unsigned int ei)
+{
+    switch (ei)
+    {
+        case eiMD:
+        case eiBD:
+        case eiSD1:
+        case eiVV:
+        case eiVVAK:
+            if (!EI_DYNAMICS(ei))
+            {
+                GMX_THROW(APIError("do_md integrator would be called for a non-dynamical integrator"));
+            }
+            return do_md;
+        case eiSteep:
+            return do_steep;
+        case eiCG:
+            return do_cg;
+        case eiNM:
+            return do_nm;
+        case eiLBFGS:
+            return do_lbfgs;
+        case eiTPI:
+        case eiTPIC:
+            if (!EI_TPI(ei))
+            {
+                GMX_THROW(APIError("do_tpi integrator would be called for a non-TPI integrator"));
+            }
+            return do_tpi;
+        case eiSD2_REMOVED:
+            GMX_THROW(NotImplementedError("SD2 integrator has been removed"));
+        default:
+            GMX_THROW(APIError("Non existing integrator selected"));
+    }
+}
+
+int mdrunner(gmx_hw_opt_t *hw_opt,
+             FILE *fplog, t_commrec *cr, int nfile,
+             const t_filenm fnm[], const gmx_output_env_t *oenv, gmx_bool bVerbose,
+             int nstglobalcomm,
+             ivec ddxyz, int dd_rank_order, int npme, real rdd, real rconstr,
+             const char *dddlb_opt, real dlb_scale,
+             const char *ddcsx, const char *ddcsy, const char *ddcsz,
+             const char *nbpu_opt, int nstlist_cmdline,
+             gmx_int64_t nsteps_cmdline, int nstepout, int resetstep,
+             int gmx_unused nmultisim, int repl_ex_nst, int repl_ex_nex,
+             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
+             int imdport, unsigned long Flags)
+{
+    gmx_bool                  bForceUseGPU, bTryUseGPU, bRerunMD;
+    t_inputrec               *inputrec;
+    t_state                  *state = NULL;
+    matrix                    box;
+    gmx_ddbox_t               ddbox = {0};
+    int                       npme_major, npme_minor;
+    t_nrnb                   *nrnb;
+    gmx_mtop_t               *mtop          = NULL;
+    t_mdatoms                *mdatoms       = NULL;
+    t_forcerec               *fr            = NULL;
+    t_fcdata                 *fcd           = NULL;
+    real                      ewaldcoeff_q  = 0;
+    real                      ewaldcoeff_lj = 0;
+    struct gmx_pme_t        **pmedata       = NULL;
+    gmx_vsite_t              *vsite         = NULL;
+    gmx_constr_t              constr;
+    int                       nChargePerturbed = -1, nTypePerturbed = 0, status;
+    gmx_wallcycle_t           wcycle;
+    gmx_walltime_accounting_t walltime_accounting = NULL;
+    int                       rc;
+    gmx_int64_t               reset_counters;
+    gmx_edsam_t               ed           = NULL;
+    int                       nthreads_pme = 1;
+    gmx_hw_info_t            *hwinfo       = NULL;
+    /* The master rank decides early on bUseGPU and broadcasts this later */
+    gmx_bool                  bUseGPU            = FALSE;
+
+    /* CAUTION: threads may be started later on in this function, so
+       cr doesn't reflect the final parallel state right now */
+    snew(inputrec, 1);
+    snew(mtop, 1);
+
+    if (Flags & MD_APPENDFILES)
+    {
+        fplog = NULL;
+    }
+
+    bRerunMD     = (Flags & MD_RERUN);
+    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
+    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+
+    /* Detect hardware, gather information. This is an operation that is
+     * global for this process (MPI rank). */
+    hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);
+
+    gmx_print_detected_hardware(fplog, cr, hwinfo);
+
+    if (fplog != NULL)
+    {
+        /* Print references after all software/hardware printing */
+        please_cite(fplog, "Abraham2015");
+        please_cite(fplog, "Pall2015");
+        please_cite(fplog, "Pronk2013");
+        please_cite(fplog, "Hess2008b");
+        please_cite(fplog, "Spoel2005a");
+        please_cite(fplog, "Lindahl2001a");
+        please_cite(fplog, "Berendsen95a");
+    }
+
+    snew(state, 1);
+    if (SIMMASTER(cr))
+    {
+        /* Read (nearly) all data required for the simulation */
+        read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, state, mtop);
+
+        if (inputrec->cutoff_scheme == ecutsVERLET)
+        {
+            /* Here the master rank decides if all ranks will use GPUs */
+            bUseGPU = (hwinfo->gpu_info.n_dev_compatible > 0 ||
+                       getenv("GMX_EMULATE_GPU") != NULL);
+
+            /* TODO add GPU kernels for this and replace this check by:
+             * (bUseGPU && (ir->vdwtype == evdwPME &&
+             *               ir->ljpme_combination_rule == eljpmeLB))
+             * update the message text and the content of nbnxn_acceleration_supported.
+             */
+            if (bUseGPU &&
+                !nbnxn_gpu_acceleration_supported(fplog, cr, inputrec, bRerunMD))
+            {
+                /* Fallback message printed by nbnxn_acceleration_supported */
+                if (bForceUseGPU)
+                {
+                    gmx_fatal(FARGS, "GPU acceleration requested, but not supported with the given input settings");
+                }
+                bUseGPU = FALSE;
+            }
+
+            prepare_verlet_scheme(fplog, cr,
+                                  inputrec, nstlist_cmdline, mtop, state->box,
+                                  bUseGPU, *hwinfo->cpuInfo);
+        }
+        else
+        {
+            if (nstlist_cmdline > 0)
+            {
+                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
+            }
+
+            if (hwinfo->gpu_info.n_dev_compatible > 0)
+            {
+                md_print_warn(cr, fplog,
+                              "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+                              "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n");
+            }
+
+            if (bForceUseGPU)
+            {
+                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
+            }
+
+#if GMX_TARGET_BGQ
+            md_print_warn(cr, fplog,
+                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
+                          "      BlueGene/Q. You will observe better performance from using the\n"
+                          "      Verlet cut-off scheme.\n");
+#endif
+        }
+    }
+
+    /* Check and update the hardware options for internal consistency */
+    check_and_update_hw_opt_1(hw_opt, cr, npme);
+
+    /* Early check for externally set process affinity. */
+    gmx_check_thread_affinity_set(fplog, cr,
+                                  hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+
+#if GMX_THREAD_MPI
+    if (SIMMASTER(cr))
+    {
+        if (npme > 0 && hw_opt->nthreads_tmpi <= 0)
+        {
+            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
+        }
+
+        /* Since the master knows the cut-off scheme, update hw_opt for this.
+         * This is done later for normal MPI and also once more with tMPI
+         * for all tMPI ranks.
+         */
+        check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+        /* NOW the threads will be started: */
+        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+                                                 hw_opt,
+                                                 inputrec, mtop,
+                                                 cr, fplog, bUseGPU);
+
+        if (hw_opt->nthreads_tmpi > 1)
+        {
+            t_commrec *cr_old       = cr;
+            /* now start the threads. */
+            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
+                                        oenv, bVerbose, nstglobalcomm,
+                                        ddxyz, dd_rank_order, npme, rdd, rconstr,
+                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+                                        nbpu_opt, nstlist_cmdline,
+                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
+                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+                                        cpt_period, max_hours,
+                                        Flags);
+            /* the main thread continues here with a new cr. We don't deallocate
+               the old cr because other threads may still be reading it. */
+            if (cr == NULL)
+            {
+                gmx_comm("Failed to spawn threads");
+            }
+        }
+    }
+#endif
+    /* END OF CAUTION: cr is now reliable */
+
+    if (PAR(cr))
+    {
+        /* now broadcast everything to the non-master nodes/threads: */
+        init_parallel(cr, inputrec, mtop);
+
+        /* The master rank decided on the use of GPUs,
+         * broadcast this information to all ranks.
+         */
+        gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
+    }
+
+    if (fplog != NULL)
+    {
+        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+        fprintf(fplog, "\n");
+    }
+
+    /* now make sure the state is initialized and propagated */
+    set_state_entries(state, inputrec);
+
+    /* A parallel command line option consistency check that we can
+       only do after any threads have started. */
+    if (!PAR(cr) &&
+        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || npme > 0))
+    {
+        gmx_fatal(FARGS,
+                  "The -dd or -npme option request a parallel simulation, "
+#if !GMX_MPI
+                  "but %s was compiled without threads or MPI enabled"
+#else
+#if GMX_THREAD_MPI
+                  "but the number of MPI-threads (option -ntmpi) is not set or is 1"
+#else
+                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec"
+#endif
+#endif
+                  , output_env_get_program_display_name(oenv)
+                  );
+    }
+
+    if (bRerunMD &&
+        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+    {
+        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+    }
+
+    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
+    }
+
+    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
+    {
+        if (npme > 0)
+        {
+            gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
+                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
+        }
+
+        npme = 0;
+    }
+
+    if (bUseGPU && npme < 0)
+    {
+        /* With GPUs we don't automatically use PME-only ranks. PME ranks can
+         * improve performance with many threads per GPU, since our OpenMP
+         * scaling is bad, but it's difficult to automate the setup.
+         */
+        npme = 0;
+    }
+
+#ifdef GMX_FAHCORE
+    if (MASTER(cr))
+    {
+        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+    }
+#endif
+
+    /* NMR restraints must be initialized before load_checkpoint,
+     * since with time averaging the history is added to t_state.
+     * For proper consistency check we therefore need to extend
+     * t_state here.
+     * So the PME-only nodes (if present) will also initialize
+     * the distance restraints.
+     */
+    snew(fcd, 1);
+
+    /* This needs to be called before read_checkpoint to extend the state */
+    init_disres(fplog, mtop, inputrec, cr, fcd, state, repl_ex_nst > 0);
+
+    init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires),
+                state);
+
+    if (inputrecDeform(inputrec))
+    {
+        /* Store the deform reference box before reading the checkpoint */
+        if (SIMMASTER(cr))
+        {
+            copy_mat(state->box, box);
+        }
+        if (PAR(cr))
+        {
+            gmx_bcast(sizeof(box), box, cr);
+        }
+        /* Because we do not have the update struct available yet
+         * in which the reference values should be stored,
+         * we store them temporarily in static variables.
+         * This should be thread safe, since they are only written once
+         * and with identical values.
+         */
+        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+        deform_init_init_step_tpx = inputrec->init_step;
+        copy_mat(box, deform_init_box_tpx);
+        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+    }
+
+    if (Flags & MD_STARTFROMCPT)
+    {
+        /* Check if checkpoint file exists before doing continuation.
+         * This way we can use identical input options for the first and subsequent runs...
+         */
+        gmx_bool bReadEkin;
+
+        load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+                        cr, ddxyz, &npme,
+                        inputrec, state, &bReadEkin,
+                        (Flags & MD_APPENDFILES),
+                        (Flags & MD_APPENDFILESSET));
+
+        if (bReadEkin)
+        {
+            Flags |= MD_READ_EKIN;
+        }
+    }
+
+    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+    {
+        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr,
+                     Flags, &fplog);
+    }
+
+    /* override nsteps with value from cmdline */
+    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+
+    if (SIMMASTER(cr))
+    {
+        copy_mat(state->box, box);
+    }
+
+    if (PAR(cr))
+    {
+        gmx_bcast(sizeof(box), box, cr);
+    }
+
+    // TODO This should move to do_md(), because it only makes sense
+    // with dynamical integrators, but there is no test coverage and
+    // it interacts with constraints, somehow.
+    /* Essential dynamics */
+    if (opt2bSet("-ei", nfile, fnm))
+    {
+        /* Open input and output files, allocate space for ED data structure */
+        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
+    }
+
+    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
+                     inputrec->eI == eiNM))
+    {
+        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, npme,
+                                           dd_rank_order,
+                                           rdd, rconstr,
+                                           dddlb_opt, dlb_scale,
+                                           ddcsx, ddcsy, ddcsz,
+                                           mtop, inputrec,
+                                           box, state->x,
+                                           &ddbox, &npme_major, &npme_minor);
+    }
+    else
+    {
+        /* PME, if used, is done on all nodes with 1D decomposition */
+        cr->npmenodes = 0;
+        cr->duty      = (DUTY_PP | DUTY_PME);
+        npme_major    = 1;
+        npme_minor    = 1;
+
+        if (inputrec->ePBC == epbcSCREW)
+        {
+            gmx_fatal(FARGS,
+                      "pbc=%s is only implemented with domain decomposition",
+                      epbc_names[inputrec->ePBC]);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        /* After possible communicator splitting in make_dd_communicators,
+         * we can set up the intra/inter node communication.
+         */
+        gmx_setup_nodecomm(fplog, cr);
+    }
+
+    /* Initialize per-physical-node MPI process/thread ID and counters. */
+    gmx_init_intranode_counters(cr);
+#if GMX_MPI
+    if (MULTISIM(cr))
+    {
+        md_print_info(cr, fplog,
+                      "This is simulation %d out of %d running as a composite GROMACS\n"
+                      "multi-simulation job. Setup for this simulation:\n\n",
+                      cr->ms->sim, cr->ms->nsim);
+    }
+    md_print_info(cr, fplog, "Using %d MPI %s\n",
+                  cr->nnodes,
+#if GMX_THREAD_MPI
+                  cr->nnodes == 1 ? "thread" : "threads"
+#else
+                  cr->nnodes == 1 ? "process" : "processes"
+#endif
+                  );
+    fflush(stderr);
+#endif
+
+    /* Check and update hw_opt for the cut-off scheme */
+    check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+    /* Check and update hw_opt for the number of MPI ranks */
+    check_and_update_hw_opt_3(hw_opt);
+
+    gmx_omp_nthreads_init(fplog, cr,
+                          hwinfo->nthreads_hw_avail,
+                          hw_opt->nthreads_omp,
+                          hw_opt->nthreads_omp_pme,
+                          (cr->duty & DUTY_PP) == 0,
+                          inputrec->cutoff_scheme == ecutsVERLET);
+
+#ifndef NDEBUG
+    if (EI_TPI(inputrec->eI) &&
+        inputrec->cutoff_scheme == ecutsVERLET)
+    {
+        gmx_feenableexcept();
+    }
+#endif
+
+    if (bUseGPU)
+    {
+        /* Select GPU id's to use */
+        gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
+                           &hw_opt->gpu_opt);
+    }
+    else
+    {
+        /* Ignore (potentially) manually selected GPUs */
+        hw_opt->gpu_opt.n_dev_use = 0;
+    }
+
+    /* check consistency across ranks of things like SIMD
+     * support and number of GPUs selected */
+    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
+
+    /* Now that we know the setup is consistent, check for efficiency */
+    check_resource_division_efficiency(hwinfo, hw_opt, Flags & MD_NTOMPSET,
+                                       cr, fplog);
+
+    if (DOMAINDECOMP(cr))
+    {
+        /* When we share GPUs over ranks, we need to know this for the DLB */
+        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+    }
+
+    /* getting number of PP/PME threads
+       PME: env variable should be read only on one node to make sure it is
+       identical everywhere;
+     */
+    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+    wcycle = wallcycle_init(fplog, resetstep, cr);
+
+    if (PAR(cr))
+    {
+        /* Master synchronizes its value of reset_counters with all nodes
+         * including PME only nodes */
+        reset_counters = wcycle_get_reset_counters(wcycle);
+        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+        wcycle_set_reset_counters(wcycle, reset_counters);
+    }
+
+    snew(nrnb, 1);
+    if (cr->duty & DUTY_PP)
+    {
+        bcast_state(cr, state);
+
+        /* Initiate forcerecord */
+        fr          = mk_forcerec();
+        fr->hwinfo  = hwinfo;
+        fr->gpu_opt = &hw_opt->gpu_opt;
+        init_forcerec(fplog, fr, fcd, inputrec, mtop, cr, box,
+                      opt2fn("-table", nfile, fnm),
+                      opt2fn("-tablep", nfile, fnm),
+                      opt2fn("-tableb", nfile, fnm),
+                      nbpu_opt,
+                      FALSE,
+                      pforce);
+
+        /* Initialize QM-MM */
+        if (fr->bQMMM)
+        {
+            init_QMMMrec(cr, mtop, inputrec, fr);
+        }
+
+        /* Initialize the mdatoms structure.
+         * mdatoms is not filled with atom data,
+         * as this can not be done now with domain decomposition.
+         */
+        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
+
+        /* Initialize the virtual site communication */
+        vsite = init_vsite(mtop, cr, FALSE);
+
+        calc_shifts(box, fr->shift_vec);
+
+        /* With periodic molecules the charge groups should be whole at start up
+         * and the virtual sites should not be far from their proper positions.
+         */
+        if (!inputrec->bContinuation && MASTER(cr) &&
+            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        {
+            /* Make molecules whole at start of run */
+            if (fr->ePBC != epbcNONE)
+            {
+                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
+            }
+            if (vsite)
+            {
+                /* Correct initial vsite positions are required
+                 * for the initial distribution in the domain decomposition
+                 * and for the initial shell prediction.
+                 */
+                construct_vsites_mtop(vsite, mtop, state->x);
+            }
+        }
+
+        if (EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))
+        {
+            ewaldcoeff_q  = fr->ewaldcoeff_q;
+            ewaldcoeff_lj = fr->ewaldcoeff_lj;
+            pmedata       = &fr->pmedata;
+        }
+        else
+        {
+            pmedata = NULL;
+        }
+    }
+    else
+    {
+        /* This is a PME only node */
+
+        /* We don't need the state */
+        done_state(state);
+
+        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
+        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
+        snew(pmedata, 1);
+    }
+
+    if (hw_opt->thread_affinity != threadaffOFF)
+    {
+        /* Before setting affinity, check whether the affinity has changed
+         * - which indicates that probably the OpenMP library has changed it
+         * since we first checked.
+         */
+        gmx_check_thread_affinity_set(fplog, cr,
+                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+        /* Set the CPU affinity */
+        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
+    }
+
+    /* Initiate PME if necessary,
+     * either on all nodes or on dedicated PME nodes only. */
+    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
+    {
+        if (mdatoms)
+        {
+            nChargePerturbed = mdatoms->nChargePerturbed;
+            if (EVDW_PME(inputrec->vdwtype))
+            {
+                nTypePerturbed   = mdatoms->nTypePerturbed;
+            }
+        }
+        if (cr->npmenodes > 0)
+        {
+            /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
+            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+        }
+
+        if (cr->duty & DUTY_PME)
+        {
+            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
+                                  mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
+                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
+            if (status != 0)
+            {
+                gmx_fatal(FARGS, "Error %d initializing PME", status);
+            }
+        }
+    }
+
+
+    if (EI_DYNAMICS(inputrec->eI))
+    {
+        /* Turn on signal handling on all nodes */
+        /*
+         * A user signal from the PME nodes (if any)
+         * is communicated to the PP nodes.
+         */
+        signal_handler_install();
+    }
+
+    if (cr->duty & DUTY_PP)
+    {
+        /* Assumes uniform use of the number of OpenMP threads */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+        if (inputrec->bPull)
+        {
+            /* Initialize pull code */
+            inputrec->pull_work =
+                init_pull(fplog, inputrec->pull, inputrec, nfile, fnm,
+                          mtop, cr, oenv, inputrec->fepvals->init_lambda,
+                          EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
+        }
+
+        if (inputrec->bRot)
+        {
+            /* Initialize enforced rotation code */
+            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, state->box, mtop, oenv,
+                     bVerbose, Flags);
+        }
+
+        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
+
+        if (DOMAINDECOMP(cr))
+        {
+            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
+            /* This call is not included in init_domain_decomposition mainly
+             * because fr->cginfo_mb is set later.
+             */
+            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
+                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);
+        }
+
+        /* Now do whatever the user wants us to do (how flexible...) */
+        my_integrator(inputrec->eI) (fplog, cr, nfile, fnm,
+                                     oenv, bVerbose,
+                                     nstglobalcomm,
+                                     vsite, constr,
+                                     nstepout, inputrec, mtop,
+                                     fcd, state,
+                                     mdatoms, nrnb, wcycle, ed, fr,
+                                     repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                                     cpt_period, max_hours,
+                                     imdport,
+                                     Flags,
+                                     walltime_accounting);
+
+        if (inputrec->bRot)
+        {
+            finish_rot(inputrec->rot);
+        }
+
+        if (inputrec->bPull)
+        {
+            finish_pull(inputrec->pull_work);
+        }
+
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
+        /* do PME only */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff_q, ewaldcoeff_lj, inputrec);
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+
+    /* Finish up, write some stuff
+     * if rerunMD, don't write last frame again
+     */
+    finish_run(fplog, cr,
+               inputrec, nrnb, wcycle, walltime_accounting,
+               fr ? fr->nbv : NULL,
+               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+
+
+    /* Free GPU memory and context */
+    free_gpu_resources(fr, cr, &hwinfo->gpu_info, fr ? fr->gpu_opt : NULL);
+
+    gmx_hardware_info_free(hwinfo);
+
+    /* Does what it says */
+    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
+    walltime_accounting_destroy(walltime_accounting);
+
+    /* PLUMED */
+    if(plumedswitch){
+      plumed_finalize(plumedmain);
+    }
+    /* END PLUMED */
+
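+    /* Note: the plumedswitch guard above keeps finalization a no-op when no PLUMED
+     * input was requested; the handle itself is assumed to be created and configured
+     * by the companion patched files (see the sketch near the top of this file). */
+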
+    /* Close logfile already here if we were appending to it */
+    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+    {
+        gmx_log_close(fplog);
+    }
+
+    rc = (int)gmx_get_stop_condition();
+
+    done_ed(&ed);
+
+#if GMX_THREAD_MPI
+    /* we need to join all threads. The sub-threads join when they
+       exit this function, but the master thread needs to be told to
+       wait for that. */
+    if (PAR(cr) && MASTER(cr))
+    {
+        tMPI_Finalize();
+    }
+#endif
+
+    return rc;
+}
+
+} // namespace gmx
diff --git a/patches/gromacs-2016-beta1.diff/src/programs/mdrun/runner.cpp.preplumed b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/runner.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..3294776ff121cf4b9ee740de07be997e6636fe2d
--- /dev/null
+++ b/patches/gromacs-2016-beta1.diff/src/programs/mdrun/runner.cpp.preplumed
@@ -0,0 +1,1386 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the MD runner routine calling all integrators.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "runner.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "gromacs/commandline/filenm.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/fileio/checkpoint.h"
+#include "gromacs/fileio/oenv.h"
+#include "gromacs/fileio/tpxio.h"
+#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/listed-forces/disre.h"
+#include "gromacs/listed-forces/orires.h"
+#include "gromacs/math/calculate-ewald-splitting-coefficient.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/integrator.h"
+#include "gromacs/mdlib/main.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/minimize.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/qmmm.h"
+#include "gromacs/mdlib/sighandler.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/tpi.h"
+#include "gromacs/mdrunutility/threadaffinity.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "deform.h"
+#include "md.h"
+#include "repl_ex.h"
+#include "resource-division.h"
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+//! First step used in pressure scaling
+gmx_int64_t         deform_init_init_step_tpx;
+//! Initial box for pressure scaling
+matrix              deform_init_box_tpx;
+//! MPI variable for use in pressure scaling
+tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+
+#if GMX_THREAD_MPI
+/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
+ * the number of threads will get lowered.
+ */
+#define MIN_ATOMS_PER_MPI_THREAD    90
+#define MIN_ATOMS_PER_GPU           900
+
+struct mdrunner_arglist
+{
+    gmx_hw_opt_t            hw_opt;
+    FILE                   *fplog;
+    t_commrec              *cr;
+    int                     nfile;
+    const t_filenm         *fnm;
+    const gmx_output_env_t *oenv;
+    gmx_bool                bVerbose;
+    int                     nstglobalcomm;
+    ivec                    ddxyz;
+    int                     dd_rank_order;
+    int                     npme;
+    real                    rdd;
+    real                    rconstr;
+    const char             *dddlb_opt;
+    real                    dlb_scale;
+    const char             *ddcsx;
+    const char             *ddcsy;
+    const char             *ddcsz;
+    const char             *nbpu_opt;
+    int                     nstlist_cmdline;
+    gmx_int64_t             nsteps_cmdline;
+    int                     nstepout;
+    int                     resetstep;
+    int                     nmultisim;
+    int                     repl_ex_nst;
+    int                     repl_ex_nex;
+    int                     repl_ex_seed;
+    real                    pforce;
+    real                    cpt_period;
+    real                    max_hours;
+    int                     imdport;
+    unsigned long           Flags;
+};
+
+
+/* The function used for spawning threads. Extracts the mdrunner()
+   arguments from its one argument and calls mdrunner(), after making
+   a commrec. */
+static void mdrunner_start_fn(void *arg)
+{
+    try
+    {
+        struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
+        struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
+                                                that it's thread-local. This doesn't
+                                                copy pointed-to items, of course,
+                                                but those are all const. */
+        t_commrec *cr;                       /* we need a local version of this */
+        FILE      *fplog = NULL;
+        t_filenm  *fnm;
+
+        fnm = dup_tfn(mc.nfile, mc.fnm);
+
+        cr = reinitialize_commrec_for_this_thread(mc.cr);
+
+        if (MASTER(cr))
+        {
+            fplog = mc.fplog;
+        }
+
+        gmx::mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+                      mc.bVerbose, mc.nstglobalcomm,
+                      mc.ddxyz, mc.dd_rank_order, mc.npme, mc.rdd,
+                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
+                      mc.ddcsx, mc.ddcsy, mc.ddcsz,
+                      mc.nbpu_opt, mc.nstlist_cmdline,
+                      mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
+                      mc.cpt_period, mc.max_hours, mc.imdport, mc.Flags);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+}
+
+
+/* called by mdrunner() to start a specific number of threads (including
+   the main thread) for thread-parallel runs. This in turn calls mdrunner()
+   for each thread.
+   All options besides nthreads are the same as for mdrunner(). */
+static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+                                         FILE *fplog, t_commrec *cr, int nfile,
+                                         const t_filenm fnm[], const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                                         int nstglobalcomm,
+                                         ivec ddxyz, int dd_rank_order, int npme,
+                                         real rdd, real rconstr,
+                                         const char *dddlb_opt, real dlb_scale,
+                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
+                                         const char *nbpu_opt, int nstlist_cmdline,
+                                         gmx_int64_t nsteps_cmdline,
+                                         int nstepout, int resetstep,
+                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                                         real pforce, real cpt_period, real max_hours,
+                                         unsigned long Flags)
+{
+    int                      ret;
+    struct mdrunner_arglist *mda;
+    t_commrec               *crn; /* the new commrec */
+    t_filenm                *fnmn;
+
+    /* first check whether we even need to start tMPI */
+    if (hw_opt->nthreads_tmpi < 2)
+    {
+        return cr;
+    }
+
+    /* a few small, one-time, almost unavoidable memory leaks: */
+    snew(mda, 1);
+    fnmn = dup_tfn(nfile, fnm);
+
+    /* fill the data structure to pass as void pointer to thread start fn */
+    /* hw_opt contains pointers, which should all be NULL at this stage */
+    mda->hw_opt          = *hw_opt;
+    mda->fplog           = fplog;
+    mda->cr              = cr;
+    mda->nfile           = nfile;
+    mda->fnm             = fnmn;
+    mda->oenv            = oenv;
+    mda->bVerbose        = bVerbose;
+    mda->nstglobalcomm   = nstglobalcomm;
+    mda->ddxyz[XX]       = ddxyz[XX];
+    mda->ddxyz[YY]       = ddxyz[YY];
+    mda->ddxyz[ZZ]       = ddxyz[ZZ];
+    mda->dd_rank_order   = dd_rank_order;
+    mda->npme            = npme;
+    mda->rdd             = rdd;
+    mda->rconstr         = rconstr;
+    mda->dddlb_opt       = dddlb_opt;
+    mda->dlb_scale       = dlb_scale;
+    mda->ddcsx           = ddcsx;
+    mda->ddcsy           = ddcsy;
+    mda->ddcsz           = ddcsz;
+    mda->nbpu_opt        = nbpu_opt;
+    mda->nstlist_cmdline = nstlist_cmdline;
+    mda->nsteps_cmdline  = nsteps_cmdline;
+    mda->nstepout        = nstepout;
+    mda->resetstep       = resetstep;
+    mda->nmultisim       = nmultisim;
+    mda->repl_ex_nst     = repl_ex_nst;
+    mda->repl_ex_nex     = repl_ex_nex;
+    mda->repl_ex_seed    = repl_ex_seed;
+    mda->pforce          = pforce;
+    mda->cpt_period      = cpt_period;
+    mda->max_hours       = max_hours;
+    mda->Flags           = Flags;
+
+    /* now spawn new threads that start mdrunner_start_fn(), while
+       the main thread returns, we set thread affinity later */
+    ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
+                       mdrunner_start_fn, (void*)(mda) );
+    if (ret != TMPI_SUCCESS)
+    {
+        return NULL;
+    }
+
+    crn = reinitialize_commrec_for_this_thread(cr);
+    return crn;
+}
+
+#endif /* GMX_THREAD_MPI */
+
+
+/*! \brief Cost of non-bonded kernels
+ *
+ * We determine the extra cost of the non-bonded kernels compared to
+ * a reference nstlist value of 10 (which is the default in grompp).
+ */
+static const int    nbnxnReferenceNstlist = 10;
+//! The values to try when switching
+const int           nstlist_try[] = { 20, 25, 40 };
+//! Number of elements in the neighborsearch list trials.
+#define NNSTL  sizeof(nstlist_try)/sizeof(nstlist_try[0])
+/* Increase nstlist until the non-bonded cost increases more than listfac_ok,
+ * but never more than listfac_max.
+ * A standard (protein+)water system at 300K with PME ewald_rtol=1e-5
+ * needs 1.28 at rcoulomb=0.9 and 1.24 at rcoulomb=1.0 to get to nstlist=40.
+ * Note that both CPU and GPU factors are conservative. Performance should
+ * not go down due to this tuning, except with a relatively slow GPU.
+ * On the other hand, at medium/high parallelization or with fast GPUs
+ * nstlist will not be increased enough to reach optimal performance.
+ */
+/* CPU: pair-search is about a factor 1.5 slower than the non-bonded kernel */
+//! Max OK performance ratio between force calc and neighbor searching
+static const float  nbnxn_cpu_listfac_ok    = 1.05;
+//! Too high performance ratio between force calc and neighbor searching
+static const float  nbnxn_cpu_listfac_max   = 1.09;
+/* CPU: pair-search is about a factor 2-3 slower than the non-bonded kernel */
+//! Max OK performance ratio between force calc and neighbor searching
+static const float  nbnxn_knl_listfac_ok    = 1.22;
+//! Too high performance ratio between force calc and neighbor searching
+static const float  nbnxn_knl_listfac_max   = 1.3;
+/* GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel */
+//! Max OK performance ratio between force calc and neighbor searching
+static const float  nbnxn_gpu_listfac_ok    = 1.20;
+//! Too high performance ratio between force calc and neighbor searching
+static const float  nbnxn_gpu_listfac_max   = 1.30;
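+/* Illustrative worked example (assumed numbers, not used by the code): with a
+ * reference-buffered rlist of 1.0 nm, rlist_inc = 0.1 nm and the CPU factor
+ * listfac_ok = 1.05, the tuning below keeps increasing nstlist while the new
+ * buffered rlist stays below
+ *   (1.0 + 0.1)*cbrt(1.05) - 0.1 ~= 1.018 nm,
+ * and it never accepts a list longer than the analogous listfac_max bound.
+ */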
+
+/*! \brief Try to increase nstlist when using the Verlet cut-off scheme */
+static void increase_nstlist(FILE *fp, t_commrec *cr,
+                             t_inputrec *ir, int nstlist_cmdline,
+                             const gmx_mtop_t *mtop, matrix box,
+                             gmx_bool bGPU, const gmx::CpuInfo &cpuinfo)
+{
+    float                  listfac_ok, listfac_max;
+    int                    nstlist_orig, nstlist_prev;
+    verletbuf_list_setup_t ls;
+    real                   rlistWithReferenceNstlist, rlist_inc, rlist_ok, rlist_max;
+    real                   rlist_new, rlist_prev;
+    size_t                 nstlist_ind = 0;
+    t_state                state_tmp;
+    gmx_bool               bBox, bDD, bCont;
+    const char            *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+    const char            *nve_err  = "Can not increase nstlist because an NVE ensemble is used";
+    const char            *vbd_err  = "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
+    const char            *box_err  = "Can not increase nstlist because the box is too small";
+    const char            *dd_err   = "Can not increase nstlist because of domain decomposition limitations";
+    char                   buf[STRLEN];
+
+    if (nstlist_cmdline <= 0)
+    {
+        if (ir->nstlist == 1)
+        {
+            /* The user probably set nstlist=1 for a reason,
+             * don't mess with the settings.
+             */
+            return;
+        }
+
+        if (fp != NULL && bGPU && ir->nstlist < nstlist_try[0])
+        {
+            fprintf(fp, nstl_gpu, ir->nstlist);
+        }
+        nstlist_ind = 0;
+        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
+        {
+            nstlist_ind++;
+        }
+        if (nstlist_ind == NNSTL)
+        {
+            /* There are no larger nstlist values to try */
+            return;
+        }
+    }
+
+    if (EI_MD(ir->eI) && ir->etc == etcNO)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", nve_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", nve_err);
+        }
+
+        return;
+    }
+
+    if (ir->verletbuf_tol == 0 && bGPU)
+    {
+        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+    }
+
+    if (ir->verletbuf_tol < 0)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", vbd_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", vbd_err);
+        }
+
+        return;
+    }
+
+    if (bGPU)
+    {
+        listfac_ok  = nbnxn_gpu_listfac_ok;
+        listfac_max = nbnxn_gpu_listfac_max;
+    }
+    else if (cpuinfo.feature(gmx::CpuInfo::Feature::X86_Avx512ER))
+    {
+        listfac_ok  = nbnxn_knl_listfac_ok;
+        listfac_max = nbnxn_knl_listfac_max;
+    }
+    else
+    {
+        listfac_ok  = nbnxn_cpu_listfac_ok;
+        listfac_max = nbnxn_cpu_listfac_max;
+    }
+
+    nstlist_orig = ir->nstlist;
+    if (nstlist_cmdline > 0)
+    {
+        if (fp)
+        {
+            sprintf(buf, "Getting nstlist=%d from command line option",
+                    nstlist_cmdline);
+        }
+        ir->nstlist = nstlist_cmdline;
+    }
+
+    verletbuf_get_list_setup(TRUE, bGPU, &ls);
+
+    /* Allow rlist to make the list a given factor larger than the list
+     * would be with the reference value for nstlist (10).
+     */
+    nstlist_prev = ir->nstlist;
+    ir->nstlist  = nbnxnReferenceNstlist;
+    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL,
+                            &rlistWithReferenceNstlist);
+    ir->nstlist  = nstlist_prev;
+
+    /* Determine the pair list size increase due to zero interactions */
+    rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
+                                              mtop->natoms/det(box));
+    rlist_ok  = (rlistWithReferenceNstlist + rlist_inc)*std::cbrt(listfac_ok) - rlist_inc;
+    rlist_max = (rlistWithReferenceNstlist + rlist_inc)*std::cbrt(listfac_max) - rlist_inc;
+    if (debug)
+    {
+        fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
+                rlist_inc, rlist_ok, rlist_max);
+    }
+
+    nstlist_prev = nstlist_orig;
+    rlist_prev   = ir->rlist;
+    do
+    {
+        if (nstlist_cmdline <= 0)
+        {
+            ir->nstlist = nstlist_try[nstlist_ind];
+        }
+
+        /* Set the pair-list buffer size in ir */
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        /* Does rlist fit in the box? */
+        bBox = (gmx::square(rlist_new) < max_cutoff2(ir->ePBC, box));
+        bDD  = TRUE;
+        if (bBox && DOMAINDECOMP(cr))
+        {
+            /* Check if rlist fits in the domain decomposition */
+            if (inputrec2nboundeddim(ir) < DIM)
+            {
+                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+            }
+            copy_mat(box, state_tmp.box);
+            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
+        }
+
+        if (debug)
+        {
+            fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
+                    ir->nstlist, rlist_new, bBox, bDD);
+        }
+
+        bCont = FALSE;
+
+        if (nstlist_cmdline <= 0)
+        {
+            if (bBox && bDD && rlist_new <= rlist_max)
+            {
+                /* Increase nstlist */
+                nstlist_prev = ir->nstlist;
+                rlist_prev   = rlist_new;
+                bCont        = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
+            }
+            else
+            {
+                /* Stick with the previous nstlist */
+                ir->nstlist = nstlist_prev;
+                rlist_new   = rlist_prev;
+                bBox        = TRUE;
+                bDD         = TRUE;
+            }
+        }
+
+        nstlist_ind++;
+    }
+    while (bCont);
+
+    if (!bBox || !bDD)
+    {
+        gmx_warning(!bBox ? box_err : dd_err);
+        if (fp != NULL)
+        {
+            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
+        }
+        ir->nstlist = nstlist_orig;
+    }
+    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+    {
+        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
+                nstlist_orig, ir->nstlist,
+                ir->rlist, rlist_new);
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n\n", buf);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n\n", buf);
+        }
+        ir->rlist     = rlist_new;
+    }
+}
+
+/*! \brief Initialize variables for Verlet scheme simulation */
+static void prepare_verlet_scheme(FILE                           *fplog,
+                                  t_commrec                      *cr,
+                                  t_inputrec                     *ir,
+                                  int                             nstlist_cmdline,
+                                  const gmx_mtop_t               *mtop,
+                                  matrix                          box,
+                                  gmx_bool                        bUseGPU,
+                                  const gmx::CpuInfo             &cpuinfo)
+{
+    /* For NVE simulations, we will retain the initial list buffer */
+    if (EI_DYNAMICS(ir->eI) &&
+        ir->verletbuf_tol > 0 &&
+        !(EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        /* Update the Verlet buffer size for the current run setup */
+        verletbuf_list_setup_t ls;
+        real                   rlist_new;
+
+        /* Here we assume SIMD-enabled kernels are being used. But as currently
+         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+         * and 4x2 gives a larger buffer than 4x4, this is ok.
+         */
+        verletbuf_get_list_setup(TRUE, bUseGPU, &ls);
+
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        if (rlist_new != ir->rlist)
+        {
+            if (fplog != NULL)
+            {
+                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+                        ir->rlist, rlist_new,
+                        ls.cluster_size_i, ls.cluster_size_j);
+            }
+            ir->rlist     = rlist_new;
+        }
+    }
+
+    if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
+    {
+        gmx_fatal(FARGS, "Can not set nstlist without %s",
+                  !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
+    }
+
+    if (EI_DYNAMICS(ir->eI))
+    {
+        /* Set or try nstlist values */
+        increase_nstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, bUseGPU, cpuinfo);
+    }
+}
+
+/*! \brief Override the nsteps value in inputrec
+ *
+ * with the value passed on the command line (if any)
+ */
+static void override_nsteps_cmdline(FILE            *fplog,
+                                    gmx_int64_t      nsteps_cmdline,
+                                    t_inputrec      *ir,
+                                    const t_commrec *cr)
+{
+    assert(ir);
+    assert(cr);
+
+    /* Override with anything other than the default -2 */
+    if (nsteps_cmdline > -2)
+    {
+        char sbuf_steps[STEPSTRSIZE];
+        char sbuf_msg[STRLEN];
+
+        ir->nsteps = nsteps_cmdline;
+        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps),
+                    fabs(nsteps_cmdline*ir->delta_t));
+        }
+        else
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps));
+        }
+
+        md_print_warn(cr, fplog, "%s\n", sbuf_msg);
+    }
+    else if (nsteps_cmdline < -2)
+    {
+        gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %d",
+                  nsteps_cmdline);
+    }
+    /* Do nothing if nsteps_cmdline == -2 */
+}
+
+namespace gmx
+{
+
+//! \brief Return the correct integrator function.
+static integrator_t *my_integrator(unsigned int ei)
+{
+    switch (ei)
+    {
+        case eiMD:
+        case eiBD:
+        case eiSD1:
+        case eiVV:
+        case eiVVAK:
+            if (!EI_DYNAMICS(ei))
+            {
+                GMX_THROW(APIError("do_md integrator would be called for a non-dynamical integrator"));
+            }
+            return do_md;
+        case eiSteep:
+            return do_steep;
+        case eiCG:
+            return do_cg;
+        case eiNM:
+            return do_nm;
+        case eiLBFGS:
+            return do_lbfgs;
+        case eiTPI:
+        case eiTPIC:
+            if (!EI_TPI(ei))
+            {
+                GMX_THROW(APIError("do_tpi integrator would be called for a non-TPI integrator"));
+            }
+            return do_tpi;
+        case eiSD2_REMOVED:
+            GMX_THROW(NotImplementedError("SD2 integrator has been removed"));
+        default:
+            GMX_THROW(APIError("Non existing integrator selected"));
+    }
+}
+
+int mdrunner(gmx_hw_opt_t *hw_opt,
+             FILE *fplog, t_commrec *cr, int nfile,
+             const t_filenm fnm[], const gmx_output_env_t *oenv, gmx_bool bVerbose,
+             int nstglobalcomm,
+             ivec ddxyz, int dd_rank_order, int npme, real rdd, real rconstr,
+             const char *dddlb_opt, real dlb_scale,
+             const char *ddcsx, const char *ddcsy, const char *ddcsz,
+             const char *nbpu_opt, int nstlist_cmdline,
+             gmx_int64_t nsteps_cmdline, int nstepout, int resetstep,
+             int gmx_unused nmultisim, int repl_ex_nst, int repl_ex_nex,
+             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
+             int imdport, unsigned long Flags)
+{
+    gmx_bool                  bForceUseGPU, bTryUseGPU, bRerunMD;
+    t_inputrec               *inputrec;
+    t_state                  *state = NULL;
+    matrix                    box;
+    gmx_ddbox_t               ddbox = {0};
+    int                       npme_major, npme_minor;
+    t_nrnb                   *nrnb;
+    gmx_mtop_t               *mtop          = NULL;
+    t_mdatoms                *mdatoms       = NULL;
+    t_forcerec               *fr            = NULL;
+    t_fcdata                 *fcd           = NULL;
+    real                      ewaldcoeff_q  = 0;
+    real                      ewaldcoeff_lj = 0;
+    struct gmx_pme_t        **pmedata       = NULL;
+    gmx_vsite_t              *vsite         = NULL;
+    gmx_constr_t              constr;
+    int                       nChargePerturbed = -1, nTypePerturbed = 0, status;
+    gmx_wallcycle_t           wcycle;
+    gmx_walltime_accounting_t walltime_accounting = NULL;
+    int                       rc;
+    gmx_int64_t               reset_counters;
+    gmx_edsam_t               ed           = NULL;
+    int                       nthreads_pme = 1;
+    gmx_hw_info_t            *hwinfo       = NULL;
+    /* The master rank decides early on bUseGPU and broadcasts this later */
+    gmx_bool                  bUseGPU            = FALSE;
+
+    /* CAUTION: threads may be started later on in this function, so
+       cr doesn't reflect the final parallel state right now */
+    snew(inputrec, 1);
+    snew(mtop, 1);
+
+    if (Flags & MD_APPENDFILES)
+    {
+        fplog = NULL;
+    }
+
+    bRerunMD     = (Flags & MD_RERUN);
+    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
+    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+
+    /* Detect hardware, gather information. This is an operation that is
+     * global for this process (MPI rank). */
+    hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);
+
+    gmx_print_detected_hardware(fplog, cr, hwinfo);
+
+    if (fplog != NULL)
+    {
+        /* Print references after all software/hardware printing */
+        please_cite(fplog, "Abraham2015");
+        please_cite(fplog, "Pall2015");
+        please_cite(fplog, "Pronk2013");
+        please_cite(fplog, "Hess2008b");
+        please_cite(fplog, "Spoel2005a");
+        please_cite(fplog, "Lindahl2001a");
+        please_cite(fplog, "Berendsen95a");
+    }
+
+    snew(state, 1);
+    if (SIMMASTER(cr))
+    {
+        /* Read (nearly) all data required for the simulation */
+        read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, state, mtop);
+
+        if (inputrec->cutoff_scheme == ecutsVERLET)
+        {
+            /* Here the master rank decides if all ranks will use GPUs */
+            bUseGPU = (hwinfo->gpu_info.n_dev_compatible > 0 ||
+                       getenv("GMX_EMULATE_GPU") != NULL);
+
+            /* TODO add GPU kernels for this and replace this check by:
+             * (bUseGPU && (ir->vdwtype == evdwPME &&
+             *               ir->ljpme_combination_rule == eljpmeLB))
+             * update the message text and the content of nbnxn_acceleration_supported.
+             */
+            if (bUseGPU &&
+                !nbnxn_gpu_acceleration_supported(fplog, cr, inputrec, bRerunMD))
+            {
+                /* Fallback message printed by nbnxn_acceleration_supported */
+                if (bForceUseGPU)
+                {
+                    gmx_fatal(FARGS, "GPU acceleration requested, but not supported with the given input settings");
+                }
+                bUseGPU = FALSE;
+            }
+
+            prepare_verlet_scheme(fplog, cr,
+                                  inputrec, nstlist_cmdline, mtop, state->box,
+                                  bUseGPU, *hwinfo->cpuInfo);
+        }
+        else
+        {
+            if (nstlist_cmdline > 0)
+            {
+                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
+            }
+
+            if (hwinfo->gpu_info.n_dev_compatible > 0)
+            {
+                md_print_warn(cr, fplog,
+                              "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+                              "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n");
+            }
+
+            if (bForceUseGPU)
+            {
+                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
+            }
+
+#if GMX_TARGET_BGQ
+            md_print_warn(cr, fplog,
+                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
+                          "      BlueGene/Q. You will observe better performance from using the\n"
+                          "      Verlet cut-off scheme.\n");
+#endif
+        }
+    }
+
+    /* Check and update the hardware options for internal consistency */
+    check_and_update_hw_opt_1(hw_opt, cr, npme);
+
+    /* Early check for externally set process affinity. */
+    gmx_check_thread_affinity_set(fplog, cr,
+                                  hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+
+#if GMX_THREAD_MPI
+    if (SIMMASTER(cr))
+    {
+        if (npme > 0 && hw_opt->nthreads_tmpi <= 0)
+        {
+            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
+        }
+
+        /* Since the master knows the cut-off scheme, update hw_opt for this.
+         * This is done later for normal MPI and also once more with tMPI
+         * for all tMPI ranks.
+         */
+        check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+        /* NOW the threads will be started: */
+        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+                                                 hw_opt,
+                                                 inputrec, mtop,
+                                                 cr, fplog, bUseGPU);
+
+        if (hw_opt->nthreads_tmpi > 1)
+        {
+            t_commrec *cr_old       = cr;
+            /* now start the threads. */
+            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
+                                        oenv, bVerbose, nstglobalcomm,
+                                        ddxyz, dd_rank_order, npme, rdd, rconstr,
+                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+                                        nbpu_opt, nstlist_cmdline,
+                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
+                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+                                        cpt_period, max_hours,
+                                        Flags);
+            /* the main thread continues here with a new cr. We don't deallocate
+               the old cr because other threads may still be reading it. */
+            if (cr == NULL)
+            {
+                gmx_comm("Failed to spawn threads");
+            }
+        }
+    }
+#endif
+    /* END OF CAUTION: cr is now reliable */
+
+    if (PAR(cr))
+    {
+        /* now broadcast everything to the non-master nodes/threads: */
+        init_parallel(cr, inputrec, mtop);
+
+        /* The master rank decided on the use of GPUs,
+         * broadcast this information to all ranks.
+         */
+        gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
+    }
+
+    if (fplog != NULL)
+    {
+        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+        fprintf(fplog, "\n");
+    }
+
+    /* now make sure the state is initialized and propagated */
+    set_state_entries(state, inputrec);
+
+    /* A parallel command line option consistency check that we can
+       only do after any threads have started. */
+    if (!PAR(cr) &&
+        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || npme > 0))
+    {
+        gmx_fatal(FARGS,
+                  "The -dd or -npme option request a parallel simulation, "
+#if !GMX_MPI
+                  "but %s was compiled without threads or MPI enabled"
+#else
+#if GMX_THREAD_MPI
+                  "but the number of MPI-threads (option -ntmpi) is not set or is 1"
+#else
+                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec"
+#endif
+#endif
+                  , output_env_get_program_display_name(oenv)
+                  );
+    }
+
+    if (bRerunMD &&
+        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+    {
+        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+    }
+
+    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
+    }
+
+    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
+    {
+        if (npme > 0)
+        {
+            gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
+                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
+        }
+
+        npme = 0;
+    }
+
+    if (bUseGPU && npme < 0)
+    {
+        /* With GPUs we don't automatically use PME-only ranks. PME ranks can
+         * improve performance with many threads per GPU, since our OpenMP
+         * scaling is bad, but it's difficult to automate the setup.
+         */
+        npme = 0;
+    }
+
+#ifdef GMX_FAHCORE
+    if (MASTER(cr))
+    {
+        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+    }
+#endif
+
+    /* NMR restraints must be initialized before load_checkpoint,
+     * since with time averaging the history is added to t_state.
+     * For proper consistency check we therefore need to extend
+     * t_state here.
+     * So the PME-only nodes (if present) will also initialize
+     * the distance restraints.
+     */
+    snew(fcd, 1);
+
+    /* This needs to be called before read_checkpoint to extend the state */
+    init_disres(fplog, mtop, inputrec, cr, fcd, state, repl_ex_nst > 0);
+
+    init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires),
+                state);
+
+    if (inputrecDeform(inputrec))
+    {
+        /* Store the deform reference box before reading the checkpoint */
+        if (SIMMASTER(cr))
+        {
+            copy_mat(state->box, box);
+        }
+        if (PAR(cr))
+        {
+            gmx_bcast(sizeof(box), box, cr);
+        }
+        /* Because we do not have the update struct available yet
+         * in which the reference values should be stored,
+         * we store them temporarily in static variables.
+         * This should be thread safe, since they are only written once
+         * and with identical values.
+         */
+        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+        deform_init_init_step_tpx = inputrec->init_step;
+        copy_mat(box, deform_init_box_tpx);
+        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+    }
+
+    if (Flags & MD_STARTFROMCPT)
+    {
+        /* Check if checkpoint file exists before doing continuation.
+         * This way we can use identical input options for the first and subsequent runs...
+         */
+        gmx_bool bReadEkin;
+
+        load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+                        cr, ddxyz, &npme,
+                        inputrec, state, &bReadEkin,
+                        (Flags & MD_APPENDFILES),
+                        (Flags & MD_APPENDFILESSET));
+
+        if (bReadEkin)
+        {
+            Flags |= MD_READ_EKIN;
+        }
+    }
+
+    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+    {
+        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr,
+                     Flags, &fplog);
+    }
+
+    /* override nsteps with value from cmdline */
+    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+
+    if (SIMMASTER(cr))
+    {
+        copy_mat(state->box, box);
+    }
+
+    if (PAR(cr))
+    {
+        gmx_bcast(sizeof(box), box, cr);
+    }
+
+    // TODO This should move to do_md(), because it only makes sense
+    // with dynamical integrators, but there is no test coverage and
+    // it interacts with constraints, somehow.
+    /* Essential dynamics */
+    if (opt2bSet("-ei", nfile, fnm))
+    {
+        /* Open input and output files, allocate space for ED data structure */
+        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
+    }
+
+    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
+                     inputrec->eI == eiNM))
+    {
+        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, npme,
+                                           dd_rank_order,
+                                           rdd, rconstr,
+                                           dddlb_opt, dlb_scale,
+                                           ddcsx, ddcsy, ddcsz,
+                                           mtop, inputrec,
+                                           box, state->x,
+                                           &ddbox, &npme_major, &npme_minor);
+    }
+    else
+    {
+        /* PME, if used, is done on all nodes with 1D decomposition */
+        cr->npmenodes = 0;
+        cr->duty      = (DUTY_PP | DUTY_PME);
+        npme_major    = 1;
+        npme_minor    = 1;
+
+        if (inputrec->ePBC == epbcSCREW)
+        {
+            gmx_fatal(FARGS,
+                      "pbc=%s is only implemented with domain decomposition",
+                      epbc_names[inputrec->ePBC]);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        /* After possible communicator splitting in make_dd_communicators.
+         * we can set up the intra/inter node communication.
+         */
+        gmx_setup_nodecomm(fplog, cr);
+    }
+
+    /* Initialize per-physical-node MPI process/thread ID and counters. */
+    gmx_init_intranode_counters(cr);
+#if GMX_MPI
+    if (MULTISIM(cr))
+    {
+        md_print_info(cr, fplog,
+                      "This is simulation %d out of %d running as a composite GROMACS\n"
+                      "multi-simulation job. Setup for this simulation:\n\n",
+                      cr->ms->sim, cr->ms->nsim);
+    }
+    md_print_info(cr, fplog, "Using %d MPI %s\n",
+                  cr->nnodes,
+#if GMX_THREAD_MPI
+                  cr->nnodes == 1 ? "thread" : "threads"
+#else
+                  cr->nnodes == 1 ? "process" : "processes"
+#endif
+                  );
+    fflush(stderr);
+#endif
+
+    /* Check and update hw_opt for the cut-off scheme */
+    check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+    /* Check and update hw_opt for the number of MPI ranks */
+    check_and_update_hw_opt_3(hw_opt);
+
+    gmx_omp_nthreads_init(fplog, cr,
+                          hwinfo->nthreads_hw_avail,
+                          hw_opt->nthreads_omp,
+                          hw_opt->nthreads_omp_pme,
+                          (cr->duty & DUTY_PP) == 0,
+                          inputrec->cutoff_scheme == ecutsVERLET);
+
+#ifndef NDEBUG
+    if (EI_TPI(inputrec->eI) &&
+        inputrec->cutoff_scheme == ecutsVERLET)
+    {
+        gmx_feenableexcept();
+    }
+#endif
+
+    if (bUseGPU)
+    {
+        /* Select GPU id's to use */
+        gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
+                           &hw_opt->gpu_opt);
+    }
+    else
+    {
+        /* Ignore (potentially) manually selected GPUs */
+        hw_opt->gpu_opt.n_dev_use = 0;
+    }
+
+    /* check consistency across ranks of things like SIMD
+     * support and number of GPUs selected */
+    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
+
+    /* Now that we know the setup is consistent, check for efficiency */
+    check_resource_division_efficiency(hwinfo, hw_opt, Flags & MD_NTOMPSET,
+                                       cr, fplog);
+
+    if (DOMAINDECOMP(cr))
+    {
+        /* When we share GPUs over ranks, we need to know this for the DLB */
+        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+    }
+
+    /* getting number of PP/PME threads
+       PME: env variable should be read only on one node to make sure it is
+       identical everywhere;
+     */
+    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+    wcycle = wallcycle_init(fplog, resetstep, cr);
+
+    if (PAR(cr))
+    {
+        /* Master synchronizes its value of reset_counters with all nodes
+         * including PME only nodes */
+        reset_counters = wcycle_get_reset_counters(wcycle);
+        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+        wcycle_set_reset_counters(wcycle, reset_counters);
+    }
+
+    snew(nrnb, 1);
+    if (cr->duty & DUTY_PP)
+    {
+        bcast_state(cr, state);
+
+        /* Initiate forcerecord */
+        fr          = mk_forcerec();
+        fr->hwinfo  = hwinfo;
+        fr->gpu_opt = &hw_opt->gpu_opt;
+        init_forcerec(fplog, fr, fcd, inputrec, mtop, cr, box,
+                      opt2fn("-table", nfile, fnm),
+                      opt2fn("-tablep", nfile, fnm),
+                      opt2fn("-tableb", nfile, fnm),
+                      nbpu_opt,
+                      FALSE,
+                      pforce);
+
+        /* Initialize QM-MM */
+        if (fr->bQMMM)
+        {
+            init_QMMMrec(cr, mtop, inputrec, fr);
+        }
+
+        /* Initialize the mdatoms structure.
+         * mdatoms is not filled with atom data,
+         * as this can not be done now with domain decomposition.
+         */
+        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
+
+        /* Initialize the virtual site communication */
+        vsite = init_vsite(mtop, cr, FALSE);
+
+        calc_shifts(box, fr->shift_vec);
+
+        /* With periodic molecules the charge groups should be whole at start up
+         * and the virtual sites should not be far from their proper positions.
+         */
+        if (!inputrec->bContinuation && MASTER(cr) &&
+            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        {
+            /* Make molecules whole at start of run */
+            if (fr->ePBC != epbcNONE)
+            {
+                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
+            }
+            if (vsite)
+            {
+                /* Correct initial vsite positions are required
+                 * for the initial distribution in the domain decomposition
+                 * and for the initial shell prediction.
+                 */
+                construct_vsites_mtop(vsite, mtop, state->x);
+            }
+        }
+
+        if (EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))
+        {
+            ewaldcoeff_q  = fr->ewaldcoeff_q;
+            ewaldcoeff_lj = fr->ewaldcoeff_lj;
+            pmedata       = &fr->pmedata;
+        }
+        else
+        {
+            pmedata = NULL;
+        }
+    }
+    else
+    {
+        /* This is a PME only node */
+
+        /* We don't need the state */
+        done_state(state);
+
+        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
+        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
+        snew(pmedata, 1);
+    }
+
+    if (hw_opt->thread_affinity != threadaffOFF)
+    {
+        /* Before setting affinity, check whether the affinity has changed
+         * - which indicates that probably the OpenMP library has changed it
+         * since we first checked.
+         */
+        gmx_check_thread_affinity_set(fplog, cr,
+                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+        /* Set the CPU affinity */
+        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
+    }
+
+    /* Initiate PME if necessary,
+     * either on all nodes or on dedicated PME nodes only. */
+    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
+    {
+        if (mdatoms)
+        {
+            nChargePerturbed = mdatoms->nChargePerturbed;
+            if (EVDW_PME(inputrec->vdwtype))
+            {
+                nTypePerturbed   = mdatoms->nTypePerturbed;
+            }
+        }
+        if (cr->npmenodes > 0)
+        {
+            /* The PME-only nodes need to know nChargePerturbed (FEP on Q) and nTypePerturbed (FEP on LJ) */
+            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+        }
+
+        if (cr->duty & DUTY_PME)
+        {
+            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
+                                  mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
+                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
+            if (status != 0)
+            {
+                gmx_fatal(FARGS, "Error %d initializing PME", status);
+            }
+        }
+    }
+
+
+    if (EI_DYNAMICS(inputrec->eI))
+    {
+        /* Turn on signal handling on all nodes */
+        /*
+         * A user signal from the PME nodes (if any)
+         * is communicated to the PP nodes.
+         */
+        signal_handler_install();
+    }
+
+    if (cr->duty & DUTY_PP)
+    {
+        /* Assumes uniform use of the number of OpenMP threads */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+        if (inputrec->bPull)
+        {
+            /* Initialize pull code */
+            inputrec->pull_work =
+                init_pull(fplog, inputrec->pull, inputrec, nfile, fnm,
+                          mtop, cr, oenv, inputrec->fepvals->init_lambda,
+                          EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
+        }
+
+        if (inputrec->bRot)
+        {
+            /* Initialize enforced rotation code */
+            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, state->box, mtop, oenv,
+                     bVerbose, Flags);
+        }
+
+        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
+
+        if (DOMAINDECOMP(cr))
+        {
+            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
+            /* This call is not included in init_domain_decomposition mainly
+             * because fr->cginfo_mb is set later.
+             */
+            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
+                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);
+        }
+
+        /* Now do whatever the user wants us to do (how flexible...) */
+        my_integrator(inputrec->eI) (fplog, cr, nfile, fnm,
+                                     oenv, bVerbose,
+                                     nstglobalcomm,
+                                     vsite, constr,
+                                     nstepout, inputrec, mtop,
+                                     fcd, state,
+                                     mdatoms, nrnb, wcycle, ed, fr,
+                                     repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                                     cpt_period, max_hours,
+                                     imdport,
+                                     Flags,
+                                     walltime_accounting);
+
+        if (inputrec->bRot)
+        {
+            finish_rot(inputrec->rot);
+        }
+
+        if (inputrec->bPull)
+        {
+            finish_pull(inputrec->pull_work);
+        }
+
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
+        /* do PME only */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff_q, ewaldcoeff_lj, inputrec);
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+
+    /* Finish up, write some stuff
+     * if rerunMD, don't write last frame again
+     */
+    finish_run(fplog, cr,
+               inputrec, nrnb, wcycle, walltime_accounting,
+               fr ? fr->nbv : NULL,
+               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+
+
+    /* Free GPU memory and context */
+    free_gpu_resources(fr, cr, &hwinfo->gpu_info, fr ? fr->gpu_opt : NULL);
+
+    gmx_hardware_info_free(hwinfo);
+
+    /* Does what it says */
+    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
+    walltime_accounting_destroy(walltime_accounting);
+
+    /* Close logfile already here if we were appending to it */
+    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+    {
+        gmx_log_close(fplog);
+    }
+
+    rc = (int)gmx_get_stop_condition();
+
+    done_ed(&ed);
+
+#if GMX_THREAD_MPI
+    /* we need to join all threads. The sub-threads join when they
+       exit this function, but the master thread needs to be told to
+       wait for that. */
+    if (PAR(cr) && MASTER(cr))
+    {
+        tMPI_Finalize();
+    }
+#endif
+
+    return rc;
+}
+
+} // namespace gmx
diff --git a/patches/gromacs-5.0.7.config b/patches/gromacs-5.0.7.config
new file mode 100644
index 0000000000000000000000000000000000000000..03f65f9e10a72823dcc4648c695166cffa631af6
--- /dev/null
+++ b/patches/gromacs-5.0.7.config
@@ -0,0 +1,24 @@
+
+
+function plumed_preliminary_test(){
+# check if the README contains the word GROMACS and if gromacs has been already configured
+  grep -q GROMACS README 1>/dev/null 2>/dev/null
+}
+
+function plumed_patch_info(){
+cat << EOF
+
+PLUMED can be incorporated into gromacs using the standard patching procedure.
+Patching must be done in the gromacs root directory _before_ the cmake command is invoked.
+
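+As an example (assuming PLUMED is installed and this engine name is available
+in your PLUMED version), from the gromacs root directory one would typically run:
+
+plumed patch -p -e gromacs-5.0.7
+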
+To enable PLUMED in a gromacs simulation one should use
+mdrun with an extra -plumed flag. The flag can be used to
+specify the name of the PLUMED input file, e.g.:
+
+gmx mdrun -plumed plumed.dat
+
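+A minimal PLUMED input file (plumed.dat) could look like the following
+(illustrative sketch only; the atom indices and file names are arbitrary and
+should be adapted to your system):
+
+d: DISTANCE ATOMS=1,10
+PRINT ARG=d FILE=COLVAR STRIDE=100
+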
+For more information on gromacs you should visit http://www.gromacs.org
+
+EOF
+}
+
diff --git a/patches/gromacs-5.0.4.diff/src/gromacs/CMakeLists.txt b/patches/gromacs-5.0.7.diff/src/gromacs/CMakeLists.txt
similarity index 100%
rename from patches/gromacs-5.0.4.diff/src/gromacs/CMakeLists.txt
rename to patches/gromacs-5.0.7.diff/src/gromacs/CMakeLists.txt
diff --git a/patches/gromacs-5.0.4.diff/src/gromacs/CMakeLists.txt.preplumed b/patches/gromacs-5.0.7.diff/src/gromacs/CMakeLists.txt.preplumed
similarity index 100%
rename from patches/gromacs-5.0.4.diff/src/gromacs/CMakeLists.txt.preplumed
rename to patches/gromacs-5.0.7.diff/src/gromacs/CMakeLists.txt.preplumed
diff --git a/patches/gromacs-5.0.4.diff/src/gromacs/mdlib/force.c b/patches/gromacs-5.0.7.diff/src/gromacs/mdlib/force.c
similarity index 100%
rename from patches/gromacs-5.0.4.diff/src/gromacs/mdlib/force.c
rename to patches/gromacs-5.0.7.diff/src/gromacs/mdlib/force.c
diff --git a/patches/gromacs-5.0.4.diff/src/gromacs/mdlib/force.c.preplumed b/patches/gromacs-5.0.7.diff/src/gromacs/mdlib/force.c.preplumed
similarity index 100%
rename from patches/gromacs-5.0.4.diff/src/gromacs/mdlib/force.c.preplumed
rename to patches/gromacs-5.0.7.diff/src/gromacs/mdlib/force.c.preplumed
diff --git a/patches/gromacs-5.0.4.diff/src/gromacs/mdlib/minimize.c b/patches/gromacs-5.0.7.diff/src/gromacs/mdlib/minimize.c
similarity index 100%
rename from patches/gromacs-5.0.4.diff/src/gromacs/mdlib/minimize.c
rename to patches/gromacs-5.0.7.diff/src/gromacs/mdlib/minimize.c
diff --git a/patches/gromacs-5.0.4.diff/src/gromacs/mdlib/minimize.c.preplumed b/patches/gromacs-5.0.7.diff/src/gromacs/mdlib/minimize.c.preplumed
similarity index 100%
rename from patches/gromacs-5.0.4.diff/src/gromacs/mdlib/minimize.c.preplumed
rename to patches/gromacs-5.0.7.diff/src/gromacs/mdlib/minimize.c.preplumed
diff --git a/patches/gromacs-5.0.4.diff/src/programs/mdrun/md.c b/patches/gromacs-5.0.7.diff/src/programs/mdrun/md.c
similarity index 98%
rename from patches/gromacs-5.0.4.diff/src/programs/mdrun/md.c
rename to patches/gromacs-5.0.7.diff/src/programs/mdrun/md.c
index c881c0170f906b1e8399148f878b71eae5d7af8f..57f06b074dafbe3f6e7a9607afebb0bf45cb9d54 100644
--- a/patches/gromacs-5.0.4.diff/src/programs/mdrun/md.c
+++ b/patches/gromacs-5.0.7.diff/src/programs/mdrun/md.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2011,2012,2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -203,7 +203,8 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
     gmx_bool          bUpdateDoLR;
     real              dvdl_constr;
-    rvec             *cbuf = NULL;
+    rvec             *cbuf        = NULL;
+    int               cbuf_nalloc = 0;
     matrix            lastbox;
     real              veta_save, scalevir, tracevir;
     real              vetanew = 0;
@@ -259,10 +260,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
     bVV = EI_VV(ir->eI);
-    if (bVV) /* to store the initial velocities while computing virial */
-    {
-        snew(cbuf, top_global->natoms);
-    }
     /* all the iteratative cases - only if there are constraints */
     bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
     gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
@@ -365,6 +362,11 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         gmx_fatal(FARGS, "Normal Mode analysis is not supported with virtual sites.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n");
     }
 
+    if (bRerunMD && fr->cutoff_scheme == ecutsVERLET && ir->opts.ngener > 1 && fr->nbv && fr->nbv->bUseGPU)
+    {
+        gmx_fatal(FARGS, "The Verlet scheme on GPUs does not support energy groups, so your rerun should probably use a .tpr file without energy groups, or mdrun -nb auto");
+    }
+
     if (DEFORM(*ir))
     {
         tMPI_Thread_mutex_lock(&deform_init_box_mutex);
@@ -551,16 +553,21 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     nstfep = ir->fepvals->nstdhdl;
     if (ir->bExpanded)
     {
-        nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl, nstfep);
+        nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
     }
     if (repl_ex_nst > 0)
     {
         nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep);
     }
 
-    /* I'm assuming we need global communication the first time! MRS */
+    /* Be REALLY careful about what flags you set here. You CANNOT assume
+     * this is the first step, since we might be restarting from a checkpoint,
+     * and in that case we should not do any modifications to the state.
+     */
+    bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
+
     cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
-                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
+                  | (bStopCM ? CGLO_STOPCM : 0)
                   | (bVV ? CGLO_PRESSURE : 0)
                   | (bVV ? CGLO_CONSTRAINT : 0)
                   | (bRerunMD ? CGLO_RERUNMD : 0)
@@ -1187,6 +1194,8 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         if (bVV && !bStartingFromCpt && !bRerunMD)
         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
         {
+            rvec *vbuf = NULL;
+
             wallcycle_start(wcycle, ewcUPDATE);
             if (ir->eI == eiVV && bInitStep)
             {
@@ -1196,7 +1205,8 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                  * revert back to the initial coordinates
                  * so that the input is actually the initial step.
                  */
-                copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */
+                snew(vbuf, state->natoms);
+                copy_rvecn(state->v, vbuf, 0, state->natoms); /* should make this better for parallelizing? */
             }
             else
             {
@@ -1318,6 +1328,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                     | (bTemp ? CGLO_TEMPERATURE : 0)
                                     | (bPres ? CGLO_PRESSURE : 0)
                                     | (bPres ? CGLO_CONSTRAINT : 0)
+                                    | (bStopCM ? CGLO_STOPCM : 0)
                                     | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
                                     | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
                                     | CGLO_SCALEEKIN
@@ -1378,9 +1389,10 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 }
             }
             /* if it's the initial step, we performed this first step just to get the constraint virial */
-            if (bInitStep && ir->eI == eiVV)
+            if (ir->eI == eiVV && bInitStep)
             {
-                copy_rvecn(cbuf, state->v, 0, state->natoms);
+                copy_rvecn(vbuf, state->v, 0, state->natoms);
+                sfree(vbuf);
             }
             wallcycle_stop(wcycle, ewcUPDATE);
         }
@@ -1636,6 +1648,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
                 if (ir->eI == eiVVAK)
                 {
+                    /* We probably only need md->homenr, not state->natoms */
+                    if (state->natoms > cbuf_nalloc)
+                    {
+                        cbuf_nalloc = state->natoms;
+                        srenew(cbuf, cbuf_nalloc);
+                    }
                     copy_rvecn(state->x, cbuf, 0, state->natoms);
                 }
                 bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
diff --git a/patches/gromacs-5.0.4.diff/src/programs/mdrun/md.c.preplumed b/patches/gromacs-5.0.7.diff/src/programs/mdrun/md.c.preplumed
similarity index 98%
rename from patches/gromacs-5.0.4.diff/src/programs/mdrun/md.c.preplumed
rename to patches/gromacs-5.0.7.diff/src/programs/mdrun/md.c.preplumed
index 3d98d597c7d0e9c009e02d816e631b3f36846a25..0b12964ea5f7b09e4cf241c335e3256ed8378d8e 100644
--- a/patches/gromacs-5.0.4.diff/src/programs/mdrun/md.c.preplumed
+++ b/patches/gromacs-5.0.7.diff/src/programs/mdrun/md.c.preplumed
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2011,2012,2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -197,7 +197,8 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
     gmx_bool          bUpdateDoLR;
     real              dvdl_constr;
-    rvec             *cbuf = NULL;
+    rvec             *cbuf        = NULL;
+    int               cbuf_nalloc = 0;
     matrix            lastbox;
     real              veta_save, scalevir, tracevir;
     real              vetanew = 0;
@@ -247,10 +248,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
     bVV = EI_VV(ir->eI);
-    if (bVV) /* to store the initial velocities while computing virial */
-    {
-        snew(cbuf, top_global->natoms);
-    }
     /* all the iteratative cases - only if there are constraints */
     bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
     gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
@@ -353,6 +350,11 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         gmx_fatal(FARGS, "Normal Mode analysis is not supported with virtual sites.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n");
     }
 
+    if (bRerunMD && fr->cutoff_scheme == ecutsVERLET && ir->opts.ngener > 1 && fr->nbv && fr->nbv->bUseGPU)
+    {
+        gmx_fatal(FARGS, "The Verlet scheme on GPUs does not support energy groups, so your rerun should probably use a .tpr file without energy groups, or mdrun -nb auto");
+    }
+
     if (DEFORM(*ir))
     {
         tMPI_Thread_mutex_lock(&deform_init_box_mutex);
@@ -539,16 +541,21 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     nstfep = ir->fepvals->nstdhdl;
     if (ir->bExpanded)
     {
-        nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl, nstfep);
+        nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
     }
     if (repl_ex_nst > 0)
     {
         nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep);
     }
 
-    /* I'm assuming we need global communication the first time! MRS */
+    /* Be REALLY careful about what flags you set here. You CANNOT assume
+     * this is the first step, since we might be restarting from a checkpoint,
+     * and in that case we should not do any modifications to the state.
+     */
+    bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
+
     cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
-                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
+                  | (bStopCM ? CGLO_STOPCM : 0)
                   | (bVV ? CGLO_PRESSURE : 0)
                   | (bVV ? CGLO_CONSTRAINT : 0)
                   | (bRerunMD ? CGLO_RERUNMD : 0)
@@ -1089,6 +1096,8 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         if (bVV && !bStartingFromCpt && !bRerunMD)
         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
         {
+            rvec *vbuf = NULL;
+
             wallcycle_start(wcycle, ewcUPDATE);
             if (ir->eI == eiVV && bInitStep)
             {
@@ -1098,7 +1107,8 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                  * revert back to the initial coordinates
                  * so that the input is actually the initial step.
                  */
-                copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */
+                snew(vbuf, state->natoms);
+                copy_rvecn(state->v, vbuf, 0, state->natoms); /* should make this better for parallelizing? */
             }
             else
             {
@@ -1220,6 +1230,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                     | (bTemp ? CGLO_TEMPERATURE : 0)
                                     | (bPres ? CGLO_PRESSURE : 0)
                                     | (bPres ? CGLO_CONSTRAINT : 0)
+                                    | (bStopCM ? CGLO_STOPCM : 0)
                                     | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
                                     | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
                                     | CGLO_SCALEEKIN
@@ -1280,9 +1291,10 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 }
             }
             /* if it's the initial step, we performed this first step just to get the constraint virial */
-            if (bInitStep && ir->eI == eiVV)
+            if (ir->eI == eiVV && bInitStep)
             {
-                copy_rvecn(cbuf, state->v, 0, state->natoms);
+                copy_rvecn(vbuf, state->v, 0, state->natoms);
+                sfree(vbuf);
             }
             wallcycle_stop(wcycle, ewcUPDATE);
         }
@@ -1538,6 +1550,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
                 if (ir->eI == eiVVAK)
                 {
+                    /* We probably only need md->homenr, not state->natoms */
+                    if (state->natoms > cbuf_nalloc)
+                    {
+                        cbuf_nalloc = state->natoms;
+                        srenew(cbuf, cbuf_nalloc);
+                    }
                     copy_rvecn(state->x, cbuf, 0, state->natoms);
                 }
                 bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
diff --git a/patches/gromacs-5.0.4.diff/src/programs/mdrun/mdrun.cpp b/patches/gromacs-5.0.7.diff/src/programs/mdrun/mdrun.cpp
similarity index 99%
rename from patches/gromacs-5.0.4.diff/src/programs/mdrun/mdrun.cpp
rename to patches/gromacs-5.0.7.diff/src/programs/mdrun/mdrun.cpp
index e9fbf485821e175988bb7cddc43b00b0c4f80e8f..eee02d4ee571de612b8a1d6e1747c192b4af509a 100644
--- a/patches/gromacs-5.0.4.diff/src/programs/mdrun/mdrun.cpp
+++ b/patches/gromacs-5.0.7.diff/src/programs/mdrun/mdrun.cpp
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2011,2012,2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -668,14 +668,14 @@ int gmx_mdrun(int argc, char *argv[])
         gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
     }
 
-    if (nmultisim > 1)
+    if (nmultisim >= 1)
     {
 #ifndef GMX_THREAD_MPI
         gmx_bool bParFn = (multidir == NULL);
         init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
 #else
-        gmx_fatal(FARGS, "mdrun -multi is not supported with the thread library. "
-                  "Please compile GROMACS with MPI support");
+        gmx_fatal(FARGS, "mdrun -multi or -multidir are not supported with the thread-MPI library. "
+                  "Please compile GROMACS with a proper external MPI library.");
 #endif
     }
 
@@ -769,6 +769,9 @@ int gmx_mdrun(int argc, char *argv[])
     {
         gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr,
                      !bSepPot, Flags & MD_APPENDFILES, &fplog);
+        please_cite(fplog, "Abraham2015");
+        please_cite(fplog, "Pall2015");
+        please_cite(fplog, "Pronk2013");
         please_cite(fplog, "Hess2008b");
         please_cite(fplog, "Spoel2005a");
         please_cite(fplog, "Lindahl2001a");
@@ -821,12 +824,6 @@ int gmx_mdrun(int argc, char *argv[])
                   nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed,
                   pforce, cpt_period, max_hours, deviceOptions, imdport, Flags);
 
-    /* PLUMED */
-    if(plumedswitch){
-      plumed_finalize(plumedmain);
-    }
-    /* END PLUMED */
-
     /* Log file has to be closed in mdrunner if we are appending to it
        (fplog not set here) */
     if (MASTER(cr) && !bAppendFiles)
diff --git a/patches/gromacs-5.0.4.diff/src/programs/mdrun/mdrun.cpp.preplumed b/patches/gromacs-5.0.7.diff/src/programs/mdrun/mdrun.cpp.preplumed
similarity index 99%
rename from patches/gromacs-5.0.4.diff/src/programs/mdrun/mdrun.cpp.preplumed
rename to patches/gromacs-5.0.7.diff/src/programs/mdrun/mdrun.cpp.preplumed
index 6bac3f08e7e6709a81be1606711027302cc034f8..e3571f634057cc40debe673695b4da6287b95e13 100644
--- a/patches/gromacs-5.0.4.diff/src/programs/mdrun/mdrun.cpp.preplumed
+++ b/patches/gromacs-5.0.7.diff/src/programs/mdrun/mdrun.cpp.preplumed
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2011,2012,2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -661,14 +661,14 @@ int gmx_mdrun(int argc, char *argv[])
         gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
     }
 
-    if (nmultisim > 1)
+    if (nmultisim >= 1)
     {
 #ifndef GMX_THREAD_MPI
         gmx_bool bParFn = (multidir == NULL);
         init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
 #else
-        gmx_fatal(FARGS, "mdrun -multi is not supported with the thread library. "
-                  "Please compile GROMACS with MPI support");
+        gmx_fatal(FARGS, "mdrun -multi or -multidir are not supported with the thread-MPI library. "
+                  "Please compile GROMACS with a proper external MPI library.");
 #endif
     }
 
@@ -762,6 +762,9 @@ int gmx_mdrun(int argc, char *argv[])
     {
         gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr,
                      !bSepPot, Flags & MD_APPENDFILES, &fplog);
+        please_cite(fplog, "Abraham2015");
+        please_cite(fplog, "Pall2015");
+        please_cite(fplog, "Pronk2013");
         please_cite(fplog, "Hess2008b");
         please_cite(fplog, "Spoel2005a");
         please_cite(fplog, "Lindahl2001a");
diff --git a/patches/gromacs-5.0.4.diff/src/programs/mdrun/repl_ex.c b/patches/gromacs-5.0.7.diff/src/programs/mdrun/repl_ex.c
similarity index 99%
rename from patches/gromacs-5.0.4.diff/src/programs/mdrun/repl_ex.c
rename to patches/gromacs-5.0.7.diff/src/programs/mdrun/repl_ex.c
index cfb0b7f354b5277c0a61d02955d1f5c063b8dd8f..af47b1d3b4071d6a2e17b73c6137aeba12ad5447 100644
--- a/patches/gromacs-5.0.4.diff/src/programs/mdrun/repl_ex.c
+++ b/patches/gromacs-5.0.7.diff/src/programs/mdrun/repl_ex.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2011,2012,2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -415,7 +415,7 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog,
     snew(re->order, re->nrepl);
     for (i = 0; i < re->nrepl; i++)
     {
-        snew(re->cyclic[i], re->nrepl);
+        snew(re->cyclic[i], re->nrepl+1);
         snew(re->order[i], re->nrepl);
     }
     /* allocate space for the functions storing the data for the replicas */
diff --git a/patches/gromacs-5.0.4.diff/src/programs/mdrun/repl_ex.c.preplumed b/patches/gromacs-5.0.7.diff/src/programs/mdrun/repl_ex.c.preplumed
similarity index 99%
rename from patches/gromacs-5.0.4.diff/src/programs/mdrun/repl_ex.c.preplumed
rename to patches/gromacs-5.0.7.diff/src/programs/mdrun/repl_ex.c.preplumed
index 46a9bc0113cd1cd6755cca7903cc71128c2386f4..5db5d309cd12f340c8a69a72eee95bcf39af6031 100644
--- a/patches/gromacs-5.0.4.diff/src/programs/mdrun/repl_ex.c.preplumed
+++ b/patches/gromacs-5.0.7.diff/src/programs/mdrun/repl_ex.c.preplumed
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2011,2012,2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -400,7 +400,7 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog,
     snew(re->order, re->nrepl);
     for (i = 0; i < re->nrepl; i++)
     {
-        snew(re->cyclic[i], re->nrepl);
+        snew(re->cyclic[i], re->nrepl+1);
         snew(re->order[i], re->nrepl);
     }
     /* allocate space for the functions storing the data for the replicas */
diff --git a/patches/gromacs-5.0.7.diff/src/programs/mdrun/runner.c b/patches/gromacs-5.0.7.diff/src/programs/mdrun/runner.c
new file mode 100644
index 0000000000000000000000000000000000000000..f324edbde1b50a2d661aa66b4de9ba7ac2019234
--- /dev/null
+++ b/patches/gromacs-5.0.7.diff/src/programs/mdrun/runner.c
@@ -0,0 +1,1877 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <signal.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <assert.h>
+
+#include "typedefs.h"
+#include "gromacs/utility/smalloc.h"
+#include "sysstuff.h"
+#include "copyrite.h"
+#include "force.h"
+#include "mdrun.h"
+#include "md_logging.h"
+#include "md_support.h"
+#include "network.h"
+#include "names.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "deform.h"
+#include "qmmm.h"
+#include "domdec.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "txtdump.h"
+#include "gmx_detect_hardware.h"
+#include "gmx_omp_nthreads.h"
+#include "gromacs/gmxpreprocess/calc_verletbuf.h"
+#include "gmx_fatal_collective.h"
+#include "membed.h"
+#include "macros.h"
+#include "gmx_thread_affinity.h"
+#include "inputrec.h"
+
+#include "gromacs/fileio/tpxio.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/gmxomp.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+#include "gpu_utils.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain; 
+/* END PLUMED */
+
+typedef struct {
+    gmx_integrator_t *func;
+} gmx_intp_t;
+
+/* The array should match the eI array in include/types/enums.h */
+const gmx_intp_t    integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
+
+gmx_int64_t         deform_init_init_step_tpx;
+matrix              deform_init_box_tpx;
+tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+
+
+#ifdef GMX_THREAD_MPI
+/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
+ * the number of threads will get lowered.
+ */
+#define MIN_ATOMS_PER_MPI_THREAD    90
+#define MIN_ATOMS_PER_GPU           900
+
+struct mdrunner_arglist
+{
+    gmx_hw_opt_t    hw_opt;
+    FILE           *fplog;
+    t_commrec      *cr;
+    int             nfile;
+    const t_filenm *fnm;
+    output_env_t    oenv;
+    gmx_bool        bVerbose;
+    gmx_bool        bCompact;
+    int             nstglobalcomm;
+    ivec            ddxyz;
+    int             dd_node_order;
+    real            rdd;
+    real            rconstr;
+    const char     *dddlb_opt;
+    real            dlb_scale;
+    const char     *ddcsx;
+    const char     *ddcsy;
+    const char     *ddcsz;
+    const char     *nbpu_opt;
+    int             nstlist_cmdline;
+    gmx_int64_t     nsteps_cmdline;
+    int             nstepout;
+    int             resetstep;
+    int             nmultisim;
+    int             repl_ex_nst;
+    int             repl_ex_nex;
+    int             repl_ex_seed;
+    real            pforce;
+    real            cpt_period;
+    real            max_hours;
+    const char     *deviceOptions;
+    int             imdport;
+    unsigned long   Flags;
+};
+
+
+/* The function used for spawning threads. Extracts the mdrunner()
+   arguments from its one argument and calls mdrunner(), after making
+   a commrec. */
+static void mdrunner_start_fn(void *arg)
+{
+    struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
+    struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
+                                            that it's thread-local. This doesn't
+                                            copy pointed-to items, of course,
+                                            but those are all const. */
+    t_commrec *cr;                       /* we need a local version of this */
+    FILE      *fplog = NULL;
+    t_filenm  *fnm;
+
+    fnm = dup_tfn(mc.nfile, mc.fnm);
+
+    cr = reinitialize_commrec_for_this_thread(mc.cr);
+
+    if (MASTER(cr))
+    {
+        fplog = mc.fplog;
+    }
+
+    mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+             mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
+             mc.ddxyz, mc.dd_node_order, mc.rdd,
+             mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
+             mc.ddcsx, mc.ddcsy, mc.ddcsz,
+             mc.nbpu_opt, mc.nstlist_cmdline,
+             mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+             mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
+             mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.imdport, mc.Flags);
+}
+
+/* called by mdrunner() to start a specific number of threads (including
+   the main thread) for thread-parallel runs. This in turn calls mdrunner()
+   for each thread.
+   All options besides nthreads are the same as for mdrunner(). */
+static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+                                         FILE *fplog, t_commrec *cr, int nfile,
+                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+                                         gmx_bool bCompact, int nstglobalcomm,
+                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+                                         const char *dddlb_opt, real dlb_scale,
+                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
+                                         const char *nbpu_opt, int nstlist_cmdline,
+                                         gmx_int64_t nsteps_cmdline,
+                                         int nstepout, int resetstep,
+                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                                         real pforce, real cpt_period, real max_hours,
+                                         const char *deviceOptions, unsigned long Flags)
+{
+    int                      ret;
+    struct mdrunner_arglist *mda;
+    t_commrec               *crn; /* the new commrec */
+    t_filenm                *fnmn;
+
+    /* first check whether we even need to start tMPI */
+    if (hw_opt->nthreads_tmpi < 2)
+    {
+        return cr;
+    }
+
+    /* a few small, one-time, almost unavoidable memory leaks: */
+    snew(mda, 1);
+    fnmn = dup_tfn(nfile, fnm);
+
+    /* fill the data structure to pass as void pointer to thread start fn */
+    /* hw_opt contains pointers, which should all be NULL at this stage */
+    mda->hw_opt          = *hw_opt;
+    mda->fplog           = fplog;
+    mda->cr              = cr;
+    mda->nfile           = nfile;
+    mda->fnm             = fnmn;
+    mda->oenv            = oenv;
+    mda->bVerbose        = bVerbose;
+    mda->bCompact        = bCompact;
+    mda->nstglobalcomm   = nstglobalcomm;
+    mda->ddxyz[XX]       = ddxyz[XX];
+    mda->ddxyz[YY]       = ddxyz[YY];
+    mda->ddxyz[ZZ]       = ddxyz[ZZ];
+    mda->dd_node_order   = dd_node_order;
+    mda->rdd             = rdd;
+    mda->rconstr         = rconstr;
+    mda->dddlb_opt       = dddlb_opt;
+    mda->dlb_scale       = dlb_scale;
+    mda->ddcsx           = ddcsx;
+    mda->ddcsy           = ddcsy;
+    mda->ddcsz           = ddcsz;
+    mda->nbpu_opt        = nbpu_opt;
+    mda->nstlist_cmdline = nstlist_cmdline;
+    mda->nsteps_cmdline  = nsteps_cmdline;
+    mda->nstepout        = nstepout;
+    mda->resetstep       = resetstep;
+    mda->nmultisim       = nmultisim;
+    mda->repl_ex_nst     = repl_ex_nst;
+    mda->repl_ex_nex     = repl_ex_nex;
+    mda->repl_ex_seed    = repl_ex_seed;
+    mda->pforce          = pforce;
+    mda->cpt_period      = cpt_period;
+    mda->max_hours       = max_hours;
+    mda->deviceOptions   = deviceOptions;
+    mda->Flags           = Flags;
+
+    /* now spawn new threads that start mdrunner_start_fn(), while
+       the main thread returns, we set thread affinity later */
+    ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
+                       mdrunner_start_fn, (void*)(mda) );
+    if (ret != TMPI_SUCCESS)
+    {
+        return NULL;
+    }
+
+    crn = reinitialize_commrec_for_this_thread(cr);
+    return crn;
+}
+
+
+static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+                                        const gmx_hw_opt_t  *hw_opt,
+                                        int                  nthreads_tot,
+                                        int                  ngpu)
+{
+    int nthreads_tmpi;
+
+    /* There are no separate PME nodes here, as we ensured in
+     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
+     * and a conditional ensures we would not have ended up here.
+     * Note that separate PME nodes might be switched on later.
+     */
+    if (ngpu > 0)
+    {
+        nthreads_tmpi = ngpu;
+        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
+        {
+            nthreads_tmpi = nthreads_tot;
+        }
+    }
+    else if (hw_opt->nthreads_omp > 0)
+    {
+        /* Here we could oversubscribe, when we do, we issue a warning later */
+        nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp);
+    }
+    else
+    {
+        /* TODO choose nthreads_omp based on hardware topology
+           when we have a hardware topology detection library */
+        /* In general, when running up to 4 threads, OpenMP should be faster.
+         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
+         * even on two CPUs it's usually faster (but with many OpenMP threads
+         * it could be faster not to use HT, currently we always use HT).
+         * On Nehalem/Westmere we want to avoid running 16 threads over
+         * two CPUs with HT, so we need a limit<16; thus we use 12.
+         * A reasonable limit for Intel Sandy and Ivy bridge,
+         * not knowing the topology, is 16 threads.
+         * Below we check for Intel and AVX, which for now includes
+         * Sandy/Ivy Bridge, Haswell/Broadwell. By checking for AVX instead of
+         * model numbers we ensure also future Intel CPUs are covered.
+         */
+        const int nthreads_omp_always_faster             =  4;
+        const int nthreads_omp_always_faster_Nehalem     = 12;
+        const int nthreads_omp_always_faster_Intel_AVX   = 16;
+        gmx_bool  bIntelAVX;
+
+        bIntelAVX =
+            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+             gmx_cpuid_feature(hwinfo->cpuid_info, GMX_CPUID_FEATURE_X86_AVX));
+
+        if (nthreads_tot <= nthreads_omp_always_faster ||
+            ((gmx_cpuid_is_intel_nehalem(hwinfo->cpuid_info) && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+             (bIntelAVX && nthreads_tot <= nthreads_omp_always_faster_Intel_AVX)))
+        {
+            /* Use pure OpenMP parallelization */
+            nthreads_tmpi = 1;
+        }
+        else
+        {
+            /* Don't use OpenMP parallelization */
+            nthreads_tmpi = nthreads_tot;
+        }
+    }
+
+    return nthreads_tmpi;
+}
+
+
+/* Get the number of threads to use for thread-MPI based on how many
+ * were requested, which algorithms we're using,
+ * and how many particles there are.
+ * At the point we have already called check_and_update_hw_opt.
+ * Thus all options should be internally consistent and consistent
+ * with the hardware, except that ntmpi could be larger than #GPU.
+ */
+static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
+                            gmx_hw_opt_t *hw_opt,
+                            t_inputrec *inputrec, gmx_mtop_t *mtop,
+                            const t_commrec *cr,
+                            FILE *fplog)
+{
+    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
+    int      min_atoms_per_mpi_thread;
+    char    *env;
+    char     sbuf[STRLEN];
+    gmx_bool bCanUseGPU;
+
+    if (hw_opt->nthreads_tmpi > 0)
+    {
+        /* Trivial, return right away */
+        return hw_opt->nthreads_tmpi;
+    }
+
+    nthreads_hw = hwinfo->nthreads_hw_avail;
+
+    /* How many total (#tMPI*#OpenMP) threads can we start? */
+    if (hw_opt->nthreads_tot > 0)
+    {
+        nthreads_tot_max = hw_opt->nthreads_tot;
+    }
+    else
+    {
+        nthreads_tot_max = nthreads_hw;
+    }
+
+    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
+                  hwinfo->gpu_info.ncuda_dev_compatible > 0);
+    if (bCanUseGPU)
+    {
+        ngpu = hwinfo->gpu_info.ncuda_dev_compatible;
+    }
+    else
+    {
+        ngpu = 0;
+    }
+
+    if (inputrec->cutoff_scheme == ecutsGROUP)
+    {
+        /* We checked this before, but it doesn't hurt to do it once more */
+        assert(hw_opt->nthreads_omp == 1);
+    }
+
+    nthreads_tmpi =
+        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
+
+    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
+    {
+        /* Dims/steps are divided over the nodes instead of splitting the atoms */
+        min_atoms_per_mpi_thread = 0;
+    }
+    else
+    {
+        if (bCanUseGPU)
+        {
+            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
+        }
+        else
+        {
+            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
+        }
+    }
+
+    /* Check if an algorithm does not support parallel simulation.  */
+    if (nthreads_tmpi != 1 &&
+        ( inputrec->eI == eiLBFGS ||
+          inputrec->coulombtype == eelEWALD ) )
+    {
+        nthreads_tmpi = 1;
+
+        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
+        {
+            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+        }
+    }
+    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
+    {
+        /* the thread number was chosen automatically, but there are too many
+           threads (too few atoms per thread) */
+        nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread);
+
+        /* Avoid partial use of Hyper-Threading */
+        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
+        {
+            nthreads_new = nthreads_hw/2;
+        }
+
+        /* Avoid large prime numbers in the thread count */
+        if (nthreads_new >= 6)
+        {
+            /* Use only 6,8,10 with additional factors of 2 */
+            int fac;
+
+            fac = 2;
+            while (3*fac*2 <= nthreads_new)
+            {
+                fac *= 2;
+            }
+
+            nthreads_new = (nthreads_new/fac)*fac;
+        }
+        else
+        {
+            /* Avoid 5 */
+            if (nthreads_new == 5)
+            {
+                nthreads_new = 4;
+            }
+        }
+
+        nthreads_tmpi = nthreads_new;
+
+        fprintf(stderr, "\n");
+        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
+        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
+        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
+    }
+
+    return nthreads_tmpi;
+}
+#endif /* GMX_THREAD_MPI */
+
+
+/* We determine the extra cost of the non-bonded kernels compared to
+ * a reference nstlist value of 10 (which is the default in grompp).
+ */
+static const int    nbnxn_reference_nstlist = 10;
+/* The values to try when switching  */
+const int           nstlist_try[] = { 20, 25, 40 };
+#define NNSTL  sizeof(nstlist_try)/sizeof(nstlist_try[0])
+/* Increase nstlist until the non-bonded cost increases more than listfac_ok,
+ * but never more than listfac_max.
+ * A standard (protein+)water system at 300K with PME ewald_rtol=1e-5
+ * needs 1.28 at rcoulomb=0.9 and 1.24 at rcoulomb=1.0 to get to nstlist=40.
+ * Note that both CPU and GPU factors are conservative. Performance should
+ * not go down due to this tuning, except with a relatively slow GPU.
+ * On the other hand, at medium/high parallelization or with fast GPUs
+ * nstlist will not be increased enough to reach optimal performance.
+ */
+/* CPU: pair-search is about a factor 1.5 slower than the non-bonded kernel */
+static const float  nbnxn_cpu_listfac_ok    = 1.05;
+static const float  nbnxn_cpu_listfac_max   = 1.09;
+/* GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel */
+static const float  nbnxn_gpu_listfac_ok    = 1.20;
+static const float  nbnxn_gpu_listfac_max   = 1.30;
+
+/* Try to increase nstlist when using the Verlet cut-off scheme */
+static void increase_nstlist(FILE *fp, t_commrec *cr,
+                             t_inputrec *ir, int nstlist_cmdline,
+                             const gmx_mtop_t *mtop, matrix box,
+                             gmx_bool bGPU)
+{
+    float                  listfac_ok, listfac_max;
+    int                    nstlist_orig, nstlist_prev;
+    verletbuf_list_setup_t ls;
+    real                   rlist_nstlist10, rlist_inc, rlist_ok, rlist_max;
+    real                   rlist_new, rlist_prev;
+    int                    nstlist_ind = 0;
+    t_state                state_tmp;
+    gmx_bool               bBox, bDD, bCont;
+    const char            *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+    const char            *nve_err  = "Can not increase nstlist because an NVE ensemble is used";
+    const char            *vbd_err  = "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
+    const char            *box_err  = "Can not increase nstlist because the box is too small";
+    const char            *dd_err   = "Can not increase nstlist because of domain decomposition limitations";
+    char                   buf[STRLEN];
+
+    if (nstlist_cmdline <= 0)
+    {
+        if (ir->nstlist == 1)
+        {
+            /* The user probably set nstlist=1 for a reason,
+             * don't mess with the settings.
+             */
+            return;
+        }
+
+        if (fp != NULL && bGPU && ir->nstlist < nstlist_try[0])
+        {
+            fprintf(fp, nstl_gpu, ir->nstlist);
+        }
+        nstlist_ind = 0;
+        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
+        {
+            nstlist_ind++;
+        }
+        if (nstlist_ind == NNSTL)
+        {
+            /* There are no larger nstlist values to try */
+            return;
+        }
+    }
+
+    if (EI_MD(ir->eI) && ir->etc == etcNO)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", nve_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", nve_err);
+        }
+
+        return;
+    }
+
+    if (ir->verletbuf_tol == 0 && bGPU)
+    {
+        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+    }
+
+    if (ir->verletbuf_tol < 0)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", vbd_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", vbd_err);
+        }
+
+        return;
+    }
+
+    if (bGPU)
+    {
+        listfac_ok  = nbnxn_gpu_listfac_ok;
+        listfac_max = nbnxn_gpu_listfac_max;
+    }
+    else
+    {
+        listfac_ok  = nbnxn_cpu_listfac_ok;
+        listfac_max = nbnxn_cpu_listfac_max;
+    }
+
+    nstlist_orig = ir->nstlist;
+    if (nstlist_cmdline > 0)
+    {
+        if (fp)
+        {
+            sprintf(buf, "Getting nstlist=%d from command line option",
+                    nstlist_cmdline);
+        }
+        ir->nstlist = nstlist_cmdline;
+    }
+
+    verletbuf_get_list_setup(TRUE, bGPU, &ls);
+
+    /* Allow rlist to make the list a given factor larger than the list
+     * would be with nstlist=10.
+     */
+    nstlist_prev = ir->nstlist;
+    ir->nstlist  = 10;
+    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL,
+                            &rlist_nstlist10);
+    ir->nstlist  = nstlist_prev;
+
+    /* Determine the pair list size increase due to zero interactions */
+    rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
+                                              mtop->natoms/det(box));
+    rlist_ok  = (rlist_nstlist10 + rlist_inc)*pow(listfac_ok, 1.0/3.0) - rlist_inc;
+    rlist_max = (rlist_nstlist10 + rlist_inc)*pow(listfac_max, 1.0/3.0) - rlist_inc;
+    if (debug)
+    {
+        fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
+                rlist_inc, rlist_ok, rlist_max);
+    }
+
+    nstlist_prev = nstlist_orig;
+    rlist_prev   = ir->rlist;
+    do
+    {
+        if (nstlist_cmdline <= 0)
+        {
+            ir->nstlist = nstlist_try[nstlist_ind];
+        }
+
+        /* Set the pair-list buffer size in ir */
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        /* Does rlist fit in the box? */
+        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
+        bDD  = TRUE;
+        if (bBox && DOMAINDECOMP(cr))
+        {
+            /* Check if rlist fits in the domain decomposition */
+            if (inputrec2nboundeddim(ir) < DIM)
+            {
+                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+            }
+            copy_mat(box, state_tmp.box);
+            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
+        }
+
+        if (debug)
+        {
+            fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
+                    ir->nstlist, rlist_new, bBox, bDD);
+        }
+
+        bCont = FALSE;
+
+        if (nstlist_cmdline <= 0)
+        {
+            if (bBox && bDD && rlist_new <= rlist_max)
+            {
+                /* Increase nstlist */
+                nstlist_prev = ir->nstlist;
+                rlist_prev   = rlist_new;
+                bCont        = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
+            }
+            else
+            {
+                /* Stick with the previous nstlist */
+                ir->nstlist = nstlist_prev;
+                rlist_new   = rlist_prev;
+                bBox        = TRUE;
+                bDD         = TRUE;
+            }
+        }
+
+        nstlist_ind++;
+    }
+    while (bCont);
+
+    if (!bBox || !bDD)
+    {
+        gmx_warning(!bBox ? box_err : dd_err);
+        if (fp != NULL)
+        {
+            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
+        }
+        ir->nstlist = nstlist_orig;
+    }
+    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+    {
+        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
+                nstlist_orig, ir->nstlist,
+                ir->rlist, rlist_new);
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n\n", buf);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n\n", buf);
+        }
+        ir->rlist     = rlist_new;
+        ir->rlistlong = rlist_new;
+    }
+}
+
+static void prepare_verlet_scheme(FILE                           *fplog,
+                                  t_commrec                      *cr,
+                                  t_inputrec                     *ir,
+                                  int                             nstlist_cmdline,
+                                  const gmx_mtop_t               *mtop,
+                                  matrix                          box,
+                                  gmx_bool                        bUseGPU)
+{
+    /* For NVE simulations, we will retain the initial list buffer */
+    if (ir->verletbuf_tol > 0 && !(EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        /* Update the Verlet buffer size for the current run setup */
+        verletbuf_list_setup_t ls;
+        real                   rlist_new;
+
+        /* Here we assume SIMD-enabled kernels are being used. But as currently
+         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+         * and 4x2 gives a larger buffer than 4x4, this is ok.
+         */
+        verletbuf_get_list_setup(TRUE, bUseGPU, &ls);
+
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        if (rlist_new != ir->rlist)
+        {
+            if (fplog != NULL)
+            {
+                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+                        ir->rlist, rlist_new,
+                        ls.cluster_size_i, ls.cluster_size_j);
+            }
+            ir->rlist     = rlist_new;
+            ir->rlistlong = rlist_new;
+        }
+    }
+
+    if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
+    {
+        gmx_fatal(FARGS, "Can not set nstlist without %s",
+                  !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
+    }
+
+    if (EI_DYNAMICS(ir->eI))
+    {
+        /* Set or try nstlist values */
+        increase_nstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, bUseGPU);
+    }
+}
+
+static void convert_to_verlet_scheme(FILE *fplog,
+                                     t_inputrec *ir,
+                                     gmx_mtop_t *mtop, real box_vol)
+{
+    char *conv_mesg = "Converting input file with group cut-off scheme to the Verlet cut-off scheme";
+
+    md_print_warn(NULL, fplog, "%s\n", conv_mesg);
+
+    ir->cutoff_scheme = ecutsVERLET;
+    ir->verletbuf_tol = 0.005;
+
+    if (ir->rcoulomb != ir->rvdw)
+    {
+        gmx_fatal(FARGS, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
+    }
+
+    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
+    {
+        gmx_fatal(FARGS, "User non-bonded potentials are not (yet) supported with the Verlet scheme");
+    }
+    else if (ir_vdw_switched(ir) || ir_coulomb_switched(ir))
+    {
+        if (ir_vdw_switched(ir) && ir->vdw_modifier == eintmodNONE)
+        {
+            ir->vdwtype = evdwCUT;
+
+            switch (ir->vdwtype)
+            {
+                case evdwSHIFT:  ir->vdw_modifier = eintmodFORCESWITCH; break;
+                case evdwSWITCH: ir->vdw_modifier = eintmodPOTSWITCH; break;
+                default: gmx_fatal(FARGS, "The Verlet scheme does not support Van der Waals interactions of type '%s'", evdw_names[ir->vdwtype]);
+            }
+        }
+        if (ir_coulomb_switched(ir) && ir->coulomb_modifier == eintmodNONE)
+        {
+            if (EEL_FULL(ir->coulombtype))
+            {
+                /* With full electrostatic only PME can be switched */
+                ir->coulombtype      = eelPME;
+                ir->coulomb_modifier = eintmodPOTSHIFT;
+            }
+            else
+            {
+                md_print_warn(NULL, fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]);
+                ir->coulombtype      = eelRF;
+                ir->epsilon_rf       = 0.0;
+                ir->coulomb_modifier = eintmodPOTSHIFT;
+            }
+        }
+
+        /* We set the pair energy error tolerance to a small number.
+         * Note that this is only for testing. For production the user
+         * should think about this and set the mdp options.
+         */
+        ir->verletbuf_tol = 1e-4;
+    }
+
+    if (inputrec2nboundeddim(ir) != 3)
+    {
+        gmx_fatal(FARGS, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
+    }
+
+    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
+    {
+        gmx_fatal(FARGS, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
+    }
+
+    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        verletbuf_list_setup_t ls;
+
+        verletbuf_get_list_setup(TRUE, FALSE, &ls);
+        calc_verlet_buffer_size(mtop, box_vol, ir, -1, &ls, NULL, &ir->rlist);
+    }
+    else
+    {
+        real rlist_fac;
+
+        if (EI_MD(ir->eI))
+        {
+            rlist_fac       = 1 + verlet_buffer_ratio_NVE_T0;
+        }
+        else
+        {
+            rlist_fac       = 1 + verlet_buffer_ratio_nodynamics;
+        }
+        ir->verletbuf_tol   = -1;
+        ir->rlist           = rlist_fac*max(ir->rvdw, ir->rcoulomb);
+    }
+
+    gmx_mtop_remove_chargegroups(mtop);
+}
+
+static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
+{
+    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+            hw_opt->nthreads_tot,
+            hw_opt->nthreads_tmpi,
+            hw_opt->nthreads_omp,
+            hw_opt->nthreads_omp_pme,
+            hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : "");
+}
+
+/* Checks we can do when we don't (yet) know the cut-off scheme */
+static void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
+                                      gmx_bool      bIsSimMaster)
+{
+    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
+
+#ifndef GMX_THREAD_MPI
+    if (hw_opt->nthreads_tot > 0)
+    {
+        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+    }
+    if (hw_opt->nthreads_tmpi > 0)
+    {
+        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+    }
+#endif
+
+#ifndef GMX_OPENMP
+    if (hw_opt->nthreads_omp > 1)
+    {
+        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but Gromacs was compiled without OpenMP support");
+    }
+    hw_opt->nthreads_omp = 1;
+#endif
+
+    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
+    {
+        /* We have the same number of OpenMP threads for PP and PME processes,
+         * thus we can perform several consistency checks.
+         */
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_omp > 0 &&
+            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
+        }
+
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
+        }
+
+        if (hw_opt->nthreads_omp > 0 &&
+            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
+        }
+
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_omp <= 0)
+        {
+            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+        }
+    }
+
+#ifndef GMX_OPENMP
+    if (hw_opt->nthreads_omp > 1)
+    {
+        gmx_fatal(FARGS, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
+    }
+#endif
+
+    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
+    {
+        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
+    }
+
+    if (hw_opt->nthreads_tot == 1)
+    {
+        hw_opt->nthreads_tmpi = 1;
+
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
+                      hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
+        }
+        hw_opt->nthreads_omp = 1;
+    }
+
+    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+    {
+        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+    }
+
+    /* Parse GPU IDs, if provided.
+     * We check consistency with the tMPI thread count later.
+     */
+    gmx_parse_gpu_ids(&hw_opt->gpu_opt);
+
+#ifdef GMX_THREAD_MPI
+    if (hw_opt->gpu_opt.ncuda_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
+    {
+        /* Set the number of MPI threads equal to the number of GPUs */
+        hw_opt->nthreads_tmpi = hw_opt->gpu_opt.ncuda_dev_use;
+
+        if (hw_opt->nthreads_tot > 0 &&
+            hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
+        {
+            /* We have more GPUs than total threads requested.
+             * We choose to (later) generate a mismatch error,
+             * instead of launching more threads than requested.
+             */
+            hw_opt->nthreads_tmpi = hw_opt->nthreads_tot;
+        }
+    }
+#endif
+
+    if (debug)
+    {
+        print_hw_opt(debug, hw_opt);
+    }
+}
+
+/* Checks we can do when we know the cut-off scheme */
+static void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
+                                      int           cutoff_scheme)
+{
+    if (cutoff_scheme == ecutsGROUP)
+    {
+        /* We only have OpenMP support for PME only nodes */
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
+                      ecutscheme_names[cutoff_scheme],
+                      ecutscheme_names[ecutsVERLET]);
+        }
+        hw_opt->nthreads_omp = 1;
+    }
+
+    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+    {
+        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+    }
+
+    if (debug)
+    {
+        print_hw_opt(debug, hw_opt);
+    }
+}
+
+
+/* Override the value in inputrec with value passed on the command line (if any) */
+static void override_nsteps_cmdline(FILE            *fplog,
+                                    gmx_int64_t      nsteps_cmdline,
+                                    t_inputrec      *ir,
+                                    const t_commrec *cr)
+{
+    char sbuf[STEPSTRSIZE];
+
+    assert(ir);
+    assert(cr);
+
+    /* override with anything else than the default -2 */
+    if (nsteps_cmdline > -2)
+    {
+        char stmp[STRLEN];
+
+        ir->nsteps = nsteps_cmdline;
+        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
+        {
+            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
+                    gmx_step_str(nsteps_cmdline, sbuf),
+                    fabs(nsteps_cmdline*ir->delta_t));
+        }
+        else
+        {
+            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps",
+                    gmx_step_str(nsteps_cmdline, sbuf));
+        }
+
+        md_print_warn(cr, fplog, "%s\n", stmp);
+    }
+}
+
+/* Frees GPU memory and destroys the CUDA context.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ */
+static void free_gpu_resources(const t_forcerec *fr,
+                               const t_commrec  *cr)
+{
+    gmx_bool bIsPPrankUsingGPU;
+    char     gpu_err_str[STRLEN];
+
+    bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU;
+
+    if (bIsPPrankUsingGPU)
+    {
+        /* free nbnxn data in GPU memory */
+        nbnxn_cuda_free(fr->nbv->cu_nbv);
+
+        /* With tMPI we need to wait for all ranks to finish deallocation before
+         * destroying the context in free_gpu() as some ranks may be sharing
+         * GPU and context.
+         * Note: as only PP ranks need to free GPU resources, it is safe to
+         * not call the barrier on PME ranks.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif  /* GMX_THREAD_MPI */
+
+        /* uninitialize GPU (by destroying the context) */
+        if (!free_gpu(gpu_err_str))
+        {
+            gmx_warning("On rank %d failed to free GPU #%d: %s",
+                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+        }
+    }
+}
+
+int mdrunner(gmx_hw_opt_t *hw_opt,
+             FILE *fplog, t_commrec *cr, int nfile,
+             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+             gmx_bool bCompact, int nstglobalcomm,
+             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+             const char *dddlb_opt, real dlb_scale,
+             const char *ddcsx, const char *ddcsy, const char *ddcsz,
+             const char *nbpu_opt, int nstlist_cmdline,
+             gmx_int64_t nsteps_cmdline, int nstepout, int resetstep,
+             int gmx_unused nmultisim, int repl_ex_nst, int repl_ex_nex,
+             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
+             const char *deviceOptions, int imdport, unsigned long Flags)
+{
+    gmx_bool                  bForceUseGPU, bTryUseGPU, bRerunMD, bCantUseGPU;
+    double                    nodetime = 0, realtime;
+    t_inputrec               *inputrec;
+    t_state                  *state = NULL;
+    matrix                    box;
+    gmx_ddbox_t               ddbox = {0};
+    int                       npme_major, npme_minor;
+    real                      tmpr1, tmpr2;
+    t_nrnb                   *nrnb;
+    gmx_mtop_t               *mtop          = NULL;
+    t_mdatoms                *mdatoms       = NULL;
+    t_forcerec               *fr            = NULL;
+    t_fcdata                 *fcd           = NULL;
+    real                      ewaldcoeff_q  = 0;
+    real                      ewaldcoeff_lj = 0;
+    gmx_pme_t                *pmedata       = NULL;
+    gmx_vsite_t              *vsite         = NULL;
+    gmx_constr_t              constr;
+    int                       i, m, nChargePerturbed = -1, nTypePerturbed = 0, status, nalloc;
+    char                     *gro;
+    gmx_wallcycle_t           wcycle;
+    gmx_bool                  bReadEkin;
+    int                       list;
+    gmx_walltime_accounting_t walltime_accounting = NULL;
+    int                       rc;
+    gmx_int64_t               reset_counters;
+    gmx_edsam_t               ed           = NULL;
+    t_commrec                *cr_old       = cr;
+    int                       nthreads_pme = 1;
+    int                       nthreads_pp  = 1;
+    gmx_membed_t              membed       = NULL;
+    gmx_hw_info_t            *hwinfo       = NULL;
+    /* The master rank decides early on bUseGPU and broadcasts this later */
+    gmx_bool                  bUseGPU      = FALSE;
+
+    /* CAUTION: threads may be started later on in this function, so
+       cr doesn't reflect the final parallel state right now */
+    snew(inputrec, 1);
+    snew(mtop, 1);
+
+    if (Flags & MD_APPENDFILES)
+    {
+        fplog = NULL;
+    }
+
+    bRerunMD     = (Flags & MD_RERUN);
+    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
+    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+    /* Rerun execution time is dominated by I/O and pair search, so
+     * GPUs are not very useful, plus they do not support more than
+     * one energy group. Don't select them when they can't be used,
+     * unless the user requested it, then fatal_error is called later.
+     *
+     * TODO it would be nice to notify the user that if this check
+     * causes GPUs not to be used that this is what is happening, and
+     * why, but that will be easier to do after some future
+     * cleanup. */
+    bCantUseGPU = bRerunMD && (inputrec->opts.ngener > 1);
+    bTryUseGPU  = bTryUseGPU && !(bCantUseGPU && !bForceUseGPU);
+
+    /* Detect hardware, gather information. This is an operation that is
+     * global for this process (MPI rank). */
+    hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);
+
+
+    snew(state, 1);
+    if (SIMMASTER(cr))
+    {
+        /* Read (nearly) all data required for the simulation */
+        read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL, mtop);
+
+        if (inputrec->cutoff_scheme != ecutsVERLET &&
+            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
+        {
+            convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
+        }
+
+        if (inputrec->cutoff_scheme == ecutsVERLET)
+        {
+            /* Here the master rank decides if all ranks will use GPUs */
+            bUseGPU = (hwinfo->gpu_info.ncuda_dev_compatible > 0 ||
+                       getenv("GMX_EMULATE_GPU") != NULL);
+
+            /* TODO add GPU kernels for this and replace this check by:
+             * (bUseGPU && (ir->vdwtype == evdwPME &&
+             *               ir->ljpme_combination_rule == eljpmeLB))
+             * update the message text and the content of nbnxn_acceleration_supported.
+             */
+            if (bUseGPU &&
+                !nbnxn_acceleration_supported(fplog, cr, inputrec, bUseGPU))
+            {
+                /* Fallback message printed by nbnxn_acceleration_supported */
+                if (bForceUseGPU)
+                {
+                    gmx_fatal(FARGS, "GPU acceleration requested, but not supported with the given input settings");
+                }
+                bUseGPU = FALSE;
+            }
+
+            prepare_verlet_scheme(fplog, cr,
+                                  inputrec, nstlist_cmdline, mtop, state->box,
+                                  bUseGPU);
+        }
+        else
+        {
+            if (nstlist_cmdline > 0)
+            {
+                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
+            }
+
+            if (hwinfo->gpu_info.ncuda_dev_compatible > 0)
+            {
+                md_print_warn(cr, fplog,
+                              "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+                              "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
+                              "      (for quick performance testing you can use the -testverlet option)\n");
+            }
+
+            if (bForceUseGPU)
+            {
+                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
+            }
+
+#ifdef GMX_TARGET_BGQ
+            md_print_warn(cr, fplog,
+                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
+                          "      BlueGene/Q. You will observe better performance from using the\n"
+                          "      Verlet cut-off scheme.\n");
+#endif
+        }
+
+        if (inputrec->eI == eiSD2)
+        {
+            md_print_warn(cr, fplog, "The stochastic dynamics integrator %s is deprecated, since\n"
+                          "it is slower than integrator %s and is slightly less accurate\n"
+                          "with constraints. Use the %s integrator.",
+                          ei_names[inputrec->eI], ei_names[eiSD1], ei_names[eiSD1]);
+        }
+    }
+
+    /* Check for externally set OpenMP affinity and turn off internal
+     * pinning if any is found. We need to do this check early to tell
+     * thread-MPI whether it should do pinning when spawning threads.
+     * TODO: the above no longer holds, we should move these checks down
+     */
+    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
+
+    /* Check and update the hardware options for internal consistency */
+    check_and_update_hw_opt_1(hw_opt, SIMMASTER(cr));
+
+    if (SIMMASTER(cr))
+    {
+#ifdef GMX_THREAD_MPI
+        /* Early check for externally set process affinity.
+         * With thread-MPI this is needed as pinning might get turned off,
+         * which needs to be known before starting thread-MPI.
+         * With thread-MPI hw_opt is processed here on the master rank
+         * and passed to the other ranks later, so we only do this on master.
+         */
+        gmx_check_thread_affinity_set(fplog,
+                                      NULL,
+                                      hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+#endif
+
+#ifdef GMX_THREAD_MPI
+        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
+        {
+            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
+        }
+#endif
+
+        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
+            cr->npmenodes <= 0)
+        {
+            gmx_fatal(FARGS, "You need to explicitly specify the number of PME ranks (-npme) when using different number of OpenMP threads for PP and PME ranks");
+        }
+    }
+
+#ifdef GMX_THREAD_MPI
+    if (SIMMASTER(cr))
+    {
+        /* Since the master knows the cut-off scheme, update hw_opt for this.
+         * This is done later for normal MPI and also once more with tMPI
+         * for all tMPI ranks.
+         */
+        check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+        /* NOW the threads will be started: */
+        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+                                                 hw_opt,
+                                                 inputrec, mtop,
+                                                 cr, fplog);
+        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
+        {
+            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+        }
+
+        if (hw_opt->nthreads_tmpi > 1)
+        {
+            /* now start the threads. */
+            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
+                                        oenv, bVerbose, bCompact, nstglobalcomm,
+                                        ddxyz, dd_node_order, rdd, rconstr,
+                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+                                        nbpu_opt, nstlist_cmdline,
+                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
+                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+                                        cpt_period, max_hours, deviceOptions,
+                                        Flags);
+            /* the main thread continues here with a new cr. We don't deallocate
+               the old cr because other threads may still be reading it. */
+            if (cr == NULL)
+            {
+                gmx_comm("Failed to spawn threads");
+            }
+        }
+    }
+#endif
+    /* END OF CAUTION: cr is now reliable */
+
+    /* g_membed initialisation *
+     * Because we change the mtop, init_membed is called before the init_parallel *
+     * (in case we ever want to make it run in parallel) */
+    if (opt2bSet("-membed", nfile, fnm))
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Initializing membed");
+        }
+        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
+    }
+
+    if (PAR(cr))
+    {
+        /* now broadcast everything to the non-master nodes/threads: */
+        init_parallel(cr, inputrec, mtop);
+
+        /* The master rank decided on the use of GPUs,
+         * broadcast this information to all ranks.
+         */
+        gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
+    }
+
+    if (fplog != NULL)
+    {
+        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+    }
+
+    /* now make sure the state is initialized and propagated */
+    set_state_entries(state, inputrec);
+
+    /* A parallel command line option consistency check that we can
+       only do after any threads have started. */
+    if (!PAR(cr) &&
+        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
+    {
+        gmx_fatal(FARGS,
+                  "The -dd or -npme option request a parallel simulation, "
+#ifndef GMX_MPI
+                  "but %s was compiled without threads or MPI enabled"
+#else
+#ifdef GMX_THREAD_MPI
+                  "but the number of threads (option -nt) is 1"
+#else
+                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec"
+#endif
+#endif
+                  , ShortProgram()
+                  );
+    }
+
+    if (bRerunMD &&
+        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+    {
+        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+    }
+
+    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
+    }
+
+    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
+    {
+        if (cr->npmenodes > 0)
+        {
+            gmx_fatal_collective(FARGS, cr, NULL,
+                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
+        }
+
+        cr->npmenodes = 0;
+    }
+
+    if (bUseGPU && cr->npmenodes < 0)
+    {
+        /* With GPUs we don't automatically use PME-only ranks. PME ranks can
+         * improve performance with many threads per GPU, since our OpenMP
+         * scaling is bad, but it's difficult to automate the setup.
+         */
+        cr->npmenodes = 0;
+    }
+
+#ifdef GMX_FAHCORE
+    if (MASTER(cr))
+    {
+        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+    }
+#endif
+
+    /* NMR restraints must be initialized before load_checkpoint,
+     * since with time averaging the history is added to t_state.
+     * For a proper consistency check we therefore need to extend
+     * t_state here.
+     * So the PME-only nodes (if present) will also initialize
+     * the distance restraints.
+     */
+    snew(fcd, 1);
+
+    /* This needs to be called before read_checkpoint to extend the state */
+    init_disres(fplog, mtop, inputrec, cr, fcd, state, repl_ex_nst > 0);
+
+    init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires),
+                state);
+
+    if (DEFORM(*inputrec))
+    {
+        /* Store the deform reference box before reading the checkpoint */
+        if (SIMMASTER(cr))
+        {
+            copy_mat(state->box, box);
+        }
+        if (PAR(cr))
+        {
+            gmx_bcast(sizeof(box), box, cr);
+        }
+        /* Because we do not have the update struct available yet
+         * in which the reference values should be stored,
+         * we store them temporarily in static variables.
+         * This should be thread safe, since they are only written once
+         * and with identical values.
+         */
+        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+        deform_init_init_step_tpx = inputrec->init_step;
+        copy_mat(box, deform_init_box_tpx);
+        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+    }
+
+    if (opt2bSet("-cpi", nfile, fnm))
+    {
+        /* Check if checkpoint file exists before doing continuation.
+         * This way we can use identical input options for the first and subsequent runs...
+         */
+        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
+        {
+            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+                            cr, ddxyz,
+                            inputrec, state, &bReadEkin,
+                            (Flags & MD_APPENDFILES),
+                            (Flags & MD_APPENDFILESSET));
+
+            if (bReadEkin)
+            {
+                Flags |= MD_READ_EKIN;
+            }
+        }
+    }
+
+    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
+#ifdef GMX_THREAD_MPI
+        /* With thread MPI only the master node/thread exists in mdrun.c,
+         * therefore non-master nodes need to open the "seppot" log file here.
+         */
+        || (!MASTER(cr) && (Flags & MD_SEPPOT))
+#endif
+        )
+    {
+        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT),
+                     Flags, &fplog);
+    }
+
+    /* override nsteps with value from cmdline */
+    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+
+    if (SIMMASTER(cr))
+    {
+        copy_mat(state->box, box);
+    }
+
+    if (PAR(cr))
+    {
+        gmx_bcast(sizeof(box), box, cr);
+    }
+
+    /* Essential dynamics */
+    if (opt2bSet("-ei", nfile, fnm))
+    {
+        /* Open input and output files, allocate space for ED data structure */
+        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
+    }
+
+    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
+                     inputrec->eI == eiNM))
+    {
+        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
+                                           dddlb_opt, dlb_scale,
+                                           ddcsx, ddcsy, ddcsz,
+                                           mtop, inputrec,
+                                           box, state->x,
+                                           &ddbox, &npme_major, &npme_minor);
+
+        make_dd_communicators(fplog, cr, dd_node_order);
+
+        /* Set overallocation to avoid frequent reallocation of arrays */
+        set_over_alloc_dd(TRUE);
+    }
+    else
+    {
+        /* PME, if used, is done on all nodes with 1D decomposition */
+        cr->npmenodes = 0;
+        cr->duty      = (DUTY_PP | DUTY_PME);
+        npme_major    = 1;
+        npme_minor    = 1;
+
+        if (inputrec->ePBC == epbcSCREW)
+        {
+            gmx_fatal(FARGS,
+                      "pbc=%s is only implemented with domain decomposition",
+                      epbc_names[inputrec->ePBC]);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        /* After possible communicator splitting in make_dd_communicators,
+         * we can set up the intra/inter node communication.
+         */
+        gmx_setup_nodecomm(fplog, cr);
+    }
+
+    /* Initialize per-physical-node MPI process/thread ID and counters. */
+    gmx_init_intranode_counters(cr);
+#ifdef GMX_MPI
+    if (MULTISIM(cr))
+    {
+        md_print_info(cr, fplog,
+                      "This is simulation %d out of %d running as a composite Gromacs\n"
+                      "multi-simulation job. Setup for this simulation:\n\n",
+                      cr->ms->sim, cr->ms->nsim);
+    }
+    md_print_info(cr, fplog, "Using %d MPI %s\n",
+                  cr->nnodes,
+#ifdef GMX_THREAD_MPI
+                  cr->nnodes == 1 ? "thread" : "threads"
+#else
+                  cr->nnodes == 1 ? "process" : "processes"
+#endif
+                  );
+    fflush(stderr);
+#endif
+
+    /* Check and update hw_opt for the cut-off scheme */
+    check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+    gmx_omp_nthreads_init(fplog, cr,
+                          hwinfo->nthreads_hw_avail,
+                          hw_opt->nthreads_omp,
+                          hw_opt->nthreads_omp_pme,
+                          (cr->duty & DUTY_PP) == 0,
+                          inputrec->cutoff_scheme == ecutsVERLET);
+
+    if (bUseGPU)
+    {
+        /* Select GPU id's to use */
+        gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
+                           &hw_opt->gpu_opt);
+    }
+    else
+    {
+        /* Ignore (potentially) manually selected GPUs */
+        hw_opt->gpu_opt.ncuda_dev_use = 0;
+    }
+
+    /* check consistency across ranks of things like SIMD
+     * support and number of GPUs selected */
+    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
+
+    if (DOMAINDECOMP(cr))
+    {
+        /* When we share GPUs over ranks, we need to know this for the DLB */
+        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+    }
+
+    /* getting number of PP/PME threads
+       PME: env variable should be read only on one node to make sure it is
+       identical everywhere;
+     */
+    /* TODO nthreads_pp is only used for pinning threads.
+     * This is a temporary solution until we have a hw topology library.
+     */
+    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
+    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
+
+    if (PAR(cr))
+    {
+        /* Master synchronizes its value of reset_counters with all nodes
+         * including PME only nodes */
+        reset_counters = wcycle_get_reset_counters(wcycle);
+        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+        wcycle_set_reset_counters(wcycle, reset_counters);
+    }
+
+    snew(nrnb, 1);
+    if (cr->duty & DUTY_PP)
+    {
+        bcast_state(cr, state);
+
+        /* Initiate forcerecord */
+        fr          = mk_forcerec();
+        fr->hwinfo  = hwinfo;
+        fr->gpu_opt = &hw_opt->gpu_opt;
+        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
+                      opt2fn("-table", nfile, fnm),
+                      opt2fn("-tabletf", nfile, fnm),
+                      opt2fn("-tablep", nfile, fnm),
+                      opt2fn("-tableb", nfile, fnm),
+                      nbpu_opt,
+                      FALSE,
+                      pforce);
+
+        /* version for PCA_NOT_READ_NODE (see md.c) */
+        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
+           "nofile","nofile","nofile","nofile",FALSE,pforce);
+         */
+        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
+
+        /* Initialize QM-MM */
+        if (fr->bQMMM)
+        {
+            init_QMMMrec(cr, mtop, inputrec, fr);
+        }
+
+        /* Initialize the mdatoms structure.
+         * mdatoms is not filled with atom data,
+         * as this can not be done now with domain decomposition.
+         */
+        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
+
+        /* Initialize the virtual site communication */
+        vsite = init_vsite(mtop, cr, FALSE);
+
+        calc_shifts(box, fr->shift_vec);
+
+        /* With periodic molecules the charge groups should be whole at start up
+         * and the virtual sites should not be far from their proper positions.
+         */
+        if (!inputrec->bContinuation && MASTER(cr) &&
+            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        {
+            /* Make molecules whole at start of run */
+            if (fr->ePBC != epbcNONE)
+            {
+                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
+            }
+            if (vsite)
+            {
+                /* Correct initial vsite positions are required
+                 * for the initial distribution in the domain decomposition
+                 * and for the initial shell prediction.
+                 */
+                construct_vsites_mtop(vsite, mtop, state->x);
+            }
+        }
+
+        if (EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))
+        {
+            ewaldcoeff_q  = fr->ewaldcoeff_q;
+            ewaldcoeff_lj = fr->ewaldcoeff_lj;
+            pmedata       = &fr->pmedata;
+        }
+        else
+        {
+            pmedata = NULL;
+        }
+    }
+    else
+    {
+        /* This is a PME only node */
+
+        /* We don't need the state */
+        done_state(state);
+
+        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
+        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
+        snew(pmedata, 1);
+    }
+
+    if (hw_opt->thread_affinity != threadaffOFF)
+    {
+        /* Before setting affinity, check whether the affinity has changed
+         * - which indicates that probably the OpenMP library has changed it
+         * since we first checked.
+         */
+        gmx_check_thread_affinity_set(fplog, cr,
+                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+        /* Set the CPU affinity */
+        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
+    }
+
+    /* Initiate PME if necessary,
+     * either on all nodes or on dedicated PME nodes only. */
+    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
+    {
+        if (mdatoms)
+        {
+            nChargePerturbed = mdatoms->nChargePerturbed;
+            if (EVDW_PME(inputrec->vdwtype))
+            {
+                nTypePerturbed   = mdatoms->nTypePerturbed;
+            }
+        }
+        if (cr->npmenodes > 0)
+        {
+            /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
+            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+        }
+
+        if (cr->duty & DUTY_PME)
+        {
+            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
+                                  mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
+                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
+            if (status != 0)
+            {
+                gmx_fatal(FARGS, "Error %d initializing PME", status);
+            }
+        }
+    }
+
+
+    if (integrator[inputrec->eI].func == do_md)
+    {
+        /* Turn on signal handling on all nodes */
+        /*
+         * A user signal from the PME nodes (if any)
+         * is communicated to the PP nodes.
+         */
+        signal_handler_install();
+    }
+
+    if (cr->duty & DUTY_PP)
+    {
+        /* Assumes uniform use of the number of OpenMP threads */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+        if (inputrec->ePull != epullNO)
+        {
+            /* Initialize pull code */
+            init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda,
+                      EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
+        }
+
+        if (inputrec->bRot)
+        {
+            /* Initialize enforced rotation code */
+            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
+                     bVerbose, Flags);
+        }
+
+        if (inputrec->eSwapCoords != eswapNO)
+        {
+            /* Initialize ion swapping code */
+            init_swapcoords(fplog, bVerbose, inputrec, opt2fn_master("-swap", nfile, fnm, cr),
+                            mtop, state->x, state->box, &state->swapstate, cr, oenv, Flags);
+        }
+
+        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
+
+        if (DOMAINDECOMP(cr))
+        {
+            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
+                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);
+
+            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);
+
+            setup_dd_grid(fplog, cr->dd);
+        }
+
+        /* Now do whatever the user wants us to do (how flexible...) */
+        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
+                                      oenv, bVerbose, bCompact,
+                                      nstglobalcomm,
+                                      vsite, constr,
+                                      nstepout, inputrec, mtop,
+                                      fcd, state,
+                                      mdatoms, nrnb, wcycle, ed, fr,
+                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                                      membed,
+                                      cpt_period, max_hours,
+                                      deviceOptions,
+                                      imdport,
+                                      Flags,
+                                      walltime_accounting);
+
+        if (inputrec->ePull != epullNO)
+        {
+            finish_pull(inputrec->pull);
+        }
+
+        if (inputrec->bRot)
+        {
+            finish_rot(inputrec->rot);
+        }
+
+    }
+    else
+    {
+        /* do PME only */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff_q, ewaldcoeff_lj, inputrec);
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+
+    /* Finish up, write some stuff
+     * if rerunMD, don't write last frame again
+     */
+    finish_run(fplog, cr,
+               inputrec, nrnb, wcycle, walltime_accounting,
+               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
+               nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
+               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+
+
+    /* Free GPU memory and context */
+    free_gpu_resources(fr, cr);
+
+    if (opt2bSet("-membed", nfile, fnm))
+    {
+        sfree(membed);
+    }
+
+    gmx_hardware_info_free(hwinfo);
+
+    /* Does what it says */
+    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
+    walltime_accounting_destroy(walltime_accounting);
+
+    /* PLUMED */
+    if(plumedswitch){
+      plumed_finalize(plumedmain);
+    }
+    /* END PLUMED */
+
+    /* Close logfile already here if we were appending to it */
+    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+    {
+        gmx_log_close(fplog);
+    }
+
+    rc = (int)gmx_get_stop_condition();
+
+#ifdef GMX_THREAD_MPI
+    /* we need to join all threads. The sub-threads join when they
+       exit this function, but the master thread needs to be told to
+       wait for that. */
+    if (PAR(cr) && MASTER(cr))
+    {
+        tMPI_Finalize();
+    }
+#endif
+
+    return rc;
+}
diff --git a/patches/gromacs-5.0.7.diff/src/programs/mdrun/runner.c.preplumed b/patches/gromacs-5.0.7.diff/src/programs/mdrun/runner.c.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..b951255d7c875af3938ab8740abf8035d537cf75
--- /dev/null
+++ b/patches/gromacs-5.0.7.diff/src/programs/mdrun/runner.c.preplumed
@@ -0,0 +1,1865 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <signal.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <assert.h>
+
+#include "typedefs.h"
+#include "gromacs/utility/smalloc.h"
+#include "sysstuff.h"
+#include "copyrite.h"
+#include "force.h"
+#include "mdrun.h"
+#include "md_logging.h"
+#include "md_support.h"
+#include "network.h"
+#include "names.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "deform.h"
+#include "qmmm.h"
+#include "domdec.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "txtdump.h"
+#include "gmx_detect_hardware.h"
+#include "gmx_omp_nthreads.h"
+#include "gromacs/gmxpreprocess/calc_verletbuf.h"
+#include "gmx_fatal_collective.h"
+#include "membed.h"
+#include "macros.h"
+#include "gmx_thread_affinity.h"
+#include "inputrec.h"
+
+#include "gromacs/fileio/tpxio.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/gmxomp.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+#include "gpu_utils.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+typedef struct {
+    gmx_integrator_t *func;
+} gmx_intp_t;
+
+/* The array should match the eI array in include/types/enums.h */
+const gmx_intp_t    integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
+
+gmx_int64_t         deform_init_init_step_tpx;
+matrix              deform_init_box_tpx;
+tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+
+
+#ifdef GMX_THREAD_MPI
+/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
+ * the number of threads will get lowered.
+ */
+#define MIN_ATOMS_PER_MPI_THREAD    90
+#define MIN_ATOMS_PER_GPU           900
+
+struct mdrunner_arglist
+{
+    gmx_hw_opt_t    hw_opt;
+    FILE           *fplog;
+    t_commrec      *cr;
+    int             nfile;
+    const t_filenm *fnm;
+    output_env_t    oenv;
+    gmx_bool        bVerbose;
+    gmx_bool        bCompact;
+    int             nstglobalcomm;
+    ivec            ddxyz;
+    int             dd_node_order;
+    real            rdd;
+    real            rconstr;
+    const char     *dddlb_opt;
+    real            dlb_scale;
+    const char     *ddcsx;
+    const char     *ddcsy;
+    const char     *ddcsz;
+    const char     *nbpu_opt;
+    int             nstlist_cmdline;
+    gmx_int64_t     nsteps_cmdline;
+    int             nstepout;
+    int             resetstep;
+    int             nmultisim;
+    int             repl_ex_nst;
+    int             repl_ex_nex;
+    int             repl_ex_seed;
+    real            pforce;
+    real            cpt_period;
+    real            max_hours;
+    const char     *deviceOptions;
+    int             imdport;
+    unsigned long   Flags;
+};
+
+
+/* The function used for spawning threads. Extracts the mdrunner()
+   arguments from its one argument and calls mdrunner(), after making
+   a commrec. */
+static void mdrunner_start_fn(void *arg)
+{
+    struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
+    struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
+                                            that it's thread-local. This doesn't
+                                            copy pointed-to items, of course,
+                                            but those are all const. */
+    t_commrec *cr;                       /* we need a local version of this */
+    FILE      *fplog = NULL;
+    t_filenm  *fnm;
+
+    fnm = dup_tfn(mc.nfile, mc.fnm);
+
+    cr = reinitialize_commrec_for_this_thread(mc.cr);
+
+    if (MASTER(cr))
+    {
+        fplog = mc.fplog;
+    }
+
+    mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+             mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
+             mc.ddxyz, mc.dd_node_order, mc.rdd,
+             mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
+             mc.ddcsx, mc.ddcsy, mc.ddcsz,
+             mc.nbpu_opt, mc.nstlist_cmdline,
+             mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+             mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
+             mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.imdport, mc.Flags);
+}
+
+/* called by mdrunner() to start a specific number of threads (including
+   the main thread) for thread-parallel runs. This in turn calls mdrunner()
+   for each thread.
+   All options besides nthreads are the same as for mdrunner(). */
+static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+                                         FILE *fplog, t_commrec *cr, int nfile,
+                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+                                         gmx_bool bCompact, int nstglobalcomm,
+                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+                                         const char *dddlb_opt, real dlb_scale,
+                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
+                                         const char *nbpu_opt, int nstlist_cmdline,
+                                         gmx_int64_t nsteps_cmdline,
+                                         int nstepout, int resetstep,
+                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                                         real pforce, real cpt_period, real max_hours,
+                                         const char *deviceOptions, unsigned long Flags)
+{
+    int                      ret;
+    struct mdrunner_arglist *mda;
+    t_commrec               *crn; /* the new commrec */
+    t_filenm                *fnmn;
+
+    /* first check whether we even need to start tMPI */
+    if (hw_opt->nthreads_tmpi < 2)
+    {
+        return cr;
+    }
+
+    /* a few small, one-time, almost unavoidable memory leaks: */
+    snew(mda, 1);
+    fnmn = dup_tfn(nfile, fnm);
+
+    /* fill the data structure to pass as void pointer to thread start fn */
+    /* hw_opt contains pointers, which should all be NULL at this stage */
+    mda->hw_opt          = *hw_opt;
+    mda->fplog           = fplog;
+    mda->cr              = cr;
+    mda->nfile           = nfile;
+    mda->fnm             = fnmn;
+    mda->oenv            = oenv;
+    mda->bVerbose        = bVerbose;
+    mda->bCompact        = bCompact;
+    mda->nstglobalcomm   = nstglobalcomm;
+    mda->ddxyz[XX]       = ddxyz[XX];
+    mda->ddxyz[YY]       = ddxyz[YY];
+    mda->ddxyz[ZZ]       = ddxyz[ZZ];
+    mda->dd_node_order   = dd_node_order;
+    mda->rdd             = rdd;
+    mda->rconstr         = rconstr;
+    mda->dddlb_opt       = dddlb_opt;
+    mda->dlb_scale       = dlb_scale;
+    mda->ddcsx           = ddcsx;
+    mda->ddcsy           = ddcsy;
+    mda->ddcsz           = ddcsz;
+    mda->nbpu_opt        = nbpu_opt;
+    mda->nstlist_cmdline = nstlist_cmdline;
+    mda->nsteps_cmdline  = nsteps_cmdline;
+    mda->nstepout        = nstepout;
+    mda->resetstep       = resetstep;
+    mda->nmultisim       = nmultisim;
+    mda->repl_ex_nst     = repl_ex_nst;
+    mda->repl_ex_nex     = repl_ex_nex;
+    mda->repl_ex_seed    = repl_ex_seed;
+    mda->pforce          = pforce;
+    mda->cpt_period      = cpt_period;
+    mda->max_hours       = max_hours;
+    mda->deviceOptions   = deviceOptions;
+    mda->Flags           = Flags;
+
+    /* now spawn new threads that start mdrunner_start_fn(), while
+       the main thread returns; we set thread affinity later */
+    ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
+                       mdrunner_start_fn, (void*)(mda) );
+    if (ret != TMPI_SUCCESS)
+    {
+        return NULL;
+    }
+
+    crn = reinitialize_commrec_for_this_thread(cr);
+    return crn;
+}
+
+
+static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+                                        const gmx_hw_opt_t  *hw_opt,
+                                        int                  nthreads_tot,
+                                        int                  ngpu)
+{
+    int nthreads_tmpi;
+
+    /* There are no separate PME nodes here, as we ensured in
+     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
+     * and a conditional ensures we would not have ended up here.
+     * Note that separate PME nodes might be switched on later.
+     */
+    if (ngpu > 0)
+    {
+        nthreads_tmpi = ngpu;
+        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
+        {
+            nthreads_tmpi = nthreads_tot;
+        }
+    }
+    else if (hw_opt->nthreads_omp > 0)
+    {
+        /* Here we could oversubscribe; when we do, we issue a warning later */
+        nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp);
+    }
+    else
+    {
+        /* TODO choose nthreads_omp based on hardware topology
+           when we have a hardware topology detection library */
+        /* In general, when running up to 4 threads, OpenMP should be faster.
+         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
+         * even on two CPUs it's usually faster (but with many OpenMP threads
+         * it could be faster not to use HT, currently we always use HT).
+         * On Nehalem/Westmere we want to avoid running 16 threads over
+         * two CPUs with HT, so we need a limit<16; thus we use 12.
+         * A reasonable limit for Intel Sandy and Ivy bridge,
+         * not knowing the topology, is 16 threads.
+         * Below we check for Intel and AVX, which for now includes
+         * Sandy/Ivy Bridge, Haswell/Broadwell. By checking for AVX instead of
+         * model numbers we ensure also future Intel CPUs are covered.
+         */
+        const int nthreads_omp_always_faster             =  4;
+        const int nthreads_omp_always_faster_Nehalem     = 12;
+        const int nthreads_omp_always_faster_Intel_AVX   = 16;
+        gmx_bool  bIntelAVX;
+
+        bIntelAVX =
+            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+             gmx_cpuid_feature(hwinfo->cpuid_info, GMX_CPUID_FEATURE_X86_AVX));
+
+        if (nthreads_tot <= nthreads_omp_always_faster ||
+            ((gmx_cpuid_is_intel_nehalem(hwinfo->cpuid_info) && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+             (bIntelAVX && nthreads_tot <= nthreads_omp_always_faster_Intel_AVX)))
+        {
+            /* Use pure OpenMP parallelization */
+            nthreads_tmpi = 1;
+        }
+        else
+        {
+            /* Don't use OpenMP parallelization */
+            nthreads_tmpi = nthreads_tot;
+        }
+    }
+
+    return nthreads_tmpi;
+}
+
+
+/* Get the number of threads to use for thread-MPI based on how many
+ * were requested, which algorithms we're using,
+ * and how many particles there are.
+ * At this point we have already called check_and_update_hw_opt.
+ * Thus all options should be internally consistent and consistent
+ * with the hardware, except that ntmpi could be larger than #GPU.
+ */
+static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
+                            gmx_hw_opt_t *hw_opt,
+                            t_inputrec *inputrec, gmx_mtop_t *mtop,
+                            const t_commrec *cr,
+                            FILE *fplog)
+{
+    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
+    int      min_atoms_per_mpi_thread;
+    char    *env;
+    char     sbuf[STRLEN];
+    gmx_bool bCanUseGPU;
+
+    if (hw_opt->nthreads_tmpi > 0)
+    {
+        /* Trivial, return right away */
+        return hw_opt->nthreads_tmpi;
+    }
+
+    nthreads_hw = hwinfo->nthreads_hw_avail;
+
+    /* How many total (#tMPI*#OpenMP) threads can we start? */
+    if (hw_opt->nthreads_tot > 0)
+    {
+        nthreads_tot_max = hw_opt->nthreads_tot;
+    }
+    else
+    {
+        nthreads_tot_max = nthreads_hw;
+    }
+
+    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
+                  hwinfo->gpu_info.ncuda_dev_compatible > 0);
+    if (bCanUseGPU)
+    {
+        ngpu = hwinfo->gpu_info.ncuda_dev_compatible;
+    }
+    else
+    {
+        ngpu = 0;
+    }
+
+    if (inputrec->cutoff_scheme == ecutsGROUP)
+    {
+        /* We checked this before, but it doesn't hurt to do it once more */
+        assert(hw_opt->nthreads_omp == 1);
+    }
+
+    nthreads_tmpi =
+        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
+
+    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
+    {
+        /* Dims/steps are divided over the nodes instead of splitting the atoms */
+        min_atoms_per_mpi_thread = 0;
+    }
+    else
+    {
+        if (bCanUseGPU)
+        {
+            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
+        }
+        else
+        {
+            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
+        }
+    }
+
+    /* Check if an algorithm does not support parallel simulation.  */
+    if (nthreads_tmpi != 1 &&
+        ( inputrec->eI == eiLBFGS ||
+          inputrec->coulombtype == eelEWALD ) )
+    {
+        nthreads_tmpi = 1;
+
+        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
+        {
+            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+        }
+    }
+    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
+    {
+        /* the thread number was chosen automatically, but there are too many
+           threads (too few atoms per thread) */
+        nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread);
+
+        /* Avoid partial use of Hyper-Threading */
+        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
+        {
+            nthreads_new = nthreads_hw/2;
+        }
+
+        /* Avoid large prime numbers in the thread count */
+        if (nthreads_new >= 6)
+        {
+            /* Use only 6,8,10 with additional factors of 2 */
+            int fac;
+
+            fac = 2;
+            while (3*fac*2 <= nthreads_new)
+            {
+                fac *= 2;
+            }
+
+            nthreads_new = (nthreads_new/fac)*fac;
+        }
+        else
+        {
+            /* Avoid 5 */
+            if (nthreads_new == 5)
+            {
+                nthreads_new = 4;
+            }
+        }
+
+        nthreads_tmpi = nthreads_new;
+
+        fprintf(stderr, "\n");
+        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
+        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
+        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
+    }
+
+    return nthreads_tmpi;
+}
+#endif /* GMX_THREAD_MPI */
+
+
+/* We determine the extra cost of the non-bonded kernels compared to
+ * a reference nstlist value of 10 (which is the default in grompp).
+ */
+static const int    nbnxn_reference_nstlist = 10;
+/* The values to try when switching  */
+const int           nstlist_try[] = { 20, 25, 40 };
+#define NNSTL  sizeof(nstlist_try)/sizeof(nstlist_try[0])
+/* Increase nstlist until the non-bonded cost increases more than listfac_ok,
+ * but never more than listfac_max.
+ * A standard (protein+)water system at 300K with PME ewald_rtol=1e-5
+ * needs 1.28 at rcoulomb=0.9 and 1.24 at rcoulomb=1.0 to get to nstlist=40.
+ * Note that both CPU and GPU factors are conservative. Performance should
+ * not go down due to this tuning, except with a relatively slow GPU.
+ * On the other hand, at medium/high parallelization or with fast GPUs
+ * nstlist will not be increased enough to reach optimal performance.
+ */
+/* CPU: pair-search is about a factor 1.5 slower than the non-bonded kernel */
+static const float  nbnxn_cpu_listfac_ok    = 1.05;
+static const float  nbnxn_cpu_listfac_max   = 1.09;
+/* GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel */
+static const float  nbnxn_gpu_listfac_ok    = 1.20;
+static const float  nbnxn_gpu_listfac_max   = 1.30;
+
+/* Try to increase nstlist when using the Verlet cut-off scheme */
+static void increase_nstlist(FILE *fp, t_commrec *cr,
+                             t_inputrec *ir, int nstlist_cmdline,
+                             const gmx_mtop_t *mtop, matrix box,
+                             gmx_bool bGPU)
+{
+    float                  listfac_ok, listfac_max;
+    int                    nstlist_orig, nstlist_prev;
+    verletbuf_list_setup_t ls;
+    real                   rlist_nstlist10, rlist_inc, rlist_ok, rlist_max;
+    real                   rlist_new, rlist_prev;
+    int                    nstlist_ind = 0;
+    t_state                state_tmp;
+    gmx_bool               bBox, bDD, bCont;
+    const char            *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+    const char            *nve_err  = "Can not increase nstlist because an NVE ensemble is used";
+    const char            *vbd_err  = "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
+    const char            *box_err  = "Can not increase nstlist because the box is too small";
+    const char            *dd_err   = "Can not increase nstlist because of domain decomposition limitations";
+    char                   buf[STRLEN];
+
+    if (nstlist_cmdline <= 0)
+    {
+        if (ir->nstlist == 1)
+        {
+            /* The user probably set nstlist=1 for a reason,
+             * don't mess with the settings.
+             */
+            return;
+        }
+
+        if (fp != NULL && bGPU && ir->nstlist < nstlist_try[0])
+        {
+            fprintf(fp, nstl_gpu, ir->nstlist);
+        }
+        nstlist_ind = 0;
+        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
+        {
+            nstlist_ind++;
+        }
+        if (nstlist_ind == NNSTL)
+        {
+            /* There is no larger nstlist value to try */
+            return;
+        }
+    }
+
+    if (EI_MD(ir->eI) && ir->etc == etcNO)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", nve_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", nve_err);
+        }
+
+        return;
+    }
+
+    if (ir->verletbuf_tol == 0 && bGPU)
+    {
+        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+    }
+
+    if (ir->verletbuf_tol < 0)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", vbd_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", vbd_err);
+        }
+
+        return;
+    }
+
+    if (bGPU)
+    {
+        listfac_ok  = nbnxn_gpu_listfac_ok;
+        listfac_max = nbnxn_gpu_listfac_max;
+    }
+    else
+    {
+        listfac_ok  = nbnxn_cpu_listfac_ok;
+        listfac_max = nbnxn_cpu_listfac_max;
+    }
+
+    nstlist_orig = ir->nstlist;
+    if (nstlist_cmdline > 0)
+    {
+        if (fp)
+        {
+            sprintf(buf, "Getting nstlist=%d from command line option",
+                    nstlist_cmdline);
+        }
+        ir->nstlist = nstlist_cmdline;
+    }
+
+    verletbuf_get_list_setup(TRUE, bGPU, &ls);
+
+    /* Allow rlist to make the list a given factor larger than the list
+     * would be with nstlist=10.
+     */
+    nstlist_prev = ir->nstlist;
+    ir->nstlist  = 10;
+    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL,
+                            &rlist_nstlist10);
+    ir->nstlist  = nstlist_prev;
+
+    /* Determine the pair list size increase due to zero interactions */
+    rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
+                                              mtop->natoms/det(box));
+    rlist_ok  = (rlist_nstlist10 + rlist_inc)*pow(listfac_ok, 1.0/3.0) - rlist_inc;
+    rlist_max = (rlist_nstlist10 + rlist_inc)*pow(listfac_max, 1.0/3.0) - rlist_inc;
+    if (debug)
+    {
+        fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
+                rlist_inc, rlist_ok, rlist_max);
+    }
+
+    nstlist_prev = nstlist_orig;
+    rlist_prev   = ir->rlist;
+    do
+    {
+        if (nstlist_cmdline <= 0)
+        {
+            ir->nstlist = nstlist_try[nstlist_ind];
+        }
+
+        /* Set the pair-list buffer size in ir */
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        /* Does rlist fit in the box? */
+        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
+        bDD  = TRUE;
+        if (bBox && DOMAINDECOMP(cr))
+        {
+            /* Check if rlist fits in the domain decomposition */
+            if (inputrec2nboundeddim(ir) < DIM)
+            {
+                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+            }
+            copy_mat(box, state_tmp.box);
+            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
+        }
+
+        if (debug)
+        {
+            fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
+                    ir->nstlist, rlist_new, bBox, bDD);
+        }
+
+        bCont = FALSE;
+
+        if (nstlist_cmdline <= 0)
+        {
+            if (bBox && bDD && rlist_new <= rlist_max)
+            {
+                /* Increase nstlist */
+                nstlist_prev = ir->nstlist;
+                rlist_prev   = rlist_new;
+                bCont        = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
+            }
+            else
+            {
+                /* Stick with the previous nstlist */
+                ir->nstlist = nstlist_prev;
+                rlist_new   = rlist_prev;
+                bBox        = TRUE;
+                bDD         = TRUE;
+            }
+        }
+
+        nstlist_ind++;
+    }
+    while (bCont);
+
+    if (!bBox || !bDD)
+    {
+        gmx_warning(!bBox ? box_err : dd_err);
+        if (fp != NULL)
+        {
+            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
+        }
+        ir->nstlist = nstlist_orig;
+    }
+    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+    {
+        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
+                nstlist_orig, ir->nstlist,
+                ir->rlist, rlist_new);
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n\n", buf);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n\n", buf);
+        }
+        ir->rlist     = rlist_new;
+        ir->rlistlong = rlist_new;
+    }
+}
+
+static void prepare_verlet_scheme(FILE                           *fplog,
+                                  t_commrec                      *cr,
+                                  t_inputrec                     *ir,
+                                  int                             nstlist_cmdline,
+                                  const gmx_mtop_t               *mtop,
+                                  matrix                          box,
+                                  gmx_bool                        bUseGPU)
+{
+    /* For NVE simulations, we will retain the initial list buffer */
+    if (ir->verletbuf_tol > 0 && !(EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        /* Update the Verlet buffer size for the current run setup */
+        verletbuf_list_setup_t ls;
+        real                   rlist_new;
+
+        /* Here we assume SIMD-enabled kernels are being used. But as currently
+         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+         * and 4x2 gives a larger buffer than 4x4, this is ok.
+         */
+        verletbuf_get_list_setup(TRUE, bUseGPU, &ls);
+
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        if (rlist_new != ir->rlist)
+        {
+            if (fplog != NULL)
+            {
+                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+                        ir->rlist, rlist_new,
+                        ls.cluster_size_i, ls.cluster_size_j);
+            }
+            ir->rlist     = rlist_new;
+            ir->rlistlong = rlist_new;
+        }
+    }
+
+    if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
+    {
+        gmx_fatal(FARGS, "Can not set nstlist without %s",
+                  !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
+    }
+
+    if (EI_DYNAMICS(ir->eI))
+    {
+        /* Set or try nstlist values */
+        increase_nstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, bUseGPU);
+    }
+}
+
+static void convert_to_verlet_scheme(FILE *fplog,
+                                     t_inputrec *ir,
+                                     gmx_mtop_t *mtop, real box_vol)
+{
+    char *conv_mesg = "Converting input file with group cut-off scheme to the Verlet cut-off scheme";
+
+    md_print_warn(NULL, fplog, "%s\n", conv_mesg);
+
+    ir->cutoff_scheme = ecutsVERLET;
+    ir->verletbuf_tol = 0.005;
+
+    if (ir->rcoulomb != ir->rvdw)
+    {
+        gmx_fatal(FARGS, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
+    }
+
+    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
+    {
+        gmx_fatal(FARGS, "User non-bonded potentials are not (yet) supported with the Verlet scheme");
+    }
+    else if (ir_vdw_switched(ir) || ir_coulomb_switched(ir))
+    {
+        if (ir_vdw_switched(ir) && ir->vdw_modifier == eintmodNONE)
+        {
+            switch (ir->vdwtype)
+            {
+                case evdwSHIFT:  ir->vdw_modifier = eintmodFORCESWITCH; break;
+                case evdwSWITCH: ir->vdw_modifier = eintmodPOTSWITCH; break;
+                default: gmx_fatal(FARGS, "The Verlet scheme does not support Van der Waals interactions of type '%s'", evdw_names[ir->vdwtype]);
+            }
+
+            ir->vdwtype = evdwCUT;
+        }
+        if (ir_coulomb_switched(ir) && ir->coulomb_modifier == eintmodNONE)
+        {
+            if (EEL_FULL(ir->coulombtype))
+            {
+                /* With full electrostatic only PME can be switched */
+                ir->coulombtype      = eelPME;
+                ir->coulomb_modifier = eintmodPOTSHIFT;
+            }
+            else
+            {
+                md_print_warn(NULL, fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]);
+                ir->coulombtype      = eelRF;
+                ir->epsilon_rf       = 0.0;
+                ir->coulomb_modifier = eintmodPOTSHIFT;
+            }
+        }
+
+        /* We set the pair energy error tolerance to a small number.
+         * Note that this is only for testing. For production the user
+         * should think about this and set the mdp options.
+         */
+        ir->verletbuf_tol = 1e-4;
+    }
+
+    if (inputrec2nboundeddim(ir) != 3)
+    {
+        gmx_fatal(FARGS, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
+    }
+
+    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
+    {
+        gmx_fatal(FARGS, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
+    }
+
+    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        verletbuf_list_setup_t ls;
+
+        verletbuf_get_list_setup(TRUE, FALSE, &ls);
+        calc_verlet_buffer_size(mtop, box_vol, ir, -1, &ls, NULL, &ir->rlist);
+    }
+    else
+    {
+        real rlist_fac;
+
+        if (EI_MD(ir->eI))
+        {
+            rlist_fac       = 1 + verlet_buffer_ratio_NVE_T0;
+        }
+        else
+        {
+            rlist_fac       = 1 + verlet_buffer_ratio_nodynamics;
+        }
+        ir->verletbuf_tol   = -1;
+        ir->rlist           = rlist_fac*max(ir->rvdw, ir->rcoulomb);
+    }
+
+    gmx_mtop_remove_chargegroups(mtop);
+}
+
+static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
+{
+    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+            hw_opt->nthreads_tot,
+            hw_opt->nthreads_tmpi,
+            hw_opt->nthreads_omp,
+            hw_opt->nthreads_omp_pme,
+            hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : "");
+}
+
+/* Checks we can do when we don't (yet) know the cut-off scheme */
+static void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
+                                      gmx_bool      bIsSimMaster)
+{
+    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
+
+#ifndef GMX_THREAD_MPI
+    if (hw_opt->nthreads_tot > 0)
+    {
+        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+    }
+    if (hw_opt->nthreads_tmpi > 0)
+    {
+        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+    }
+#endif
+
+#ifndef GMX_OPENMP
+    if (hw_opt->nthreads_omp > 1)
+    {
+        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but Gromacs was compiled without OpenMP support");
+    }
+    hw_opt->nthreads_omp = 1;
+#endif
+
+    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
+    {
+        /* We have the same number of OpenMP threads for PP and PME processes,
+         * thus we can perform several consistency checks.
+         */
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_omp > 0 &&
+            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
+        }
+
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
+        }
+
+        if (hw_opt->nthreads_omp > 0 &&
+            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
+        }
+
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_omp <= 0)
+        {
+            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+        }
+    }
+
+#ifndef GMX_OPENMP
+    if (hw_opt->nthreads_omp > 1)
+    {
+        gmx_fatal(FARGS, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
+    }
+#endif
+
+    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
+    {
+        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
+    }
+
+    if (hw_opt->nthreads_tot == 1)
+    {
+        hw_opt->nthreads_tmpi = 1;
+
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
+                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
+        }
+        hw_opt->nthreads_omp = 1;
+    }
+
+    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+    {
+        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+    }
+
+    /* Parse GPU IDs, if provided.
+     * We check consistency with the tMPI thread count later.
+     */
+    gmx_parse_gpu_ids(&hw_opt->gpu_opt);
+
+#ifdef GMX_THREAD_MPI
+    if (hw_opt->gpu_opt.ncuda_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
+    {
+        /* Set the number of MPI threads equal to the number of GPUs */
+        hw_opt->nthreads_tmpi = hw_opt->gpu_opt.ncuda_dev_use;
+
+        if (hw_opt->nthreads_tot > 0 &&
+            hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
+        {
+            /* We have more GPUs than total threads requested.
+             * We choose to (later) generate a mismatch error,
+             * instead of launching more threads than requested.
+             */
+            hw_opt->nthreads_tmpi = hw_opt->nthreads_tot;
+        }
+    }
+#endif
+
+    if (debug)
+    {
+        print_hw_opt(debug, hw_opt);
+    }
+}
+
+/* Checks we can do when we know the cut-off scheme */
+static void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
+                                      int           cutoff_scheme)
+{
+    if (cutoff_scheme == ecutsGROUP)
+    {
+        /* We only have OpenMP support for PME only nodes */
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
+                      ecutscheme_names[cutoff_scheme],
+                      ecutscheme_names[ecutsVERLET]);
+        }
+        hw_opt->nthreads_omp = 1;
+    }
+
+    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+    {
+        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+    }
+
+    if (debug)
+    {
+        print_hw_opt(debug, hw_opt);
+    }
+}
+
+
+/* Override the value in inputrec with value passed on the command line (if any) */
+static void override_nsteps_cmdline(FILE            *fplog,
+                                    gmx_int64_t      nsteps_cmdline,
+                                    t_inputrec      *ir,
+                                    const t_commrec *cr)
+{
+    char sbuf[STEPSTRSIZE];
+
+    assert(ir);
+    assert(cr);
+
+    /* override with anything else than the default -2 */
+    if (nsteps_cmdline > -2)
+    {
+        char stmp[STRLEN];
+
+        ir->nsteps = nsteps_cmdline;
+        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
+        {
+            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
+                    gmx_step_str(nsteps_cmdline, sbuf),
+                    fabs(nsteps_cmdline*ir->delta_t));
+        }
+        else
+        {
+            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps",
+                    gmx_step_str(nsteps_cmdline, sbuf));
+        }
+
+        md_print_warn(cr, fplog, "%s\n", stmp);
+    }
+}
+
+/* Frees GPU memory and destroys the CUDA context.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ */
+static void free_gpu_resources(const t_forcerec *fr,
+                               const t_commrec  *cr)
+{
+    gmx_bool bIsPPrankUsingGPU;
+    char     gpu_err_str[STRLEN];
+
+    bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU;
+
+    if (bIsPPrankUsingGPU)
+    {
+        /* free nbnxn data in GPU memory */
+        nbnxn_cuda_free(fr->nbv->cu_nbv);
+
+        /* With tMPI we need to wait for all ranks to finish deallocation before
+         * destroying the context in free_gpu() as some ranks may be sharing
+         * GPU and context.
+         * Note: since only PP ranks need to free GPU resources, it is safe to
+         * not call the barrier on PME ranks.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif  /* GMX_THREAD_MPI */
+
+        /* uninitialize GPU (by destroying the context) */
+        if (!free_gpu(gpu_err_str))
+        {
+            gmx_warning("On rank %d failed to free GPU #%d: %s",
+                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+        }
+    }
+}
+
+int mdrunner(gmx_hw_opt_t *hw_opt,
+             FILE *fplog, t_commrec *cr, int nfile,
+             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+             gmx_bool bCompact, int nstglobalcomm,
+             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+             const char *dddlb_opt, real dlb_scale,
+             const char *ddcsx, const char *ddcsy, const char *ddcsz,
+             const char *nbpu_opt, int nstlist_cmdline,
+             gmx_int64_t nsteps_cmdline, int nstepout, int resetstep,
+             int gmx_unused nmultisim, int repl_ex_nst, int repl_ex_nex,
+             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
+             const char *deviceOptions, int imdport, unsigned long Flags)
+{
+    gmx_bool                  bForceUseGPU, bTryUseGPU, bRerunMD, bCantUseGPU;
+    double                    nodetime = 0, realtime;
+    t_inputrec               *inputrec;
+    t_state                  *state = NULL;
+    matrix                    box;
+    gmx_ddbox_t               ddbox = {0};
+    int                       npme_major, npme_minor;
+    real                      tmpr1, tmpr2;
+    t_nrnb                   *nrnb;
+    gmx_mtop_t               *mtop          = NULL;
+    t_mdatoms                *mdatoms       = NULL;
+    t_forcerec               *fr            = NULL;
+    t_fcdata                 *fcd           = NULL;
+    real                      ewaldcoeff_q  = 0;
+    real                      ewaldcoeff_lj = 0;
+    gmx_pme_t                *pmedata       = NULL;
+    gmx_vsite_t              *vsite         = NULL;
+    gmx_constr_t              constr;
+    int                       i, m, nChargePerturbed = -1, nTypePerturbed = 0, status, nalloc;
+    char                     *gro;
+    gmx_wallcycle_t           wcycle;
+    gmx_bool                  bReadEkin;
+    int                       list;
+    gmx_walltime_accounting_t walltime_accounting = NULL;
+    int                       rc;
+    gmx_int64_t               reset_counters;
+    gmx_edsam_t               ed           = NULL;
+    t_commrec                *cr_old       = cr;
+    int                       nthreads_pme = 1;
+    int                       nthreads_pp  = 1;
+    gmx_membed_t              membed       = NULL;
+    gmx_hw_info_t            *hwinfo       = NULL;
+    /* The master rank decides early on bUseGPU and broadcasts this later */
+    gmx_bool                  bUseGPU      = FALSE;
+
+    /* CAUTION: threads may be started later on in this function, so
+       cr doesn't reflect the final parallel state right now */
+    snew(inputrec, 1);
+    snew(mtop, 1);
+
+    if (Flags & MD_APPENDFILES)
+    {
+        fplog = NULL;
+    }
+
+    bRerunMD     = (Flags & MD_RERUN);
+    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
+    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+    /* Rerun execution time is dominated by I/O and pair search, so
+     * GPUs are not very useful, plus they do not support more than
+     * one energy group. Don't select them when they can't be used,
+     * unless the user requested it, in which case fatal_error is
+     * called later.
+     *
+     * TODO it would be nice to notify the user when this check
+     * causes GPUs not to be used, and explain why, but that will be
+     * easier to do after some future cleanup. */
+    bCantUseGPU = bRerunMD && (inputrec->opts.ngener > 1);
+    bTryUseGPU  = bTryUseGPU && !(bCantUseGPU && !bForceUseGPU);
+
+    /* Detect hardware, gather information. This is an operation that is
+     * global for this process (MPI rank). */
+    hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);
+
+
+    snew(state, 1);
+    if (SIMMASTER(cr))
+    {
+        /* Read (nearly) all data required for the simulation */
+        read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL, mtop);
+
+        if (inputrec->cutoff_scheme != ecutsVERLET &&
+            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
+        {
+            convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
+        }
+
+        if (inputrec->cutoff_scheme == ecutsVERLET)
+        {
+            /* Here the master rank decides if all ranks will use GPUs */
+            bUseGPU = (hwinfo->gpu_info.ncuda_dev_compatible > 0 ||
+                       getenv("GMX_EMULATE_GPU") != NULL);
+
+            /* TODO add GPU kernels for this and replace this check by:
+             * (bUseGPU && (ir->vdwtype == evdwPME &&
+             *               ir->ljpme_combination_rule == eljpmeLB))
+             * update the message text and the content of nbnxn_acceleration_supported.
+             */
+            if (bUseGPU &&
+                !nbnxn_acceleration_supported(fplog, cr, inputrec, bUseGPU))
+            {
+                /* Fallback message printed by nbnxn_acceleration_supported */
+                if (bForceUseGPU)
+                {
+                    gmx_fatal(FARGS, "GPU acceleration requested, but not supported with the given input settings");
+                }
+                bUseGPU = FALSE;
+            }
+
+            prepare_verlet_scheme(fplog, cr,
+                                  inputrec, nstlist_cmdline, mtop, state->box,
+                                  bUseGPU);
+        }
+        else
+        {
+            if (nstlist_cmdline > 0)
+            {
+                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
+            }
+
+            if (hwinfo->gpu_info.ncuda_dev_compatible > 0)
+            {
+                md_print_warn(cr, fplog,
+                              "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+                              "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
+                              "      (for quick performance testing you can use the -testverlet option)\n");
+            }
+
+            if (bForceUseGPU)
+            {
+                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
+            }
+
+#ifdef GMX_TARGET_BGQ
+            md_print_warn(cr, fplog,
+                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
+                          "      BlueGene/Q. You will observe better performance from using the\n"
+                          "      Verlet cut-off scheme.\n");
+#endif
+        }
+
+        if (inputrec->eI == eiSD2)
+        {
+            md_print_warn(cr, fplog, "The stochastic dynamics integrator %s is deprecated, since\n"
+                          "it is slower than integrator %s and is slightly less accurate\n"
+                          "with constraints. Use the %s integrator.",
+                          ei_names[inputrec->eI], ei_names[eiSD1], ei_names[eiSD1]);
+        }
+    }
+
+    /* Check for externally set OpenMP affinity and turn off internal
+     * pinning if any is found. We need to do this check early to tell
+     * thread-MPI whether it should do pinning when spawning threads.
+     * TODO: the above no longer holds, we should move these checks down
+     */
+    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
+
+    /* Check and update the hardware options for internal consistency */
+    check_and_update_hw_opt_1(hw_opt, SIMMASTER(cr));
+
+    if (SIMMASTER(cr))
+    {
+#ifdef GMX_THREAD_MPI
+        /* Early check for externally set process affinity.
+         * With thread-MPI this is needed as pinning might get turned off,
+         * which needs to be known before starting thread-MPI.
+         * With thread-MPI hw_opt is processed here on the master rank
+         * and passed to the other ranks later, so we only do this on master.
+         */
+        gmx_check_thread_affinity_set(fplog,
+                                      NULL,
+                                      hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+#endif
+
+#ifdef GMX_THREAD_MPI
+        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
+        {
+            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
+        }
+#endif
+
+        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
+            cr->npmenodes <= 0)
+        {
+            gmx_fatal(FARGS, "You need to explicitly specify the number of PME ranks (-npme) when using different number of OpenMP threads for PP and PME ranks");
+        }
+    }
+
+#ifdef GMX_THREAD_MPI
+    if (SIMMASTER(cr))
+    {
+        /* Since the master knows the cut-off scheme, update hw_opt for this.
+         * This is done later for normal MPI and also once more with tMPI
+         * for all tMPI ranks.
+         */
+        check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+        /* NOW the threads will be started: */
+        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+                                                 hw_opt,
+                                                 inputrec, mtop,
+                                                 cr, fplog);
+        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
+        {
+            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+        }
+
+        if (hw_opt->nthreads_tmpi > 1)
+        {
+            /* now start the threads. */
+            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
+                                        oenv, bVerbose, bCompact, nstglobalcomm,
+                                        ddxyz, dd_node_order, rdd, rconstr,
+                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+                                        nbpu_opt, nstlist_cmdline,
+                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
+                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+                                        cpt_period, max_hours, deviceOptions,
+                                        Flags);
+            /* the main thread continues here with a new cr. We don't deallocate
+               the old cr because other threads may still be reading it. */
+            if (cr == NULL)
+            {
+                gmx_comm("Failed to spawn threads");
+            }
+        }
+    }
+#endif
+    /* END OF CAUTION: cr is now reliable */
+
+    /* g_membed initialisation *
+     * Because we change the mtop, init_membed is called before the init_parallel *
+     * (in case we ever want to make it run in parallel) */
+    if (opt2bSet("-membed", nfile, fnm))
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Initializing membed");
+        }
+        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
+    }
+
+    if (PAR(cr))
+    {
+        /* now broadcast everything to the non-master nodes/threads: */
+        init_parallel(cr, inputrec, mtop);
+
+        /* The master rank decided on the use of GPUs,
+         * broadcast this information to all ranks.
+         */
+        gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
+    }
+
+    if (fplog != NULL)
+    {
+        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+    }
+
+    /* now make sure the state is initialized and propagated */
+    set_state_entries(state, inputrec);
+
+    /* A parallel command line option consistency check that we can
+       only do after any threads have started. */
+    if (!PAR(cr) &&
+        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
+    {
+        gmx_fatal(FARGS,
+                  "The -dd or -npme option request a parallel simulation, "
+#ifndef GMX_MPI
+                  "but %s was compiled without threads or MPI enabled"
+#else
+#ifdef GMX_THREAD_MPI
+                  "but the number of threads (option -nt) is 1"
+#else
+                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec"
+#endif
+#endif
+                  , ShortProgram()
+                  );
+    }
+
+    if (bRerunMD &&
+        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+    {
+        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+    }
+
+    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
+    }
+
+    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
+    {
+        if (cr->npmenodes > 0)
+        {
+            gmx_fatal_collective(FARGS, cr, NULL,
+                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
+        }
+
+        cr->npmenodes = 0;
+    }
+
+    if (bUseGPU && cr->npmenodes < 0)
+    {
+        /* With GPUs we don't automatically use PME-only ranks. PME ranks can
+         * improve performance with many threads per GPU, since our OpenMP
+         * scaling is bad, but it's difficult to automate the setup.
+         */
+        cr->npmenodes = 0;
+    }
+
+#ifdef GMX_FAHCORE
+    if (MASTER(cr))
+    {
+        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+    }
+#endif
+
+    /* NMR restraints must be initialized before load_checkpoint,
+     * since with time averaging the history is added to t_state.
+     * For proper consistency check we therefore need to extend
+     * t_state here.
+     * So the PME-only nodes (if present) will also initialize
+     * the distance restraints.
+     */
+    snew(fcd, 1);
+
+    /* This needs to be called before read_checkpoint to extend the state */
+    init_disres(fplog, mtop, inputrec, cr, fcd, state, repl_ex_nst > 0);
+
+    init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires),
+                state);
+
+    if (DEFORM(*inputrec))
+    {
+        /* Store the deform reference box before reading the checkpoint */
+        if (SIMMASTER(cr))
+        {
+            copy_mat(state->box, box);
+        }
+        if (PAR(cr))
+        {
+            gmx_bcast(sizeof(box), box, cr);
+        }
+        /* Because we do not have the update struct available yet
+         * in which the reference values should be stored,
+         * we store them temporarily in static variables.
+         * This should be thread safe, since they are only written once
+         * and with identical values.
+         */
+        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+        deform_init_init_step_tpx = inputrec->init_step;
+        copy_mat(box, deform_init_box_tpx);
+        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+    }
+
+    if (opt2bSet("-cpi", nfile, fnm))
+    {
+        /* Check if checkpoint file exists before doing continuation.
+         * This way we can use identical input options for the first and subsequent runs...
+         */
+        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
+        {
+            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+                            cr, ddxyz,
+                            inputrec, state, &bReadEkin,
+                            (Flags & MD_APPENDFILES),
+                            (Flags & MD_APPENDFILESSET));
+
+            if (bReadEkin)
+            {
+                Flags |= MD_READ_EKIN;
+            }
+        }
+    }
+
+    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
+#ifdef GMX_THREAD_MPI
+        /* With thread MPI only the master node/thread exists in mdrun.c,
+         * therefore non-master nodes need to open the "seppot" log file here.
+         */
+        || (!MASTER(cr) && (Flags & MD_SEPPOT))
+#endif
+        )
+    {
+        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT),
+                     Flags, &fplog);
+    }
+
+    /* override nsteps with value from cmdline */
+    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+
+    if (SIMMASTER(cr))
+    {
+        copy_mat(state->box, box);
+    }
+
+    if (PAR(cr))
+    {
+        gmx_bcast(sizeof(box), box, cr);
+    }
+
+    /* Essential dynamics */
+    if (opt2bSet("-ei", nfile, fnm))
+    {
+        /* Open input and output files, allocate space for ED data structure */
+        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
+    }
+
+    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
+                     inputrec->eI == eiNM))
+    {
+        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
+                                           dddlb_opt, dlb_scale,
+                                           ddcsx, ddcsy, ddcsz,
+                                           mtop, inputrec,
+                                           box, state->x,
+                                           &ddbox, &npme_major, &npme_minor);
+
+        make_dd_communicators(fplog, cr, dd_node_order);
+
+        /* Set overallocation to avoid frequent reallocation of arrays */
+        set_over_alloc_dd(TRUE);
+    }
+    else
+    {
+        /* PME, if used, is done on all nodes with 1D decomposition */
+        cr->npmenodes = 0;
+        cr->duty      = (DUTY_PP | DUTY_PME);
+        npme_major    = 1;
+        npme_minor    = 1;
+
+        if (inputrec->ePBC == epbcSCREW)
+        {
+            gmx_fatal(FARGS,
+                      "pbc=%s is only implemented with domain decomposition",
+                      epbc_names[inputrec->ePBC]);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        /* After possible communicator splitting in make_dd_communicators.
+         * we can set up the intra/inter node communication.
+         */
+        gmx_setup_nodecomm(fplog, cr);
+    }
+
+    /* Initialize per-physical-node MPI process/thread ID and counters. */
+    gmx_init_intranode_counters(cr);
+#ifdef GMX_MPI
+    if (MULTISIM(cr))
+    {
+        md_print_info(cr, fplog,
+                      "This is simulation %d out of %d running as a composite Gromacs\n"
+                      "multi-simulation job. Setup for this simulation:\n\n",
+                      cr->ms->sim, cr->ms->nsim);
+    }
+    md_print_info(cr, fplog, "Using %d MPI %s\n",
+                  cr->nnodes,
+#ifdef GMX_THREAD_MPI
+                  cr->nnodes == 1 ? "thread" : "threads"
+#else
+                  cr->nnodes == 1 ? "process" : "processes"
+#endif
+                  );
+    fflush(stderr);
+#endif
+
+    /* Check and update hw_opt for the cut-off scheme */
+    check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+    gmx_omp_nthreads_init(fplog, cr,
+                          hwinfo->nthreads_hw_avail,
+                          hw_opt->nthreads_omp,
+                          hw_opt->nthreads_omp_pme,
+                          (cr->duty & DUTY_PP) == 0,
+                          inputrec->cutoff_scheme == ecutsVERLET);
+
+    if (bUseGPU)
+    {
+        /* Select GPU id's to use */
+        gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
+                           &hw_opt->gpu_opt);
+    }
+    else
+    {
+        /* Ignore (potentially) manually selected GPUs */
+        hw_opt->gpu_opt.ncuda_dev_use = 0;
+    }
+
+    /* check consistency across ranks of things like SIMD
+     * support and number of GPUs selected */
+    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
+
+    if (DOMAINDECOMP(cr))
+    {
+        /* When we share GPUs over ranks, we need to know this for the DLB */
+        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+    }
+
+    /* Get the number of PP/PME threads.
+       PME: the env variable should be read on only one node to make sure it
+       is identical everywhere;
+     */
+    /* TODO nthreads_pp is only used for pinning threads.
+     * This is a temporary solution until we have a hw topology library.
+     */
+    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
+    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
+
+    if (PAR(cr))
+    {
+        /* Master synchronizes its value of reset_counters with all nodes
+         * including PME only nodes */
+        reset_counters = wcycle_get_reset_counters(wcycle);
+        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+        wcycle_set_reset_counters(wcycle, reset_counters);
+    }
+
+    snew(nrnb, 1);
+    if (cr->duty & DUTY_PP)
+    {
+        bcast_state(cr, state);
+
+        /* Initiate forcerecord */
+        fr          = mk_forcerec();
+        fr->hwinfo  = hwinfo;
+        fr->gpu_opt = &hw_opt->gpu_opt;
+        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
+                      opt2fn("-table", nfile, fnm),
+                      opt2fn("-tabletf", nfile, fnm),
+                      opt2fn("-tablep", nfile, fnm),
+                      opt2fn("-tableb", nfile, fnm),
+                      nbpu_opt,
+                      FALSE,
+                      pforce);
+
+        /* version for PCA_NOT_READ_NODE (see md.c) */
+        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
+           "nofile","nofile","nofile","nofile",FALSE,pforce);
+         */
+        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
+
+        /* Initialize QM-MM */
+        if (fr->bQMMM)
+        {
+            init_QMMMrec(cr, mtop, inputrec, fr);
+        }
+
+        /* Initialize the mdatoms structure.
+         * mdatoms is not filled with atom data,
+         * as this can not be done now with domain decomposition.
+         */
+        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
+
+        /* Initialize the virtual site communication */
+        vsite = init_vsite(mtop, cr, FALSE);
+
+        calc_shifts(box, fr->shift_vec);
+
+        /* With periodic molecules the charge groups should be whole at start up
+         * and the virtual sites should not be far from their proper positions.
+         */
+        if (!inputrec->bContinuation && MASTER(cr) &&
+            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        {
+            /* Make molecules whole at start of run */
+            if (fr->ePBC != epbcNONE)
+            {
+                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
+            }
+            if (vsite)
+            {
+                /* Correct initial vsite positions are required
+                 * for the initial distribution in the domain decomposition
+                 * and for the initial shell prediction.
+                 */
+                construct_vsites_mtop(vsite, mtop, state->x);
+            }
+        }
+
+        if (EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))
+        {
+            ewaldcoeff_q  = fr->ewaldcoeff_q;
+            ewaldcoeff_lj = fr->ewaldcoeff_lj;
+            pmedata       = &fr->pmedata;
+        }
+        else
+        {
+            pmedata = NULL;
+        }
+    }
+    else
+    {
+        /* This is a PME only node */
+
+        /* We don't need the state */
+        done_state(state);
+
+        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
+        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
+        snew(pmedata, 1);
+    }
+
+    if (hw_opt->thread_affinity != threadaffOFF)
+    {
+        /* Before setting affinity, check whether the affinity has changed
+         * (which indicates that the OpenMP library has probably changed it
+         * since we first checked).
+         */
+        gmx_check_thread_affinity_set(fplog, cr,
+                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+        /* Set the CPU affinity */
+        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
+    }
+
+    /* Initiate PME if necessary,
+     * either on all nodes or on dedicated PME nodes only. */
+    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
+    {
+        if (mdatoms)
+        {
+            nChargePerturbed = mdatoms->nChargePerturbed;
+            if (EVDW_PME(inputrec->vdwtype))
+            {
+                nTypePerturbed   = mdatoms->nTypePerturbed;
+            }
+        }
+        if (cr->npmenodes > 0)
+        {
+            /* The PME-only nodes need to know nChargePerturbed (FEP on Q) and nTypePerturbed (FEP on LJ) */
+            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+        }
+
+        if (cr->duty & DUTY_PME)
+        {
+            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
+                                  mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
+                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
+            if (status != 0)
+            {
+                gmx_fatal(FARGS, "Error %d initializing PME", status);
+            }
+        }
+    }
+
+
+    if (integrator[inputrec->eI].func == do_md)
+    {
+        /* Turn on signal handling on all nodes */
+        /*
+         * A user signal from the PME nodes (if any)
+         * is communicated to the PP nodes.
+         */
+        signal_handler_install();
+    }
+
+    if (cr->duty & DUTY_PP)
+    {
+        /* Assumes uniform use of the number of OpenMP threads */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+        if (inputrec->ePull != epullNO)
+        {
+            /* Initialize pull code */
+            init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda,
+                      EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
+        }
+
+        if (inputrec->bRot)
+        {
+            /* Initialize enforced rotation code */
+            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
+                     bVerbose, Flags);
+        }
+
+        if (inputrec->eSwapCoords != eswapNO)
+        {
+            /* Initialize ion swapping code */
+            init_swapcoords(fplog, bVerbose, inputrec, opt2fn_master("-swap", nfile, fnm, cr),
+                            mtop, state->x, state->box, &state->swapstate, cr, oenv, Flags);
+        }
+
+        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
+
+        if (DOMAINDECOMP(cr))
+        {
+            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
+                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);
+
+            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);
+
+            setup_dd_grid(fplog, cr->dd);
+        }
+
+        /* Now do whatever the user wants us to do (how flexible...) */
+        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
+                                      oenv, bVerbose, bCompact,
+                                      nstglobalcomm,
+                                      vsite, constr,
+                                      nstepout, inputrec, mtop,
+                                      fcd, state,
+                                      mdatoms, nrnb, wcycle, ed, fr,
+                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                                      membed,
+                                      cpt_period, max_hours,
+                                      deviceOptions,
+                                      imdport,
+                                      Flags,
+                                      walltime_accounting);
+
+        if (inputrec->ePull != epullNO)
+        {
+            finish_pull(inputrec->pull);
+        }
+
+        if (inputrec->bRot)
+        {
+            finish_rot(inputrec->rot);
+        }
+
+    }
+    else
+    {
+        /* do PME only */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff_q, ewaldcoeff_lj, inputrec);
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+
+    /* Finish up, write some stuff
+     * if rerunMD, don't write last frame again
+     */
+    finish_run(fplog, cr,
+               inputrec, nrnb, wcycle, walltime_accounting,
+               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
+               nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
+               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+
+
+    /* Free GPU memory and context */
+    free_gpu_resources(fr, cr);
+
+    if (opt2bSet("-membed", nfile, fnm))
+    {
+        sfree(membed);
+    }
+
+    gmx_hardware_info_free(hwinfo);
+
+    /* Does what it says */
+    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
+    walltime_accounting_destroy(walltime_accounting);
+
+    /* Close logfile already here if we were appending to it */
+    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+    {
+        gmx_log_close(fplog);
+    }
+
+    rc = (int)gmx_get_stop_condition();
+
+#ifdef GMX_THREAD_MPI
+    /* we need to join all threads. The sub-threads join when they
+       exit this function, but the master thread needs to be told to
+       wait for that. */
+    if (PAR(cr) && MASTER(cr))
+    {
+        tMPI_Finalize();
+    }
+#endif
+
+    return rc;
+}
diff --git a/patches/gromacs-5.1.2.diff/src/programs/mdrun/mdrun.cpp b/patches/gromacs-5.1.2.diff/src/programs/mdrun/mdrun.cpp
index 626a3c002f342e3a948d0fd55b2dd8fd7d521ceb..d91e40b12a126c7ca60c1cede7912b7a0511aa00 100644
--- a/patches/gromacs-5.1.2.diff/src/programs/mdrun/mdrun.cpp
+++ b/patches/gromacs-5.1.2.diff/src/programs/mdrun/mdrun.cpp
@@ -575,12 +575,6 @@ int gmx_mdrun(int argc, char *argv[])
                   nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed,
                   pforce, cpt_period, max_hours, imdport, Flags);
 
-    /* PLUMED */
-    if(plumedswitch){
-      plumed_finalize(plumedmain);
-    }
-    /* END PLUMED */
-
     /* Log file has to be closed in mdrunner if we are appending to it
        (fplog not set here) */
     if (MASTER(cr) && !bDoAppendFiles)
diff --git a/patches/gromacs-5.1.2.diff/src/programs/mdrun/runner.cpp b/patches/gromacs-5.1.2.diff/src/programs/mdrun/runner.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..398697eb4abb23cae573f7ced640bd882eba932f
--- /dev/null
+++ b/patches/gromacs-5.1.2.diff/src/programs/mdrun/runner.cpp
@@ -0,0 +1,1363 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/fileio/tpxio.h"
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/legacyheaders/checkpoint.h"
+#include "gromacs/legacyheaders/constr.h"
+#include "gromacs/legacyheaders/copyrite.h"
+#include "gromacs/legacyheaders/disre.h"
+#include "gromacs/legacyheaders/force.h"
+#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+#include "gromacs/legacyheaders/gmx_thread_affinity.h"
+#include "gromacs/legacyheaders/inputrec.h"
+#include "gromacs/legacyheaders/main.h"
+#include "gromacs/legacyheaders/md_logging.h"
+#include "gromacs/legacyheaders/md_support.h"
+#include "gromacs/legacyheaders/mdatoms.h"
+#include "gromacs/legacyheaders/mdrun.h"
+#include "gromacs/legacyheaders/names.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/oenv.h"
+#include "gromacs/legacyheaders/orires.h"
+#include "gromacs/legacyheaders/qmmm.h"
+#include "gromacs/legacyheaders/sighandler.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/math/calculate-ewald-splitting-coefficient.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "deform.h"
+#include "membed.h"
+#include "repl_ex.h"
+#include "resource-division.h"
+
+/* PLUMED */
+#include "../../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+/* END PLUMED */
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+typedef struct {
+    gmx_integrator_t *func;
+} gmx_intp_t;
+
+/* The array should match the eI array in include/types/enums.h */
+const gmx_intp_t    integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
+
+gmx_int64_t         deform_init_init_step_tpx;
+matrix              deform_init_box_tpx;
+tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+
+
+#ifdef GMX_THREAD_MPI
+/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
+ * the number of threads will get lowered.
+ */
+#define MIN_ATOMS_PER_MPI_THREAD    90
+#define MIN_ATOMS_PER_GPU           900
+
+struct mdrunner_arglist
+{
+    gmx_hw_opt_t    hw_opt;
+    FILE           *fplog;
+    t_commrec      *cr;
+    int             nfile;
+    const t_filenm *fnm;
+    output_env_t    oenv;
+    gmx_bool        bVerbose;
+    gmx_bool        bCompact;
+    int             nstglobalcomm;
+    ivec            ddxyz;
+    int             dd_node_order;
+    real            rdd;
+    real            rconstr;
+    const char     *dddlb_opt;
+    real            dlb_scale;
+    const char     *ddcsx;
+    const char     *ddcsy;
+    const char     *ddcsz;
+    const char     *nbpu_opt;
+    int             nstlist_cmdline;
+    gmx_int64_t     nsteps_cmdline;
+    int             nstepout;
+    int             resetstep;
+    int             nmultisim;
+    int             repl_ex_nst;
+    int             repl_ex_nex;
+    int             repl_ex_seed;
+    real            pforce;
+    real            cpt_period;
+    real            max_hours;
+    int             imdport;
+    unsigned long   Flags;
+};
+
+
+/* The function used for spawning threads. Extracts the mdrunner()
+   arguments from its one argument and calls mdrunner(), after making
+   a commrec. */
+static void mdrunner_start_fn(void *arg)
+{
+    struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
+    struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
+                                            that it's thread-local. This doesn't
+                                            copy pointed-to items, of course,
+                                            but those are all const. */
+    t_commrec *cr;                       /* we need a local version of this */
+    FILE      *fplog = NULL;
+    t_filenm  *fnm;
+
+    fnm = dup_tfn(mc.nfile, mc.fnm);
+
+    cr = reinitialize_commrec_for_this_thread(mc.cr);
+
+    if (MASTER(cr))
+    {
+        fplog = mc.fplog;
+    }
+
+    mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+             mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
+             mc.ddxyz, mc.dd_node_order, mc.rdd,
+             mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
+             mc.ddcsx, mc.ddcsy, mc.ddcsz,
+             mc.nbpu_opt, mc.nstlist_cmdline,
+             mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+             mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
+             mc.cpt_period, mc.max_hours, mc.imdport, mc.Flags);
+}
+
+/* called by mdrunner() to start a specific number of threads (including
+   the main thread) for thread-parallel runs. This in turn calls mdrunner()
+   for each thread.
+   All options besides nthreads are the same as for mdrunner(). */
+static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+                                         FILE *fplog, t_commrec *cr, int nfile,
+                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+                                         gmx_bool bCompact, int nstglobalcomm,
+                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+                                         const char *dddlb_opt, real dlb_scale,
+                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
+                                         const char *nbpu_opt, int nstlist_cmdline,
+                                         gmx_int64_t nsteps_cmdline,
+                                         int nstepout, int resetstep,
+                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                                         real pforce, real cpt_period, real max_hours,
+                                         unsigned long Flags)
+{
+    int                      ret;
+    struct mdrunner_arglist *mda;
+    t_commrec               *crn; /* the new commrec */
+    t_filenm                *fnmn;
+
+    /* first check whether we even need to start tMPI */
+    if (hw_opt->nthreads_tmpi < 2)
+    {
+        return cr;
+    }
+
+    /* a few small, one-time, almost unavoidable memory leaks: */
+    snew(mda, 1);
+    fnmn = dup_tfn(nfile, fnm);
+
+    /* fill the data structure to pass as void pointer to thread start fn */
+    /* hw_opt contains pointers, which should all be NULL at this stage */
+    mda->hw_opt          = *hw_opt;
+    mda->fplog           = fplog;
+    mda->cr              = cr;
+    mda->nfile           = nfile;
+    mda->fnm             = fnmn;
+    mda->oenv            = oenv;
+    mda->bVerbose        = bVerbose;
+    mda->bCompact        = bCompact;
+    mda->nstglobalcomm   = nstglobalcomm;
+    mda->ddxyz[XX]       = ddxyz[XX];
+    mda->ddxyz[YY]       = ddxyz[YY];
+    mda->ddxyz[ZZ]       = ddxyz[ZZ];
+    mda->dd_node_order   = dd_node_order;
+    mda->rdd             = rdd;
+    mda->rconstr         = rconstr;
+    mda->dddlb_opt       = dddlb_opt;
+    mda->dlb_scale       = dlb_scale;
+    mda->ddcsx           = ddcsx;
+    mda->ddcsy           = ddcsy;
+    mda->ddcsz           = ddcsz;
+    mda->nbpu_opt        = nbpu_opt;
+    mda->nstlist_cmdline = nstlist_cmdline;
+    mda->nsteps_cmdline  = nsteps_cmdline;
+    mda->nstepout        = nstepout;
+    mda->resetstep       = resetstep;
+    mda->nmultisim       = nmultisim;
+    mda->repl_ex_nst     = repl_ex_nst;
+    mda->repl_ex_nex     = repl_ex_nex;
+    mda->repl_ex_seed    = repl_ex_seed;
+    mda->pforce          = pforce;
+    mda->cpt_period      = cpt_period;
+    mda->max_hours       = max_hours;
+    mda->Flags           = Flags;
+
+    /* now spawn new threads that start mdrunner_start_fn(), while
+       the main thread returns; we set thread affinity later */
+    ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
+                       mdrunner_start_fn, (void*)(mda) );
+    if (ret != TMPI_SUCCESS)
+    {
+        return NULL;
+    }
+
+    crn = reinitialize_commrec_for_this_thread(cr);
+    return crn;
+}
+
+#endif /* GMX_THREAD_MPI */
+
+
+/* We determine the extra cost of the non-bonded kernels compared to
+ * a reference nstlist value of 10 (which is the default in grompp).
+ */
+static const int    nbnxnReferenceNstlist = 10;
+/* The values to try when switching  */
+const int           nstlist_try[] = { 20, 25, 40 };
+#define NNSTL  sizeof(nstlist_try)/sizeof(nstlist_try[0])
+/* Increase nstlist until the non-bonded cost increases more than listfac_ok,
+ * but never more than listfac_max.
+ * A standard (protein+)water system at 300K with PME ewald_rtol=1e-5
+ * needs 1.28 at rcoulomb=0.9 and 1.24 at rcoulomb=1.0 to get to nstlist=40.
+ * Note that both CPU and GPU factors are conservative. Performance should
+ * not go down due to this tuning, except with a relatively slow GPU.
+ * On the other hand, at medium/high parallelization or with fast GPUs
+ * nstlist will not be increased enough to reach optimal performance.
+ */
+/* CPU: pair-search is about a factor 1.5 slower than the non-bonded kernel */
+static const float  nbnxn_cpu_listfac_ok    = 1.05;
+static const float  nbnxn_cpu_listfac_max   = 1.09;
+/* GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel */
+static const float  nbnxn_gpu_listfac_ok    = 1.20;
+static const float  nbnxn_gpu_listfac_max   = 1.30;
+
+/* Try to increase nstlist when using the Verlet cut-off scheme */
+static void increase_nstlist(FILE *fp, t_commrec *cr,
+                             t_inputrec *ir, int nstlist_cmdline,
+                             const gmx_mtop_t *mtop, matrix box,
+                             gmx_bool bGPU)
+{
+    float                  listfac_ok, listfac_max;
+    int                    nstlist_orig, nstlist_prev;
+    verletbuf_list_setup_t ls;
+    real                   rlistWithReferenceNstlist, rlist_inc, rlist_ok, rlist_max;
+    real                   rlist_new, rlist_prev;
+    size_t                 nstlist_ind = 0;
+    t_state                state_tmp;
+    gmx_bool               bBox, bDD, bCont;
+    const char            *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+    const char            *nve_err  = "Can not increase nstlist because an NVE ensemble is used";
+    const char            *vbd_err  = "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
+    const char            *box_err  = "Can not increase nstlist because the box is too small";
+    const char            *dd_err   = "Can not increase nstlist because of domain decomposition limitations";
+    char                   buf[STRLEN];
+    const float            oneThird = 1.0f / 3.0f;
+
+    if (nstlist_cmdline <= 0)
+    {
+        if (ir->nstlist == 1)
+        {
+            /* The user probably set nstlist=1 for a reason,
+             * don't mess with the settings.
+             */
+            return;
+        }
+
+        if (fp != NULL && bGPU && ir->nstlist < nstlist_try[0])
+        {
+            fprintf(fp, nstl_gpu, ir->nstlist);
+        }
+        nstlist_ind = 0;
+        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
+        {
+            nstlist_ind++;
+        }
+        if (nstlist_ind == NNSTL)
+        {
+            /* There are no larger nstlist values to try */
+            return;
+        }
+    }
+
+    if (EI_MD(ir->eI) && ir->etc == etcNO)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", nve_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", nve_err);
+        }
+
+        return;
+    }
+
+    if (ir->verletbuf_tol == 0 && bGPU)
+    {
+        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+    }
+
+    if (ir->verletbuf_tol < 0)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", vbd_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", vbd_err);
+        }
+
+        return;
+    }
+
+    if (bGPU)
+    {
+        listfac_ok  = nbnxn_gpu_listfac_ok;
+        listfac_max = nbnxn_gpu_listfac_max;
+    }
+    else
+    {
+        listfac_ok  = nbnxn_cpu_listfac_ok;
+        listfac_max = nbnxn_cpu_listfac_max;
+    }
+
+    nstlist_orig = ir->nstlist;
+    if (nstlist_cmdline > 0)
+    {
+        if (fp)
+        {
+            sprintf(buf, "Getting nstlist=%d from command line option",
+                    nstlist_cmdline);
+        }
+        ir->nstlist = nstlist_cmdline;
+    }
+
+    verletbuf_get_list_setup(TRUE, bGPU, &ls);
+
+    /* Allow rlist to make the list a given factor larger than the list
+     * would be with the reference value for nstlist (10).
+     */
+    nstlist_prev = ir->nstlist;
+    ir->nstlist  = nbnxnReferenceNstlist;
+    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL,
+                            &rlistWithReferenceNstlist);
+    ir->nstlist  = nstlist_prev;
+
+    /* Determine the pair list size increase due to zero interactions */
+    rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
+                                              mtop->natoms/det(box));
+    rlist_ok  = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_ok, oneThird) - rlist_inc;
+    rlist_max = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_max, oneThird) - rlist_inc;
+    if (debug)
+    {
+        fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
+                rlist_inc, rlist_ok, rlist_max);
+    }
+
+    nstlist_prev = nstlist_orig;
+    rlist_prev   = ir->rlist;
+    do
+    {
+        if (nstlist_cmdline <= 0)
+        {
+            ir->nstlist = nstlist_try[nstlist_ind];
+        }
+
+        /* Set the pair-list buffer size in ir */
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        /* Does rlist fit in the box? */
+        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
+        bDD  = TRUE;
+        if (bBox && DOMAINDECOMP(cr))
+        {
+            /* Check if rlist fits in the domain decomposition */
+            if (inputrec2nboundeddim(ir) < DIM)
+            {
+                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+            }
+            copy_mat(box, state_tmp.box);
+            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
+        }
+
+        if (debug)
+        {
+            fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
+                    ir->nstlist, rlist_new, bBox, bDD);
+        }
+
+        bCont = FALSE;
+
+        if (nstlist_cmdline <= 0)
+        {
+            if (bBox && bDD && rlist_new <= rlist_max)
+            {
+                /* Increase nstlist */
+                nstlist_prev = ir->nstlist;
+                rlist_prev   = rlist_new;
+                bCont        = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
+            }
+            else
+            {
+                /* Stick with the previous nstlist */
+                ir->nstlist = nstlist_prev;
+                rlist_new   = rlist_prev;
+                bBox        = TRUE;
+                bDD         = TRUE;
+            }
+        }
+
+        nstlist_ind++;
+    }
+    while (bCont);
+
+    if (!bBox || !bDD)
+    {
+        gmx_warning(!bBox ? box_err : dd_err);
+        if (fp != NULL)
+        {
+            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
+        }
+        ir->nstlist = nstlist_orig;
+    }
+    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+    {
+        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
+                nstlist_orig, ir->nstlist,
+                ir->rlist, rlist_new);
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n\n", buf);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n\n", buf);
+        }
+        ir->rlist     = rlist_new;
+        ir->rlistlong = rlist_new;
+    }
+}
+
+static void prepare_verlet_scheme(FILE                           *fplog,
+                                  t_commrec                      *cr,
+                                  t_inputrec                     *ir,
+                                  int                             nstlist_cmdline,
+                                  const gmx_mtop_t               *mtop,
+                                  matrix                          box,
+                                  gmx_bool                        bUseGPU)
+{
+    /* For NVE simulations, we will retain the initial list buffer */
+    if (EI_DYNAMICS(ir->eI) &&
+        ir->verletbuf_tol > 0 &&
+        !(EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        /* Update the Verlet buffer size for the current run setup */
+        verletbuf_list_setup_t ls;
+        real                   rlist_new;
+
+        /* Here we assume SIMD-enabled kernels are being used. But as currently
+         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+         * and 4x2 gives a larger buffer than 4x4, this is ok.
+         */
+        verletbuf_get_list_setup(TRUE, bUseGPU, &ls);
+
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        if (rlist_new != ir->rlist)
+        {
+            if (fplog != NULL)
+            {
+                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+                        ir->rlist, rlist_new,
+                        ls.cluster_size_i, ls.cluster_size_j);
+            }
+            ir->rlist     = rlist_new;
+            ir->rlistlong = rlist_new;
+        }
+    }
+
+    if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
+    {
+        gmx_fatal(FARGS, "Can not set nstlist without %s",
+                  !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
+    }
+
+    if (EI_DYNAMICS(ir->eI))
+    {
+        /* Set or try nstlist values */
+        increase_nstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, bUseGPU);
+    }
+}
+
+/* Override the value in inputrec with value passed on the command line (if any) */
+static void override_nsteps_cmdline(FILE            *fplog,
+                                    gmx_int64_t      nsteps_cmdline,
+                                    t_inputrec      *ir,
+                                    const t_commrec *cr)
+{
+    assert(ir);
+    assert(cr);
+
+    /* override with any value other than the default -2 */
+    if (nsteps_cmdline > -2)
+    {
+        char sbuf_steps[STEPSTRSIZE];
+        char sbuf_msg[STRLEN];
+
+        ir->nsteps = nsteps_cmdline;
+        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps),
+                    fabs(nsteps_cmdline*ir->delta_t));
+        }
+        else
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps));
+        }
+
+        md_print_warn(cr, fplog, "%s\n", sbuf_msg);
+    }
+    else if (nsteps_cmdline < -2)
+    {
+        gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %d",
+                  nsteps_cmdline);
+    }
+    /* Do nothing if nsteps_cmdline == -2 */
+}
+
+int mdrunner(gmx_hw_opt_t *hw_opt,
+             FILE *fplog, t_commrec *cr, int nfile,
+             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+             gmx_bool bCompact, int nstglobalcomm,
+             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+             const char *dddlb_opt, real dlb_scale,
+             const char *ddcsx, const char *ddcsy, const char *ddcsz,
+             const char *nbpu_opt, int nstlist_cmdline,
+             gmx_int64_t nsteps_cmdline, int nstepout, int resetstep,
+             int gmx_unused nmultisim, int repl_ex_nst, int repl_ex_nex,
+             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
+             int imdport, unsigned long Flags)
+{
+    gmx_bool                  bForceUseGPU, bTryUseGPU, bRerunMD;
+    t_inputrec               *inputrec;
+    t_state                  *state = NULL;
+    matrix                    box;
+    gmx_ddbox_t               ddbox = {0};
+    int                       npme_major, npme_minor;
+    t_nrnb                   *nrnb;
+    gmx_mtop_t               *mtop          = NULL;
+    t_mdatoms                *mdatoms       = NULL;
+    t_forcerec               *fr            = NULL;
+    t_fcdata                 *fcd           = NULL;
+    real                      ewaldcoeff_q  = 0;
+    real                      ewaldcoeff_lj = 0;
+    struct gmx_pme_t        **pmedata       = NULL;
+    gmx_vsite_t              *vsite         = NULL;
+    gmx_constr_t              constr;
+    int                       nChargePerturbed = -1, nTypePerturbed = 0, status;
+    gmx_wallcycle_t           wcycle;
+    gmx_bool                  bReadEkin;
+    gmx_walltime_accounting_t walltime_accounting = NULL;
+    int                       rc;
+    gmx_int64_t               reset_counters;
+    gmx_edsam_t               ed           = NULL;
+    int                       nthreads_pme = 1;
+    int                       nthreads_pp  = 1;
+    gmx_membed_t              membed       = NULL;
+    gmx_hw_info_t            *hwinfo       = NULL;
+    /* The master rank decides early on bUseGPU and broadcasts this later */
+    gmx_bool                  bUseGPU      = FALSE;
+
+    /* CAUTION: threads may be started later on in this function, so
+       cr doesn't reflect the final parallel state right now */
+    snew(inputrec, 1);
+    snew(mtop, 1);
+
+    if (Flags & MD_APPENDFILES)
+    {
+        fplog = NULL;
+    }
+
+    bRerunMD     = (Flags & MD_RERUN);
+    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
+    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+
+    /* Detect hardware, gather information. This is an operation that is
+     * global for this process (MPI rank). */
+    hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);
+
+    gmx_print_detected_hardware(fplog, cr, hwinfo);
+
+    if (fplog != NULL)
+    {
+        /* Print references after all software/hardware printing */
+        please_cite(fplog, "Abraham2015");
+        please_cite(fplog, "Pall2015");
+        please_cite(fplog, "Pronk2013");
+        please_cite(fplog, "Hess2008b");
+        please_cite(fplog, "Spoel2005a");
+        please_cite(fplog, "Lindahl2001a");
+        please_cite(fplog, "Berendsen95a");
+    }
+
+    snew(state, 1);
+    if (SIMMASTER(cr))
+    {
+        /* Read (nearly) all data required for the simulation */
+        read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, state, NULL, mtop);
+
+        if (inputrec->cutoff_scheme == ecutsVERLET)
+        {
+            /* Here the master rank decides if all ranks will use GPUs */
+            bUseGPU = (hwinfo->gpu_info.n_dev_compatible > 0 ||
+                       getenv("GMX_EMULATE_GPU") != NULL);
+
+            /* TODO add GPU kernels for this and replace this check by:
+             * (bUseGPU && (ir->vdwtype == evdwPME &&
+             *               ir->ljpme_combination_rule == eljpmeLB))
+             * update the message text and the content of nbnxn_acceleration_supported.
+             */
+            if (bUseGPU &&
+                !nbnxn_gpu_acceleration_supported(fplog, cr, inputrec, bRerunMD))
+            {
+                /* Fallback message printed by nbnxn_acceleration_supported */
+                if (bForceUseGPU)
+                {
+                    gmx_fatal(FARGS, "GPU acceleration requested, but not supported with the given input settings");
+                }
+                bUseGPU = FALSE;
+            }
+
+            prepare_verlet_scheme(fplog, cr,
+                                  inputrec, nstlist_cmdline, mtop, state->box,
+                                  bUseGPU);
+        }
+        else
+        {
+            if (nstlist_cmdline > 0)
+            {
+                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
+            }
+
+            if (hwinfo->gpu_info.n_dev_compatible > 0)
+            {
+                md_print_warn(cr, fplog,
+                              "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+                              "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n");
+            }
+
+            if (bForceUseGPU)
+            {
+                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
+            }
+
+#ifdef GMX_TARGET_BGQ
+            md_print_warn(cr, fplog,
+                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
+                          "      BlueGene/Q. You will observe better performance from using the\n"
+                          "      Verlet cut-off scheme.\n");
+#endif
+        }
+
+        if (inputrec->eI == eiSD2)
+        {
+            md_print_warn(cr, fplog, "The stochastic dynamics integrator %s is deprecated, since\n"
+                          "it is slower than integrator %s and is slightly less accurate\n"
+                          "with constraints. Use the %s integrator.",
+                          ei_names[inputrec->eI], ei_names[eiSD1], ei_names[eiSD1]);
+        }
+    }
+
+    /* Check and update the hardware options for internal consistency */
+    check_and_update_hw_opt_1(hw_opt, cr);
+
+    /* Early check for externally set process affinity. */
+    gmx_check_thread_affinity_set(fplog, cr,
+                                  hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+
+#ifdef GMX_THREAD_MPI
+    if (SIMMASTER(cr))
+    {
+        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
+        {
+            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
+        }
+
+        /* Since the master knows the cut-off scheme, update hw_opt for this.
+         * This is done later for normal MPI and also once more with tMPI
+         * for all tMPI ranks.
+         */
+        check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+        /* NOW the threads will be started: */
+        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+                                                 hw_opt,
+                                                 inputrec, mtop,
+                                                 cr, fplog, bUseGPU);
+
+        if (hw_opt->nthreads_tmpi > 1)
+        {
+            t_commrec *cr_old       = cr;
+            /* now start the threads. */
+            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
+                                        oenv, bVerbose, bCompact, nstglobalcomm,
+                                        ddxyz, dd_node_order, rdd, rconstr,
+                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+                                        nbpu_opt, nstlist_cmdline,
+                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
+                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+                                        cpt_period, max_hours,
+                                        Flags);
+            /* the main thread continues here with a new cr. We don't deallocate
+               the old cr because other threads may still be reading it. */
+            if (cr == NULL)
+            {
+                gmx_comm("Failed to spawn threads");
+            }
+        }
+    }
+#endif
+    /* END OF CAUTION: cr is now reliable */
+
+    /* g_membed initialisation *
+     * Because we change the mtop, init_membed is called before the init_parallel *
+     * (in case we ever want to make it run in parallel) */
+    if (opt2bSet("-membed", nfile, fnm))
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Initializing membed");
+        }
+        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
+    }
+
+    if (PAR(cr))
+    {
+        /* now broadcast everything to the non-master nodes/threads: */
+        init_parallel(cr, inputrec, mtop);
+
+        /* The master rank decided on the use of GPUs,
+         * broadcast this information to all ranks.
+         */
+        gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
+    }
+
+    if (fplog != NULL)
+    {
+        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+        fprintf(fplog, "\n");
+    }
+
+    /* now make sure the state is initialized and propagated */
+    set_state_entries(state, inputrec);
+
+    /* A parallel command line option consistency check that we can
+       only do after any threads have started. */
+    if (!PAR(cr) &&
+        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
+    {
+        gmx_fatal(FARGS,
+                  "The -dd or -npme option request a parallel simulation, "
+#ifndef GMX_MPI
+                  "but %s was compiled without threads or MPI enabled"
+#else
+#ifdef GMX_THREAD_MPI
+                  "but the number of threads (option -nt) is 1"
+#else
+                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec"
+#endif
+#endif
+                  , output_env_get_program_display_name(oenv)
+                  );
+    }
+
+    if (bRerunMD &&
+        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+    {
+        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+    }
+
+    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
+    }
+
+    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
+    {
+        if (cr->npmenodes > 0)
+        {
+            gmx_fatal_collective(FARGS, cr, NULL,
+                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
+        }
+
+        cr->npmenodes = 0;
+    }
+
+    if (bUseGPU && cr->npmenodes < 0)
+    {
+        /* With GPUs we don't automatically use PME-only ranks. PME ranks can
+         * improve performance with many threads per GPU, since our OpenMP
+         * scaling is bad, but it's difficult to automate the setup.
+         */
+        cr->npmenodes = 0;
+    }
+
+#ifdef GMX_FAHCORE
+    if (MASTER(cr))
+    {
+        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+    }
+#endif
+
+    /* NMR restraints must be initialized before load_checkpoint,
+     * since with time averaging the history is added to t_state.
+     * For proper consistency check we therefore need to extend
+     * t_state here.
+     * So the PME-only nodes (if present) will also initialize
+     * the distance restraints.
+     */
+    snew(fcd, 1);
+
+    /* This needs to be called before read_checkpoint to extend the state */
+    init_disres(fplog, mtop, inputrec, cr, fcd, state, repl_ex_nst > 0);
+
+    init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires),
+                state);
+
+    if (DEFORM(*inputrec))
+    {
+        /* Store the deform reference box before reading the checkpoint */
+        if (SIMMASTER(cr))
+        {
+            copy_mat(state->box, box);
+        }
+        if (PAR(cr))
+        {
+            gmx_bcast(sizeof(box), box, cr);
+        }
+        /* Because we do not have the update struct available yet
+         * in which the reference values should be stored,
+         * we store them temporarily in static variables.
+         * This should be thread safe, since they are only written once
+         * and with identical values.
+         */
+        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+        deform_init_init_step_tpx = inputrec->init_step;
+        copy_mat(box, deform_init_box_tpx);
+        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+    }
+
+    if (opt2bSet("-cpi", nfile, fnm))
+    {
+        /* Check if checkpoint file exists before doing continuation.
+         * This way we can use identical input options for the first and subsequent runs...
+         */
+        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
+        {
+            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+                            cr, ddxyz,
+                            inputrec, state, &bReadEkin,
+                            (Flags & MD_APPENDFILES),
+                            (Flags & MD_APPENDFILESSET));
+
+            if (bReadEkin)
+            {
+                Flags |= MD_READ_EKIN;
+            }
+        }
+    }
+
+    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+    {
+        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr,
+                     Flags, &fplog);
+    }
+
+    /* override nsteps with value from cmdline */
+    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+
+    if (SIMMASTER(cr))
+    {
+        copy_mat(state->box, box);
+    }
+
+    if (PAR(cr))
+    {
+        gmx_bcast(sizeof(box), box, cr);
+    }
+
+    /* Essential dynamics */
+    if (opt2bSet("-ei", nfile, fnm))
+    {
+        /* Open input and output files, allocate space for ED data structure */
+        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
+    }
+
+    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
+                     inputrec->eI == eiNM))
+    {
+        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
+                                           dddlb_opt, dlb_scale,
+                                           ddcsx, ddcsy, ddcsz,
+                                           mtop, inputrec,
+                                           box, state->x,
+                                           &ddbox, &npme_major, &npme_minor);
+
+        make_dd_communicators(fplog, cr, dd_node_order);
+
+        /* Set overallocation to avoid frequent reallocation of arrays */
+        set_over_alloc_dd(TRUE);
+    }
+    else
+    {
+        /* PME, if used, is done on all nodes with 1D decomposition */
+        cr->npmenodes = 0;
+        cr->duty      = (DUTY_PP | DUTY_PME);
+        npme_major    = 1;
+        npme_minor    = 1;
+
+        if (inputrec->ePBC == epbcSCREW)
+        {
+            gmx_fatal(FARGS,
+                      "pbc=%s is only implemented with domain decomposition",
+                      epbc_names[inputrec->ePBC]);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        /* After possible communicator splitting in make_dd_communicators.
+         * we can set up the intra/inter node communication.
+         */
+        gmx_setup_nodecomm(fplog, cr);
+    }
+
+    /* Initialize per-physical-node MPI process/thread ID and counters. */
+    gmx_init_intranode_counters(cr);
+#ifdef GMX_MPI
+    if (MULTISIM(cr))
+    {
+        md_print_info(cr, fplog,
+                      "This is simulation %d out of %d running as a composite GROMACS\n"
+                      "multi-simulation job. Setup for this simulation:\n\n",
+                      cr->ms->sim, cr->ms->nsim);
+    }
+    md_print_info(cr, fplog, "Using %d MPI %s\n",
+                  cr->nnodes,
+#ifdef GMX_THREAD_MPI
+                  cr->nnodes == 1 ? "thread" : "threads"
+#else
+                  cr->nnodes == 1 ? "process" : "processes"
+#endif
+                  );
+    fflush(stderr);
+#endif
+
+    /* Check and update hw_opt for the cut-off scheme */
+    check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+    /* Check and update hw_opt for the number of MPI ranks */
+    check_and_update_hw_opt_3(hw_opt);
+
+    gmx_omp_nthreads_init(fplog, cr,
+                          hwinfo->nthreads_hw_avail,
+                          hw_opt->nthreads_omp,
+                          hw_opt->nthreads_omp_pme,
+                          (cr->duty & DUTY_PP) == 0,
+                          inputrec->cutoff_scheme == ecutsVERLET);
+
+#ifndef NDEBUG
+    if (integrator[inputrec->eI].func != do_tpi &&
+        inputrec->cutoff_scheme == ecutsVERLET)
+    {
+        gmx_feenableexcept();
+    }
+#endif
+
+    if (bUseGPU)
+    {
+        /* Select GPU id's to use */
+        gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
+                           &hw_opt->gpu_opt);
+    }
+    else
+    {
+        /* Ignore (potentially) manually selected GPUs */
+        hw_opt->gpu_opt.n_dev_use = 0;
+    }
+
+    /* check consistency across ranks of things like SIMD
+     * support and number of GPUs selected */
+    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
+
+    /* Now that we know the setup is consistent, check for efficiency */
+    check_resource_division_efficiency(hwinfo, hw_opt, Flags & MD_NTOMPSET,
+                                       cr, fplog);
+
+    if (DOMAINDECOMP(cr))
+    {
+        /* When we share GPUs over ranks, we need to know this for the DLB */
+        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+    }
+
+    /* getting number of PP/PME threads
+       PME: env variable should be read only on one node to make sure it is
+       identical everywhere;
+     */
+    /* TODO nthreads_pp is only used for pinning threads.
+     * This is a temporary solution until we have a hw topology library.
+     */
+    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
+    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
+
+    if (PAR(cr))
+    {
+        /* Master synchronizes its value of reset_counters with all nodes
+         * including PME only nodes */
+        reset_counters = wcycle_get_reset_counters(wcycle);
+        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+        wcycle_set_reset_counters(wcycle, reset_counters);
+    }
+
+    snew(nrnb, 1);
+    if (cr->duty & DUTY_PP)
+    {
+        bcast_state(cr, state);
+
+        /* Initiate forcerecord */
+        fr          = mk_forcerec();
+        fr->hwinfo  = hwinfo;
+        fr->gpu_opt = &hw_opt->gpu_opt;
+        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
+                      opt2fn("-table", nfile, fnm),
+                      opt2fn("-tabletf", nfile, fnm),
+                      opt2fn("-tablep", nfile, fnm),
+                      opt2fn("-tableb", nfile, fnm),
+                      nbpu_opt,
+                      FALSE,
+                      pforce);
+
+        /* version for PCA_NOT_READ_NODE (see md.c) */
+        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
+           "nofile","nofile","nofile","nofile",FALSE,pforce);
+         */
+
+        /* Initialize QM-MM */
+        if (fr->bQMMM)
+        {
+            init_QMMMrec(cr, mtop, inputrec, fr);
+        }
+
+        /* Initialize the mdatoms structure.
+         * mdatoms is not filled with atom data,
+         * as this can not be done now with domain decomposition.
+         */
+        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
+
+        /* Initialize the virtual site communication */
+        vsite = init_vsite(mtop, cr, FALSE);
+
+        calc_shifts(box, fr->shift_vec);
+
+        /* With periodic molecules the charge groups should be whole at start up
+         * and the virtual sites should not be far from their proper positions.
+         */
+        if (!inputrec->bContinuation && MASTER(cr) &&
+            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        {
+            /* Make molecules whole at start of run */
+            if (fr->ePBC != epbcNONE)
+            {
+                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
+            }
+            if (vsite)
+            {
+                /* Correct initial vsite positions are required
+                 * for the initial distribution in the domain decomposition
+                 * and for the initial shell prediction.
+                 */
+                construct_vsites_mtop(vsite, mtop, state->x);
+            }
+        }
+
+        if (EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))
+        {
+            ewaldcoeff_q  = fr->ewaldcoeff_q;
+            ewaldcoeff_lj = fr->ewaldcoeff_lj;
+            pmedata       = &fr->pmedata;
+        }
+        else
+        {
+            pmedata = NULL;
+        }
+    }
+    else
+    {
+        /* This is a PME only node */
+
+        /* We don't need the state */
+        done_state(state);
+
+        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
+        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
+        snew(pmedata, 1);
+    }
+
+    if (hw_opt->thread_affinity != threadaffOFF)
+    {
+        /* Before setting affinity, check whether the affinity has changed
+         * - which indicates that probably the OpenMP library has changed it
+         * since we first checked).
+         */
+        gmx_check_thread_affinity_set(fplog, cr,
+                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+        /* Set the CPU affinity */
+        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
+    }
+
+    /* Initiate PME if necessary,
+     * either on all nodes or on dedicated PME nodes only. */
+    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
+    {
+        if (mdatoms)
+        {
+            nChargePerturbed = mdatoms->nChargePerturbed;
+            if (EVDW_PME(inputrec->vdwtype))
+            {
+                nTypePerturbed   = mdatoms->nTypePerturbed;
+            }
+        }
+        if (cr->npmenodes > 0)
+        {
+            /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
+            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+        }
+
+        if (cr->duty & DUTY_PME)
+        {
+            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
+                                  mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
+                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
+            if (status != 0)
+            {
+                gmx_fatal(FARGS, "Error %d initializing PME", status);
+            }
+        }
+    }
+
+
+    if (integrator[inputrec->eI].func == do_md)
+    {
+        /* Turn on signal handling on all nodes */
+        /*
+         * (A user signal from the PME nodes (if any)
+         * is communicated to the PP nodes.
+         */
+        signal_handler_install();
+    }
+
+    if (cr->duty & DUTY_PP)
+    {
+        /* Assumes uniform use of the number of OpenMP threads */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+        if (inputrec->bPull)
+        {
+            /* Initialize pull code */
+            inputrec->pull_work =
+                init_pull(fplog, inputrec->pull, inputrec, nfile, fnm,
+                          mtop, cr, oenv, inputrec->fepvals->init_lambda,
+                          EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
+        }
+
+        if (inputrec->bRot)
+        {
+            /* Initialize enforced rotation code */
+            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
+                     bVerbose, Flags);
+        }
+
+        if (inputrec->eSwapCoords != eswapNO)
+        {
+            /* Initialize ion swapping code */
+            init_swapcoords(fplog, bVerbose, inputrec, opt2fn_master("-swap", nfile, fnm, cr),
+                            mtop, state->x, state->box, &state->swapstate, cr, oenv, Flags);
+        }
+
+        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
+
+        if (DOMAINDECOMP(cr))
+        {
+            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
+            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
+                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);
+
+            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);
+
+            setup_dd_grid(fplog, cr->dd);
+        }
+
+        /* Now do whatever the user wants us to do (how flexible...) */
+        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
+                                      oenv, bVerbose, bCompact,
+                                      nstglobalcomm,
+                                      vsite, constr,
+                                      nstepout, inputrec, mtop,
+                                      fcd, state,
+                                      mdatoms, nrnb, wcycle, ed, fr,
+                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                                      membed,
+                                      cpt_period, max_hours,
+                                      imdport,
+                                      Flags,
+                                      walltime_accounting);
+
+        if (inputrec->bPull)
+        {
+            finish_pull(inputrec->pull_work);
+        }
+
+        if (inputrec->bRot)
+        {
+            finish_rot(inputrec->rot);
+        }
+
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
+        /* do PME only */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff_q, ewaldcoeff_lj, inputrec);
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+
+    /* Finish up, write some stuff
+     * if rerunMD, don't write last frame again
+     */
+    finish_run(fplog, cr,
+               inputrec, nrnb, wcycle, walltime_accounting,
+               fr ? fr->nbv : NULL,
+               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+
+
+    /* Free GPU memory and context */
+    free_gpu_resources(fr, cr, &hwinfo->gpu_info, fr ? fr->gpu_opt : NULL);
+
+    if (opt2bSet("-membed", nfile, fnm))
+    {
+        sfree(membed);
+    }
+
+    gmx_hardware_info_free(hwinfo);
+
+    /* Does what it says */
+    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
+    walltime_accounting_destroy(walltime_accounting);
+
+    /* PLUMED */
+    if(plumedswitch){
+      plumed_finalize(plumedmain);
+    }
+    /* END PLUMED */
+
+    /* Close logfile already here if we were appending to it */
+    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+    {
+        gmx_log_close(fplog);
+    }
+
+    rc = (int)gmx_get_stop_condition();
+
+    done_ed(&ed);
+
+#ifdef GMX_THREAD_MPI
+    /* we need to join all threads. The sub-threads join when they
+       exit this function, but the master thread needs to be told to
+       wait for that. */
+    if (PAR(cr) && MASTER(cr))
+    {
+        tMPI_Finalize();
+    }
+#endif
+
+    return rc;
+}
diff --git a/patches/gromacs-5.1.2.diff/src/programs/mdrun/runner.cpp.preplumed b/patches/gromacs-5.1.2.diff/src/programs/mdrun/runner.cpp.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..f436a01887bad6ced61a7159625c307df783a799
--- /dev/null
+++ b/patches/gromacs-5.1.2.diff/src/programs/mdrun/runner.cpp.preplumed
@@ -0,0 +1,1351 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/fileio/tpxio.h"
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/legacyheaders/checkpoint.h"
+#include "gromacs/legacyheaders/constr.h"
+#include "gromacs/legacyheaders/copyrite.h"
+#include "gromacs/legacyheaders/disre.h"
+#include "gromacs/legacyheaders/force.h"
+#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+#include "gromacs/legacyheaders/gmx_thread_affinity.h"
+#include "gromacs/legacyheaders/inputrec.h"
+#include "gromacs/legacyheaders/main.h"
+#include "gromacs/legacyheaders/md_logging.h"
+#include "gromacs/legacyheaders/md_support.h"
+#include "gromacs/legacyheaders/mdatoms.h"
+#include "gromacs/legacyheaders/mdrun.h"
+#include "gromacs/legacyheaders/names.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/oenv.h"
+#include "gromacs/legacyheaders/orires.h"
+#include "gromacs/legacyheaders/qmmm.h"
+#include "gromacs/legacyheaders/sighandler.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/math/calculate-ewald-splitting-coefficient.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "deform.h"
+#include "membed.h"
+#include "repl_ex.h"
+#include "resource-division.h"
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+typedef struct {
+    gmx_integrator_t *func;
+} gmx_intp_t;
+
+/* The array should match the eI array in include/types/enums.h */
+const gmx_intp_t    integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
+
+gmx_int64_t         deform_init_init_step_tpx;
+matrix              deform_init_box_tpx;
+tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+
+
+#ifdef GMX_THREAD_MPI
+/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
+ * the number of threads will get lowered.
+ */
+#define MIN_ATOMS_PER_MPI_THREAD    90
+#define MIN_ATOMS_PER_GPU           900
+
+struct mdrunner_arglist
+{
+    gmx_hw_opt_t    hw_opt;
+    FILE           *fplog;
+    t_commrec      *cr;
+    int             nfile;
+    const t_filenm *fnm;
+    output_env_t    oenv;
+    gmx_bool        bVerbose;
+    gmx_bool        bCompact;
+    int             nstglobalcomm;
+    ivec            ddxyz;
+    int             dd_node_order;
+    real            rdd;
+    real            rconstr;
+    const char     *dddlb_opt;
+    real            dlb_scale;
+    const char     *ddcsx;
+    const char     *ddcsy;
+    const char     *ddcsz;
+    const char     *nbpu_opt;
+    int             nstlist_cmdline;
+    gmx_int64_t     nsteps_cmdline;
+    int             nstepout;
+    int             resetstep;
+    int             nmultisim;
+    int             repl_ex_nst;
+    int             repl_ex_nex;
+    int             repl_ex_seed;
+    real            pforce;
+    real            cpt_period;
+    real            max_hours;
+    int             imdport;
+    unsigned long   Flags;
+};
+
+
+/* The function used for spawning threads. Extracts the mdrunner()
+   arguments from its one argument and calls mdrunner(), after making
+   a commrec. */
+static void mdrunner_start_fn(void *arg)
+{
+    struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
+    struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
+                                            that it's thread-local. This doesn't
+                                            copy pointed-to items, of course,
+                                            but those are all const. */
+    t_commrec *cr;                       /* we need a local version of this */
+    FILE      *fplog = NULL;
+    t_filenm  *fnm;
+
+    fnm = dup_tfn(mc.nfile, mc.fnm);
+
+    cr = reinitialize_commrec_for_this_thread(mc.cr);
+
+    if (MASTER(cr))
+    {
+        fplog = mc.fplog;
+    }
+
+    mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+             mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
+             mc.ddxyz, mc.dd_node_order, mc.rdd,
+             mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
+             mc.ddcsx, mc.ddcsy, mc.ddcsz,
+             mc.nbpu_opt, mc.nstlist_cmdline,
+             mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+             mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
+             mc.cpt_period, mc.max_hours, mc.imdport, mc.Flags);
+}
+
+/* called by mdrunner() to start a specific number of threads (including
+   the main thread) for thread-parallel runs. This in turn calls mdrunner()
+   for each thread.
+   All options besides nthreads are the same as for mdrunner(). */
+static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+                                         FILE *fplog, t_commrec *cr, int nfile,
+                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+                                         gmx_bool bCompact, int nstglobalcomm,
+                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+                                         const char *dddlb_opt, real dlb_scale,
+                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
+                                         const char *nbpu_opt, int nstlist_cmdline,
+                                         gmx_int64_t nsteps_cmdline,
+                                         int nstepout, int resetstep,
+                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                                         real pforce, real cpt_period, real max_hours,
+                                         unsigned long Flags)
+{
+    int                      ret;
+    struct mdrunner_arglist *mda;
+    t_commrec               *crn; /* the new commrec */
+    t_filenm                *fnmn;
+
+    /* first check whether we even need to start tMPI */
+    if (hw_opt->nthreads_tmpi < 2)
+    {
+        return cr;
+    }
+
+    /* a few small, one-time, almost unavoidable memory leaks: */
+    snew(mda, 1);
+    fnmn = dup_tfn(nfile, fnm);
+
+    /* fill the data structure to pass as void pointer to thread start fn */
+    /* hw_opt contains pointers, which should all be NULL at this stage */
+    mda->hw_opt          = *hw_opt;
+    mda->fplog           = fplog;
+    mda->cr              = cr;
+    mda->nfile           = nfile;
+    mda->fnm             = fnmn;
+    mda->oenv            = oenv;
+    mda->bVerbose        = bVerbose;
+    mda->bCompact        = bCompact;
+    mda->nstglobalcomm   = nstglobalcomm;
+    mda->ddxyz[XX]       = ddxyz[XX];
+    mda->ddxyz[YY]       = ddxyz[YY];
+    mda->ddxyz[ZZ]       = ddxyz[ZZ];
+    mda->dd_node_order   = dd_node_order;
+    mda->rdd             = rdd;
+    mda->rconstr         = rconstr;
+    mda->dddlb_opt       = dddlb_opt;
+    mda->dlb_scale       = dlb_scale;
+    mda->ddcsx           = ddcsx;
+    mda->ddcsy           = ddcsy;
+    mda->ddcsz           = ddcsz;
+    mda->nbpu_opt        = nbpu_opt;
+    mda->nstlist_cmdline = nstlist_cmdline;
+    mda->nsteps_cmdline  = nsteps_cmdline;
+    mda->nstepout        = nstepout;
+    mda->resetstep       = resetstep;
+    mda->nmultisim       = nmultisim;
+    mda->repl_ex_nst     = repl_ex_nst;
+    mda->repl_ex_nex     = repl_ex_nex;
+    mda->repl_ex_seed    = repl_ex_seed;
+    mda->pforce          = pforce;
+    mda->cpt_period      = cpt_period;
+    mda->max_hours       = max_hours;
+    mda->Flags           = Flags;
+
+    /* now spawn new threads that start mdrunner_start_fn(), while
+       the main thread returns, we set thread affinity later */
+    ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
+                       mdrunner_start_fn, (void*)(mda) );
+    if (ret != TMPI_SUCCESS)
+    {
+        return NULL;
+    }
+
+    crn = reinitialize_commrec_for_this_thread(cr);
+    return crn;
+}
+
+#endif /* GMX_THREAD_MPI */
+
+
+/* We determine the extra cost of the non-bonded kernels compared to
+ * a reference nstlist value of 10 (which is the default in grompp).
+ */
+static const int    nbnxnReferenceNstlist = 10;
+/* The values to try when switching  */
+const int           nstlist_try[] = { 20, 25, 40 };
+#define NNSTL  sizeof(nstlist_try)/sizeof(nstlist_try[0])
+/* Increase nstlist until the non-bonded cost increases more than listfac_ok,
+ * but never more than listfac_max.
+ * A standard (protein+)water system at 300K with PME ewald_rtol=1e-5
+ * needs 1.28 at rcoulomb=0.9 and 1.24 at rcoulomb=1.0 to get to nstlist=40.
+ * Note that both CPU and GPU factors are conservative. Performance should
+ * not go down due to this tuning, except with a relatively slow GPU.
+ * On the other hand, at medium/high parallelization or with fast GPUs
+ * nstlist will not be increased enough to reach optimal performance.
+ */
+/* CPU: pair-search is about a factor 1.5 slower than the non-bonded kernel */
+static const float  nbnxn_cpu_listfac_ok    = 1.05;
+static const float  nbnxn_cpu_listfac_max   = 1.09;
+/* GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel */
+static const float  nbnxn_gpu_listfac_ok    = 1.20;
+static const float  nbnxn_gpu_listfac_max   = 1.30;
+
+/* Try to increase nstlist when using the Verlet cut-off scheme */
+static void increase_nstlist(FILE *fp, t_commrec *cr,
+                             t_inputrec *ir, int nstlist_cmdline,
+                             const gmx_mtop_t *mtop, matrix box,
+                             gmx_bool bGPU)
+{
+    float                  listfac_ok, listfac_max;
+    int                    nstlist_orig, nstlist_prev;
+    verletbuf_list_setup_t ls;
+    real                   rlistWithReferenceNstlist, rlist_inc, rlist_ok, rlist_max;
+    real                   rlist_new, rlist_prev;
+    size_t                 nstlist_ind = 0;
+    t_state                state_tmp;
+    gmx_bool               bBox, bDD, bCont;
+    const char            *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+    const char            *nve_err  = "Can not increase nstlist because an NVE ensemble is used";
+    const char            *vbd_err  = "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
+    const char            *box_err  = "Can not increase nstlist because the box is too small";
+    const char            *dd_err   = "Can not increase nstlist because of domain decomposition limitations";
+    char                   buf[STRLEN];
+    const float            oneThird = 1.0f / 3.0f;
+
+    if (nstlist_cmdline <= 0)
+    {
+        if (ir->nstlist == 1)
+        {
+            /* The user probably set nstlist=1 for a reason,
+             * don't mess with the settings.
+             */
+            return;
+        }
+
+        if (fp != NULL && bGPU && ir->nstlist < nstlist_try[0])
+        {
+            fprintf(fp, nstl_gpu, ir->nstlist);
+        }
+        nstlist_ind = 0;
+        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
+        {
+            nstlist_ind++;
+        }
+        if (nstlist_ind == NNSTL)
+        {
+            /* There are no larger nstlist value to try */
+            return;
+        }
+    }
+
+    if (EI_MD(ir->eI) && ir->etc == etcNO)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", nve_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", nve_err);
+        }
+
+        return;
+    }
+
+    if (ir->verletbuf_tol == 0 && bGPU)
+    {
+        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+    }
+
+    if (ir->verletbuf_tol < 0)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", vbd_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", vbd_err);
+        }
+
+        return;
+    }
+
+    if (bGPU)
+    {
+        listfac_ok  = nbnxn_gpu_listfac_ok;
+        listfac_max = nbnxn_gpu_listfac_max;
+    }
+    else
+    {
+        listfac_ok  = nbnxn_cpu_listfac_ok;
+        listfac_max = nbnxn_cpu_listfac_max;
+    }
+
+    nstlist_orig = ir->nstlist;
+    if (nstlist_cmdline > 0)
+    {
+        if (fp)
+        {
+            sprintf(buf, "Getting nstlist=%d from command line option",
+                    nstlist_cmdline);
+        }
+        ir->nstlist = nstlist_cmdline;
+    }
+
+    verletbuf_get_list_setup(TRUE, bGPU, &ls);
+
+    /* Allow rlist to make the list a given factor larger than the list
+     * would be with the reference value for nstlist (10).
+     */
+    nstlist_prev = ir->nstlist;
+    ir->nstlist  = nbnxnReferenceNstlist;
+    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL,
+                            &rlistWithReferenceNstlist);
+    ir->nstlist  = nstlist_prev;
+
+    /* Determine the pair list size increase due to zero interactions */
+    rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
+                                              mtop->natoms/det(box));
+    rlist_ok  = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_ok, oneThird) - rlist_inc;
+    rlist_max = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_max, oneThird) - rlist_inc;
+    if (debug)
+    {
+        fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
+                rlist_inc, rlist_ok, rlist_max);
+    }
+
+    nstlist_prev = nstlist_orig;
+    rlist_prev   = ir->rlist;
+    do
+    {
+        if (nstlist_cmdline <= 0)
+        {
+            ir->nstlist = nstlist_try[nstlist_ind];
+        }
+
+        /* Set the pair-list buffer size in ir */
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        /* Does rlist fit in the box? */
+        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
+        bDD  = TRUE;
+        if (bBox && DOMAINDECOMP(cr))
+        {
+            /* Check if rlist fits in the domain decomposition */
+            if (inputrec2nboundeddim(ir) < DIM)
+            {
+                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+            }
+            copy_mat(box, state_tmp.box);
+            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
+        }
+
+        if (debug)
+        {
+            fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
+                    ir->nstlist, rlist_new, bBox, bDD);
+        }
+
+        bCont = FALSE;
+
+        if (nstlist_cmdline <= 0)
+        {
+            if (bBox && bDD && rlist_new <= rlist_max)
+            {
+                /* Increase nstlist */
+                nstlist_prev = ir->nstlist;
+                rlist_prev   = rlist_new;
+                bCont        = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
+            }
+            else
+            {
+                /* Stick with the previous nstlist */
+                ir->nstlist = nstlist_prev;
+                rlist_new   = rlist_prev;
+                bBox        = TRUE;
+                bDD         = TRUE;
+            }
+        }
+
+        nstlist_ind++;
+    }
+    while (bCont);
+
+    if (!bBox || !bDD)
+    {
+        gmx_warning(!bBox ? box_err : dd_err);
+        if (fp != NULL)
+        {
+            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
+        }
+        ir->nstlist = nstlist_orig;
+    }
+    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+    {
+        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
+                nstlist_orig, ir->nstlist,
+                ir->rlist, rlist_new);
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n\n", buf);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n\n", buf);
+        }
+        ir->rlist     = rlist_new;
+        ir->rlistlong = rlist_new;
+    }
+}
+
+static void prepare_verlet_scheme(FILE                           *fplog,
+                                  t_commrec                      *cr,
+                                  t_inputrec                     *ir,
+                                  int                             nstlist_cmdline,
+                                  const gmx_mtop_t               *mtop,
+                                  matrix                          box,
+                                  gmx_bool                        bUseGPU)
+{
+    /* For NVE simulations, we will retain the initial list buffer */
+    if (EI_DYNAMICS(ir->eI) &&
+        ir->verletbuf_tol > 0 &&
+        !(EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        /* Update the Verlet buffer size for the current run setup */
+        verletbuf_list_setup_t ls;
+        real                   rlist_new;
+
+        /* Here we assume SIMD-enabled kernels are being used. But as currently
+         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+         * and 4x2 gives a larger buffer than 4x4, this is ok.
+         */
+        verletbuf_get_list_setup(TRUE, bUseGPU, &ls);
+
+        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
+
+        if (rlist_new != ir->rlist)
+        {
+            if (fplog != NULL)
+            {
+                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+                        ir->rlist, rlist_new,
+                        ls.cluster_size_i, ls.cluster_size_j);
+            }
+            ir->rlist     = rlist_new;
+            ir->rlistlong = rlist_new;
+        }
+    }
+
+    if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
+    {
+        gmx_fatal(FARGS, "Can not set nstlist without %s",
+                  !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
+    }
+
+    if (EI_DYNAMICS(ir->eI))
+    {
+        /* Set or try nstlist values */
+        increase_nstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, bUseGPU);
+    }
+}
+
+/* Override the value in inputrec with value passed on the command line (if any) */
+static void override_nsteps_cmdline(FILE            *fplog,
+                                    gmx_int64_t      nsteps_cmdline,
+                                    t_inputrec      *ir,
+                                    const t_commrec *cr)
+{
+    assert(ir);
+    assert(cr);
+
+    /* override with anything else than the default -2 */
+    if (nsteps_cmdline > -2)
+    {
+        char sbuf_steps[STEPSTRSIZE];
+        char sbuf_msg[STRLEN];
+
+        ir->nsteps = nsteps_cmdline;
+        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps),
+                    fabs(nsteps_cmdline*ir->delta_t));
+        }
+        else
+        {
+            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
+                    gmx_step_str(nsteps_cmdline, sbuf_steps));
+        }
+
+        md_print_warn(cr, fplog, "%s\n", sbuf_msg);
+    }
+    else if (nsteps_cmdline < -2)
+    {
+        gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %d",
+                  nsteps_cmdline);
+    }
+    /* Do nothing if nsteps_cmdline == -2 */
+}
+
+int mdrunner(gmx_hw_opt_t *hw_opt,
+             FILE *fplog, t_commrec *cr, int nfile,
+             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+             gmx_bool bCompact, int nstglobalcomm,
+             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+             const char *dddlb_opt, real dlb_scale,
+             const char *ddcsx, const char *ddcsy, const char *ddcsz,
+             const char *nbpu_opt, int nstlist_cmdline,
+             gmx_int64_t nsteps_cmdline, int nstepout, int resetstep,
+             int gmx_unused nmultisim, int repl_ex_nst, int repl_ex_nex,
+             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
+             int imdport, unsigned long Flags)
+{
+    gmx_bool                  bForceUseGPU, bTryUseGPU, bRerunMD;
+    t_inputrec               *inputrec;
+    t_state                  *state = NULL;
+    matrix                    box;
+    gmx_ddbox_t               ddbox = {0};
+    int                       npme_major, npme_minor;
+    t_nrnb                   *nrnb;
+    gmx_mtop_t               *mtop          = NULL;
+    t_mdatoms                *mdatoms       = NULL;
+    t_forcerec               *fr            = NULL;
+    t_fcdata                 *fcd           = NULL;
+    real                      ewaldcoeff_q  = 0;
+    real                      ewaldcoeff_lj = 0;
+    struct gmx_pme_t        **pmedata       = NULL;
+    gmx_vsite_t              *vsite         = NULL;
+    gmx_constr_t              constr;
+    int                       nChargePerturbed = -1, nTypePerturbed = 0, status;
+    gmx_wallcycle_t           wcycle;
+    gmx_bool                  bReadEkin;
+    gmx_walltime_accounting_t walltime_accounting = NULL;
+    int                       rc;
+    gmx_int64_t               reset_counters;
+    gmx_edsam_t               ed           = NULL;
+    int                       nthreads_pme = 1;
+    int                       nthreads_pp  = 1;
+    gmx_membed_t              membed       = NULL;
+    gmx_hw_info_t            *hwinfo       = NULL;
+    /* The master rank decides early on bUseGPU and broadcasts this later */
+    gmx_bool                  bUseGPU      = FALSE;
+
+    /* CAUTION: threads may be started later on in this function, so
+       cr doesn't reflect the final parallel state right now */
+    snew(inputrec, 1);
+    snew(mtop, 1);
+
+    if (Flags & MD_APPENDFILES)
+    {
+        fplog = NULL;
+    }
+
+    bRerunMD     = (Flags & MD_RERUN);
+    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
+    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+
+    /* Detect hardware, gather information. This is an operation that is
+     * global for this process (MPI rank). */
+    hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);
+
+    gmx_print_detected_hardware(fplog, cr, hwinfo);
+
+    if (fplog != NULL)
+    {
+        /* Print references after all software/hardware printing */
+        please_cite(fplog, "Abraham2015");
+        please_cite(fplog, "Pall2015");
+        please_cite(fplog, "Pronk2013");
+        please_cite(fplog, "Hess2008b");
+        please_cite(fplog, "Spoel2005a");
+        please_cite(fplog, "Lindahl2001a");
+        please_cite(fplog, "Berendsen95a");
+    }
+
+    snew(state, 1);
+    if (SIMMASTER(cr))
+    {
+        /* Read (nearly) all data required for the simulation */
+        read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, state, NULL, mtop);
+
+        if (inputrec->cutoff_scheme == ecutsVERLET)
+        {
+            /* Here the master rank decides if all ranks will use GPUs */
+            bUseGPU = (hwinfo->gpu_info.n_dev_compatible > 0 ||
+                       getenv("GMX_EMULATE_GPU") != NULL);
+
+            /* TODO add GPU kernels for this and replace this check by:
+             * (bUseGPU && (ir->vdwtype == evdwPME &&
+             *               ir->ljpme_combination_rule == eljpmeLB))
+             * update the message text and the content of nbnxn_acceleration_supported.
+             */
+            if (bUseGPU &&
+                !nbnxn_gpu_acceleration_supported(fplog, cr, inputrec, bRerunMD))
+            {
+                /* Fallback message printed by nbnxn_acceleration_supported */
+                if (bForceUseGPU)
+                {
+                    gmx_fatal(FARGS, "GPU acceleration requested, but not supported with the given input settings");
+                }
+                bUseGPU = FALSE;
+            }
+
+            prepare_verlet_scheme(fplog, cr,
+                                  inputrec, nstlist_cmdline, mtop, state->box,
+                                  bUseGPU);
+        }
+        else
+        {
+            if (nstlist_cmdline > 0)
+            {
+                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
+            }
+
+            if (hwinfo->gpu_info.n_dev_compatible > 0)
+            {
+                md_print_warn(cr, fplog,
+                              "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+                              "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n");
+            }
+
+            if (bForceUseGPU)
+            {
+                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
+            }
+
+#ifdef GMX_TARGET_BGQ
+            md_print_warn(cr, fplog,
+                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
+                          "      BlueGene/Q. You will observe better performance from using the\n"
+                          "      Verlet cut-off scheme.\n");
+#endif
+        }
+
+        if (inputrec->eI == eiSD2)
+        {
+            md_print_warn(cr, fplog, "The stochastic dynamics integrator %s is deprecated, since\n"
+                          "it is slower than integrator %s and is slightly less accurate\n"
+                          "with constraints. Use the %s integrator.",
+                          ei_names[inputrec->eI], ei_names[eiSD1], ei_names[eiSD1]);
+        }
+    }
+
+    /* Check and update the hardware options for internal consistency */
+    check_and_update_hw_opt_1(hw_opt, cr);
+
+    /* Early check for externally set process affinity. */
+    gmx_check_thread_affinity_set(fplog, cr,
+                                  hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+
+#ifdef GMX_THREAD_MPI
+    if (SIMMASTER(cr))
+    {
+        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
+        {
+            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
+        }
+
+        /* Since the master knows the cut-off scheme, update hw_opt for this.
+         * This is done later for normal MPI and also once more with tMPI
+         * for all tMPI ranks.
+         */
+        check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+        /* NOW the threads will be started: */
+        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+                                                 hw_opt,
+                                                 inputrec, mtop,
+                                                 cr, fplog, bUseGPU);
+
+        if (hw_opt->nthreads_tmpi > 1)
+        {
+            t_commrec *cr_old       = cr;
+            /* now start the threads. */
+            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
+                                        oenv, bVerbose, bCompact, nstglobalcomm,
+                                        ddxyz, dd_node_order, rdd, rconstr,
+                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+                                        nbpu_opt, nstlist_cmdline,
+                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
+                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+                                        cpt_period, max_hours,
+                                        Flags);
+            /* the main thread continues here with a new cr. We don't deallocate
+               the old cr because other threads may still be reading it. */
+            if (cr == NULL)
+            {
+                gmx_comm("Failed to spawn threads");
+            }
+        }
+    }
+#endif
+    /* END OF CAUTION: cr is now reliable */
+
+    /* g_membed initialisation *
+     * Because we change the mtop, init_membed is called before the init_parallel *
+     * (in case we ever want to make it run in parallel) */
+    if (opt2bSet("-membed", nfile, fnm))
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Initializing membed");
+        }
+        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
+    }
+
+    if (PAR(cr))
+    {
+        /* now broadcast everything to the non-master nodes/threads: */
+        init_parallel(cr, inputrec, mtop);
+
+        /* The master rank decided on the use of GPUs,
+         * broadcast this information to all ranks.
+         */
+        gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
+    }
+
+    if (fplog != NULL)
+    {
+        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+        fprintf(fplog, "\n");
+    }
+
+    /* now make sure the state is initialized and propagated */
+    set_state_entries(state, inputrec);
+
+    /* A parallel command line option consistency check that we can
+       only do after any threads have started. */
+    if (!PAR(cr) &&
+        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
+    {
+        gmx_fatal(FARGS,
+                  "The -dd or -npme option request a parallel simulation, "
+#ifndef GMX_MPI
+                  "but %s was compiled without threads or MPI enabled"
+#else
+#ifdef GMX_THREAD_MPI
+                  "but the number of threads (option -nt) is 1"
+#else
+                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec"
+#endif
+#endif
+                  , output_env_get_program_display_name(oenv)
+                  );
+    }
+
+    if (bRerunMD &&
+        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+    {
+        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+    }
+
+    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
+    }
+
+    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
+    {
+        if (cr->npmenodes > 0)
+        {
+            gmx_fatal_collective(FARGS, cr, NULL,
+                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
+        }
+
+        cr->npmenodes = 0;
+    }
+
+    if (bUseGPU && cr->npmenodes < 0)
+    {
+        /* With GPUs we don't automatically use PME-only ranks. PME ranks can
+         * improve performance with many threads per GPU, since our OpenMP
+         * scaling is bad, but it's difficult to automate the setup.
+         */
+        cr->npmenodes = 0;
+    }
+
+#ifdef GMX_FAHCORE
+    if (MASTER(cr))
+    {
+        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+    }
+#endif
+
+    /* NMR restraints must be initialized before load_checkpoint,
+     * since with time averaging the history is added to t_state.
+     * For proper consistency check we therefore need to extend
+     * t_state here.
+     * So the PME-only nodes (if present) will also initialize
+     * the distance restraints.
+     */
+    snew(fcd, 1);
+
+    /* This needs to be called before read_checkpoint to extend the state */
+    init_disres(fplog, mtop, inputrec, cr, fcd, state, repl_ex_nst > 0);
+
+    init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires),
+                state);
+
+    if (DEFORM(*inputrec))
+    {
+        /* Store the deform reference box before reading the checkpoint */
+        if (SIMMASTER(cr))
+        {
+            copy_mat(state->box, box);
+        }
+        if (PAR(cr))
+        {
+            gmx_bcast(sizeof(box), box, cr);
+        }
+        /* Because we do not have the update struct available yet
+         * in which the reference values should be stored,
+         * we store them temporarily in static variables.
+         * This should be thread safe, since they are only written once
+         * and with identical values.
+         */
+        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+        deform_init_init_step_tpx = inputrec->init_step;
+        copy_mat(box, deform_init_box_tpx);
+        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+    }
+
+    if (opt2bSet("-cpi", nfile, fnm))
+    {
+        /* Check if checkpoint file exists before doing continuation.
+         * This way we can use identical input options for the first and subsequent runs...
+         */
+        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
+        {
+            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+                            cr, ddxyz,
+                            inputrec, state, &bReadEkin,
+                            (Flags & MD_APPENDFILES),
+                            (Flags & MD_APPENDFILESSET));
+
+            if (bReadEkin)
+            {
+                Flags |= MD_READ_EKIN;
+            }
+        }
+    }
+
+    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+    {
+        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr,
+                     Flags, &fplog);
+    }
+
+    /* override nsteps with value from cmdline */
+    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+
+    if (SIMMASTER(cr))
+    {
+        copy_mat(state->box, box);
+    }
+
+    if (PAR(cr))
+    {
+        gmx_bcast(sizeof(box), box, cr);
+    }
+
+    /* Essential dynamics */
+    if (opt2bSet("-ei", nfile, fnm))
+    {
+        /* Open input and output files, allocate space for ED data structure */
+        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
+    }
+
+    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
+                     inputrec->eI == eiNM))
+    {
+        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
+                                           dddlb_opt, dlb_scale,
+                                           ddcsx, ddcsy, ddcsz,
+                                           mtop, inputrec,
+                                           box, state->x,
+                                           &ddbox, &npme_major, &npme_minor);
+
+        make_dd_communicators(fplog, cr, dd_node_order);
+
+        /* Set overallocation to avoid frequent reallocation of arrays */
+        set_over_alloc_dd(TRUE);
+    }
+    else
+    {
+        /* PME, if used, is done on all nodes with 1D decomposition */
+        cr->npmenodes = 0;
+        cr->duty      = (DUTY_PP | DUTY_PME);
+        npme_major    = 1;
+        npme_minor    = 1;
+
+        if (inputrec->ePBC == epbcSCREW)
+        {
+            gmx_fatal(FARGS,
+                      "pbc=%s is only implemented with domain decomposition",
+                      epbc_names[inputrec->ePBC]);
+        }
+    }
+
+    if (PAR(cr))
+    {
+        /* After possible communicator splitting in make_dd_communicators,
+         * we can set up the intra/inter node communication.
+         */
+        gmx_setup_nodecomm(fplog, cr);
+    }
+
+    /* Initialize per-physical-node MPI process/thread ID and counters. */
+    gmx_init_intranode_counters(cr);
+#ifdef GMX_MPI
+    if (MULTISIM(cr))
+    {
+        md_print_info(cr, fplog,
+                      "This is simulation %d out of %d running as a composite GROMACS\n"
+                      "multi-simulation job. Setup for this simulation:\n\n",
+                      cr->ms->sim, cr->ms->nsim);
+    }
+    md_print_info(cr, fplog, "Using %d MPI %s\n",
+                  cr->nnodes,
+#ifdef GMX_THREAD_MPI
+                  cr->nnodes == 1 ? "thread" : "threads"
+#else
+                  cr->nnodes == 1 ? "process" : "processes"
+#endif
+                  );
+    fflush(stderr);
+#endif
+
+    /* Check and update hw_opt for the cut-off scheme */
+    check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
+    /* Check and update hw_opt for the number of MPI ranks */
+    check_and_update_hw_opt_3(hw_opt);
+
+    gmx_omp_nthreads_init(fplog, cr,
+                          hwinfo->nthreads_hw_avail,
+                          hw_opt->nthreads_omp,
+                          hw_opt->nthreads_omp_pme,
+                          (cr->duty & DUTY_PP) == 0,
+                          inputrec->cutoff_scheme == ecutsVERLET);
+
+#ifndef NDEBUG
+    if (integrator[inputrec->eI].func != do_tpi &&
+        inputrec->cutoff_scheme == ecutsVERLET)
+    {
+        gmx_feenableexcept();
+    }
+#endif
+
+    if (bUseGPU)
+    {
+        /* Select GPU id's to use */
+        gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
+                           &hw_opt->gpu_opt);
+    }
+    else
+    {
+        /* Ignore (potentially) manually selected GPUs */
+        hw_opt->gpu_opt.n_dev_use = 0;
+    }
+
+    /* check consistency across ranks of things like SIMD
+     * support and number of GPUs selected */
+    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
+
+    /* Now that we know the setup is consistent, check for efficiency */
+    check_resource_division_efficiency(hwinfo, hw_opt, Flags & MD_NTOMPSET,
+                                       cr, fplog);
+
+    if (DOMAINDECOMP(cr))
+    {
+        /* When we share GPUs over ranks, we need to know this for the DLB */
+        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+    }
+
+    /* Get the number of PP and PME threads.
+       PME: the env variable should be read on only one node to make sure it is
+       identical everywhere.
+     */
+    /* TODO nthreads_pp is only used for pinning threads.
+     * This is a temporary solution until we have a hw topology library.
+     */
+    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
+    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
+
+    if (PAR(cr))
+    {
+        /* Master synchronizes its value of reset_counters with all nodes
+         * including PME only nodes */
+        reset_counters = wcycle_get_reset_counters(wcycle);
+        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+        wcycle_set_reset_counters(wcycle, reset_counters);
+    }
+
+    snew(nrnb, 1);
+    if (cr->duty & DUTY_PP)
+    {
+        bcast_state(cr, state);
+
+        /* Initiate forcerecord */
+        fr          = mk_forcerec();
+        fr->hwinfo  = hwinfo;
+        fr->gpu_opt = &hw_opt->gpu_opt;
+        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
+                      opt2fn("-table", nfile, fnm),
+                      opt2fn("-tabletf", nfile, fnm),
+                      opt2fn("-tablep", nfile, fnm),
+                      opt2fn("-tableb", nfile, fnm),
+                      nbpu_opt,
+                      FALSE,
+                      pforce);
+
+        /* version for PCA_NOT_READ_NODE (see md.c) */
+        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
+           "nofile","nofile","nofile","nofile",FALSE,pforce);
+         */
+
+        /* Initialize QM-MM */
+        if (fr->bQMMM)
+        {
+            init_QMMMrec(cr, mtop, inputrec, fr);
+        }
+
+        /* Initialize the mdatoms structure.
+         * mdatoms is not filled with atom data,
+         * as this cannot be done now with domain decomposition.
+         */
+        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
+
+        /* Initialize the virtual site communication */
+        vsite = init_vsite(mtop, cr, FALSE);
+
+        calc_shifts(box, fr->shift_vec);
+
+        /* With periodic molecules the charge groups should be whole at start up
+         * and the virtual sites should not be far from their proper positions.
+         */
+        if (!inputrec->bContinuation && MASTER(cr) &&
+            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        {
+            /* Make molecules whole at start of run */
+            if (fr->ePBC != epbcNONE)
+            {
+                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
+            }
+            if (vsite)
+            {
+                /* Correct initial vsite positions are required
+                 * for the initial distribution in the domain decomposition
+                 * and for the initial shell prediction.
+                 */
+                construct_vsites_mtop(vsite, mtop, state->x);
+            }
+        }
+
+        if (EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))
+        {
+            ewaldcoeff_q  = fr->ewaldcoeff_q;
+            ewaldcoeff_lj = fr->ewaldcoeff_lj;
+            pmedata       = &fr->pmedata;
+        }
+        else
+        {
+            pmedata = NULL;
+        }
+    }
+    else
+    {
+        /* This is a PME only node */
+
+        /* We don't need the state */
+        done_state(state);
+
+        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
+        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
+        snew(pmedata, 1);
+    }
+
+    if (hw_opt->thread_affinity != threadaffOFF)
+    {
+        /* Before setting affinity, check whether the affinity has changed
+         * (which indicates that probably the OpenMP library has changed it
+         * since we first checked).
+         */
+        gmx_check_thread_affinity_set(fplog, cr,
+                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+        /* Set the CPU affinity */
+        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
+    }
+
+    /* Initiate PME if necessary,
+     * either on all nodes or on dedicated PME nodes only. */
+    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
+    {
+        if (mdatoms)
+        {
+            nChargePerturbed = mdatoms->nChargePerturbed;
+            if (EVDW_PME(inputrec->vdwtype))
+            {
+                nTypePerturbed   = mdatoms->nTypePerturbed;
+            }
+        }
+        if (cr->npmenodes > 0)
+        {
+            /* The PME-only nodes need to know nChargePerturbed (FEP on Q) and nTypePerturbed (FEP on LJ) */
+            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+        }
+
+        if (cr->duty & DUTY_PME)
+        {
+            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
+                                  mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
+                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
+            if (status != 0)
+            {
+                gmx_fatal(FARGS, "Error %d initializing PME", status);
+            }
+        }
+    }
+
+
+    if (integrator[inputrec->eI].func == do_md)
+    {
+        /* Turn on signal handling on all nodes */
+        /*
+         * A user signal from the PME nodes (if any)
+         * is communicated to the PP nodes.
+         */
+        signal_handler_install();
+    }
+
+    if (cr->duty & DUTY_PP)
+    {
+        /* Assumes uniform use of the number of OpenMP threads */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+        if (inputrec->bPull)
+        {
+            /* Initialize pull code */
+            inputrec->pull_work =
+                init_pull(fplog, inputrec->pull, inputrec, nfile, fnm,
+                          mtop, cr, oenv, inputrec->fepvals->init_lambda,
+                          EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
+        }
+
+        if (inputrec->bRot)
+        {
+            /* Initialize enforced rotation code */
+            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
+                     bVerbose, Flags);
+        }
+
+        if (inputrec->eSwapCoords != eswapNO)
+        {
+            /* Initialize ion swapping code */
+            init_swapcoords(fplog, bVerbose, inputrec, opt2fn_master("-swap", nfile, fnm, cr),
+                            mtop, state->x, state->box, &state->swapstate, cr, oenv, Flags);
+        }
+
+        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
+
+        if (DOMAINDECOMP(cr))
+        {
+            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
+            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
+                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);
+
+            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);
+
+            setup_dd_grid(fplog, cr->dd);
+        }
+
+        /* Now do whatever the user wants us to do (how flexible...) */
+        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
+                                      oenv, bVerbose, bCompact,
+                                      nstglobalcomm,
+                                      vsite, constr,
+                                      nstepout, inputrec, mtop,
+                                      fcd, state,
+                                      mdatoms, nrnb, wcycle, ed, fr,
+                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                                      membed,
+                                      cpt_period, max_hours,
+                                      imdport,
+                                      Flags,
+                                      walltime_accounting);
+
+        if (inputrec->bPull)
+        {
+            finish_pull(inputrec->pull_work);
+        }
+
+        if (inputrec->bRot)
+        {
+            finish_rot(inputrec->rot);
+        }
+
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
+        /* do PME only */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff_q, ewaldcoeff_lj, inputrec);
+    }
+
+    wallcycle_stop(wcycle, ewcRUN);
+
+    /* Finish up, write some stuff
+     * if rerunMD, don't write last frame again
+     */
+    finish_run(fplog, cr,
+               inputrec, nrnb, wcycle, walltime_accounting,
+               fr ? fr->nbv : NULL,
+               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+
+
+    /* Free GPU memory and context */
+    free_gpu_resources(fr, cr, &hwinfo->gpu_info, fr ? fr->gpu_opt : NULL);
+
+    if (opt2bSet("-membed", nfile, fnm))
+    {
+        sfree(membed);
+    }
+
+    gmx_hardware_info_free(hwinfo);
+
+    /* Does what it says */
+    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
+    walltime_accounting_destroy(walltime_accounting);
+
+    /* Close logfile already here if we were appending to it */
+    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+    {
+        gmx_log_close(fplog);
+    }
+
+    rc = (int)gmx_get_stop_condition();
+
+    done_ed(&ed);
+
+#ifdef GMX_THREAD_MPI
+    /* we need to join all threads. The sub-threads join when they
+       exit this function, but the master thread needs to be told to
+       wait for that. */
+    if (PAR(cr) && MASTER(cr))
+    {
+        tMPI_Finalize();
+    }
+#endif
+
+    return rc;
+}
diff --git a/regtest/analysis/rt-pca-2/Makefile b/regtest/analysis/rt-pca-2/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..3703b27cea227aa053fb6d1d73f861e4384dbcee
--- /dev/null
+++ b/regtest/analysis/rt-pca-2/Makefile
@@ -0,0 +1 @@
+include ../../scripts/test.make
diff --git a/regtest/analysis/rt-pca-2/colvar.in b/regtest/analysis/rt-pca-2/colvar.in
new file mode 100644
index 0000000000000000000000000000000000000000..3c1a6d4ff8bbca15ba5cc7bc12587585b89d3cbb
--- /dev/null
+++ b/regtest/analysis/rt-pca-2/colvar.in
@@ -0,0 +1,1001 @@
+#! FIELDS time cv1 cv2
+0 0.775540714036 0.114424044242
+1 0.875241253264 1.26867389701
+2 1.5823516279 0.605481219611
+3 -1.96997844317 -3.19169418421
+4 1.46732571381 0.768488126284
+5 0.666346341751 -0.470501561902
+6 0.860037532851 -0.407265735903
+7 0.356107752262 0.578949081633
+8 1.29856933615 0.877672309318
+9 -1.91795493292 -1.06303227781
+10 0.503769991835 -1.60084848817
+11 0.654256109546 0.897859999363
+12 1.02289470749 0.298766057397
+13 0.513653990173 -0.828578832591
+14 0.728689851987 0.386875652092
+15 0.897535288431 -0.102913855511
+16 3.05937675092 2.06078844086
+17 -0.250090711503 0.97082781137
+18 0.767548438193 0.9917571357
+19 3.04721203027 1.95327665343
+20 0.883247152601 1.42758236937
+21 -0.0440969219802 0.175247477206
+22 -0.584719212221 -1.27184824226
+23 3.41759109718 0.796329037698
+24 3.13938674369 1.26821644297
+25 1.2190377829 0.103134012441
+26 1.0466239386 0.986404889943
+27 0.180082423161 -1.47317617169
+28 1.70132536395 1.57912581146
+29 -1.67316758635 0.457524118724
+30 0.419583577658 0.419002131786
+31 2.08908425542 1.74820816361
+32 0.365257968572 1.96793840588
+33 -0.0497651715967 0.381983401933
+34 -0.236074748852 -0.187896968959
+35 1.55492737247 -0.439098734545
+36 1.5383684657 1.17171100139
+37 -0.139897090906 -0.23738519017
+38 0.770989321462 -0.938496396415
+39 0.451017119855 0.842232510705
+40 1.91640601869 0.832057382667
+41 -0.594572922964 -0.733158693988
+42 0.444401447116 0.244228800796
+43 1.38590200976 1.83739003623
+44 0.663913596737 0.127638758917
+45 -0.500647526898 -1.6570969453
+46 0.706239344415 2.05086674122
+47 0.578127400432 0.787921845244
+48 2.2275525669 0.368161450454
+49 0.906842668518 0.95581995319
+50 -1.22571853845 -1.32136949323
+51 -0.275089607594 -0.308775809186
+52 0.432543013028 0.352939734773
+53 -0.265613356155 0.685579682129
+54 -0.026500469991 -1.46371851512
+55 -2.25635311888 -0.159184803471
+56 -1.05132726639 -2.40098362963
+57 2.89640541158 0.547566861808
+58 -1.14344481238 -1.68926858478
+59 1.45238235659 1.25365664484
+60 3.08497160853 1.7120033581
+61 1.65205641262 0.0870512387052
+62 1.96225185285 1.2023227004
+63 2.12967824398 1.54646668897
+64 -1.05958054818 -1.64242634245
+65 1.58963629316 1.12989953507
+66 1.32938987432 0.0211976858831
+67 0.802707269487 -0.735851517591
+68 1.44968642067 -1.12289769058
+69 1.59035697882 2.28786735322
+70 2.53014104202 1.24813646371
+71 -1.53570128588 -1.92600970779
+72 1.08745452242 -1.27355083636
+73 1.30117182562 0.787026441902
+74 -0.77975601089 -0.330799732548
+75 0.409944088865 -0.619570646354
+76 0.271191647328 -0.327035449609
+77 1.55707575036 1.27106691454
+78 -0.29521951328 -0.918247433415
+79 0.738422452017 -0.0639102857288
+80 1.75945417231 -0.806567559302
+81 1.30944276799 0.490682166354
+82 -1.32208025015 -1.9741189554
+83 -0.983868566041 -0.675037269068
+84 -2.41155796614 0.0106880240482
+85 1.59584562215 1.04628176079
+86 1.24299574254 -0.156556771096
+87 -0.349106141641 0.343122029732
+88 3.42302650979 1.83306682365
+89 2.09504205893 0.715062412482
+90 1.3864990786 -0.848122023448
+91 0.114438193308 0.838407668496
+92 0.359234528506 0.473327864691
+93 2.72262347992 2.13341275326
+94 1.29480801872 0.223921347775
+95 -1.44095653035 -2.78622391763
+96 1.82885082976 0.465991778531
+97 0.0650202816573 -0.52176986029
+98 1.4826449428 -0.531661924194
+99 2.91669366438 1.1533975579
+100 -2.27351051187 -3.54020281492
+101 -0.934723496529 1.00579340162
+102 0.373440323787 1.80485887534
+103 0.0703649761202 0.831433338352
+104 2.81895503432 0.0675261343631
+105 1.00076332094 -0.426258776056
+106 1.48402135963 2.19476881944
+107 2.88990012631 2.88047328603
+108 1.70043111576 -0.0604897530187
+109 3.91193812988 2.41620634795
+110 2.86834529622 2.32988866475
+111 1.72788726525 2.59407176217
+112 2.37246624358 0.930990165063
+113 0.147200669425 -1.81465080644
+114 0.263554041463 2.1627959981
+115 0.754048313019 1.46810596294
+116 1.59864025054 0.140166238336
+117 -0.695358343263 -0.881988413021
+118 3.3029529571 1.36885697998
+119 -0.023437088948 0.82956505666
+120 1.56563256871 1.01221804778
+121 0.0629390870554 -0.257050993644
+122 2.36875663895 1.15524606522
+123 2.11926689893 2.18682127227
+124 1.37724243614 0.618075497951
+125 1.16567400959 2.42530403584
+126 0.612519978649 -0.21893524974
+127 -0.35248272377 -0.0975479513445
+128 1.01140013077 0.342684499755
+129 1.62422907251 1.75542981328
+130 0.0914719882537 -0.910093617506
+131 2.69028058582 1.90389142139
+132 0.519176583588 -1.04607407774
+133 1.91520164602 -0.115862015663
+134 -0.414102159498 -1.23710080445
+135 1.86430676851 -0.0376822598894
+136 3.63132905991 2.46736311543
+137 1.24714839944 1.11904957832
+138 1.7283696712 1.1531693112
+139 -0.115520894051 -1.80600638047
+140 1.57671760911 2.99955859278
+141 2.12997222413 1.78612083307
+142 0.111351590629 0.405688932784
+143 -0.0645354718478 0.841667853091
+144 0.217233517861 0.350182424433
+145 -0.0146872312086 0.323894416193
+146 0.0394414239506 -0.946620767282
+147 -0.411764311248 -0.311196782931
+148 -0.863345411069 -1.01756205589
+149 0.684865768077 -0.367923621622
+150 0.21353493243 -0.0956576402036
+151 1.50416321319 1.14379539237
+152 2.88703332958 1.31051490726
+153 -0.960338043351 -1.66814020872
+154 -0.492284025077 0.0832615150324
+155 -0.928634693707 -1.16821407272
+156 -2.59191513331 -1.28544171723
+157 0.515602333124 0.272054532867
+158 4.25028349028 3.507667131
+159 0.0594299650903 0.818673971616
+160 2.63074099874 2.03620436428
+161 -1.0640535519 -1.92207789074
+162 3.1125293201 1.54259626731
+163 -0.49110917227 -2.17484935256
+164 1.70815957187 -0.100510436263
+165 0.682852254931 0.749829411678
+166 -0.681280359108 1.31300620317
+167 1.61670909287 0.784707727399
+168 1.14411540609 -0.0348052682116
+169 -0.0395965296065 0.142132112801
+170 0.983407074316 0.106888185229
+171 1.42370381706 -0.106199787771
+172 1.90055022213 1.45027686125
+173 4.6825676146 4.11270684163
+174 0.967944326868 1.29911582792
+175 1.62626113517 1.01357338884
+176 0.0250152151991 -0.635262684752
+177 2.70172666648 0.888489811227
+178 0.118916736713 0.413707288397
+179 -1.47129462582 -0.906153358896
+180 -0.929868777077 -0.29650049668
+181 0.713734440172 1.38545704433
+182 0.766609751549 0.599607761343
+183 0.755861919229 0.281242589585
+184 2.1581814869 1.52885299632
+185 1.54265937091 -0.996915709267
+186 -0.780607032956 -0.826969158462
+187 1.90833389883 0.457936075481
+188 0.74029790498 0.94103411897
+189 0.751299096605 -1.02883865917
+190 0.30213713357 -0.647657246138
+191 2.95921076522 0.694306578014
+192 3.00214453898 4.30361069765
+193 1.54660838906 2.3341555745
+194 0.340172194022 -0.0671323466137
+195 -0.874633904713 -1.07707508057
+196 4.04204484772 3.28872412068
+197 0.806915838458 -0.889212545126
+198 1.46230502674 0.950701059737
+199 2.13747966684 0.455682207011
+200 3.55919802275 2.51430238161
+201 -0.0144887733885 0.183480222719
+202 0.0324546538771 -0.72689453247
+203 3.40995509673 1.28101054183
+204 -0.763179246749 -0.492505206305
+205 1.96493800998 0.796615448806
+206 -0.704944271664 -1.27478887289
+207 0.690556750936 -0.708097082348
+208 0.773782607187 0.875528859869
+209 1.52765916482 0.0635660795492
+210 0.845377777251 0.203497019752
+211 -0.715523545915 0.589160981226
+212 1.51612287539 0.971050552539
+213 1.89593266002 -1.37143406738
+214 -1.4741800175 -1.16928816375
+215 -0.0647886701305 0.550125130952
+216 0.697289652694 -0.157679017555
+217 0.0573342236867 -0.819722294706
+218 0.799558305331 1.24347126912
+219 -2.05302682887 -1.29844246328
+220 -0.733310704173 0.242551002071
+221 2.12169970999 0.457606862708
+222 -0.0869489875983 -0.888846788791
+223 1.16539020143 -0.0506299817686
+224 2.04883289186 2.03628119297
+225 2.05378286147 1.96428637793
+226 0.352549173391 -0.736571699667
+227 -0.0958917366564 0.0130708628444
+228 0.869629669038 0.496611243565
+229 -0.0343217404299 1.63218101958
+230 0.523833306155 1.98300708493
+231 2.14575171275 2.1022971365
+232 1.39241877451 1.84694428478
+233 0.274934047958 0.853363558047
+234 -1.08474602359 -1.54638874523
+235 0.70245904624 0.293021841089
+236 2.59761330718 1.77132101732
+237 1.86539692206 0.218117830318
+238 0.249393244623 -1.1544026902
+239 0.530293489218 1.0598013229
+240 0.452134860455 -0.303388650397
+241 0.131995203315 1.26492684015
+242 1.78184112625 1.66568150653
+243 0.505786940864 -1.92771995867
+244 1.23278487459 -0.676747348331
+245 2.4462262619 0.316113766592
+246 1.86667900797 -0.206230891226
+247 0.0698985002469 -0.0745302107041
+248 -1.0493115203 -1.73601638326
+249 1.32908950397 -0.480643862775
+250 2.584215445 -1.40886078758
+251 0.0934909066951 -0.213251283186
+252 0.81161839983 0.199604526541
+253 3.26141584822 3.49545625188
+254 -0.0225541598693 -1.08322805373
+255 2.84096921521 2.73595573635
+256 0.951381899166 -0.012846413267
+257 0.289076285186 0.296471327373
+258 0.201232559406 -0.284346060192
+259 0.259305343806 0.0229188879366
+260 0.903024005931 0.403651085692
+261 -0.136681526264 -0.418468534663
+262 1.83514473355 1.01072132381
+263 -0.236933132897 0.203188325015
+264 -1.42682510598 -0.0604871537797
+265 1.0990673161 0.549232799254
+266 1.00717608158 1.11046306959
+267 1.70528012477 1.315990044
+268 1.52158082967 1.70578848698
+269 1.75839111063 -0.0778391452926
+270 -0.671478012149 -1.12574361467
+271 0.365565862885 0.160546255789
+272 1.50215652036 1.44263078376
+273 -0.887036436076 -1.03585563721
+274 2.21695335334 3.81710888486
+275 1.13684346944 0.215697352326
+276 2.52481319476 2.55195149733
+277 1.13953692191 1.06636430846
+278 2.38514883977 0.762612566466
+279 -0.687484433714 -0.319608372001
+280 0.63593618339 -1.24079753909
+281 0.597706891687 0.258529950414
+282 4.01619056311 3.03951102339
+283 0.550505169525 -0.0688245535789
+284 4.28776205841 2.61114736721
+285 1.51197738334 0.0264910003122
+286 2.60870913293 2.13476564203
+287 1.61813007624 0.147874947905
+288 2.29463984573 0.778094612308
+289 -0.682858381366 -0.325895661707
+290 0.306487376882 1.59827744577
+291 2.4888497043 1.00790786632
+292 -0.359365104705 -1.05866143453
+293 0.581951540681 -0.455290021178
+294 0.714536330703 0.197380961064
+295 2.30643681046 0.691396150557
+296 1.04922924132 0.650134453588
+297 2.43584493409 1.16613113311
+298 2.64308549025 2.44286689195
+299 2.61152475015 1.21441329604
+300 1.9218105291 1.23289871385
+301 0.0948256352664 -1.75567976615
+302 -1.36984642979 -2.73882963234
+303 1.73787895669 -0.193187577553
+304 1.39168118344 -0.295159570309
+305 0.431920647433 -0.618871013126
+306 1.8344545771 0.214550409659
+307 3.66171946394 0.655669486125
+308 1.71687723612 1.77025752681
+309 0.684362512946 1.95018476561
+310 1.84432520827 2.43309311701
+311 1.1549195534 0.298463033981
+312 2.80056988796 1.89346749203
+313 0.466838169967 -1.22778215326
+314 0.186882003361 0.516035379124
+315 -0.0188114444643 -0.385504513194
+316 2.29633686482 0.763966204079
+317 0.185988474554 0.26290054327
+318 2.44497169506 3.67677629428
+319 -1.15246316348 -0.244220510524
+320 0.379827367145 0.721125265413
+321 -0.419585909823 -1.57231431368
+322 1.59984981343 0.212070869
+323 2.20663417997 2.39787858999
+324 0.599290451168 -0.426016374591
+325 -0.462885658275 -1.72029218
+326 0.895693633802 0.26861909088
+327 4.08522032128 1.78769715523
+328 2.47036356352 1.68808652941
+329 0.661135471044 3.07792849049
+330 2.39914987553 4.03450837073
+331 -0.902698794621 -0.0395743503755
+332 -0.234888091242 0.429935582538
+333 -1.24229668302 -0.891571315585
+334 2.38682363243 0.952680035706
+335 0.778988620514 -1.04764437369
+336 0.0290530737073 1.00037697255
+337 1.82948502843 -0.973601280486
+338 -0.255441771728 -0.0501472018501
+339 2.67981170401 2.00559795819
+340 2.82370753257 1.8894652539
+341 -0.727352458306 -3.34205907279
+342 0.0399296755205 0.27238308227
+343 1.71565955182 1.34259391332
+344 2.29197898238 2.50454593216
+345 -0.34239303414 -2.04174483763
+346 0.225520446188 -1.07967124149
+347 -0.174673652103 -1.37336504242
+348 -0.977031664524 0.272318038884
+349 1.87134448544 1.32560334521
+350 1.15539065529 -0.830321942086
+351 0.0291670744791 -2.81384199247
+352 2.98864734291 2.55108846992
+353 0.331970195453 -0.0610323768683
+354 3.03755118164 2.60885420458
+355 1.99226624173 0.99676428002
+356 2.49564151498 2.66230277701
+357 3.53016735288 5.00087780226
+358 0.644262344443 -0.271604962087
+359 0.464189914596 0.576690004603
+360 -1.36138805793 -1.32071875517
+361 1.29463133072 -0.391123429176
+362 3.1874582637 2.62609357278
+363 3.36561298952 0.760355037001
+364 2.8239778861 -0.0144977767168
+365 -0.805577270066 -1.65749263448
+366 1.77114879553 1.70910899824
+367 1.28974044756 1.78068487179
+368 2.29139078531 1.21095725493
+369 0.736334280684 -0.22582516837
+370 2.92700684628 0.184354011636
+371 3.09600365675 3.15873192246
+372 1.99012314199 2.03230399847
+373 2.27876026089 1.52105851734
+374 0.243532378114 -0.664500626984
+375 2.13394143755 -0.445420021546
+376 0.682435215644 0.667472063454
+377 -0.174154686481 1.1077888367
+378 0.245848877399 0.752188547185
+379 -1.95414834123 -1.93478329794
+380 0.251503935402 0.673738731109
+381 1.54317732422 -0.0307492473901
+382 0.744299213055 -0.0679565163054
+383 -0.395810359248 -0.905837428034
+384 -0.92708482271 -0.231525221758
+385 3.45637818675 1.99712897273
+386 -0.624102147725 -1.05414151336
+387 1.84520828541 1.37325304257
+388 0.843144657515 2.80414378921
+389 1.92235471974 0.381194425255
+390 0.410698414571 1.13648779995
+391 3.46436056176 2.71531155103
+392 -0.346913750807 0.0814298700551
+393 0.364803420117 1.57444479004
+394 1.22972839265 0.0854847505351
+395 -1.96011721992 -2.11998060065
+396 -1.13613897562 -0.248986899577
+397 -0.926968731246 -0.774034703661
+398 2.93061369164 3.76228736345
+399 0.96993588573 1.02171222345
+400 -0.534853957736 1.94888525811
+401 0.207986390589 -0.258386537038
+402 1.21754239578 -0.622209647106
+403 3.28534669792 1.45763349464
+404 2.57696834036 0.041225437598
+405 1.66756917118 0.965454385545
+406 0.877947610383 1.11357709835
+407 -2.2692456298 -1.83432562541
+408 1.85945181218 -0.40660686344
+409 -0.345670495056 -0.286242232426
+410 0.19138792386 0.703556123658
+411 4.25313428896 2.57935763904
+412 1.64211901151 1.44565939308
+413 1.71346169688 0.141115442134
+414 1.0514561521 -0.575111556836
+415 0.644952149657 -0.516172606824
+416 0.60506162459 -0.806113441249
+417 2.82764297873 1.50021468229
+418 -0.762532449738 -1.39823173826
+419 1.43953479773 0.43759344655
+420 1.34824576782 -0.200239271406
+421 1.51143952548 2.14181366538
+422 0.483653577551 1.24884626406
+423 0.862375521711 -0.570754184564
+424 -0.82201458081 -2.07352531411
+425 -0.371354712305 0.325447869576
+426 0.705807360286 1.34887472459
+427 1.80620156233 0.103760291976
+428 -0.0274758346697 -0.0290690802303
+429 2.60629663962 2.10933530411
+430 1.84391671156 2.26838590812
+431 -0.396012416076 -0.62002102807
+432 1.19240643007 2.51716590762
+433 0.13990422317 -0.299778153456
+434 3.32238171317 0.480689006118
+435 0.568193328783 -0.501474047425
+436 0.78021865519 0.999992150985
+437 2.83729819379 0.33413407313
+438 1.62920157569 0.176620377991
+439 1.03258924164 0.0290451161013
+440 1.34686580298 0.647663294761
+441 1.82677101754 0.499687315021
+442 2.79261671698 1.92174825068
+443 0.873279177187 0.810097275589
+444 -1.17134258798 -1.74790954386
+445 2.43554616397 1.43966546244
+446 0.047036205671 1.69195510925
+447 1.15202840653 1.06267393051
+448 0.858636668649 2.75250354782
+449 3.39530736739 1.35297509084
+450 2.1960357862 2.59220931043
+451 -1.25435547981 -1.04701839973
+452 0.587753199293 0.0822737714978
+453 2.46760803988 1.71022042776
+454 1.04072684401 1.11879124654
+455 -0.0218070356976 -0.730600301405
+456 -0.11315979219 -1.31631320683
+457 -0.621332309939 0.275285390091
+458 1.01946475309 1.17782403869
+459 -0.940323720693 -1.48185090181
+460 0.323321328974 1.33081581377
+461 0.429284793469 1.76354059365
+462 3.92528553786 3.26992136254
+463 -0.0897120393174 1.22141796103
+464 2.55876575761 2.755719467
+465 0.851624935406 0.288519968642
+466 -1.15309276021 -1.04478568405
+467 1.66651784418 1.00594295195
+468 1.7757136274 0.826137737493
+469 1.65539339468 1.6101025123
+470 -1.28125059035 0.3599539084
+471 1.5159002697 0.55641244575
+472 1.14402298659 -0.473152923088
+473 1.64364305 2.74534262956
+474 0.882148663566 0.00169098851953
+475 -1.42519755646 -1.61400363179
+476 0.593136177159 -0.304645628883
+477 0.763859990243 -0.423708319046
+478 0.582661864364 1.42354130823
+479 1.52747146156 0.767945115182
+480 -1.10531031111 -0.859119636195
+481 1.85044799071 2.34978938125
+482 1.64812398249 0.719158803895
+483 1.40850004304 1.61896989769
+484 -0.242335192278 0.828057832308
+485 1.78240263841 0.0225420336733
+486 -0.867548653409 -0.705582492794
+487 1.07899857083 -0.481520580757
+488 1.19030356659 -1.08759008376
+489 2.32217880207 0.575578544355
+490 4.50152856693 4.14448214823
+491 -0.549818033247 -0.294384714167
+492 -1.41599664799 -0.691971590616
+493 3.32473542845 3.65436524573
+494 0.319243402948 1.02641745853
+495 2.64602276722 1.72260225656
+496 0.581459350573 0.0875056586848
+497 1.48337545557 1.17593955
+498 1.43791364658 2.20321783934
+499 -0.054951309901 0.617746198574
+500 -2.51524804756 -1.65299793719
+501 0.707706570489 0.238416177496
+502 -0.210507387918 0.561471152066
+503 3.34935005849 1.55626857936
+504 2.25893159947 0.883418788758
+505 -0.551856362605 -1.20940721054
+506 -0.357111657339 -0.541731397327
+507 2.69030398946 2.32539649814
+508 2.76738728951 2.22466542296
+509 -0.0230502241641 -0.656042299005
+510 -0.559643435022 -1.92328237899
+511 3.70631218241 2.46866405907
+512 2.5760832061 0.666310723278
+513 3.34139342964 2.03986043458
+514 -1.19637019354 -0.914641322775
+515 0.913680249488 1.3183361458
+516 2.47553136862 1.54688063319
+517 2.57755716189 1.32148058779
+518 2.69408305654 3.17577759616
+519 -0.132640659626 1.34209836103
+520 0.347079135085 -0.397664145488
+521 -0.149620886717 -0.221275807952
+522 -0.104307961184 -1.96629590581
+523 1.02309047193 1.36510346255
+524 1.53457091702 2.3319960714
+525 1.17223268984 1.49616353811
+526 0.722838162556 -0.533783365431
+527 0.0732270712633 0.147895172698
+528 1.75949109451 1.82673523045
+529 3.71205711579 3.0187327817
+530 2.85598870175 2.58854685483
+531 1.48283278082 1.40508708391
+532 3.70047669102 1.76006423213
+533 0.95723262243 -0.889632371204
+534 -0.620924854674 -0.575456287154
+535 0.771403500439 0.0880467124265
+536 0.252818270631 -1.89798671743
+537 -1.30004750181 -1.67430856051
+538 2.72099994964 -0.0695310563377
+539 0.188298270224 -0.781884521008
+540 -0.731548591235 -0.1512938988
+541 -0.162548527384 -0.106372991796
+542 1.94785012869 2.7657596283
+543 0.181244683279 1.57753466484
+544 1.63483407561 0.901427099611
+545 0.680990798372 1.7030152873
+546 0.176065482226 -0.628176004173
+547 -0.0973629697773 -0.311209903318
+548 2.51647892386 1.46073963097
+549 1.1359491635 1.85307726829
+550 0.501595471304 1.20100862138
+551 3.13513313186 0.867108935057
+552 0.483185477534 -1.20929156422
+553 1.41188481789 0.648962724409
+554 0.848795114658 0.921145817252
+555 0.455717542005 -0.323887951802
+556 1.05739986464 1.92966375801
+557 0.209009551163 -0.134139566032
+558 2.69889928814 1.9781607324
+559 2.7021539095 1.02421957709
+560 -0.0499762852346 1.06698372385
+561 0.954853162243 -0.994572239192
+562 0.862081396971 -0.33928579314
+563 -1.04389260722 0.0625005203764
+564 2.07959809531 1.89522527184
+565 -0.354455731839 -0.459654396053
+566 1.33289002318 -0.258409491406
+567 1.22664362358 -0.538422790006
+568 1.85319098534 2.46894472251
+569 -0.773593443763 0.760304214909
+570 1.26886212145 1.01553396799
+571 0.600288942806 -0.695384004503
+572 -0.558720368167 -0.144849184108
+573 2.56041675691 0.422204406765
+574 0.621531842267 -0.694718260905
+575 1.96381746508 1.37604102859
+576 2.36639403121 0.910706437437
+577 0.745495746681 0.51358312913
+578 1.51790546593 0.794246272017
+579 0.389018556536 1.55586522426
+580 0.433758147035 0.528897074213
+581 -1.55256303032 -2.3754293018
+582 0.184450867512 -0.15905721576
+583 2.54948109095 3.1254875726
+584 1.14828618691 -0.0843092632523
+585 2.33209323664 1.66402366494
+586 -0.0800109018387 -1.75838637268
+587 -1.00844915938 -0.896658080855
+588 2.44823145777 2.14441460999
+589 1.93791785974 1.05446297094
+590 0.699543633024 1.14524105932
+591 2.39983569247 0.404477511844
+592 1.06224023399 1.7535587029
+593 0.449649937011 0.604164300585
+594 2.16381281371 2.52871632163
+595 0.202556124432 0.659684476728
+596 2.04442187104 2.82750827022
+597 1.05163557156 0.793426866761
+598 0.756306843977 0.364035049703
+599 1.29372392653 1.53808833406
+600 0.107768664125 0.229392992968
+601 -0.024931023902 -0.109322429324
+602 -1.31647048544 -1.27102598751
+603 -1.82261536085 -1.53173913738
+604 2.43390637638 0.901979140647
+605 1.18739929034 1.58844755905
+606 0.682575412605 0.942356211445
+607 1.72661132706 1.54228771268
+608 1.09784500572 0.203670021065
+609 0.135745558019 0.342522546785
+610 -1.26983741674 -2.38814247677
+611 3.13900684267 1.88692666516
+612 0.582074615507 -0.397992306911
+613 0.398458058796 0.56804059781
+614 0.46932455806 3.4971371696
+615 0.0189051768259 -0.469971970375
+616 -1.73663542449 -1.82888091158
+617 2.39126944327 1.75614054714
+618 2.330704119 0.497628422207
+619 0.860243338672 -0.173432445718
+620 -0.82922662282 0.0377160091238
+621 0.81323034143 0.903578825373
+622 1.78655855429 2.08634115414
+623 2.53386201781 2.72907842797
+624 1.6595624772 1.68631110703
+625 1.01274922899 -0.924392514626
+626 1.55235360179 1.93135760231
+627 1.26986028743 3.01424240634
+628 -0.0818235126635 0.264194227246
+629 2.34427274803 0.422640768676
+630 0.704434142455 -0.381521808242
+631 -0.080373023695 -1.608530912
+632 2.98137648224 2.22977769074
+633 1.11048720698 1.7384621194
+634 0.473409960962 0.771341610389
+635 1.92848421628 -0.0358306458715
+636 1.27128444858 0.295774496882
+637 0.742460032623 1.00754812606
+638 2.54472403068 -0.117012094572
+639 1.64023134703 0.191446380975
+640 0.108242555083 0.402774596948
+641 1.34129338843 0.875822419311
+642 2.59365469374 1.15739225165
+643 2.38238502982 1.26201370577
+644 3.21605932534 2.25069346915
+645 -0.257191878061 -0.635794769293
+646 1.58120925084 1.93382930661
+647 0.196338196368 1.19819467207
+648 -0.0702928909198 0.814300383707
+649 2.90247651766 1.61171211116
+650 -0.795427816172 -0.334436514489
+651 -0.629413482366 -0.632207380878
+652 -0.144084499749 0.268961248025
+653 -0.218517105267 0.18090866134
+654 1.85504616389 0.53778176331
+655 0.31802429935 1.28324726681
+656 1.71662212823 1.39989361292
+657 1.31079937308 1.75203449282
+658 0.121256610451 -0.739908189767
+659 0.872820977551 0.00786030110003
+660 -2.17112267381 -1.19569712895
+661 1.78386403001 1.68310930688
+662 4.13563491156 3.62949830987
+663 1.55872947975 1.47601648782
+664 -1.30704425209 0.130223272766
+665 1.32529528201 0.849696753453
+666 1.90129032489 0.166695068933
+667 0.076235502795 -0.367090988023
+668 2.19672153365 1.82760888294
+669 0.941627988147 1.37958941224
+670 0.919497036965 0.785202218094
+671 0.771242580381 -1.13983331474
+672 2.06072210209 1.33052071981
+673 1.10244358328 1.57313731438
+674 -0.518351681232 -0.949343060309
+675 -0.285158070242 -1.14517304978
+676 0.412628834613 0.552094092824
+677 1.74530137233 0.777303741217
+678 -1.4224740359 -1.87510635769
+679 1.7987846836 1.13845291772
+680 0.347638099719 -1.12462301637
+681 0.0423545317715 1.11703720277
+682 0.47931520382 1.16174150339
+683 2.5182846848 2.8430338799
+684 -0.53291880548 -0.691754899762
+685 2.00792387483 1.52563103564
+686 1.14757033437 -0.582790146871
+687 1.23890784888 0.26758176499
+688 1.07185137023 -0.251913873566
+689 -0.521422466757 -1.06512000548
+690 -1.16627435725 -1.61838207345
+691 2.62582470673 -0.708647647917
+692 0.927083088507 0.303194726658
+693 1.89149893164 0.871389589822
+694 2.78545761048 2.21607723079
+695 2.24167346642 2.23458216656
+696 1.63806622857 2.24039408661
+697 -1.22283458191 -0.640019667177
+698 0.672886103152 -0.962509253367
+699 0.873633700786 -0.54571343013
+700 -0.802528020974 0.461416284531
+701 1.49223590923 1.74535373436
+702 1.03515292342 0.482168735426
+703 3.06804396866 0.941427966364
+704 0.117450757972 0.999379672059
+705 0.794262222462 -0.576695555004
+706 0.578981865972 0.30219121373
+707 0.781115803816 -0.164624472457
+708 1.23349345717 -0.29006323419
+709 -0.968659832993 0.0187354829897
+710 1.93045915012 0.940992209236
+711 -0.122949763517 0.773706898298
+712 -0.679865989972 -0.803755215284
+713 1.99127854928 2.19108721572
+714 0.770188966715 0.181234133761
+715 1.04894360789 0.629784906484
+716 -1.50777312772 -1.90315356357
+717 2.08072610849 0.241125531301
+718 0.516566417523 0.601677120115
+719 2.48915162628 1.40798804137
+720 -0.0810499763129 -1.49750921116
+721 0.423565134594 1.19052891122
+722 1.01729208108 1.48895714018
+723 1.97395382212 2.37853447506
+724 1.16172158107 0.501419445567
+725 1.74636873968 0.202404415846
+726 2.76098806811 1.24594676229
+727 0.702350501588 3.04336364422
+728 0.381198427648 1.39123779182
+729 2.20063972477 0.253352528355
+730 0.719136996913 0.195250256714
+731 1.69135392323 2.44520086316
+732 1.27488343113 0.428048876532
+733 2.63889134813 1.98492284507
+734 3.65405903105 3.95915048069
+735 0.345857655669 -1.29313018738
+736 1.54513482748 1.49321278305
+737 1.546346945 0.168920489752
+738 0.473136534464 -0.11495000625
+739 2.85501489758 1.51218407092
+740 1.08513832229 -0.297152256315
+741 2.14250508117 1.38144408018
+742 -0.64248501234 0.604219434669
+743 3.1930042928 2.62975785536
+744 2.14268100963 0.790411366877
+745 0.478476679674 1.12517549527
+746 0.881922915425 -0.0105471539547
+747 0.207762867153 -0.354017351578
+748 0.5512203494 0.699690801314
+749 2.53940394049 0.439553899412
+750 0.689095197985 -0.11417235544
+751 0.368738470934 1.39502234069
+752 1.56853514551 -0.0986904654701
+753 1.47725061734 -0.302153692792
+754 3.95431553622 3.32403631499
+755 -0.593928919808 -1.0870508988
+756 2.47617110499 0.208571088906
+757 1.21173003226 1.0169652282
+758 1.79966699581 -0.118926947449
+759 -1.95603723125 -2.66031635992
+760 1.6364408415 0.369168540587
+761 3.68733003563 1.65079487875
+762 0.910651399587 -1.51764411049
+763 0.498347922546 0.864412387165
+764 2.64005783357 1.85007911092
+765 -0.921641622345 -1.68214715581
+766 1.48622970844 0.865787462035
+767 4.10914523665 2.83205801212
+768 1.16894979758 -0.0419499265835
+769 1.10454460668 -0.280781157321
+770 3.09848288591 2.91336556876
+771 0.0205280911318 0.585216428684
+772 2.25195468355 2.47138111513
+773 -0.393785540589 1.31237658657
+774 2.44987615939 1.50871122351
+775 -1.42290539727 -0.543922714866
+776 0.264522068134 -0.393013424724
+777 1.27337073874 0.99285439127
+778 0.284958276437 2.18866703366
+779 -1.35688598311 -1.75831106473
+780 0.883624518491 1.9268230796
+781 0.658539500773 0.308340424885
+782 1.97650319329 1.35358898197
+783 2.78642704003 0.76260729985
+784 0.303003797663 0.609165640586
+785 0.846870288716 1.37905688263
+786 0.439755999879 0.589612157318
+787 0.77369067662 0.663375025549
+788 2.11135316664 1.63859008624
+789 2.48738092959 1.67675770364
+790 -0.147201424882 -0.95510908649
+791 3.11996702224 1.38384976649
+792 -0.949101712279 0.341171178239
+793 -0.49976552277 -0.585618939724
+794 -0.575694326033 -0.998780146074
+795 0.388075696638 -0.499659057571
+796 1.33288157367 0.280856025717
+797 0.182759647577 -0.805879895855
+798 1.04670623936 2.14891308835
+799 3.50164708714 2.26032748567
+800 0.314301935466 -1.2098833698
+801 -0.16303592247 -0.761875177208
+802 0.333932712592 -0.0573072574812
+803 -0.193940291302 0.0813898381062
+804 -0.425833712589 -2.97063981936
+805 0.958612622287 1.37662721098
+806 0.86046295315 -0.227812705577
+807 1.74545789191 1.61407938645
+808 0.17095271104 -0.416615895056
+809 0.414826568371 -1.0751635932
+810 0.179262304392 -0.849578286863
+811 1.36997523195 0.860590139708
+812 0.954451295335 1.56464965595
+813 3.75661089471 2.4074840736
+814 -0.141398924648 -2.31974269819
+815 -0.463086999609 -2.60191803028
+816 3.04971309137 0.981371683431
+817 -0.853289156873 -0.911262472072
+818 0.532007842028 -0.250193592559
+819 1.62731498622 0.735740744244
+820 0.91741152559 1.59476352315
+821 1.78562998584 0.9712274179
+822 1.22284984049 0.787769708278
+823 -0.498598906454 -0.263356381842
+824 3.13959578933 0.950426953037
+825 1.87587066587 1.21642617458
+826 1.15759839629 0.223120257653
+827 2.66375323143 2.21839752102
+828 0.773248041353 -0.741366534795
+829 2.57979307424 1.79543039298
+830 0.955745314921 2.8131791742
+831 -0.73406604712 0.193723639099
+832 1.17838621491 2.2287375112
+833 0.244070473952 0.305975143643
+834 -0.455082520621 -0.445735495187
+835 1.06082269156 0.344762814998
+836 1.38638278887 1.26187721367
+837 0.800177289425 0.589589221976
+838 -0.679473810377 0.149150897477
+839 0.745925905627 -0.00516350633378
+840 -0.63894935311 1.35534736522
+841 0.821658724263 0.831925413546
+842 0.217720386297 0.541045629592
+843 1.38191189765 0.836283402612
+844 -0.938073227963 -1.6416755797
+845 1.7594205273 -0.680239387468
+846 1.32591092739 -0.244877204286
+847 0.100768421929 -1.56796817086
+848 0.844922840698 0.30584695745
+849 -1.58099536056 -2.62627452209
+850 1.76622026799 0.698100700844
+851 1.24214718355 0.965692674522
+852 2.24960634507 2.0170367204
+853 0.374345340997 -0.0627748990537
+854 1.23739327985 0.905745201082
+855 -0.0152489177331 -0.208323033525
+856 0.0412648598923 0.0296510634187
+857 -0.907963876031 0.210880386575
+858 -1.03979333341 0.542812908539
+859 1.30593025654 -0.180957133017
+860 2.33774502024 2.97141976196
+861 -1.00528625619 -0.285059737178
+862 0.336404926328 -0.322909569595
+863 -1.18774788343 -0.248907334337
+864 0.690607293025 1.29529974786
+865 1.83733232837 0.778739706292
+866 2.47233172863 2.85257529304
+867 2.26775201989 1.20141923313
+868 -0.605536648361 -0.853580241723
+869 1.95321163726 1.1069951393
+870 -0.847670557405 -0.0376837670101
+871 2.64481472334 2.37832784947
+872 1.62552294377 0.665024634253
+873 0.909541491134 -0.27001129735
+874 1.76279998016 1.5396929052
+875 1.8863608414 1.64966199293
+876 1.28910788772 -0.679628494136
+877 1.32521922443 0.735643704301
+878 1.94993363559 1.06525179066
+879 -0.56442660719 -1.29379541158
+880 0.592008072002 1.88961991423
+881 1.19303821282 2.3024598091
+882 -0.924424821381 0.376912066102
+883 0.576964556025 0.0790701184954
+884 2.66837357916 1.47118397571
+885 1.39964009578 -1.00292327956
+886 -0.117619929023 -1.20732307332
+887 2.54762431737 2.19768496347
+888 3.25757915342 0.5666592764
+889 0.748920377805 -0.315329662266
+890 -1.34068440362 -2.24910172069
+891 -0.505745582142 -1.42927801828
+892 1.20211089992 -0.0881234698419
+893 1.78394851583 1.09896641282
+894 2.88490147738 0.0272246780127
+895 1.36697230567 -0.0805015655804
+896 -0.84193533413 0.254075787589
+897 0.5762762599 -0.589706236766
+898 0.698698198074 0.144302062831
+899 1.00395105878 0.0828954089461
+900 0.283279737141 0.0461786274254
+901 0.336761658583 -0.519875647444
+902 2.98398502725 0.897923709223
+903 1.50054355717 1.34169503427
+904 2.72336962172 1.0934303171
+905 1.04386747176 -0.259867944844
+906 2.26636034462 2.34290355893
+907 1.06783566934 -0.271804958053
+908 0.870769551483 -0.565458859058
+909 -0.355020161729 -0.0465272569999
+910 1.47792780553 2.20457016093
+911 2.3213067274 3.48508785754
+912 2.22647376583 1.60059264483
+913 -2.56553794385 -1.50609738648
+914 -1.86276295813 -0.162042653801
+915 0.996847460844 1.04251234985
+916 0.237811244321 -1.40828452583
+917 2.29532044329 2.26795430785
+918 -1.42058277927 -0.139976660562
+919 -1.14982270238 -0.940401009527
+920 1.29488330811 0.625097008703
+921 1.75669056948 0.774900325832
+922 1.58638974075 1.62323828463
+923 2.24184259658 1.66172380299
+924 1.25555724841 0.703039346719
+925 -0.375004313818 -0.287342136508
+926 1.15123529 2.32815302898
+927 1.04939658961 1.57645783759
+928 -0.111237564066 -0.575666981136
+929 1.11648290176 0.0839548085496
+930 0.896401504196 -0.591658487194
+931 -1.54828369019 -2.09130174917
+932 -2.10210444811 -1.00953307381
+933 0.578744043015 1.06812809264
+934 1.9031494339 2.58891974662
+935 0.650688410389 -0.333344575395
+936 1.45490884712 -1.07890749401
+937 -2.29271663116 -1.02590301386
+938 -0.553520580323 -0.876623533905
+939 0.295435790386 0.154151106451
+940 0.739552197854 -1.45056938461
+941 0.550596269191 -0.211490309288
+942 2.06617139015 1.68357498934
+943 1.37009335914 0.248966043502
+944 -0.779723180082 1.04185667777
+945 0.657346131671 0.395013846374
+946 1.35793985771 1.23005653641
+947 0.346675905403 0.350476402606
+948 1.90143214681 -1.15790392208
+949 1.53755048835 1.16322562472
+950 1.79676676874 0.215953081403
+951 -0.653794703493 -1.25304663149
+952 3.22998948681 2.83965358638
+953 1.38852441013 1.0349202775
+954 0.6407225295 -0.148904121116
+955 4.85000442842 2.46644798213
+956 -0.571640043081 0.243693414108
+957 -0.854429704873 -1.20906016632
+958 -0.874078069668 -0.222990486174
+959 2.77410991574 3.23952664504
+960 0.276074094834 0.439327093322
+961 1.54579862229 1.1440556871
+962 2.23366155194 -0.359657105833
+963 2.21934095707 1.79156902781
+964 2.86300126621 1.96212364234
+965 0.00486634868158 0.669181762001
+966 0.539941607917 -0.504574984575
+967 1.1309972889 0.885170985788
+968 1.6692167276 1.33735722262
+969 -0.0137451416733 0.553616395605
+970 2.13229368105 1.28967806398
+971 1.0500030028 0.509911061302
+972 1.5058172904 0.0852260723227
+973 0.325847822973 0.186023614173
+974 -0.245039097176 0.337306470895
+975 2.16487426907 1.32405596035
+976 4.19696845179 1.75410942942
+977 2.06140151241 2.8798802773
+978 2.61222166917 0.851089587134
+979 -0.714530716835 -1.46320620395
+980 1.32899995942 1.55150260174
+981 -0.394233621705 -0.856512866842
+982 1.81059648553 0.714173342419
+983 1.12703896172 1.24851248903
+984 2.95556810589 1.94860146185
+985 1.70187203768 -1.00603168336
+986 3.43828180843 1.80565165287
+987 2.11199388028 1.28652551778
+988 0.335427288053 0.406678273009
+989 -1.72386638965 -1.70394842507
+990 1.22086004787 0.708041438911
+991 -1.03093269869 -0.121198424122
+992 1.14306483245 -0.520134639627
+993 1.21535477617 0.362732243366
+994 0.793234640358 -0.356093010693
+995 2.41023312707 1.5279948071
+996 1.14798568608 0.905693333918
+997 1.13789136769 1.25958660988
+998 1.1508991036 0.767507215707
+999 1.34103725484 1.00348213522
diff --git a/regtest/analysis/rt-pca-2/config b/regtest/analysis/rt-pca-2/config
new file mode 100644
index 0000000000000000000000000000000000000000..58f0b8893460c32a1c9a774c2f8e59d2d9761a08
--- /dev/null
+++ b/regtest/analysis/rt-pca-2/config
@@ -0,0 +1,3 @@
+type=driver
+arg="--plumed plumed.dat --ixyz diala_traj_nm.xyz"
+extra_files="../../trajectories/diala_traj_nm.xyz"
diff --git a/regtest/analysis/rt-pca-2/pca-comp.pdb.reference b/regtest/analysis/rt-pca-2/pca-comp.pdb.reference
new file mode 100644
index 0000000000000000000000000000000000000000..4a7972a92b559a3d72825fe6b206e96622e2a667
--- /dev/null
+++ b/regtest/analysis/rt-pca-2/pca-comp.pdb.reference
@@ -0,0 +1,12 @@
+REMARK TYPE=EUCLIDEAN
+REMARK ARG=c1,c2
+REMARK c1=0.9663   c2=0.4730   
+END
+REMARK TYPE=DIRECTION
+REMARK ARG=c1,c2
+REMARK c1=0.7142   c2=0.6999   
+END
+REMARK TYPE=DIRECTION
+REMARK ARG=c1,c2
+REMARK c1=0.6999   c2=-0.7142  
+END
diff --git a/regtest/analysis/rt-pca-2/plumed.dat b/regtest/analysis/rt-pca-2/plumed.dat
new file mode 100644
index 0000000000000000000000000000000000000000..aa6a39f97daf852438292389b69aa52ef43389da
--- /dev/null
+++ b/regtest/analysis/rt-pca-2/plumed.dat
@@ -0,0 +1,4 @@
+c1: READ FILE=colvar.in VALUES=cv1
+c2: READ FILE=colvar.in VALUES=cv2
+
+PCA METRIC=EUCLIDEAN ARG=c1,c2 USE_ALL_DATA NLOW_DIM=2 OFILE=pca-comp.pdb FMT=%8.4f
diff --git a/regtest/analysis/rt-pca/Makefile b/regtest/analysis/rt-pca/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..3703b27cea227aa053fb6d1d73f861e4384dbcee
--- /dev/null
+++ b/regtest/analysis/rt-pca/Makefile
@@ -0,0 +1 @@
+include ../../scripts/test.make
diff --git a/regtest/analysis/rt-pca/config b/regtest/analysis/rt-pca/config
new file mode 100644
index 0000000000000000000000000000000000000000..58f0b8893460c32a1c9a774c2f8e59d2d9761a08
--- /dev/null
+++ b/regtest/analysis/rt-pca/config
@@ -0,0 +1,3 @@
+type=driver
+arg="--plumed plumed.dat --ixyz diala_traj_nm.xyz"
+extra_files="../../trajectories/diala_traj_nm.xyz"
diff --git a/regtest/analysis/rt-pca/pca-comp.pdb.reference b/regtest/analysis/rt-pca/pca-comp.pdb.reference
new file mode 100644
index 0000000000000000000000000000000000000000..fc22670a085813745bf312a3803ddbea39367c40
--- /dev/null
+++ b/regtest/analysis/rt-pca/pca-comp.pdb.reference
@@ -0,0 +1,72 @@
+REMARK TYPE=OPTIMAL
+ATOM     1  X    RES     0    -3.002  -0.303   1.042  0.05  0.05
+ATOM     2  X    RES     1    -3.375  -0.131   1.046  0.05  0.05
+ATOM     3  X    RES     2    -3.376  -0.402   1.084  0.05  0.05
+ATOM     4  X    RES     3    -3.326  -0.239   1.294  0.05  0.05
+ATOM     5  X    RES     4    -1.625  -0.479   0.681  0.05  0.05
+ATOM     6  X    RES     5    -1.001  -0.681   1.355  0.05  0.05
+ATOM     7  X    RES     6    -1.117  -0.412  -0.407  0.05  0.05
+ATOM     8  X    RES     7    -1.688  -0.225  -0.895  0.05  0.05
+ATOM     9  X    RES     8     0.167  -0.648  -0.993  0.05  0.05
+ATOM    10  X    RES     9     0.264  -1.677  -1.119  0.05  0.05
+ATOM    11  X    RES    10     0.199  -0.014  -2.349  0.05  0.05
+ATOM    12  X    RES    11     0.278   0.101  -2.643  0.05  0.05
+ATOM    13  X    RES    12     0.168   0.098  -2.737  0.05  0.05
+ATOM    14  X    RES    13     0.174   0.229  -2.704  0.05  0.05
+ATOM    15  X    RES    14     1.392  -0.272  -0.203  0.05  0.05
+ATOM    16  X    RES    15     2.259  -0.716  -0.363  0.05  0.05
+ATOM    17  X    RES    16     1.500   0.569   0.661  0.05  0.05
+ATOM    18  X    RES    17     0.762   0.889   0.785  0.05  0.05
+ATOM    19  X    RES    18     2.611   0.994   1.452  0.05  0.05
+ATOM    20  X    RES    19     2.964   1.128   1.574  0.05  0.05
+ATOM    21  X    RES    20     2.843   1.185   1.726  0.05  0.05
+ATOM    22  X    RES    21     2.928   1.006   1.713  0.05  0.05
+END
+REMARK TYPE=DIRECTION
+ATOM     1  X    RES     0     0.354   0.171   0.229  1.00  1.00
+ATOM     2  X    RES     1     0.585   0.567   0.558  1.00  1.00
+ATOM     3  X    RES     2     0.205   0.793   0.037  1.00  1.00
+ATOM     4  X    RES     3     0.320   0.180   0.111  1.00  1.00
+ATOM     5  X    RES     4     0.255  -1.235   0.169  1.00  1.00
+ATOM     6  X    RES     5     0.252  -4.529   0.304  1.00  1.00
+ATOM     7  X    RES     6     0.252   1.215  -0.073  1.00  1.00
+ATOM     8  X    RES     7     0.435   3.800  -0.249  1.00  1.00
+ATOM     9  X    RES     8     0.107   0.313  -0.066  1.00  1.00
+ATOM    10  X    RES     9     0.093   0.161   1.087  1.00  1.00
+ATOM    11  X    RES    10    -0.089  -1.113  -0.746  1.00  1.00
+ATOM    12  X    RES    11     0.480  -3.066  -1.776  1.00  1.00
+ATOM    13  X    RES    12    -2.025  -1.152  -0.684  1.00  1.00
+ATOM    14  X    RES    13     1.185  -0.261  -0.272  1.00  1.00
+ATOM    15  X    RES    14     0.099   1.372  -0.666  1.00  1.00
+ATOM    16  X    RES    15     0.492   3.830  -2.307  1.00  1.00
+ATOM    17  X    RES    16    -0.394  -0.438   0.775  1.00  1.00
+ATOM    18  X    RES    17    -0.608  -2.369   1.984  1.00  1.00
+ATOM    19  X    RES    18    -0.479   0.302   0.455  1.00  1.00
+ATOM    20  X    RES    19    -0.662   0.700   0.477  1.00  1.00
+ATOM    21  X    RES    20    -0.547   0.209   0.491  1.00  1.00
+ATOM    22  X    RES    21    -0.309   0.548   0.162  1.00  1.00
+END
+REMARK TYPE=DIRECTION
+ATOM     1  X    RES     0     0.040  -0.180   0.140  1.00  1.00
+ATOM     2  X    RES     1     0.469  -1.519   2.216  1.00  1.00
+ATOM     3  X    RES     2     0.337   2.166   0.164  1.00  1.00
+ATOM     4  X    RES     3    -0.723  -1.344  -1.892  1.00  1.00
+ATOM     5  X    RES     4     0.011  -0.017   0.057  1.00  1.00
+ATOM     6  X    RES     5     0.044   0.234  -0.008  1.00  1.00
+ATOM     7  X    RES     6    -0.006  -0.149   0.077  1.00  1.00
+ATOM     8  X    RES     7    -0.009  -0.335   0.106  1.00  1.00
+ATOM     9  X    RES     8    -0.007  -0.045   0.002  1.00  1.00
+ATOM    10  X    RES     9     0.008  -0.058  -0.089  1.00  1.00
+ATOM    11  X    RES    10    -0.099   0.067   0.048  1.00  1.00
+ATOM    12  X    RES    11     2.890  -0.426  -0.116  1.00  1.00
+ATOM    13  X    RES    12    -2.066  -1.963  -0.898  1.00  1.00
+ATOM    14  X    RES    13    -1.117   2.728   1.157  1.00  1.00
+ATOM    15  X    RES    14     0.018  -0.063   0.016  1.00  1.00
+ATOM    16  X    RES    15     0.077  -0.275   0.135  1.00  1.00
+ATOM    17  X    RES    16    -0.013   0.160  -0.129  1.00  1.00
+ATOM    18  X    RES    17    -0.037   0.355  -0.233  1.00  1.00
+ATOM    19  X    RES    18     0.046   0.169  -0.200  1.00  1.00
+ATOM    20  X    RES    19     2.278  -3.179  -1.474  1.00  1.00
+ATOM    21  X    RES    20     0.240   3.803  -2.326  1.00  1.00
+ATOM    22  X    RES    21    -2.383  -0.130   3.246  1.00  1.00
+END
diff --git a/regtest/analysis/rt-pca/plumed.dat b/regtest/analysis/rt-pca/plumed.dat
new file mode 100644
index 0000000000000000000000000000000000000000..3bb806c3ad6fc770728daf3c1325763aefb4a55f
--- /dev/null
+++ b/regtest/analysis/rt-pca/plumed.dat
@@ -0,0 +1 @@
+PCA METRIC=OPTIMAL ATOMS=1-22 STRIDE=1 USE_ALL_DATA NLOW_DIM=2 OFILE=pca-comp.pdb
diff --git a/regtest/basic/rt-drmsd/Makefile b/regtest/basic/rt-drmsd/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..3703b27cea227aa053fb6d1d73f861e4384dbcee
--- /dev/null
+++ b/regtest/basic/rt-drmsd/Makefile
@@ -0,0 +1 @@
+include ../../scripts/test.make
diff --git a/regtest/basic/rt-drmsd/colvar.reference b/regtest/basic/rt-drmsd/colvar.reference
new file mode 100644
index 0000000000000000000000000000000000000000..a17ae424695cc365373def1ee8fb3a320b244d39
--- /dev/null
+++ b/regtest/basic/rt-drmsd/colvar.reference
@@ -0,0 +1,6 @@
+#! FIELDS time intradrmsd ff finter ff4
+ 0.000000   1.3885   1.3885   1.6237   1.6237
+ 0.050000   1.3763   1.3763   1.6260   1.6260
+ 0.100000   1.3651   1.3651   1.6224   1.6224
+ 0.150000   1.3731   1.3731   1.5951   1.5951
+ 0.200000   1.4024   1.4024   1.5263   1.5263
diff --git a/regtest/basic/rt-drmsd/config b/regtest/basic/rt-drmsd/config
new file mode 100644
index 0000000000000000000000000000000000000000..4804b7f2a86c1dcb06f52d08f8dff331e318192f
--- /dev/null
+++ b/regtest/basic/rt-drmsd/config
@@ -0,0 +1,4 @@
+type=driver
+# run driver on the shared test trajectory and dump the applied forces for regression checking
+arg="--plumed plumed.dat --trajectory-stride 10 --timestep 0.005 --ixyz trajectory.xyz --dump-forces forces --dump-forces-fmt=%8.4f"
+extra_files="../../trajectories/trajectory.xyz"
diff --git a/regtest/basic/rt-drmsd/plumed.dat b/regtest/basic/rt-drmsd/plumed.dat
new file mode 100644
index 0000000000000000000000000000000000000000..8102c228f0de01c08e2537b0572f4720cffedd95
--- /dev/null
+++ b/regtest/basic/rt-drmsd/plumed.dat
@@ -0,0 +1,15 @@
+intradrmsd: DRMSD TYPE=INTRA-DRMSD REFERENCE=test0.pdb LOWER_CUTOFF=0.0 UPPER_CUTOFF=10.0
+interdrmsd: DRMSD TYPE=INTER-DRMSD REFERENCE=test0.pdb LOWER_CUTOFF=0.0 UPPER_CUTOFF=10.0
+
+drmsd1: DRMSD REFERENCE=test1.pdb LOWER_CUTOFF=0.0 UPPER_CUTOFF=10.0
+drmsd2: DRMSD REFERENCE=test2.pdb LOWER_CUTOFF=0.0 UPPER_CUTOFF=10.0
+drmsd3: DRMSD REFERENCE=test3.pdb LOWER_CUTOFF=0.0 UPPER_CUTOFF=10.0
+
+ff2: COMBINE ARG=drmsd1,drmsd2 POWERS=2,2 COEFFICIENTS=0.66667,0.33333 PERIODIC=NO
+ff: COMBINE ARG=ff2 POWERS=0.5 PERIODIC=NO
+
+ff4: COMBINE ARG=drmsd3,ff2 POWERS=2,1 COEFFICIENTS=1,-0.42857 PERIODIC=NO
+finter: COMBINE ARG=interdrmsd POWERS=2 COEFFICIENTS=0.57143 PERIODIC=NO
+
+PRINT ARG=intradrmsd,ff,finter,ff4 FILE=colvar FMT=%8.4f
+
diff --git a/regtest/basic/rt-drmsd/test0.pdb b/regtest/basic/rt-drmsd/test0.pdb
new file mode 100644
index 0000000000000000000000000000000000000000..c69988c9d8343f23dcf8c004d9652745ed3700d6
--- /dev/null
+++ b/regtest/basic/rt-drmsd/test0.pdb
@@ -0,0 +1,9 @@
+ATOM      2  O   ALA     2      -0.926  -2.447  -0.497  1.00  1.00      DIA  O
+ATOM      4  HNT ALA     2       0.533  -0.396   1.184  1.00  1.00      DIA  H
+ATOM      6  HT1 ALA     2      -0.216  -2.590   1.371  1.00  1.00      DIA  H
+ATOM      7  HT2 ALA     2      -0.309  -1.255   2.315  1.00  1.00      DIA  H
+TER
+ATOM     12  HY3 ALA     2      -0.520   2.679  -1.400  1.00  1.00      DIA  H
+ATOM     14  OY  ALA     2      -1.139   0.931  -0.973  1.00  1.00      DIA  O
+ATOM     16  HN  ALA     2       1.713   1.021  -0.873  1.00  1.00      DIA  H
+TER
diff --git a/regtest/basic/rt-drmsd/test1.pdb b/regtest/basic/rt-drmsd/test1.pdb
new file mode 100644
index 0000000000000000000000000000000000000000..4f899a4bd2275d48b50b861dada9bfb764a18ca1
--- /dev/null
+++ b/regtest/basic/rt-drmsd/test1.pdb
@@ -0,0 +1,4 @@
+ATOM      2  O   ALA     2      -0.926  -2.447  -0.497  1.00  1.00      DIA  O
+ATOM      4  HNT ALA     2       0.533  -0.396   1.184  1.00  1.00      DIA  H
+ATOM      6  HT1 ALA     2      -0.216  -2.590   1.371  1.00  1.00      DIA  H
+ATOM      7  HT2 ALA     2      -0.309  -1.255   2.315  1.00  1.00      DIA  H
diff --git a/regtest/basic/rt-drmsd/test2.pdb b/regtest/basic/rt-drmsd/test2.pdb
new file mode 100644
index 0000000000000000000000000000000000000000..60535a4a96b2e70f0aed47e5c7488a84a10787d2
--- /dev/null
+++ b/regtest/basic/rt-drmsd/test2.pdb
@@ -0,0 +1,4 @@
+ATOM     12  HY3 ALA     2      -0.520   2.679  -1.400  1.00  1.00      DIA  H
+ATOM     14  OY  ALA     2      -1.139   0.931  -0.973  1.00  1.00      DIA  O
+ATOM     16  HN  ALA     2       1.713   1.021  -0.873  1.00  1.00      DIA  H
+TER
diff --git a/regtest/basic/rt-drmsd/test3.pdb b/regtest/basic/rt-drmsd/test3.pdb
new file mode 100644
index 0000000000000000000000000000000000000000..5c470a480d9085cdfa1e62bd49a16311aa5399e6
--- /dev/null
+++ b/regtest/basic/rt-drmsd/test3.pdb
@@ -0,0 +1,8 @@
+ATOM      2  O   ALA     2      -0.926  -2.447  -0.497  1.00  1.00      DIA  O
+ATOM      4  HNT ALA     2       0.533  -0.396   1.184  1.00  1.00      DIA  H
+ATOM      6  HT1 ALA     2      -0.216  -2.590   1.371  1.00  1.00      DIA  H
+ATOM      7  HT2 ALA     2      -0.309  -1.255   2.315  1.00  1.00      DIA  H
+ATOM     12  HY3 ALA     2      -0.520   2.679  -1.400  1.00  1.00      DIA  H
+ATOM     14  OY  ALA     2      -1.139   0.931  -0.973  1.00  1.00      DIA  O
+ATOM     16  HN  ALA     2       1.713   1.021  -0.873  1.00  1.00      DIA  H
+TER
diff --git a/regtest/basic/rt-mpi7/COLVAR.0.reference b/regtest/basic/rt-mpi7/COLVAR.0.reference
new file mode 100644
index 0000000000000000000000000000000000000000..1a5718c4dbbe04d43b649d74bab59466dd83f09f
--- /dev/null
+++ b/regtest/basic/rt-mpi7/COLVAR.0.reference
@@ -0,0 +1,11 @@
+#! FIELDS time d1 d2 meta1.bias meta2.bias restart.bias
+ 0.000000 1.262593 1.097205 0.000000 0.000000 0.810970
+ 0.050000 1.317587 1.058762 0.000000 0.000000 0.886420
+ 0.100000 1.393388 1.095819 0.000000 0.000000 1.059839
+ 0.150000 1.475479 1.162849 0.321330 0.321330 0.867021
+ 0.200000 1.490756 1.216033 0.248441 0.248441 0.701164
+ 0.250000 1.262593 1.097205 0.419631 0.419631 0.810970
+ 0.300000 1.317587 1.058762 0.460037 0.460037 0.886420
+ 0.350000 1.393388 1.095819 0.781166 0.781166 1.059839
+ 0.400000 1.475479 1.162849 0.593033 0.593033 0.867021
+ 0.450000 1.490756 1.216033 0.701164 0.701164 0.701164
diff --git a/regtest/basic/rt-mpi7/COLVAR.1.reference b/regtest/basic/rt-mpi7/COLVAR.1.reference
new file mode 100644
index 0000000000000000000000000000000000000000..05239af624b9f0c5c012c6e545e89777a812b2ae
--- /dev/null
+++ b/regtest/basic/rt-mpi7/COLVAR.1.reference
@@ -0,0 +1,11 @@
+#! FIELDS time d1 d2 meta1.bias meta2.bias restart.bias
+ 0.000000 1.490756 1.216033 0.000000 0.000000 0.608488
+ 0.050000 1.475479 1.162849 0.000000 0.000000 0.777038
+ 0.100000 1.393388 1.095819 0.000000 0.000000 1.015893
+ 0.150000 1.317587 1.058762 0.371518 0.371518 0.884771
+ 0.200000 1.262593 1.097205 0.333056 0.333056 0.817604
+ 0.250000 1.490756 1.216033 0.277303 0.277303 0.608488
+ 0.300000 1.475479 1.162849 0.363420 0.363420 0.777038
+ 0.350000 1.393388 1.095819 0.785000 0.785000 1.015893
+ 0.400000 1.317587 1.058762 0.665250 0.665250 0.884771
+ 0.450000 1.262593 1.097205 0.817604 0.817604 0.817604
diff --git a/regtest/basic/rt-mpi7/COLVAR.2.reference b/regtest/basic/rt-mpi7/COLVAR.2.reference
new file mode 100644
index 0000000000000000000000000000000000000000..7c9e1a2b57e65742301f519f23133a35ebf48517
--- /dev/null
+++ b/regtest/basic/rt-mpi7/COLVAR.2.reference
@@ -0,0 +1,11 @@
+#! FIELDS time d1 d2 meta1.bias meta2.bias restart.bias
+ 0.000000 1.317587 1.058762 0.000000 0.000000 0.748387
+ 0.050000 1.393388 1.095819 0.000000 0.000000 0.903920
+ 0.100000 1.262593 1.097205 0.000000 0.000000 0.696759
+ 0.150000 1.490756 1.216033 0.028881 0.028881 0.633495
+ 0.200000 1.475479 1.162849 0.042140 0.042140 0.769058
+ 0.250000 1.317587 1.058762 0.284901 0.284901 0.748387
+ 0.300000 1.393388 1.095819 0.347138 0.347138 0.903920
+ 0.350000 1.262593 1.097205 0.616792 0.616792 0.696759
+ 0.400000 1.490756 1.216033 0.512368 0.512368 0.633495
+ 0.450000 1.475479 1.162849 0.769058 0.769058 0.769058
diff --git a/regtest/basic/rt-mpi7/grid.0.reference b/regtest/basic/rt-mpi7/grid.0.reference
index eba63d858b83f3706f097b848ffd8a90a827381e..230fe5b5d9fd1b369d3369ef43304441be072aae 100644
--- a/regtest/basic/rt-mpi7/grid.0.reference
+++ b/regtest/basic/rt-mpi7/grid.0.reference
@@ -1,4 +1,4 @@
-#! FIELDS d1 d2 @3.bias der_d1 der_d2
+#! FIELDS d1 d2 meta1.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
diff --git a/regtest/basic/rt-mpi7/grid.1.reference b/regtest/basic/rt-mpi7/grid.1.reference
index f68da103b2e18c7283d0d1de06dbb3659ce1fc01..5fe44963023632d4a3978d386b7c8b046fbe248b 100644
--- a/regtest/basic/rt-mpi7/grid.1.reference
+++ b/regtest/basic/rt-mpi7/grid.1.reference
@@ -1,4 +1,4 @@
-#! FIELDS d1 d2 @3.bias der_d1 der_d2
+#! FIELDS d1 d2 meta1.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
diff --git a/regtest/basic/rt-mpi7/grid.2.reference b/regtest/basic/rt-mpi7/grid.2.reference
index eb0d399a96a6f8fb64d9b5836cbe5d707de4378e..840a36e466c9602667da1adbb964c3cb70b37b1c 100644
--- a/regtest/basic/rt-mpi7/grid.2.reference
+++ b/regtest/basic/rt-mpi7/grid.2.reference
@@ -1,4 +1,4 @@
-#! FIELDS d1 d2 @3.bias der_d1 der_d2
+#! FIELDS d1 d2 meta1.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
diff --git a/regtest/basic/rt-mpi7/gridx.0.reference b/regtest/basic/rt-mpi7/gridx.0.reference
index 1dd64c8f88e986f942022283fe36f2ea32ae979f..624de6d66113cf0d3bd7134cb59cde19de0eab41 100644
--- a/regtest/basic/rt-mpi7/gridx.0.reference
+++ b/regtest/basic/rt-mpi7/gridx.0.reference
@@ -1,5 +1,5 @@
 # this is to test restart
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -37,7 +37,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -75,7 +75,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -113,7 +113,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -151,7 +151,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
diff --git a/regtest/basic/rt-mpi7/gridx.1.reference b/regtest/basic/rt-mpi7/gridx.1.reference
index f6f7385b4ce179cfac422857e1d998a0a8f24600..137826c6fe4fcfb3074f8ceb782ac7fc7bc63ee3 100644
--- a/regtest/basic/rt-mpi7/gridx.1.reference
+++ b/regtest/basic/rt-mpi7/gridx.1.reference
@@ -1,4 +1,4 @@
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -36,7 +36,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -74,7 +74,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -112,7 +112,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -150,7 +150,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
diff --git a/regtest/basic/rt-mpi7/gridx.2.reference b/regtest/basic/rt-mpi7/gridx.2.reference
index aaccbbd58c3ac40362c64282c1f3644a834d13f1..9c8da6c0d6ff73e9bffb2e169300d57e5a4cf113 100644
--- a/regtest/basic/rt-mpi7/gridx.2.reference
+++ b/regtest/basic/rt-mpi7/gridx.2.reference
@@ -1,4 +1,4 @@
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -36,7 +36,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -74,7 +74,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -112,7 +112,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
@@ -150,7 +150,7 @@
     1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
     1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
     2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
-#! FIELDS d1 d2 @4.bias der_d1 der_d2
+#! FIELDS d1 d2 meta2.bias der_d1 der_d2
 #! SET min_d1 0
 #! SET max_d1 2
 #! SET nbins_d1  5
diff --git a/regtest/basic/rt-mpi7/plumed.dat b/regtest/basic/rt-mpi7/plumed.dat
index af5bbaacca43ed6bc758226f163a35fb849196a3..20044e1cab8d09878a8df1407316740fd1694f27 100644
--- a/regtest/basic/rt-mpi7/plumed.dat
+++ b/regtest/basic/rt-mpi7/plumed.dat
@@ -8,6 +8,7 @@ METAD ...
   FMT=%6.2f
   GRID_MIN=0,0 GRID_MAX=2,2 GRID_BIN=4,4
   GRID_WFILE=grid GRID_WSTRIDE=2
+  LABEL=meta1
 ...
 
 METAD ...
@@ -16,8 +17,16 @@ METAD ...
   GRID_MIN=0,0 GRID_MAX=2,2 GRID_BIN=4,4
   GRID_WFILE=gridx GRID_WSTRIDE=2 STORE_GRIDS
   FILE=HILLSX
+  LABEL=meta2
 ...
 
+METAD ...
+  ARG=d1,d2 SIGMA=0.1,0.1 PACE=2 HEIGHT=0.0
+  FMT=%6.2f
+  GRID_MIN=0,0 GRID_MAX=2,2 GRID_BIN=4,4
+  GRID_RFILE=rgrid
+  FILE=HILLSR
+  LABEL=restart
+...
 
-
-
+PRINT ARG=d1,d2,meta1.bias,meta2.bias,restart.bias FILE=COLVAR
diff --git a/regtest/basic/rt-mpi7/rgrid.0 b/regtest/basic/rt-mpi7/rgrid.0
new file mode 100644
index 0000000000000000000000000000000000000000..d693b74a1945e17f4ef454030551804b5f953b1a
--- /dev/null
+++ b/regtest/basic/rt-mpi7/rgrid.0
@@ -0,0 +1,38 @@
+#! FIELDS d1 d2 restart.bias der_d1 der_d2
+#! SET min_d1 0
+#! SET max_d1 2
+#! SET nbins_d1  5
+#! SET periodic_d1 false
+#! SET min_d2 0
+#! SET max_d2 2
+#! SET nbins_d2  5
+#! SET periodic_d2 false
+    0.000000000    0.000000000    0.000000000    0.000000000    0.000000000
+    0.500000000    0.000000000    0.000000000    0.000000000    0.000000000
+    1.000000000    0.000000000    0.000000000    0.000000000    0.000000000
+    1.500000000    0.000000000    0.000000000    0.000000000    0.000000000
+    2.000000000    0.000000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    0.500000000    0.000000000    0.000000000    0.000000000
+    0.500000000    0.500000000    0.000000000    0.000000000    0.000000000
+    1.000000000    0.500000000    0.000000000    0.000000000    0.000000000
+    1.500000000    0.500000000    0.000000000    0.000000000    0.000000000
+    2.000000000    0.500000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    1.000000000    0.000000000    0.000000000    0.000000000
+    0.500000000    1.000000000    0.000000000    0.000000000    0.000000000
+    1.000000000    1.000000000    0.002715284    0.086233767    0.015955653
+    1.500000000    1.000000000    0.435779639   -3.722399196    5.324145962
+    2.000000000    1.000000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    1.500000000    0.000000000    0.000000000    0.000000000
+    0.500000000    1.500000000    0.000000000    0.000000000    0.000000000
+    1.000000000    1.500000000    0.000000000    0.000000000    0.000000000
+    1.500000000    1.500000000    0.010483058   -0.012211947   -0.306461908
+    2.000000000    1.500000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    2.000000000    0.000000000    0.000000000    0.000000000
+    0.500000000    2.000000000    0.000000000    0.000000000    0.000000000
+    1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
+    1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
+    2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
diff --git a/regtest/basic/rt-mpi7/rgrid.1 b/regtest/basic/rt-mpi7/rgrid.1
new file mode 100644
index 0000000000000000000000000000000000000000..a8975ad3c11dd71847f5f00f6e7832c02538e4a2
--- /dev/null
+++ b/regtest/basic/rt-mpi7/rgrid.1
@@ -0,0 +1,38 @@
+#! FIELDS d1 d2 restart.bias der_d1 der_d2
+#! SET min_d1 0
+#! SET max_d1 2
+#! SET nbins_d1  5
+#! SET periodic_d1 false
+#! SET min_d2 0
+#! SET max_d2 2
+#! SET nbins_d2  5
+#! SET periodic_d2 false
+    0.000000000    0.000000000    0.000000000    0.000000000    0.000000000
+    0.500000000    0.000000000    0.000000000    0.000000000    0.000000000
+    1.000000000    0.000000000    0.000000000    0.000000000    0.000000000
+    1.500000000    0.000000000    0.000000000    0.000000000    0.000000000
+    2.000000000    0.000000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    0.500000000    0.000000000    0.000000000    0.000000000
+    0.500000000    0.500000000    0.000000000    0.000000000    0.000000000
+    1.000000000    0.500000000    0.000000000    0.000000000    0.000000000
+    1.500000000    0.500000000    0.000000000    0.000000000    0.000000000
+    2.000000000    0.500000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    1.000000000    0.000000000    0.000000000    0.000000000
+    0.500000000    1.000000000    0.000000000    0.000000000    0.000000000
+    1.000000000    1.000000000    0.012633933    0.346690534    0.112369568
+    1.500000000    1.000000000    0.406125472   -4.119753516    4.462299304
+    2.000000000    1.000000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    1.500000000    0.000000000    0.000000000    0.000000000
+    0.500000000    1.500000000    0.000000000    0.000000000    0.000000000
+    1.000000000    1.500000000    0.000000000    0.000000000    0.000000000
+    1.500000000    1.500000000    0.001650351   -0.004046884   -0.055641830
+    2.000000000    1.500000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    2.000000000    0.000000000    0.000000000    0.000000000
+    0.500000000    2.000000000    0.000000000    0.000000000    0.000000000
+    1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
+    1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
+    2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
diff --git a/regtest/basic/rt-mpi7/rgrid.2 b/regtest/basic/rt-mpi7/rgrid.2
new file mode 100644
index 0000000000000000000000000000000000000000..9decb0c312aeb6b7660d032493493ddead04ee42
--- /dev/null
+++ b/regtest/basic/rt-mpi7/rgrid.2
@@ -0,0 +1,38 @@
+#! FIELDS d1 d2 restart.bias der_d1 der_d2
+#! SET min_d1 0
+#! SET max_d1 2
+#! SET nbins_d1  5
+#! SET periodic_d1 false
+#! SET min_d2 0
+#! SET max_d2 2
+#! SET nbins_d2  5
+#! SET periodic_d2 false
+    0.000000000    0.000000000    0.000000000    0.000000000    0.000000000
+    0.500000000    0.000000000    0.000000000    0.000000000    0.000000000
+    1.000000000    0.000000000    0.000000000    0.000000000    0.000000000
+    1.500000000    0.000000000    0.000000000    0.000000000    0.000000000
+    2.000000000    0.000000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    0.500000000    0.000000000    0.000000000    0.000000000
+    0.500000000    0.500000000    0.000000000    0.000000000    0.000000000
+    1.000000000    0.500000000    0.000000000    0.000000000    0.000000000
+    1.500000000    0.500000000    0.000000000    0.000000000    0.000000000
+    2.000000000    0.500000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    1.000000000    0.000000000    0.000000000    0.000000000
+    0.500000000    1.000000000    0.000000000    0.000000000    0.000000000
+    1.000000000    1.000000000    0.009918649    0.260456766    0.096413915
+    1.500000000    1.000000000    0.374700112   -2.710607032    5.036795586
+    2.000000000    1.000000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    1.500000000    0.000000000    0.000000000    0.000000000
+    0.500000000    1.500000000    0.000000000    0.000000000    0.000000000
+    1.000000000    1.500000000    0.000000000    0.000000000    0.000000000
+    1.500000000    1.500000000    0.010483058   -0.012211947   -0.306461908
+    2.000000000    1.500000000    0.000000000    0.000000000    0.000000000
+
+    0.000000000    2.000000000    0.000000000    0.000000000    0.000000000
+    0.500000000    2.000000000    0.000000000    0.000000000    0.000000000
+    1.000000000    2.000000000    0.000000000    0.000000000    0.000000000
+    1.500000000    2.000000000    0.000000000    0.000000000    0.000000000
+    2.000000000    2.000000000    0.000000000    0.000000000    0.000000000
diff --git a/regtest/basic/rt11c/HILLS.0.reference b/regtest/basic/rt11c/HILLS.0.reference
new file mode 100644
index 0000000000000000000000000000000000000000..bbd441f5e01fddaee4414c79570ed651385cfb7f
--- /dev/null
+++ b/regtest/basic/rt11c/HILLS.0.reference
@@ -0,0 +1,6 @@
+#! FIELDS time d1 sigma_d1 height biasf
+#! SET multivariate false
+                   0.05      1.130546273059004                    0.1                  0.625                      5
+                    0.1      1.097928292824707                    0.1       0.59908504146288                      5
+                   0.15      1.080244153391634                    0.1     0.5771398513895197                      5
+                    0.2      1.086854650075657                    0.1     0.5463961934914274                      5
diff --git a/regtest/basic/rt11c/HILLS.1.reference b/regtest/basic/rt11c/HILLS.1.reference
new file mode 100644
index 0000000000000000000000000000000000000000..2dd75006507a137720a036f9fa1a4c437943017f
--- /dev/null
+++ b/regtest/basic/rt11c/HILLS.1.reference
@@ -0,0 +1,6 @@
+#! FIELDS time d2 sigma_d2 height biasf
+#! SET multivariate false
+                   0.05      1.203184354540843                    0.2                  0.625                      5
+                    0.1      1.205975816509584                    0.2      0.591373047997768                      5
+                   0.15      1.215791771844345                    0.2     0.5602906936775773                      5
+                    0.2      1.223083017342005                    0.2     0.5395090936346227                      5
diff --git a/regtest/basic/rt11c/Makefile b/regtest/basic/rt11c/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..3703b27cea227aa053fb6d1d73f861e4384dbcee
--- /dev/null
+++ b/regtest/basic/rt11c/Makefile
@@ -0,0 +1 @@
+include ../../scripts/test.make
diff --git a/regtest/basic/rt11c/config b/regtest/basic/rt11c/config
new file mode 100644
index 0000000000000000000000000000000000000000..9382cd50db6fe358d77400b91dfc77ebd9b6e4a7
--- /dev/null
+++ b/regtest/basic/rt11c/config
@@ -0,0 +1,4 @@
+type=driver
+# run driver on the shared test trajectory and dump the applied forces for regression checking
+arg="--plumed plumed.dat --trajectory-stride 10 --timestep 0.005 --ixyz trajectory.xyz --dump-forces forces --dump-forces-fmt=%10.6f"
+extra_files="../../trajectories/trajectory.xyz"
diff --git a/regtest/basic/rt11c/forces.reference b/regtest/basic/rt11c/forces.reference
new file mode 100644
index 0000000000000000000000000000000000000000..a0e832b4f66f1891b4443930edb67b3e509f9c92
--- /dev/null
+++ b/regtest/basic/rt11c/forces.reference
@@ -0,0 +1,550 @@
+108
+  0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+108
+  0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+108
+  0.367475  -0.010864   0.475798
+X   0.516412  -0.020509  -0.580536
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X  -0.516412   0.020509   0.580536
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X  -0.011535  -0.012850  -0.001976
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.011535   0.012850   0.001976
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+108
+  0.589949  -0.088856   1.002051
+X   0.966906  -0.110471  -1.197851
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X  -0.966906   0.110471   1.197851
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X  -0.084559  -0.103973  -0.015775
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.084559   0.103973   0.015775
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+108
+  0.276958  -0.187433   0.793907
+X   0.635241  -0.116324  -0.902847
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X  -0.635241   0.116324   0.902847
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X  -0.159723  -0.208169  -0.030112
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.159723   0.208169   0.030112
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
+X   0.000000   0.000000   0.000000
diff --git a/regtest/basic/rt11c/grid0.reference b/regtest/basic/rt11c/grid0.reference
new file mode 100644
index 0000000000000000000000000000000000000000..bfcc759e37c5c7c83bbda5ca3a31e8f929941ebf
--- /dev/null
+++ b/regtest/basic/rt11c/grid0.reference
@@ -0,0 +1,1006 @@
+#! FIELDS d1 md.bias der_d1
+#! SET min_d1 0
+#! SET max_d1 10
+#! SET nbins_d1  1001
+#! SET periodic_d1 false
+    0.000000000    0.000000000    0.000000000
+    0.010000000    0.000000000    0.000000000
+    0.020000000    0.000000000    0.000000000
+    0.030000000    0.000000000    0.000000000
+    0.040000000    0.000000000    0.000000000
+    0.050000000    0.000000000    0.000000000
+    0.060000000    0.000000000    0.000000000
+    0.070000000    0.000000000    0.000000000
+    0.080000000    0.000000000    0.000000000
+    0.090000000    0.000000000    0.000000000
+    0.100000000    0.000000000    0.000000000
+    0.110000000    0.000000000    0.000000000
+    0.120000000    0.000000000    0.000000000
+    0.130000000    0.000000000    0.000000000
+    0.140000000    0.000000000    0.000000000
+    0.150000000    0.000000000    0.000000000
+    0.160000000    0.000000000    0.000000000
+    0.170000000    0.000000000    0.000000000
+    0.180000000    0.000000000    0.000000000
+    0.190000000    0.000000000    0.000000000
+    0.200000000    0.000000000    0.000000000
+    0.210000000    0.000000000    0.000000000
+    0.220000000    0.000000000    0.000000000
+    0.230000000    0.000000000    0.000000000
+    0.240000000    0.000000000    0.000000000
+    0.250000000    0.000000000    0.000000000
+    0.260000000    0.000000000    0.000000000
+    0.270000000    0.000000000    0.000000000
+    0.280000000    0.000000000    0.000000000
+    0.290000000    0.000000000    0.000000000
+    0.300000000    0.000000000    0.000000000
+    0.310000000    0.000000000    0.000000000
+    0.320000000    0.000000000    0.000000000
+    0.330000000    0.000000000    0.000000000
+    0.340000000    0.000000000    0.000000000
+    0.350000000    0.000000000    0.000000000
+    0.360000000    0.000000000    0.000000000
+    0.370000000    0.000000000    0.000000000
+    0.380000000    0.000000000    0.000000000
+    0.390000000    0.000000000    0.000000000
+    0.400000000    0.000000000    0.000000000
+    0.410000000    0.000000000    0.000000000
+    0.420000000    0.000000000    0.000000000
+    0.430000000    0.000000000    0.000000000
+    0.440000000    0.000000000    0.000000000
+    0.450000000    0.000000000    0.000000000
+    0.460000000    0.000000000    0.000000000
+    0.470000000    0.000000000    0.000000000
+    0.480000000    0.000000000    0.000000000
+    0.490000000    0.000000000    0.000000000
+    0.500000000    0.000000000    0.000000000
+    0.510000000    0.000000000    0.000000000
+    0.520000000    0.000000000    0.000000000
+    0.530000000    0.000000000    0.000000000
+    0.540000000    0.000000000    0.000000000
+    0.550000000    0.000000000    0.000000000
+    0.560000000    0.000000000    0.000000000
+    0.570000000    0.000000000    0.000000000
+    0.580000000    0.000000000    0.000000000
+    0.590000000    0.000000000    0.000000000
+    0.600000000    0.000000000    0.000000000
+    0.610000000    0.000000000    0.000000000
+    0.620000000    0.000000000    0.000000000
+    0.630000000    0.000000000    0.000000000
+    0.640000000    0.000000000    0.000000000
+    0.650000000    0.000000000    0.000000000
+    0.660000000    0.000000000    0.000000000
+    0.670000000    0.000000000    0.000000000
+    0.680000000    0.000000000    0.000000000
+    0.690000000    0.000000000    0.000000000
+    0.700000000    0.000000000    0.000000000
+    0.710000000    0.000000000    0.000000000
+    0.720000000    0.000000000    0.000000000
+    0.730000000    0.001001394    0.035073228
+    0.740000000    0.002481242    0.085128094
+    0.750000000    0.004606365    0.155108252
+    0.760000000    0.006418532    0.209741432
+    0.770000000    0.008855057    0.280549338
+    0.780000000    0.013168561    0.408794013
+    0.790000000    0.017874211    0.537338870
+    0.800000000    0.024024612    0.698679184
+    0.810000000    0.031976360    0.898596204
+    0.820000000    0.042145098    1.143080439
+    0.830000000    0.055006351    1.438066518
+    0.840000000    0.071093300    1.789086797
+    0.850000000    0.090990694    2.200845035
+    0.860000000    0.115324128    2.676718887
+    0.870000000    0.144744064    3.218208740
+    0.880000000    0.179904177    3.824360042
+    0.890000000    0.221433941    4.491195814
+    0.900000000    0.269905777    5.211204592
+    0.910000000    0.325797574    5.972935336
+    0.920000000    0.389451914    6.760753686
+    0.930000000    0.461033895    7.554812258
+    0.940000000    0.540489914    8.331280656
+    0.950000000    0.627510201    9.062868052
+    0.960000000    0.721498104    9.719652971
+    0.970000000    0.821549188   10.270211934
+    0.980000000    0.926442985   10.683012704
+    0.990000000    1.034649745   10.928010957
+    1.000000000    1.144353838   10.978364077
+    1.010000000    1.253494423   10.812155129
+    1.020000000    1.359822924   10.414006573
+    1.030000000    1.460975559    9.776459197
+    1.040000000    1.554557999    8.900998384
+    1.050000000    1.638238090    7.798627650
+    1.060000000    1.709841740    6.489917513
+    1.070000000    1.767446502    5.004494244
+    1.080000000    1.809467252    3.379974845
+    1.090000000    1.834728643    1.660397924
+    1.100000000    1.842519701   -0.105759096
+    1.110000000    1.832627036   -1.867851867
+    1.120000000    1.805344503   -3.575481394
+    1.130000000    1.761458700   -5.180886392
+    1.140000000    1.702211311   -6.641157404
+    1.150000000    1.629240799   -7.920129811
+    1.160000000    1.544507278   -8.989840070
+    1.170000000    1.450205345   -9.831465723
+    1.180000000    1.348670278  -10.435711203
+    1.190000000    1.242283134  -10.802644303
+    1.200000000    1.133380071  -10.941028186
+    1.210000000    1.024170542  -10.867227713
+    1.220000000    0.916668137  -10.603793781
+    1.230000000    0.812636704  -10.177843912
+    1.240000000    0.713553167   -9.619360899
+    1.250000000    0.620587289   -8.959524840
+    1.260000000    0.534597516   -8.229178720
+    1.270000000    0.456141163   -7.457506375
+    1.280000000    0.385496523   -6.670976647
+    1.290000000    0.322694073   -5.892581584
+    1.300000000    0.267553811   -5.141371982
+    1.310000000    0.219725812   -4.432272373
+    1.320000000    0.178731378   -3.776141086
+    1.330000000    0.144002545   -3.180029851
+    1.340000000    0.114918215   -2.647591691
+    1.350000000    0.090835679   -2.179585104
+    1.360000000    0.071116824   -1.774425874
+    1.370000000    0.055148764   -1.428744263
+    1.380000000    0.042359016   -1.137913658
+    1.390000000    0.032225640   -0.896525864
+    1.400000000    0.024282964   -0.698797266
+    1.410000000    0.018123620   -0.538898254
+    1.420000000    0.013397689   -0.411205163
+    1.430000000    0.009809683   -0.310479260
+    1.440000000    0.006399646   -0.206278971
+    1.450000000    0.004015515   -0.131456913
+    1.460000000    0.002198159   -0.072419167
+    1.470000000    0.001573286   -0.053405782
+    1.480000000    0.001114842   -0.038958571
+    1.490000000    0.000000000    0.000000000
+    1.500000000    0.000000000    0.000000000
+    1.510000000    0.000000000    0.000000000
+    1.520000000    0.000000000    0.000000000
+    1.530000000    0.000000000    0.000000000
+    1.540000000    0.000000000    0.000000000
+    1.550000000    0.000000000    0.000000000
+    1.560000000    0.000000000    0.000000000
+    1.570000000    0.000000000    0.000000000
+    1.580000000    0.000000000    0.000000000
+    1.590000000    0.000000000    0.000000000
+    1.600000000    0.000000000    0.000000000
+    1.610000000    0.000000000    0.000000000
+    1.620000000    0.000000000    0.000000000
+    1.630000000    0.000000000    0.000000000
+    1.640000000    0.000000000    0.000000000
+    1.650000000    0.000000000    0.000000000
+    1.660000000    0.000000000    0.000000000
+    1.670000000    0.000000000    0.000000000
+    1.680000000    0.000000000    0.000000000
+    1.690000000    0.000000000    0.000000000
+    1.700000000    0.000000000    0.000000000
+    1.710000000    0.000000000    0.000000000
+    1.720000000    0.000000000    0.000000000
+    1.730000000    0.000000000    0.000000000
+    1.740000000    0.000000000    0.000000000
+    1.750000000    0.000000000    0.000000000
+    1.760000000    0.000000000    0.000000000
+    1.770000000    0.000000000    0.000000000
+    1.780000000    0.000000000    0.000000000
+    1.790000000    0.000000000    0.000000000
+    1.800000000    0.000000000    0.000000000
+    1.810000000    0.000000000    0.000000000
+    1.820000000    0.000000000    0.000000000
+    1.830000000    0.000000000    0.000000000
+    1.840000000    0.000000000    0.000000000
+    1.850000000    0.000000000    0.000000000
+    1.860000000    0.000000000    0.000000000
+    1.870000000    0.000000000    0.000000000
+    1.880000000    0.000000000    0.000000000
+    1.890000000    0.000000000    0.000000000
+    1.900000000    0.000000000    0.000000000
+    1.910000000    0.000000000    0.000000000
+    1.920000000    0.000000000    0.000000000
+    1.930000000    0.000000000    0.000000000
+    1.940000000    0.000000000    0.000000000
+    1.950000000    0.000000000    0.000000000
+    1.960000000    0.000000000    0.000000000
+    1.970000000    0.000000000    0.000000000
+    1.980000000    0.000000000    0.000000000
+    1.990000000    0.000000000    0.000000000
+    2.000000000    0.000000000    0.000000000
+    2.010000000    0.000000000    0.000000000
+    2.020000000    0.000000000    0.000000000
+    2.030000000    0.000000000    0.000000000
+    2.040000000    0.000000000    0.000000000
+    2.050000000    0.000000000    0.000000000
+    2.060000000    0.000000000    0.000000000
+    2.070000000    0.000000000    0.000000000
+    2.080000000    0.000000000    0.000000000
+    2.090000000    0.000000000    0.000000000
+    2.100000000    0.000000000    0.000000000
+    2.110000000    0.000000000    0.000000000
+    2.120000000    0.000000000    0.000000000
+    2.130000000    0.000000000    0.000000000
+    2.140000000    0.000000000    0.000000000
+    2.150000000    0.000000000    0.000000000
+    2.160000000    0.000000000    0.000000000
+    2.170000000    0.000000000    0.000000000
+    2.180000000    0.000000000    0.000000000
+    2.190000000    0.000000000    0.000000000
+    2.200000000    0.000000000    0.000000000
+    2.210000000    0.000000000    0.000000000
+    2.220000000    0.000000000    0.000000000
+    2.230000000    0.000000000    0.000000000
+    2.240000000    0.000000000    0.000000000
+    2.250000000    0.000000000    0.000000000
+    2.260000000    0.000000000    0.000000000
+    2.270000000    0.000000000    0.000000000
+    2.280000000    0.000000000    0.000000000
+    2.290000000    0.000000000    0.000000000
+    2.300000000    0.000000000    0.000000000
+    2.310000000    0.000000000    0.000000000
+    2.320000000    0.000000000    0.000000000
+    2.330000000    0.000000000    0.000000000
+    2.340000000    0.000000000    0.000000000
+    2.350000000    0.000000000    0.000000000
+    2.360000000    0.000000000    0.000000000
+    2.370000000    0.000000000    0.000000000
+    2.380000000    0.000000000    0.000000000
+    2.390000000    0.000000000    0.000000000
+    2.400000000    0.000000000    0.000000000
+    2.410000000    0.000000000    0.000000000
+    2.420000000    0.000000000    0.000000000
+    2.430000000    0.000000000    0.000000000
+    2.440000000    0.000000000    0.000000000
+    2.450000000    0.000000000    0.000000000
+    2.460000000    0.000000000    0.000000000
+    2.470000000    0.000000000    0.000000000
+    2.480000000    0.000000000    0.000000000
+    2.490000000    0.000000000    0.000000000
+    2.500000000    0.000000000    0.000000000
+    2.510000000    0.000000000    0.000000000
+    2.520000000    0.000000000    0.000000000
+    2.530000000    0.000000000    0.000000000
+    2.540000000    0.000000000    0.000000000
+    2.550000000    0.000000000    0.000000000
+    2.560000000    0.000000000    0.000000000
+    2.570000000    0.000000000    0.000000000
+    2.580000000    0.000000000    0.000000000
+    2.590000000    0.000000000    0.000000000
+    2.600000000    0.000000000    0.000000000
+    2.610000000    0.000000000    0.000000000
+    2.620000000    0.000000000    0.000000000
+    2.630000000    0.000000000    0.000000000
+    2.640000000    0.000000000    0.000000000
+    2.650000000    0.000000000    0.000000000
+    2.660000000    0.000000000    0.000000000
+    2.670000000    0.000000000    0.000000000
+    2.680000000    0.000000000    0.000000000
+    2.690000000    0.000000000    0.000000000
+    2.700000000    0.000000000    0.000000000
+    2.710000000    0.000000000    0.000000000
+    2.720000000    0.000000000    0.000000000
+    2.730000000    0.000000000    0.000000000
+    2.740000000    0.000000000    0.000000000
+    2.750000000    0.000000000    0.000000000
+    2.760000000    0.000000000    0.000000000
+    2.770000000    0.000000000    0.000000000
+    2.780000000    0.000000000    0.000000000
+    2.790000000    0.000000000    0.000000000
+    2.800000000    0.000000000    0.000000000
+    2.810000000    0.000000000    0.000000000
+    2.820000000    0.000000000    0.000000000
+    2.830000000    0.000000000    0.000000000
+    2.840000000    0.000000000    0.000000000
+    2.850000000    0.000000000    0.000000000
+    2.860000000    0.000000000    0.000000000
+    2.870000000    0.000000000    0.000000000
+    2.880000000    0.000000000    0.000000000
+    2.890000000    0.000000000    0.000000000
+    2.900000000    0.000000000    0.000000000
+    2.910000000    0.000000000    0.000000000
+    2.920000000    0.000000000    0.000000000
+    2.930000000    0.000000000    0.000000000
+    2.940000000    0.000000000    0.000000000
+    2.950000000    0.000000000    0.000000000
+    2.960000000    0.000000000    0.000000000
+    2.970000000    0.000000000    0.000000000
+    2.980000000    0.000000000    0.000000000
+    2.990000000    0.000000000    0.000000000
+    3.000000000    0.000000000    0.000000000
+    3.010000000    0.000000000    0.000000000
+    3.020000000    0.000000000    0.000000000
+    3.030000000    0.000000000    0.000000000
+    3.040000000    0.000000000    0.000000000
+    3.050000000    0.000000000    0.000000000
+    3.060000000    0.000000000    0.000000000
+    3.070000000    0.000000000    0.000000000
+    3.080000000    0.000000000    0.000000000
+    3.090000000    0.000000000    0.000000000
+    3.100000000    0.000000000    0.000000000
+    3.110000000    0.000000000    0.000000000
+    3.120000000    0.000000000    0.000000000
+    3.130000000    0.000000000    0.000000000
+    3.140000000    0.000000000    0.000000000
+    3.150000000    0.000000000    0.000000000
+    3.160000000    0.000000000    0.000000000
+    3.170000000    0.000000000    0.000000000
+    3.180000000    0.000000000    0.000000000
+    3.190000000    0.000000000    0.000000000
+    3.200000000    0.000000000    0.000000000
+    3.210000000    0.000000000    0.000000000
+    3.220000000    0.000000000    0.000000000
+    3.230000000    0.000000000    0.000000000
+    3.240000000    0.000000000    0.000000000
+    3.250000000    0.000000000    0.000000000
+    3.260000000    0.000000000    0.000000000
+    3.270000000    0.000000000    0.000000000
+    3.280000000    0.000000000    0.000000000
+    3.290000000    0.000000000    0.000000000
+    3.300000000    0.000000000    0.000000000
+    3.310000000    0.000000000    0.000000000
+    3.320000000    0.000000000    0.000000000
+    3.330000000    0.000000000    0.000000000
+    3.340000000    0.000000000    0.000000000
+    3.350000000    0.000000000    0.000000000
+    3.360000000    0.000000000    0.000000000
+    3.370000000    0.000000000    0.000000000
+    3.380000000    0.000000000    0.000000000
+    3.390000000    0.000000000    0.000000000
+    3.400000000    0.000000000    0.000000000
+    3.410000000    0.000000000    0.000000000
+    3.420000000    0.000000000    0.000000000
+    3.430000000    0.000000000    0.000000000
+    3.440000000    0.000000000    0.000000000
+    3.450000000    0.000000000    0.000000000
+    3.460000000    0.000000000    0.000000000
+    3.470000000    0.000000000    0.000000000
+    3.480000000    0.000000000    0.000000000
+    3.490000000    0.000000000    0.000000000
+    3.500000000    0.000000000    0.000000000
+    3.510000000    0.000000000    0.000000000
+    3.520000000    0.000000000    0.000000000
+    3.530000000    0.000000000    0.000000000
+    3.540000000    0.000000000    0.000000000
+    3.550000000    0.000000000    0.000000000
+    3.560000000    0.000000000    0.000000000
+    3.570000000    0.000000000    0.000000000
+    3.580000000    0.000000000    0.000000000
+    3.590000000    0.000000000    0.000000000
+    3.600000000    0.000000000    0.000000000
+    3.610000000    0.000000000    0.000000000
+    3.620000000    0.000000000    0.000000000
+    3.630000000    0.000000000    0.000000000
+    3.640000000    0.000000000    0.000000000
+    3.650000000    0.000000000    0.000000000
+    3.660000000    0.000000000    0.000000000
+    3.670000000    0.000000000    0.000000000
+    3.680000000    0.000000000    0.000000000
+    3.690000000    0.000000000    0.000000000
+    3.700000000    0.000000000    0.000000000
+    3.710000000    0.000000000    0.000000000
+    3.720000000    0.000000000    0.000000000
+    3.730000000    0.000000000    0.000000000
+    3.740000000    0.000000000    0.000000000
+    3.750000000    0.000000000    0.000000000
+    3.760000000    0.000000000    0.000000000
+    3.770000000    0.000000000    0.000000000
+    3.780000000    0.000000000    0.000000000
+    3.790000000    0.000000000    0.000000000
+    3.800000000    0.000000000    0.000000000
+    3.810000000    0.000000000    0.000000000
+    3.820000000    0.000000000    0.000000000
+    3.830000000    0.000000000    0.000000000
+    3.840000000    0.000000000    0.000000000
+    3.850000000    0.000000000    0.000000000
+    3.860000000    0.000000000    0.000000000
+    3.870000000    0.000000000    0.000000000
+    3.880000000    0.000000000    0.000000000
+    3.890000000    0.000000000    0.000000000
+    3.900000000    0.000000000    0.000000000
+    3.910000000    0.000000000    0.000000000
+    3.920000000    0.000000000    0.000000000
+    3.930000000    0.000000000    0.000000000
+    3.940000000    0.000000000    0.000000000
+    3.950000000    0.000000000    0.000000000
+    3.960000000    0.000000000    0.000000000
+    3.970000000    0.000000000    0.000000000
+    3.980000000    0.000000000    0.000000000
+    3.990000000    0.000000000    0.000000000
+    4.000000000    0.000000000    0.000000000
+    4.010000000    0.000000000    0.000000000
+    4.020000000    0.000000000    0.000000000
+    4.030000000    0.000000000    0.000000000
+    4.040000000    0.000000000    0.000000000
+    4.050000000    0.000000000    0.000000000
+    4.060000000    0.000000000    0.000000000
+    4.070000000    0.000000000    0.000000000
+    4.080000000    0.000000000    0.000000000
+    4.090000000    0.000000000    0.000000000
+    4.100000000    0.000000000    0.000000000
+    4.110000000    0.000000000    0.000000000
+    4.120000000    0.000000000    0.000000000
+    4.130000000    0.000000000    0.000000000
+    4.140000000    0.000000000    0.000000000
+    4.150000000    0.000000000    0.000000000
+    4.160000000    0.000000000    0.000000000
+    4.170000000    0.000000000    0.000000000
+    4.180000000    0.000000000    0.000000000
+    4.190000000    0.000000000    0.000000000
+    4.200000000    0.000000000    0.000000000
+    4.210000000    0.000000000    0.000000000
+    4.220000000    0.000000000    0.000000000
+    4.230000000    0.000000000    0.000000000
+    4.240000000    0.000000000    0.000000000
+    4.250000000    0.000000000    0.000000000
+    4.260000000    0.000000000    0.000000000
+    4.270000000    0.000000000    0.000000000
+    4.280000000    0.000000000    0.000000000
+    4.290000000    0.000000000    0.000000000
+    4.300000000    0.000000000    0.000000000
+    4.310000000    0.000000000    0.000000000
+    4.320000000    0.000000000    0.000000000
+    4.330000000    0.000000000    0.000000000
+    4.340000000    0.000000000    0.000000000
+    4.350000000    0.000000000    0.000000000
+    4.360000000    0.000000000    0.000000000
+    4.370000000    0.000000000    0.000000000
+    4.380000000    0.000000000    0.000000000
+    4.390000000    0.000000000    0.000000000
+    4.400000000    0.000000000    0.000000000
+    4.410000000    0.000000000    0.000000000
+    4.420000000    0.000000000    0.000000000
+    4.430000000    0.000000000    0.000000000
+    4.440000000    0.000000000    0.000000000
+    4.450000000    0.000000000    0.000000000
+    4.460000000    0.000000000    0.000000000
+    4.470000000    0.000000000    0.000000000
+    4.480000000    0.000000000    0.000000000
+    4.490000000    0.000000000    0.000000000
+    4.500000000    0.000000000    0.000000000
+    4.510000000    0.000000000    0.000000000
+    4.520000000    0.000000000    0.000000000
+    4.530000000    0.000000000    0.000000000
+    4.540000000    0.000000000    0.000000000
+    4.550000000    0.000000000    0.000000000
+    4.560000000    0.000000000    0.000000000
+    4.570000000    0.000000000    0.000000000
+    4.580000000    0.000000000    0.000000000
+    4.590000000    0.000000000    0.000000000
+    4.600000000    0.000000000    0.000000000
+    4.610000000    0.000000000    0.000000000
+    4.620000000    0.000000000    0.000000000
+    4.630000000    0.000000000    0.000000000
+    4.640000000    0.000000000    0.000000000
+    4.650000000    0.000000000    0.000000000
+    4.660000000    0.000000000    0.000000000
+    4.670000000    0.000000000    0.000000000
+    4.680000000    0.000000000    0.000000000
+    4.690000000    0.000000000    0.000000000
+    4.700000000    0.000000000    0.000000000
+    4.710000000    0.000000000    0.000000000
+    4.720000000    0.000000000    0.000000000
+    4.730000000    0.000000000    0.000000000
+    4.740000000    0.000000000    0.000000000
+    4.750000000    0.000000000    0.000000000
+    4.760000000    0.000000000    0.000000000
+    4.770000000    0.000000000    0.000000000
+    4.780000000    0.000000000    0.000000000
+    4.790000000    0.000000000    0.000000000
+    4.800000000    0.000000000    0.000000000
+    4.810000000    0.000000000    0.000000000
+    4.820000000    0.000000000    0.000000000
+    4.830000000    0.000000000    0.000000000
+    4.840000000    0.000000000    0.000000000
+    4.850000000    0.000000000    0.000000000
+    4.860000000    0.000000000    0.000000000
+    4.870000000    0.000000000    0.000000000
+    4.880000000    0.000000000    0.000000000
+    4.890000000    0.000000000    0.000000000
+    4.900000000    0.000000000    0.000000000
+    4.910000000    0.000000000    0.000000000
+    4.920000000    0.000000000    0.000000000
+    4.930000000    0.000000000    0.000000000
+    4.940000000    0.000000000    0.000000000
+    4.950000000    0.000000000    0.000000000
+    4.960000000    0.000000000    0.000000000
+    4.970000000    0.000000000    0.000000000
+    4.980000000    0.000000000    0.000000000
+    4.990000000    0.000000000    0.000000000
+    5.000000000    0.000000000    0.000000000
+    5.010000000    0.000000000    0.000000000
+    5.020000000    0.000000000    0.000000000
+    5.030000000    0.000000000    0.000000000
+    5.040000000    0.000000000    0.000000000
+    5.050000000    0.000000000    0.000000000
+    5.060000000    0.000000000    0.000000000
+    5.070000000    0.000000000    0.000000000
+    5.080000000    0.000000000    0.000000000
+    5.090000000    0.000000000    0.000000000
+    5.100000000    0.000000000    0.000000000
+    5.110000000    0.000000000    0.000000000
+    5.120000000    0.000000000    0.000000000
+    5.130000000    0.000000000    0.000000000
+    5.140000000    0.000000000    0.000000000
+    5.150000000    0.000000000    0.000000000
+    5.160000000    0.000000000    0.000000000
+    5.170000000    0.000000000    0.000000000
+    5.180000000    0.000000000    0.000000000
+    5.190000000    0.000000000    0.000000000
+    5.200000000    0.000000000    0.000000000
+    5.210000000    0.000000000    0.000000000
+    5.220000000    0.000000000    0.000000000
+    5.230000000    0.000000000    0.000000000
+    5.240000000    0.000000000    0.000000000
+    5.250000000    0.000000000    0.000000000
+    5.260000000    0.000000000    0.000000000
+    5.270000000    0.000000000    0.000000000
+    5.280000000    0.000000000    0.000000000
+    5.290000000    0.000000000    0.000000000
+    5.300000000    0.000000000    0.000000000
+    5.310000000    0.000000000    0.000000000
+    5.320000000    0.000000000    0.000000000
+    5.330000000    0.000000000    0.000000000
+    5.340000000    0.000000000    0.000000000
+    5.350000000    0.000000000    0.000000000
+    5.360000000    0.000000000    0.000000000
+    5.370000000    0.000000000    0.000000000
+    5.380000000    0.000000000    0.000000000
+    5.390000000    0.000000000    0.000000000
+    5.400000000    0.000000000    0.000000000
+    5.410000000    0.000000000    0.000000000
+    5.420000000    0.000000000    0.000000000
+    5.430000000    0.000000000    0.000000000
+    5.440000000    0.000000000    0.000000000
+    5.450000000    0.000000000    0.000000000
+    5.460000000    0.000000000    0.000000000
+    5.470000000    0.000000000    0.000000000
+    5.480000000    0.000000000    0.000000000
+    5.490000000    0.000000000    0.000000000
+    5.500000000    0.000000000    0.000000000
+    5.510000000    0.000000000    0.000000000
+    5.520000000    0.000000000    0.000000000
+    5.530000000    0.000000000    0.000000000
+    5.540000000    0.000000000    0.000000000
+    5.550000000    0.000000000    0.000000000
+    5.560000000    0.000000000    0.000000000
+    5.570000000    0.000000000    0.000000000
+    5.580000000    0.000000000    0.000000000
+    5.590000000    0.000000000    0.000000000
+    5.600000000    0.000000000    0.000000000
+    5.610000000    0.000000000    0.000000000
+    5.620000000    0.000000000    0.000000000
+    5.630000000    0.000000000    0.000000000
+    5.640000000    0.000000000    0.000000000
+    5.650000000    0.000000000    0.000000000
+    5.660000000    0.000000000    0.000000000
+    5.670000000    0.000000000    0.000000000
+    5.680000000    0.000000000    0.000000000
+    5.690000000    0.000000000    0.000000000
+    5.700000000    0.000000000    0.000000000
+    5.710000000    0.000000000    0.000000000
+    5.720000000    0.000000000    0.000000000
+    5.730000000    0.000000000    0.000000000
+    5.740000000    0.000000000    0.000000000
+    5.750000000    0.000000000    0.000000000
+    5.760000000    0.000000000    0.000000000
+    5.770000000    0.000000000    0.000000000
+    5.780000000    0.000000000    0.000000000
+    5.790000000    0.000000000    0.000000000
+    5.800000000    0.000000000    0.000000000
+    5.810000000    0.000000000    0.000000000
+    5.820000000    0.000000000    0.000000000
+    5.830000000    0.000000000    0.000000000
+    5.840000000    0.000000000    0.000000000
+    5.850000000    0.000000000    0.000000000
+    5.860000000    0.000000000    0.000000000
+    5.870000000    0.000000000    0.000000000
+    5.880000000    0.000000000    0.000000000
+    5.890000000    0.000000000    0.000000000
+    5.900000000    0.000000000    0.000000000
+    5.910000000    0.000000000    0.000000000
+    5.920000000    0.000000000    0.000000000
+    5.930000000    0.000000000    0.000000000
+    5.940000000    0.000000000    0.000000000
+    5.950000000    0.000000000    0.000000000
+    5.960000000    0.000000000    0.000000000
+    5.970000000    0.000000000    0.000000000
+    5.980000000    0.000000000    0.000000000
+    5.990000000    0.000000000    0.000000000
+    6.000000000    0.000000000    0.000000000
+    6.010000000    0.000000000    0.000000000
+    6.020000000    0.000000000    0.000000000
+    6.030000000    0.000000000    0.000000000
+    6.040000000    0.000000000    0.000000000
+    6.050000000    0.000000000    0.000000000
+    6.060000000    0.000000000    0.000000000
+    6.070000000    0.000000000    0.000000000
+    6.080000000    0.000000000    0.000000000
+    6.090000000    0.000000000    0.000000000
+    6.100000000    0.000000000    0.000000000
+    6.110000000    0.000000000    0.000000000
+    6.120000000    0.000000000    0.000000000
+    6.130000000    0.000000000    0.000000000
+    6.140000000    0.000000000    0.000000000
+    6.150000000    0.000000000    0.000000000
+    6.160000000    0.000000000    0.000000000
+    6.170000000    0.000000000    0.000000000
+    6.180000000    0.000000000    0.000000000
+    6.190000000    0.000000000    0.000000000
+    6.200000000    0.000000000    0.000000000
+    6.210000000    0.000000000    0.000000000
+    6.220000000    0.000000000    0.000000000
+    6.230000000    0.000000000    0.000000000
+    6.240000000    0.000000000    0.000000000
+    6.250000000    0.000000000    0.000000000
+    6.260000000    0.000000000    0.000000000
+    6.270000000    0.000000000    0.000000000
+    6.280000000    0.000000000    0.000000000
+    6.290000000    0.000000000    0.000000000
+    6.300000000    0.000000000    0.000000000
+    6.310000000    0.000000000    0.000000000
+    6.320000000    0.000000000    0.000000000
+    6.330000000    0.000000000    0.000000000
+    6.340000000    0.000000000    0.000000000
+    6.350000000    0.000000000    0.000000000
+    6.360000000    0.000000000    0.000000000
+    6.370000000    0.000000000    0.000000000
+    6.380000000    0.000000000    0.000000000
+    6.390000000    0.000000000    0.000000000
+    6.400000000    0.000000000    0.000000000
+    6.410000000    0.000000000    0.000000000
+    6.420000000    0.000000000    0.000000000
+    6.430000000    0.000000000    0.000000000
+    6.440000000    0.000000000    0.000000000
+    6.450000000    0.000000000    0.000000000
+    6.460000000    0.000000000    0.000000000
+    6.470000000    0.000000000    0.000000000
+    6.480000000    0.000000000    0.000000000
+    6.490000000    0.000000000    0.000000000
+    6.500000000    0.000000000    0.000000000
+    6.510000000    0.000000000    0.000000000
+    6.520000000    0.000000000    0.000000000
+    6.530000000    0.000000000    0.000000000
+    6.540000000    0.000000000    0.000000000
+    6.550000000    0.000000000    0.000000000
+    6.560000000    0.000000000    0.000000000
+    6.570000000    0.000000000    0.000000000
+    6.580000000    0.000000000    0.000000000
+    6.590000000    0.000000000    0.000000000
+    6.600000000    0.000000000    0.000000000
+    6.610000000    0.000000000    0.000000000
+    6.620000000    0.000000000    0.000000000
+    6.630000000    0.000000000    0.000000000
+    6.640000000    0.000000000    0.000000000
+    6.650000000    0.000000000    0.000000000
+    6.660000000    0.000000000    0.000000000
+    6.670000000    0.000000000    0.000000000
+    6.680000000    0.000000000    0.000000000
+    6.690000000    0.000000000    0.000000000
+    6.700000000    0.000000000    0.000000000
+    6.710000000    0.000000000    0.000000000
+    6.720000000    0.000000000    0.000000000
+    6.730000000    0.000000000    0.000000000
+    6.740000000    0.000000000    0.000000000
+    6.750000000    0.000000000    0.000000000
+    6.760000000    0.000000000    0.000000000
+    6.770000000    0.000000000    0.000000000
+    6.780000000    0.000000000    0.000000000
+    6.790000000    0.000000000    0.000000000
+    6.800000000    0.000000000    0.000000000
+    6.810000000    0.000000000    0.000000000
+    6.820000000    0.000000000    0.000000000
+    6.830000000    0.000000000    0.000000000
+    6.840000000    0.000000000    0.000000000
+    6.850000000    0.000000000    0.000000000
+    6.860000000    0.000000000    0.000000000
+    6.870000000    0.000000000    0.000000000
+    6.880000000    0.000000000    0.000000000
+    6.890000000    0.000000000    0.000000000
+    6.900000000    0.000000000    0.000000000
+    6.910000000    0.000000000    0.000000000
+    6.920000000    0.000000000    0.000000000
+    6.930000000    0.000000000    0.000000000
+    6.940000000    0.000000000    0.000000000
+    6.950000000    0.000000000    0.000000000
+    6.960000000    0.000000000    0.000000000
+    6.970000000    0.000000000    0.000000000
+    6.980000000    0.000000000    0.000000000
+    6.990000000    0.000000000    0.000000000
+    7.000000000    0.000000000    0.000000000
+    7.010000000    0.000000000    0.000000000
+    7.020000000    0.000000000    0.000000000
+    7.030000000    0.000000000    0.000000000
+    7.040000000    0.000000000    0.000000000
+    7.050000000    0.000000000    0.000000000
+    7.060000000    0.000000000    0.000000000
+    7.070000000    0.000000000    0.000000000
+    7.080000000    0.000000000    0.000000000
+    7.090000000    0.000000000    0.000000000
+    7.100000000    0.000000000    0.000000000
+    7.110000000    0.000000000    0.000000000
+    7.120000000    0.000000000    0.000000000
+    7.130000000    0.000000000    0.000000000
+    7.140000000    0.000000000    0.000000000
+    7.150000000    0.000000000    0.000000000
+    7.160000000    0.000000000    0.000000000
+    7.170000000    0.000000000    0.000000000
+    7.180000000    0.000000000    0.000000000
+    7.190000000    0.000000000    0.000000000
+    7.200000000    0.000000000    0.000000000
+    7.210000000    0.000000000    0.000000000
+    7.220000000    0.000000000    0.000000000
+    7.230000000    0.000000000    0.000000000
+    7.240000000    0.000000000    0.000000000
+    7.250000000    0.000000000    0.000000000
+    7.260000000    0.000000000    0.000000000
+    7.270000000    0.000000000    0.000000000
+    7.280000000    0.000000000    0.000000000
+    7.290000000    0.000000000    0.000000000
+    7.300000000    0.000000000    0.000000000
+    7.310000000    0.000000000    0.000000000
+    7.320000000    0.000000000    0.000000000
+    7.330000000    0.000000000    0.000000000
+    7.340000000    0.000000000    0.000000000
+    7.350000000    0.000000000    0.000000000
+    7.360000000    0.000000000    0.000000000
+    7.370000000    0.000000000    0.000000000
+    7.380000000    0.000000000    0.000000000
+    7.390000000    0.000000000    0.000000000
+    7.400000000    0.000000000    0.000000000
+    7.410000000    0.000000000    0.000000000
+    7.420000000    0.000000000    0.000000000
+    7.430000000    0.000000000    0.000000000
+    7.440000000    0.000000000    0.000000000
+    7.450000000    0.000000000    0.000000000
+    7.460000000    0.000000000    0.000000000
+    7.470000000    0.000000000    0.000000000
+    7.480000000    0.000000000    0.000000000
+    7.490000000    0.000000000    0.000000000
+    7.500000000    0.000000000    0.000000000
+    7.510000000    0.000000000    0.000000000
+    7.520000000    0.000000000    0.000000000
+    7.530000000    0.000000000    0.000000000
+    7.540000000    0.000000000    0.000000000
+    7.550000000    0.000000000    0.000000000
+    7.560000000    0.000000000    0.000000000
+    7.570000000    0.000000000    0.000000000
+    7.580000000    0.000000000    0.000000000
+    7.590000000    0.000000000    0.000000000
+    7.600000000    0.000000000    0.000000000
+    7.610000000    0.000000000    0.000000000
+    7.620000000    0.000000000    0.000000000
+    7.630000000    0.000000000    0.000000000
+    7.640000000    0.000000000    0.000000000
+    7.650000000    0.000000000    0.000000000
+    7.660000000    0.000000000    0.000000000
+    7.670000000    0.000000000    0.000000000
+    7.680000000    0.000000000    0.000000000
+    7.690000000    0.000000000    0.000000000
+    7.700000000    0.000000000    0.000000000
+    7.710000000    0.000000000    0.000000000
+    7.720000000    0.000000000    0.000000000
+    7.730000000    0.000000000    0.000000000
+    7.740000000    0.000000000    0.000000000
+    7.750000000    0.000000000    0.000000000
+    7.760000000    0.000000000    0.000000000
+    7.770000000    0.000000000    0.000000000
+    7.780000000    0.000000000    0.000000000
+    7.790000000    0.000000000    0.000000000
+    7.800000000    0.000000000    0.000000000
+    7.810000000    0.000000000    0.000000000
+    7.820000000    0.000000000    0.000000000
+    7.830000000    0.000000000    0.000000000
+    7.840000000    0.000000000    0.000000000
+    7.850000000    0.000000000    0.000000000
+    7.860000000    0.000000000    0.000000000
+    7.870000000    0.000000000    0.000000000
+    7.880000000    0.000000000    0.000000000
+    7.890000000    0.000000000    0.000000000
+    7.900000000    0.000000000    0.000000000
+    7.910000000    0.000000000    0.000000000
+    7.920000000    0.000000000    0.000000000
+    7.930000000    0.000000000    0.000000000
+    7.940000000    0.000000000    0.000000000
+    7.950000000    0.000000000    0.000000000
+    7.960000000    0.000000000    0.000000000
+    7.970000000    0.000000000    0.000000000
+    7.980000000    0.000000000    0.000000000
+    7.990000000    0.000000000    0.000000000
+    8.000000000    0.000000000    0.000000000
+    8.010000000    0.000000000    0.000000000
+    8.020000000    0.000000000    0.000000000
+    8.030000000    0.000000000    0.000000000
+    8.040000000    0.000000000    0.000000000
+    8.050000000    0.000000000    0.000000000
+    8.060000000    0.000000000    0.000000000
+    8.070000000    0.000000000    0.000000000
+    8.080000000    0.000000000    0.000000000
+    8.090000000    0.000000000    0.000000000
+    8.100000000    0.000000000    0.000000000
+    8.110000000    0.000000000    0.000000000
+    8.120000000    0.000000000    0.000000000
+    8.130000000    0.000000000    0.000000000
+    8.140000000    0.000000000    0.000000000
+    8.150000000    0.000000000    0.000000000
+    8.160000000    0.000000000    0.000000000
+    8.170000000    0.000000000    0.000000000
+    8.180000000    0.000000000    0.000000000
+    8.190000000    0.000000000    0.000000000
+    8.200000000    0.000000000    0.000000000
+    8.210000000    0.000000000    0.000000000
+    8.220000000    0.000000000    0.000000000
+    8.230000000    0.000000000    0.000000000
+    8.240000000    0.000000000    0.000000000
+    8.250000000    0.000000000    0.000000000
+    8.260000000    0.000000000    0.000000000
+    8.270000000    0.000000000    0.000000000
+    8.280000000    0.000000000    0.000000000
+    8.290000000    0.000000000    0.000000000
+    8.300000000    0.000000000    0.000000000
+    8.310000000    0.000000000    0.000000000
+    8.320000000    0.000000000    0.000000000
+    8.330000000    0.000000000    0.000000000
+    8.340000000    0.000000000    0.000000000
+    8.350000000    0.000000000    0.000000000
+    8.360000000    0.000000000    0.000000000
+    8.370000000    0.000000000    0.000000000
+    8.380000000    0.000000000    0.000000000
+    8.390000000    0.000000000    0.000000000
+    8.400000000    0.000000000    0.000000000
+    8.410000000    0.000000000    0.000000000
+    8.420000000    0.000000000    0.000000000
+    8.430000000    0.000000000    0.000000000
+    8.440000000    0.000000000    0.000000000
+    8.450000000    0.000000000    0.000000000
+    8.460000000    0.000000000    0.000000000
+    8.470000000    0.000000000    0.000000000
+    8.480000000    0.000000000    0.000000000
+    8.490000000    0.000000000    0.000000000
+    8.500000000    0.000000000    0.000000000
+    8.510000000    0.000000000    0.000000000
+    8.520000000    0.000000000    0.000000000
+    8.530000000    0.000000000    0.000000000
+    8.540000000    0.000000000    0.000000000
+    8.550000000    0.000000000    0.000000000
+    8.560000000    0.000000000    0.000000000
+    8.570000000    0.000000000    0.000000000
+    8.580000000    0.000000000    0.000000000
+    8.590000000    0.000000000    0.000000000
+    8.600000000    0.000000000    0.000000000
+    8.610000000    0.000000000    0.000000000
+    8.620000000    0.000000000    0.000000000
+    8.630000000    0.000000000    0.000000000
+    8.640000000    0.000000000    0.000000000
+    8.650000000    0.000000000    0.000000000
+    8.660000000    0.000000000    0.000000000
+    8.670000000    0.000000000    0.000000000
+    8.680000000    0.000000000    0.000000000
+    8.690000000    0.000000000    0.000000000
+    8.700000000    0.000000000    0.000000000
+    8.710000000    0.000000000    0.000000000
+    8.720000000    0.000000000    0.000000000
+    8.730000000    0.000000000    0.000000000
+    8.740000000    0.000000000    0.000000000
+    8.750000000    0.000000000    0.000000000
+    8.760000000    0.000000000    0.000000000
+    8.770000000    0.000000000    0.000000000
+    8.780000000    0.000000000    0.000000000
+    8.790000000    0.000000000    0.000000000
+    8.800000000    0.000000000    0.000000000
+    8.810000000    0.000000000    0.000000000
+    8.820000000    0.000000000    0.000000000
+    8.830000000    0.000000000    0.000000000
+    8.840000000    0.000000000    0.000000000
+    8.850000000    0.000000000    0.000000000
+    8.860000000    0.000000000    0.000000000
+    8.870000000    0.000000000    0.000000000
+    8.880000000    0.000000000    0.000000000
+    8.890000000    0.000000000    0.000000000
+    8.900000000    0.000000000    0.000000000
+    8.910000000    0.000000000    0.000000000
+    8.920000000    0.000000000    0.000000000
+    8.930000000    0.000000000    0.000000000
+    8.940000000    0.000000000    0.000000000
+    8.950000000    0.000000000    0.000000000
+    8.960000000    0.000000000    0.000000000
+    8.970000000    0.000000000    0.000000000
+    8.980000000    0.000000000    0.000000000
+    8.990000000    0.000000000    0.000000000
+    9.000000000    0.000000000    0.000000000
+    9.010000000    0.000000000    0.000000000
+    9.020000000    0.000000000    0.000000000
+    9.030000000    0.000000000    0.000000000
+    9.040000000    0.000000000    0.000000000
+    9.050000000    0.000000000    0.000000000
+    9.060000000    0.000000000    0.000000000
+    9.070000000    0.000000000    0.000000000
+    9.080000000    0.000000000    0.000000000
+    9.090000000    0.000000000    0.000000000
+    9.100000000    0.000000000    0.000000000
+    9.110000000    0.000000000    0.000000000
+    9.120000000    0.000000000    0.000000000
+    9.130000000    0.000000000    0.000000000
+    9.140000000    0.000000000    0.000000000
+    9.150000000    0.000000000    0.000000000
+    9.160000000    0.000000000    0.000000000
+    9.170000000    0.000000000    0.000000000
+    9.180000000    0.000000000    0.000000000
+    9.190000000    0.000000000    0.000000000
+    9.200000000    0.000000000    0.000000000
+    9.210000000    0.000000000    0.000000000
+    9.220000000    0.000000000    0.000000000
+    9.230000000    0.000000000    0.000000000
+    9.240000000    0.000000000    0.000000000
+    9.250000000    0.000000000    0.000000000
+    9.260000000    0.000000000    0.000000000
+    9.270000000    0.000000000    0.000000000
+    9.280000000    0.000000000    0.000000000
+    9.290000000    0.000000000    0.000000000
+    9.300000000    0.000000000    0.000000000
+    9.310000000    0.000000000    0.000000000
+    9.320000000    0.000000000    0.000000000
+    9.330000000    0.000000000    0.000000000
+    9.340000000    0.000000000    0.000000000
+    9.350000000    0.000000000    0.000000000
+    9.360000000    0.000000000    0.000000000
+    9.370000000    0.000000000    0.000000000
+    9.380000000    0.000000000    0.000000000
+    9.390000000    0.000000000    0.000000000
+    9.400000000    0.000000000    0.000000000
+    9.410000000    0.000000000    0.000000000
+    9.420000000    0.000000000    0.000000000
+    9.430000000    0.000000000    0.000000000
+    9.440000000    0.000000000    0.000000000
+    9.450000000    0.000000000    0.000000000
+    9.460000000    0.000000000    0.000000000
+    9.470000000    0.000000000    0.000000000
+    9.480000000    0.000000000    0.000000000
+    9.490000000    0.000000000    0.000000000
+    9.500000000    0.000000000    0.000000000
+    9.510000000    0.000000000    0.000000000
+    9.520000000    0.000000000    0.000000000
+    9.530000000    0.000000000    0.000000000
+    9.540000000    0.000000000    0.000000000
+    9.550000000    0.000000000    0.000000000
+    9.560000000    0.000000000    0.000000000
+    9.570000000    0.000000000    0.000000000
+    9.580000000    0.000000000    0.000000000
+    9.590000000    0.000000000    0.000000000
+    9.600000000    0.000000000    0.000000000
+    9.610000000    0.000000000    0.000000000
+    9.620000000    0.000000000    0.000000000
+    9.630000000    0.000000000    0.000000000
+    9.640000000    0.000000000    0.000000000
+    9.650000000    0.000000000    0.000000000
+    9.660000000    0.000000000    0.000000000
+    9.670000000    0.000000000    0.000000000
+    9.680000000    0.000000000    0.000000000
+    9.690000000    0.000000000    0.000000000
+    9.700000000    0.000000000    0.000000000
+    9.710000000    0.000000000    0.000000000
+    9.720000000    0.000000000    0.000000000
+    9.730000000    0.000000000    0.000000000
+    9.740000000    0.000000000    0.000000000
+    9.750000000    0.000000000    0.000000000
+    9.760000000    0.000000000    0.000000000
+    9.770000000    0.000000000    0.000000000
+    9.780000000    0.000000000    0.000000000
+    9.790000000    0.000000000    0.000000000
+    9.800000000    0.000000000    0.000000000
+    9.810000000    0.000000000    0.000000000
+    9.820000000    0.000000000    0.000000000
+    9.830000000    0.000000000    0.000000000
+    9.840000000    0.000000000    0.000000000
+    9.850000000    0.000000000    0.000000000
+    9.860000000    0.000000000    0.000000000
+    9.870000000    0.000000000    0.000000000
+    9.880000000    0.000000000    0.000000000
+    9.890000000    0.000000000    0.000000000
+    9.900000000    0.000000000    0.000000000
+    9.910000000    0.000000000    0.000000000
+    9.920000000    0.000000000    0.000000000
+    9.930000000    0.000000000    0.000000000
+    9.940000000    0.000000000    0.000000000
+    9.950000000    0.000000000    0.000000000
+    9.960000000    0.000000000    0.000000000
+    9.970000000    0.000000000    0.000000000
+    9.980000000    0.000000000    0.000000000
+    9.990000000    0.000000000    0.000000000
+   10.000000000    0.000000000    0.000000000
diff --git a/regtest/basic/rt11c/grid1.reference b/regtest/basic/rt11c/grid1.reference
new file mode 100644
index 0000000000000000000000000000000000000000..5d131d68ef1dfccf400dc9246e4ead458942da74
--- /dev/null
+++ b/regtest/basic/rt11c/grid1.reference
@@ -0,0 +1,506 @@
+#! FIELDS d2 md.bias der_d2
+#! SET min_d2 0
+#! SET max_d2 10
+#! SET nbins_d2  501
+#! SET periodic_d2 false
+    0.000000000    0.000000000    0.000000000
+    0.020000000    0.000000000    0.000000000
+    0.040000000    0.000000000    0.000000000
+    0.060000000    0.000000000    0.000000000
+    0.080000000    0.000000000    0.000000000
+    0.100000000    0.000000000    0.000000000
+    0.120000000    0.000000000    0.000000000
+    0.140000000    0.000000000    0.000000000
+    0.160000000    0.000000000    0.000000000
+    0.180000000    0.000000000    0.000000000
+    0.200000000    0.000000000    0.000000000
+    0.220000000    0.000000000    0.000000000
+    0.240000000    0.000000000    0.000000000
+    0.260000000    0.000000000    0.000000000
+    0.280000000    0.000000000    0.000000000
+    0.300000000    0.000000000    0.000000000
+    0.320000000    0.000000000    0.000000000
+    0.340000000    0.000000000    0.000000000
+    0.360000000    0.000000000    0.000000000
+    0.380000000    0.000000000    0.000000000
+    0.400000000    0.000000000    0.000000000
+    0.420000000    0.000000000    0.000000000
+    0.440000000    0.000000000    0.000000000
+    0.460000000    0.000000000    0.000000000
+    0.480000000    0.000000000    0.000000000
+    0.500000000    0.001966054    0.034627488
+    0.520000000    0.004731959    0.081689638
+    0.540000000    0.006649983    0.111481099
+    0.560000000    0.009252595    0.150492061
+    0.580000000    0.012745883    0.200946294
+    0.600000000    0.017383603    0.265383634
+    0.620000000    0.023473242    0.346630707
+    0.640000000    0.031381222    0.447740864
+    0.660000000    0.041536518    0.571897057
+    0.680000000    0.054431941    0.722272573
+    0.700000000    0.070622171    0.901846653
+    0.720000000    0.090717657    1.113175188
+    0.740000000    0.115373513    1.358120792
+    0.760000000    0.145272681    1.637551530
+    0.780000000    0.181102875    1.951023062
+    0.800000000    0.223527163    2.296464542
+    0.820000000    0.273148518    2.669893619
+    0.840000000    0.330469197    3.065189693
+    0.860000000    0.395846424    3.473956430
+    0.880000000    0.469446477    3.885503748
+    0.900000000    0.551199825    4.286975572
+    0.920000000    0.640760450    4.663642389
+    0.940000000    0.737472748    4.999367014
+    0.960000000    0.840349446    5.277238718
+    0.980000000    0.948063733    5.480355636
+    1.000000000    1.058958239    5.592719705
+    1.020000000    1.171072626    5.600193738
+    1.040000000    1.282190431    5.491458276
+    1.060000000    1.389904466    5.258898349
+    1.080000000    1.491698650    4.899348297
+    1.100000000    1.585042750    4.414627432
+    1.120000000    1.667495271    3.811810520
+    1.140000000    1.736808753    3.103194404
+    1.160000000    1.791031212    2.305944241
+    1.180000000    1.828597350    1.441427824
+    1.200000000    1.848403593    0.534272050
+    1.220000000    1.849861941   -0.388800942
+    1.240000000    1.832928969   -1.300281272
+    1.260000000    1.798107971   -2.173233242
+    1.280000000    1.746424115   -2.982617558
+    1.300000000    1.679374304   -3.706483316
+    1.320000000    1.598855177   -4.326950629
+    1.340000000    1.507074073   -4.830922992
+    1.360000000    1.406448796   -5.210491430
+    1.380000000    1.299502491   -5.463017906
+    1.400000000    1.188759988   -5.590910702
+    1.420000000    1.076651427   -5.601127250
+    1.440000000    0.965428130   -5.504458060
+    1.460000000    0.857094427   -5.314657665
+    1.480000000    0.753357810   -5.047494098
+    1.500000000    0.655598313   -4.719787522
+    1.520000000    0.564856687   -4.348501852
+    1.540000000    0.481839770   -3.949941871
+    1.560000000    0.406940524   -3.539094026
+    1.580000000    0.340269616   -3.129133426
+    1.600000000    0.281695132   -2.731104226
+    1.620000000    0.230886989   -2.353766901
+    1.640000000    0.187362871   -2.003594850
+    1.660000000    0.150532946   -1.684894948
+    1.680000000    0.119741185   -1.400022249
+    1.700000000    0.094301698   -1.149657762
+    1.720000000    0.073529133   -0.933119729
+    1.740000000    0.056762741   -0.748682335
+    1.760000000    0.043384166   -0.593880695
+    1.780000000    0.032829415   -0.465786451
+    1.800000000    0.024595690   -0.361243884
+    1.820000000    0.018243951   -0.277061473
+    1.840000000    0.013398086   -0.210158152
+    1.860000000    0.009741608   -0.157666795
+    1.880000000    0.007012656   -0.116999737
+    1.900000000    0.004998025   -0.085882476
+    1.920000000    0.001907046   -0.033392326
+    1.940000000    0.000000000    0.000000000
+    1.960000000    0.000000000    0.000000000
+    1.980000000    0.000000000    0.000000000
+    2.000000000    0.000000000    0.000000000
+    2.020000000    0.000000000    0.000000000
+    2.040000000    0.000000000    0.000000000
+    2.060000000    0.000000000    0.000000000
+    2.080000000    0.000000000    0.000000000
+    2.100000000    0.000000000    0.000000000
+    2.120000000    0.000000000    0.000000000
+    2.140000000    0.000000000    0.000000000
+    2.160000000    0.000000000    0.000000000
+    2.180000000    0.000000000    0.000000000
+    2.200000000    0.000000000    0.000000000
+    2.220000000    0.000000000    0.000000000
+    2.240000000    0.000000000    0.000000000
+    2.260000000    0.000000000    0.000000000
+    2.280000000    0.000000000    0.000000000
+    2.300000000    0.000000000    0.000000000
+    2.320000000    0.000000000    0.000000000
+    2.340000000    0.000000000    0.000000000
+    2.360000000    0.000000000    0.000000000
+    2.380000000    0.000000000    0.000000000
+    2.400000000    0.000000000    0.000000000
+    2.420000000    0.000000000    0.000000000
+    2.440000000    0.000000000    0.000000000
+    2.460000000    0.000000000    0.000000000
+    2.480000000    0.000000000    0.000000000
+    2.500000000    0.000000000    0.000000000
+    2.520000000    0.000000000    0.000000000
+    2.540000000    0.000000000    0.000000000
+    2.560000000    0.000000000    0.000000000
+    2.580000000    0.000000000    0.000000000
+    2.600000000    0.000000000    0.000000000
+    2.620000000    0.000000000    0.000000000
+    2.640000000    0.000000000    0.000000000
+    2.660000000    0.000000000    0.000000000
+    2.680000000    0.000000000    0.000000000
+    2.700000000    0.000000000    0.000000000
+    2.720000000    0.000000000    0.000000000
+    2.740000000    0.000000000    0.000000000
+    2.760000000    0.000000000    0.000000000
+    2.780000000    0.000000000    0.000000000
+    2.800000000    0.000000000    0.000000000
+    2.820000000    0.000000000    0.000000000
+    2.840000000    0.000000000    0.000000000
+    2.860000000    0.000000000    0.000000000
+    2.880000000    0.000000000    0.000000000
+    2.900000000    0.000000000    0.000000000
+    2.920000000    0.000000000    0.000000000
+    2.940000000    0.000000000    0.000000000
+    2.960000000    0.000000000    0.000000000
+    2.980000000    0.000000000    0.000000000
+    3.000000000    0.000000000    0.000000000
+    3.020000000    0.000000000    0.000000000
+    3.040000000    0.000000000    0.000000000
+    3.060000000    0.000000000    0.000000000
+    3.080000000    0.000000000    0.000000000
+    3.100000000    0.000000000    0.000000000
+    3.120000000    0.000000000    0.000000000
+    3.140000000    0.000000000    0.000000000
+    3.160000000    0.000000000    0.000000000
+    3.180000000    0.000000000    0.000000000
+    3.200000000    0.000000000    0.000000000
+    3.220000000    0.000000000    0.000000000
+    3.240000000    0.000000000    0.000000000
+    3.260000000    0.000000000    0.000000000
+    3.280000000    0.000000000    0.000000000
+    3.300000000    0.000000000    0.000000000
+    3.320000000    0.000000000    0.000000000
+    3.340000000    0.000000000    0.000000000
+    3.360000000    0.000000000    0.000000000
+    3.380000000    0.000000000    0.000000000
+    3.400000000    0.000000000    0.000000000
+    3.420000000    0.000000000    0.000000000
+    3.440000000    0.000000000    0.000000000
+    3.460000000    0.000000000    0.000000000
+    3.480000000    0.000000000    0.000000000
+    3.500000000    0.000000000    0.000000000
+    3.520000000    0.000000000    0.000000000
+    3.540000000    0.000000000    0.000000000
+    3.560000000    0.000000000    0.000000000
+    3.580000000    0.000000000    0.000000000
+    3.600000000    0.000000000    0.000000000
+    3.620000000    0.000000000    0.000000000
+    3.640000000    0.000000000    0.000000000
+    3.660000000    0.000000000    0.000000000
+    3.680000000    0.000000000    0.000000000
+    3.700000000    0.000000000    0.000000000
+    3.720000000    0.000000000    0.000000000
+    3.740000000    0.000000000    0.000000000
+    3.760000000    0.000000000    0.000000000
+    3.780000000    0.000000000    0.000000000
+    3.800000000    0.000000000    0.000000000
+    3.820000000    0.000000000    0.000000000
+    3.840000000    0.000000000    0.000000000
+    3.860000000    0.000000000    0.000000000
+    3.880000000    0.000000000    0.000000000
+    3.900000000    0.000000000    0.000000000
+    3.920000000    0.000000000    0.000000000
+    3.940000000    0.000000000    0.000000000
+    3.960000000    0.000000000    0.000000000
+    3.980000000    0.000000000    0.000000000
+    4.000000000    0.000000000    0.000000000
+    4.020000000    0.000000000    0.000000000
+    4.040000000    0.000000000    0.000000000
+    4.060000000    0.000000000    0.000000000
+    4.080000000    0.000000000    0.000000000
+    4.100000000    0.000000000    0.000000000
+    4.120000000    0.000000000    0.000000000
+    4.140000000    0.000000000    0.000000000
+    4.160000000    0.000000000    0.000000000
+    4.180000000    0.000000000    0.000000000
+    4.200000000    0.000000000    0.000000000
+    4.220000000    0.000000000    0.000000000
+    4.240000000    0.000000000    0.000000000
+    4.260000000    0.000000000    0.000000000
+    4.280000000    0.000000000    0.000000000
+    4.300000000    0.000000000    0.000000000
+    4.320000000    0.000000000    0.000000000
+    4.340000000    0.000000000    0.000000000
+    4.360000000    0.000000000    0.000000000
+    4.380000000    0.000000000    0.000000000
+    4.400000000    0.000000000    0.000000000
+    4.420000000    0.000000000    0.000000000
+    4.440000000    0.000000000    0.000000000
+    4.460000000    0.000000000    0.000000000
+    4.480000000    0.000000000    0.000000000
+    4.500000000    0.000000000    0.000000000
+    4.520000000    0.000000000    0.000000000
+    4.540000000    0.000000000    0.000000000
+    4.560000000    0.000000000    0.000000000
+    4.580000000    0.000000000    0.000000000
+    4.600000000    0.000000000    0.000000000
+    4.620000000    0.000000000    0.000000000
+    4.640000000    0.000000000    0.000000000
+    4.660000000    0.000000000    0.000000000
+    4.680000000    0.000000000    0.000000000
+    4.700000000    0.000000000    0.000000000
+    4.720000000    0.000000000    0.000000000
+    4.740000000    0.000000000    0.000000000
+    4.760000000    0.000000000    0.000000000
+    4.780000000    0.000000000    0.000000000
+    4.800000000    0.000000000    0.000000000
+    4.820000000    0.000000000    0.000000000
+    4.840000000    0.000000000    0.000000000
+    4.860000000    0.000000000    0.000000000
+    4.880000000    0.000000000    0.000000000
+    4.900000000    0.000000000    0.000000000
+    4.920000000    0.000000000    0.000000000
+    4.940000000    0.000000000    0.000000000
+    4.960000000    0.000000000    0.000000000
+    4.980000000    0.000000000    0.000000000
+    5.000000000    0.000000000    0.000000000
+    5.020000000    0.000000000    0.000000000
+    5.040000000    0.000000000    0.000000000
+    5.060000000    0.000000000    0.000000000
+    5.080000000    0.000000000    0.000000000
+    5.100000000    0.000000000    0.000000000
+    5.120000000    0.000000000    0.000000000
+    5.140000000    0.000000000    0.000000000
+    5.160000000    0.000000000    0.000000000
+    5.180000000    0.000000000    0.000000000
+    5.200000000    0.000000000    0.000000000
+    5.220000000    0.000000000    0.000000000
+    5.240000000    0.000000000    0.000000000
+    5.260000000    0.000000000    0.000000000
+    5.280000000    0.000000000    0.000000000
+    5.300000000    0.000000000    0.000000000
+    5.320000000    0.000000000    0.000000000
+    5.340000000    0.000000000    0.000000000
+    5.360000000    0.000000000    0.000000000
+    5.380000000    0.000000000    0.000000000
+    5.400000000    0.000000000    0.000000000
+    5.420000000    0.000000000    0.000000000
+    5.440000000    0.000000000    0.000000000
+    5.460000000    0.000000000    0.000000000
+    5.480000000    0.000000000    0.000000000
+    5.500000000    0.000000000    0.000000000
+    5.520000000    0.000000000    0.000000000
+    5.540000000    0.000000000    0.000000000
+    5.560000000    0.000000000    0.000000000
+    5.580000000    0.000000000    0.000000000
+    5.600000000    0.000000000    0.000000000
+    5.620000000    0.000000000    0.000000000
+    5.640000000    0.000000000    0.000000000
+    5.660000000    0.000000000    0.000000000
+    5.680000000    0.000000000    0.000000000
+    5.700000000    0.000000000    0.000000000
+    5.720000000    0.000000000    0.000000000
+    5.740000000    0.000000000    0.000000000
+    5.760000000    0.000000000    0.000000000
+    5.780000000    0.000000000    0.000000000
+    5.800000000    0.000000000    0.000000000
+    5.820000000    0.000000000    0.000000000
+    5.840000000    0.000000000    0.000000000
+    5.860000000    0.000000000    0.000000000
+    5.880000000    0.000000000    0.000000000
+    5.900000000    0.000000000    0.000000000
+    5.920000000    0.000000000    0.000000000
+    5.940000000    0.000000000    0.000000000
+    5.960000000    0.000000000    0.000000000
+    5.980000000    0.000000000    0.000000000
+    6.000000000    0.000000000    0.000000000
+    6.020000000    0.000000000    0.000000000
+    6.040000000    0.000000000    0.000000000
+    6.060000000    0.000000000    0.000000000
+    6.080000000    0.000000000    0.000000000
+    6.100000000    0.000000000    0.000000000
+    6.120000000    0.000000000    0.000000000
+    6.140000000    0.000000000    0.000000000
+    6.160000000    0.000000000    0.000000000
+    6.180000000    0.000000000    0.000000000
+    6.200000000    0.000000000    0.000000000
+    6.220000000    0.000000000    0.000000000
+    6.240000000    0.000000000    0.000000000
+    6.260000000    0.000000000    0.000000000
+    6.280000000    0.000000000    0.000000000
+    6.300000000    0.000000000    0.000000000
+    6.320000000    0.000000000    0.000000000
+    6.340000000    0.000000000    0.000000000
+    6.360000000    0.000000000    0.000000000
+    6.380000000    0.000000000    0.000000000
+    6.400000000    0.000000000    0.000000000
+    6.420000000    0.000000000    0.000000000
+    6.440000000    0.000000000    0.000000000
+    6.460000000    0.000000000    0.000000000
+    6.480000000    0.000000000    0.000000000
+    6.500000000    0.000000000    0.000000000
+    6.520000000    0.000000000    0.000000000
+    6.540000000    0.000000000    0.000000000
+    6.560000000    0.000000000    0.000000000
+    6.580000000    0.000000000    0.000000000
+    6.600000000    0.000000000    0.000000000
+    6.620000000    0.000000000    0.000000000
+    6.640000000    0.000000000    0.000000000
+    6.660000000    0.000000000    0.000000000
+    6.680000000    0.000000000    0.000000000
+    6.700000000    0.000000000    0.000000000
+    6.720000000    0.000000000    0.000000000
+    6.740000000    0.000000000    0.000000000
+    6.760000000    0.000000000    0.000000000
+    6.780000000    0.000000000    0.000000000
+    6.800000000    0.000000000    0.000000000
+    6.820000000    0.000000000    0.000000000
+    6.840000000    0.000000000    0.000000000
+    6.860000000    0.000000000    0.000000000
+    6.880000000    0.000000000    0.000000000
+    6.900000000    0.000000000    0.000000000
+    6.920000000    0.000000000    0.000000000
+    6.940000000    0.000000000    0.000000000
+    6.960000000    0.000000000    0.000000000
+    6.980000000    0.000000000    0.000000000
+    7.000000000    0.000000000    0.000000000
+    7.020000000    0.000000000    0.000000000
+    7.040000000    0.000000000    0.000000000
+    7.060000000    0.000000000    0.000000000
+    7.080000000    0.000000000    0.000000000
+    7.100000000    0.000000000    0.000000000
+    7.120000000    0.000000000    0.000000000
+    7.140000000    0.000000000    0.000000000
+    7.160000000    0.000000000    0.000000000
+    7.180000000    0.000000000    0.000000000
+    7.200000000    0.000000000    0.000000000
+    7.220000000    0.000000000    0.000000000
+    7.240000000    0.000000000    0.000000000
+    7.260000000    0.000000000    0.000000000
+    7.280000000    0.000000000    0.000000000
+    7.300000000    0.000000000    0.000000000
+    7.320000000    0.000000000    0.000000000
+    7.340000000    0.000000000    0.000000000
+    7.360000000    0.000000000    0.000000000
+    7.380000000    0.000000000    0.000000000
+    7.400000000    0.000000000    0.000000000
+    7.420000000    0.000000000    0.000000000
+    7.440000000    0.000000000    0.000000000
+    7.460000000    0.000000000    0.000000000
+    7.480000000    0.000000000    0.000000000
+    7.500000000    0.000000000    0.000000000
+    7.520000000    0.000000000    0.000000000
+    7.540000000    0.000000000    0.000000000
+    7.560000000    0.000000000    0.000000000
+    7.580000000    0.000000000    0.000000000
+    7.600000000    0.000000000    0.000000000
+    7.620000000    0.000000000    0.000000000
+    7.640000000    0.000000000    0.000000000
+    7.660000000    0.000000000    0.000000000
+    7.680000000    0.000000000    0.000000000
+    7.700000000    0.000000000    0.000000000
+    7.720000000    0.000000000    0.000000000
+    7.740000000    0.000000000    0.000000000
+    7.760000000    0.000000000    0.000000000
+    7.780000000    0.000000000    0.000000000
+    7.800000000    0.000000000    0.000000000
+    7.820000000    0.000000000    0.000000000
+    7.840000000    0.000000000    0.000000000
+    7.860000000    0.000000000    0.000000000
+    7.880000000    0.000000000    0.000000000
+    7.900000000    0.000000000    0.000000000
+    7.920000000    0.000000000    0.000000000
+    7.940000000    0.000000000    0.000000000
+    7.960000000    0.000000000    0.000000000
+    7.980000000    0.000000000    0.000000000
+    8.000000000    0.000000000    0.000000000
+    8.020000000    0.000000000    0.000000000
+    8.040000000    0.000000000    0.000000000
+    8.060000000    0.000000000    0.000000000
+    8.080000000    0.000000000    0.000000000
+    8.100000000    0.000000000    0.000000000
+    8.120000000    0.000000000    0.000000000
+    8.140000000    0.000000000    0.000000000
+    8.160000000    0.000000000    0.000000000
+    8.180000000    0.000000000    0.000000000
+    8.200000000    0.000000000    0.000000000
+    8.220000000    0.000000000    0.000000000
+    8.240000000    0.000000000    0.000000000
+    8.260000000    0.000000000    0.000000000
+    8.280000000    0.000000000    0.000000000
+    8.300000000    0.000000000    0.000000000
+    8.320000000    0.000000000    0.000000000
+    8.340000000    0.000000000    0.000000000
+    8.360000000    0.000000000    0.000000000
+    8.380000000    0.000000000    0.000000000
+    8.400000000    0.000000000    0.000000000
+    8.420000000    0.000000000    0.000000000
+    8.440000000    0.000000000    0.000000000
+    8.460000000    0.000000000    0.000000000
+    8.480000000    0.000000000    0.000000000
+    8.500000000    0.000000000    0.000000000
+    8.520000000    0.000000000    0.000000000
+    8.540000000    0.000000000    0.000000000
+    8.560000000    0.000000000    0.000000000
+    8.580000000    0.000000000    0.000000000
+    8.600000000    0.000000000    0.000000000
+    8.620000000    0.000000000    0.000000000
+    8.640000000    0.000000000    0.000000000
+    8.660000000    0.000000000    0.000000000
+    8.680000000    0.000000000    0.000000000
+    8.700000000    0.000000000    0.000000000
+    8.720000000    0.000000000    0.000000000
+    8.740000000    0.000000000    0.000000000
+    8.760000000    0.000000000    0.000000000
+    8.780000000    0.000000000    0.000000000
+    8.800000000    0.000000000    0.000000000
+    8.820000000    0.000000000    0.000000000
+    8.840000000    0.000000000    0.000000000
+    8.860000000    0.000000000    0.000000000
+    8.880000000    0.000000000    0.000000000
+    8.900000000    0.000000000    0.000000000
+    8.920000000    0.000000000    0.000000000
+    8.940000000    0.000000000    0.000000000
+    8.960000000    0.000000000    0.000000000
+    8.980000000    0.000000000    0.000000000
+    9.000000000    0.000000000    0.000000000
+    9.020000000    0.000000000    0.000000000
+    9.040000000    0.000000000    0.000000000
+    9.060000000    0.000000000    0.000000000
+    9.080000000    0.000000000    0.000000000
+    9.100000000    0.000000000    0.000000000
+    9.120000000    0.000000000    0.000000000
+    9.140000000    0.000000000    0.000000000
+    9.160000000    0.000000000    0.000000000
+    9.180000000    0.000000000    0.000000000
+    9.200000000    0.000000000    0.000000000
+    9.220000000    0.000000000    0.000000000
+    9.240000000    0.000000000    0.000000000
+    9.260000000    0.000000000    0.000000000
+    9.280000000    0.000000000    0.000000000
+    9.300000000    0.000000000    0.000000000
+    9.320000000    0.000000000    0.000000000
+    9.340000000    0.000000000    0.000000000
+    9.360000000    0.000000000    0.000000000
+    9.380000000    0.000000000    0.000000000
+    9.400000000    0.000000000    0.000000000
+    9.420000000    0.000000000    0.000000000
+    9.440000000    0.000000000    0.000000000
+    9.460000000    0.000000000    0.000000000
+    9.480000000    0.000000000    0.000000000
+    9.500000000    0.000000000    0.000000000
+    9.520000000    0.000000000    0.000000000
+    9.540000000    0.000000000    0.000000000
+    9.560000000    0.000000000    0.000000000
+    9.580000000    0.000000000    0.000000000
+    9.600000000    0.000000000    0.000000000
+    9.620000000    0.000000000    0.000000000
+    9.640000000    0.000000000    0.000000000
+    9.660000000    0.000000000    0.000000000
+    9.680000000    0.000000000    0.000000000
+    9.700000000    0.000000000    0.000000000
+    9.720000000    0.000000000    0.000000000
+    9.740000000    0.000000000    0.000000000
+    9.760000000    0.000000000    0.000000000
+    9.780000000    0.000000000    0.000000000
+    9.800000000    0.000000000    0.000000000
+    9.820000000    0.000000000    0.000000000
+    9.840000000    0.000000000    0.000000000
+    9.860000000    0.000000000    0.000000000
+    9.880000000    0.000000000    0.000000000
+    9.900000000    0.000000000    0.000000000
+    9.920000000    0.000000000    0.000000000
+    9.940000000    0.000000000    0.000000000
+    9.960000000    0.000000000    0.000000000
+    9.980000000    0.000000000    0.000000000
+   10.000000000    0.000000000    0.000000000
diff --git a/regtest/basic/rt11c/plumed.dat b/regtest/basic/rt11c/plumed.dat
new file mode 100644
index 0000000000000000000000000000000000000000..c5f75f247c3cae3e16569c8c9b010240e0a06da8
--- /dev/null
+++ b/regtest/basic/rt11c/plumed.dat
@@ -0,0 +1,3 @@
+d1: DISTANCE ATOMS=1,10
+d2: DISTANCE ATOMS=50,40
+md: PBMETAD ARG=d1,d2 SIGMA=0.1,0.2 HEIGHT=1.0 PACE=10 TEMP=300 BIASFACTOR=5 FILE=HILLS.0,HILLS.1 GRID_WFILES=grid0,grid1 GRID_WSTRIDE=10 GRID_MIN=0,0 GRID_MAX=10,10
diff --git a/regtest/basic/rt65/AA.pdb b/regtest/basic/rt65/AA.pdb
new file mode 100644
index 0000000000000000000000000000000000000000..7259a480a8559a8528c6277d155d2ed84ae9bc22
--- /dev/null
+++ b/regtest/basic/rt65/AA.pdb
@@ -0,0 +1,71 @@
+TITLE     Great Red Owns Many ACres of Sand 
+REMARK    THIS IS A SIMULATION BOX
+CRYST1    6.958   11.030   10.714  90.00  90.00  90.00 P 1           1
+MODEL        1
+ATOM      1  H5T   A     1      14.820  -3.670   5.260  1.00  0.00            
+ATOM      2  O5'   A     1      15.280  -2.790   5.160  1.00  0.00            
+ATOM      3  C5'   A     1      16.340  -2.690   6.140  1.00  0.00            
+ATOM      4 1H5'   A     1      17.000  -3.430   5.990  1.00  0.00            
+ATOM      5 2H5'   A     1      15.960  -2.760   7.060  1.00  0.00            
+ATOM      6  C4'   A     1      16.940  -1.350   5.900  1.00  0.00            
+ATOM      7  H4'   A     1      17.680  -1.280   6.580  1.00  0.00            
+ATOM      8  O4'   A     1      17.430  -1.280   4.600  1.00  0.00            
+ATOM      9  C1'   A     1      17.500   0.110   4.240  1.00  0.00            
+ATOM     10  H1'   A     1      18.450   0.400   4.070  1.00  0.00            
+ATOM     11  N9    A     1      16.660   0.250   3.040  1.00  0.00            
+ATOM     12  C8    A     1      15.860  -0.690   2.450  1.00  0.00            
+ATOM     13  H8    A     1      15.740  -1.610   2.820  1.00  0.00            
+ATOM     14  N7    A     1      15.260  -0.280   1.380  1.00  0.00            
+ATOM     15  C5    A     1      15.710   1.020   1.230  1.00  0.00            
+ATOM     16  C6    A     1      15.450   2.020   0.250  1.00  0.00            
+ATOM     17  N6    A     1      14.640   1.860  -0.780  1.00  0.00            
+ATOM     18  H61   A     1      14.160   0.990  -0.910  1.00  0.00            
+ATOM     19  H62   A     1      14.510   2.600  -1.430  1.00  0.00            
+ATOM     20  N1    A     1      16.090   3.170   0.480  1.00  0.00            
+ATOM     21  C2    A     1      16.910   3.400   1.530  1.00  0.00            
+ATOM     22  H2    A     1      17.340   4.300   1.590  1.00  0.00            
+ATOM     23  N3    A     1      17.200   2.530   2.470  1.00  0.00            
+ATOM     24  C4    A     1      16.570   1.360   2.260  1.00  0.00            
+ATOM     25  C3'   A     1      15.940  -0.160   5.930  1.00  0.00            
+ATOM     26  H3'   A     1      15.040  -0.310   5.530  1.00  0.00            
+ATOM     27  C2'   A     1      16.900   0.910   5.350  1.00  0.00            
+ATOM     28 1H2'   A     1      16.600   1.820   5.060  1.00  0.00            
+ATOM     29  O2'   A     1      17.840   1.060   6.410  1.00  0.00            
+ATOM     30 2HO'   A     1      17.420   1.600   7.150  1.00  0.00            
+ATOM     31  O3'   A     1      15.700   0.220   7.290  1.00  0.00            
+ATOM     32  P     A     2      14.280   0.530   7.900  1.00  0.00            
+ATOM     33  O1P   A     2      14.390  -0.010   9.290  1.00  0.00            
+ATOM     34  O2P   A     2      13.160  -0.100   7.150  1.00  0.00            
+ATOM     35  O5'   A     2      14.210   2.130   7.830  1.00  0.00            
+ATOM     36  C5'   A     2      15.380   2.910   7.550  1.00  0.00            
+ATOM     37 1H5'   A     2      15.960   2.390   6.930  1.00  0.00            
+ATOM     38 2H5'   A     2      15.860   3.070   8.410  1.00  0.00            
+ATOM     39  C4'   A     2      15.050   4.200   6.950  1.00  0.00            
+ATOM     40  H4'   A     2      15.630   4.770   7.540  1.00  0.00            
+ATOM     41  O4'   A     2      15.470   4.270   5.630  1.00  0.00            
+ATOM     42  C1'   A     2      14.600   5.120   4.840  1.00  0.00            
+ATOM     43  H1'   A     2      15.060   5.780   4.250  1.00  0.00            
+ATOM     44  N9    A     2      13.880   4.270   3.880  1.00  0.00            
+ATOM     45  C8    A     2      13.540   2.950   4.120  1.00  0.00            
+ATOM     46  H8    A     2      13.720   2.490   4.980  1.00  0.00            
+ATOM     47  N7    A     2      12.960   2.370   3.100  1.00  0.00            
+ATOM     48  C5    A     2      12.920   3.360   2.120  1.00  0.00            
+ATOM     49  C6    A     2      12.390   3.330   0.800  1.00  0.00            
+ATOM     50  N6    A     2      11.840   2.270   0.240  1.00  0.00            
+ATOM     51  H61   A     2      11.780   1.410   0.740  1.00  0.00            
+ATOM     52  H62   A     2      11.490   2.330  -0.690  1.00  0.00            
+ATOM     53  N1    A     2      12.470   4.520   0.120  1.00  0.00            
+ATOM     54  C2    A     2      13.040   5.620   0.710  1.00  0.00            
+ATOM     55  H2    A     2      13.080   6.450   0.160  1.00  0.00            
+ATOM     56  N3    A     2      13.560   5.710   1.940  1.00  0.00            
+ATOM     57  C4    A     2      13.470   4.520   2.580  1.00  0.00            
+ATOM     58  C3'   A     2      13.660   4.790   6.980  1.00  0.00            
+ATOM     59  H3'   A     2      12.960   4.060   6.950  1.00  0.00            
+ATOM     60  C2'   A     2      13.760   5.840   5.850  1.00  0.00            
+ATOM     61 1H2'   A     2      12.940   6.250   5.450  1.00  0.00            
+ATOM     62  O2'   A     2      14.540   6.880   6.470  1.00  0.00            
+ATOM     63 2HO'   A     2      13.980   7.360   7.150  1.00  0.00            
+ATOM     64  O3'   A     2      13.300   5.500   8.180  1.00  0.00            
+ATOM     65  H3T   A     2      12.360   5.840   8.100  1.00  0.00            
+TER
+ENDMDL
diff --git a/regtest/basic/rt65/Makefile b/regtest/basic/rt65/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..3703b27cea227aa053fb6d1d73f861e4384dbcee
--- /dev/null
+++ b/regtest/basic/rt65/Makefile
@@ -0,0 +1 @@
+include ../../scripts/test.make
diff --git a/regtest/basic/rt65/TARGET.reference b/regtest/basic/rt65/TARGET.reference
new file mode 100644
index 0000000000000000000000000000000000000000..95aa8528f8135c610d275207b44d6300870a7fc6
--- /dev/null
+++ b/regtest/basic/rt65/TARGET.reference
@@ -0,0 +1,13 @@
+#! FIELDS time e1 sigma_e1 height biasf
+#! SET multivariate false
+#! SET min_e1 -pi
+#! SET max_e1 pi
+                  0.002     -1.903205551688819                   0.15  9.109750009330055e-06                      1
+                  0.004      -2.96314509497333                   0.15  1.185709146847737e-05                      1
+                  0.006     -2.563611512401551                   0.15  3.459682279642131e-06                      1
+                  0.008     -3.120966152841994                   0.15  2.627063088737373e-05                      1
+                   0.01     -2.640309461910613                   0.15  3.857669512410835e-06                      1
+                  0.012     -2.981598623047649                   0.15   1.18570902777194e-05                      1
+                  0.014      -1.30629526983071                   0.15  2.684491613922633e-05                      1
+                  0.016     -2.503951571795555                   0.15  3.399120208955464e-06                      1
+                  0.018     -3.014247277468189                   0.15  1.387646340580046e-05                      1
diff --git a/regtest/basic/rt65/config b/regtest/basic/rt65/config
new file mode 100644
index 0000000000000000000000000000000000000000..bd5f5c17c4a1d494bff6b6c49556ee0fb2c6f792
--- /dev/null
+++ b/regtest/basic/rt65/config
@@ -0,0 +1,2 @@
+type=driver
+arg="--plumed plumed.dat --trajectory-stride 1 --timestep 0.002 --ixyz trajectory.xyz"
diff --git a/regtest/basic/rt65/plumed.dat b/regtest/basic/rt65/plumed.dat
new file mode 100644
index 0000000000000000000000000000000000000000..6365032dc62d4fff4c1c610882c68db40f51791b
--- /dev/null
+++ b/regtest/basic/rt65/plumed.dat
@@ -0,0 +1,5 @@
+MOLINFO STRUCTURE=AA.pdb  MOLTYPE=rna
+
+e1: TORSION ATOMS=@epsilon-1
+
+t: METAD ARG=e1 SIGMA=0.15 PACE=1 TAU=200 DAMPFACTOR=100 TARGET=tgt_e1 GRID_MIN=-pi GRID_MAX=pi GRID_BIN=200 FILE=TARGET TEMP=300
diff --git a/regtest/basic/rt65/tgt_e1 b/regtest/basic/rt65/tgt_e1
new file mode 100644
index 0000000000000000000000000000000000000000..c625de5ce34191dc56db5127735f22c1e40acad2
--- /dev/null
+++ b/regtest/basic/rt65/tgt_e1
@@ -0,0 +1,205 @@
+#! FIELDS e1 t.target der_e1
+#! SET min_e1 -pi
+#! SET max_e1 pi
+#! SET nbins_e1  200
+#! SET periodic_e1 true
+ -3.14159 5.10076 -12.70892
+ -3.11018 4.70239 -12.67962
+ -3.07876 4.30407 -12.68027
+ -3.04734 3.90566 -12.65820
+ -3.01593 3.50873 -12.56070
+ -2.98451 3.11645 -12.34584
+ -2.95310 2.73302 -11.98905
+ -2.92168 2.36315 -11.48303
+ -2.89026 2.01152 -10.83462
+ -2.85885 1.68239 -10.06301
+ -2.82743 1.37924 -9.19536
+ -2.79602 1.10463 -8.25853
+ -2.76460 0.86035 -7.27827
+ -2.73319 0.64733 -6.27846
+ -2.70177 0.46586 -5.27871
+ -2.67035 0.31565 -4.29520
+ -2.63894 0.19598 -3.33913
+ -2.60752 0.10585 -2.41815
+ -2.57611 0.04405 -1.53788
+ -2.54469 0.00922 -0.70101
+ -2.51327 0.00000 0.09104
+ -2.48186 0.01494 0.83725
+ -2.45044 0.05261 1.53656
+ -2.41903 0.11149 2.18707
+ -2.38761 0.19002 2.78648
+ -2.35619 0.28657 3.33221
+ -2.32478 0.39939 3.82147
+ -2.29336 0.52667 4.25183
+ -2.26195 0.66654 4.62342
+ -2.23053 0.81717 4.93553
+ -2.19911 0.97665 5.18839
+ -2.16770 1.14317 5.38294
+ -2.13628 1.31487 5.51695
+ -2.10487 1.48981 5.58462
+ -2.07345 1.66576 5.57692
+ -2.04203 1.84022 5.48097
+ -2.01062 2.01014 5.28380
+ -1.97920 2.17221 4.97850
+ -1.94779 2.32295 4.56436
+ -1.91637 2.45899 4.05604
+ -1.88496 2.57780 3.48558
+ -1.85354 2.67800 2.90046
+ -1.82212 2.76004 2.35978
+ -1.79071 2.82627 1.92248
+ -1.75929 2.88083 1.64450
+ -1.72788 2.92960 1.56915
+ -1.69646 2.97943 1.71195
+ -1.66504 3.03716 2.07025
+ -1.63363 3.10950 2.62387
+ -1.60221 3.20203 3.33816
+ -1.57080 3.31925 4.17143
+ -1.53938 3.46413 5.07054
+ -1.50796 3.63784 5.97985
+ -1.47655 3.83984 6.84656
+ -1.44513 4.06801 7.62050
+ -1.41372 4.31866 8.25743
+ -1.38230 4.58685 8.73061
+ -1.35088 4.86722 9.03758
+ -1.31947 5.15470 9.20074
+ -1.28805 5.44532 9.27486
+ -1.25664 5.73745 9.33973
+ -1.22522 6.03215 9.47554
+ -1.19381 6.33282 9.75711
+ -1.16239 6.64521 10.22904
+ -1.13097 6.97553 10.90943
+ -1.09956 7.33067 11.77148
+ -1.06814 7.71514 12.75565
+ -1.03673 8.13212 13.79332
+ -1.00531 8.58181 14.77995
+ -0.97389 9.06078 15.61190
+ -0.94248 9.56273 16.16983
+ -0.91106 10.07676 16.32518
+ -0.87965 10.58848 16.02464
+ -0.84823 11.08362 15.28715
+ -0.81681 11.54900 14.17647
+ -0.78540 11.97436 12.86211
+ -0.75398 12.35715 11.57633
+ -0.72257 12.70172 10.55300
+ -0.69115 13.02022 9.88103
+ -0.65973 13.32257 9.50496
+ -0.62832 13.61742 9.40240
+ -0.59690 13.91333 9.43842
+ -0.56549 14.21046 9.44831
+ -0.53407 14.50698 9.34280
+ -0.50265 14.79749 9.04752
+ -0.47124 15.07546 8.52664
+ -0.43982 15.33323 7.74502
+ -0.40841 15.56209 6.80145
+ -0.37699 15.76058 5.80952
+ -0.34558 15.92712 4.90263
+ -0.31416 16.06862 4.15626
+ -0.28274 16.18826 3.55378
+ -0.25133 16.29191 2.98214
+ -0.21991 16.37564 2.29118
+ -0.18850 16.43587 1.31572
+ -0.15708 16.45830 -0.09998
+ -0.12566 16.42959 -1.82144
+ -0.09425 16.34386 -3.60730
+ -0.06283 16.20294 -5.18707
+ -0.03142 16.01795 -6.30761
+ 0.00000 15.80662 -6.83201
+ 0.03142 15.58868 -6.76985
+ 0.06283 15.38125 -6.23660
+ 0.09425 15.19682 -5.36388
+ 0.12566 15.04423 -4.28282
+ 0.15708 14.92772 -3.11155
+ 0.18850 14.84872 -1.93712
+ 0.21991 14.80601 -0.84119
+ 0.25133 14.79587 0.12393
+ 0.28274 14.81380 0.87287
+ 0.31416 14.85071 1.34688
+ 0.34558 14.89842 1.51588
+ 0.37699 14.94596 1.30661
+ 0.40841 14.98052 0.68187
+ 0.43982 14.98880 -0.32791
+ 0.47124 14.95992 -1.69025
+ 0.50265 14.88260 -3.29052
+ 0.53407 14.75317 -4.96947
+ 0.56549 14.57036 -6.58050
+ 0.59690 14.33970 -7.94522
+ 0.62832 14.07115 -8.95276
+ 0.65973 13.77719 -9.54424
+ 0.69115 13.47147 -9.71559
+ 0.72257 13.16674 -9.52214
+ 0.75398 12.87318 -9.04455
+ 0.78540 12.59845 -8.35746
+ 0.81681 12.34806 -7.53288
+ 0.84823 12.12515 -6.63737
+ 0.87965 11.93102 -5.70555
+ 0.91106 11.76666 -4.77067
+ 0.94248 11.63127 -3.85511
+ 0.97389 11.52443 -2.96150
+ 1.00531 11.44519 -2.10886
+ 1.03673 11.39193 -1.30973
+ 1.06814 11.36290 -0.57795
+ 1.09956 11.35562 0.06564
+ 1.13097 11.36702 0.60908
+ 1.16239 11.39388 1.04956
+ 1.19381 11.43297 1.39725
+ 1.22522 11.48168 1.67383
+ 1.25664 11.53814 1.91235
+ 1.28805 11.60183 2.15400
+ 1.31947 11.67348 2.43040
+ 1.35088 11.75454 2.78076
+ 1.38230 11.84820 3.21639
+ 1.41372 11.95663 3.73112
+ 1.44513 12.08264 4.31252
+ 1.47655 12.22760 4.93997
+ 1.50796 12.39302 5.59522
+ 1.53938 12.57915 6.22538
+ 1.57080 12.78417 6.81465
+ 1.60221 13.00733 7.31439
+ 1.63363 13.24375 7.69216
+ 1.66504 13.49064 7.92523
+ 1.69646 13.74171 7.96631
+ 1.72788 13.99118 7.82705
+ 1.75929 14.23350 7.52661
+ 1.79071 14.46409 7.07963
+ 1.82212 14.67832 6.56183
+ 1.85354 14.87639 6.02293
+ 1.88496 15.05676 5.48870
+ 1.91637 15.22125 4.97648
+ 1.94779 15.36943 4.47050
+ 1.97920 15.50214 3.93312
+ 2.01062 15.61656 3.25777
+ 2.04203 15.70683 2.41601
+ 2.07345 15.76836 1.35324
+ 2.10487 15.79186 0.07937
+ 2.13628 15.77335 -1.32078
+ 2.16770 15.70887 -2.78132
+ 2.19911 15.59859 -4.15591
+ 2.23053 15.44775 -5.35838
+ 2.26195 15.26192 -6.29576
+ 2.29336 15.05217 -6.91512
+ 2.32478 14.82742 -7.29068
+ 2.35619 14.59409 -7.47898
+ 2.38761 14.35751 -7.52943
+ 2.41903 14.12100 -7.53571
+ 2.45044 13.88403 -7.59844
+ 2.48186 13.64358 -7.73822
+ 2.51327 13.39782 -7.97864
+ 2.54469 13.14226 -8.41694
+ 2.57611 12.86897 -9.08704
+ 2.60752 12.57131 -9.95794
+ 2.63894 12.24329 -11.01270
+ 2.67035 11.87936 -12.16703
+ 2.70177 11.47881 -13.30580
+ 2.73319 11.04332 -14.32339
+ 2.76460 10.57884 -15.12139
+ 2.79602 10.09323 -15.64548
+ 2.82743 9.59582 -15.88386
+ 2.85885 9.09522 -15.86574
+ 2.89026 8.59895 -15.63372
+ 2.92168 8.11292 -15.24911
+ 2.95310 7.64082 -14.77260
+ 2.98451 7.18473 -14.26221
+ 3.01593 6.74469 -13.77231
+ 3.04734 6.31939 -13.34976
+ 3.07876 5.90590 -13.02634
+ 3.11018 5.50091 -12.81419
diff --git a/regtest/basic/rt65/trajectory.xyz b/regtest/basic/rt65/trajectory.xyz
new file mode 100644
index 0000000000000000000000000000000000000000..efd55fc9798338c494ff980f338bdc8bdf9748ad
--- /dev/null
+++ b/regtest/basic/rt65/trajectory.xyz
@@ -0,0 +1,670 @@
+65
+ generated by VMD
+  H5T       14.847420        1.772776        4.201096
+  O5'       14.896201        1.694230        4.226919
+  C5'       14.992702        1.713513        4.336967
+ 1H5'       15.043725        1.617648        4.346333
+ 2H5'       14.931581        1.744887        4.421588
+  C4'       15.109325        1.815422        4.313169
+  H4'       15.187816        1.806181        4.388236
+  O4'       15.167566        1.808275        4.180243
+  C1'       15.187524        1.937432        4.125737
+  H1'       15.288992        1.958157        4.091741
+  N9        15.113276        1.952933        3.995719
+  C8        15.022908        1.873954        3.928178
+  H8        14.982371        1.781079        3.965527
+  N7        14.971737        1.930705        3.820266
+  C5        15.029648        2.053877        3.819370
+  C6        15.025768        2.167929        3.734162
+  N6        14.939121        2.182113        3.633917
+  H61       14.914086        2.100017        3.580677
+  H62       14.937536        2.270351        3.584798
+  N1        15.103413        2.275954        3.754111
+  C2        15.182701        2.273724        3.855718
+  H2        15.259680        2.349052        3.863716
+  N3        15.187340        2.182850        3.952467
+  C4        15.120228        2.070090        3.922029
+  C3'       15.035816        1.958286        4.311849
+  H3'       14.939285        1.948002        4.262280
+  C2'       15.148665        2.036268        4.234992
+ 1H2'       15.110026        2.132472        4.201334
+  O2'       15.260274        2.057905        4.323319
+ 2HO'       15.230085        2.128216        4.381293
+  O3'       15.023906        1.998978        4.444931
+  P         14.882421        2.034298        4.503669
+  O1P       14.885519        2.029100        4.653514
+  O2P       14.778923        1.947499        4.435366
+  O5'       14.872825        2.189016        4.459605
+  C5'       14.940541        2.299417        4.518781
+ 1H5'       15.049508        2.297146        4.517267
+ 2H5'       14.923553        2.312758        4.625619
+  C4'       14.913382        2.437655        4.441618
+  H4'       14.960114        2.516044        4.501219
+  O4'       14.991676        2.446961        4.318547
+  C1'       14.894224        2.496199        4.223132
+  H1'       14.937422        2.573962        4.160140
+  N9        14.843146        2.395506        4.128749
+  C8        14.852216        2.257627        4.142356
+  H8        14.904563        2.207837        4.222633
+  N7        14.785973        2.194726        4.048380
+  C5        14.742547        2.293209        3.962341
+  C6        14.664241        2.302546        3.850103
+  N6        14.625215        2.203424        3.779305
+  H61       14.641452        2.107252        3.805538
+  H62       14.581784        2.236622        3.694378
+  N1        14.626487        2.422118        3.794508
+  C2        14.678065        2.530943        3.851064
+  H2        14.648037        2.626358        3.810341
+  N3        14.751561        2.538620        3.962151
+  C4        14.777925        2.415613        4.010852
+  C3'       14.765208        2.460481        4.413977
+  H3'       14.716369        2.371144        4.375062
+  C2'       14.776618        2.561126        4.300804
+ 1H2'       14.684395        2.563750        4.242760
+  O2'       14.824705        2.695651        4.325369
+ 2HO'       14.799959        2.754887        4.253991
+  O3'       14.698467        2.513825        4.521744
+  H3T       14.609154        2.545954        4.507367
+65
+ generated by VMD
+  H5T       14.797546        1.832525        4.381888
+  O5'       14.806289        1.752446        4.329672
+  C5'       14.901293        1.757977        4.228772
+ 1H5'       14.866888        1.819771        4.145833
+ 2H5'       14.918494        1.657468        4.190259
+  C4'       15.033164        1.817925        4.277514
+  H4'       15.069221        1.754881        4.358793
+  O4'       15.126022        1.812059        4.162668
+  C1'       15.183326        1.943732        4.138662
+  H1'       15.291444        1.942941        4.124839
+  N9        15.113312        2.005918        4.016412
+  C8        15.045125        1.944269        3.916965
+  H8        15.032977        1.837563        3.928374
+  N7        14.988688        2.016555        3.824484
+  C5        15.039663        2.141150        3.857107
+  C6        15.037766        2.266198        3.798166
+  N6        14.971131        2.300894        3.689602
+  H61       14.896909        2.239116        3.660015
+  H62       14.976900        2.397799        3.661726
+  N1        15.099638        2.373748        3.845444
+  C2        15.175701        2.353911        3.952780
+  H2        15.221227        2.444624        3.989691
+  N3        15.194207        2.237123        4.019596
+  C4        15.114303        2.136575        3.968732
+  C3'       15.035360        1.964626        4.320169
+  H3'       14.954812        2.020132        4.272085
+  C2'       15.161895        2.026635        4.264152
+ 1H2'       15.151196        2.132333        4.239770
+  O2'       15.274509        1.995500        4.336793
+ 2HO'       15.286169        2.077091        4.386017
+  O3'       15.036485        1.982491        4.461981
+  P         14.913964        2.036578        4.537579
+  O1P       14.941110        2.042828        4.677788
+  O2P       14.806670        1.942226        4.497838
+  O5'       14.878706        2.181873        4.480193
+  C5'       14.973777        2.282000        4.497470
+ 1H5'       15.074605        2.240984        4.491771
+ 2H5'       14.957283        2.328591        4.594621
+  C4'       14.960764        2.394531        4.390433
+  H4'       15.028895        2.476485        4.413289
+  O4'       14.989053        2.357359        4.260358
+  C1'       14.904777        2.425674        4.175336
+  H1'       14.968537        2.474184        4.101429
+  N9        14.813373        2.335255        4.105164
+  C8        14.774955        2.208137        4.129881
+  H8        14.794060        2.151025        4.219532
+  N7        14.713981        2.147487        4.031319
+  C5        14.708177        2.242913        3.936696
+  C6        14.646914        2.245469        3.808760
+  N6        14.571647        2.156780        3.749764
+  H61       14.542524        2.073044        3.798148
+  H62       14.520075        2.188365        3.668870
+  N1        14.641324        2.362669        3.741292
+  C2        14.701813        2.466049        3.798936
+  H2        14.706003        2.556145        3.739528
+  N3        14.766210        2.474183        3.914001
+  C4        14.754867        2.359369        3.985221
+  C3'       14.813626        2.453006        4.397973
+  H3'       14.738989        2.373738        4.392804
+  C2'       14.815772        2.521017        4.262980
+ 1H2'       14.713857        2.519253        4.224363
+  O2'       14.874209        2.646233        4.260289
+ 2HO'       14.833235        2.714067        4.314472
+  O3'       14.786455        2.538643        4.511051
+  H3T       14.695704        2.525192        4.539330
+65
+ generated by VMD
+  H5T       15.097142        1.668326        4.109947
+  O5'       15.046495        1.665248        4.191442
+  C5'       15.124683        1.723198        4.293117
+ 1H5'       15.228861        1.691790        4.299565
+ 2H5'       15.079405        1.705176        4.390616
+  C4'       15.129340        1.872468        4.275512
+  H4'       15.210347        1.907524        4.339464
+  O4'       15.177201        1.902492        4.148199
+  C1'       15.117216        2.023489        4.100878
+  H1'       15.194286        2.099443        4.087759
+  N9        15.043651        2.020701        3.973052
+  C8        14.951991        1.931444        3.935999
+  H8        14.914091        1.855570        4.002862
+  N7        14.922978        1.937139        3.812580
+  C5        15.002272        2.045211        3.763760
+  C6        15.012959        2.114496        3.641779
+  N6        14.968732        2.067341        3.526960
+  H61       14.917503        1.980394        3.531065
+  H62       15.008677        2.109936        3.444552
+  N1        15.095745        2.211320        3.623265
+  C2        15.165342        2.253631        3.734057
+  H2        15.232390        2.335036        3.710782
+  N3        15.165565        2.198909        3.857847
+  C4        15.074628        2.093823        3.866636
+  C3'       15.002029        1.945333        4.306546
+  H3'       14.916491        1.879201        4.292741
+  C2'       15.017333        2.071405        4.210642
+ 1H2'       14.918507        2.098437        4.173444
+  O2'       15.087488        2.175789        4.271819
+ 2HO'       15.026233        2.213833        4.335194
+  O3'       14.990348        1.980879        4.446596
+  P         14.857903        2.037183        4.513322
+  O1P       14.877825        2.029214        4.656148
+  O2P       14.744095        1.961902        4.460282
+  O5'       14.852118        2.185948        4.469699
+  C5'       14.933087        2.285940        4.539681
+ 1H5'       15.038389        2.257839        4.541404
+ 2H5'       14.902606        2.281691        4.644247
+  C4'       14.931292        2.435342        4.486126
+  H4'       14.989725        2.502450        4.549078
+  O4'       14.968204        2.445903        4.352262
+  C1'       14.856408        2.502898        4.277845
+  H1'       14.897898        2.580996        4.214125
+  N9        14.795577        2.400006        4.190372
+  C8        14.714306        2.291367        4.222795
+  H8        14.663243        2.271842        4.315938
+  N7        14.668910        2.232421        4.112843
+  C5        14.746554        2.287563        4.010130
+  C6        14.754593        2.257437        3.872097
+  N6        14.685724        2.157914        3.813751
+  H61       14.630732        2.100684        3.876213
+  H62       14.707831        2.134947        3.717914
+  N1        14.837722        2.329980        3.800696
+  C2        14.913857        2.422629        3.854674
+  H2        14.974173        2.482054        3.787631
+  N3        14.916680        2.462957        3.981540
+  C4        14.825435        2.390300        4.058403
+  C3'       14.781671        2.480878        4.496495
+  H3'       14.724883        2.388146        4.488949
+  C2'       14.764706        2.572741        4.378519
+ 1H2'       14.664557        2.589872        4.339051
+  O2'       14.821342        2.697761        4.413458
+ 2HO'       14.797153        2.759951        4.344442
+  O3'       14.746025        2.547402        4.618172
+  H3T       14.761264        2.641381        4.605847
+65
+ generated by VMD
+  H5T       14.896517        1.722039        4.256091
+  O5'       14.974108        1.684743        4.213608
+  C5'       15.073586        1.678005        4.311006
+ 1H5'       15.153643        1.606393        4.292474
+ 2H5'       15.034431        1.655961        4.410314
+  C4'       15.144793        1.821773        4.321304
+  H4'       15.203806        1.826254        4.412838
+  O4'       15.215021        1.849773        4.200249
+  C1'       15.197684        1.988282        4.162799
+  H1'       15.298632        2.026277        4.147079
+  N9        15.109707        1.993706        4.034468
+  C8        15.059310        1.889387        3.965481
+  H8        15.045170        1.791467        4.008790
+  N7        15.010215        1.916534        3.847403
+  C5        15.035903        2.048973        3.829019
+  C6        15.011794        2.141888        3.726766
+  N6        14.947181        2.125660        3.618889
+  H61       14.891392        2.042211        3.607724
+  H62       14.907223        2.203487        3.568419
+  N1        15.041734        2.272362        3.741960
+  C2        15.094986        2.307462        3.863274
+  H2        15.129761        2.407641        3.883739
+  N3        15.128124        2.226311        3.965864
+  C4        15.096125        2.097537        3.941687
+  C3'       15.044984        1.935202        4.337620
+  H3'       14.957216        1.921462        4.274460
+  C2'       15.120823        2.054060        4.281669
+ 1H2'       15.057999        2.132215        4.238936
+  O2'       15.215878        2.101759        4.370345
+ 2HO'       15.190849        2.057137        4.451575
+  O3'       15.012177        1.959000        4.471820
+  P         14.865489        2.009048        4.514957
+  O1P       14.856316        2.014251        4.664734
+  O2P       14.761842        1.937513        4.440003
+  O5'       14.857528        2.160836        4.464005
+  C5'       14.936448        2.263116        4.522058
+ 1H5'       15.043159        2.254232        4.542422
+ 2H5'       14.909718        2.284791        4.625481
+  C4'       14.933058        2.391823        4.437636
+  H4'       15.002481        2.461632        4.484410
+  O4'       14.965080        2.359071        4.306986
+  C1'       14.914851        2.449100        4.208991
+  H1'       14.991192        2.515312        4.168135
+  N9        14.842881        2.366482        4.105685
+  C8        14.784214        2.243840        4.124059
+  H8        14.818416        2.176592        4.201338
+  N7        14.701168        2.202450        4.028914
+  C5        14.690821        2.318115        3.943541
+  C6        14.620112        2.362557        3.829230
+  N6        14.531917        2.297515        3.759082
+  H61       14.494678        2.206164        3.780740
+  H62       14.508058        2.343673        3.672473
+  N1        14.654975        2.467176        3.760733
+  C2        14.757774        2.538561        3.802464
+  H2        14.788578        2.614861        3.732509
+  N3        14.823767        2.523345        3.918492
+  C4        14.781515        2.410989        3.989729
+  C3'       14.798530        2.466371        4.421867
+  H3'       14.713267        2.398849        4.414640
+  C2'       14.820069        2.541733        4.291849
+ 1H2'       14.721073        2.533303        4.247020
+  O2'       14.879740        2.668829        4.295744
+ 2HO'       14.853123        2.709278        4.212851
+  O3'       14.777799        2.564872        4.524009
+  H3T       14.774690        2.652156        4.484158
+65
+ generated by VMD
+  H5T       15.206599        1.572029        4.334027
+  O5'       15.143420        1.613157        4.393466
+  C5'       15.204591        1.721485        4.460211
+ 1H5'       15.302390        1.686471        4.493228
+ 2H5'       15.145994        1.749638        4.547704
+  C4'       15.222277        1.839862        4.366666
+  H4'       15.296083        1.905875        4.412229
+  O4'       15.274338        1.807258        4.236629
+  C1'       15.236064        1.914908        4.149570
+  H1'       15.330606        1.960101        4.119559
+  N9        15.180844        1.859583        4.024785
+  C8        15.061384        1.800116        3.998049
+  H8        14.979607        1.784122        4.066754
+  N7        15.068254        1.739242        3.884995
+  C5        15.188682        1.768900        3.825654
+  C6        15.254227        1.730517        3.714267
+  N6        15.197144        1.668724        3.611931
+  H61       15.097814        1.651520        3.618143
+  H62       15.253335        1.657661        3.528737
+  N1        15.380466        1.758555        3.696045
+  C2        15.438334        1.837651        3.782535
+  H2        15.538693        1.860716        3.749975
+  N3        15.386237        1.887903        3.893170
+  C4        15.259776        1.848152        3.907741
+  C3'       15.095292        1.915299        4.334499
+  H3'       15.010189        1.854632        4.303551
+  C2'       15.135535        2.008978        4.223885
+ 1H2'       15.057730        2.028799        4.150165
+  O2'       15.213609        2.121090        4.267405
+ 2HO'       15.162492        2.179283        4.324121
+  O3'       15.074284        1.989543        4.453595
+  P         14.952164        2.082008        4.470041
+  O1P       14.937928        2.104555        4.612800
+  O2P       14.839347        2.013339        4.405234
+  O5'       14.979296        2.222763        4.399913
+  C5'       14.892916        2.333297        4.425739
+ 1H5'       14.942754        2.425604        4.396129
+ 2H5'       14.868936        2.344323        4.531496
+  C4'       14.762017        2.328242        4.346614
+  H4'       14.687031        2.274320        4.404495
+  O4'       14.715004        2.463836        4.325877
+  C1'       14.642533        2.468390        4.202215
+  H1'       14.537689        2.456767        4.229668
+  N9        14.660139        2.595785        4.130150
+  C8        14.765982        2.642884        4.050838
+  H8        14.863976        2.597805        4.045431
+  N7        14.742874        2.750244        3.983194
+  C5        14.609942        2.788411        4.021593
+  C6        14.530947        2.904861        4.004297
+  N6        14.577147        3.017766        3.945367
+  H61       14.671611        3.011784        3.910125
+  H62       14.516243        3.094875        3.922008
+  N1        14.402965        2.905596        4.044899
+  C2        14.356116        2.800324        4.114378
+  H2        14.253911        2.798593        4.149237
+  N3        14.422529        2.684535        4.147936
+  C4        14.554123        2.689645        4.097775
+  C3'       14.759785        2.258157        4.211955
+  H3'       14.862022        2.244558        4.176696
+  C2'       14.688461        2.351718        4.114935
+ 1H2'       14.758523        2.390571        4.041025
+  O2'       14.583876        2.283096        4.050872
+ 2HO'       14.579090        2.200263        4.099160
+  O3'       14.699176        2.129625        4.228271
+  H3T       14.759976        2.086417        4.288707
+65
+ generated by VMD
+  H5T       14.837190        1.731764        4.112862
+  O5'       14.835567        1.749919        4.207116
+  C5'       14.955059        1.709752        4.264459
+ 1H5'       14.994122        1.636536        4.193788
+ 2H5'       14.941088        1.676506        4.367320
+  C4'       15.058025        1.817933        4.264746
+  H4'       15.131132        1.773256        4.332129
+  O4'       15.109529        1.845337        4.136002
+  C1'       15.138430        1.984277        4.128410
+  H1'       15.244407        1.999515        4.107976
+  N9        15.070132        2.041045        4.008841
+  C8        14.990027        1.973175        3.921563
+  H8        14.971063        1.869021        3.942928
+  N7        14.973977        2.028135        3.807398
+  C5        15.029313        2.159239        3.827452
+  C6        15.046778        2.267987        3.746533
+  N6        15.005386        2.286913        3.620339
+  H61       14.965146        2.209908        3.568843
+  H62       15.012429        2.378862        3.579145
+  N1        15.107679        2.380051        3.796883
+  C2        15.159229        2.371572        3.920658
+  H2        15.206707        2.461028        3.958174
+  N3        15.163549        2.271095        4.005847
+  C4        15.083906        2.169787        3.953650
+  C3'       15.001925        1.947165        4.323164
+  H3'       14.907749        1.969282        4.272937
+  C2'       15.103126        2.048822        4.258628
+ 1H2'       15.061421        2.148594        4.244948
+  O2'       15.219626        2.062929        4.344001
+ 2HO'       15.204728        1.999661        4.414649
+  O3'       14.985111        1.958341        4.462287
+  P         14.870660        2.053093        4.523168
+  O1P       14.890329        2.040473        4.667353
+  O2P       14.740082        2.006428        4.468818
+  O5'       14.879867        2.213993        4.478765
+  C5'       15.000712        2.295078        4.505804
+ 1H5'       15.093207        2.240039        4.488584
+ 2H5'       15.007841        2.320825        4.611480
+  C4'       15.016975        2.416266        4.416747
+  H4'       15.119471        2.451387        4.428679
+  O4'       15.012310        2.368953        4.283223
+  C1'       14.885431        2.408447        4.231414
+  H1'       14.909578        2.486356        4.159110
+  N9        14.828690        2.302046        4.146806
+  C8        14.770106        2.182823        4.181708
+  H8        14.763678        2.144738        4.282567
+  N7        14.724207        2.113139        4.078411
+  C5        14.738846        2.199121        3.968082
+  C6        14.685157        2.199994        3.839454
+  N6        14.641347        2.088488        3.783013
+  H61       14.630588        2.003700        3.836828
+  H62       14.617175        2.091475        3.684993
+  N1        14.685962        2.312217        3.769986
+  C2        14.738795        2.421787        3.819924
+  H2        14.743470        2.510610        3.758665
+  N3        14.788108        2.434139        3.945859
+  C4        14.788066        2.318882        4.016373
+  C3'       14.902862        2.517875        4.438258
+  H3'       14.862687        2.518752        4.539581
+  C2'       14.798081        2.468604        4.336346
+ 1H2'       14.725121        2.396255        4.372725
+  O2'       14.722972        2.571498        4.275081
+ 2HO'       14.634643        2.551233        4.306754
+  O3'       14.944068        2.643189        4.387441
+  H3T       14.875720        2.667839        4.324697
+65
+ generated by VMD
+  H5T       15.025169        1.552304        4.239150
+  O5'       15.015182        1.647334        4.248394
+  C5'       15.141973        1.709659        4.291545
+ 1H5'       15.226773        1.672057        4.234307
+ 2H5'       15.151942        1.692474        4.398718
+  C4'       15.149358        1.861311        4.269546
+  H4'       15.244267        1.898158        4.308477
+  O4'       15.161581        1.876086        4.133709
+  C1'       15.109478        2.011050        4.104973
+  H1'       15.190284        2.083412        4.115696
+  N9        15.053378        2.010486        3.974542
+  C8        14.949608        1.939907        3.926584
+  H8        14.888913        1.876979        3.989987
+  N7        14.924644        1.959011        3.802988
+  C5        15.020497        2.055530        3.767023
+  C6        15.033942        2.135434        3.652987
+  N6        14.963105        2.128827        3.540459
+  H61       14.895276        2.054775        3.529661
+  H62       14.982648        2.205026        3.477112
+  N1        15.124428        2.225896        3.646025
+  C2        15.208601        2.243128        3.748986
+  H2        15.282596        2.319057        3.728404
+  N3        15.205704        2.173504        3.865497
+  C4        15.105607        2.076787        3.868620
+  C3'       15.021268        1.940332        4.306942
+  H3'       14.940172        1.869729        4.289064
+  C2'       15.014707        2.060263        4.211948
+ 1H2'       14.914454        2.080501        4.174254
+  O2'       15.075030        2.181257        4.269489
+ 2HO'       15.030182        2.202334        4.351711
+  O3'       15.009482        1.974985        4.448845
+  P         14.874564        2.036387        4.506008
+  O1P       14.874916        2.025261        4.649406
+  O2P       14.763107        1.972056        4.434628
+  O5'       14.872576        2.186512        4.459989
+  C5'       14.934460        2.285841        4.539440
+ 1H5'       15.041437        2.264965        4.540383
+ 2H5'       14.903284        2.279682        4.643705
+  C4'       14.913041        2.433059        4.492862
+  H4'       14.969533        2.501193        4.556480
+  O4'       14.958048        2.439136        4.359442
+  C1'       14.860572        2.499717        4.269575
+  H1'       14.896607        2.576455        4.201065
+  N9        14.789545        2.409343        4.182176
+  C8        14.689377        2.324483        4.217869
+  H8        14.648006        2.309146        4.316444
+  N7        14.635345        2.260870        4.122424
+  C5        14.720469        2.293509        4.015628
+  C6        14.723079        2.253921        3.878083
+  N6        14.635610        2.177386        3.811405
+  H61       14.557884        2.135336        3.860309
+  H62       14.657583        2.147101        3.717592
+  N1        14.824955        2.302587        3.803629
+  C2        14.915488        2.387402        3.852718
+  H2        14.992708        2.423151        3.786211
+  N3        14.912703        2.441381        3.973917
+  C4        14.813445        2.387689        4.050976
+  C3'       14.766768        2.486320        4.491781
+  H3'       14.692776        2.406437        4.486772
+  C2'       14.760048        2.572060        4.368409
+ 1H2'       14.659533        2.583792        4.327912
+  O2'       14.821878        2.698505        4.390593
+ 2HO'       14.758530        2.751030        4.440028
+  O3'       14.742419        2.565792        4.609941
+  H3T       14.715851        2.655730        4.589423
+65
+ generated by VMD
+  H5T       15.179457        1.741104        3.900982
+  O5'       15.103093        1.777243        3.855390
+  C5'       14.987530        1.777926        3.937434
+ 1H5'       14.967832        1.672874        3.958816
+ 2H5'       14.901471        1.825049        3.889954
+  C4'       15.013573        1.864984        4.067422
+  H4'       14.926151        1.861565        4.132435
+  O4'       15.123556        1.807494        4.143685
+  C1'       15.224981        1.903663        4.148447
+  H1'       15.223179        1.944826        4.249360
+  N9        15.361039        1.850121        4.127141
+  C8        15.475318        1.881727        4.191302
+  H8        15.491652        1.957671        4.266335
+  N7        15.580411        1.809070        4.159493
+  C5        15.525576        1.719523        4.067212
+  C6        15.583897        1.619220        3.989246
+  N6        15.711239        1.582531        3.994646
+  H61       15.773693        1.628168        4.059591
+  H62       15.740669        1.513267        3.927286
+  N1        15.511261        1.544478        3.904103
+  C2        15.380339        1.574850        3.909242
+  H2        15.319757        1.529302        3.832304
+  N3        15.313910        1.665238        3.979821
+  C4        15.392190        1.739197        4.052656
+  C3'       15.048454        2.010539        4.040118
+  H3'       15.015978        2.034432        3.938848
+  C2'       15.203575        2.009545        4.043818
+ 1H2'       15.250134        1.981461        3.949349
+  O2'       15.259817        2.131708        4.089610
+ 2HO'       15.228335        2.194728        4.024391
+  O3'       14.996343        2.096307        4.138817
+  P         14.842931        2.132808        4.145224
+  O1P       14.781137        2.038660        4.245789
+  O2P       14.795316        2.136425        4.007238
+  O5'       14.834799        2.286782        4.197085
+  C5'       14.907822        2.391318        4.137965
+ 1H5'       14.892282        2.382674        4.030425
+ 2H5'       15.012506        2.374146        4.163009
+  C4'       14.869848        2.528366        4.193048
+  H4'       14.926466        2.612052        4.152155
+  O4'       14.729959        2.550426        4.171509
+  C1'       14.672099        2.600095        4.289897
+  H1'       14.700438        2.705135        4.296573
+  N9        14.528367        2.588774        4.285998
+  C8        14.441617        2.523839        4.366195
+  H8        14.480587        2.475313        4.454459
+  N7        14.320478        2.508995        4.321265
+  C5        14.317728        2.592256        4.210869
+  C6        14.226738        2.642013        4.115617
+  N6        14.100051        2.607729        4.115785
+  H61       14.057688        2.563380        4.196032
+  H62       14.040026        2.665291        4.058475
+  N1        14.257448        2.722378        4.012711
+  C2        14.384749        2.753088        4.002584
+  H2        14.405425        2.821341        3.921478
+  N3        14.485984        2.723563        4.081551
+  C4        14.445708        2.637608        4.183837
+  C3'       14.897259        2.523561        4.344881
+  H3'       14.964549        2.443965        4.376780
+  C2'       14.751207        2.530166        4.402293
+ 1H2'       14.724064        2.428333        4.430120
+  O2'       14.738544        2.621317        4.511948
+ 2HO'       14.752827        2.572472        4.593349
+  O3'       14.964114        2.644466        4.377091
+  H3T       14.943527        2.658123        4.469858
+65
+ generated by VMD
+  H5T       14.654305        2.132374        4.221466
+  O5'       14.740575        2.107329        4.187611
+  C5'       14.770223        2.001941        4.275386
+ 1H5'       14.682770        1.936934        4.272764
+ 2H5'       14.775343        2.049791        4.373187
+  C4'       14.906325        1.926109        4.267467
+  H4'       14.921304        1.854206        4.348008
+  O4'       14.918378        1.855741        4.144278
+  C1'       15.050409        1.869009        4.097060
+  H1'       15.093917        1.772598        4.070735
+  N9        15.063756        1.958524        3.977970
+  C8        15.001014        2.077048        3.948780
+  H8        14.921307        2.113573        4.011842
+  N7        15.042750        2.133699        3.837873
+  C5        15.134868        2.046790        3.789779
+  C6        15.219357        2.042315        3.676820
+  N6        15.225701        2.128914        3.585231
+  H61       15.156734        2.202191        3.576563
+  H62       15.292335        2.109260        3.511917
+  N1        15.294324        1.944477        3.643432
+  C2        15.297135        1.842909        3.730301
+  H2        15.361707        1.759751        3.706231
+  N3        15.216388        1.823233        3.833624
+  C4        15.141820        1.932562        3.862231
+  C3'       15.032075        2.018549        4.281206
+  H3'       15.013633        2.109337        4.223773
+  C2'       15.133450        1.922012        4.208516
+ 1H2'       15.215766        1.981275        4.168603
+  O2'       15.173163        1.811780        4.291770
+ 2HO'       15.207495        1.851636        4.372073
+  O3'       15.071229        2.043559        4.415228
+  P         15.125263        2.184964        4.445612
+  O1P       15.223106        2.221821        4.341271
+  O2P       15.170097        2.177119        4.584393
+  O5'       15.002633        2.280640        4.437276
+  C5'       14.952951        2.352589        4.547473
+ 1H5'       15.029819        2.424535        4.575695
+ 2H5'       14.940403        2.287287        4.633840
+  C4'       14.815311        2.421970        4.523385
+  H4'       14.772344        2.452369        4.618835
+  O4'       14.831726        2.525931        4.423570
+  C1'       14.726851        2.517488        4.325172
+  H1'       14.695997        2.613786        4.284483
+  N9        14.776684        2.453874        4.202466
+  C8        14.905722        2.422574        4.163041
+  H8        14.991037        2.433325        4.228383
+  N7        14.913260        2.383031        4.039737
+  C5        14.784368        2.388160        3.988065
+  C6        14.729931        2.366491        3.860029
+  N6        14.796456        2.313958        3.757921
+  H61       14.894579        2.291887        3.767181
+  H62       14.755410        2.300120        3.666680
+  N1        14.598265        2.362259        3.847806
+  C2        14.523367        2.396977        3.956361
+  H2        14.417350        2.381791        3.942446
+  N3        14.564178        2.428225        4.081981
+  C4        14.701248        2.426260        4.089754
+  C3'       14.712963        2.332113        4.464497
+  H3'       14.761755        2.268072        4.391019
+  C2'       14.620694        2.431362        4.389987
+ 1H2'       14.553188        2.380996        4.320798
+  O2'       14.547445        2.516590        4.477498
+ 2HO'       14.519221        2.460116        4.549816
+  O3'       14.634567        2.259115        4.558148
+  H3T       14.678012        2.177521        4.584057
+65
+ generated by VMD
+  H5T       15.139706        1.691906        4.119927
+  O5'       15.097111        1.703936        4.205115
+  C5'       15.182992        1.776412        4.292105
+ 1H5'       15.282006        1.732136        4.281291
+ 2H5'       15.149679        1.767901        4.395540
+  C4'       15.173301        1.924599        4.260819
+  H4'       15.238002        1.964472        4.338953
+  O4'       15.213679        1.971138        4.134460
+  C1'       15.118810        2.055132        4.090411
+  H1'       15.163068        2.148366        4.055342
+  N9        15.047133        1.996337        3.972012
+  C8        14.963219        1.891788        3.962123
+  H8        14.930859        1.835465        4.048404
+  N7        14.912887        1.878921        3.839806
+  C5        14.975141        1.974246        3.759592
+  C6        14.970608        2.020350        3.629150
+  N6        14.887928        1.976898        3.538360
+  H61       14.811019        1.915833        3.561963
+  H62       14.880107        2.045012        3.464197
+  N1        15.039798        2.126328        3.587474
+  C2        15.129487        2.187153        3.672333
+  H2        15.185096        2.274561        3.641810
+  N3        15.133538        2.158725        3.803795
+  C4        15.056976        2.051141        3.842332
+  C3'       15.024463        1.968347        4.283900
+  H3'       14.957707        1.886816        4.256019
+  C2'       15.016562        2.087369        4.195502
+ 1H2'       14.919801        2.080824        4.145750
+  O2'       15.053941        2.213798        4.260862
+ 2HO'       14.991995        2.230488        4.332276
+  O3'       14.996555        1.997096        4.419310
+  P         14.848988        2.026661        4.468812
+  O1P       14.836275        1.998193        4.610259
+  O2P       14.759477        1.970746        4.369563
+  O5'       14.839829        2.182147        4.456972
+  C5'       14.883589        2.262748        4.560537
+ 1H5'       14.984632        2.226203        4.578864
+ 2H5'       14.829399        2.234568        4.650816
+  C4'       14.886683        2.416690        4.528798
+  H4'       14.948374        2.475328        4.596890
+  O4'       14.945946        2.432028        4.398474
+  C1'       14.859962        2.510331        4.322206
+  H1'       14.907434        2.588217        4.262530
+  N9        14.813416        2.427613        4.213445
+  C8        14.712191        2.333448        4.227327
+  H8        14.662790        2.314531        4.321485
+  N7        14.679394        2.270390        4.110848
+  C5        14.779932        2.319226        4.024812
+  C6        14.808763        2.289957        3.889882
+  N6        14.743375        2.197997        3.810560
+  H61       14.666928        2.145288        3.850301
+  H62       14.783525        2.182019        3.719271
+  N1        14.899675        2.363994        3.823681
+  C2        14.970441        2.454097        3.887055
+  H2        15.029579        2.520691        3.825966
+  N3        14.955811        2.482905        4.016510
+  C4        14.860398        2.414614        4.081480
+  C3'       14.742235        2.481310        4.524706
+  H3'       14.664103        2.407829        4.505290
+  C2'       14.754819        2.576566        4.405489
+ 1H2'       14.659711        2.576680        4.352241
+  O2'       14.789527        2.710984        4.440497
+ 2HO'       14.723193        2.741502        4.502822
+  O3'       14.725138        2.567655        4.638644
+  H3T       14.673618        2.525369        4.707736
diff --git a/scripts/vim2html.sh b/scripts/vim2html.sh
index e8e891bf2a0d0f9cf8010be40db67135a94008c6..d39c2c23aec3a8e5d909e4cf2c474158e26dd290 100755
--- a/scripts/vim2html.sh
+++ b/scripts/vim2html.sh
@@ -18,7 +18,10 @@ Usage:
 Options can be:
 --annotate-syntax Reports annotated syntax on output.
                   Also reports non-annotated regions on stderr
-
+--pdf             Produce a pdf file with syntax highlighting.
+--crop            Crop the pdf file to the written part only. Useful to insert the pdf in a LaTeX file as an image.
+--fs              Specify the font size of the pdf output.
+--colors          Specify the color palette. Allowed values are: default/ac
 EOF
   exit 0
 fi
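+# Illustrative examples (assuming the input and output files are passed as the two
+# positional arguments handled in the option loop below):
+#   ./vim2html.sh plumed.dat plumed.html
+#   ./vim2html.sh --pdf --crop --fs=8 plumed.dat plumed.pdf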
@@ -31,12 +34,30 @@ fi
 annotate_syntax=no
 inputfile=""
 outputfile=""
-
+pdf=no
+crop=no
+fontsize=7.5
+prefix=""
+colors=""
 for opt
 do
-  case "$opt" in
+  prefixopt="$prefix$opt"
+  prefix=""
+  case "$prefixopt" in
     (--annotate-syntax)
       annotate_syntax=yes ;;
+    (--pdf)
+      pdf=yes;;
+    (--crop)
+      crop=yes;;
+    (--fs)
+      prefix="--fs=";;
+    (--fs=*)
+      fontsize="${prefixopt#--fs=}";;
+    (--colors)
+      prefix="--colors=";;
+    (--colors=*)
+      colors="${prefixopt#--colors=}";;
     (-*)
       echo "ERROR: Unknown option $opt. Use -h for help."
       exit 1 ;;
@@ -51,7 +72,6 @@ do
       fi
   esac
 done
-
 temp="$( mktemp -dt plumed.XXXXX)"
 
 {
@@ -64,45 +84,95 @@ fi | tr '\r' ' '
 } | {
 cd $temp
 {
-if [ "$annotate_syntax" = "yes" ]; then
 
-vim - -c ":syntax off" \
+case "$colors" in
+(ac)
+  cat > $temp/colors.vim << EOF
+: set bg=dark
+: hi Statement  ctermfg=5 cterm=bold guifg=#ffff60 gui=none
+: highlight Type ctermfg=2 cterm=bold
+: hi Comment  guifg=#80a0ff ctermfg=darkblue
+: hi Constant ctermfg=red guifg=#ffa0a0 cterm=bold
+EOF
+;;
+(*)
+  echo > $temp/colors.vim
+;;
+esac
+
+if [ "$pdf" = "yes" ]; then
+
+# this is a workaround for a bug in TOhtml
+# that does not color properly when there are empty lines
+  sed "s/^$/ /" |
+  vim - -c ":syntax off" \
+        -c ":source $VIMFILE" \
+        -c ":source $temp/colors.vim" \
+        -c ":set printoptions=header:0" \
+        -c ":set printfont=courier:h$fontsize" \
+        -c ":hardcopy > out.ps" \
+        -c ":q!" 1> /dev/null 2> /dev/null
+else
+  if [ "$annotate_syntax" = "yes" ]; then
+
+    vim - -c ":syntax off" \
       -c ":source $VIMFILE" \
       -c ":call PlumedAnnotateSyntax()" \
       -c ":wq! syntax.log" -c ":q!" 1> /dev/null 2> /dev/null
 
-else
-# this is a workaround for a bug in TOhtml
-# that does not color properly when there are empty lines
-sed "s/^$/ /" |
-vim - -c ":syntax off" \
+  else
+    # this is a workaround for a bug in TOhtml
+    # that does not color properly when there are empty lines
+    sed "s/^$/ /" |
+    vim - -c ":syntax off" \
       -c ":source $VIMFILE" \
+      -c ":source $temp/colors.vim" \
       -c ":let g:html_use_css = 0" \
       -c ":let g:html_no_progress" \
       -c ":let g:html_number_lines = 1" \
       -c ":TOhtml" \
       -c ":wq! out.html" -c ":q!" 1> /dev/null 2> /dev/null
+  fi 
 fi
+
+
 }
 }
 
-if [ "$annotate_syntax" = "yes" ] ; then
+if [ "$pdf" = "yes" ]; then
+  ps2pdf $temp/out.ps $temp/out.pdf
+  output="$temp/out.pdf"
+  if [ "$crop" = "yes" ]; then
+    pdfcrop $output $temp/outcrop.pdf
+    output="$temp/outcrop.pdf"
+  fi
+else
+  if [ "$annotate_syntax" = "yes" ] ; then
+    output="$(awk '{if($1=="ANNOTATION") print}' $temp/syntax.log 2>/dev/null)"
+    error="$(awk '{if($1=="ERROR") print}' $temp/syntax.log >&2 2>/dev/null)"
+  else
+    output="$(cat $temp/out.html | sed 's|^<title>.*</title>$||' | sed 's|^<meta.*$||')"
+  fi
+fi
 
-output="$(awk '{if($1=="ANNOTATION") print}' $temp/syntax.log 2>/dev/null)"
-error="$(awk '{if($1=="ERROR") print}' $temp/syntax.log >&2 2>/dev/null)"
 
+if [ "$pdf" = "yes" ]; then
+  if [ -n "$outputfile" ] ; then
+    cp $output $outputfile
+  else
+    echo "You should specify an output file if using pdf opton!"
+    exit 1;
+  fi
 else
 
-output="$(cat $temp/out.html | sed 's|^<title>.*</title>$||' | sed 's|^<meta.*$||')"
+  if [ -n "$outputfile" ] ; then
+    echo "$output" > "$outputfile"
+  else
+    echo "$output"
+  fi
 
 fi
 
-if [ -n "$outputfile" ] ; then
-  echo "$output" > "$outputfile"
-else
-  echo "$output"
-fi
-
 echo "$error" >&2
 
 if [ -n "$error" ] ; then
diff --git a/src/analysis/PCA.cpp b/src/analysis/PCA.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..624d2609164fa2ef6e80ad580350bdb349554d5f
--- /dev/null
+++ b/src/analysis/PCA.cpp
@@ -0,0 +1,237 @@
+/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+   Copyright (c) 2012-2015 The plumed team
+   (see the PEOPLE file at the root of the distribution for a list of names)
+
+   See http://www.plumed-code.org for more information.
+
+   This file is part of plumed, version 2.
+
+   plumed is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   plumed is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with plumed.  If not, see <http://www.gnu.org/licenses/>.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
+#include "Analysis.h"
+#include "tools/Matrix.h"
+#include "reference/Direction.h"
+#include "reference/MetricRegister.h"
+#include "reference/ReferenceConfiguration.h"
+#include "reference/ReferenceValuePack.h"
+#include "core/ActionRegister.h"
+
+//+PLUMEDOC ANALYSIS PCA
+/* 
+Perform principal component analysis (PCA) using either the positions of the atoms or a large number of collective variables as input.
+
+Principal component analysis is a statistical technique that uses an orthogonal transformation to convert a set of observations of 
+possibly correlated variables into a set of linearly uncorrelated variables.  You can read more about the specifics of this technique
+here: https://en.wikipedia.org/wiki/Principal_component_analysis
+
+When used in the context of molecular dynamics simulations, a set of frames taken from the trajectory, \f$\{X_i\}\f$, or the values of 
+a number of collective variables which are calculated from the trajectory frames are used as input.  In this second instance your 
+input to the PCA analysis algorithm is thus a set of high-dimensional vectors of collective variables.  Whether the
+collective variables are calculated from the positions of the atoms or the positions are used directly, the assumption is that 
+this input trajectory is a set of possibly correlated (high-dimensional) vectors.  After principal component analysis has been 
+performed the output is a set of orthogonal vectors that describe the directions in which the largest motions have been seen.  
+In other words, principal component analysis provides a method for lowering the dimensionality of the data contained in a trajectory.
+These output directions are some linear combination of the \f$x\f$, \f$y\f$ and \f$z\f$ positions if the positions were used as input 
+or some linear combination of the input collective variables if a high-dimensional vector of collective variables was used as input.
+
+As explained on the Wikipedia page you must calculate the average and covariance for each of the input coordinates.  In other words, you must 
+calculate the average structure and the amount the system fluctuates around this average structure.  The problem in doing so when the 
+\f$x\f$, \f$y\f$ and \f$z\f$ coordinates of a molecule are used as input is that the majority of the changes in the positions of the 
+atoms comes from the translational and rotational degrees of freedom of the molecule.  The first six principal components will thus, most likely,
+be uninteresting.  Consequently, to remedy this problem PLUMED provides the functionality to perform an RMSD alignment of all the structures 
+to be analysed to the first frame in the trajectory.  This can be used to effectively remove translational and/or rotational motions from 
+consideration.  The resulting principal components thus describe vibrational motions of the molecule. 
+
+If you wish to calculate the projection of a trajectory on a set of principal components calculated from this PCA action then the output can be 
+used as input for the \ref PCAVARS action.
+
+\par Examples
+
+The following input instructs PLUMED to perform a principal component analysis in which the covariance matrix is calculated from changes in the positions
+of the first 22 atoms.  The METRIC=OPTIMAL instruction ensures that translational and rotational degrees of freedom are removed from consideration.
+The first two principal components will be output to a file called pca-comp.pdb.  Trajectory frames will be collected on every step and the PCA calculation
+will be performed at the end of the simulation.
+
+\verbatim
+PCA METRIC=OPTIMAL ATOMS=1-22 STRIDE=1 USE_ALL_DATA NLOW_DIM=2 OFILE=pca-comp.pdb
+\endverbatim
+
+The following input instructs PLUMED to perform a principal component analysis in which the covariance matrix is calculated from changes in the six distances
+defined in the previous lines.  Notice that here the METRIC=EUCLIDEAN keyword is used to indicate that no alignment has to be done when calculating the various
+elements of the covariance matrix from the input vectors.  In this calculation the first two principal components will be output to a file called pca-comp.pdb.
+Trajectory frames will be collected every five steps and the PCA calculation is performed every 1000 steps.  Consequently, if you run a 2000 step simulation the 
+PCA analysis will be performed twice.  The REWEIGHT_BIAS keyword in this input tells PLUMED that, rather than ascribing a weight of one to each of the frames
+when calculating averages and covariances, a reweighting should be performed and each frame's weight in these calculations should be determined from 
+the current value of the instantaneous bias (see \ref reweighting).  
+
+\verbatim
+d1: DISTANCE ATOMS=1,2
+d2: DISTANCE ATOMS=1,3
+d3: DISTANCE ATOMS=1,4
+d4: DISTANCE ATOMS=2,3
+d5: DISTANCE ATOMS=2,4
+d6: DISTANCE ATOMS=3,4
+
+PCA ARG=d1,d2,d3,d4,d5,d6 METRIC=EUCLIDEAN STRIDE=5 RUN=1000 NLOW_DIM=2 REWEIGHT_BIAS OFILE=pca-comp.pdb
+\endverbatim
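+
+If you then wish to project a trajectory on the components produced by the first of the inputs above, the output pdb file can be used
+as the reference for \ref PCAVARS.  The following is only a minimal sketch of that second step (the label and the name of the printed
+file are illustrative):
+
+\verbatim
+pca: PCAVARS REFERENCE=pca-comp.pdb TYPE=OPTIMAL
+PRINT ARG=pca.* FILE=proj
+\endverbatim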
+
+*/
+//+ENDPLUMEDOC
+
+namespace PLMD {
+namespace analysis {
+
+class PCA : public Analysis {
+private:
+  unsigned ndim;
+/// The position of the reference configuration (the one we align to)
+  ReferenceConfiguration* myref;
+/// The eigenvectors for the atomic displacements
+  Matrix<Vector> atom_eigv;
+/// The eigenvectors for the displacements in argument space
+  Matrix<double> arg_eigv;
+  std::string ofilename;
+public:
+  static void registerKeywords( Keywords& keys );
+  explicit PCA(const ActionOptions&ao);
+  ~PCA();
+  void performAnalysis();
+  void performTask( const unsigned& , const unsigned& , MultiValue& ) const { plumed_error(); }
+};
+
+PLUMED_REGISTER_ACTION(PCA,"PCA")
+
+void PCA::registerKeywords( Keywords& keys ){
+  Analysis::registerKeywords( keys );
+  keys.add("compulsory","NLOW_DIM","number of PCA coordinates required");
+  keys.add("compulsory","OFILE","the file on which to output the eigenvectors");
+}
+
+PCA::PCA(const ActionOptions&ao):
+PLUMED_ANALYSIS_INIT(ao)
+{
+  // Setup reference configuration
+  log.printf("  performing PCA analysis using %s metric \n", getMetricName().c_str() );
+  myref = metricRegister().create<ReferenceConfiguration>( getMetricName() );
+  std::vector<std::string> argnames( getNumberOfArguments() );
+  for(unsigned i=0;i<argnames.size();++i){
+     if( getArguments()[i]->isPeriodic() ) error("cannot run PCA with periodic variables");
+     argnames[i] = getArguments()[i]->getName();
+  }
+  myref->setNamesAndAtomNumbers( getAbsoluteIndexes(), argnames );
+
+  parse("NLOW_DIM",ndim);
+  if( getNumberOfAtoms()>0 ) atom_eigv.resize( ndim, getNumberOfAtoms() );
+  if( getNumberOfArguments()>0 ) arg_eigv.resize( ndim, getNumberOfArguments() );
+
+  // Read stuff for output file
+  parseOutputFile("OFILE",ofilename); 
+  checkRead();
+}
+
+PCA::~PCA(){
+  delete myref;
+}
+
+void PCA::performAnalysis(){
+  // Align everything to the first frame
+  MultiValue myval( 1, getNumberOfArguments() + 3*getNumberOfAtoms() + 9 );
+  ReferenceValuePack mypack( getNumberOfArguments(), getNumberOfAtoms(), myval );
+  for(unsigned i=0;i<getNumberOfAtoms();++i) mypack.setAtomIndex( i, i );
+  // Setup some PCA storage 
+  data[0]->setupPCAStorage ( mypack );
+
+  // Create some arrays to store the average position
+  std::vector<double> sarg( getNumberOfArguments(), 0 );
+  std::vector<Vector> spos( getNumberOfAtoms() );
+  for(unsigned i=0;i<getNumberOfAtoms();++i) spos[i].zero();  
+ 
+  // Calculate the average displacement from the first frame 
+  for(unsigned i=1;i<getNumberOfDataPoints();++i){
+      double d = data[0]->calc( data[i]->getReferencePositions(), getPbc(), getArguments(), data[i]->getReferenceArguments(), mypack, true );
+      // Accumulate average displacement of arguments (PBC can cause problems here - this really needs a Berry-phase treatment) GAT
+      for(unsigned j=0;j<getNumberOfArguments();++j) sarg[j] += 0.5*getWeight(i)*mypack.getArgumentDerivative(j); 
+      // Accumulate average displacement of position
+      for(unsigned j=0;j<getNumberOfAtoms();++j) spos[j] += getWeight(i)*mypack.getAtomsDisplacementVector()[j];
+  }
+  // Now normalise the displacements to get the average and add these to the first frame
+  double inorm = 1.0 / getNormalization(); 
+  for(unsigned j=0;j<getNumberOfArguments();++j) sarg[j] = inorm*sarg[j] + data[0]->getReferenceArguments()[j];
+  for(unsigned j=0;j<getNumberOfAtoms();++j) spos[j] = inorm*spos[j] + data[0]->getReferencePositions()[j]; 
+  // And set the reference configuration
+  std::vector<double> empty( getNumberOfArguments(), 1.0 ); myref->setReferenceConfig( spos, sarg, empty ); 
+
+  // Now accumulate the covariance
+  unsigned narg=getNumberOfArguments();
+  Matrix<double> covar( getNumberOfArguments()+3*getNumberOfAtoms(), getNumberOfArguments()+3*getNumberOfAtoms() ); covar=0;
+  for(unsigned i=0;i<getNumberOfDataPoints();++i){
+      // double d = data[i]->calc( spos, getPbc(), getArguments(), sarg, mypack, true );
+      double d = data[0]->calc( data[i]->getReferencePositions(), getPbc(), getArguments(), data[i]->getReferenceArguments(), mypack, true );
+      for(unsigned jarg=0;jarg<getNumberOfArguments();++jarg){
+         // Need sorting for PBC with GAT 
+         double jarg_d = 0.5*mypack.getArgumentDerivative(jarg) + data[0]->getReferenceArguments()[jarg] - sarg[jarg];
+         for(unsigned karg=0;karg<getNumberOfArguments();++karg){
+            // Need sorting for PBC with GAT 
+            double karg_d = 0.5*mypack.getArgumentDerivative(karg) + data[0]->getReferenceArguments()[karg] - sarg[karg];
+            covar( jarg, karg ) += 0.25*getWeight(i)*jarg_d*karg_d; // mypack.getArgumentDerivative(jarg)*mypack.getArgumentDerivative(karg); 
+         }
+      }
+      for(unsigned jat=0;jat<getNumberOfAtoms();++jat){ 
+        for(unsigned jc=0;jc<3;++jc){
+             double jdisplace = mypack.getAtomsDisplacementVector()[jat][jc] + data[0]->getReferencePositions()[jat][jc] - spos[jat][jc];
+             for(unsigned kat=0;kat<getNumberOfAtoms();++kat){ 
+                 for(unsigned kc=0;kc<3;++kc){
+                    double kdisplace = mypack.getAtomsDisplacementVector()[kat][kc] + data[0]->getReferencePositions()[kat][kc] - spos[kat][kc];
+                    covar( narg+3*jat + jc, narg+3*kat + kc ) += getWeight(i)*jdisplace*kdisplace; 
+                 }
+             }
+         }
+      }
+  }
+  // Normalise
+  for(unsigned i=0;i<covar.nrows();++i){
+      for(unsigned j=0;j<covar.ncols();++j) covar(i,j) *= inorm; 
+  }
+
+  // Diagonalise the covariance
+  std::vector<double> eigval( getNumberOfArguments()+3*getNumberOfAtoms() );
+  Matrix<double> eigvec( getNumberOfArguments()+3*getNumberOfAtoms(), getNumberOfArguments()+3*getNumberOfAtoms() );
+  diagMat( covar, eigval, eigvec );
+
+  // Open an output file
+  OFile ofile; ofile.link(*this); ofile.setBackupString("analysis");
+  ofile.open( ofilename ); 
+  // Output the reference configuration
+  myref->print( ofile, getOutputFormat(), atoms.getUnits().getLength()/0.1 );   
+
+  // Store and print the eigenvectors
+  std::vector<Vector> tmp_atoms( getNumberOfAtoms() );
+  std::vector<double> tmp_args( getNumberOfArguments() );
+  Direction* tref = metricRegister().create<Direction>( "DIRECTION" );
+  tref->setNamesAndAtomNumbers( getAbsoluteIndexes(), argument_names );
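+  // diagMat is assumed to return eigenvalues in ascending order, so the index idim below
+  // runs from the end of the spectrum to select the largest-variance directions first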
+  for(unsigned dim=0;dim<ndim;++dim){
+     unsigned idim = covar.ncols() - 1 - dim;
+     for(unsigned i=0;i<getNumberOfArguments();++i) tmp_args[i]=arg_eigv(dim,i)=eigvec(idim,i);
+     for(unsigned i=0;i<getNumberOfAtoms();++i){
+         for(unsigned k=0;k<3;++k) tmp_atoms[i][k]=atom_eigv(dim,i)[k]=eigvec(idim,narg+3*i+k);
+     }  
+     tref->setDirection( tmp_atoms, tmp_args );
+     tref->print( ofile, getOutputFormat(), atoms.getUnits().getLength()/0.1 ); 
+  } 
+  // Close the output file   
+  delete tref; ofile.close();
+}
+
+}
+}
diff --git a/src/bias/External.cpp b/src/bias/External.cpp
index 0a8193466426b2588940e6d34a6e52cdde3b73a2..b01c47fc642d7aeadcdfdd6f07dd19fa1c5e6a28 100644
--- a/src/bias/External.cpp
+++ b/src/bias/External.cpp
@@ -41,7 +41,7 @@ The following is an input for a calculation with an external potential that is
 defined in the file bias.dat and that acts on the distance between atoms 3 and 5.
 \verbatim
 DISTANCE ATOMS=3,5 LABEL=d1
-EXTERNAL ARG=d1 FILENAME=bias.dat LABEL=external 
+EXTERNAL ARG=d1 FILE=bias.dat LABEL=external 
 \endverbatim
 (See also \ref DISTANCE \ref PRINT).
 
@@ -64,7 +64,7 @@ potential acting on two torsional angles:
 \verbatim
 TORSION ATOMS=4,5,6,7 LABEL=t1
 TORSION ATOMS=6,7,8,9 LABEL=t2
-EXTERNAL ARG=t1,t2 FILENAME=bias.dat LABEL=ext
+EXTERNAL ARG=t1,t2 FILE=bias.dat LABEL=ext
 \endverbatim
 
 The header in the file bias.dat for this calculation would read:
diff --git a/src/bias/MetaD.cpp b/src/bias/MetaD.cpp
index d6c2c1a6c52608e82b804b60a0aa278e1279e6e4..972f62b143fc8ad6b631fffe8faf82b2b1cc26c8 100644
--- a/src/bias/MetaD.cpp
+++ b/src/bias/MetaD.cpp
@@ -85,6 +85,10 @@ In case you do not provide any information about bin size (neither GRID_BIN nor
 and if Gaussian width is fixed PLUMED will use 1/5 of the Gaussian width as grid spacing.
 This default choice should be reasonable for most applications.
 
+Metadynamics can be restarted either from a HILLS file or from a GRID. In the latter case
+one can first save a GRID using GRID_WFILE (and GRID_WSTRIDE) and at a later stage read
+it in using GRID_RFILE.
+
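+As a minimal sketch of this two-stage workflow (the collective variable and all numerical values below
+are purely illustrative), a first run could periodically write the grid:
+
+\verbatim
+DISTANCE ATOMS=3,5 LABEL=d1
+METAD ARG=d1 SIGMA=0.1 HEIGHT=0.3 PACE=500 GRID_MIN=0 GRID_MAX=2 GRID_BIN=200 GRID_WFILE=GRID GRID_WSTRIDE=10000 LABEL=m1
+\endverbatim
+
+and a later run could then read the stored bias back at its first step:
+
+\verbatim
+DISTANCE ATOMS=3,5 LABEL=d1
+METAD ARG=d1 SIGMA=0.1 HEIGHT=0.3 PACE=500 GRID_MIN=0 GRID_MAX=2 GRID_BIN=200 GRID_RFILE=GRID LABEL=m1
+\endverbatim
+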
 Another option that is available in plumed is well-tempered metadynamics \cite Barducci:2008. In this
+variant of metadynamics the heights of the Gaussian hills are rescaled at each step so the bias is now
 given by:
@@ -256,6 +260,51 @@ factor that can then be used to determine the rate. This method can be used toge
 with \ref COMMITTOR analysis to stop the simulation when the system get to the target basin.
 It must be used together with Well-Tempered Metadynamics.
 
+\par
+You can also provide a target distribution using the keyword TARGET
+\cite white2015designing
+\cite marinelli2015ensemble
+\cite gil2016empirical
+The TARGET should be a grid containing a free-energy (i.e. the -kbT*log of the desired target distribution).
+Gaussians will then be scaled by a factor
+\f[
+e^{\beta(\tilde{F}(s)-\tilde{F}_{max})}
+\f]
+Here \f$\tilde{F}(s)\f$ is the free energy defined on the grid and \f$\tilde{F}_{max}\f$ its maximum value.
+Notice that here we use the maximum value, as in ref \cite gil2016empirical.
+This choice avoids the addition of exceedingly large Gaussians. However,
+it could also make the Gaussians too small. You should therefore choose the HEIGHT parameter
+carefully in this case.
+The grid file should be similar to other PLUMED grid files in that it should contain
+both the target free-energy and its derivatives.
+
+Notice that if you wish your simulation to converge to the target free energy you should use
+the DAMPFACTOR command to provide a global tempering \cite dama2014well.
+Alternatively, if you use a BIASFACTOR your simulation will converge to a free
+energy that is a linear combination of the target free energy and of the intrinsic free energy
+determined by the original force field.
+
+\verbatim
+DISTANCE ATOMS=3,5 LABEL=d1
+METAD ...
+ LABEL=t1
+ ARG=d1 SIGMA=0.05 TAU=200 DAMPFACTOR=100 PACE=250
+ GRID_MIN=0 GRID_MAX=2 GRID_BIN=200
+ TARGET=dist.dat
+... METAD
+
+PRINT ARG=d1,t1.bias STRIDE=100 FILE=COLVAR
+\endverbatim
+
+The header in the file dist.dat for this calculation would read:
+
+\verbatim
+#! FIELDS d1 t1.target der_d1
+#! SET min_d1 0
+#! SET max_d1 2
+#! SET nbins_d1  200
+#! SET periodic_d1 false
+\endverbatim
 
 */
 //+ENDPLUMEDOC
@@ -272,7 +321,7 @@ private:
    Gaussian(const vector<double> & center,const vector<double> & sigma,double height, bool multivariate ):
      center(center),sigma(sigma),height(height),multivariate(multivariate),invsigma(sigma){
        // to avoid troubles from zero element in flexible hills
-       for(unsigned i=0;i<invsigma.size();++i)abs(invsigma[i])>1.e-20?invsigma[i]=1.0/invsigma[i]:0.; 
+       for(unsigned i=0;i<invsigma.size();++i) abs(invsigma[i])>1.e-20?invsigma[i]=1.0/invsigma[i]:0.; 
      }
   };
   vector<double> sigma0_;
@@ -282,13 +331,14 @@ private:
   OFile hillsOfile_;
   OFile gridfile_;
   Grid* BiasGrid_;
-  Grid* ExtGrid_;
   bool storeOldGrids_;
-  std::string gridfilename_,gridreadfilename_;
   int wgridstride_; 
-  bool grid_,hasextgrid_;
+  bool grid_;
   double height0_;
   double biasf_;
+  double dampfactor_;
+  std::string targetfilename_;
+  Grid* TargetGrid_;
   double kbt_;
   int stride_;
   bool welltemp_;
@@ -309,12 +359,10 @@ private:
   bool doInt_;
   bool isFirstStep;
   double reweight_factor;
-  std::vector<unsigned> rewf_grid_; 
-  unsigned int rewf_ustride_;
-/// accumulator for work
+  vector<unsigned> rewf_grid_; 
+  unsigned rewf_ustride_;
   double work_;
   long int last_step_warn_grid;
-  
  
   void   readGaussians(IFile*);
   bool   readChunkOfGaussians(IFile *ifile, unsigned n);
@@ -326,9 +374,9 @@ private:
   void   finiteDifferenceGaussian(const vector<double>&, const Gaussian&);
   double getGaussianNormalization( const Gaussian& );
   vector<unsigned> getGaussianSupport(const Gaussian&);
-  bool   scanOneHill(IFile *ifile,  vector<Value> &v, vector<double> &center, vector<double>  &sigma, double &height, bool &multivariate  );
+  bool   scanOneHill(IFile *ifile,  vector<Value> &v, vector<double> &center, vector<double>  &sigma, double &height, bool &multivariate);
   void   computeReweightingFactor();
-  std::string fmt;
+  string fmt;
 
 public:
   explicit MetaD(const ActionOptions&);
@@ -345,7 +393,8 @@ void MetaD::registerKeywords(Keywords& keys){
   Bias::registerKeywords(keys);
   componentsAreNotOptional(keys);
   keys.addOutputComponent("bias","default","the instantaneous value of the bias potential");
-  keys.addOutputComponent("rbias","REWEIGHTING_NGRID","the instantaneous value of the bias normalized using the \\f$c(t)\\f$ reweighting factor [rbias=bias-c(t)]. This component can be used to obtain a reweighted histogram.");
+  keys.addOutputComponent("rbias","REWEIGHTING_NGRID","the instantaneous value of the bias normalized using the \\f$c(t)\\f$ reweighting factor [rbias=bias-c(t)]."
+                                                      "This component can be used to obtain a reweighted histogram.");
   keys.addOutputComponent("rct","REWEIGHTING_NGRID","the reweighting factor \\f$c(t)\\f$.");
   keys.addOutputComponent("work","default","accumulator for work");
   keys.addOutputComponent("acc","ACCELERATION","the metadynamics acceleration factor");
@@ -353,21 +402,28 @@ void MetaD::registerKeywords(Keywords& keys){
   keys.add("compulsory","SIGMA","the widths of the Gaussian hills");
   keys.add("compulsory","PACE","the frequency for hill addition");
   keys.add("compulsory","FILE","HILLS","a file in which the list of added hills is stored");
-  keys.add("optional","HEIGHT","the heights of the Gaussian hills. Compulsory unless TAU, TEMP and BIASFACTOR are given");
+  keys.add("optional","HEIGHT","the heights of the Gaussian hills. Compulsory unless TAU and either BIASFACTOR or DAMPFACTOR are given");
   keys.add("optional","FMT","specify format for HILLS files (useful for decrease the number of digits in regtests)");
   keys.add("optional","BIASFACTOR","use well tempered metadynamics and use this biasfactor.  Please note you must also specify temp");
+  keys.add("optional","DAMPFACTOR","damp hills with exp(-max(V)/(kbT*DAMPFACTOR)");
+  keys.add("optional","TARGET","target to a predefined distribution");
   keys.add("optional","TEMP","the system temperature - this is only needed if you are doing well-tempered metadynamics");
   keys.add("optional","TAU","in well tempered metadynamics, sets height to (kb*DeltaT*pace*timestep)/tau");
   keys.add("optional","GRID_MIN","the lower bounds for the grid");
   keys.add("optional","GRID_MAX","the upper bounds for the grid");
   keys.add("optional","GRID_BIN","the number of bins for the grid");
   keys.add("optional","GRID_SPACING","the approximate grid spacing (to be used as an alternative or together with GRID_BIN)");
-  keys.add("optional","REWEIGHTING_NGRID","calculate the c(t) reweighting factor and use that to obtain the normalized bias [rbias=bias-c(t)]. Here you should specify the number of grid points required in each dimension. The number of grid points should be equal or larger to the number of grid points given in GRID_BIN." "This method is not compatible with metadynamics not on a grid.");
-  keys.add("optional","REWEIGHTING_NHILLS","how many Gaussian hills should be deposited between calculating the c(t) reweighting factor. The default is to do this every 50 hills.");
+  keys.add("optional","REWEIGHTING_NGRID","calculate the c(t) reweighting factor and use that to obtain the normalized bias [rbias=bias-c(t)]."
+                                          "Here you should specify the number of grid points required in each dimension."
+                                          "The number of grid points should be equal or larger to the number of grid points given in GRID_BIN." 
+                                          "This method is not compatible with metadynamics not on a grid.");
+  keys.add("optional","REWEIGHTING_NHILLS","how many Gaussian hills should be deposited between calculating the c(t) reweighting factor."
+                                           "The default is to do this every 50 hills.");
   keys.addFlag("GRID_SPARSE",false,"use a sparse grid to store hills");
   keys.addFlag("GRID_NOSPLINE",false,"don't use spline interpolation with grids");
   keys.add("optional","GRID_WSTRIDE","write the grid to a file every N steps");
   keys.add("optional","GRID_WFILE","the file on which to write the grid");
+  keys.add("optional","GRID_RFILE","a grid file from which the bias should be read at the initial step of the simulation");
   keys.addFlag("STORE_GRIDS",false,"store all the grid files the calculation generates. They will be deleted if this keyword is not present");
   keys.add("optional","ADAPTIVE","use a geometric (=GEOM) or diffusion (=DIFF) based hills width scheme. Sigma is one number that has distance units or timestep dimensions");
   keys.add("optional","WALKERS_ID", "walker id");
@@ -375,7 +431,6 @@ void MetaD::registerKeywords(Keywords& keys){
   keys.add("optional","WALKERS_DIR", "shared directory with the hills files from all the walkers");
   keys.add("optional","WALKERS_RSTRIDE","stride for reading hills files");
   keys.add("optional","INTERVAL","monodimensional lower and upper limits, outside the limits the system will not feel the biasing force.");
-  keys.add("optional","GRID_RFILE","a grid file from which the bias should be read at the initial step of the simulation");
   keys.add("optional","SIGMA_MAX","the upper bounds for the sigmas (in CV units) when using adaptive hills. Negative number means no bounds ");
   keys.add("optional","SIGMA_MIN","the lower bounds for the sigmas (in CV units) when using adaptive hills. Negative number means no bounds ");
   keys.addFlag("WALKERS_MPI",false,"Switch on MPI version of multiple walkers - not compatible with other WALKERS_* options");
@@ -388,6 +443,7 @@ void MetaD::registerKeywords(Keywords& keys){
 MetaD::~MetaD(){
   if(flexbin) delete flexbin;
   if(BiasGrid_) delete BiasGrid_;
+  if(TargetGrid_) delete TargetGrid_;
   hillsOfile_.close();
   if(wgridstride_>0) gridfile_.close();
   delete [] dp_;
@@ -401,9 +457,10 @@ MetaD::~MetaD(){
 MetaD::MetaD(const ActionOptions& ao):
 PLUMED_BIAS_INIT(ao),
 // Grid stuff initialization
-BiasGrid_(NULL),ExtGrid_(NULL), wgridstride_(0), grid_(false), hasextgrid_(false),
+BiasGrid_(NULL), wgridstride_(0), grid_(false),
 // Metadynamics basic parameters
-height0_(std::numeric_limits<double>::max()), biasf_(1.0), kbt_(0.0),
+height0_(std::numeric_limits<double>::max()), biasf_(1.0), dampfactor_(0.0), TargetGrid_(NULL),
+kbt_(0.0),
 stride_(0), welltemp_(false),
 // Other stuff
 dp_(NULL), adaptive_(FlexibleBin::none),
@@ -424,55 +481,54 @@ last_step_warn_grid(0)
   string adaptiveoption;
   adaptiveoption="NONE";
   parse("ADAPTIVE",adaptiveoption);
-  if (adaptiveoption=="GEOM"){
-		  log.printf("  Uses Geometry-based hills width: sigma must be in distance units and only one sigma is needed\n");
-		  adaptive_=FlexibleBin::geometry;	
-  }else if (adaptiveoption=="DIFF"){
-		  log.printf("  Uses Diffusion-based hills width: sigma must be in timesteps and only one sigma is needed\n");
-		  adaptive_=FlexibleBin::diffusion;	
-  }else if (adaptiveoption=="NONE"){
-		  adaptive_=FlexibleBin::none;	
-  }else{
-		  error("I do not know this type of adaptive scheme");	
+  if(adaptiveoption=="GEOM"){
+    log.printf("  Uses Geometry-based hills width: sigma must be in distance units and only one sigma is needed\n");
+    adaptive_=FlexibleBin::geometry;	
+  } else if(adaptiveoption=="DIFF"){
+    log.printf("  Uses Diffusion-based hills width: sigma must be in timesteps and only one sigma is needed\n");
+    adaptive_=FlexibleBin::diffusion;	
+  } else if(adaptiveoption=="NONE"){
+    adaptive_=FlexibleBin::none;	
+  } else {
+    error("I do not know this type of adaptive scheme");	
   }
   // parse the sigma
   parseVector("SIGMA",sigma0_);
 
   parse("FMT",fmt);
 
-  if (adaptive_==FlexibleBin::none){
-         // if you use normal sigma you need one sigma per argument 
-         if( sigma0_.size()!=getNumberOfArguments() ) error("number of arguments does not match number of SIGMA parameters");
-  }else{
-         // if you use flexible hills you need one sigma  
-         if(sigma0_.size()!=1){
-        	 error("If you choose ADAPTIVE you need only one sigma according to your choice of type (GEOM/DIFF)");
-         } 
-	 // if adaptive then the number must be an integer
- 	 if(adaptive_==FlexibleBin::diffusion){
-		if(int(sigma0_[0])-sigma0_[0]>1.e-9 || int(sigma0_[0])-sigma0_[0] <-1.e-9 || int(sigma0_[0])<1 ){
-		 	error("In case of adaptive hills with diffusion, the sigma must be an integer which is the number of timesteps\n");	
-		} 
-	 } 
-	 // here evtl parse the sigma min and max values
-	 
-	 parseVector("SIGMA_MIN",sigma0min_);
-	 if(sigma0min_.size()>0 && sigma0min_.size()<getNumberOfArguments()) {
-           error("the number of SIGMA_MIN values be at least the number of the arguments");
-	 } else if(sigma0min_.size()==0) { 
-           sigma0min_.resize(getNumberOfArguments());
-           for(unsigned i=0;i<getNumberOfArguments();i++){sigma0min_[i]=-1.;}	
-         } 
-
-	 parseVector("SIGMA_MAX",sigma0max_);
-	 if(sigma0max_.size()>0 && sigma0max_.size()<getNumberOfArguments()) {
-           error("the number of SIGMA_MAX values be at least the number of the arguments"); 
-         } else if(sigma0max_.size()==0) { 
-           sigma0max_.resize(getNumberOfArguments());
-           for(unsigned i=0;i<getNumberOfArguments();i++){sigma0max_[i]=-1.;}	
-         } 
-
-         flexbin=new FlexibleBin(adaptive_,this,sigma0_[0],sigma0min_,sigma0max_);
+  if(adaptive_==FlexibleBin::none){
+    // if you use normal sigma you need one sigma per argument 
+    if( sigma0_.size()!=getNumberOfArguments() ) error("number of arguments does not match number of SIGMA parameters");
+  } else {
+    // if you use flexible hills you need one sigma  
+    if(sigma0_.size()!=1){
+      error("If you choose ADAPTIVE you need only one sigma according to your choice of type (GEOM/DIFF)");
+    } 
+    // if adaptive then the number must be an integer
+    if(adaptive_==FlexibleBin::diffusion){
+      if(int(sigma0_[0])-sigma0_[0]>1.e-9 || int(sigma0_[0])-sigma0_[0] <-1.e-9 || int(sigma0_[0])<1 ){
+       	error("In case of adaptive hills with diffusion, the sigma must be an integer which is the number of timesteps\n");	
+      } 
+    } 
+    // here evtl parse the sigma min and max values
+    parseVector("SIGMA_MIN",sigma0min_);
+    if(sigma0min_.size()>0 && sigma0min_.size()<getNumberOfArguments()) {
+      error("the number of SIGMA_MIN values be at least the number of the arguments");
+    } else if(sigma0min_.size()==0) { 
+      sigma0min_.resize(getNumberOfArguments());
+      for(unsigned i=0;i<getNumberOfArguments();i++){sigma0min_[i]=-1.;}	
+    }
+ 
+    parseVector("SIGMA_MAX",sigma0max_);
+    if(sigma0max_.size()>0 && sigma0max_.size()<getNumberOfArguments()) {
+      error("the number of SIGMA_MAX values be at least the number of the arguments"); 
+    } else if(sigma0max_.size()==0) { 
+      sigma0max_.resize(getNumberOfArguments());
+      for(unsigned i=0;i<getNumberOfArguments();i++){sigma0max_[i]=-1.;}	
+    }
+ 
+    flexbin=new FlexibleBin(adaptive_,this,sigma0_[0],sigma0min_,sigma0max_);
   }
   // note: HEIGHT is not compulsory, since one could use the TAU keyword, see below
   parse("HEIGHT",height0_);
@@ -482,6 +538,7 @@ last_step_warn_grid(0)
   parse("FILE",hillsfname);
   parse("BIASFACTOR",biasf_);
   if( biasf_<1.0 ) error("well tempered bias factor is nonsensical");
+  parse("DAMPFACTOR",dampfactor_);
   double temp=0.0;
   parse("TEMP",temp);
   if(temp>0.0) kbt_=plumed.getAtoms().getKBoltzmann()*temp;
@@ -490,16 +547,23 @@ last_step_warn_grid(0)
     if(kbt_==0.0) error("Unless the MD engine passes the temperature to plumed, with well-tempered metad you must specify it using TEMP");
     welltemp_=true;
   }
+  if(dampfactor_>0.0){
+    if(kbt_==0.0) error("Unless the MD engine passes the temperature to plumed, with damped metad you must specify it using TEMP");
+  }
+  parse("TARGET",targetfilename_);
+  if(targetfilename_.length()>0 && kbt_==0.0)  error("with TARGET temperature must be specified");
   double tau=0.0;
   parse("TAU",tau);
   if(tau==0.0){
     if(height0_==std::numeric_limits<double>::max()) error("At least one between HEIGHT and TAU should be specified");
     // if tau is not set, we compute it here from the other input parameters
     if(welltemp_) tau=(kbt_*(biasf_-1.0))/height0_*getTimeStep()*stride_;
+    else if(dampfactor_>0.0) tau=(kbt_*dampfactor_)/height0_*getTimeStep()*stride_;
   } else {
-    if(!welltemp_)error("TAU only makes sense in well-tempered metadynamics");
     if(height0_!=std::numeric_limits<double>::max()) error("At most one between HEIGHT and TAU should be specified");
-    height0_=(kbt_*(biasf_-1.0))/tau*getTimeStep()*stride_;
+    if(welltemp_) height0_=(kbt_*(biasf_-1.0))/tau*getTimeStep()*stride_;
+    else if(dampfactor_>0.0) height0_=(kbt_*dampfactor_)/tau*getTimeStep()*stride_;
+    else error("TAU only makes sense in well-tempered or damped metadynamics");
   }
   
   // Grid Stuff
@@ -556,6 +620,7 @@ last_step_warn_grid(0)
   bool spline=!nospline;
   if(gbin.size()>0){grid_=true;}
   parse("GRID_WSTRIDE",wgridstride_);
+  string gridfilename_;
   parse("GRID_WFILE",gridfilename_); 
   parseFlag("STORE_GRIDS",storeOldGrids_);
   if(grid_ && gridfilename_.length()>0){
@@ -565,20 +630,26 @@ last_step_warn_grid(0)
   if(grid_ && wgridstride_>0){
     if(gridfilename_.length()==0) error("grid filename not specified use GRID_WFILE"); 
   }
-
+  string gridreadfilename_;
   parse("GRID_RFILE",gridreadfilename_);
 
+  if(!grid_&&gridfilename_.length()> 0) error("To write a grid you need first to define it!");
+  if(!grid_&&gridreadfilename_.length()>0) error("To read a grid you need first to define it!");
+ 
   if(grid_){ 
-     parseVector("REWEIGHTING_NGRID",rewf_grid_); 
-     if( rewf_grid_.size()>0 && rewf_grid_.size()!=getNumberOfArguments() ){
-         error("size mismatch for REWEIGHTING_NGRID keyword");
-     } else if( rewf_grid_.size()==getNumberOfArguments() ){
-         for(unsigned j=0;j<getNumberOfArguments();++j){
-            if( !getPntrToArgument(j)->isPeriodic() ) rewf_grid_[j] += 1; 
-         }
-     }
-     if( adaptive_==FlexibleBin::diffusion || adaptive_==FlexibleBin::geometry) warning("reweighting has not been proven to work with adaptive Gaussians");
-     rewf_ustride_=50; parse("REWEIGHTING_NHILLS",rewf_ustride_);
+    parseVector("REWEIGHTING_NGRID",rewf_grid_); 
+    if(rewf_grid_.size()>0 && rewf_grid_.size()!=getNumberOfArguments()){
+      error("size mismatch for REWEIGHTING_NGRID keyword");
+    } else if(rewf_grid_.size()==getNumberOfArguments()){
+      for(unsigned j=0;j<getNumberOfArguments();++j){
+        if( !getPntrToArgument(j)->isPeriodic() ) rewf_grid_[j] += 1; 
+      }
+    }
+    if(adaptive_==FlexibleBin::diffusion || adaptive_==FlexibleBin::geometry) warning("reweighting has not been proven to work with adaptive Gaussians");
+    rewf_ustride_=50; parse("REWEIGHTING_NHILLS",rewf_ustride_);
+  }
+  if(dampfactor_>0.0){
+    if(!grid_) error("With DAMPFACTOR you should use grids");
   }
 
   // Multiple walkers
@@ -600,6 +671,7 @@ last_step_warn_grid(0)
     uppI_=tmpI.at(1);
     if(getNumberOfArguments()!=1) error("INTERVAL limits correction works only for monodimensional metadynamics!");
     if(uppI_<lowI_) error("The Upper limit must be greater than the Lower limit!");
+    if(getPntrToArgument(0)->isPeriodic()) error("INTERVAL cannot be used with periodic variables!");
     doInt_=true;
   }
 
@@ -622,38 +694,36 @@ last_step_warn_grid(0)
   }
   if(doInt_) log.printf("  Upper and Lower limits boundaries for the bias are activated at %f - %f\n", lowI_, uppI_);
   if(grid_){
-   log.printf("  Grid min");
-   for(unsigned i=0;i<gmin.size();++i) log.printf(" %s",gmin[i].c_str() );
-   log.printf("\n");
-   log.printf("  Grid max");
-   for(unsigned i=0;i<gmax.size();++i) log.printf(" %s",gmax[i].c_str() );
-   log.printf("\n");
-   log.printf("  Grid bin");
-   for(unsigned i=0;i<gbin.size();++i) log.printf(" %u",gbin[i]);
-   log.printf("\n");
-   if(spline){log.printf("  Grid uses spline interpolation\n");}
-   if(sparsegrid){log.printf("  Grid uses sparse grid\n");}
-   if(wgridstride_>0){log.printf("  Grid is written on file %s with stride %d\n",gridfilename_.c_str(),wgridstride_);} 
+    log.printf("  Grid min");
+    for(unsigned i=0;i<gmin.size();++i) log.printf(" %s",gmin[i].c_str() );
+    log.printf("\n");
+    log.printf("  Grid max");
+    for(unsigned i=0;i<gmax.size();++i) log.printf(" %s",gmax[i].c_str() );
+    log.printf("\n");
+    log.printf("  Grid bin");
+    for(unsigned i=0;i<gbin.size();++i) log.printf(" %u",gbin[i]);
+    log.printf("\n");
+    if(spline){log.printf("  Grid uses spline interpolation\n");}
+    if(sparsegrid){log.printf("  Grid uses sparse grid\n");}
+    if(wgridstride_>0){log.printf("  Grid is written on file %s with stride %d\n",gridfilename_.c_str(),wgridstride_);} 
   }
-  if(gridreadfilename_.length()>0){
-	   log.printf("  Reading an additional bias from grid in file %s \n",gridreadfilename_.c_str());
-  }
-
 
   if(mw_n_>1){
-   if(walkers_mpi) error("MPI version of multiple walkers is not compatible with filesystem version of multiple walkers");
-   log.printf("  %d multiple walkers active\n",mw_n_);
-   log.printf("  walker id %d\n",mw_id_);
-   log.printf("  reading stride %d\n",mw_rstride_);
-   log.printf("  directory with hills files %s\n",mw_dir_.c_str());
+    if(walkers_mpi) error("MPI version of multiple walkers is not compatible with filesystem version of multiple walkers");
+    log.printf("  %d multiple walkers active\n",mw_n_);
+    log.printf("  walker id %d\n",mw_id_);
+    log.printf("  reading stride %d\n",mw_rstride_);
+    log.printf("  directory with hills files %s\n",mw_dir_.c_str());
   } else {
-   if(walkers_mpi) log.printf("  Multiple walkers active using MPI communnication\n"); 
+    if(walkers_mpi) log.printf("  Multiple walkers active using MPI communication\n"); 
   }
 
   addComponent("bias"); componentIsNotPeriodic("bias");
   if( rewf_grid_.size()>0 ){ 
-   addComponent("rbias"); componentIsNotPeriodic("rbias");
-   addComponent("rct"); componentIsNotPeriodic("rct"); 
+    addComponent("rbias"); componentIsNotPeriodic("rbias");
+    addComponent("rct"); componentIsNotPeriodic("rct"); 
+    log.printf("  the c(t) reweighting factor will be calculated every %u hills\n",rewf_ustride_);
+    getPntrToComponent("rct")->set(reweight_factor);
   }
   addComponent("work"); componentIsNotPeriodic("work");
 
@@ -663,98 +733,122 @@ last_step_warn_grid(0)
     addComponent("acc"); componentIsNotPeriodic("acc");
   }
 
-// for performance
+  // for performance
   dp_ = new double[getNumberOfArguments()];
 
-// initializing and checking grid
-  if(grid_){
-   // check for adaptive and sigma_min
-   if(sigma0min_.size()==0&&adaptive_!=FlexibleBin::none) error("When using Adaptive Gaussians on a grid SIGMA_MIN must be specified");
-   // check for mesh and sigma size
-   for(unsigned i=0;i<getNumberOfArguments();i++) {
-     double a,b;
-     Tools::convert(gmin[i],a);
-     Tools::convert(gmax[i],b);
-     double mesh=(b-a)/((double)gbin[i]);
-     if(mesh>0.5*sigma0_[i]) log<<"  WARNING: Using a METAD with a Grid Spacing larger than half of the Gaussians width can produce artifacts\n";
-   }
-   std::string funcl=getLabel() + ".bias";
-   if(!sparsegrid){BiasGrid_=new Grid(funcl,getArguments(),gmin,gmax,gbin,spline,true);}
-   else{BiasGrid_=new SparseGrid(funcl,getArguments(),gmin,gmax,gbin,spline,true);}
-   std::vector<std::string> actualmin=BiasGrid_->getMin();
-   std::vector<std::string> actualmax=BiasGrid_->getMax();
-   for(unsigned i=0;i<getNumberOfArguments();i++){
-     if(gmin[i]!=actualmin[i]) log<<"  WARNING: GRID_MIN["<<i<<"] has been adjusted to "<<actualmin[i]<<" to fit periodicity\n";
-     if(gmax[i]!=actualmax[i]) log<<"  WARNING: GRID_MAX["<<i<<"] has been adjusted to "<<actualmax[i]<<" to fit periodicity\n";
-   }
-  }
-
-  if(wgridstride_>0){
-    gridfile_.link(*this);
-    gridfile_.open(gridfilename_);
-  }
-
-// initializing external grid
+  // restart from external grid
+  bool restartedFromGrid=false;
   if(gridreadfilename_.length()>0){
-   hasextgrid_=true;
-   // read the grid in input, find the keys
-   IFile gridfile; gridfile.open(gridreadfilename_);
-   std::string funcl=getLabel() + ".bias";
-   ExtGrid_=Grid::create(funcl,getArguments(),gridfile,false,false,true);
-   gridfile.close();
-   if(ExtGrid_->getDimension()!=getNumberOfArguments()) error("mismatch between dimensionality of input grid and number of arguments");
-   for(unsigned i=0;i<getNumberOfArguments();++i){
-     if( getPntrToArgument(i)->isPeriodic()!=ExtGrid_->getIsPeriodic()[i] ) error("periodicity mismatch between arguments and input bias");
-   }
+    // read the grid in input, find the keys
+    IFile gridfile; 
+    gridfile.link(*this);
+    if(gridfile.FileExist(gridreadfilename_)){
+      gridfile.open(gridreadfilename_);
+    } else {
+      error("The GRID file you want to read: " + gridreadfilename_ + ", cannot be found!");
+    }
+    std::string funcl=getLabel() + ".bias";
+    BiasGrid_=Grid::create(funcl, getArguments(), gridfile, gmin, gmax, gbin, sparsegrid, spline, true);
+    gridfile.close();
+    if(BiasGrid_->getDimension()!=getNumberOfArguments()) error("mismatch between dimensionality of input grid and number of arguments");
+    for(unsigned i=0;i<getNumberOfArguments();++i){
+      if( getPntrToArgument(i)->isPeriodic()!=BiasGrid_->getIsPeriodic()[i] ) error("periodicity mismatch between arguments and input bias");
+      double a, b;
+      Tools::convert(gmin[i],a);
+      Tools::convert(gmax[i],b);
+      double mesh=(b-a)/((double)gbin[i]);
+      if(mesh>0.5*sigma0_[i]) log<<"  WARNING: Using a METAD with a Grid Spacing larger than half of the Gaussians width can produce artifacts\n";
+    }
+    log.printf("  Restarting from %s:",gridreadfilename_.c_str());                  
+    if(getRestart()) restartedFromGrid=true;
   }
 
-// Tiwary-Parrinello reweighting factor
-  if(rewf_grid_.size()>0){
-   log.printf("  the c(t) reweighting factor will be calculated every %u hills\n",rewf_ustride_);
-   getPntrToComponent("rct")->set(reweight_factor);
+  // initializing and checking grid
+  if(grid_&&!(gridreadfilename_.length()>0)){
+    // check for adaptive and sigma_min
+    if(sigma0min_.size()==0&&adaptive_!=FlexibleBin::none) error("When using Adaptive Gaussians on a grid SIGMA_MIN must be specified");
+    // check for mesh and sigma size
+    for(unsigned i=0;i<getNumberOfArguments();i++) {
+      double a,b;
+      Tools::convert(gmin[i],a);
+      Tools::convert(gmax[i],b);
+      double mesh=(b-a)/((double)gbin[i]);
+      if(mesh>0.5*sigma0_[i]) log<<"  WARNING: Using a METAD with a Grid Spacing larger than half of the Gaussians width can produce artifacts\n";
+    }
+    std::string funcl=getLabel() + ".bias";
+    if(!sparsegrid){BiasGrid_=new Grid(funcl,getArguments(),gmin,gmax,gbin,spline,true);}
+    else{BiasGrid_=new SparseGrid(funcl,getArguments(),gmin,gmax,gbin,spline,true);}
+    std::vector<std::string> actualmin=BiasGrid_->getMin();
+    std::vector<std::string> actualmax=BiasGrid_->getMax();
+    for(unsigned i=0;i<getNumberOfArguments();i++){
+      if(gmin[i]!=actualmin[i]) log<<"  WARNING: GRID_MIN["<<i<<"] has been adjusted to "<<actualmin[i]<<" to fit periodicity\n";
+      if(gmax[i]!=actualmax[i]) log<<"  WARNING: GRID_MAX["<<i<<"] has been adjusted to "<<actualmax[i]<<" to fit periodicity\n";
+    }
   }
 
-// creating vector of ifile* for hills reading 
-// open all files at the beginning and read Gaussians if restarting
+  // creating vector of ifile* for hills reading 
+  // open all files at the beginning and read Gaussians if restarting
   for(int i=0;i<mw_n_;++i){
-   string fname;
-   if(mw_n_>1) {
-    stringstream out; out << i;
-    fname = mw_dir_+"/"+hillsfname+"."+out.str();
-   } else {
-    fname = hillsfname;
-   }
-   IFile *ifile = new IFile();
-   ifile->link(*this);
-   ifiles.push_back(ifile);                                                             
-   ifilesnames.push_back(fname);
-   if(ifile->FileExist(fname)){
-    ifile->open(fname);
-    if(getRestart()){
-     log.printf("  Restarting from %s:",ifilesnames[i].c_str());                  
-     readGaussians(ifiles[i]);                                                    
+    string fname;
+    if(mw_n_>1) {
+      stringstream out; out << i;
+      fname = mw_dir_+"/"+hillsfname+"."+out.str();
+    } else {
+      fname = hillsfname;
+    }
+    IFile *ifile = new IFile();
+    ifile->link(*this);
+    ifiles.push_back(ifile);                                                             
+    ifilesnames.push_back(fname);
+    if(ifile->FileExist(fname)){
+      ifile->open(fname);
+      if(getRestart()&&!restartedFromGrid){
+        log.printf("  Restarting from %s:",ifilesnames[i].c_str());                  
+        readGaussians(ifiles[i]);                                                    
+      }
+      ifiles[i]->reset(false);
+      // close only the walker own hills file for later writing
+      if(i==mw_id_) ifiles[i]->close();
     }
-    ifiles[i]->reset(false);
-    // close only the walker own hills file for later writing
-    if(i==mw_id_) ifiles[i]->close();
-   }
   }
 
   comm.Barrier();
-
-// this barrier is needed when using walkers_mpi
-// to be sure that all files have been read before
-// backing them up
-// it should not be used when walkers_mpi is false otherwise
-// it would introduce troubles when using replicas without METAD
-// (e.g. in bias exchange with a neutral replica)
-// see issue #168 on github
+  // this barrier is needed when using walkers_mpi
+  // to be sure that all files have been read before
+  // backing them up
+  // it should not be used when walkers_mpi is false otherwise
+  // it would introduce troubles when using replicas without METAD
+  // (e.g. in bias exchange with a neutral replica)
+  // see issue #168 on github
   if(comm.Get_rank()==0 && walkers_mpi) multi_sim_comm.Barrier();
+  if(targetfilename_.length()>0){
+    IFile gridfile; gridfile.open(targetfilename_);
+    std::string funcl=getLabel() + ".target";
+    TargetGrid_=Grid::create(funcl,getArguments(),gridfile,false,false,true);
+    gridfile.close();
+    if(TargetGrid_->getDimension()!=getNumberOfArguments()) error("mismatch between dimensionality of input grid and number of arguments");
+    for(unsigned i=0;i<getNumberOfArguments();++i){
+      if( getPntrToArgument(i)->isPeriodic()!=TargetGrid_->getIsPeriodic()[i] ) error("periodicity mismatch between arguments and input bias");
+    }
+  }
+
+  // Calculate the Tiwary-Parrinello reweighting factor if we are restarting from previous hills
+  if(getRestart() && rewf_grid_.size()>0 ) computeReweightingFactor();
 
-// Calculate the Tiwary-Parrinello reweighting factor if we are restarting from previous hills
-  if(plumed.getRestart() && rewf_grid_.size()>0 ){computeReweightingFactor();}
+  // open grid file for writing
+  if(wgridstride_>0){
+    gridfile_.link(*this);
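+    // with MPI multiple walkers only the first replica actually writes the grid; the others redirect to /dev/null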
+    if(walkers_mpi){
+      int r=0;
+      if(comm.Get_rank()==0) r=multi_sim_comm.Get_rank();
+      comm.Bcast(r,0);
+      if(r>0) gridfilename_="/dev/null";
+      gridfile_.enforceSuffix("");
+    }
+    gridfile_.open(gridfilename_);
+  }
 
-// open hills file for writing
+  // open hills file for writing
   hillsOfile_.link(*this);
   if(walkers_mpi){
     int r=0;
@@ -771,11 +865,10 @@ last_step_warn_grid(0)
     hillsOfile_.addConstantField("upper_int").printField("upper_int",uppI_);
   }
   hillsOfile_.setHeavyFlush();
-// output periodicities of variables
+  // output periodicities of variables
   for(unsigned i=0;i<getNumberOfArguments();++i) hillsOfile_.setupPrintValue( getPntrToArgument(i) );
 
   bool concurrent=false;
-
   const ActionSet&actionSet(plumed.getActionSet());
   for(ActionSet::const_iterator p=actionSet.begin();p!=actionSet.end();++p) if(dynamic_cast<MetaD*>(*p)){ concurrent=true; break; }
   if(concurrent) log<<"  You are using concurrent metadynamics\n";
@@ -795,55 +888,61 @@ last_step_warn_grid(0)
      "Pratyush and Parrinello, J. Phys. Chem. B, 119, 736 (2015)");
   if(concurrent) log<<plumed.cite(
      "Gil-Ley and Bussi, J. Chem. Theory Comput. 11, 1077 (2015)");
+  if(targetfilename_.length()>0){
+    log<<plumed.cite("White, Dama, and Voth, J. Chem. Theory Comput. 11, 2451 (2015)");
+    log<<plumed.cite("Marinelli and Faraldo-Gómez,  Biophys. J. 108, 2779 (2015)");
+    log<<plumed.cite("Gil-Ley, Bottaro, and Bussi, submitted (2016)");
+  }
   log<<"\n";
 
 }
 
 void MetaD::readGaussians(IFile *ifile)
 {
- unsigned ncv=getNumberOfArguments();
- vector<double> center(ncv);
- vector<double> sigma(ncv);
- double height;
- int nhills=0; 
- bool multivariate=false;
-
- std::vector<Value> tmpvalues;
- for(unsigned j=0;j<getNumberOfArguments();++j) tmpvalues.push_back( Value( this, getPntrToArgument(j)->getName(), false ) ); 
-
- while(scanOneHill(ifile,tmpvalues,center,sigma,height,multivariate)){;
-  nhills++;
-  if(welltemp_){height*=(biasf_-1.0)/biasf_;}
-  addGaussian(Gaussian(center,sigma,height,multivariate));
- }     
- log.printf("      %d Gaussians read\n",nhills);
+  unsigned ncv=getNumberOfArguments();
+  vector<double> center(ncv);
+  vector<double> sigma(ncv);
+  double height;
+  int nhills=0; 
+  bool multivariate=false;
+ 
+  std::vector<Value> tmpvalues;
+  for(unsigned j=0;j<getNumberOfArguments();++j) tmpvalues.push_back( Value( this, getPntrToArgument(j)->getName(), false ) ); 
+ 
+  while(scanOneHill(ifile,tmpvalues,center,sigma,height,multivariate)){;
+    nhills++;
+    if(welltemp_){height*=(biasf_-1.0)/biasf_;}
+    addGaussian(Gaussian(center,sigma,height,multivariate));
+  }     
+  log.printf("      %d Gaussians read\n",nhills);
 }
 
 bool MetaD::readChunkOfGaussians(IFile *ifile, unsigned n)
 {
- unsigned ncv=getNumberOfArguments();
- vector<double> center(ncv);
- vector<double> sigma(ncv);
- double height;
- unsigned nhills=0;
- bool multivariate=false;
- std::vector<Value> tmpvalues;
- for(unsigned j=0;j<getNumberOfArguments();++j) tmpvalues.push_back( Value( this, getPntrToArgument(j)->getName(), false ) ); 
-
- while(scanOneHill(ifile,tmpvalues,center,sigma,height,multivariate)){;
-  if(welltemp_){height*=(biasf_-1.0)/biasf_;}
-  addGaussian(Gaussian(center,sigma,height,multivariate));
-  if(nhills==n){
+  unsigned ncv=getNumberOfArguments();
+  vector<double> center(ncv);
+  vector<double> sigma(ncv);
+  double height;
+  unsigned nhills=0;
+  bool multivariate=false;
+  std::vector<Value> tmpvalues;
+  for(unsigned j=0;j<getNumberOfArguments();++j) tmpvalues.push_back( Value( this, getPntrToArgument(j)->getName(), false ) ); 
+ 
+  while(scanOneHill(ifile,tmpvalues,center,sigma,height,multivariate)){;
+    if(welltemp_) height*=(biasf_-1.0)/biasf_; 
+    addGaussian(Gaussian(center,sigma,height,multivariate));
+    if(nhills==n){
       log.printf("      %u Gaussians read\n",nhills);
       return true;
-  }
-  nhills++;
- }     
- log.printf("      %u Gaussians read\n",nhills);
- return false;
+    }
+    nhills++;
+  }     
+  log.printf("      %u Gaussians read\n",nhills);
+  return false;
 }
 
-void MetaD::writeGaussian(const Gaussian& hill, OFile&file){
+void MetaD::writeGaussian(const Gaussian& hill, OFile&file)
+{
   unsigned ncv=getNumberOfArguments();
   file.printField("time",getTimeStep()*getStep());
   for(unsigned i=0;i<ncv;++i){
@@ -851,190 +950,183 @@ void MetaD::writeGaussian(const Gaussian& hill, OFile&file){
   }
   if(hill.multivariate){
     hillsOfile_.printField("multivariate","true");
-         Matrix<double> mymatrix(ncv,ncv);
-         unsigned k=0;
-         for(unsigned i=0;i<ncv;i++){
-                for(unsigned j=i;j<ncv;j++){
-                        mymatrix(i,j)=mymatrix(j,i)=hill.sigma[k]; // recompose the full inverse matrix
-                        k++;
-                }
-         }
-         // invert it 
-         Matrix<double> invmatrix(ncv,ncv);
-         Invert(mymatrix,invmatrix);
-         // enforce symmetry
-         for(unsigned i=0;i<ncv;i++){
-                for(unsigned j=i;j<ncv;j++){
-                        invmatrix(i,j)=invmatrix(j,i);
-                }
-         }
-
-         // do cholesky so to have a "sigma like" number
-         Matrix<double> lower(ncv,ncv);
-         cholesky(invmatrix,lower); // now this , in band form , is similar to the sigmas
-         // loop in band form 
-         for (unsigned i=0;i<ncv;i++){
-              for (unsigned j=0;j<ncv-i;j++){
-                      file.printField("sigma_"+getPntrToArgument(j+i)->getName()+"_"+getPntrToArgument(j)->getName(),lower(j+i,j));
-              }
-         }
+    Matrix<double> mymatrix(ncv,ncv);
+    unsigned k=0;
+    for(unsigned i=0;i<ncv;i++){
+      for(unsigned j=i;j<ncv;j++){
+        // recompose the full inverse matrix
+        mymatrix(i,j)=mymatrix(j,i)=hill.sigma[k];
+        k++;
+      }
+    }
+    // invert it 
+    Matrix<double> invmatrix(ncv,ncv);
+    Invert(mymatrix,invmatrix);
+    // enforce symmetry
+    for(unsigned i=0;i<ncv;i++){
+      for(unsigned j=i;j<ncv;j++){
+        invmatrix(i,j)=invmatrix(j,i);
+      }
+    }
+
+    // do cholesky so to have a "sigma like" number
+    Matrix<double> lower(ncv,ncv);
+    cholesky(invmatrix,lower);
+    // loop in band form 
+    for(unsigned i=0;i<ncv;i++){
+      for(unsigned j=0;j<ncv-i;j++){
+        file.printField("sigma_"+getPntrToArgument(j+i)->getName()+"_"+getPntrToArgument(j)->getName(),lower(j+i,j));
+      }
+    }
   } else {
     hillsOfile_.printField("multivariate","false");
     for(unsigned i=0;i<ncv;++i)
       file.printField("sigma_"+getPntrToArgument(i)->getName(),hill.sigma[i]);
   }
   double height=hill.height;
-  if(welltemp_){height*=biasf_/(biasf_-1.0);}
+  if(welltemp_) height*=biasf_/(biasf_-1.0); 
   file.printField("height",height).printField("biasf",biasf_);
   if(mw_n_>1) file.printField("clock",int(std::time(0)));
   file.printField();
 }
 
-void MetaD::addGaussian(const Gaussian& hill){
-
- if(!grid_){hills_.push_back(hill);} 
- else{
-  unsigned ncv=getNumberOfArguments();
-  vector<unsigned> nneighb=getGaussianSupport(hill);
-  vector<Grid::index_t> neighbors=BiasGrid_->getNeighbors(hill.center,nneighb);
-  vector<double> der(ncv);
-  vector<double> xx(ncv);
-  if(comm.Get_size()==1){
-    for(unsigned i=0;i<neighbors.size();++i){
-     Grid::index_t ineigh=neighbors[i];
-     for(unsigned j=0;j<ncv;++j){der[j]=0.0;}
-     BiasGrid_->getPoint(ineigh,xx);
-     double bias=evaluateGaussian(xx,hill,&der[0]);
-     BiasGrid_->addValueAndDerivatives(ineigh,bias,der);
-    } 
-  } else {
-    unsigned stride=comm.Get_size();
-    unsigned rank=comm.Get_rank();
-    vector<double> allder(ncv*neighbors.size(),0.0);
-    vector<double> allbias(neighbors.size(),0.0);
-    for(unsigned i=rank;i<neighbors.size();i+=stride){
-     Grid::index_t ineigh=neighbors[i];
-     BiasGrid_->getPoint(ineigh,xx);
-     allbias[i]=evaluateGaussian(xx,hill,&allder[ncv*i]);
-    }
-    comm.Sum(allbias);
-    comm.Sum(allder);
-    for(unsigned i=0;i<neighbors.size();++i){
-     Grid::index_t ineigh=neighbors[i];
-     for(unsigned j=0;j<ncv;++j){der[j]=allder[ncv*i+j];}
-     BiasGrid_->addValueAndDerivatives(ineigh,allbias[i],der);
+void MetaD::addGaussian(const Gaussian& hill)
+{
+  if(!grid_) hills_.push_back(hill);  
+  else {
+    unsigned ncv=getNumberOfArguments();
+    vector<unsigned> nneighb=getGaussianSupport(hill);
+    vector<Grid::index_t> neighbors=BiasGrid_->getNeighbors(hill.center,nneighb);
+    vector<double> der(ncv);
+    vector<double> xx(ncv);
+    if(comm.Get_size()==1){
+      for(unsigned i=0;i<neighbors.size();++i){
+        Grid::index_t ineigh=neighbors[i];
+        for(unsigned j=0;j<ncv;++j) der[j]=0.0; 
+        BiasGrid_->getPoint(ineigh,xx);
+        double bias=evaluateGaussian(xx,hill,&der[0]);
+        BiasGrid_->addValueAndDerivatives(ineigh,bias,der);
+      } 
+    } else {
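+      // distribute the grid update over the ranks of the communicator and sum the contributions afterwards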
+      unsigned stride=comm.Get_size();
+      unsigned rank=comm.Get_rank();
+      vector<double> allder(ncv*neighbors.size(),0.0);
+      vector<double> allbias(neighbors.size(),0.0);
+      for(unsigned i=rank;i<neighbors.size();i+=stride){
+        Grid::index_t ineigh=neighbors[i];
+        BiasGrid_->getPoint(ineigh,xx);
+        allbias[i]=evaluateGaussian(xx,hill,&allder[ncv*i]);
+      }
+      comm.Sum(allbias);
+      comm.Sum(allder);
+      for(unsigned i=0;i<neighbors.size();++i){
+        Grid::index_t ineigh=neighbors[i];
+        for(unsigned j=0;j<ncv;++j){der[j]=allder[ncv*i+j];}
+        BiasGrid_->addValueAndDerivatives(ineigh,allbias[i],der);
+      }
     }
   }
- }
 }
 
 vector<unsigned> MetaD::getGaussianSupport(const Gaussian& hill)
 {
- vector<unsigned> nneigh;
- if(doInt_){
-   double cutoff=sqrt(2.0*DP2CUTOFF)*hill.sigma[0];
-   if(hill.center[0]+cutoff > uppI_ || hill.center[0]-cutoff < lowI_) { 
-     // in this case, we updated the entire grid to avoid problems
-     return BiasGrid_->getNbin();
-   } else {
-     nneigh.push_back( static_cast<unsigned>(ceil(cutoff/BiasGrid_->getDx()[0])) );
-     return nneigh;
-   }
- }
-
- // traditional or flexible hill? 
- if(hill.multivariate){
-	unsigned ncv=getNumberOfArguments();
-	unsigned k=0;
-	//log<<"------- GET GAUSSIAN SUPPORT --------\n"; 
-	Matrix<double> mymatrix(ncv,ncv);
-	for(unsigned i=0;i<ncv;i++){
-		for(unsigned j=i;j<ncv;j++){
-			mymatrix(i,j)=mymatrix(j,i)=hill.sigma[k]; // recompose the full inverse matrix
-			k++;
-		}
-	}
-        // Reinvert so to have the ellipses 
-	Matrix<double> myinv(ncv,ncv);
-	Invert(mymatrix,myinv);
-	Matrix<double> myautovec(ncv,ncv);
-	vector<double> myautoval(ncv); //should I take this or their square root? 
-	diagMat(myinv,myautoval,myautovec);
-	double maxautoval;maxautoval=0.;
-        unsigned ind_maxautoval;ind_maxautoval=ncv; 
-	for (unsigned i=0;i<ncv;i++){
-		if(myautoval[i]>maxautoval){maxautoval=myautoval[i];ind_maxautoval=i;}
-        }  
-	for (unsigned i=0;i<ncv;i++){
-		double cutoff=sqrt(2.0*DP2CUTOFF)*abs(sqrt(maxautoval)*myautovec(i,ind_maxautoval));
-		//log<<"AUTOVAL "<<myautoval[0]<<" COMP "<<abs(myautoval[0]*myautovec(i,0)) <<" CUTOFF "<<cutoff<<"\n";
-	  	nneigh.push_back( static_cast<unsigned>(ceil(cutoff/BiasGrid_->getDx()[i])) );
-        }
- }else{
-	 for(unsigned i=0;i<getNumberOfArguments();++i){
-	  double cutoff=sqrt(2.0*DP2CUTOFF)*hill.sigma[i];
-	  nneigh.push_back( static_cast<unsigned>(ceil(cutoff/BiasGrid_->getDx()[i])) );
- 	}
- }
-	//log<<"------- END GET GAUSSIAN SUPPORT --------\n"; 
- return nneigh;
+  vector<unsigned> nneigh;
+  if(doInt_){
+    double cutoff=sqrt(2.0*DP2CUTOFF)*hill.sigma[0];
+    if(hill.center[0]+cutoff > uppI_ || hill.center[0]-cutoff < lowI_) { 
+      // in this case, we update the entire grid to avoid problems
+      return BiasGrid_->getNbin();
+    } else {
+      nneigh.push_back( static_cast<unsigned>(ceil(cutoff/BiasGrid_->getDx()[0])) );
+      return nneigh;
+    }
+  }
+ 
+  // traditional or flexible hill? 
+  if(hill.multivariate){
+    unsigned ncv=getNumberOfArguments();
+    unsigned k=0;
+    Matrix<double> mymatrix(ncv,ncv);
+    for(unsigned i=0;i<ncv;i++){
+      for(unsigned j=i;j<ncv;j++){
+        // recompose the full inverse matrix
+        mymatrix(i,j)=mymatrix(j,i)=hill.sigma[k];
+        k++;
+      }
+    }
+    // Re-invert so as to recover the ellipses
+    Matrix<double> myinv(ncv,ncv);
+    Invert(mymatrix,myinv);
+    Matrix<double> myautovec(ncv,ncv);
+    vector<double> myautoval(ncv); //should I take this or their square root? 
+    diagMat(myinv,myautoval,myautovec);
+    double maxautoval=0.;
+    unsigned ind_maxautoval=ncv; 
+    for(unsigned i=0;i<ncv;i++){
+      if(myautoval[i]>maxautoval){maxautoval=myautoval[i];ind_maxautoval=i;}
+    }  
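+    // estimate the support from the widest principal axis of the hill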
+    for(unsigned i=0;i<ncv;i++){
+      const double cutoff=sqrt(2.0*DP2CUTOFF)*abs(sqrt(maxautoval)*myautovec(i,ind_maxautoval));
+      nneigh.push_back( static_cast<unsigned>(ceil(cutoff/BiasGrid_->getDx()[i])) );
+    }
+  } else {
+    for(unsigned i=0;i<getNumberOfArguments();++i){
+      const double cutoff=sqrt(2.0*DP2CUTOFF)*hill.sigma[i];
+      nneigh.push_back( static_cast<unsigned>(ceil(cutoff/BiasGrid_->getDx()[i])) );
+    }
+  }
+  return nneigh;
 }
 
 double MetaD::getBiasAndDerivatives(const vector<double>& cv, double* der)
 {
- double bias=0.0;
- if(!grid_){
-  if(hills_.size()>10000 && (getStep()-last_step_warn_grid)>10000){
-    std::string msg;
-    Tools::convert(hills_.size(),msg);
-    msg="You have accumulated "+msg+" hills, you should enable GRIDs to avoid serious performance hits";
-    warning(msg);
-    last_step_warn_grid=getStep();
-  }
-  unsigned stride=comm.Get_size();
-  unsigned rank=comm.Get_rank();
-  for(unsigned i=rank;i<hills_.size();i+=stride){
-   bias+=evaluateGaussian(cv,hills_[i],der);
-   //finite difference test 
-   //finiteDifferenceGaussian(cv,hills_[i]);
-  }
-  comm.Sum(bias);
-  if(der) comm.Sum(der,getNumberOfArguments());
- }else{
-  if(der){
-   vector<double> vder(getNumberOfArguments());
-   bias=BiasGrid_->getValueAndDerivatives(cv,vder);
-   for(unsigned i=0;i<getNumberOfArguments();++i) {der[i]=vder[i];}
-  }else{
-   bias=BiasGrid_->getValue(cv);
+  double bias=0.0;
+  if(!grid_){
+    if(hills_.size()>10000 && (getStep()-last_step_warn_grid)>10000){
+      std::string msg;
+      Tools::convert(hills_.size(),msg);
+      msg="You have accumulated "+msg+" hills, you should enable GRIDs to avoid serious performance hits";
+      warning(msg);
+      last_step_warn_grid=getStep();
+    }
+    unsigned stride=comm.Get_size();
+    unsigned rank=comm.Get_rank();
+    for(unsigned i=rank;i<hills_.size();i+=stride){
+      bias+=evaluateGaussian(cv,hills_[i],der);
+      //finite difference test 
+      //finiteDifferenceGaussian(cv,hills_[i]);
+    }
+    comm.Sum(bias);
+    if(der) comm.Sum(der,getNumberOfArguments());
+  } else {
+    if(der){
+      vector<double> vder(getNumberOfArguments());
+      bias=BiasGrid_->getValueAndDerivatives(cv,vder);
+      for(unsigned i=0;i<getNumberOfArguments();++i) {der[i]=vder[i];}
+    } else {
+      bias=BiasGrid_->getValue(cv);
+    }
   }
- }
- if(hasextgrid_){
-	  if(der){
-	   vector<double> vder(getNumberOfArguments());
-	   bias+=ExtGrid_->getValueAndDerivatives(cv,vder);
-	   for(unsigned i=0;i<getNumberOfArguments();++i) {der[i]+=vder[i];}
-	  }else{
-	   bias+=ExtGrid_->getValue(cv);
-	  }
- }
- return bias;
+
+  return bias;
 }
 
-double MetaD::getGaussianNormalization( const Gaussian& hill ){
-  double norm=1; unsigned ncv=hill.center.size();
+double MetaD::getGaussianNormalization( const Gaussian& hill )
+{
+  double norm=1; 
+  unsigned ncv=hill.center.size();
 
   if(hill.multivariate){
-  // recompose the full sigma from the upper diag cholesky 
+    // recompose the full sigma from the upper diag cholesky 
     unsigned k=0; 
     Matrix<double> mymatrix(ncv,ncv);
     for(unsigned i=0;i<ncv;i++){
-        for(unsigned j=i;j<ncv;j++){
-            mymatrix(i,j)=mymatrix(j,i)=hill.sigma[k]; // recompose the full inverse matrix
-            k++;
-        }
-        double ldet; logdet( mymatrix, ldet );
-        norm = exp( ldet );  // Not sure here if mymatrix is sigma or inverse
+      for(unsigned j=i;j<ncv;j++){
+        mymatrix(i,j)=mymatrix(j,i)=hill.sigma[k]; // recompose the full inverse matrix
+        k++;
+      }
+      double ldet; logdet( mymatrix, ldet );
+      norm = exp( ldet );  // Not sure here if mymatrix is sigma or inverse
     }
   } else {
     for(unsigned i=0;i<hill.sigma.size();++i) norm*=hill.sigma[i]; 
@@ -1043,120 +1135,128 @@ double MetaD::getGaussianNormalization( const Gaussian& hill ){
   return norm*pow(2*pi,static_cast<double>(ncv)/2.0);
 }
 
-double MetaD::evaluateGaussian
- (const vector<double>& cv, const Gaussian& hill, double* der)
+double MetaD::evaluateGaussian(const vector<double>& cv, const Gaussian& hill, double* der)
 {
- double dp2=0.0;
- double bias=0.0;
-// I use a pointer here because cv is const (and should be const)
-// but when using doInt it is easier to locally replace cv[0] with
-// the upper/lower limit in case it is out of range
- const double *pcv=NULL; // pointer to cv
- double tmpcv[1]; // tmp array with cv (to be used with doInt_)
- if(cv.size()>0) pcv=&cv[0];
- if(doInt_){
-   plumed_assert(cv.size()==1);
-   tmpcv[0]=cv[0];
-   if(cv[0]<lowI_) tmpcv[0]=lowI_;
-   if(cv[0]>uppI_) tmpcv[0]=uppI_;
-   pcv=&(tmpcv[0]);
- }
- if(hill.multivariate){ 
+  double dp2=0.0;
+  double bias=0.0;
+  // I use a pointer here because cv is const (and should be const)
+  // but when using doInt it is easier to locally replace cv[0] with
+  // the upper/lower limit in case it is out of range
+  const double *pcv=NULL; // pointer to cv
+  double tmpcv[1]; // tmp array with cv (to be used with doInt_)
+  if(cv.size()>0) pcv=&cv[0];
+  if(doInt_){
+    plumed_assert(cv.size()==1);
+    tmpcv[0]=cv[0];
+    if(cv[0]<lowI_) tmpcv[0]=lowI_;
+    if(cv[0]>uppI_) tmpcv[0]=uppI_;
+    pcv=&(tmpcv[0]);
+  }
+  if(hill.multivariate){ 
     unsigned k=0;
     unsigned ncv=cv.size(); 
     // recompose the full sigma from the upper diag cholesky 
     Matrix<double> mymatrix(ncv,ncv);
     for(unsigned i=0;i<ncv;i++){
-		for(unsigned j=i;j<ncv;j++){
-			mymatrix(i,j)=mymatrix(j,i)=hill.sigma[k]; // recompose the full inverse matrix
-			k++;
-		}
+      for(unsigned j=i;j<ncv;j++){
+        mymatrix(i,j)=mymatrix(j,i)=hill.sigma[k]; // recompose the full inverse matrix
+        k++;
+      } 
     }
-
     for(unsigned i=0;i<cv.size();++i){
-	double dp_i=difference(i,hill.center[i],pcv[i]);
-        dp_[i]=dp_i;
-    	for(unsigned j=i;j<cv.size();++j){
-                  if(i==j){ 
-              	  	 dp2+=dp_i*dp_i*mymatrix(i,j)*0.5; 
-                  }else{ 
-   		 	 double dp_j=difference(j,hill.center[j],pcv[j]);
-              	  	 dp2+=dp_i*dp_j*mymatrix(i,j) ;
-                  }
+      double dp_i=difference(i,hill.center[i],pcv[i]);
+      dp_[i]=dp_i;
+      for(unsigned j=i;j<cv.size();++j){
+        if(i==j){ 
+          dp2+=dp_i*dp_i*mymatrix(i,j)*0.5; 
+        } else { 
+          double dp_j=difference(j,hill.center[j],pcv[j]);
+          dp2+=dp_i*dp_j*mymatrix(i,j);
         }
+      }
     } 
     if(dp2<DP2CUTOFF){
-       bias=hill.height*exp(-dp2);
-       if(der){
+      bias=hill.height*exp(-dp2);
+      if(der){
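+        // accumulate d(bias)/dCV_i = -bias * sum_j mymatrix(i,j)*dp_[j]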
         for(unsigned i=0;i<cv.size();++i){
-                double tmp=0.0;
-                k=i;
-                for(unsigned j=0;j<cv.size();++j){
-                                tmp+=   dp_[j]*mymatrix(i,j)*bias;
-                        }
-                        der[i]-=tmp;
-                }   
-       }
+          double tmp=0.0;
+          k=i;
+          for(unsigned j=0;j<cv.size();++j){
+            tmp += dp_[j]*mymatrix(i,j)*bias;
+          }
+          der[i]-=tmp;
+        }   
+      }
     }
- }else{
+  } else {
     for(unsigned i=0;i<cv.size();++i){
-     double dp=difference(i,hill.center[i],pcv[i])*hill.invsigma[i];
-     dp2+=dp*dp;
-     dp_[i]=dp;
+      double dp=difference(i,hill.center[i],pcv[i])*hill.invsigma[i];
+      dp2+=dp*dp;
+      dp_[i]=dp;
     }
     dp2*=0.5;
     if(dp2<DP2CUTOFF){
-       bias=hill.height*exp(-dp2);
-       if(der){
+      bias=hill.height*exp(-dp2);
+      if(der){
         for(unsigned i=0;i<cv.size();++i){der[i]+=-bias*dp_[i]*hill.invsigma[i];}
-       }
+      }
     }
- }
- if(doInt_){
-   if((cv[0]<lowI_ || cv[0]>uppI_) && der ) for(unsigned i=0;i<cv.size();++i)der[i]=0;
   }
- return bias;
+
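+  // with INTERVAL, the bias is flat outside [lowI_,uppI_], so the forces there are set to zero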
+  if(doInt_ && der){
+    if(cv[0]<lowI_ || cv[0]>uppI_) for(unsigned i=0;i<cv.size();++i) der[i]=0;
+  }
+
+  return bias;
 }
 
 double MetaD::getHeight(const vector<double>& cv)
 {
- double height=height0_;
- if(welltemp_){
-    double vbias=getBiasAndDerivatives(cv);
-    height=height0_*exp(-vbias/(kbt_*(biasf_-1.0)));
- } 
- return height;
+  double height=height0_;
+  if(welltemp_){
+    double vbias = getBiasAndDerivatives(cv);
+    height = height0_*exp(-vbias/(kbt_*(biasf_-1.0)));
+  } 
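+  // DAMPFACTOR: scale the hill height by exp(-max(V)/(kbt_*dampfactor_)), using the maximum of the bias grid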
+  if(dampfactor_>0.0){
+    plumed_assert(BiasGrid_);
+    double m=BiasGrid_->getMaxValue();
+    height*=exp(-m/(kbt_*(dampfactor_)));
+  }
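+  // TARGET: rescale the hill height by exp((target(cv)-max(target))/kbt_)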
+  if(TargetGrid_){
+    double f=TargetGrid_->getValue(cv)-TargetGrid_->getMaxValue();
+    height*=exp(f/kbt_);
+  }
+  return height;
 }
 
 void MetaD::calculate()
 {
-
-// this is because presently there is no way to properly pass information
-// on adaptive hills (diff) after exchanges:
+  // this is because presently there is no way to properly pass information
+  // on adaptive hills (diff) after exchanges:
   if(adaptive_==FlexibleBin::diffusion && getExchangeStep()) error("ADAPTIVE=DIFF is not compatible with replica exchange");
 
-  unsigned ncv=getNumberOfArguments();
+  const unsigned ncv=getNumberOfArguments();
   vector<double> cv(ncv);
-  for(unsigned i=0;i<ncv;++i){cv[i]=getArgument(i);}
-
-  double* der=new double[ncv];
-  for(unsigned i=0;i<ncv;++i){der[i]=0.0;}
-  double ene=getBiasAndDerivatives(cv,der);
+  double* der = new double[ncv];
+  for(unsigned i=0;i<ncv;++i) {
+    cv[i]=getArgument(i);
+    der[i]=0.;
+  }
+  const double ene = getBiasAndDerivatives(cv,der);
   getPntrToComponent("bias")->set(ene);
   if( rewf_grid_.size()>0 ) getPntrToComponent("rbias")->set(ene - reweight_factor);
-// calculate the acceleration factor
+  // calculate the acceleration factor
   if(acceleration&&!isFirstStep) {
     acc += exp(ene/(kbt_));
-    double mean_acc = acc/((double) getStep());
+    const double mean_acc = acc/((double) getStep());
     getPntrToComponent("acc")->set(mean_acc);
   }
   getPntrToComponent("work")->set(work_);
-// set Forces 
+  // set Forces 
   for(unsigned i=0;i<ncv;++i){
-   const double f=-der[i];
-   setOutputForce(i,f);
+    const double f=-der[i];
+    setOutputForce(i,f);
   }
-
   delete [] der;
 }
 
@@ -1167,76 +1267,78 @@ void MetaD::update(){
 
   // adding hills criteria (could be more complex though)
   bool nowAddAHill; 
-  if(getStep()%stride_==0 && !isFirstStep ){nowAddAHill=true;}else{nowAddAHill=false;isFirstStep=false;}
+  if(getStep()%stride_==0 && !isFirstStep) nowAddAHill=true;
+  else {
+    nowAddAHill=false;
+    isFirstStep=false;
+  }
 
-  for(unsigned i=0;i<cv.size();++i){cv[i]=getArgument(i);}
+  for(unsigned i=0;i<cv.size();++i) cv[i] = getArgument(i);
 
   double vbias=getBiasAndDerivatives(cv);
 
   // if you use adaptive, call the FlexibleBin 
-  if (adaptive_!=FlexibleBin::none){
-       flexbin->update(nowAddAHill);
-       multivariate=true;
-  }else{
-       multivariate=false;
-  };
+  if(adaptive_!=FlexibleBin::none){
+    flexbin->update(nowAddAHill);
+    multivariate=true;
+  } else {
+    multivariate=false;
+  }
 
-  if(nowAddAHill){ // probably this can be substituted with a signal
-   // add a Gaussian
-   double height=getHeight(cv);
-   // use normal sigma or matrix form? 
-   if (adaptive_!=FlexibleBin::none){	
-	thissigma=flexbin->getInverseMatrix(); // returns upper diagonal inverse 
-	//cerr<<"ADDING HILLS "<<endl;	
-   }else{
-	thissigma=sigma0_;    // returns normal sigma
-   }
-// In case we use walkers_mpi, it is now necessary to communicate with other replicas.
-   if(walkers_mpi){
-     int nw=0;
-     int mw=0;
-     if(comm.Get_rank()==0){
-// Only root of group can communicate with other walkers
-       nw=multi_sim_comm.Get_size();
-       mw=multi_sim_comm.Get_rank();
-     }
-// Communicate to the other members of the same group
-// info abount number of walkers and walker index
-     comm.Bcast(nw,0);
-     comm.Bcast(mw,0);
-// Allocate arrays to store all walkers hills
-     std::vector<double> all_cv(nw*cv.size(),0.0);
-     std::vector<double> all_sigma(nw*thissigma.size(),0.0);
-     std::vector<double> all_height(nw,0.0);
-     std::vector<int>    all_multivariate(nw,0);
-     if(comm.Get_rank()==0){
-// Communicate (only root)
-       multi_sim_comm.Allgather(cv,all_cv);
-       multi_sim_comm.Allgather(thissigma,all_sigma);
-       multi_sim_comm.Allgather(height,all_height);
-       multi_sim_comm.Allgather(int(multivariate),all_multivariate);
-     }
-// Share info with group members
-     comm.Bcast(all_cv,0);
-     comm.Bcast(all_sigma,0);
-     comm.Bcast(all_height,0);
-     comm.Bcast(all_multivariate,0);
-     for(int i=0;i<nw;i++){
-// actually add hills one by one
-       std::vector<double> cv_now(cv.size());
-       std::vector<double> sigma_now(thissigma.size());
-       for(unsigned j=0;j<cv.size();j++) cv_now[j]=all_cv[i*cv.size()+j];
-       for(unsigned j=0;j<thissigma.size();j++) sigma_now[j]=all_sigma[i*thissigma.size()+j];
-       Gaussian newhill=Gaussian(cv_now,sigma_now,all_height[i],all_multivariate[i]);
-       addGaussian(newhill);
-       writeGaussian(newhill,hillsOfile_);
-     }
-   } else {
-     Gaussian newhill=Gaussian(cv,thissigma,height,multivariate);
-     addGaussian(newhill);
-// print on HILLS file
-     writeGaussian(newhill,hillsOfile_);
-   }
+  if(nowAddAHill){
+    // add a Gaussian
+    double height=getHeight(cv);
+    // returns upper diagonal inverse 
+    if(adaptive_!=FlexibleBin::none) thissigma=flexbin->getInverseMatrix();
+    // returns normal sigma
+    else thissigma=sigma0_;
+
+    // In case we use walkers_mpi, it is now necessary to communicate with other replicas.
+    if(walkers_mpi){
+      int nw=0;
+      int mw=0;
+      if(comm.Get_rank()==0){
+        // Only root of group can communicate with other walkers
+        nw=multi_sim_comm.Get_size();
+        mw=multi_sim_comm.Get_rank();
+      }
+      // Communicate to the other members of the same group
+      // info about the number of walkers and walker index
+      comm.Bcast(nw,0);
+      comm.Bcast(mw,0);
+      // Allocate arrays to store all walkers hills
+      std::vector<double> all_cv(nw*cv.size(),0.0);
+      std::vector<double> all_sigma(nw*thissigma.size(),0.0);
+      std::vector<double> all_height(nw,0.0);
+      std::vector<int>    all_multivariate(nw,0);
+      if(comm.Get_rank()==0){
+        // Communicate (only root)
+        multi_sim_comm.Allgather(cv,all_cv);
+        multi_sim_comm.Allgather(thissigma,all_sigma);
+        multi_sim_comm.Allgather(height,all_height);
+        multi_sim_comm.Allgather(int(multivariate),all_multivariate);
+      }
+      // Share info with group members
+      comm.Bcast(all_cv,0);
+      comm.Bcast(all_sigma,0);
+      comm.Bcast(all_height,0);
+      comm.Bcast(all_multivariate,0);
+      for(int i=0;i<nw;i++){
+        // actually add hills one by one
+        std::vector<double> cv_now(cv.size());
+        std::vector<double> sigma_now(thissigma.size());
+        for(unsigned j=0;j<cv.size();j++) cv_now[j]=all_cv[i*cv.size()+j];
+        for(unsigned j=0;j<thissigma.size();j++) sigma_now[j]=all_sigma[i*thissigma.size()+j];
+        Gaussian newhill=Gaussian(cv_now,sigma_now,all_height[i],all_multivariate[i]);
+        addGaussian(newhill);
+        writeGaussian(newhill,hillsOfile_);
+      }
+    } else {
+      Gaussian newhill=Gaussian(cv,thissigma,height,multivariate);
+      addGaussian(newhill);
+      // print on HILLS file
+      writeGaussian(newhill,hillsOfile_);
+    }
   }
 
   double vbias1=getBiasAndDerivatives(cv);
@@ -1261,167 +1363,159 @@ void MetaD::update(){
   }
 
   // if multiple walkers and time to read Gaussians
- if(mw_n_>1 && getStep()%mw_rstride_==0){
-   for(int i=0;i<mw_n_;++i){
-    // don't read your own Gaussians
-    if(i==mw_id_) continue;
-    // if the file is not open yet 
-    if(!(ifiles[i]->isOpen())){
-     // check if it exists now and open it!
-     if(ifiles[i]->FileExist(ifilesnames[i])) {
-       ifiles[i]->open(ifilesnames[i]);
-       ifiles[i]->reset(false);
-     }
-    // otherwise read the new Gaussians 
-    } else {
-     log.printf("  Reading hills from %s:",ifilesnames[i].c_str());
-     readGaussians(ifiles[i]);
-     ifiles[i]->reset(false);
+  if(mw_n_>1 && getStep()%mw_rstride_==0){
+    for(int i=0;i<mw_n_;++i){
+      // don't read your own Gaussians
+      if(i==mw_id_) continue;
+      // if the file is not open yet 
+      if(!(ifiles[i]->isOpen())){
+        // check if it exists now and open it!
+        if(ifiles[i]->FileExist(ifilesnames[i])) {
+          ifiles[i]->open(ifilesnames[i]);
+          ifiles[i]->reset(false);
+        }
+        // otherwise read the new Gaussians 
+      } else {
+        log.printf("  Reading hills from %s:",ifilesnames[i].c_str());
+        readGaussians(ifiles[i]);
+        ifiles[i]->reset(false);
+      }
     }
-   }
- } 
-
- if(getStep()%(stride_*rewf_ustride_)==0 && nowAddAHill && rewf_grid_.size()>0 ){computeReweightingFactor();}
+  } 
+  if(getStep()%(stride_*rewf_ustride_)==0 && nowAddAHill && rewf_grid_.size()>0 ) computeReweightingFactor();
 }
 
-void MetaD::finiteDifferenceGaussian
- (const vector<double>& cv, const Gaussian& hill)
+void MetaD::finiteDifferenceGaussian(const vector<double>& cv, const Gaussian& hill)
 {
- log<<"--------- finiteDifferenceGaussian: size "<<cv.size() <<"------------\n";
- // for each cv
- // first get the bias and the derivative
- vector<double> oldder(cv.size()); 
- vector<double> der(cv.size()); 
- vector<double> mycv(cv.size()); 
- mycv=cv; 
- double step=1.e-6;
- Random random; 
- // just displace a tiny bit
- for(unsigned i=0;i<cv.size();i++)log<<"CV "<<i<<" V "<<mycv[i]<<"\n";
- for(unsigned i=0;i<cv.size();i++)mycv[i]+=1.e-2*2*(random.RandU01()-0.5);
- for(unsigned i=0;i<cv.size();i++)log<<"NENEWWCV "<<i<<" V "<<mycv[i]<<"\n";
- double oldbias=evaluateGaussian(mycv,hill,&oldder[0]);
- for (unsigned i=0;i<mycv.size();i++){
-               double delta=step*2*(random.RandU01()-0.5);
-               mycv[i]+=delta;
-               double newbias=evaluateGaussian(mycv,hill,&der[0]);		
-               log<<"CV "<<i;
-               log<<" ANAL "<<oldder[i]<<" NUM "<<(newbias-oldbias)/delta<<" DIFF "<<(oldder[i]-(newbias-oldbias)/delta)<<"\n";
-               mycv[i]-=delta;
- }
- log<<"--------- END finiteDifferenceGaussian ------------\n";
+  log<<"--------- finiteDifferenceGaussian: size "<<cv.size() <<"------------\n";
+  // for each cv
+  // first get the bias and the derivative
+  vector<double> oldder(cv.size()); 
+  vector<double> der(cv.size()); 
+  vector<double> mycv(cv.size()); 
+  mycv=cv; 
+  double step=1.e-6;
+  Random random; 
+  // just displace a tiny bit
+  for(unsigned i=0;i<cv.size();i++) log<<"CV "<<i<<" V "<<mycv[i]<<"\n";
+  for(unsigned i=0;i<cv.size();i++) mycv[i]+=1.e-2*2*(random.RandU01()-0.5);
+  for(unsigned i=0;i<cv.size();i++) log<<"NEW CV "<<i<<" V "<<mycv[i]<<"\n";
+  double oldbias=evaluateGaussian(mycv,hill,&oldder[0]);
+  for(unsigned i=0;i<mycv.size();i++){
+    double delta=step*2*(random.RandU01()-0.5);
+    mycv[i]+=delta;
+    double newbias=evaluateGaussian(mycv,hill,&der[0]);		
+    log<<"CV "<<i;
+    log<<" ANAL "<<oldder[i]<<" NUM "<<(newbias-oldbias)/delta<<" DIFF "<<(oldder[i]-(newbias-oldbias)/delta)<<"\n";
+    mycv[i]-=delta;
+  }
+  log<<"--------- END finiteDifferenceGaussian ------------\n";
 }
 
 /// takes a pointer to the file and a template string with values v and gives back the next center, sigma and height 
-bool MetaD::scanOneHill(IFile *ifile,  vector<Value> &tmpvalues, vector<double> &center, vector<double>  &sigma, double &height , bool &multivariate  ){
+bool MetaD::scanOneHill(IFile *ifile,  vector<Value> &tmpvalues, vector<double> &center, vector<double>  &sigma, double &height , bool &multivariate)
+{
   double dummy;
   multivariate=false;
   if(ifile->scanField("time",dummy)){
-     unsigned ncv; ncv=tmpvalues.size();
-     for(unsigned i=0;i<ncv;++i){
-       ifile->scanField( &tmpvalues[i] );
-       if( tmpvalues[i].isPeriodic() && ! getPntrToArgument(i)->isPeriodic() ){
+    unsigned ncv=tmpvalues.size();
+    for(unsigned i=0;i<ncv;++i){
+      ifile->scanField( &tmpvalues[i] );
+      if( tmpvalues[i].isPeriodic() && ! getPntrToArgument(i)->isPeriodic() ){
+        error("in hills file periodicity for variable " + tmpvalues[i].getName() + " does not match periodicity in input");
+      } else if( tmpvalues[i].isPeriodic() ){
+        std::string imin, imax; tmpvalues[i].getDomain( imin, imax );
+        std::string rmin, rmax; getPntrToArgument(i)->getDomain( rmin, rmax );
+        if( imin!=rmin || imax!=rmax ){
           error("in hills file periodicity for variable " + tmpvalues[i].getName() + " does not match periodicity in input");
-       } else if( tmpvalues[i].isPeriodic() ){
-          std::string imin, imax; tmpvalues[i].getDomain( imin, imax );
-          std::string rmin, rmax; getPntrToArgument(i)->getDomain( rmin, rmax );
-          if( imin!=rmin || imax!=rmax ){
-            error("in hills file periodicity for variable " + tmpvalues[i].getName() + " does not match periodicity in input");
-          }
-       }
-       center[i]=tmpvalues[i].get();
-     }
-     // scan for multivariate label: record the actual file position so to eventually rewind 
-     std::string sss;
-     ifile->scanField("multivariate",sss);
-     if(sss=="true") multivariate=true;
-     else if(sss=="false") multivariate=false;
-     else plumed_merror("cannot parse multivariate = "+ sss);
-     if(multivariate){
-        sigma.resize(ncv*(ncv+1)/2);
-        Matrix<double> upper(ncv,ncv);
-        Matrix<double> lower(ncv,ncv);
-   	for (unsigned i=0;i<ncv;i++){
-                 for (unsigned j=0;j<ncv-i;j++){
-                         ifile->scanField("sigma_"+getPntrToArgument(j+i)->getName()+"_"+getPntrToArgument(j)->getName(),lower(j+i,j));
-                         upper(j,j+i)=lower(j+i,j);
-                 }
         }
-        Matrix<double> mymult(ncv,ncv);       
-        Matrix<double> invmatrix(ncv,ncv);       
-	//log<<"Lower \n";
-        //matrixOut(log,lower); 
-	//log<<"Upper \n";
-        //matrixOut(log,upper); 
-        mult(lower,upper,mymult);          
-	//log<<"Mult \n";
-        //matrixOut(log,mymult); 
-        // now invert and get the sigmas
-        Invert(mymult,invmatrix);
-	//log<<"Invert \n";
-        //matrixOut(log,invmatrix); 
-        // put the sigmas in the usual order: upper diagonal (this time in normal form and not in band form) 
-        unsigned k=0;
-   	for (unsigned i=0;i<ncv;i++){
-   	       for (unsigned j=i;j<ncv;j++){
-   	       	sigma[k]=invmatrix(i,j);
-   	       	k++;
-   	       }
-   	}
-     }else{
-     	for(unsigned i=0;i<ncv;++i){
-            ifile->scanField("sigma_"+getPntrToArgument(i)->getName(),sigma[i]);
+      }
+      center[i]=tmpvalues[i].get();
+    }
+    // scan for multivariate label: record the actual file position so as to eventually rewind 
+    std::string sss;
+    ifile->scanField("multivariate",sss);
+    if(sss=="true") multivariate=true;
+    else if(sss=="false") multivariate=false;
+    else plumed_merror("cannot parse multivariate = "+ sss);
+    if(multivariate){
+      sigma.resize(ncv*(ncv+1)/2);
+      Matrix<double> upper(ncv,ncv);
+      Matrix<double> lower(ncv,ncv);
+      for(unsigned i=0;i<ncv;i++){
+        for(unsigned j=0;j<ncv-i;j++){
+          ifile->scanField("sigma_"+getPntrToArgument(j+i)->getName()+"_"+getPntrToArgument(j)->getName(),lower(j+i,j));
+          upper(j,j+i)=lower(j+i,j);
+        }
+      }
+      Matrix<double> mymult(ncv,ncv);       
+      Matrix<double> invmatrix(ncv,ncv);       
+      mult(lower,upper,mymult);          
+      // now invert and get the sigmas
+      Invert(mymult,invmatrix);
+      // put the sigmas in the usual order: upper diagonal (this time in normal form and not in band form) 
+      unsigned k=0;
+      for(unsigned i=0;i<ncv;i++){
+        for(unsigned j=i;j<ncv;j++){
+          sigma[k]=invmatrix(i,j);
+          k++;
         }
-     }
-     
-     ifile->scanField("height",height);
-     ifile->scanField("biasf",dummy);
-     if(ifile->FieldExist("clock")) ifile->scanField("clock",dummy);
-     if(ifile->FieldExist("lower_int")) ifile->scanField("lower_int",dummy);
-     if(ifile->FieldExist("upper_int")) ifile->scanField("upper_int",dummy);
-     ifile->scanField();
-     return true;
-  }else{ 
+      }
+    } else {
+      for(unsigned i=0;i<ncv;++i){
+        ifile->scanField("sigma_"+getPntrToArgument(i)->getName(),sigma[i]);
+      }
+    }
+    
+    ifile->scanField("height",height);
+    ifile->scanField("biasf",dummy);
+    if(ifile->FieldExist("clock")) ifile->scanField("clock",dummy);
+    if(ifile->FieldExist("lower_int")) ifile->scanField("lower_int",dummy);
+    if(ifile->FieldExist("upper_int")) ifile->scanField("upper_int",dummy);
+    ifile->scanField();
+    return true;
+  } else { 
     return false; 
-  }; 
+  }
 }
 
-void MetaD::computeReweightingFactor(){
- if( !welltemp_ ) error("cannot compute the c(t) reweighting factors for non well-tempered metadynamics");
+void MetaD::computeReweightingFactor()
+{
+  if( !welltemp_ ) error("cannot compute the c(t) reweighting factors for non well-tempered metadynamics");
 
- // Recover the minimum values for the grid
- unsigned ncv=getNumberOfArguments();
- unsigned ntotgrid=1;
- std::vector<double> dmin( ncv ),dmax( ncv ), grid_spacing( ncv ), vals( ncv ); 
- for(unsigned j=0;j<ncv;++j){
+  // Recover the minimum values for the grid
+  unsigned ncv=getNumberOfArguments();
+  unsigned ntotgrid=1;
+  std::vector<double> dmin( ncv ),dmax( ncv ), grid_spacing( ncv ), vals( ncv ); 
+  for(unsigned j=0;j<ncv;++j){
     Tools::convert( BiasGrid_->getMin()[j], dmin[j] );
     Tools::convert( BiasGrid_->getMax()[j], dmax[j] );
     grid_spacing[j] = ( dmax[j] - dmin[j] ) / static_cast<double>( rewf_grid_[j] );
     if( !getPntrToArgument(j)->isPeriodic() ) dmax[j] += grid_spacing[j]; 
     ntotgrid *= rewf_grid_[j];
- }
+  }
        
- // Now sum over whole grid
- reweight_factor=0.0; double* der=new double[ncv]; std::vector<unsigned> t_index( ncv );
- double sum1=0.0; double sum2=0.0;
- double afactor = biasf_ / (kbt_*(biasf_-1.0)); double afactor2 = 1.0 / (kbt_*(biasf_-1.0));
- unsigned rank=comm.Get_rank(), stride=comm.Get_size(); 
- for(unsigned i=rank;i<ntotgrid;i+=stride){
-     t_index[0]=(i%rewf_grid_[0]);
-     unsigned kk=i;
-     for(unsigned j=1;j<ncv-1;++j){ kk=(kk-t_index[j-1])/rewf_grid_[i-1]; t_index[j]=(kk%rewf_grid_[i]); }
-     if( ncv>=2 ) t_index[ncv-1]=((kk-t_index[ncv-1])/rewf_grid_[ncv-2]);
-      
-     for(unsigned j=0;j<ncv;++j) vals[j]=dmin[j] + t_index[j]*grid_spacing[j]; 
-
-     double currentb=getBiasAndDerivatives(vals,der);
-     sum1 += exp( afactor*currentb );
-     sum2 += exp( afactor2*currentb );
- }
- delete [] der;
- comm.Sum( sum1 ); comm.Sum( sum2 );
- reweight_factor = kbt_ * std::log( sum1/sum2 );
- getPntrToComponent("rct")->set(reweight_factor);
+  // Now sum over whole grid
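+  // The two sums below implement the c(t) estimator rct = kbt*log( sum_s exp(afactor*V(s)) / sum_s exp(afactor2*V(s)) ),
+  // with afactor=biasf/(kbt*(biasf-1)) and afactor2=1/(kbt*(biasf-1)), evaluated on the rewf_grid_ mesh defined above.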
+  reweight_factor=0.0; double* der=new double[ncv]; std::vector<unsigned> t_index( ncv );
+  double sum1=0.0; double sum2=0.0;
+  double afactor = biasf_ / (kbt_*(biasf_-1.0)); double afactor2 = 1.0 / (kbt_*(biasf_-1.0));
+  unsigned rank=comm.Get_rank(), stride=comm.Get_size(); 
+  for(unsigned i=rank;i<ntotgrid;i+=stride){
+    t_index[0]=(i%rewf_grid_[0]);
+    unsigned kk=i;
+    for(unsigned j=1;j<ncv-1;++j){ kk=(kk-t_index[j-1])/rewf_grid_[j-1]; t_index[j]=(kk%rewf_grid_[j]); }
+    if( ncv>=2 ) t_index[ncv-1]=((kk-t_index[ncv-2])/rewf_grid_[ncv-2]);
+     
+    for(unsigned j=0;j<ncv;++j) vals[j]=dmin[j] + t_index[j]*grid_spacing[j]; 
+
+    double currentb=getBiasAndDerivatives(vals,der);
+    sum1 += exp( afactor*currentb );
+    sum2 += exp( afactor2*currentb );
+  }
+  delete [] der;
+  comm.Sum( sum1 ); comm.Sum( sum2 );
+  reweight_factor = kbt_ * std::log( sum1/sum2 );
+  getPntrToComponent("rct")->set(reweight_factor);
 }
 
 }
diff --git a/src/bias/PBMetaD.cpp b/src/bias/PBMetaD.cpp
index b2d045356465d1f26e8d31b0f00070ec35f8f705..508dbedd3a1c68862d76d4409c67d3abe3ce8ddb 100644
--- a/src/bias/PBMetaD.cpp
+++ b/src/bias/PBMetaD.cpp
@@ -177,8 +177,6 @@ MULTIPLE_WALKERS
 PRINT ARG=d1,d2,pb.bias STRIDE=100 FILE=COLVAR
 \endverbatim
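+
+The bias potentials can also be dumped on and read back from grid files using the GRID_WFILES,
+GRID_WSTRIDE and GRID_RFILES keywords. A minimal sketch of such an input, assuming two collective
+variables d1 and d2 as in the example above (file names and parameter values are only illustrative), is:
+
+\verbatim
+PBMETAD ...
+LABEL=pb
+ARG=d1,d2 SIGMA=0.2,0.2 HEIGHT=0.3 PACE=500 BIASFACTOR=8 TEMP=300
+GRID_MIN=0.0,0.0 GRID_MAX=2.0,2.0
+GRID_WFILES=GRID.d1,GRID.d2 GRID_WSTRIDE=50000
+... PBMETAD
+\endverbatim
+
+On restart, the same files can be read back by adding GRID_RFILES=GRID.d1,GRID.d2 to the input.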
 
-  
->>>>>>> v2.2
 */
 //+ENDPLUMEDOC
 
@@ -195,18 +193,19 @@ private:
   vector<double> sigma0_;
   vector< vector<Gaussian> > hills_;
   vector<OFile*> hillsOfiles_;
-  vector<IFile*> ifiles;
+  vector<OFile*> gridfiles_;
   vector<Grid*> BiasGrids_;
   bool    grid_;
   double  height0_;
   double  biasf_;
   double  kbt_;
   int     stride_;
+  int     wgridstride_;
   bool    welltemp_;
   bool    multiple_w;
   vector<double> uppI_;
   vector<double> lowI_;
-  bool doInt_;
+  vector<bool>  doInt_;
   bool isFirstStep;
 
   void   readGaussians(int iarg, IFile*);
@@ -242,6 +241,9 @@ void PBMetaD::registerKeywords(Keywords& keys){
   keys.add("optional","BIASFACTOR","use well tempered metadynamics with this biasfactor, one for all biases.  Please note you must also specify temp");
   keys.add("optional","TEMP","the system temperature - this is only needed if you are doing well-tempered metadynamics");
   keys.add("optional","TAU","in well tempered metadynamics, sets height to (kb*DeltaT*pace*timestep)/tau");
+  keys.add("optional","GRID_RFILES", "read grid for the bias");
+  keys.add("optional","GRID_WFILES", "dump grid for the bias");
+  keys.add("optional","GRID_WSTRIDE", "frequency for dumping the grid");
   keys.add("optional","GRID_MIN","the lower bounds for the grid");
   keys.add("optional","GRID_MAX","the upper bounds for the grid");
   keys.add("optional","GRID_BIN","the number of bins for the grid");
@@ -259,13 +261,19 @@ PBMetaD::~PBMetaD(){
    hillsOfiles_[i]->close();
    delete hillsOfiles_[i];
   }
+  if(wgridstride_ > 0) {
+   for(unsigned i=0; i<gridfiles_.size(); ++i) {
+    gridfiles_[i]->close();
+    delete gridfiles_[i];
+   }
+  }
 }
 
 PBMetaD::PBMetaD(const ActionOptions& ao):
 PLUMED_BIAS_INIT(ao),
 grid_(false), height0_(std::numeric_limits<double>::max()),
 biasf_(1.0), kbt_(0.0), stride_(0), welltemp_(false),
-multiple_w(false), doInt_(false), isFirstStep(true)
+multiple_w(false), wgridstride_(0), isFirstStep(true)
 {
 
   parse("FMT",fmt);
@@ -307,6 +315,18 @@ multiple_w(false), doInt_(false), isFirstStep(true)
   
   // MPI version
   parseFlag("MULTIPLE_WALKERS",multiple_w);
+
+  // Grid file
+  vector<string> gridfilenames_;
+  parseVector("GRID_WFILES",gridfilenames_);
+  parse("GRID_WSTRIDE",wgridstride_);
+  if (wgridstride_ == 0 && gridfilenames_.size() > 0) {
+    error("frequency with which to output the grid not specified; use GRID_WSTRIDE");
+  }
+
+  // Read grid
+  vector<string> gridreadfilenames_;
+  parseVector("GRID_RFILES",gridreadfilenames_);
   
   // Grid Stuff
   vector<std::string> gmin(getNumberOfArguments());
@@ -326,10 +346,10 @@ multiple_w(false), doInt_(false), isFirstStep(true)
   if(gbin.size()!=0     && gmin.size()==0) error("If GRID_SPACING is present also GRID_MIN should be present");
   if(gmin.size()!=0){
     if(gbin.size()==0 && gspacing.size()==0){
-        log<<"  Binsize not specified, 1/5 of sigma will be be used\n";
+        log<<"  Binsize not specified, 1/10 of sigma will be used\n";
         plumed_assert(sigma0_.size()==getNumberOfArguments());
         gspacing.resize(getNumberOfArguments());
-        for(unsigned i=0;i<gspacing.size();i++) gspacing[i]=0.2*sigma0_[i];
+        for(unsigned i=0;i<gspacing.size();i++) gspacing[i]=0.1*sigma0_[i];
     } else if(gspacing.size()!=0 && gbin.size()==0){
       log<<"  The number of bins will be estimated from GRID_SPACING\n";
     } else if(gspacing.size()!=0 && gbin.size()!=0){
@@ -351,16 +371,22 @@ multiple_w(false), doInt_(false), isFirstStep(true)
   parseFlag("GRID_NOSPLINE",nospline);
   bool spline=!nospline;
   if(gbin.size()>0){grid_=true;}
-  
+  if(!grid_&&gridfilenames_.size() > 0) error("To write a grid you first need to define it!");
+  if(!grid_&&gridreadfilenames_.size() > 0) error("To read a grid you first need to define it!");
+
+  doInt_.resize(getNumberOfArguments(),false);  
   // Interval keyword
   parseVector("INTERVAL_MIN",lowI_);
   parseVector("INTERVAL_MAX",uppI_);
   // various checks
   if(lowI_.size()!=uppI_.size()) error("both a lower and an upper limits must be provided with INTERVAL");
   if(lowI_.size()!=0 && lowI_.size()!=getNumberOfArguments()) error("check number of argument of INTERVAL");
-  for(unsigned i=0; i<lowI_.size(); ++i) if(uppI_[i]<lowI_[i]) error("The Upper limit must be greater than the Lower limit!");
-  if(lowI_.size()>0) doInt_=true;
-  
+  for(unsigned i=0; i<lowI_.size(); ++i) {
+    if(uppI_[i]<lowI_[i]) error("The Upper limit must be greater than the Lower limit!");
+    if(getPntrToArgument(i)->isPeriodic()) warning("INTERVAL is not used for periodic variables");
+    else doInt_[i]=true;
+  }
+ 
   checkRead();
 
   log.printf("  Gaussian width ");
@@ -377,7 +403,9 @@ multiple_w(false), doInt_(false), isFirstStep(true)
     log.printf("  KbT %f\n",kbt_);
   }
   if(multiple_w) log.printf("  Multiple walkers active using MPI communnication\n");
-  if(doInt_) log.printf("  Upper and Lower limits boundaries for the bias are activated\n");
+  for(unsigned i=0; i<doInt_.size();i++) {
+    if(doInt_[i]) log.printf("  Upper and lower limit boundaries for the bias of CV %u are activated\n", i);
+  }
   if(grid_){
    log.printf("  Grid min");
    for(unsigned i=0;i<gmin.size();++i) log.printf(" %s",gmin[i].c_str() );
@@ -390,14 +418,27 @@ multiple_w(false), doInt_(false), isFirstStep(true)
    log.printf("\n");
    if(spline){log.printf("  Grid uses spline interpolation\n");}
    if(sparsegrid){log.printf("  Grid uses sparse grid\n");}
+   if(wgridstride_>0) {
+    for(unsigned i=0; i<gridfilenames_.size(); ++i) {
+     log.printf("  Grid is written on file %s with stride %d\n",gridfilenames_[i].c_str(),wgridstride_);
+    }
+   }
+   if(gridreadfilenames_.size()>0) {
+    for(unsigned i=0; i<gridreadfilenames_.size(); ++i) {
+     log.printf("  Reading bias from grid in file %s \n",gridreadfilenames_[i].c_str());
+    }
+   }
   }
 
-  addComponentWithDerivatives("bias"); componentIsNotPeriodic("bias");
+  addComponent("bias"); componentIsNotPeriodic("bias");
 
-// initializing vector of hills
+  // initializing vector of hills
   hills_.resize(getNumberOfArguments());
 
-// initializing and checking grid
+  // restart from external grid
+  bool restartedFromGrid=false;
+
+  // initializing and checking grid
   if(grid_){
    // check for mesh and sigma size
    for(unsigned i=0;i<getNumberOfArguments();i++) {
@@ -418,50 +459,73 @@ multiple_w(false), doInt_(false), isFirstStep(true)
     gmax_t[0] = gmax[i];
     gbin_t[0] = gbin[i];
     Grid* BiasGrid_;
-    if(!sparsegrid){BiasGrid_=new Grid(funcl,args,gmin_t,gmax_t,gbin_t,spline,true);}
-    else           {BiasGrid_=new SparseGrid(funcl,args,gmin_t,gmax_t,gbin_t,spline,true);}
-    std::vector<std::string> actualmin=BiasGrid_->getMin();
-    std::vector<std::string> actualmax=BiasGrid_->getMax();
-    if(gmin_t[0]!=actualmin[0]) log<<"  WARNING: GRID_MIN["<<i<<"] has been adjusted to "<<actualmin[0]<<" to fit periodicity\n";
-    if(gmax_t[0]!=actualmax[0]) log<<"  WARNING: GRID_MAX["<<i<<"] has been adjusted to "<<actualmax[0]<<" to fit periodicity\n";
+    // Read grid from file
+    if(gridreadfilenames_.size()>0) {
+     IFile gridfile;
+     gridfile.link(*this);
+     if(gridfile.FileExist(gridreadfilenames_[i])){
+      gridfile.open(gridreadfilenames_[i]);
+     } else {
+      error("The GRID file you want to read: " + gridreadfilenames_[i] + ", cannot be found!");
+     }
+     string funcl = getLabel() + ".bias";
+     BiasGrid_ = Grid::create(funcl, args, gridfile, gmin_t, gmax_t, gbin_t, sparsegrid, spline, true);
+     gridfile.close();
+     if(BiasGrid_->getDimension() != args.size()) {
+      error("mismatch between dimensionality of input grid and number of arguments");
+     }
+     if(getPntrToArgument(i)->isPeriodic() != BiasGrid_->getIsPeriodic()[0]) {
+      error("periodicity mismatch between arguments and input bias");
+     }
+     log.printf("  Restarting from %s:",gridreadfilenames_[i].c_str());                  
+     if(getRestart()) restartedFromGrid=true;
+    } else {
+      if(!sparsegrid){BiasGrid_=new Grid(funcl,args,gmin_t,gmax_t,gbin_t,spline,true);}
+      else           {BiasGrid_=new SparseGrid(funcl,args,gmin_t,gmax_t,gbin_t,spline,true);}
+      std::vector<std::string> actualmin=BiasGrid_->getMin();
+      std::vector<std::string> actualmax=BiasGrid_->getMax();
+      if(gmin_t[0]!=actualmin[0]) log<<"  WARNING: GRID_MIN["<<i<<"] has been adjusted to "<<actualmin[0]<<" to fit periodicity\n";
+      if(gmax_t[0]!=actualmax[0]) log<<"  WARNING: GRID_MAX["<<i<<"] has been adjusted to "<<actualmax[0]<<" to fit periodicity\n";
+    }
     BiasGrids_.push_back(BiasGrid_);
    }
   }
 
-// read Gaussians if restarting
-  for(unsigned i=0;i<hillsfname.size();++i){
-   IFile *ifile = new IFile();
-   ifile->link(*this);
-   if(ifile->FileExist(hillsfname[i])){
-    ifile->open(hillsfname[i]);
-    if(getRestart()){
-     log.printf("  Restarting from %s:",hillsfname[i].c_str());                  
-     readGaussians(i, ifile);                                                    
+  // read Gaussians if restarting
+  if (!restartedFromGrid){
+   for(unsigned i=0;i<hillsfname.size();++i){
+    IFile *ifile = new IFile();
+    ifile->link(*this);
+    if(ifile->FileExist(hillsfname[i])){
+     ifile->open(hillsfname[i]);
+     if(getRestart()){
+      log.printf("  Restarting from %s:",hillsfname[i].c_str());                  
+      readGaussians(i, ifile);                                                    
+     }
+     ifile->reset(false);
+     ifile->close();
+     delete ifile;
     }
-    ifile->reset(false);
-    ifile->close();
-    delete ifile;
    }
   }
 
   comm.Barrier();
-
-// this barrier is needed when using walkers_mpi
-// to be sure that all files have been read before
-// backing them up
-// it should not be used when walkers_mpi is false otherwise
-// it would introduce troubles when using replicas without METAD
-// (e.g. in bias exchange with a neutral replica)
-// see issue #168 on github
+  // this barrier is needed when using walkers_mpi
+  // to be sure that all files have been read before
+  // backing them up
+  // it should not be used when walkers_mpi is false otherwise
+  // it would introduce troubles when using replicas without METAD
+  // (e.g. in bias exchange with a neutral replica)
+  // see issue #168 on github
   if(comm.Get_rank()==0 && multiple_w) multi_sim_comm.Barrier();
 
-// open hills files for writing
- for(unsigned i=0;i<hillsfname.size();++i){
-  OFile *ofile = new OFile();
-  ofile->link(*this);
-  string hillsfname_tmp = hillsfname[i];
-  // if multiple walkers, only rank 0 will write to file
-  if(multiple_w){
+  // open hills files for writing
+  for(unsigned i=0;i<hillsfname.size();++i){
+   OFile *ofile = new OFile();
+   ofile->link(*this);
+   string hillsfname_tmp = hillsfname[i];
+   // if multiple walkers, only rank 0 will write to file
+   if(multiple_w){
     int r=0;
     if(comm.Get_rank()==0) r=multi_sim_comm.Get_rank();
     comm.Bcast(r,0);
@@ -471,26 +535,46 @@ multiple_w(false), doInt_(false), isFirstStep(true)
   ofile->open(hillsfname_tmp);
   if(fmt.length()>0) ofile->fmtField(fmt);
   ofile->addConstantField("multivariate");
-  if(doInt_) {
+  if(doInt_[i]) {
     ofile->addConstantField("lower_int").printField("lower_int",lowI_[i]);
     ofile->addConstantField("upper_int").printField("upper_int",uppI_[i]);
+   }
+   ofile->setHeavyFlush();
+   // output periodicities of variables
+   ofile->setupPrintValue( getPntrToArgument(i) );
+   // push back
+   hillsOfiles_.push_back(ofile);
+  }
+
+  // Dump grid to files
+  if(wgridstride_ > 0) {
+   for(unsigned i = 0; i < gridfilenames_.size(); ++i) {
+    OFile *ofile = new OFile();
+    ofile->link(*this);
+    string gridfname_tmp = gridfilenames_[i];
+    if(multiple_w) {
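+     // with multiple walkers only the replica with rank 0 writes the grid; the others redirect their output to /dev/null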
+     int r = 0;
+     if(comm.Get_rank() == 0) {
+      r = multi_sim_comm.Get_rank();
+     }
+     comm.Bcast(r, 0);
+     if(r>0) {
+      gridfname_tmp = "/dev/null";
+     }
+     ofile->enforceSuffix("");
+    }
+    ofile->open(gridfname_tmp);
+    ofile->setHeavyFlush();
+    gridfiles_.push_back(ofile);
+   }
   }
-  ofile->setHeavyFlush();
-  // output periodicities of variables
-  ofile->setupPrintValue( getPntrToArgument(i) );
-  // push back
-  hillsOfiles_.push_back(ofile);
- }
 
   log<<"  Bibliography "<<plumed.cite("Pfaendtner and Bonomi. J. Chem. Theory Comput. 11, 5062 (2015)");
-  if(doInt_) log<<plumed.cite(
+  if(doInt_[0]) log<<plumed.cite(
      "Baftizadeh, Cossio, Pietrucci, and Laio, Curr. Phys. Chem. 2, 79 (2012)");
   if(multiple_w) log<<plumed.cite(
     "Raiteri, Laio, Gervasio, Micheletti, and Parrinello, J. Phys. Chem. B 110, 3533 (2006)");   
- 
   log<<"\n";
-
-  turnOnDerivatives();
 }
 
 void PBMetaD::readGaussians(int iarg, IFile *ifile){
@@ -581,9 +665,8 @@ void PBMetaD::addGaussian(int iarg, const Gaussian& hill){
 
 vector<unsigned> PBMetaD::getGaussianSupport(int iarg, const Gaussian& hill){
  vector<unsigned> nneigh;
- 
- if(doInt_){
-   double cutoff=sqrt(2.0*DP2CUTOFF)*hill.sigma[0];
+ const double cutoff=sqrt(2.0*DP2CUTOFF)*hill.sigma[0];
+ if(doInt_[iarg]){
    if(hill.center[0]+cutoff > uppI_[iarg] || hill.center[0]-cutoff < lowI_[iarg]) { 
      // in this case, we updated the entire grid to avoid problems
      return BiasGrids_[iarg]->getNbin();
@@ -593,7 +676,6 @@ vector<unsigned> PBMetaD::getGaussianSupport(int iarg, const Gaussian& hill){
    }
  }
 
- double cutoff=sqrt(2.0*DP2CUTOFF)*hill.sigma[0];
  nneigh.push_back( static_cast<unsigned>(ceil(cutoff/BiasGrids_[iarg]->getDx()[0])) );
 
  return nneigh;
@@ -619,6 +701,7 @@ double PBMetaD::getBiasAndDerivatives(int iarg, const vector<double>& cv, double
    bias = BiasGrids_[iarg]->getValue(cv);
   }
  }
+
  return bias;
 }
 
@@ -632,7 +715,7 @@ double PBMetaD::evaluateGaussian
  const double *pcv=NULL; // pointer to cv
  double tmpcv[1]; // tmp array with cv (to be used with doInt_)
  if(cv.size()>0) pcv=&cv[0];
- if(doInt_){
+ if(doInt_[iarg]){
    plumed_assert(cv.size()==1);
    tmpcv[0]=cv[0];
    if(cv[0]<lowI_[iarg]) tmpcv[0]=lowI_[iarg];
@@ -645,7 +728,7 @@ double PBMetaD::evaluateGaussian
        bias = hill.height*exp(-dp2);
        if(der){der[0]+= -bias * dp / hill.sigma[0];}
  }
- if(doInt_){
+ if(doInt_[iarg]){
    if((cv[0]<lowI_[iarg] || cv[0]>uppI_[iarg]) && der ) der[0] = 0.0;
  }
  return bias;
@@ -672,7 +755,6 @@ void PBMetaD::calculate()
   for(unsigned i=0; i<getNumberOfArguments(); ++i){
     const double f = - exp(-bias[i]/kbt_) / (ene) * deriv[i];
     setOutputForce(i, f);
-    getPntrToComponent("bias")->addDerivative(i,-f);
   }
   delete [] der;
 
@@ -688,11 +770,9 @@ void PBMetaD::update()
   if(getStep()%stride_==0 && !isFirstStep ){nowAddAHill=true;}else{nowAddAHill=false;isFirstStep=false;}
 
   if(nowAddAHill){
-
    // get all CVs value
    vector<double> cv(getNumberOfArguments());
    for(unsigned i=0; i<getNumberOfArguments(); ++i) cv[i] = getArgument(i);
-
    // get all biases and heights
    vector<double> bias(getNumberOfArguments());
    vector<double> height(getNumberOfArguments());
@@ -711,7 +791,7 @@ void PBMetaD::update()
      height[i] *=  height0_ / norm;
      if(welltemp_) height[i] *= exp(-bias[i]/(kbt_*(biasf_-1.0)));
    }
-   
+
    // Multiple walkers: share hills and add them all
    if(multiple_w){
      int nw = 0;
@@ -760,7 +840,17 @@ void PBMetaD::update()
       writeGaussian(i, newhill, hillsOfiles_[i]);
     } 
    }
- }
+
+   // write grid files
+   if(wgridstride_>0 && getStep()%wgridstride_==0) {
+     for(unsigned i=0; i<gridfiles_.size(); ++i) {
+       gridfiles_[i]->rewind();
+       BiasGrids_[i]->writeToFile(*gridfiles_[i]);
+       gridfiles_[i]->flush();
+     }
+   }
+
+  }
 }
 
 /// takes a pointer to the file and a template string with values v and gives back the next center, sigma and height 
diff --git a/src/colvar/DRMSD.cpp b/src/colvar/DRMSD.cpp
index 7b447bb7272ca9183f497e91f01b3cbe93348a99..5f263646a5a3cc4dba37a93b6a23f9143d465b22 100644
--- a/src/colvar/DRMSD.cpp
+++ b/src/colvar/DRMSD.cpp
@@ -24,6 +24,7 @@
 #include "ActionRegister.h"
 #include "tools/PDB.h"
 #include "reference/DRMSD.h"
+#include "reference/MetricRegister.h"
 #include "core/Atoms.h"
 
 using namespace std;
@@ -68,7 +69,29 @@ position. Only pairs of atoms whose distance in the reference structure is withi
 DRMSD REFERENCE=file.pdb LOWER_CUTOFF=0.1 UPPER_CUTOFF=0.8
 \endverbatim
 
-...
+The following tells plumed to calculate a DRMSD value for a pair of molecules.  
+
+\verbatim
+DRMSD REFERENCE=file.pdb LOWER_CUTOFF=0.1 UPPER_CUTOFF=0.8 TYPE=INTER-DRMSD
+\endverbatim
+
+In the input reference file (file.pdb) the atoms in each of the two molecules are separated by a TER
+command as shown below.
+
+\verbatim
+ATOM      8  HT3 ALA     2      -1.480  -1.560   1.212  1.00  1.00      DIA  H
+ATOM      9  CAY ALA     2      -0.096   2.144  -0.669  1.00  1.00      DIA  C
+ATOM     10  HY1 ALA     2       0.871   2.385  -0.588  1.00  1.00      DIA  H
+TER
+ATOM     12  HY3 ALA     2      -0.520   2.679  -1.400  1.00  1.00      DIA  H
+ATOM     14  OY  ALA     2      -1.139   0.931  -0.973  1.00  1.00      DIA  O
+END
+\endverbatim
+
+In this example the INTER-DRMSD type ensures that the set of distances from which the final
+quantity is computed involves one atom from each of the two molecules. If this is replaced
+by INTRA-DRMSD then only those distances involving pairs of atoms that are both in the same
+molecule are computed, as shown in the input below.
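+
+For example, assuming the same reference file, the intramolecular variant could be requested as follows:
+
+\verbatim
+DRMSD REFERENCE=file.pdb LOWER_CUTOFF=0.1 UPPER_CUTOFF=0.8 TYPE=INTRA-DRMSD
+\endverbatim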
 
 */
 //+ENDPLUMEDOC
@@ -95,6 +118,10 @@ void DRMSD::registerKeywords(Keywords& keys){
   keys.add("compulsory","REFERENCE","a file in pdb format containing the reference structure and the atoms involved in the CV.");
   keys.add("compulsory","LOWER_CUTOFF","only pairs of atoms further than LOWER_CUTOFF are considered in the calculation.");
   keys.add("compulsory","UPPER_CUTOFF","only pairs of atoms closer than UPPER_CUTOFF are considered in the calculation.");
+  keys.add("compulsory","TYPE","DRMSD","what kind of DRMSD would you like to calculate. You can use the normal DRMSD, which involves all the distances between "
+                                       "the atoms in your molecule. Alternatively, if you have multiple molecules you can use the type INTER-DRMSD "
+                                       "to compute DRMSD values involving only those distances between atoms belonging to different molecules, or the type INTRA-DRMSD "
+                                       "to compute DRMSD values involving only those distances between atoms in the same molecule");
 }
 
 DRMSD::DRMSD(const ActionOptions&ao):
@@ -110,8 +137,6 @@ PLUMED_COLVAR_INIT(ao), pbc_(true), myvals(1,0), mypack(0,0,myvals)
   parseFlag("NOPBC",nopbc);
   pbc_=!nopbc;
 
-  checkRead();
-
   addValueWithDerivatives(); setNotPeriodic(); 
 
   // read everything in ang and transform to nm if we are not in natural units
@@ -120,10 +145,11 @@ PLUMED_COLVAR_INIT(ao), pbc_(true), myvals(1,0), mypack(0,0,myvals)
       error("missing input file " + reference );
 
   // store target_ distance
-  ReferenceConfigurationOptions ro( "DRMSD" );
-  drmsd_= new PLMD::DRMSD( ro );
+  std::string type; parse("TYPE",type);
+  drmsd_= metricRegister().create<PLMD::DRMSD>( type );
   drmsd_->setBoundsOnDistances( !nopbc, lcutoff, ucutoff );
   drmsd_->set( pdb );
+  checkRead();
 
   std::vector<AtomNumber> atoms; 
   drmsd_->getAtomRequests( atoms );
@@ -147,15 +173,11 @@ DRMSD::~DRMSD(){
 void DRMSD::calculate(){
 
  double drmsd; Tensor virial; mypack.clear();
-
  drmsd=drmsd_->calculate(getPositions(), getPbc(), mypack, false);
 
  setValue(drmsd);
  for(unsigned i=0;i<getNumberOfAtoms();++i) { if( myvals.isActive(3*i) ) setAtomsDerivatives( i, mypack.getAtomDerivative(i) ); }
  setBoxDerivatives( mypack.getBoxDerivatives() );   
-
- // drmsd_->getVirial( virial ); setBoxDerivatives(virial);
-
 }
 
 }
diff --git a/src/config/Makefile b/src/config/Makefile
index 5fb978fe0e229da4c0b712766b786b4f1c5936e5..bdcddde9374f78c189d45469c4e9b9606e361db3 100644
--- a/src/config/Makefile
+++ b/src/config/Makefile
@@ -1,6 +1,8 @@
 
 # include the machine dependent configuration
--include ../../Makefile.conf
+ifneq ($(MAKECMDGOALS),clean)
+  -include ../../Makefile.conf
+endif
 
 # sed script to edit program name defaults to do nothing
 ifndef program_transform_name
diff --git a/src/generic/Flush.cpp b/src/generic/Flush.cpp
index 66925216988f1ef0197ea22ea897bb156f3f8dac..1f1c7d65a11847d14e3db0dc3b3b9c15e10774de 100644
--- a/src/generic/Flush.cpp
+++ b/src/generic/Flush.cpp
@@ -67,7 +67,8 @@ public:
   }
   static void registerKeywords( Keywords& keys );
   void calculate(){}
-  void apply(){
+  void apply(){}
+  void update(){
     plumed.fflush();
     log.flush();
     const ActionSet & actionSet(plumed.getActionSet());
diff --git a/src/imd/Makefile b/src/imd/Makefile
index 5968953fca5a6bb7920f6adacd8777f44fc17eb6..08656a8db26d3889898aa5f85f8b15f3c6f91953 100644
--- a/src/imd/Makefile
+++ b/src/imd/Makefile
@@ -1,4 +1,7 @@
--include ../../Makefile.conf
+# include the machine dependent configuration
+ifneq ($(MAKECMDGOALS),clean)
+  -include ../../Makefile.conf
+endif
 
 all: libimd.$(SOEXT)
 
diff --git a/src/lib/Makefile b/src/lib/Makefile
index f68a84704e67153b4509d843cb0bc52a5e90eb2e..3b6ff35daa7146fe4eb75981a8832ebe4ed691e1 100644
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@@ -8,7 +8,9 @@ DIRS := $(sort ../config ../wrapper ../main  $(KERNEL_MODULES))
 DIRSLINKS:=$(addsuffix .links,$(DIRS))
 
 # include the machine dependent configuration
--include ../../Makefile.conf
+ifneq ($(MAKECMDGOALS),clean)
+  -include ../../Makefile.conf
+endif
 
 # if machine dependent configuration has been found:
 ifdef GCCDEP
diff --git a/src/maketools/make.module b/src/maketools/make.module
index bd77c32a3a30c49741947d368dc370920682887e..31771e44c39ed894e5080f5692b954a3d306719f 100644
--- a/src/maketools/make.module
+++ b/src/maketools/make.module
@@ -5,7 +5,9 @@
 # 
 
 # include the machine dependent configuration
--include ../../Makefile.conf
+ifneq ($(MAKECMDGOALS),clean)
+  -include ../../Makefile.conf
+endif
 
 # if machine dependent configuration has been found:
 ifdef GCCDEP
diff --git a/src/mapping/Path.cpp b/src/mapping/Path.cpp
index 56fac268c68cf4a48367fcdf000cd42375b2e665..00b1596c692d3a2af7949793dc98b081e47a8349 100644
--- a/src/mapping/Path.cpp
+++ b/src/mapping/Path.cpp
@@ -66,12 +66,14 @@ PLUMED_REGISTER_ACTION(Path,"PATH")
 void Path::registerKeywords( Keywords& keys ){
   PathBase::registerKeywords( keys ); keys.remove("PROPERTY");
   keys.addFlag("NOSPATH",false,"do not calculate the spath position");
+  keys.remove("LOWMEM");
 }
 
 Path::Path(const ActionOptions& ao):
 Action(ao),
 PathBase(ao)
 {
+  setLowMemOption( true ); 
   bool nos; parseFlag("NOSPATH",nos);
 
   std::string empty;
diff --git a/src/reference/DRMSD.cpp b/src/reference/DRMSD.cpp
index 94b0650568a4a4cdaba977c206a8c1daca6d3cb1..46da1d24cccce3100c2060536310888f3a00c8ea 100644
--- a/src/reference/DRMSD.cpp
+++ b/src/reference/DRMSD.cpp
@@ -42,14 +42,15 @@ void DRMSD::setBoundsOnDistances( bool dopbc, double lbound, double ubound ){
   lower=lbound; upper=ubound;
 }
 
-void DRMSD::read( const PDB& pdb ){
-  readAtomsFromPDB( pdb );
-
+void DRMSD::readBounds(){
   parseFlag("NOPBC",nopbc);  
   parse("LOWER_CUTOFF",lower,true);
   parse("UPPER_CUTOFF",upper,true);
   setBoundsOnDistances( !nopbc, lower, upper );
-  setup_targets();
+}
+
+void DRMSD::read( const PDB& pdb ){
+  readAtomsFromPDB( pdb ); readBounds(); setup_targets();
 }
 
 void DRMSD::setReferenceAtoms( const std::vector<Vector>& conf, const std::vector<double>& align_in, const std::vector<double>& displace_in ){
diff --git a/src/reference/DRMSD.h b/src/reference/DRMSD.h
index 1747369bcf4d9da3589909d82d0809a008a287c6..634d031626917af789f5059e4c51eda4fa7127c3 100644
--- a/src/reference/DRMSD.h
+++ b/src/reference/DRMSD.h
@@ -31,18 +31,21 @@ namespace PLMD {
 
 class DRMSD : public SingleDomainRMSD {
 private:
-  bool bounds_were_set;
   bool nopbc;
+protected:
+  bool bounds_were_set;
   double lower, upper;
   std::map< std::pair <unsigned,unsigned> , double> targets;
-  void setup_targets();
+/// Read in NOPBC, LOWER_CUTOFF and UPPER_CUTOFF
+  void readBounds();
 public:
   explicit DRMSD( const ReferenceConfigurationOptions& ro );
 /// This sets upper and lower bounds on distances to be used in DRMSD 
   void setBoundsOnDistances( bool dopbc, double lbound=0.0, double ubound=std::numeric_limits<double>::max( ) );
 /// Check that similar comparisons are being performed - perhaps this is needed ask Davide? GAT
 //  void check( ReferenceConfiguration* , ReferenceConfiguration* );
-  void read( const PDB& );
+  virtual void read( const PDB& );
+  virtual void setup_targets();
   void setReferenceAtoms( const std::vector<Vector>& conf, const std::vector<double>& align_in, const std::vector<double>& displace_in );
   double calc( const std::vector<Vector>& pos, const Pbc& pbc, ReferenceValuePack& myder, const bool& squared ) const ;
 };
diff --git a/src/reference/IntermolecularDRMSD.cpp b/src/reference/IntermolecularDRMSD.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..973ba1814d93c1975601ae21933f8f4ca8e0d9d3
--- /dev/null
+++ b/src/reference/IntermolecularDRMSD.cpp
@@ -0,0 +1,68 @@
+/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+   Copyright (c) 2013-2016 The plumed team
+   (see the PEOPLE file at the root of the distribution for a list of names)
+
+   See http://www.plumed.org for more information.
+
+   This file is part of plumed, version 2.
+
+   plumed is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   plumed is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with plumed.  If not, see <http://www.gnu.org/licenses/>.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
+#include "DRMSD.h"
+#include "MetricRegister.h"
+
+namespace PLMD {
+
+class IntermolecularDRMSD : public DRMSD {
+private:
+  unsigned nblocks;
+  std::vector<unsigned> blocks;
+public:
+  IntermolecularDRMSD( const ReferenceConfigurationOptions& ro );
+  void read( const PDB& pdb );
+  void setup_targets(); 
+};
+
+PLUMED_REGISTER_METRIC(IntermolecularDRMSD,"INTER-DRMSD")
+
+IntermolecularDRMSD::IntermolecularDRMSD( const ReferenceConfigurationOptions& ro ):
+ReferenceConfiguration( ro ),
+DRMSD( ro ),
+nblocks(0)
+{
+}
+
+void IntermolecularDRMSD::read( const PDB& pdb ){
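+  // Read the reference atoms allowing multiple blocks: each block (separated by TER in the PDB) is one molecule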
+  readAtomsFromPDB( pdb, true ); nblocks = pdb.getNumberOfAtomBlocks(); blocks.resize( nblocks+1 );
+  if( nblocks==1 ) error("Trying to compute intermolecular rmsd but found no TERs in input PDB");
+  blocks[0]=0; for(unsigned i=0;i<nblocks;++i) blocks[i+1]=pdb.getAtomBlockEnds()[i];
+  readBounds(); setup_targets();
+}
+
+void IntermolecularDRMSD::setup_targets(){
+  plumed_massert( bounds_were_set, "I am missing a call to DRMSD::setBoundsOnDistances");
+
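+  // Store the reference distance for every pair of atoms that belong to two different molecules (blocks),
+  // keeping only the pairs whose reference distance lies between the lower and upper cutoffs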
+  for(unsigned i=1;i<nblocks;++i){
+      for(unsigned j=0;j<i;++j){
+          for(unsigned iatom=blocks[i];iatom<blocks[i+1];++iatom){
+              for(unsigned jatom=blocks[j];jatom<blocks[j+1];++jatom){
+                  double distance = delta( getReferencePosition(iatom), getReferencePosition(jatom) ).modulo();
+                  if(distance < upper && distance > lower ) targets[std::make_pair(iatom,jatom)] = distance;
+              }
+          }
+      }
+  }
+}
+
+}
diff --git a/src/reference/IntramolecularDRMSD.cpp b/src/reference/IntramolecularDRMSD.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a4beb31e94106176e5a200004df00cf15954cbf1
--- /dev/null
+++ b/src/reference/IntramolecularDRMSD.cpp
@@ -0,0 +1,66 @@
+/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+   Copyright (c) 2013-2016 The plumed team
+   (see the PEOPLE file at the root of the distribution for a list of names)
+
+   See http://www.plumed.org for more information.
+
+   This file is part of plumed, version 2.
+
+   plumed is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   plumed is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with plumed.  If not, see <http://www.gnu.org/licenses/>.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
+#include "DRMSD.h"
+#include "MetricRegister.h"
+
+namespace PLMD {
+
+class IntramolecularDRMSD : public DRMSD {
+private:
+  unsigned nblocks;
+  std::vector<unsigned> blocks;
+public:
+  IntramolecularDRMSD( const ReferenceConfigurationOptions& ro );
+  void read( const PDB& pdb );
+  void setup_targets(); 
+};
+
+PLUMED_REGISTER_METRIC(IntramolecularDRMSD,"INTRA-DRMSD")
+
+IntramolecularDRMSD::IntramolecularDRMSD( const ReferenceConfigurationOptions& ro ):
+ReferenceConfiguration( ro ),
+DRMSD( ro ),
+nblocks(0)
+{
+}
+
+void IntramolecularDRMSD::read( const PDB& pdb ){
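+  // Read the reference atoms allowing multiple blocks: each block (separated by TER in the PDB) is one molecule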
+  readAtomsFromPDB( pdb, true ); nblocks = pdb.getNumberOfAtomBlocks(); blocks.resize( nblocks+1 );
+  if( nblocks==1 ) error("Trying to compute intramolecular rmsd but found no TERs in input PDB");
+  blocks[0]=0; for(unsigned i=0;i<nblocks;++i) blocks[i+1]=pdb.getAtomBlockEnds()[i];
+  readBounds(); setup_targets();
+}
+
+void IntramolecularDRMSD::setup_targets(){
+  plumed_massert( bounds_were_set, "I am missing a call to DRMSD::setBoundsOnDistances");
+
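+  // Store the reference distance for every pair of atoms that belong to the same molecule (block),
+  // keeping only the pairs whose reference distance lies between the lower and upper cutoffs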
+  for(unsigned i=0;i<nblocks;++i){
+      for(unsigned iatom=blocks[i]+1;iatom<blocks[i+1];++iatom){
+          for(unsigned jatom=blocks[i];jatom<iatom;++jatom){
+              double distance = delta( getReferencePosition(iatom), getReferencePosition(jatom) ).modulo();
+              if(distance < upper && distance > lower ) targets[std::make_pair(iatom,jatom)] = distance;
+          }
+      }
+  }
+}
+
+}
diff --git a/src/reference/ReferenceAtoms.cpp b/src/reference/ReferenceAtoms.cpp
index ba09e0996ab9f2c277d78a0e1c17030e096118b4..09baacacd9109c1f1c33574e71fb064b15083f28 100644
--- a/src/reference/ReferenceAtoms.cpp
+++ b/src/reference/ReferenceAtoms.cpp
@@ -31,8 +31,8 @@ checks_were_disabled(false)
 {
 }
 
-void ReferenceAtoms::readAtomsFromPDB( const PDB& pdb ){
-  if( pdb.getNumberOfAtomBlocks()!=1 ) error("found multi-atom-block pdb format but expecting only one block of atoms");  
+void ReferenceAtoms::readAtomsFromPDB( const PDB& pdb, const bool allowblocks  ){
+  if( !allowblocks && pdb.getNumberOfAtomBlocks()!=1 ) error("found multi-atom-block pdb format but expecting only one block of atoms");  
 
   for(unsigned i=0;i<pdb.size();++i){
      indices.push_back( pdb.getAtomNumbers()[i] ); reference_atoms.push_back( pdb.getPositions()[i] );
diff --git a/src/reference/ReferenceAtoms.h b/src/reference/ReferenceAtoms.h
index 039a77b34681a0b649d54ff7eb895024d23c3cb1..2d73e3330475466e680e1ebe75782f2416591c93 100644
--- a/src/reference/ReferenceAtoms.h
+++ b/src/reference/ReferenceAtoms.h
@@ -64,7 +64,7 @@ private:
   std::vector<unsigned> atom_der_index;
 protected:
 /// Read in the atoms from the pdb file
-  void readAtomsFromPDB( const PDB& );
+  void readAtomsFromPDB( const PDB&, const bool allowblocks=false );
 /// Add atom indices to list
   void setAtomIndices( const std::vector<AtomNumber>& atomnumbers );
 /// Read a list of atoms from the pdb input file
diff --git a/src/tools/Grid.cpp b/src/tools/Grid.cpp
index 7f72263cb4f9cdbe148951161b9e9742a8be381b..a240c3cce5ae8d3f78dccabd61d9b0b6491f8979 100644
--- a/src/tools/Grid.cpp
+++ b/src/tools/Grid.cpp
@@ -173,7 +173,7 @@ Grid::index_t Grid::getIndex(const vector<unsigned> & indices) const {
   if(indices[i]>=nbin_[i]) {
     std::string is;
     Tools::convert(i,is);
-    std::string msg="ERROR: the system is looking for a value outside the grid along the " + is;
+    std::string msg="ERROR: the system is looking for a value outside the grid along the " + is + " ("+getArgNames()[i]+")";
     plumed_merror(msg+" index!");
   }
  index_t index=indices[dimension_-1];
diff --git a/src/tools/LatticeReduction.cpp b/src/tools/LatticeReduction.cpp
index 920521381786c9e0d215b72a7b3b160f1a7dfe7b..487991d3c98351a04179d4c0a88fbb43849572ac 100644
--- a/src/tools/LatticeReduction.cpp
+++ b/src/tools/LatticeReduction.cpp
@@ -28,10 +28,11 @@ namespace PLMD{
 const double epsilon=1e-14;
 
 void LatticeReduction::sort(Vector v[3]){
+  const double onePlusEpsilon=(1.0+epsilon);
   for(int i=0;i<3;i++) for(int j=i+1;j<3;j++) if(modulo2(v[i])>modulo2(v[j])){
     Vector x=v[i]; v[i]=v[j]; v[j]=x;
   }
-  for(int i=0;i<2;i++) plumed_assert(modulo2(v[i])<=modulo2(v[i+1]));
+  for(int i=0;i<2;i++) plumed_assert(modulo2(v[i])<=modulo2(v[i+1])*onePlusEpsilon);
 }
 
 void LatticeReduction::reduce(Vector&a,Vector&b){
diff --git a/src/tools/OFile.cpp b/src/tools/OFile.cpp
index def2c449a50581baff289cd56a34583bc4b19364..e30bdd9e4defd6f92e8bbab07555d5f3e4916ffe 100644
--- a/src/tools/OFile.cpp
+++ b/src/tools/OFile.cpp
@@ -72,7 +72,8 @@ OFile::OFile():
   linked(NULL),
   fieldChanged(false),
   backstring("bck"),
-  enforceRestart_(false)
+  enforceRestart_(false),
+  enforceBackup_(false)
 {
   fmtField();
   buflen=1;
@@ -373,6 +374,7 @@ FileBase& OFile::flush(){
 
 bool OFile::checkRestart()const{
   if(enforceRestart_) return true;
+  else if(enforceBackup_) return false;
   else if(action) return action->getRestart();
   else if(plumed) return plumed->getRestart();
   else return false;
@@ -380,9 +382,15 @@ bool OFile::checkRestart()const{
 
 OFile& OFile::enforceRestart(){
   enforceRestart_=true;
+  enforceBackup_=false;
   return *this;
 }
 
+OFile& OFile::enforceBackup(){
+  enforceBackup_=true;
+  enforceRestart_=false;
+  return *this;
 }
 
 
+}
diff --git a/src/tools/OFile.h b/src/tools/OFile.h
index 5edcc85732731593840650264f4ea69039ab79f0..a5633e76f7d2c4e2236b46107ef71b930919b30a 100644
--- a/src/tools/OFile.h
+++ b/src/tools/OFile.h
@@ -183,6 +183,8 @@ public virtual FileBase{
   bool checkRestart()const;
 /// True if restart behavior should be forced
   bool enforceRestart_;
+/// True if backup behavior (i.e. non-restart) should be forced
+  bool enforceBackup_;
 public:
 /// Constructor
   OFile();
@@ -251,9 +253,11 @@ this method can be used to clean the field list.
   OFile&rewind();
 /// Flush a file
   virtual FileBase&flush();
-/// Enforce restart, also if the attached plume object is not restarting.
+/// Enforce restart, even if the attached plumed object is not restarting.
 /// Useful for tests
   OFile&enforceRestart();
+/// Enforce backup, even if the attached plumed object is restarting.
+  OFile&enforceBackup();
 };
 
 /// Write using << syntax
diff --git a/src/vesselbase/ActionWithVessel.cpp b/src/vesselbase/ActionWithVessel.cpp
index f655c2c4f237d0050327f71a244879280aa94082..7fb99c29f0dff700605811cd81222ccec688f3d4 100644
--- a/src/vesselbase/ActionWithVessel.cpp
+++ b/src/vesselbase/ActionWithVessel.cpp
@@ -56,12 +56,12 @@ ActionWithVessel::ActionWithVessel(const ActionOptions&ao):
   noderiv(true),
   actionIsBridged(false),
   nactive_tasks(0),
-  contributorsAreUnlocked(false),
-  mydata(NULL),
-  weightHasDerivatives(false),
   stopwatch(*new Stopwatch),
   dertime_can_be_off(false),
-  dertime(true)
+  dertime(true),
+  contributorsAreUnlocked(false),
+  weightHasDerivatives(false),
+  mydata(NULL)
 {
   maxderivatives=309; parse("MAXDERIVATIVES",maxderivatives);
   if( keywords.exists("SERIAL") ) parseFlag("SERIAL",serial);
diff --git a/src/wrapper/Makefile b/src/wrapper/Makefile
index 9066d363551c13906daed0547a0a2cc33f5b3735..ce03b0b4da7716b534d150455f30547e8f8ddce3 100644
--- a/src/wrapper/Makefile
+++ b/src/wrapper/Makefile
@@ -1,6 +1,8 @@
 
 # include the machine dependent configuration
--include ../../Makefile.conf
+ifneq ($(MAKECMDGOALS),clean)
+  -include ../../Makefile.conf
+endif
 
 # openmp flags should not be used here
 ifdef CXXFLAGS_NOOPENMP
diff --git a/test/Makefile b/test/Makefile
index 5f83c1126003ea519e464952607aa1ace5d4add0..92d524ce29fd1e0146c860b3d9a67bfd556b37c7 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,4 +1,7 @@
--include ../Makefile.conf
+# include the machine dependent configuration
+ifneq ($(MAKECMDGOALS),clean)
+  -include ../Makefile.conf
+endif
 
 SUBDIRS = link
 
diff --git a/user-doc/Installation.txt b/user-doc/Installation.txt
index a63b893481ef78cbacac65a13d571912ec01657e..6b52fde41a58d315f0b603ca900bfb81d8a4315a 100644
--- a/user-doc/Installation.txt
+++ b/user-doc/Installation.txt
@@ -504,6 +504,62 @@ Once ALMOST is installed, PLUMED 2 can then be configured with ALMOST enabled:
 \endverbatim
 with ALMOST_INSTALL_PATH set to the full path to the ALMOST installation folder.
 
+\section installingonacluster Installing PLUMED on a cluster
+
+If you are installing PLUMED on a cluster and you want several users to take advantage of it,
+consider the following suggestions.
+
+First of all, we highly recommend using the module file that PLUMED provides to set up the environment.
+Just edit it as necessary to make it suitable for your cluster.
+
+Notice that PLUMED can take advantage of many additional features if specific libraries are available when
+it is compiled. Install libmatheval first and check that PLUMED `./configure` detects it: libmatheval is a must-have with PLUMED.
+If your users run gromacs, also install libxdrfile first and check that PLUMED `./configure` detects it.
+PLUMED will then be able to write trr/xtc files, simplifying analysis.
+
+Try to patch all MD codes with the `--runtime` option. This allows PLUMED and the MD codes to be updated independently:
+users will be able to combine any of the installed gromacs/amber/etc. versions with any of the installed PLUMED versions.
+Notice that it is sometimes claimed that statically linked codes are faster. In our experience, this is not true.
+In case you absolutely need a static executable, be ready to face non-trivial linking issues: PLUMED is written in C++,
+thus requires the appropriate C++ library to be linked, and might require additional libraries (e.g. libmatheval).
+
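+As a minimal sketch (the engine name here is just an example; use the one matching the MD code
+and version you actually installed), runtime patching looks something like:
+\verbatim
+> plumed patch -p -e gromacs-4.6.7 --runtime
+\endverbatim
+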
+Sometimes we make small fixes to the patches. For this reason, keep track of which version of PLUMED you used
+to patch each of the MD codes. For instance, you can call the MD code modules with names such as `gromacs/4.6.7p1` and
+`gromacs/4.6.7p2`, and write somewhere in the module file which version of PLUMED you used. Alternatively, call them
+something like `gromacs/4.6.7p2.2.0`. In this way, when we report a bug on the mailing list, users will know whether the version
+they are using is affected by it.
+
+Usually it is not necessary to install both an MPI and a non-MPI PLUMED version. The PLUMED library only calls MPI functions
+when the MD code is compiled with MPI, and the PLUMED executable only calls MPI functions when it is invoked without `--no-mpi`.
+On many machines it is thus sufficient to run the plumed executable on the login node as
+\verbatim
+> plumed --no-mpi
+\endverbatim
+even though PLUMED was compiled with MPI and the login node does not support MPI.
+The only case where you might need two different PLUMED installations for the compute
+and login nodes is when you are cross-compiling.
+
+PLUMED needs to be well optimized to run efficiently.
+If you need a single PLUMED binary to run efficiently on machines with different levels of hardware (e.g. some
+of your workstations support AVX and some do not), with the Intel compiler you can use something like
+\verbatim
+./configure CXX=mpicxx CXXFLAGS="-O3 -axSSE2,AVX"
+\endverbatim
+It will take more time to compile, but it will allow you to use a single module. Otherwise, you should install two
+PLUMED versions with different optimization levels.
+
+Using modules, it is not necessary to make the PLUMED module explicitly dependent on the libraries it uses. Imagine a
+scenario where you first installed a `libmatheval` module, then load it while you compile PLUMED. If you
+pass the option `LDFLAGS="-Wl,-rpath,$LD_LIBRARY_PATH"` to configure, the PLUMED executable and
+library will remember where libmatheval is, without the need to load the libmatheval module at runtime (see the sketch below).
+Notice that this trick often does not work for fundamental libraries such as the C++ and MPI libraries. As a consequence,
+usually the PLUMED module should load the compiler and MPI modules.
+
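+As a sketch, assuming libmatheval is provided through a module that sets `LD_LIBRARY_PATH`:
+\verbatim
+> module load libmatheval
+> ./configure LDFLAGS="-Wl,-rpath,$LD_LIBRARY_PATH"
+\endverbatim
+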
+\attention
+In case you find out how to compile PLUMED on some fancy architecture, please share your tricks! You can
+either post them in your blog, send them to the mailing list, or ask us to update this paragraph in the manual; we will
+be happy to do so.
+
 \section installinghints Other hints
 
 We here collect a list of suggestions that might be useful on particular
diff --git a/user-doc/Makefile b/user-doc/Makefile
index 604feb26259d1dc97b91fcf4f8b6767c0dc95e62..dfb3524d46c1500def1b803341286a7c43cb07cf 100644
--- a/user-doc/Makefile
+++ b/user-doc/Makefile
@@ -1,5 +1,7 @@
 # include the machine dependent configuration
--include ../Makefile.conf
+ifneq ($(MAKECMDGOALS),clean)
+  -include ../Makefile.conf
+endif
 
 .PHONY: all clean
 
diff --git a/user-doc/Misc.txt b/user-doc/Misc.txt
index 94ab7cd3935dfb29d543f3bbd51ce97a2e449d3a..f9e6e3284dd70cf738296d230764474c43099104 100644
--- a/user-doc/Misc.txt
+++ b/user-doc/Misc.txt
@@ -105,8 +105,8 @@ For the impatients:
 :let &runtimepath.=','.$PLUMED_VIMPATH
 " This makes autocompletion work in the expected way:
 :set completeopt=longest,menuone
-" This binds the F2 key to toggle short helps
-: nmap <F2> : PHelp<CR>
+" This enables bindings of F2/F3/F4 to plumed specific commands:
+:let plumed_shortcuts=1
 \endverbatim
 - When you open a PLUMED input file, you can enable syntax highlighting with:
 \verbatim
@@ -117,12 +117,14 @@ This will also enable autocompletion. Use `<CTRL-X><CTRL-O>` to autocomplete a w
 \verbatim
 :setlocal foldmethod=syntax
 \endverbatim
-- While in vim normal mode, you can use the button `<F2>` to show in a split window a short
-  help about the action defined in the line where the cursor is. Hitting `<F2>` again you will
+- While editing a PLUMED input file, you can use the command `:PHelp` (or the shortcut `<F2>`)
+  to show in a split window a short help about the action defined in the line where the cursor is.
+  Typing `:PHelp` again (or pressing `<F2>`) you will
   close that window. With `<CTRL-W><CTRL-W>` you go back and forth between the two windows.
 - When you open a file starting with `#! FIELDS`, VIM will automatically understand it
   is a PLUMED outpt file (VIM filetype = plumedf) and will color fields and data columns with
-  alternating colors.
+  alternating colors. Typing `:PPlus` and `:PMinus` (or pressing `<F3>` and `<F4>`)
+  you can move the highlighted column.
 
 See below for more detailed instructions.
 
@@ -308,8 +310,8 @@ To make the navigation easier, you can add a shortcut in your .vimrc file. For e
 : nmap <F2> : PHelp<CR>
 \endverbatim
 you should be able to open and close the manual hitting the F2 key.
-We prefer not to set this in the PLUMED syntax file to avoid clashing with other shortcuts you might
-have bound to the F2 key.
+This is done automatically in the PLUMED syntax file if you add `let plumed_shortcuts=1` to your
+vimrc file.
 
 \par Displaying output files
 
@@ -334,6 +336,32 @@ The colors in the columns are consistent with those shown in the FIELD line.
 In the example above, 1, 2, and 3 will be of the same color as A, B, and C respectively.
 This should make it much easier to find which columns correspond to a given quantity.
 
+It is also possible to highlight a specific field of the file. Typing
+\verbatim
+:5PCol
+\endverbatim
+you will highlight the fifth field. Notice that in the `FIELDS` line (the first line of the file)
+the 7th word of the line will be highlighted, since the first two words are `#!` and `FIELDS`; the highlighted
+word is the one containing the name of the field.
+This allows values shown in the file to be easily matched with the tags provided in the `FIELDS` line.
+The highlighted column can be moved back and forth using `:PPlus` and `:PMinus`.
+Adding a count to the command moves the highlighted column by more than one position, e.g. `:2PPlus` moves
+the column two positions to the right.
+
+If you have a long output file, it might be convenient to split it with
+`:split` so that one of the two windows will only show the header. The other window
+can be used to navigate the file.
+
+
+To make the navigation easier, you can add a shortcut in your .vimrc file. For example, adding:
+\verbatim
+: map <F3> :PMinus<CR>
+: map <F4> :PPlus<CR>
+\endverbatim
+you should be able to move the highlighted column using the F3 and F4 keys.
+This is done automatically in the PLUMED syntax file if you add `let plumed_shortcuts=1` to your
+vimrc file.
+
 \page includes Including other files 
 
 If, for some reason, you want to spread your PLUMED input over a number of files you can use \subpage INCLUDE as shown below:
diff --git a/user-doc/bibliography.bib b/user-doc/bibliography.bib
index 0869142ea5e23e75c6da0994f74e7336c1a5724d..7925c274af3db66409268d88afb71ae3ede78e30 100644
--- a/user-doc/bibliography.bib
+++ b/user-doc/bibliography.bib
@@ -2197,3 +2197,46 @@ number = {11},
 pages = {5062-5067},
 year = {2015}
 }
+
+@article{marinelli2015ensemble,
+        Author = {Marinelli, Fabrizio and Faraldo-G{\'o}mez, Jos{\'e} D},
+        Date-Added = {2015-07-24 14:22:59 +0000},
+        Date-Modified = {2015-09-27 11:05:20 +0000},
+        Journal = {Biophys. J.},
+        Number = {12},
+        Pages = {2779-2782},
+        Publisher = {Elsevier},
+        Title = {Ensemble-Biased Metadynamics: A Molecular Simulation Method to Sample Experimental Distributions},
+        Volume = {108},
+        Year = {2015}}
+
+@article{white2015designing,
+        Author = {White, Andrew and Dama, James and Voth, Gregory A},
+        Date-Added = {2015-07-24 14:23:23 +0000},
+        Date-Modified = {2015-09-27 11:08:36 +0000},
+        Journal = {J. Chem. Theory Comput.},
+        Number = {6},
+        Pages = {2451--2460},
+        Publisher = {ACS Publications},
+        Title = {Designing Free Energy Surfaces that Match Experimental Data with Metadynamics},
+        Volume = {11},
+        Year = {2015}}
+
+@article{dama2014well,
+        Author = {Dama, James F and Parrinello, Michele and Voth, Gregory A},
+        Journal = {Phys. Rev. Lett.},
+        Number = {24},
+        Pages = {240602},
+        Publisher = {APS},
+        Title = {Well-Tempered Metadynamics Converges Asymptotically},
+        Volume = {112},
+        Year = {2014}}
+
+
+@misc{gil2016empirical,
+  title={Empirical corrections to the Amber RNA force field with Target Metadynamics},
+  author={Gil-Ley, Alejandro and Bussi, Giovanni},
+  note={submitted}
+}
+
+
diff --git a/vim/Makefile b/vim/Makefile
index ad1ec86c1ab5e5de37d6a20fb1872f0babe12c8a..cd92be4a1cc998990a98843df145c7581bf6e941 100644
--- a/vim/Makefile
+++ b/vim/Makefile
@@ -1,5 +1,7 @@
 # include the machine dependent configuration
--include ../Makefile.conf
+ifneq ($(MAKECMDGOALS),clean)
+  -include ../Makefile.conf
+endif
 
 .PHONY: all clean
 
diff --git a/vim/vimsyntax.sh b/vim/vimsyntax.sh
index d33e895eed30bc47130939a599de5365ed608b35..b0d04cf26bf63d2c4b68a742bcaa30bfb8d0501b 100755
--- a/vim/vimsyntax.sh
+++ b/vim/vimsyntax.sh
@@ -18,28 +18,98 @@ fi
 mkdir -p syntax help
 
 cat > syntax/plumedf.vim << \EOF
-syn match plumedFCol2S   /\v\s+/    nextgroup=plumedFCol1 contained
-syn match plumedFCol2    /\v\S+/    nextgroup=plumedFCol2S contained
-syn match plumedFCol1S   /\v\s+/    nextgroup=plumedFCol2 contained
-syn match plumedFCol1    /\v\S+/    nextgroup=plumedFCol1S
-
-syn match plumedNothing  /\v.*/    contained
-syn match plumedSCol2    /\v\S+/    nextgroup=plumedNothing contained
-syn match plumedSCol1S   /\v\s+/    nextgroup=plumedSCol2 contained
-syn match plumedSCol1    /\v\S+/    nextgroup=plumedSCol1S contained
-
-highlight link plumedFCol1 Constant
-highlight link plumedFCol2 Keyword
-highlight link plumedSCol1 Type
-highlight link plumedSCol2 Constant
-
-syntax match   plumedFComment excludenl /\v#.*$/
-highlight link plumedFComment Comment
-
-syntax region plumedFieldLine  matchgroup=plumedFieldLine start=/\v[ \t]*\#\![ \t]+FIELDS[ \t]+/ excludenl end=/$/ contains=plumedFCol1
-syntax region plumedSetLine    matchgroup=plumedSetLine start=/\v[ \t]*\#\![ \t]+SET[ \t]+/ excludenl end=/$/ contains=plumedSCol1
-highlight link plumedFieldLine Type
-highlight link plumedSetLine   Type
+
+if exists("b:current_syntax")
+  finish
+endif
+
+let b:current_syntax="plumedf"
+
+if exists("g:plumed_shortcuts")
+  noremap <buffer> <F3> :PMinus<CR>
+  noremap <buffer> <F4> :PPlus<CR>
+endif
+
+command! -nargs=0 -count=1 PPlus call PlumedMove(<count>)
+command! -nargs=0 -count=1 PMinus call PlumedMove(-<count>)
+command! -nargs=0 -count=0 PCol call PlumedColumn(<count>)
+
+" move highlight by count columns
+" count can be negative
+function! PlumedMove(count)
+  if !exists("b:plumedf_column")
+    let b:plumedf_column=0
+  endif
+  let b:plumedf_column=b:plumedf_column+a:count
+  if(b:plumedf_column<0)
+    let b:plumedf_column=0
+  endif
+  call PlumedColumn(b:plumedf_column)
+endfunction
+
+" highlight column col
+function! PlumedColumn(col)
+ let b:plumedf_column=a:col
+ let s=""
+ let c=0
+ while c<=a:col+2
+   let c=c+1
+   if(c==a:col)
+     let s=s.'X'
+   else
+     let s=s.'.'
+   endif
+ endwhile
+ call PlumedColumnPattern(s)
+endfunction
+
+" show a given highlight pattern
+" I keep this separate to allow future implementation
+" of multiple column highlight
+function! PlumedColumnPattern(col)
+ syntax clear
+ let coll=split(a:col,'\zs') " splits in characters
+ let cmax=len(coll)
+ let c=cmax
+ while c >0
+  let cc=c+1
+  if(cc>cmax)
+    let cc=cmax-1
+  endif
+  execute 'syn match plumedFCol' . c . 'S /\v\s+/             nextgroup=plumedFCol' . cc . ' contained'
+  let contained='contained'
+  if(c==1)
+     let contained=''
+  endif
+  execute 'syn match plumedFCol'  . c . ' /\v\S+/             nextgroup=plumedFCol' . c  . 'S ' . contained
+  if(coll[c-1]=='B' || (coll[c-1]=='.' && (c-1)%2!=0) )
+    let type="Keyword"
+  elseif(coll[c-1]=='A' || (coll[c-1]=='.' && (c-1)%2==0) )
+    let type="Constant"
+  elseif(coll[c-1]=='X')
+    let type="Todo"
+  endif
+  execute 'highlight link plumedFCol' .c. ' ' . type
+  let c -= 1
+ endwhile
+
+ syn match plumedNothing  /\v.*/    contained
+ syn match plumedSCol2    /\v\S+/    nextgroup=plumedNothing contained
+ syn match plumedSCol1S   /\v\s+/    nextgroup=plumedSCol2 contained
+ syn match plumedSCol1    /\v\S+/    nextgroup=plumedSCol1S contained
+ highlight link plumedSCol1 Type
+ highlight link plumedSCol2 Constant
+ syntax match   plumedFComment excludenl /\v#.*$/
+ highlight link plumedFComment Comment
+ syntax region plumedFieldLine  matchgroup=plumedFieldLine start=/\v[ \t]*\#\![ \t]+FIELDS[ \t]+/ excludenl end=/$/ contains=plumedFCol1
+ syntax region plumedSetLine    matchgroup=plumedSetLine start=/\v[ \t]*\#\![ \t]+SET[ \t]+/ excludenl end=/$/ contains=plumedSCol1
+ highlight link plumedFieldLine Type
+ highlight link plumedSetLine   Type
+endfunction
+
+" initialize to column zero
+call PlumedColumn(0)
+
 EOF
 
 
@@ -74,10 +144,16 @@ endif
 
 let b:current_syntax="plumed"
 
-let s:path=expand('<sfile>:p:h') . "/../"
+if exists("g:plumed_shortcuts")
+  noremap  <buffer> <F2> :PHelp<CR>
+  inoremap <buffer> <F2> <Esc>:PHelp<CR>
+endif
+
+" path for plumed plugin
+let s:path=expand('<sfile>:p:h:h')
 
 " All except space and hash are in word
-set iskeyword=33,34,36-126
+setlocal iskeyword=33,34,36-126
 
 " Matching dots, possibly followed by a comment
 " Only dots are part of the match
@@ -86,21 +162,13 @@ highlight link plumedDots Type
 
 let b:plumedActions=[]
 let b:plumedDictionary={}
-let b:plumedFullMan={}
 
 EOF
 for a in $actions ; do
 action_name="${a%%,*}" 
 action_name_=$(echo $action_name | sed s/-/_/g)
 
-echo 'call add(b:plumedActions,{"word":"'"$action_name"'"})'
 
-dictionary='{"word":"LABEL=","menu":"add a label"}'
-
-
-
-#echo "let b:plumedFullMan[\"$action_name\"]=\"$(
-#eval plumed" --no-mpi manual --action $action_name --vim 2>/dev/null | awk '{if(NR>1)printf("%s\\n",$0)}' )"\"
 
 {
 echo "****************************************"
@@ -109,6 +177,7 @@ echo "****************************************"
 plumed --no-mpi manual --action $action_name --vim 2>/dev/null | awk '{if(NR>1) print}'
 } > help/$action_name.txt
 
+dictionary='{"word":"LABEL=","menu":"(label)"}'
 
 for l in $(echo "$a" | sed 's/,/ /g')
 do
@@ -116,37 +185,58 @@ do
   case "$l" in
   (*:LABEL)
 # this is treated differently
-    string=
   ;;
   (flag:*)
     dictionary="$dictionary"'
 {"word":"'${l#flag:}'","menu":"(flag)"}'
-# syntax match   plumedKeywordsDISTANCE "\v<COMPONENTS>" contained
-    string='"\v<'${l#flag:}'>"' ;;
+  ;;
   (numbered:*)
     dictionary="$dictionary"'
 {"word":"'${l#*:}'","menu":"(numbered)"}'
-# syntax match   plumedKeywordsMOVINGRESTRAINT "\v<KAPPA[0-9]*\=[^{ ]*" contained
-    string='"\v<'${l#*:}'[0-9]*\=[^{ #]*"' ;;
+  ;;
   (*:*)
     dictionary="$dictionary"'
-{"word":"'${l#*:}'="}'
-# syntax match   plumedKeywordsMOVINGRESTRAINT "\v<KAPPA[0-9]*\=[^{ ]*" contained
-    string='"\v<'${l#*:}'[0-9]*\=[^{ #]*"' ;;
+{"word":"'${l#*:}'=","menu":"(option)"}'
+  ;;
   esac
-  test -n "$string" && echo "syntax match   plumedKeywords$action_name_ $string contained contains=plumedStringInKeyword"
 done
 
 dictionary="$(
   echo "$dictionary" | sort | tr '\n' ',' | sed 's/,$//'
 )"
-echo "let b:plumedDictionary[\"plumedLine$action_name\"]=[$dictionary]"
+echo "let b:plumedDictionary[\"$action_name\"]=[$dictionary]"
+
+done
+
+cat << \EOF
+function! PlumedDefineSyntax()
+
+  for key in sort(keys(b:plumedDictionary))
+    call add(b:plumedActions,{"word":key})
+  endfor
+
+for a in b:plumedActions
+  let action=a["word"]
+" vim variables cannot contain -
+" we convert it to triple ___
+  let action_=substitute(action,"-","___","g")
+
+  for b in b:plumedDictionary[action]
+    if(b["menu"]=="(numbered)")
+      let string='"\v<' . b["word"] . '[0-9]*\=[^{ #]*"'
+    elseif(b["menu"]=="(option)")
+" this is necessary since word for option is e.g ."RESTART="
+      let string='"\v<' . substitute(b["word"],"=","","") . '[0-9]*\=[^{ #]*"'
+    elseif(b["menu"]=="(flag)")
+      let string='"\v<' . b["word"] . '>"'
+    endif
+    execute 'syntax match   plumedKeywords' . action_ . ' ' . string . ' contained contains=plumedStringInKeyword'
+  endfor
 
-cat << \EOF | sed s/ACTION/$action_name/g | sed s/ACTNAME/$action_name_/g
 " single line, with explicit LABEL
 " matching action at beginning of line, till the end of the line
 " can contain all the keywords associated with this action, plus strings, label, and comments
-syntax region plumedLineACTNAME matchgroup=plumedActionACTNAME start=/\v^\s*ACTION>/ excludenl end=/$/ contains=plumedComment,plumedKeywordsACTNAME,plumedLabel,plumedStringOneline fold
+execute 'syntax region plumedLine' . action_ . ' matchgroup=plumedAction' . action_ . ' start=/\v^\s*' . action . '>/ excludenl end=/$/ contains=plumedComment,plumedKeywords' . action_ . ',plumedLabel,plumedStringOneline fold'
 " multiple line, with explicit LABEL
 " first row might contain extra words before arriving at the dots
 " thus continuation dots are matched by plumedDots
@@ -154,12 +244,12 @@ syntax region plumedLineACTNAME matchgroup=plumedActionACTNAME start=/\v^\s*ACTI
 " ends on dots, possibly followed by the same action name and possibly a comment
 " comments and initial dots are not part of the match
 " can contain all the keywords associated with this action, plus strings, label, and comments
-syntax region plumedCLineACTNAME matchgroup=plumedActionACTNAME start=/\v^\s*ACTION>(.+\.\.\.\s*(#.*)*$)@=/ end=/\v^\s*\.\.\.(\s+ACTION)?\s*((#.*)*$)@=/ contains=plumedComment,plumedKeywordsACTNAME,plumedLabel,plumedString,plumedDots fold
+execute 'syntax region plumedCLine' . action_ . ' matchgroup=plumedAction' . action_ . ' start=/\v^\s*' . action . '>(.+\.\.\.\s*(#.*)*$)@=/ end=/\v^\s*\.\.\.(\s+' . action . ')?\s*((#.*)*$)@=/ contains=plumedComment,plumedKeywords' . action_ . ',plumedLabel,plumedString,plumedDots fold'
 " single line, with label: syntax
 " matching label followed by action
 " can contain all the keywords associated with this action, plus strings and comments
 " labels are not allwed
-syntax region plumedLLineACTNAME matchgroup=plumedActionACTNAME start=/\v^\s*[^ #@][^ #]*:\s+ACTION/ excludenl end=/$/ contains=plumedComment,plumedKeywordsACTNAME,plumedStringOneline fold
+execute 'syntax region plumedLLine' . action_ . ' matchgroup=plumedAction' . action_ . ' start=/\v^\s*[^ #@][^ #]*:\s+' . action . '/ excludenl end=/$/ contains=plumedComment,plumedKeywords' . action_ . ',plumedStringOneline fold'
 " multiple line, with label: syntax
 " first row might contain extra words before arriving at the dots
 " thus continuation dots are matched by plumedDots
@@ -167,22 +257,20 @@ syntax region plumedLLineACTNAME matchgroup=plumedActionACTNAME start=/\v^\s*[^
 " comments and dots are not part of the match
 " ends on dots, possibly followed by the same label and possibly a comment
 " comments and initial dots are not part of the match
-syntax region plumedLCLineACTNAME matchgroup=plumedActionACTNAME start=/\v^\s*\z([^ #@][^ #]*\:)\s+ACTION>(.+\.\.\.\s*(#.*)*$)@=/ end=/\v^\s*\.\.\.(\s+\z1)?\s*((#.*)*$)@=/ contains=plumedComment,plumedKeywordsACTNAME,plumedString,plumedDots fold
+execute 'syntax region plumedLCLine' . action_ . ' matchgroup=plumedAction' . action_ . ' start=/\v^\s*\z([^ #@][^ #]*\:)\s+' . action . '>(.+\.\.\.\s*(#.*)*$)@=/ end=/\v^\s*\.\.\.(\s+\z1)?\s*((#.*)*$)@=/ contains=plumedComment,plumedKeywords' . action_ . ',plumedString,plumedDots fold'
 " this is a hack required to match the ACTION when it is in the second line
-syntax match plumedSpecialACTNAME /\v(\.\.\.\s*(#.*)*\_s*)@<=ACTION>/ contained
-highlight link plumedSpecialACTNAME Type
+execute 'syntax match plumedSpecial' . action_ . ' /\v(\.\.\.\s*(#.*)*\_s*)@<=' . action . '>/ contained'
+execute 'highlight link plumedSpecial' . action_ . ' Type'
 " multiple line, with label: syntax
 " here ACTION is on the second line
 " matching label, dots, possibly comments, newline, then action name
 " comments, dots, and action are not part of the match
 " ends on dots possibly followed by the same label and possibly a comment
-syntax region plumedLCLineACTNAME matchgroup=plumedActionACTNAME start=/\v^\s*\z([^ #@][^ #]*\:)\s+(\.\.\.\s*(#.*)*\_s*ACTION)@=/ end=/\v^\s*\.\.\.(\s+\z1)?\s*((#.*)*$)@=/ contains=plumedComment,plumedKeywordsACTNAME,plumedString,plumedSpecialACTNAME,plumedDots fold
-highlight link plumedActionACTNAME Type
-highlight link plumedKeywordsACTNAME Statement
-EOF
+execute 'syntax region plumedLCLine' . action_ . ' matchgroup=plumedAction' . action_ . ' start=/\v^\s*\z([^ #@][^ #]*\:)\s+(\.\.\.\s*(#.*)*\_s*' . action . ')@=/ end=/\v^\s*\.\.\.(\s+\z1)?\s*((#.*)*$)@=/ contains=plumedComment,plumedKeywords' . action_ . ',plumedString,plumedSpecial' . action_ . ',plumedDots fold'
+execute 'highlight link plumedAction' . action_ . ' Type'
+execute 'highlight link plumedKeywords' . action_ . ' Statement'
+endfor
 
-done
-cat << \EOF
 " comments and strings last, with highest priority
 syntax region  plumedString start=/\v\{/  end=/\v\}/ contained contains=plumedString fold
 syntax region  plumedStringOneline start=/\v\{/  end=/\v\}/ oneline contained contains=plumedStringOneline fold
@@ -202,6 +290,9 @@ highlight link plumedLabelWrong Error
 syntax region  plumedComment start="\v^\s*ENDPLUMED>" end="\%$" fold
 syntax match   plumedComment excludenl "\v#.*$"
 highlight link plumedComment Comment
+endfunction
+
+call PlumedDefineSyntax()
 
 
 fun! PlumedGuessRegion()
@@ -238,6 +329,10 @@ fun! PlumedContextManual()
       execute 'rightbelow split | view ' name
     endif
     let b:plumed_helpfile=1
+" this is to allow closing the window with F2
+    if exists("g:plumed_shortcuts")
+      noremap  <buffer> <F2> :PHelp<CR>
+    endif
   endif
 endfunction
 
@@ -258,7 +353,7 @@ fun! PlumedComplete(findstart, base)
             " locate the start of the word
             let line = getline('.')
             let start = col('.') - 1
-            while start > 0 && line[start - 1] =~ '[a-zA-Z\_]'
+            while start > 0 && line[start - 1] =~ '[a-zA-Z\_\=\-\.]'
               let start -= 1
             endwhile
             return start
@@ -279,8 +374,9 @@ fun! PlumedComplete(findstart, base)
               endif
             endif
             let comp=[]
-" normalize key removing L/C/LC indication
-            let key1=substitute(key,"^plumed[LC]*Line","plumedLine","")
+" retrieve action name
+" normalize ___ to -
+            let key1=substitute(substitute(key,"^plumed[LC]*Line","",""),"___","-","g")
             if key ==""
 " if outside of any region, complete with list of actions
               let comp=b:plumedActions
@@ -308,6 +404,18 @@ fun! PlumedComplete(findstart, base)
 " so it should be unlet to iterate the loop
               unlet m
             endfor
+"           if("..." =~ '^' . a:base && (key=~"^plumedLLine.*" || key=~"^plumedLine.*"))
+"             call add(res,{"word":"...","menu":"(start multiline statement)"})
+"           endif
+"           if("..." =~ '^' . a:base && (key=~"^plumedLCLine.*" || key=~"^plumedCLine.*") && getline('.')=~'^\s*$')
+"              call add(res,{"word":"...","menu":"(end multiline statement)"})
+"           endif
+            if("#" =~ '^' . a:base && key!="plumedComment") 
+               call add(res,{"word":"#","menu":"(add comment)"})
+            endif
+            if("ENDPLUMED" =~ '^' . a:base && key =="")
+               call add(res,{"word":"ENDPLUMED","menu":"(end input)"})
+            endif
             return res
           endif
         endfun