From 1627d8696a496b49e82f833dae0906897e3c7b13 Mon Sep 17 00:00:00 2001
From: Carlo Camilloni <carlo.camilloni@gmail.com>
Date: Tue, 7 Jul 2015 11:34:47 +0200
Subject: [PATCH] patch: gromacs 5.1-rc1

---
 .../src/gromacs/CMakeLists.txt                |  14 +-
 .../src/gromacs/CMakeLists.txt.preplumed      |  14 +-
 .../src/gromacs/mdlib/minimize.cpp            |  10 +-
 .../src/gromacs/mdlib/minimize.cpp.preplumed  |  10 +-
 .../src/programs/mdrun/md.cpp                 | 712 ++++++++----------
 .../src/programs/mdrun/md.cpp.preplumed       | 698 ++++++++---------
 .../src/programs/mdrun/mdrun.cpp              | 107 +--
 .../src/programs/mdrun/mdrun.cpp.preplumed    | 107 +--
 .../src/programs/mdrun/repl_ex.cpp            |   4 +-
 .../src/programs/mdrun/repl_ex.cpp.preplumed  |   4 +-
 10 files changed, 703 insertions(+), 977 deletions(-)

diff --git a/patches/gromacs-5.1.0.diff/src/gromacs/CMakeLists.txt b/patches/gromacs-5.1.0.diff/src/gromacs/CMakeLists.txt
index 6ce9d1258..1739bd4f9 100644
--- a/patches/gromacs-5.1.0.diff/src/gromacs/CMakeLists.txt
+++ b/patches/gromacs-5.1.0.diff/src/gromacs/CMakeLists.txt
@@ -100,6 +100,7 @@ add_subdirectory(ewald)
 add_subdirectory(fft)
 add_subdirectory(linearalgebra)
 add_subdirectory(math)
+add_subdirectory(mdrunutility)
 add_subdirectory(random)
 add_subdirectory(onlinehelp)
 add_subdirectory(options)
@@ -167,7 +168,7 @@ if(GMX_USE_GCC44_BUG_WORKAROUND)
    gmx_apply_gcc44_bug_workaround("mdlib/constr.c")
 endif()
 
-if (GMX_GPU)
+if (GMX_GPU AND NOT GMX_USE_OPENCL)
     cuda_add_library(libgromacs ${LIBGROMACS_SOURCES}
             OPTIONS
             RELWITHDEBINFO -g
@@ -204,7 +205,7 @@ target_link_libraries(libgromacs
                       ${TNG_IO_LIBRARIES}
                       ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
                       ${XML_LIBRARIES}
-                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${PLUMED_LOAD})
+                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${OPENCL_LIBRARIES}} ${PLUMED_LOAD})
 set_target_properties(libgromacs PROPERTIES
                       OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
                       SOVERSION ${LIBRARY_SOVERSION_MAJOR}
@@ -229,7 +230,7 @@ if (NOT GMX_BUILD_MDRUN_ONLY)
 endif()
 
 if (INSTALL_CUDART_LIB) #can be set manual by user
-    if (GMX_GPU)
+    if (GMX_GPU AND NOT GMX_USE_OPENCL)
         foreach(CUDA_LIB ${CUDA_LIBRARIES})
             string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
             if(IS_CUDART) #libcuda should not be installed
@@ -243,3 +244,10 @@ if (INSTALL_CUDART_LIB) #can be set manual by user
         message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU")
     endif()
 endif()
+
+if(GMX_GPU AND GMX_USE_OPENCL)
+    set(OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS})
+
+    install(FILES ${OPENCL_KERNELS} DESTINATION
+        ${OCL_INSTALL_DIR} COMPONENT libraries)
+endif()
diff --git a/patches/gromacs-5.1.0.diff/src/gromacs/CMakeLists.txt.preplumed b/patches/gromacs-5.1.0.diff/src/gromacs/CMakeLists.txt.preplumed
index 3f426e02a..26d711138 100644
--- a/patches/gromacs-5.1.0.diff/src/gromacs/CMakeLists.txt.preplumed
+++ b/patches/gromacs-5.1.0.diff/src/gromacs/CMakeLists.txt.preplumed
@@ -98,6 +98,7 @@ add_subdirectory(ewald)
 add_subdirectory(fft)
 add_subdirectory(linearalgebra)
 add_subdirectory(math)
+add_subdirectory(mdrunutility)
 add_subdirectory(random)
 add_subdirectory(onlinehelp)
 add_subdirectory(options)
@@ -165,7 +166,7 @@ if(GMX_USE_GCC44_BUG_WORKAROUND)
    gmx_apply_gcc44_bug_workaround("mdlib/constr.c")
 endif()
 
-if (GMX_GPU)
+if (GMX_GPU AND NOT GMX_USE_OPENCL)
     cuda_add_library(libgromacs ${LIBGROMACS_SOURCES}
             OPTIONS
             RELWITHDEBINFO -g
@@ -202,7 +203,7 @@ target_link_libraries(libgromacs
                       ${TNG_IO_LIBRARIES}
                       ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
                       ${XML_LIBRARIES}
-                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS})
+                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${OPENCL_LIBRARIES})
 set_target_properties(libgromacs PROPERTIES
                       OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
                       SOVERSION ${LIBRARY_SOVERSION_MAJOR}
@@ -227,7 +228,7 @@ if (NOT GMX_BUILD_MDRUN_ONLY)
 endif()
 
 if (INSTALL_CUDART_LIB) #can be set manual by user
-    if (GMX_GPU)
+    if (GMX_GPU AND NOT GMX_USE_OPENCL)
         foreach(CUDA_LIB ${CUDA_LIBRARIES})
             string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
             if(IS_CUDART) #libcuda should not be installed
@@ -241,3 +242,10 @@ if (INSTALL_CUDART_LIB) #can be set manual by user
         message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU")
     endif()
 endif()
+
+if(GMX_GPU AND GMX_USE_OPENCL)
+    set(OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS})
+
+    install(FILES ${OPENCL_KERNELS} DESTINATION
+        ${OCL_INSTALL_DIR} COMPONENT libraries)
+endif()
diff --git a/patches/gromacs-5.1.0.diff/src/gromacs/mdlib/minimize.cpp b/patches/gromacs-5.1.0.diff/src/gromacs/mdlib/minimize.cpp
index de6c16544..8f754b36c 100644
--- a/patches/gromacs-5.1.0.diff/src/gromacs/mdlib/minimize.cpp
+++ b/patches/gromacs-5.1.0.diff/src/gromacs/mdlib/minimize.cpp
@@ -717,14 +717,12 @@ static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr,
                                    t_nrnb *nrnb, gmx_wallcycle_t wcycle)
 {
     /* Repartition the domain decomposition */
-    wallcycle_start(wcycle, ewcDOMDEC);
     dd_partition_system(fplog, step, cr, FALSE, 1,
                         NULL, top_global, ir,
                         &ems->s, &ems->f,
                         mdatoms, top, fr, vsite, NULL, constr,
                         nrnb, wcycle, FALSE);
     dd_store_state(cr->dd, &ems->s);
-    wallcycle_stop(wcycle, ewcDOMDEC);
 }
 
 static void evaluate_energy(FILE *fplog, t_commrec *cr,
@@ -1032,7 +1030,6 @@ double do_cg(FILE *fplog, t_commrec *cr,
              int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
              gmx_membed_t gmx_unused membed,
              real gmx_unused cpt_period, real gmx_unused max_hours,
-             const char gmx_unused *deviceOptions,
              int imdport,
              unsigned long gmx_unused Flags,
              gmx_walltime_accounting_t walltime_accounting)
@@ -1659,7 +1656,6 @@ double do_lbfgs(FILE *fplog, t_commrec *cr,
                 int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
                 gmx_membed_t gmx_unused membed,
                 real gmx_unused cpt_period, real gmx_unused max_hours,
-                const char gmx_unused *deviceOptions,
                 int imdport,
                 unsigned long gmx_unused Flags,
                 gmx_walltime_accounting_t walltime_accounting)
@@ -2469,7 +2465,6 @@ double do_steep(FILE *fplog, t_commrec *cr,
                 int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
                 gmx_membed_t gmx_unused membed,
                 real gmx_unused cpt_period, real gmx_unused max_hours,
-                const char  gmx_unused *deviceOptions,
                 int imdport,
                 unsigned long gmx_unused Flags,
                 gmx_walltime_accounting_t walltime_accounting)
@@ -2709,7 +2704,6 @@ double do_nm(FILE *fplog, t_commrec *cr,
              int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
              gmx_membed_t gmx_unused membed,
              real gmx_unused cpt_period, real gmx_unused max_hours,
-             const char gmx_unused *deviceOptions,
              int imdport,
              unsigned long gmx_unused Flags,
              gmx_walltime_accounting_t walltime_accounting)
@@ -2761,9 +2755,9 @@ double do_nm(FILE *fplog, t_commrec *cr,
     if (bIsMaster)
     {
         fprintf(stderr,
-                "NOTE: This version of Gromacs has been compiled in single precision,\n"
+                "NOTE: This version of GROMACS has been compiled in single precision,\n"
                 "      which MIGHT not be accurate enough for normal mode analysis.\n"
-                "      Gromacs now uses sparse matrix storage, so the memory requirements\n"
+                "      GROMACS now uses sparse matrix storage, so the memory requirements\n"
                 "      are fairly modest even if you recompile in double precision.\n\n");
     }
 #endif
diff --git a/patches/gromacs-5.1.0.diff/src/gromacs/mdlib/minimize.cpp.preplumed b/patches/gromacs-5.1.0.diff/src/gromacs/mdlib/minimize.cpp.preplumed
index c4bd5c77b..e70ceb8be 100644
--- a/patches/gromacs-5.1.0.diff/src/gromacs/mdlib/minimize.cpp.preplumed
+++ b/patches/gromacs-5.1.0.diff/src/gromacs/mdlib/minimize.cpp.preplumed
@@ -673,14 +673,12 @@ static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr,
                                    t_nrnb *nrnb, gmx_wallcycle_t wcycle)
 {
     /* Repartition the domain decomposition */
-    wallcycle_start(wcycle, ewcDOMDEC);
     dd_partition_system(fplog, step, cr, FALSE, 1,
                         NULL, top_global, ir,
                         &ems->s, &ems->f,
                         mdatoms, top, fr, vsite, NULL, constr,
                         nrnb, wcycle, FALSE);
     dd_store_state(cr->dd, &ems->s);
-    wallcycle_stop(wcycle, ewcDOMDEC);
 }
 
 static void evaluate_energy(FILE *fplog, t_commrec *cr,
@@ -953,7 +951,6 @@ double do_cg(FILE *fplog, t_commrec *cr,
              int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
              gmx_membed_t gmx_unused membed,
              real gmx_unused cpt_period, real gmx_unused max_hours,
-             const char gmx_unused *deviceOptions,
              int imdport,
              unsigned long gmx_unused Flags,
              gmx_walltime_accounting_t walltime_accounting)
@@ -1580,7 +1577,6 @@ double do_lbfgs(FILE *fplog, t_commrec *cr,
                 int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
                 gmx_membed_t gmx_unused membed,
                 real gmx_unused cpt_period, real gmx_unused max_hours,
-                const char gmx_unused *deviceOptions,
                 int imdport,
                 unsigned long gmx_unused Flags,
                 gmx_walltime_accounting_t walltime_accounting)
@@ -2390,7 +2386,6 @@ double do_steep(FILE *fplog, t_commrec *cr,
                 int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
                 gmx_membed_t gmx_unused membed,
                 real gmx_unused cpt_period, real gmx_unused max_hours,
-                const char  gmx_unused *deviceOptions,
                 int imdport,
                 unsigned long gmx_unused Flags,
                 gmx_walltime_accounting_t walltime_accounting)
@@ -2630,7 +2625,6 @@ double do_nm(FILE *fplog, t_commrec *cr,
              int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
              gmx_membed_t gmx_unused membed,
              real gmx_unused cpt_period, real gmx_unused max_hours,
-             const char gmx_unused *deviceOptions,
              int imdport,
              unsigned long gmx_unused Flags,
              gmx_walltime_accounting_t walltime_accounting)
@@ -2682,9 +2676,9 @@ double do_nm(FILE *fplog, t_commrec *cr,
     if (bIsMaster)
     {
         fprintf(stderr,
-                "NOTE: This version of Gromacs has been compiled in single precision,\n"
+                "NOTE: This version of GROMACS has been compiled in single precision,\n"
                 "      which MIGHT not be accurate enough for normal mode analysis.\n"
-                "      Gromacs now uses sparse matrix storage, so the memory requirements\n"
+                "      GROMACS now uses sparse matrix storage, so the memory requirements\n"
                 "      are fairly modest even if you recompile in double precision.\n\n");
     }
 #endif
diff --git a/patches/gromacs-5.1.0.diff/src/programs/mdrun/md.cpp b/patches/gromacs-5.1.0.diff/src/programs/mdrun/md.cpp
index c23b2a87b..dcef103cd 100644
--- a/patches/gromacs-5.1.0.diff/src/programs/mdrun/md.cpp
+++ b/patches/gromacs-5.1.0.diff/src/programs/mdrun/md.cpp
@@ -95,7 +95,7 @@
 #include "gromacs/mdlib/compute_io.h"
 #include "gromacs/mdlib/mdrun_signalling.h"
 #include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
 #include "gromacs/pbcutil/mshift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pulling/pull.h"
@@ -139,7 +139,10 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr,
     md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
                   gmx_step_str(step, sbuf));
 
-    nbnxn_cuda_reset_timings(nbv);
+    if (use_GPU(nbv))
+    {
+        nbnxn_gpu_reset_timings(nbv);
+    }
 
     wallcycle_stop(wcycle, ewcRUN);
     wallcycle_reset_all(wcycle);
@@ -169,7 +172,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
              gmx_edsam_t ed, t_forcerec *fr,
              int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed,
              real cpt_period, real max_hours,
-             const char gmx_unused *deviceOptions,
              int imdport,
              unsigned long Flags,
              gmx_walltime_accounting_t walltime_accounting)
@@ -207,7 +209,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     t_graph          *graph = NULL;
     gmx_signalling_t  gs;
     gmx_groups_t     *groups;
-    gmx_ekindata_t   *ekind, *ekind_save;
+    gmx_ekindata_t   *ekind;
     gmx_shellfc_t     shellfc;
     int               count, nconverged = 0;
     double            tcount                 = 0;
@@ -221,7 +223,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     matrix            lastbox;
     int               lamnew  = 0;
     /* for FEP */
-    int               nstfep;
+    int               nstfep = 0;
     double            cycles;
     real              saved_conserved_quantity = 0;
     real              last_ekin                = 0;
@@ -233,9 +235,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                                                           simulation stops. If equal to zero, don't
                                                                           communicate any more between multisims.*/
     /* PME load balancing data for GPU kernels */
-    pme_load_balancing_t pme_loadbal = NULL;
-    double               cycles_pmes;
-    gmx_bool             bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
+    pme_load_balancing_t *pme_loadbal;
+    gmx_bool              bPMETune         = FALSE;
+    gmx_bool              bPMETunePrinting = FALSE;
 
     /* Interactive MD */
     gmx_bool          bIMDstep = FALSE;
@@ -285,19 +287,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     nstglobalcomm   = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
     bGStatEveryStep = (nstglobalcomm == 1);
 
-    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
-    {
-        fprintf(fplog,
-                "To reduce the energy communication with nstlist = -1\n"
-                "the neighbor list validity should not be checked at every step,\n"
-                "this means that exact integration is not guaranteed.\n"
-                "The neighbor list validity is checked after:\n"
-                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
-                "In most cases this will result in exact integration.\n"
-                "This reduces the energy communication by a factor of 2 to 3.\n"
-                "If you want less energy communication, set nstlist > 3.\n\n");
-    }
-
     if (bRerunMD)
     {
         ir->nstxout_compressed = 0;
@@ -329,9 +318,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     /* Kinetic energy data */
     snew(ekind, 1);
     init_ekindata(fplog, top_global, &(ir->opts), ekind);
-    /* needed for iteration of constraints */
-    snew(ekind_save, 1);
-    init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
     /* Copy the cos acceleration to the groups struct */
     ekind->cosacc.cos_accel = ir->cos_accel;
 
@@ -440,7 +426,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                             state_global, top_global, ir,
                             state, &f, mdatoms, top, fr,
                             vsite, shellfc, constr,
-                            nrnb, wcycle, FALSE);
+                            nrnb, NULL, FALSE);
 
     }
 
@@ -494,26 +480,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                         repl_ex_nst, repl_ex_nex, repl_ex_seed);
     }
 
-    /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
-     * PME tuning is not supported with PME only for LJ and not for Coulomb.
+    /* PME tuning is only supported with PME for Coulomb. Is is not supported
+     * with only LJ PME, or for reruns.
      */
-    if ((Flags & MD_TUNEPME) &&
-        EEL_PME(fr->eeltype) &&
-        ( use_GPU(fr->nbv) || !(cr->duty & DUTY_PME)) &&
-        !bRerunMD)
+    bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD &&
+                !(Flags & MD_REPRODUCIBLE));
+    if (bPMETune)
     {
-        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
-        cycles_pmes = 0;
-        if (cr->duty & DUTY_PME)
-        {
-            /* Start tuning right away, as we can't measure the load */
-            bPMETuneRunning = TRUE;
-        }
-        else
-        {
-            /* Separate PME nodes, we can measure the PP/PME load balance */
-            bPMETuneTry = TRUE;
-        }
+        pme_loadbal_init(&pme_loadbal, cr, fplog, ir, state->box,
+                         fr->ic, fr->pmedata, use_GPU(fr->nbv),
+                         &bPMETunePrinting);
     }
 
     if (!ir->bContinuation && !bRerunMD)
@@ -550,21 +526,47 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
     debug_gmx();
 
-    /* set free energy calculation frequency as the minimum
-       greatest common denominator of nstdhdl, nstexpanded, and repl_ex_nst*/
-    nstfep = ir->fepvals->nstdhdl;
-    if (ir->bExpanded)
+    if (IR_TWINRANGE(*ir) && repl_ex_nst % ir->nstcalclr != 0)
     {
-        nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl, nstfep);
+        /* We should exchange at nstcalclr steps to get correct integration */
+        gmx_fatal(FARGS, "The replica exchange period (%d) is not divisible by nstcalclr (%d)", repl_ex_nst, ir->nstcalclr);
     }
-    if (repl_ex_nst > 0)
+
+    if (ir->efep != efepNO)
     {
-        nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep);
+        /* Set free energy calculation frequency as the greatest common
+         * denominator of nstdhdl and repl_ex_nst.
+         * Check for nstcalclr with twin-range, since we need the long-range
+         * contribution to the free-energy at the correct (nstcalclr) steps.
+         */
+        nstfep = ir->fepvals->nstdhdl;
+        if (ir->bExpanded)
+        {
+            if (IR_TWINRANGE(*ir) &&
+                ir->expandedvals->nstexpanded % ir->nstcalclr != 0)
+            {
+                gmx_fatal(FARGS, "nstexpanded should be divisible by nstcalclr");
+            }
+        }
+        if (repl_ex_nst > 0)
+        {
+            nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep);
+        }
+        /* We checked divisibility of repl_ex_nst and nstcalclr above */
+        if (IR_TWINRANGE(*ir) && nstfep % ir->nstcalclr != 0)
+        {
+            gmx_incons("nstfep not divisible by nstcalclr");
+        }
     }
 
-    /* I'm assuming we need global communication the first time! MRS */
+    /* Be REALLY careful about what flags you set here. You CANNOT assume
+     * this is the first step, since we might be restarting from a checkpoint,
+     * and in that case we should not do any modifications to the state.
+     */
+    bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
+
     cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
-                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
+                  | (bStopCM ? CGLO_STOPCM : 0)
                   | (bVV ? CGLO_PRESSURE : 0)
                   | (bVV ? CGLO_CONSTRAINT : 0)
                   | (bRerunMD ? CGLO_RERUNMD : 0)
@@ -804,6 +806,20 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     while (!bLastStep || (bRerunMD && bNotLastFrame))
     {
 
+        /* Determine if this is a neighbor search step */
+        bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+
+        if (bPMETune && bNStList)
+        {
+            /* PME grid + cut-off optimization with GPUs or PME nodes */
+            pme_loadbal_do(pme_loadbal, cr,
+                           (bVerbose && MASTER(cr)) ? stderr : NULL,
+                           fplog,
+                           ir, fr, state, wcycle,
+                           step, step_rel,
+                           &bPMETunePrinting);
+        }
+
         wallcycle_start(wcycle, ewcSTEP);
 
         if (bRerunMD)
@@ -835,7 +851,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
             set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
             bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
-            bDoFEP       = (do_per_step(step, nstfep) && (ir->efep != efepNO));
+            bDoFEP       = ((ir->efep != efepNO) && do_per_step(step, nstfep));
             bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
                             && (ir->bExpanded) && (step > 0) && (!bStartingFromCpt));
         }
@@ -918,9 +934,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         else
         {
             /* Determine whether or not to do Neighbour Searching and LR */
-            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
-
-            bNS = (bFirstStep || bExchanged || bNeedRepartition || bNStList || bDoFEP);
+            bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
         }
 
         /* check whether we should stop because another simulation has
@@ -986,25 +1000,22 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             }
 
             if (DOMAINDECOMP(cr))
+
+                /* PLUMED */
+                if(plumedswitch){
+                  plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home);
+                  plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex);
+                }
+                /* END PLUMED */
             {
                 /* Repartition the domain decomposition */
-                wallcycle_start(wcycle, ewcDOMDEC);
                 dd_partition_system(fplog, step, cr,
                                     bMasterState, nstglobalcomm,
                                     state_global, top_global, ir,
                                     state, &f, mdatoms, top, fr,
                                     vsite, shellfc, constr,
                                     nrnb, wcycle,
-                                    do_verbose && !bPMETuneRunning);
-                wallcycle_stop(wcycle, ewcDOMDEC);
-                /* If using an iterative integrator, reallocate space to match the decomposition */
-
-                /* PLUMED */
-                if(plumedswitch){
-                  plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home);
-                  plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex);
-                }
-                /* END PLUMED */
+                                    do_verbose && !bPMETunePrinting);
             }
         }
 
@@ -1209,96 +1220,92 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                           ekind, M, upd, bInitStep, etrtVELOCITY1,
                           cr, nrnb, constr, &top->idef);
 
-            /* TODO remove the brace below, once iteration is removed */
+            if (!bRerunMD || rerun_fr.bV || bForceUpdate)         /* Why is rerun_fr.bV here?  Unclear. */
             {
-                if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
-                {
-                    wallcycle_stop(wcycle, ewcUPDATE);
-                    update_constraints(fplog, step, NULL, ir, mdatoms,
-                                       state, fr->bMolPBC, graph, f,
-                                       &top->idef, shake_vir,
-                                       cr, nrnb, wcycle, upd, constr,
-                                       TRUE, bCalcVir);
-                    wallcycle_start(wcycle, ewcUPDATE);
-                    if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
-                    {
-                        /* Correct the virial for multiple time stepping */
-                        m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
-                    }
-                }
-                else if (graph)
-                {
-                    /* Need to unshift here if a do_force has been
-                       called in the previous step */
-                    unshift_self(graph, state->box, state->x);
-                }
-                /* if VV, compute the pressure and constraints */
-                /* For VV2, we strictly only need this if using pressure
-                 * control, but we really would like to have accurate pressures
-                 * printed out.
-                 * Think about ways around this in the future?
-                 * For now, keep this choice in comments.
-                 */
-                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
-                /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
-                bPres = TRUE;
-                bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
-                if (bCalcEner && ir->eI == eiVVAK)
+                wallcycle_stop(wcycle, ewcUPDATE);
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, shake_vir,
+                                   cr, nrnb, wcycle, upd, constr,
+                                   TRUE, bCalcVir);
+                wallcycle_start(wcycle, ewcUPDATE);
+                if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
                 {
-                    bSumEkinhOld = TRUE;
+                    /* Correct the virial for multiple time stepping */
+                    m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
                 }
-                /* for vv, the first half of the integration actually corresponds to the previous step.
-                   So we need information from the last step in the first half of the integration */
-                if (bGStat || do_per_step(step-1, nstglobalcomm))
+            }
+            else if (graph)
+            {
+                /* Need to unshift here if a do_force has been
+                   called in the previous step */
+                unshift_self(graph, state->box, state->x);
+            }
+            /* if VV, compute the pressure and constraints */
+            /* For VV2, we strictly only need this if using pressure
+             * control, but we really would like to have accurate pressures
+             * printed out.
+             * Think about ways around this in the future?
+             * For now, keep this choice in comments.
+             */
+            /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+            /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+            bPres = TRUE;
+            bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+            if (bCalcEner && ir->eI == eiVVAK)
+            {
+                bSumEkinhOld = TRUE;
+            }
+            /* for vv, the first half of the integration actually corresponds to the previous step.
+               So we need information from the last step in the first half of the integration */
+            if (bGStat || do_per_step(step-1, nstglobalcomm))
+            {
+                wallcycle_stop(wcycle, ewcUPDATE);
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, NULL, FALSE, state->box,
+                                top_global, &bSumEkinhOld,
+                                cglo_flags
+                                | CGLO_ENERGY
+                                | (bTemp ? CGLO_TEMPERATURE : 0)
+                                | (bPres ? CGLO_PRESSURE : 0)
+                                | (bPres ? CGLO_CONSTRAINT : 0)
+                                | CGLO_SCALEEKIN
+                                );
+                /* explanation of above:
+                   a) We compute Ekin at the full time step
+                   if 1) we are using the AveVel Ekin, and it's not the
+                   initial step, or 2) if we are using AveEkin, but need the full
+                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+                   EkinAveVel because it's needed for the pressure */
+                wallcycle_start(wcycle, ewcUPDATE);
+            }
+            /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+            if (!bInitStep)
+            {
+                if (bTrotter)
                 {
-                    wallcycle_stop(wcycle, ewcUPDATE);
-                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
-                                    constr, NULL, FALSE, state->box,
-                                    top_global, &bSumEkinhOld,
-                                    cglo_flags
-                                    | CGLO_ENERGY
-                                    | (bTemp ? CGLO_TEMPERATURE : 0)
-                                    | (bPres ? CGLO_PRESSURE : 0)
-                                    | (bPres ? CGLO_CONSTRAINT : 0)
-                                    | CGLO_SCALEEKIN
-                                    );
-                    /* explanation of above:
-                       a) We compute Ekin at the full time step
-                       if 1) we are using the AveVel Ekin, and it's not the
-                       initial step, or 2) if we are using AveEkin, but need the full
-                       time step kinetic energy for the pressure (always true now, since we want accurate statistics).
-                       b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
-                       EkinAveVel because it's needed for the pressure */
-                    wallcycle_start(wcycle, ewcUPDATE);
+                    m_add(force_vir, shake_vir, total_vir);     /* we need the un-dispersion corrected total vir here */
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
                 }
-                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
-                if (!bInitStep)
+                else
                 {
-                    if (bTrotter)
+                    if (bExchanged)
                     {
-                        m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
-                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
-                    }
-                    else
-                    {
-                        if (bExchanged)
-                        {
-                            wallcycle_stop(wcycle, ewcUPDATE);
-                            /* We need the kinetic energy at minus the half step for determining
-                             * the full step kinetic energy and possibly for T-coupling.*/
-                            /* This may not be quite working correctly yet . . . . */
-                            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
-                                            constr, NULL, FALSE, state->box,
-                                            top_global, &bSumEkinhOld,
-                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
-                            wallcycle_start(wcycle, ewcUPDATE);
-                        }
+                        wallcycle_stop(wcycle, ewcUPDATE);
+                        /* We need the kinetic energy at minus the half step for determining
+                         * the full step kinetic energy and possibly for T-coupling.*/
+                        /* This may not be quite working correctly yet . . . . */
+                        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                        wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                                        constr, NULL, FALSE, state->box,
+                                        top_global, &bSumEkinhOld,
+                                        CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+                        wallcycle_start(wcycle, ewcUPDATE);
                     }
                 }
             }
-            /* TODO remove the brace above, once iteration is removed */
             if (bTrotter && !bInitStep)
             {
                 copy_mat(shake_vir, state->svir_prev);
@@ -1462,63 +1469,95 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 }
             }
         }
-        /* TODO remove the brace below, once iteration is removed */
+        /* #########   START SECOND UPDATE STEP ################# */
+        /* Box is changed in update() when we do pressure coupling,
+         * but we should still use the old box for energy corrections and when
+         * writing it to the energy file, so it matches the trajectory files for
+         * the same timestep above. Make a copy in a separate array.
+         */
+        copy_mat(state->box, lastbox);
+
+        dvdl_constr = 0;
+
+        if (!bRerunMD || rerun_fr.bV || bForceUpdate)
         {
-            /* #########   START SECOND UPDATE STEP ################# */
-            /* Box is changed in update() when we do pressure coupling,
-             * but we should still use the old box for energy corrections and when
-             * writing it to the energy file, so it matches the trajectory files for
-             * the same timestep above. Make a copy in a separate array.
-             */
-            copy_mat(state->box, lastbox);
+            wallcycle_start(wcycle, ewcUPDATE);
+            /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+            if (bTrotter)
+            {
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+                /* We can only do Berendsen coupling after we have summed
+                 * the kinetic energy or virial. Since the happens
+                 * in global_state after update, we should only do it at
+                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
+                 */
+            }
+            else
+            {
+                update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
+                update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep);
+            }
 
-            dvdl_constr = 0;
+            if (bVV)
+            {
+                bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+                /* velocity half-step update */
+                update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                              bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
+                              ekind, M, upd, FALSE, etrtVELOCITY2,
+                              cr, nrnb, constr, &top->idef);
+            }
+
+            /* Above, initialize just copies ekinh into ekin,
+             * it doesn't copy position (for VV),
+             * and entire integrator for MD.
+             */
 
-            if (!bRerunMD || rerun_fr.bV || bForceUpdate)
+            if (ir->eI == eiVVAK)
             {
-                wallcycle_start(wcycle, ewcUPDATE);
-                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
-                if (bTrotter)
-                {
-                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
-                    /* We can only do Berendsen coupling after we have summed
-                     * the kinetic energy or virial. Since the happens
-                     * in global_state after update, we should only do it at
-                     * step % nstlist = 1 with bGStatEveryStep=FALSE.
-                     */
-                }
-                else
+                /* We probably only need md->homenr, not state->natoms */
+                if (state->natoms > cbuf_nalloc)
                 {
-                    update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
-                    update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep);
+                    cbuf_nalloc = state->natoms;
+                    srenew(cbuf, cbuf_nalloc);
                 }
+                copy_rvecn(state->x, cbuf, 0, state->natoms);
+            }
+            bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
-                if (bVV)
-                {
-                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+            update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                          bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
+                          ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+            wallcycle_stop(wcycle, ewcUPDATE);
 
-                    /* velocity half-step update */
-                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                                  bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
-                                  ekind, M, upd, FALSE, etrtVELOCITY2,
-                                  cr, nrnb, constr, &top->idef);
-                }
+            update_constraints(fplog, step, &dvdl_constr, ir, mdatoms, state,
+                               fr->bMolPBC, graph, f,
+                               &top->idef, shake_vir,
+                               cr, nrnb, wcycle, upd, constr,
+                               FALSE, bCalcVir);
 
-                /* Above, initialize just copies ekinh into ekin,
-                 * it doesn't copy position (for VV),
-                 * and entire integrator for MD.
-                 */
+            if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
+            {
+                /* Correct the virial for multiple time stepping */
+                m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
+            }
+
+            if (ir->eI == eiVVAK)
+            {
+                /* erase F_EKIN and F_TEMP here? */
+                /* just compute the kinetic energy at the half step to perform a trotter step */
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, NULL, FALSE, lastbox,
+                                top_global, &bSumEkinhOld,
+                                cglo_flags | CGLO_TEMPERATURE
+                                );
+                wallcycle_start(wcycle, ewcUPDATE);
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+                /* now we know the scaling, we can compute the positions again again */
+                copy_rvecn(cbuf, state->x, 0, state->natoms);
 
-                if (ir->eI == eiVVAK)
-                {
-                    /* We probably only need md->homenr, not state->natoms */
-                    if (state->natoms > cbuf_nalloc)
-                    {
-                        cbuf_nalloc = state->natoms;
-                        srenew(cbuf, cbuf_nalloc);
-                    }
-                    copy_rvecn(state->x, cbuf, 0, state->natoms);
-                }
                 bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
                 update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
@@ -1526,129 +1565,93 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                               ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
                 wallcycle_stop(wcycle, ewcUPDATE);
 
-                update_constraints(fplog, step, &dvdl_constr, ir, mdatoms, state,
-                                   fr->bMolPBC, graph, f,
-                                   &top->idef, shake_vir,
-                                   cr, nrnb, wcycle, upd, constr,
+                /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+                /* are the small terms in the shake_vir here due
+                 * to numerical errors, or are they important
+                 * physically? I'm thinking they are just errors, but not completely sure.
+                 * For now, will call without actually constraining, constr=NULL*/
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, tmp_vir,
+                                   cr, nrnb, wcycle, upd, NULL,
                                    FALSE, bCalcVir);
-
-                if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
-                {
-                    /* Correct the virial for multiple time stepping */
-                    m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
-                }
-
-                if (ir->eI == eiVVAK)
-                {
-                    /* erase F_EKIN and F_TEMP here? */
-                    /* just compute the kinetic energy at the half step to perform a trotter step */
-                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
-                                    constr, NULL, FALSE, lastbox,
-                                    top_global, &bSumEkinhOld,
-                                    cglo_flags | CGLO_TEMPERATURE
-                                    );
-                    wallcycle_start(wcycle, ewcUPDATE);
-                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
-                    /* now we know the scaling, we can compute the positions again again */
-                    copy_rvecn(cbuf, state->x, 0, state->natoms);
-
-                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
-
-                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                                  bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
-                                  ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
-                    wallcycle_stop(wcycle, ewcUPDATE);
-
-                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
-                    /* are the small terms in the shake_vir here due
-                     * to numerical errors, or are they important
-                     * physically? I'm thinking they are just errors, but not completely sure.
-                     * For now, will call without actually constraining, constr=NULL*/
-                    update_constraints(fplog, step, NULL, ir, mdatoms,
-                                       state, fr->bMolPBC, graph, f,
-                                       &top->idef, tmp_vir,
-                                       cr, nrnb, wcycle, upd, NULL,
-                                       FALSE, bCalcVir);
-                }
-                if (bVV)
-                {
-                    /* this factor or 2 correction is necessary
-                       because half of the constraint force is removed
-                       in the vv step, so we have to double it.  See
-                       the Redmine issue #1255.  It is not yet clear
-                       if the factor of 2 is exact, or just a very
-                       good approximation, and this will be
-                       investigated.  The next step is to see if this
-                       can be done adding a dhdl contribution from the
-                       rattle step, but this is somewhat more
-                       complicated with the current code. Will be
-                       investigated, hopefully for 4.6.3. However,
-                       this current solution is much better than
-                       having it completely wrong.
-                     */
-                    enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
-                }
-                else
-                {
-                    enerd->term[F_DVDL_CONSTR] += dvdl_constr;
-                }
             }
-            else if (graph)
+            if (bVV)
             {
-                /* Need to unshift here */
-                unshift_self(graph, state->box, state->x);
+                /* this factor or 2 correction is necessary
+                   because half of the constraint force is removed
+                   in the vv step, so we have to double it.  See
+                   the Redmine issue #1255.  It is not yet clear
+                   if the factor of 2 is exact, or just a very
+                   good approximation, and this will be
+                   investigated.  The next step is to see if this
+                   can be done adding a dhdl contribution from the
+                   rattle step, but this is somewhat more
+                   complicated with the current code. Will be
+                   investigated, hopefully for 4.6.3. However,
+                   this current solution is much better than
+                   having it completely wrong.
+                 */
+                enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
             }
-
-            if (vsite != NULL)
+            else
             {
-                wallcycle_start(wcycle, ewcVSITECONSTR);
-                if (graph != NULL)
-                {
-                    shift_self(graph, state->box, state->x);
-                }
-                construct_vsites(vsite, state->x, ir->delta_t, state->v,
-                                 top->idef.iparams, top->idef.il,
-                                 fr->ePBC, fr->bMolPBC, cr, state->box);
-
-                if (graph != NULL)
-                {
-                    unshift_self(graph, state->box, state->x);
-                }
-                wallcycle_stop(wcycle, ewcVSITECONSTR);
+                enerd->term[F_DVDL_CONSTR] += dvdl_constr;
             }
+        }
+        else if (graph)
+        {
+            /* Need to unshift here */
+            unshift_self(graph, state->box, state->x);
+        }
 
-            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints  ############ */
-            /* With Leap-Frog we can skip compute_globals at
-             * non-communication steps, but we need to calculate
-             * the kinetic energy one step before communication.
-             */
-            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+        if (vsite != NULL)
+        {
+            wallcycle_start(wcycle, ewcVSITECONSTR);
+            if (graph != NULL)
             {
-                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
-                                constr, &gs,
-                                (step_rel % gs.nstms == 0) &&
-                                (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
-                                lastbox,
-                                top_global, &bSumEkinhOld,
-                                cglo_flags
-                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
-                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
-                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
-                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
-                                | CGLO_CONSTRAINT
-                                );
+                shift_self(graph, state->box, state->x);
             }
+            construct_vsites(vsite, state->x, ir->delta_t, state->v,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
 
-            /* #############  END CALC EKIN AND PRESSURE ################# */
+            if (graph != NULL)
+            {
+                unshift_self(graph, state->box, state->x);
+            }
+            wallcycle_stop(wcycle, ewcVSITECONSTR);
+        }
 
-            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
-               the virial that should probably be addressed eventually. state->veta has better properies,
-               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
-               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+        /* ############## IF NOT VV, Calculate globals HERE  ############ */
+        /* With Leap-Frog we can skip compute_globals at
+         * non-communication steps, but we need to calculate
+         * the kinetic energy one step before communication.
+         */
+        if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+        {
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                            wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                            constr, &gs,
+                            (step_rel % gs.nstms == 0) &&
+                            (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+                            lastbox,
+                            top_global, &bSumEkinhOld,
+                            cglo_flags
+                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+                            | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+                            | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+                            | CGLO_CONSTRAINT
+                            );
         }
-        /* TODO remove the brace above, once iteration is removed */
+
+        /* #############  END CALC EKIN AND PRESSURE ################# */
+
+        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+           the virial that should probably be addressed eventually. state->veta has better properies,
+           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
 
         if (ir->efep != efepNO && (!bVV || bRerunMD))
         {
@@ -1725,7 +1728,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             }
             if (ir->bPull)
             {
-                pull_print_output(ir->pull, step, t);
+                pull_print_output(ir->pull_work, step, t);
             }
 
             if (do_per_step(step, ir->nstlog))
@@ -1743,7 +1746,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             state->fep_state = lamnew;
         }
         /* Print the remaining wall clock time for the run */
-        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+        if (MULTIMASTER(cr) &&
+            (do_verbose || gmx_got_usr_signal()) &&
+            !bPMETunePrinting)
         {
             if (shellfc)
             {
@@ -1827,97 +1832,17 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             }
         }
 
-        if (!bRerunMD || !rerun_fr.bStep)
-        {
-            /* increase the MD step number */
-            step++;
-            step_rel++;
-        }
-
         cycles = wallcycle_stop(wcycle, ewcSTEP);
         if (DOMAINDECOMP(cr) && wcycle)
         {
             dd_cycles_add(cr->dd, cycles, ddCyclStep);
         }
 
-        if (bPMETuneRunning || bPMETuneTry)
+        if (!bRerunMD || !rerun_fr.bStep)
         {
-            /* PME grid + cut-off optimization with GPUs or PME nodes */
-
-            /* Count the total cycles over the last steps */
-            cycles_pmes += cycles;
-
-            /* We can only switch cut-off at NS steps */
-            if (step % ir->nstlist == 0)
-            {
-                /* PME grid + cut-off optimization with GPUs or PME nodes */
-                if (bPMETuneTry)
-                {
-                    if (DDMASTER(cr->dd))
-                    {
-                        /* PME node load is too high, start tuning */
-                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
-                    }
-                    dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
-
-                    if (bPMETuneRunning &&
-                        use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
-                        !(cr->duty & DUTY_PME))
-                    {
-                        /* Lock DLB=auto to off (does nothing when DLB=yes/no).
-                         * With GPUs + separate PME ranks, we don't want DLB.
-                         * This could happen when we scan coarse grids and
-                         * it would then never be turned off again.
-                         * This would hurt performance at the final, optimal
-                         * grid spacing, where DLB almost never helps.
-                         * Also, DLB can limit the cut-off for PME tuning.
-                         */
-                        dd_dlb_set_lock(cr->dd, TRUE);
-                    }
-
-                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
-                    {
-                        bPMETuneTry     = FALSE;
-                    }
-                }
-                if (bPMETuneRunning)
-                {
-                    /* init_step might not be a multiple of nstlist,
-                     * but the first cycle is always skipped anyhow.
-                     */
-                    bPMETuneRunning =
-                        pme_load_balance(pme_loadbal, cr,
-                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
-                                         fplog,
-                                         ir, state, cycles_pmes,
-                                         fr->ic, fr->nbv, &fr->pmedata,
-                                         step);
-
-                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
-                    fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
-                    fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
-                    fr->rlist         = fr->ic->rlist;
-                    fr->rlistlong     = fr->ic->rlistlong;
-                    fr->rcoulomb      = fr->ic->rcoulomb;
-                    fr->rvdw          = fr->ic->rvdw;
-
-                    if (ir->eDispCorr != edispcNO)
-                    {
-                        calc_enervirdiff(NULL, ir->eDispCorr, fr);
-                    }
-
-                    if (!bPMETuneRunning &&
-                        DOMAINDECOMP(cr) &&
-                        dd_dlb_is_locked(cr->dd))
-                    {
-                        /* Unlock the DLB=auto, DLB is allowed to activate
-                         * (but we don't expect it to activate in most cases).
-                         */
-                        dd_dlb_set_lock(cr->dd, FALSE);
-                    }
-                }
-                cycles_pmes = 0;
-            }
+            /* increase the MD step number */
+            step++;
+            step_rel++;
         }
 
         if (step_rel == wcycle_get_reset_counters(wcycle) ||
@@ -1975,10 +1900,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     done_mdoutf(outf);
     debug_gmx();
 
-    if (pme_loadbal != NULL)
+    if (bPMETune)
     {
-        pme_loadbal_done(pme_loadbal, cr, fplog,
-                         use_GPU(fr->nbv));
+        pme_loadbal_done(pme_loadbal, cr, fplog, use_GPU(fr->nbv));
     }
 
     if (shellfc && fplog)
diff --git a/patches/gromacs-5.1.0.diff/src/programs/mdrun/md.cpp.preplumed b/patches/gromacs-5.1.0.diff/src/programs/mdrun/md.cpp.preplumed
index b58a18443..f13727319 100644
--- a/patches/gromacs-5.1.0.diff/src/programs/mdrun/md.cpp.preplumed
+++ b/patches/gromacs-5.1.0.diff/src/programs/mdrun/md.cpp.preplumed
@@ -95,7 +95,7 @@
 #include "gromacs/mdlib/compute_io.h"
 #include "gromacs/mdlib/mdrun_signalling.h"
 #include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
 #include "gromacs/pbcutil/mshift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pulling/pull.h"
@@ -133,7 +133,10 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr,
     md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
                   gmx_step_str(step, sbuf));
 
-    nbnxn_cuda_reset_timings(nbv);
+    if (use_GPU(nbv))
+    {
+        nbnxn_gpu_reset_timings(nbv);
+    }
 
     wallcycle_stop(wcycle, ewcRUN);
     wallcycle_reset_all(wcycle);
@@ -163,7 +166,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
              gmx_edsam_t ed, t_forcerec *fr,
              int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed,
              real cpt_period, real max_hours,
-             const char gmx_unused *deviceOptions,
              int imdport,
              unsigned long Flags,
              gmx_walltime_accounting_t walltime_accounting)
@@ -201,7 +203,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     t_graph          *graph = NULL;
     gmx_signalling_t  gs;
     gmx_groups_t     *groups;
-    gmx_ekindata_t   *ekind, *ekind_save;
+    gmx_ekindata_t   *ekind;
     gmx_shellfc_t     shellfc;
     int               count, nconverged = 0;
     double            tcount                 = 0;
@@ -215,7 +217,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     matrix            lastbox;
     int               lamnew  = 0;
     /* for FEP */
-    int               nstfep;
+    int               nstfep = 0;
     double            cycles;
     real              saved_conserved_quantity = 0;
     real              last_ekin                = 0;
@@ -227,9 +229,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                                                           simulation stops. If equal to zero, don't
                                                                           communicate any more between multisims.*/
     /* PME load balancing data for GPU kernels */
-    pme_load_balancing_t pme_loadbal = NULL;
-    double               cycles_pmes;
-    gmx_bool             bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
+    pme_load_balancing_t *pme_loadbal;
+    gmx_bool              bPMETune         = FALSE;
+    gmx_bool              bPMETunePrinting = FALSE;
 
     /* Interactive MD */
     gmx_bool          bIMDstep = FALSE;
@@ -273,19 +275,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     nstglobalcomm   = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
     bGStatEveryStep = (nstglobalcomm == 1);
 
-    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
-    {
-        fprintf(fplog,
-                "To reduce the energy communication with nstlist = -1\n"
-                "the neighbor list validity should not be checked at every step,\n"
-                "this means that exact integration is not guaranteed.\n"
-                "The neighbor list validity is checked after:\n"
-                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
-                "In most cases this will result in exact integration.\n"
-                "This reduces the energy communication by a factor of 2 to 3.\n"
-                "If you want less energy communication, set nstlist > 3.\n\n");
-    }
-
     if (bRerunMD)
     {
         ir->nstxout_compressed = 0;
@@ -317,9 +306,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     /* Kinetic energy data */
     snew(ekind, 1);
     init_ekindata(fplog, top_global, &(ir->opts), ekind);
-    /* needed for iteration of constraints */
-    snew(ekind_save, 1);
-    init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
     /* Copy the cos acceleration to the groups struct */
     ekind->cosacc.cos_accel = ir->cos_accel;
 
@@ -428,7 +414,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                             state_global, top_global, ir,
                             state, &f, mdatoms, top, fr,
                             vsite, shellfc, constr,
-                            nrnb, wcycle, FALSE);
+                            nrnb, NULL, FALSE);
 
     }
 
@@ -482,26 +468,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                         repl_ex_nst, repl_ex_nex, repl_ex_seed);
     }
 
-    /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
-     * PME tuning is not supported with PME only for LJ and not for Coulomb.
+    /* PME tuning is only supported with PME for Coulomb. Is is not supported
+     * with only LJ PME, or for reruns.
      */
-    if ((Flags & MD_TUNEPME) &&
-        EEL_PME(fr->eeltype) &&
-        ( use_GPU(fr->nbv) || !(cr->duty & DUTY_PME)) &&
-        !bRerunMD)
+    bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD &&
+                !(Flags & MD_REPRODUCIBLE));
+    if (bPMETune)
     {
-        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
-        cycles_pmes = 0;
-        if (cr->duty & DUTY_PME)
-        {
-            /* Start tuning right away, as we can't measure the load */
-            bPMETuneRunning = TRUE;
-        }
-        else
-        {
-            /* Separate PME nodes, we can measure the PP/PME load balance */
-            bPMETuneTry = TRUE;
-        }
+        pme_loadbal_init(&pme_loadbal, cr, fplog, ir, state->box,
+                         fr->ic, fr->pmedata, use_GPU(fr->nbv),
+                         &bPMETunePrinting);
     }
 
     if (!ir->bContinuation && !bRerunMD)
@@ -538,21 +514,47 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
     debug_gmx();
 
-    /* set free energy calculation frequency as the minimum
-       greatest common denominator of nstdhdl, nstexpanded, and repl_ex_nst*/
-    nstfep = ir->fepvals->nstdhdl;
-    if (ir->bExpanded)
+    if (IR_TWINRANGE(*ir) && repl_ex_nst % ir->nstcalclr != 0)
     {
-        nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl, nstfep);
+        /* We should exchange at nstcalclr steps to get correct integration */
+        gmx_fatal(FARGS, "The replica exchange period (%d) is not divisible by nstcalclr (%d)", repl_ex_nst, ir->nstcalclr);
     }
-    if (repl_ex_nst > 0)
+
+    if (ir->efep != efepNO)
     {
-        nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep);
+        /* Set free energy calculation frequency as the greatest common
+         * denominator of nstdhdl and repl_ex_nst.
+         * Check for nstcalclr with twin-range, since we need the long-range
+         * contribution to the free-energy at the correct (nstcalclr) steps.
+         */
+        nstfep = ir->fepvals->nstdhdl;
+        if (ir->bExpanded)
+        {
+            if (IR_TWINRANGE(*ir) &&
+                ir->expandedvals->nstexpanded % ir->nstcalclr != 0)
+            {
+                gmx_fatal(FARGS, "nstexpanded should be divisible by nstcalclr");
+            }
+        }
+        if (repl_ex_nst > 0)
+        {
+            nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep);
+        }
+        /* We checked divisibility of repl_ex_nst and nstcalclr above */
+        if (IR_TWINRANGE(*ir) && nstfep % ir->nstcalclr != 0)
+        {
+            gmx_incons("nstfep not divisible by nstcalclr");
+        }
     }
 
-    /* I'm assuming we need global communication the first time! MRS */
+    /* Be REALLY careful about what flags you set here. You CANNOT assume
+     * this is the first step, since we might be restarting from a checkpoint,
+     * and in that case we should not do any modifications to the state.
+     */
+    bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
+
     cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
-                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
+                  | (bStopCM ? CGLO_STOPCM : 0)
                   | (bVV ? CGLO_PRESSURE : 0)
                   | (bVV ? CGLO_CONSTRAINT : 0)
                   | (bRerunMD ? CGLO_RERUNMD : 0)
@@ -746,6 +748,20 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     while (!bLastStep || (bRerunMD && bNotLastFrame))
     {
 
+        /* Determine if this is a neighbor search step */
+        bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+
+        if (bPMETune && bNStList)
+        {
+            /* PME grid + cut-off optimization with GPUs or PME nodes */
+            pme_loadbal_do(pme_loadbal, cr,
+                           (bVerbose && MASTER(cr)) ? stderr : NULL,
+                           fplog,
+                           ir, fr, state, wcycle,
+                           step, step_rel,
+                           &bPMETunePrinting);
+        }
+
         wallcycle_start(wcycle, ewcSTEP);
 
         if (bRerunMD)
@@ -777,7 +793,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
             set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
             bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
-            bDoFEP       = (do_per_step(step, nstfep) && (ir->efep != efepNO));
+            bDoFEP       = ((ir->efep != efepNO) && do_per_step(step, nstfep));
             bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
                             && (ir->bExpanded) && (step > 0) && (!bStartingFromCpt));
         }
@@ -860,9 +876,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         else
         {
             /* Determine whether or not to do Neighbour Searching and LR */
-            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
-
-            bNS = (bFirstStep || bExchanged || bNeedRepartition || bNStList || bDoFEP);
+            bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
         }
 
         /* check whether we should stop because another simulation has
@@ -930,16 +944,13 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             if (DOMAINDECOMP(cr))
             {
                 /* Repartition the domain decomposition */
-                wallcycle_start(wcycle, ewcDOMDEC);
                 dd_partition_system(fplog, step, cr,
                                     bMasterState, nstglobalcomm,
                                     state_global, top_global, ir,
                                     state, &f, mdatoms, top, fr,
                                     vsite, shellfc, constr,
                                     nrnb, wcycle,
-                                    do_verbose && !bPMETuneRunning);
-                wallcycle_stop(wcycle, ewcDOMDEC);
-                /* If using an iterative integrator, reallocate space to match the decomposition */
+                                    do_verbose && !bPMETunePrinting);
             }
         }
 
@@ -1111,96 +1122,92 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                           ekind, M, upd, bInitStep, etrtVELOCITY1,
                           cr, nrnb, constr, &top->idef);
 
-            /* TODO remove the brace below, once iteration is removed */
+            if (!bRerunMD || rerun_fr.bV || bForceUpdate)         /* Why is rerun_fr.bV here?  Unclear. */
             {
-                if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
-                {
-                    wallcycle_stop(wcycle, ewcUPDATE);
-                    update_constraints(fplog, step, NULL, ir, mdatoms,
-                                       state, fr->bMolPBC, graph, f,
-                                       &top->idef, shake_vir,
-                                       cr, nrnb, wcycle, upd, constr,
-                                       TRUE, bCalcVir);
-                    wallcycle_start(wcycle, ewcUPDATE);
-                    if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
-                    {
-                        /* Correct the virial for multiple time stepping */
-                        m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
-                    }
-                }
-                else if (graph)
-                {
-                    /* Need to unshift here if a do_force has been
-                       called in the previous step */
-                    unshift_self(graph, state->box, state->x);
-                }
-                /* if VV, compute the pressure and constraints */
-                /* For VV2, we strictly only need this if using pressure
-                 * control, but we really would like to have accurate pressures
-                 * printed out.
-                 * Think about ways around this in the future?
-                 * For now, keep this choice in comments.
-                 */
-                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
-                /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
-                bPres = TRUE;
-                bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
-                if (bCalcEner && ir->eI == eiVVAK)
+                wallcycle_stop(wcycle, ewcUPDATE);
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, shake_vir,
+                                   cr, nrnb, wcycle, upd, constr,
+                                   TRUE, bCalcVir);
+                wallcycle_start(wcycle, ewcUPDATE);
+                if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
                 {
-                    bSumEkinhOld = TRUE;
+                    /* Correct the virial for multiple time stepping */
+                    m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
                 }
-                /* for vv, the first half of the integration actually corresponds to the previous step.
-                   So we need information from the last step in the first half of the integration */
-                if (bGStat || do_per_step(step-1, nstglobalcomm))
+            }
+            else if (graph)
+            {
+                /* Need to unshift here if a do_force has been
+                   called in the previous step */
+                unshift_self(graph, state->box, state->x);
+            }
+            /* if VV, compute the pressure and constraints */
+            /* For VV2, we strictly only need this if using pressure
+             * control, but we really would like to have accurate pressures
+             * printed out.
+             * Think about ways around this in the future?
+             * For now, keep this choice in comments.
+             */
+            /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+            /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+            bPres = TRUE;
+            bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+            if (bCalcEner && ir->eI == eiVVAK)
+            {
+                bSumEkinhOld = TRUE;
+            }
+            /* for vv, the first half of the integration actually corresponds to the previous step.
+               So we need information from the last step in the first half of the integration */
+            if (bGStat || do_per_step(step-1, nstglobalcomm))
+            {
+                wallcycle_stop(wcycle, ewcUPDATE);
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, NULL, FALSE, state->box,
+                                top_global, &bSumEkinhOld,
+                                cglo_flags
+                                | CGLO_ENERGY
+                                | (bTemp ? CGLO_TEMPERATURE : 0)
+                                | (bPres ? CGLO_PRESSURE : 0)
+                                | (bPres ? CGLO_CONSTRAINT : 0)
+                                | CGLO_SCALEEKIN
+                                );
+                /* explanation of above:
+                   a) We compute Ekin at the full time step
+                   if 1) we are using the AveVel Ekin, and it's not the
+                   initial step, or 2) if we are using AveEkin, but need the full
+                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+                   EkinAveVel because it's needed for the pressure */
+                wallcycle_start(wcycle, ewcUPDATE);
+            }
+            /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+            if (!bInitStep)
+            {
+                if (bTrotter)
                 {
-                    wallcycle_stop(wcycle, ewcUPDATE);
-                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
-                                    constr, NULL, FALSE, state->box,
-                                    top_global, &bSumEkinhOld,
-                                    cglo_flags
-                                    | CGLO_ENERGY
-                                    | (bTemp ? CGLO_TEMPERATURE : 0)
-                                    | (bPres ? CGLO_PRESSURE : 0)
-                                    | (bPres ? CGLO_CONSTRAINT : 0)
-                                    | CGLO_SCALEEKIN
-                                    );
-                    /* explanation of above:
-                       a) We compute Ekin at the full time step
-                       if 1) we are using the AveVel Ekin, and it's not the
-                       initial step, or 2) if we are using AveEkin, but need the full
-                       time step kinetic energy for the pressure (always true now, since we want accurate statistics).
-                       b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
-                       EkinAveVel because it's needed for the pressure */
-                    wallcycle_start(wcycle, ewcUPDATE);
+                    m_add(force_vir, shake_vir, total_vir);     /* we need the un-dispersion corrected total vir here */
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
                 }
-                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
-                if (!bInitStep)
+                else
                 {
-                    if (bTrotter)
+                    if (bExchanged)
                     {
-                        m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
-                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
-                    }
-                    else
-                    {
-                        if (bExchanged)
-                        {
-                            wallcycle_stop(wcycle, ewcUPDATE);
-                            /* We need the kinetic energy at minus the half step for determining
-                             * the full step kinetic energy and possibly for T-coupling.*/
-                            /* This may not be quite working correctly yet . . . . */
-                            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
-                                            constr, NULL, FALSE, state->box,
-                                            top_global, &bSumEkinhOld,
-                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
-                            wallcycle_start(wcycle, ewcUPDATE);
-                        }
+                        wallcycle_stop(wcycle, ewcUPDATE);
+                        /* We need the kinetic energy at minus the half step for determining
+                         * the full step kinetic energy and possibly for T-coupling.*/
+                        /* This may not be quite working correctly yet . . . . */
+                        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                        wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                                        constr, NULL, FALSE, state->box,
+                                        top_global, &bSumEkinhOld,
+                                        CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+                        wallcycle_start(wcycle, ewcUPDATE);
                     }
                 }
             }
-            /* TODO remove the brace above, once iteration is removed */
             if (bTrotter && !bInitStep)
             {
                 copy_mat(shake_vir, state->svir_prev);
@@ -1364,63 +1371,95 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 }
             }
         }
-        /* TODO remove the brace below, once iteration is removed */
+        /* #########   START SECOND UPDATE STEP ################# */
+        /* Box is changed in update() when we do pressure coupling,
+         * but we should still use the old box for energy corrections and when
+         * writing it to the energy file, so it matches the trajectory files for
+         * the same timestep above. Make a copy in a separate array.
+         */
+        copy_mat(state->box, lastbox);
+
+        dvdl_constr = 0;
+
+        if (!bRerunMD || rerun_fr.bV || bForceUpdate)
         {
-            /* #########   START SECOND UPDATE STEP ################# */
-            /* Box is changed in update() when we do pressure coupling,
-             * but we should still use the old box for energy corrections and when
-             * writing it to the energy file, so it matches the trajectory files for
-             * the same timestep above. Make a copy in a separate array.
-             */
-            copy_mat(state->box, lastbox);
+            wallcycle_start(wcycle, ewcUPDATE);
+            /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+            if (bTrotter)
+            {
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+                /* We can only do Berendsen coupling after we have summed
+                 * the kinetic energy or virial. Since the happens
+                 * in global_state after update, we should only do it at
+                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
+                 */
+            }
+            else
+            {
+                update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
+                update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep);
+            }
 
-            dvdl_constr = 0;
+            if (bVV)
+            {
+                bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+                /* velocity half-step update */
+                update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                              bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
+                              ekind, M, upd, FALSE, etrtVELOCITY2,
+                              cr, nrnb, constr, &top->idef);
+            }
+
+            /* Above, initialize just copies ekinh into ekin,
+             * it doesn't copy position (for VV),
+             * and entire integrator for MD.
+             */
 
-            if (!bRerunMD || rerun_fr.bV || bForceUpdate)
+            if (ir->eI == eiVVAK)
             {
-                wallcycle_start(wcycle, ewcUPDATE);
-                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
-                if (bTrotter)
+                /* We probably only need md->homenr, not state->natoms */
+                if (state->natoms > cbuf_nalloc)
                 {
-                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
-                    /* We can only do Berendsen coupling after we have summed
-                     * the kinetic energy or virial. Since the happens
-                     * in global_state after update, we should only do it at
-                     * step % nstlist = 1 with bGStatEveryStep=FALSE.
-                     */
-                }
-                else
-                {
-                    update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
-                    update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep);
+                    cbuf_nalloc = state->natoms;
+                    srenew(cbuf, cbuf_nalloc);
                 }
+                copy_rvecn(state->x, cbuf, 0, state->natoms);
+            }
+            bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
-                if (bVV)
-                {
-                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+            update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                          bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
+                          ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+            wallcycle_stop(wcycle, ewcUPDATE);
 
-                    /* velocity half-step update */
-                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                                  bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
-                                  ekind, M, upd, FALSE, etrtVELOCITY2,
-                                  cr, nrnb, constr, &top->idef);
-                }
+            update_constraints(fplog, step, &dvdl_constr, ir, mdatoms, state,
+                               fr->bMolPBC, graph, f,
+                               &top->idef, shake_vir,
+                               cr, nrnb, wcycle, upd, constr,
+                               FALSE, bCalcVir);
 
-                /* Above, initialize just copies ekinh into ekin,
-                 * it doesn't copy position (for VV),
-                 * and entire integrator for MD.
-                 */
+            if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
+            {
+                /* Correct the virial for multiple time stepping */
+                m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
+            }
+
+            if (ir->eI == eiVVAK)
+            {
+                /* erase F_EKIN and F_TEMP here? */
+                /* just compute the kinetic energy at the half step to perform a trotter step */
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, NULL, FALSE, lastbox,
+                                top_global, &bSumEkinhOld,
+                                cglo_flags | CGLO_TEMPERATURE
+                                );
+                wallcycle_start(wcycle, ewcUPDATE);
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+                /* now we know the scaling, we can compute the positions again again */
+                copy_rvecn(cbuf, state->x, 0, state->natoms);
 
-                if (ir->eI == eiVVAK)
-                {
-                    /* We probably only need md->homenr, not state->natoms */
-                    if (state->natoms > cbuf_nalloc)
-                    {
-                        cbuf_nalloc = state->natoms;
-                        srenew(cbuf, cbuf_nalloc);
-                    }
-                    copy_rvecn(state->x, cbuf, 0, state->natoms);
-                }
                 bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
                 update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
@@ -1428,129 +1467,93 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                               ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
                 wallcycle_stop(wcycle, ewcUPDATE);
 
-                update_constraints(fplog, step, &dvdl_constr, ir, mdatoms, state,
-                                   fr->bMolPBC, graph, f,
-                                   &top->idef, shake_vir,
-                                   cr, nrnb, wcycle, upd, constr,
+                /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+                /* are the small terms in the shake_vir here due
+                 * to numerical errors, or are they important
+                 * physically? I'm thinking they are just errors, but not completely sure.
+                 * For now, will call without actually constraining, constr=NULL*/
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, tmp_vir,
+                                   cr, nrnb, wcycle, upd, NULL,
                                    FALSE, bCalcVir);
-
-                if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
-                {
-                    /* Correct the virial for multiple time stepping */
-                    m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
-                }
-
-                if (ir->eI == eiVVAK)
-                {
-                    /* erase F_EKIN and F_TEMP here? */
-                    /* just compute the kinetic energy at the half step to perform a trotter step */
-                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
-                                    constr, NULL, FALSE, lastbox,
-                                    top_global, &bSumEkinhOld,
-                                    cglo_flags | CGLO_TEMPERATURE
-                                    );
-                    wallcycle_start(wcycle, ewcUPDATE);
-                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
-                    /* now we know the scaling, we can compute the positions again again */
-                    copy_rvecn(cbuf, state->x, 0, state->natoms);
-
-                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
-
-                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                                  bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
-                                  ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
-                    wallcycle_stop(wcycle, ewcUPDATE);
-
-                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
-                    /* are the small terms in the shake_vir here due
-                     * to numerical errors, or are they important
-                     * physically? I'm thinking they are just errors, but not completely sure.
-                     * For now, will call without actually constraining, constr=NULL*/
-                    update_constraints(fplog, step, NULL, ir, mdatoms,
-                                       state, fr->bMolPBC, graph, f,
-                                       &top->idef, tmp_vir,
-                                       cr, nrnb, wcycle, upd, NULL,
-                                       FALSE, bCalcVir);
-                }
-                if (bVV)
-                {
-                    /* this factor or 2 correction is necessary
-                       because half of the constraint force is removed
-                       in the vv step, so we have to double it.  See
-                       the Redmine issue #1255.  It is not yet clear
-                       if the factor of 2 is exact, or just a very
-                       good approximation, and this will be
-                       investigated.  The next step is to see if this
-                       can be done adding a dhdl contribution from the
-                       rattle step, but this is somewhat more
-                       complicated with the current code. Will be
-                       investigated, hopefully for 4.6.3. However,
-                       this current solution is much better than
-                       having it completely wrong.
-                     */
-                    enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
-                }
-                else
-                {
-                    enerd->term[F_DVDL_CONSTR] += dvdl_constr;
-                }
             }
-            else if (graph)
+            if (bVV)
             {
-                /* Need to unshift here */
-                unshift_self(graph, state->box, state->x);
+                /* this factor or 2 correction is necessary
+                   because half of the constraint force is removed
+                   in the vv step, so we have to double it.  See
+                   the Redmine issue #1255.  It is not yet clear
+                   if the factor of 2 is exact, or just a very
+                   good approximation, and this will be
+                   investigated.  The next step is to see if this
+                   can be done adding a dhdl contribution from the
+                   rattle step, but this is somewhat more
+                   complicated with the current code. Will be
+                   investigated, hopefully for 4.6.3. However,
+                   this current solution is much better than
+                   having it completely wrong.
+                 */
+                enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
             }
-
-            if (vsite != NULL)
+            else
             {
-                wallcycle_start(wcycle, ewcVSITECONSTR);
-                if (graph != NULL)
-                {
-                    shift_self(graph, state->box, state->x);
-                }
-                construct_vsites(vsite, state->x, ir->delta_t, state->v,
-                                 top->idef.iparams, top->idef.il,
-                                 fr->ePBC, fr->bMolPBC, cr, state->box);
-
-                if (graph != NULL)
-                {
-                    unshift_self(graph, state->box, state->x);
-                }
-                wallcycle_stop(wcycle, ewcVSITECONSTR);
+                enerd->term[F_DVDL_CONSTR] += dvdl_constr;
             }
+        }
+        else if (graph)
+        {
+            /* Need to unshift here */
+            unshift_self(graph, state->box, state->x);
+        }
 
-            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints  ############ */
-            /* With Leap-Frog we can skip compute_globals at
-             * non-communication steps, but we need to calculate
-             * the kinetic energy one step before communication.
-             */
-            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+        if (vsite != NULL)
+        {
+            wallcycle_start(wcycle, ewcVSITECONSTR);
+            if (graph != NULL)
             {
-                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
-                                constr, &gs,
-                                (step_rel % gs.nstms == 0) &&
-                                (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
-                                lastbox,
-                                top_global, &bSumEkinhOld,
-                                cglo_flags
-                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
-                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
-                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
-                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
-                                | CGLO_CONSTRAINT
-                                );
+                shift_self(graph, state->box, state->x);
             }
+            construct_vsites(vsite, state->x, ir->delta_t, state->v,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
 
-            /* #############  END CALC EKIN AND PRESSURE ################# */
+            if (graph != NULL)
+            {
+                unshift_self(graph, state->box, state->x);
+            }
+            wallcycle_stop(wcycle, ewcVSITECONSTR);
+        }
 
-            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
-               the virial that should probably be addressed eventually. state->veta has better properies,
-               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
-               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+        /* ############## IF NOT VV, Calculate globals HERE  ############ */
+        /* With Leap-Frog we can skip compute_globals at
+         * non-communication steps, but we need to calculate
+         * the kinetic energy one step before communication.
+         */
+        if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+        {
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                            wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                            constr, &gs,
+                            (step_rel % gs.nstms == 0) &&
+                            (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+                            lastbox,
+                            top_global, &bSumEkinhOld,
+                            cglo_flags
+                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+                            | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+                            | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+                            | CGLO_CONSTRAINT
+                            );
         }
-        /* TODO remove the brace above, once iteration is removed */
+
+        /* #############  END CALC EKIN AND PRESSURE ################# */
+
+        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+           the virial that should probably be addressed eventually. state->veta has better properies,
+           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
 
         if (ir->efep != efepNO && (!bVV || bRerunMD))
         {
@@ -1627,7 +1630,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             }
             if (ir->bPull)
             {
-                pull_print_output(ir->pull, step, t);
+                pull_print_output(ir->pull_work, step, t);
             }
 
             if (do_per_step(step, ir->nstlog))
@@ -1645,7 +1648,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             state->fep_state = lamnew;
         }
         /* Print the remaining wall clock time for the run */
-        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+        if (MULTIMASTER(cr) &&
+            (do_verbose || gmx_got_usr_signal()) &&
+            !bPMETunePrinting)
         {
             if (shellfc)
             {
@@ -1729,97 +1734,17 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             }
         }
 
-        if (!bRerunMD || !rerun_fr.bStep)
-        {
-            /* increase the MD step number */
-            step++;
-            step_rel++;
-        }
-
         cycles = wallcycle_stop(wcycle, ewcSTEP);
         if (DOMAINDECOMP(cr) && wcycle)
         {
             dd_cycles_add(cr->dd, cycles, ddCyclStep);
         }
 
-        if (bPMETuneRunning || bPMETuneTry)
+        if (!bRerunMD || !rerun_fr.bStep)
         {
-            /* PME grid + cut-off optimization with GPUs or PME nodes */
-
-            /* Count the total cycles over the last steps */
-            cycles_pmes += cycles;
-
-            /* We can only switch cut-off at NS steps */
-            if (step % ir->nstlist == 0)
-            {
-                /* PME grid + cut-off optimization with GPUs or PME nodes */
-                if (bPMETuneTry)
-                {
-                    if (DDMASTER(cr->dd))
-                    {
-                        /* PME node load is too high, start tuning */
-                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
-                    }
-                    dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
-
-                    if (bPMETuneRunning &&
-                        use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
-                        !(cr->duty & DUTY_PME))
-                    {
-                        /* Lock DLB=auto to off (does nothing when DLB=yes/no).
-                         * With GPUs + separate PME ranks, we don't want DLB.
-                         * This could happen when we scan coarse grids and
-                         * it would then never be turned off again.
-                         * This would hurt performance at the final, optimal
-                         * grid spacing, where DLB almost never helps.
-                         * Also, DLB can limit the cut-off for PME tuning.
-                         */
-                        dd_dlb_set_lock(cr->dd, TRUE);
-                    }
-
-                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
-                    {
-                        bPMETuneTry     = FALSE;
-                    }
-                }
-                if (bPMETuneRunning)
-                {
-                    /* init_step might not be a multiple of nstlist,
-                     * but the first cycle is always skipped anyhow.
-                     */
-                    bPMETuneRunning =
-                        pme_load_balance(pme_loadbal, cr,
-                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
-                                         fplog,
-                                         ir, state, cycles_pmes,
-                                         fr->ic, fr->nbv, &fr->pmedata,
-                                         step);
-
-                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
-                    fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
-                    fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
-                    fr->rlist         = fr->ic->rlist;
-                    fr->rlistlong     = fr->ic->rlistlong;
-                    fr->rcoulomb      = fr->ic->rcoulomb;
-                    fr->rvdw          = fr->ic->rvdw;
-
-                    if (ir->eDispCorr != edispcNO)
-                    {
-                        calc_enervirdiff(NULL, ir->eDispCorr, fr);
-                    }
-
-                    if (!bPMETuneRunning &&
-                        DOMAINDECOMP(cr) &&
-                        dd_dlb_is_locked(cr->dd))
-                    {
-                        /* Unlock the DLB=auto, DLB is allowed to activate
-                         * (but we don't expect it to activate in most cases).
-                         */
-                        dd_dlb_set_lock(cr->dd, FALSE);
-                    }
-                }
-                cycles_pmes = 0;
-            }
+            /* increase the MD step number */
+            step++;
+            step_rel++;
         }
 
         if (step_rel == wcycle_get_reset_counters(wcycle) ||
@@ -1877,10 +1802,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     done_mdoutf(outf);
     debug_gmx();
 
-    if (pme_loadbal != NULL)
+    if (bPMETune)
     {
-        pme_loadbal_done(pme_loadbal, cr, fplog,
-                         use_GPU(fr->nbv));
+        pme_loadbal_done(pme_loadbal, cr, fplog, use_GPU(fr->nbv));
     }
 
     if (shellfc && fplog)
diff --git a/patches/gromacs-5.1.0.diff/src/programs/mdrun/mdrun.cpp b/patches/gromacs-5.1.0.diff/src/programs/mdrun/mdrun.cpp
index d59a3bef4..ac0ec96e0 100644
--- a/patches/gromacs-5.1.0.diff/src/programs/mdrun/mdrun.cpp
+++ b/patches/gromacs-5.1.0.diff/src/programs/mdrun/mdrun.cpp
@@ -59,8 +59,6 @@
 
 #include "gromacs/commandline/pargs.h"
 #include "gromacs/fileio/filenm.h"
-#include "gromacs/legacyheaders/checkpoint.h"
-#include "gromacs/legacyheaders/copyrite.h"
 #include "gromacs/legacyheaders/macros.h"
 #include "gromacs/legacyheaders/main.h"
 #include "gromacs/legacyheaders/mdrun.h"
@@ -68,6 +66,7 @@
 #include "gromacs/legacyheaders/readinp.h"
 #include "gromacs/legacyheaders/typedefs.h"
 #include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/mdrunutility/handlerestart.h"
 #include "gromacs/utility/fatalerror.h"
 
 #include "mdrun_main.h"
@@ -154,9 +153,11 @@ int gmx_mdrun(int argc, char *argv[])
         "investigation are: polarizability.",
         "[PAR]",
         "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed",
-        "a protein into a membrane. The data file should contain the options",
-        "that where passed to g_membed before. The [TT]-mn[tt] and [TT]-mp[tt]",
-        "both apply to this as well.",
+        "a protein into a membrane. This module requires a number of settings",
+        "that are provided in a data file that is the argument of this option.",
+        "For more details in membrane embedding, see the documentation in the",
+        "user guide. The options [TT]-mn[tt] and [TT]-mp[tt] are used to provide",
+        "the index and topology files used for the embedding.",
         "[PAR]",
         "The option [TT]-pforce[tt] is useful when you suspect a simulation",
         "crashes due to too large forces. With this option coordinates and",
@@ -173,7 +174,7 @@ int gmx_mdrun(int argc, char *argv[])
         "with the step number.",
         "A simulation can be continued by reading the full state from file",
         "with option [TT]-cpi[tt]. This option is intelligent in the way that",
-        "if no checkpoint file is found, Gromacs just assumes a normal run and",
+        "if no checkpoint file is found, GROMACS just assumes a normal run and",
         "starts from the first step of the [REF].tpr[ref] file. By default the output",
         "will be appending to the existing output files. The checkpoint file",
         "contains checksums of all output files, such that you will never",
@@ -308,11 +309,10 @@ int gmx_mdrun(int argc, char *argv[])
     real            rdd                   = 0.0, rconstr = 0.0, dlb_scale = 0.8, pforce = -1;
     char           *ddcsx                 = NULL, *ddcsy = NULL, *ddcsz = NULL;
     real            cpt_period            = 15.0, max_hours = -1;
-    gmx_bool        bAppendFiles          = TRUE;
+    gmx_bool        bTryToAppendFiles     = TRUE;
     gmx_bool        bKeepAndNumCPT        = FALSE;
     gmx_bool        bResetCountersHalfWay = FALSE;
     output_env_t    oenv                  = NULL;
-    const char     *deviceOptions         = "";
 
     /* Non transparent initialization of a complex gmx_hw_opt_t struct.
      * But unfortunately we are not allowed to call a function here,
@@ -392,7 +392,7 @@ int gmx_mdrun(int argc, char *argv[])
           "Checkpoint interval (minutes)" },
         { "-cpnum",   FALSE, etBOOL, {&bKeepAndNumCPT},
           "Keep and number checkpoint files" },
-        { "-append",  FALSE, etBOOL, {&bAppendFiles},
+        { "-append",  FALSE, etBOOL, {&bTryToAppendFiles},
           "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
         { "-nsteps",  FALSE, etINT64, {&nsteps},
           "Run this number of steps, overrides .mdp file option (-1 means infinite, -2 means use mdp option, smaller is invalid)" },
@@ -428,11 +428,8 @@ int gmx_mdrun(int argc, char *argv[])
     unsigned long   Flags;
     ivec            ddxyz;
     int             dd_node_order;
-    gmx_bool        bAddPart;
-    FILE           *fplog, *fpmulti;
-    int             sim_part, sim_part_fn;
-    const char     *part_suffix = ".part";
-    char            suffix[STRLEN];
+    gmx_bool        bDoAppendFiles, bStartFromCpt;
+    FILE           *fplog;
     int             rc;
     char          **multidir = NULL;
 
@@ -497,76 +494,19 @@ int gmx_mdrun(int argc, char *argv[])
         gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
     }
 
-    if (nmultisim > 1)
+    if (nmultisim >= 1)
     {
 #ifndef GMX_THREAD_MPI
         gmx_bool bParFn = (multidir == NULL);
         init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
 #else
-        gmx_fatal(FARGS, "mdrun -multi is not supported with the thread library. "
-                  "Please compile GROMACS with MPI support");
+        gmx_fatal(FARGS, "mdrun -multi or -multidir are not supported with the thread-MPI library. "
+                  "Please compile GROMACS with a proper external MPI library.");
 #endif
     }
 
-    bAddPart = !bAppendFiles;
-
-    /* Check if there is ANY checkpoint file available */
-    sim_part    = 1;
-    sim_part_fn = sim_part;
-    if (opt2bSet("-cpi", NFILE, fnm))
-    {
-        bAppendFiles =
-            read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE,
-                                                          fnm, cr),
-                                            &sim_part_fn, cr,
-                                            bAppendFiles, NFILE, fnm,
-                                            part_suffix, &bAddPart);
-        if (sim_part_fn == 0 && MULTIMASTER(cr))
-        {
-            fprintf(stdout, "No previous checkpoint file present, assuming this is a new run.\n");
-        }
-        else
-        {
-            sim_part = sim_part_fn + 1;
-        }
-
-        if (MULTISIM(cr) && MASTER(cr))
-        {
-            if (MULTIMASTER(cr))
-            {
-                /* Log file is not yet available, so if there's a
-                 * problem we can only write to stderr. */
-                fpmulti = stderr;
-            }
-            else
-            {
-                fpmulti = NULL;
-            }
-            check_multi_int(fpmulti, cr->ms, sim_part, "simulation part", TRUE);
-        }
-    }
-    else
-    {
-        bAppendFiles = FALSE;
-    }
-
-    if (!bAppendFiles)
-    {
-        sim_part_fn = sim_part;
-    }
-
-    if (bAddPart)
-    {
-        /* Rename all output files (except checkpoint files) */
-        /* create new part name first (zero-filled) */
-        sprintf(suffix, "%s%04d", part_suffix, sim_part_fn);
-
-        add_suffix_to_output_names(fnm, NFILE, suffix);
-        if (MULTIMASTER(cr))
-        {
-            fprintf(stdout, "Checkpoint file is from part %d, new output files will be suffixed '%s'.\n", sim_part-1, suffix);
-        }
-    }
+    handleRestart(cr, bTryToAppendFiles, NFILE, fnm,
+                  &bDoAppendFiles, &bStartFromCpt);
 
     Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0;
     Flags = Flags | (bDDBondCheck  ? MD_DDBONDCHECK  : 0);
@@ -575,11 +515,12 @@ int gmx_mdrun(int argc, char *argv[])
     Flags = Flags | (bConfout      ? MD_CONFOUT      : 0);
     Flags = Flags | (bRerunVSite   ? MD_RERUN_VSITE  : 0);
     Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
-    Flags = Flags | (bAppendFiles  ? MD_APPENDFILES  : 0);
+    Flags = Flags | (bDoAppendFiles ? MD_APPENDFILES  : 0);
     Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0);
     Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0);
-    Flags = Flags | (sim_part > 1    ? MD_STARTFROMCPT : 0);
+    Flags = Flags | (bStartFromCpt ? MD_STARTFROMCPT : 0);
     Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0);
+    Flags = Flags | (opt2parg_bSet("-ntomp", asize(pa), pa) ? MD_NTOMPSET : 0);
     Flags = Flags | (bIMDwait      ? MD_IMDWAIT      : 0);
     Flags = Flags | (bIMDterm      ? MD_IMDTERM      : 0);
     Flags = Flags | (bIMDpull      ? MD_IMDPULL      : 0);
@@ -587,14 +528,10 @@ int gmx_mdrun(int argc, char *argv[])
     /* We postpone opening the log file if we are appending, so we can
        first truncate the old log file and append to the correct position
        there instead.  */
-    if (MASTER(cr) && !bAppendFiles)
+    if (MASTER(cr) && !bDoAppendFiles)
     {
         gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr,
                      Flags & MD_APPENDFILES, &fplog);
-        please_cite(fplog, "Hess2008b");
-        please_cite(fplog, "Spoel2005a");
-        please_cite(fplog, "Lindahl2001a");
-        please_cite(fplog, "Berendsen95a");
     }
     else
     {
@@ -637,7 +574,7 @@ int gmx_mdrun(int argc, char *argv[])
                   nbpu_opt[0], nstlist,
                   nsteps, nstepout, resetstep,
                   nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed,
-                  pforce, cpt_period, max_hours, deviceOptions, imdport, Flags);
+                  pforce, cpt_period, max_hours, imdport, Flags);
 
     /* PLUMED */
     if(plumedswitch){
@@ -647,7 +584,7 @@ int gmx_mdrun(int argc, char *argv[])
 
     /* Log file has to be closed in mdrunner if we are appending to it
        (fplog not set here) */
-    if (MASTER(cr) && !bAppendFiles)
+    if (MASTER(cr) && !bDoAppendFiles)
     {
         gmx_log_close(fplog);
     }
diff --git a/patches/gromacs-5.1.0.diff/src/programs/mdrun/mdrun.cpp.preplumed b/patches/gromacs-5.1.0.diff/src/programs/mdrun/mdrun.cpp.preplumed
index f1e045a2e..fe58475f3 100644
--- a/patches/gromacs-5.1.0.diff/src/programs/mdrun/mdrun.cpp.preplumed
+++ b/patches/gromacs-5.1.0.diff/src/programs/mdrun/mdrun.cpp.preplumed
@@ -59,8 +59,6 @@
 
 #include "gromacs/commandline/pargs.h"
 #include "gromacs/fileio/filenm.h"
-#include "gromacs/legacyheaders/checkpoint.h"
-#include "gromacs/legacyheaders/copyrite.h"
 #include "gromacs/legacyheaders/macros.h"
 #include "gromacs/legacyheaders/main.h"
 #include "gromacs/legacyheaders/mdrun.h"
@@ -68,6 +66,7 @@
 #include "gromacs/legacyheaders/readinp.h"
 #include "gromacs/legacyheaders/typedefs.h"
 #include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/mdrunutility/handlerestart.h"
 #include "gromacs/utility/fatalerror.h"
 
 #include "mdrun_main.h"
@@ -147,9 +146,11 @@ int gmx_mdrun(int argc, char *argv[])
         "investigation are: polarizability.",
         "[PAR]",
         "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed",
-        "a protein into a membrane. The data file should contain the options",
-        "that where passed to g_membed before. The [TT]-mn[tt] and [TT]-mp[tt]",
-        "both apply to this as well.",
+        "a protein into a membrane. This module requires a number of settings",
+        "that are provided in a data file that is the argument of this option.",
+        "For more details in membrane embedding, see the documentation in the",
+        "user guide. The options [TT]-mn[tt] and [TT]-mp[tt] are used to provide",
+        "the index and topology files used for the embedding.",
         "[PAR]",
         "The option [TT]-pforce[tt] is useful when you suspect a simulation",
         "crashes due to too large forces. With this option coordinates and",
@@ -166,7 +167,7 @@ int gmx_mdrun(int argc, char *argv[])
         "with the step number.",
         "A simulation can be continued by reading the full state from file",
         "with option [TT]-cpi[tt]. This option is intelligent in the way that",
-        "if no checkpoint file is found, Gromacs just assumes a normal run and",
+        "if no checkpoint file is found, GROMACS just assumes a normal run and",
         "starts from the first step of the [REF].tpr[ref] file. By default the output",
         "will be appending to the existing output files. The checkpoint file",
         "contains checksums of all output files, such that you will never",
@@ -300,11 +301,10 @@ int gmx_mdrun(int argc, char *argv[])
     real            rdd                   = 0.0, rconstr = 0.0, dlb_scale = 0.8, pforce = -1;
     char           *ddcsx                 = NULL, *ddcsy = NULL, *ddcsz = NULL;
     real            cpt_period            = 15.0, max_hours = -1;
-    gmx_bool        bAppendFiles          = TRUE;
+    gmx_bool        bTryToAppendFiles     = TRUE;
     gmx_bool        bKeepAndNumCPT        = FALSE;
     gmx_bool        bResetCountersHalfWay = FALSE;
     output_env_t    oenv                  = NULL;
-    const char     *deviceOptions         = "";
 
     /* Non transparent initialization of a complex gmx_hw_opt_t struct.
      * But unfortunately we are not allowed to call a function here,
@@ -384,7 +384,7 @@ int gmx_mdrun(int argc, char *argv[])
           "Checkpoint interval (minutes)" },
         { "-cpnum",   FALSE, etBOOL, {&bKeepAndNumCPT},
           "Keep and number checkpoint files" },
-        { "-append",  FALSE, etBOOL, {&bAppendFiles},
+        { "-append",  FALSE, etBOOL, {&bTryToAppendFiles},
           "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
         { "-nsteps",  FALSE, etINT64, {&nsteps},
           "Run this number of steps, overrides .mdp file option (-1 means infinite, -2 means use mdp option, smaller is invalid)" },
@@ -420,11 +420,8 @@ int gmx_mdrun(int argc, char *argv[])
     unsigned long   Flags;
     ivec            ddxyz;
     int             dd_node_order;
-    gmx_bool        bAddPart;
-    FILE           *fplog, *fpmulti;
-    int             sim_part, sim_part_fn;
-    const char     *part_suffix = ".part";
-    char            suffix[STRLEN];
+    gmx_bool        bDoAppendFiles, bStartFromCpt;
+    FILE           *fplog;
     int             rc;
     char          **multidir = NULL;
 
@@ -489,76 +486,19 @@ int gmx_mdrun(int argc, char *argv[])
         gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
     }
 
-    if (nmultisim > 1)
+    if (nmultisim >= 1)
     {
 #ifndef GMX_THREAD_MPI
         gmx_bool bParFn = (multidir == NULL);
         init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
 #else
-        gmx_fatal(FARGS, "mdrun -multi is not supported with the thread library. "
-                  "Please compile GROMACS with MPI support");
+        gmx_fatal(FARGS, "mdrun -multi or -multidir are not supported with the thread-MPI library. "
+                  "Please compile GROMACS with a proper external MPI library.");
 #endif
     }
 
-    bAddPart = !bAppendFiles;
-
-    /* Check if there is ANY checkpoint file available */
-    sim_part    = 1;
-    sim_part_fn = sim_part;
-    if (opt2bSet("-cpi", NFILE, fnm))
-    {
-        bAppendFiles =
-            read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE,
-                                                          fnm, cr),
-                                            &sim_part_fn, cr,
-                                            bAppendFiles, NFILE, fnm,
-                                            part_suffix, &bAddPart);
-        if (sim_part_fn == 0 && MULTIMASTER(cr))
-        {
-            fprintf(stdout, "No previous checkpoint file present, assuming this is a new run.\n");
-        }
-        else
-        {
-            sim_part = sim_part_fn + 1;
-        }
-
-        if (MULTISIM(cr) && MASTER(cr))
-        {
-            if (MULTIMASTER(cr))
-            {
-                /* Log file is not yet available, so if there's a
-                 * problem we can only write to stderr. */
-                fpmulti = stderr;
-            }
-            else
-            {
-                fpmulti = NULL;
-            }
-            check_multi_int(fpmulti, cr->ms, sim_part, "simulation part", TRUE);
-        }
-    }
-    else
-    {
-        bAppendFiles = FALSE;
-    }
-
-    if (!bAppendFiles)
-    {
-        sim_part_fn = sim_part;
-    }
-
-    if (bAddPart)
-    {
-        /* Rename all output files (except checkpoint files) */
-        /* create new part name first (zero-filled) */
-        sprintf(suffix, "%s%04d", part_suffix, sim_part_fn);
-
-        add_suffix_to_output_names(fnm, NFILE, suffix);
-        if (MULTIMASTER(cr))
-        {
-            fprintf(stdout, "Checkpoint file is from part %d, new output files will be suffixed '%s'.\n", sim_part-1, suffix);
-        }
-    }
+    handleRestart(cr, bTryToAppendFiles, NFILE, fnm,
+                  &bDoAppendFiles, &bStartFromCpt);
 
     Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0;
     Flags = Flags | (bDDBondCheck  ? MD_DDBONDCHECK  : 0);
@@ -567,11 +507,12 @@ int gmx_mdrun(int argc, char *argv[])
     Flags = Flags | (bConfout      ? MD_CONFOUT      : 0);
     Flags = Flags | (bRerunVSite   ? MD_RERUN_VSITE  : 0);
     Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
-    Flags = Flags | (bAppendFiles  ? MD_APPENDFILES  : 0);
+    Flags = Flags | (bDoAppendFiles ? MD_APPENDFILES  : 0);
     Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0);
     Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0);
-    Flags = Flags | (sim_part > 1    ? MD_STARTFROMCPT : 0);
+    Flags = Flags | (bStartFromCpt ? MD_STARTFROMCPT : 0);
     Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0);
+    Flags = Flags | (opt2parg_bSet("-ntomp", asize(pa), pa) ? MD_NTOMPSET : 0);
     Flags = Flags | (bIMDwait      ? MD_IMDWAIT      : 0);
     Flags = Flags | (bIMDterm      ? MD_IMDTERM      : 0);
     Flags = Flags | (bIMDpull      ? MD_IMDPULL      : 0);
@@ -579,14 +520,10 @@ int gmx_mdrun(int argc, char *argv[])
     /* We postpone opening the log file if we are appending, so we can
        first truncate the old log file and append to the correct position
        there instead.  */
-    if (MASTER(cr) && !bAppendFiles)
+    if (MASTER(cr) && !bDoAppendFiles)
     {
         gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr,
                      Flags & MD_APPENDFILES, &fplog);
-        please_cite(fplog, "Hess2008b");
-        please_cite(fplog, "Spoel2005a");
-        please_cite(fplog, "Lindahl2001a");
-        please_cite(fplog, "Berendsen95a");
     }
     else
     {
@@ -603,11 +540,11 @@ int gmx_mdrun(int argc, char *argv[])
                   nbpu_opt[0], nstlist,
                   nsteps, nstepout, resetstep,
                   nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed,
-                  pforce, cpt_period, max_hours, deviceOptions, imdport, Flags);
+                  pforce, cpt_period, max_hours, imdport, Flags);
 
     /* Log file has to be closed in mdrunner if we are appending to it
        (fplog not set here) */
-    if (MASTER(cr) && !bAppendFiles)
+    if (MASTER(cr) && !bDoAppendFiles)
     {
         gmx_log_close(fplog);
     }
diff --git a/patches/gromacs-5.1.0.diff/src/programs/mdrun/repl_ex.cpp b/patches/gromacs-5.1.0.diff/src/programs/mdrun/repl_ex.cpp
index 1c49b3354..c4683cf83 100644
--- a/patches/gromacs-5.1.0.diff/src/programs/mdrun/repl_ex.cpp
+++ b/patches/gromacs-5.1.0.diff/src/programs/mdrun/repl_ex.cpp
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2011,2012,2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -417,7 +417,7 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog,
     snew(re->order, re->nrepl);
     for (i = 0; i < re->nrepl; i++)
     {
-        snew(re->cyclic[i], re->nrepl);
+        snew(re->cyclic[i], re->nrepl+1);
         snew(re->order[i], re->nrepl);
     }
     /* allocate space for the functions storing the data for the replicas */
diff --git a/patches/gromacs-5.1.0.diff/src/programs/mdrun/repl_ex.cpp.preplumed b/patches/gromacs-5.1.0.diff/src/programs/mdrun/repl_ex.cpp.preplumed
index a85216bea..4b0e81063 100644
--- a/patches/gromacs-5.1.0.diff/src/programs/mdrun/repl_ex.cpp.preplumed
+++ b/patches/gromacs-5.1.0.diff/src/programs/mdrun/repl_ex.cpp.preplumed
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2011,2012,2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -402,7 +402,7 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog,
     snew(re->order, re->nrepl);
     for (i = 0; i < re->nrepl; i++)
     {
-        snew(re->cyclic[i], re->nrepl);
+        snew(re->cyclic[i], re->nrepl+1);
         snew(re->order[i], re->nrepl);
     }
     /* allocate space for the functions storing the data for the replicas */
-- 
GitLab