diff --git a/patches/gromacs-4.6.3.diff b/patches/gromacs-4.6.3.diff
deleted file mode 100644
index 652e9523a19e005a57da4c9bb6bb98ed6e9adaac..0000000000000000000000000000000000000000
--- a/patches/gromacs-4.6.3.diff
+++ /dev/null
@@ -1,490 +0,0 @@
-patch -u -l -b -F 5 --suffix=.preplumed "./src/kernel/CMakeLists.txt" << \EOF_EOF
---- ./src/kernel/CMakeLists.txt.preplumed
-+++ ./src/kernel/CMakeLists.txt
-@@ -31,10 +31,12 @@
- #
- # To help us fund GROMACS development, we humbly ask that you cite
- # the research papers on the package. Check out http://www.gromacs.org.
- #
- 
-+include(${CMAKE_SOURCE_DIR}/Plumed.cmake)
-+
- set(GMXPREPROCESS_SOURCES 
-     add_par.c       
-     calc_verletbuf.c
-     compute_io.c    
-     convparm.c      
-@@ -121,11 +123,11 @@
-     set_target_properties(${PROGRAM} PROPERTIES OUTPUT_NAME "${PROGRAM}${GMX_BINARY_SUFFIX}")
- endforeach()
- 
- add_executable(mdrun ${MDRUN_SOURCES} main.c)
- gmx_add_man_page(mdrun)
--target_link_libraries(mdrun gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS})
-+target_link_libraries(mdrun gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS} ${PLUMED_LOAD})
- set_target_properties(mdrun PROPERTIES OUTPUT_NAME "mdrun${GMX_BINARY_SUFFIX}" COMPILE_FLAGS "${OpenMP_C_FLAGS}")
- 
- if(GMX_OPENMM)
-     target_link_libraries(mdrun openmm_api_wrapper)
- endif()
-EOF_EOF
-patch -u -l -b -F 5 --suffix=.preplumed "./src/kernel/md.c" << \EOF_EOF
---- ./src/kernel/md.c.preplumed
-+++ ./src/kernel/md.c
-@@ -91,10 +91,16 @@
- #include "membed.h"
- #include "types/nlistheuristics.h"
- #include "types/iteratedconstraints.h"
- #include "nbnxn_cuda_data_mgmt.h"
- 
-+/* PLUMED */
-+#include "../../Plumed.h"
-+extern int    plumedswitch;
-+extern plumed plumedmain;
-+/* END PLUMED */
-+
- #ifdef GMX_LIB_MPI
- #include <mpi.h>
- #endif
- #ifdef GMX_THREAD_MPI
- #include "tmpi.h"
-@@ -235,10 +241,14 @@
-     /* PME load balancing data for GPU kernels */
-     pme_load_balancing_t pme_loadbal = NULL;
-     double               cycles_pmes;
-     gmx_bool             bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
- 
-+/* PLUMED */
-+    int plumedNeedsEnergy;
-+/* END PLUMED */
-+
- #ifdef GMX_FAHCORE
-     /* Temporary addition for FAHCORE checkpointing */
-     int chkpt_ret;
- #endif
- 
-@@ -715,10 +725,50 @@
-             }
-         }
-         fprintf(fplog, "\n");
-     }
- 
-+    /* PLUMED */
-+    if(plumedswitch){
-+      if(cr->ms && cr->ms->nsim>1) {
-+        if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters);
-+        if(PAR(cr)){
-+          if(DOMAINDECOMP(cr)) {
-+            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all);
-+          }else{
-+            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim);
-+          }
-+        }
-+        plumed_cmd(plumedmain,"GREX init",NULL);
-+      }
-+      if(PAR(cr)){
-+        if(DOMAINDECOMP(cr)) {
-+          plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
-+        }else{
-+          plumed_cmd(plumedmain,"setMPIComm",&cr->mpi_comm_mysim);
-+        }
-+      }
-+      plumed_cmd(plumedmain,"setNatoms",&top_global->natoms);
-+      plumed_cmd(plumedmain,"setMDEngine","gromacs");
-+      plumed_cmd(plumedmain,"setLog",fplog);
-+      real real_delta_t;
-+      real_delta_t=ir->delta_t;
-+      plumed_cmd(plumedmain,"setTimestep",&real_delta_t);
-+      plumed_cmd(plumedmain,"init",NULL);
-+
-+      if(PAR(cr)){
-+        if(DOMAINDECOMP(cr)) {
-+          plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home);
-+          plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex);
-+        }else{
-+          plumed_cmd(plumedmain,"setAtomsNlocal",&mdatoms->homenr);
-+          plumed_cmd(plumedmain,"setAtomsContiguous",&mdatoms->start);
-+        }
-+      }
-+    }
-+    /* END PLUMED */
-+
-     /* Set and write start time */
-     runtime_start(runtime);
-     print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime);
-     wallcycle_start(wcycle, ewcRUN);
-     if (fplog)
-@@ -1031,10 +1081,17 @@
-                                     vsite, shellfc, constr,
-                                     nrnb, wcycle,
-                                     do_verbose && !bPMETuneRunning);
-                 wallcycle_stop(wcycle, ewcDOMDEC);
-                 /* If using an iterative integrator, reallocate space to match the decomposition */
-+
-+                /* PLUMED */
-+                if(plumedswitch){
-+                  plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home);
-+                  plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex);
-+                }
-+                /* END PLUMED */
-             }
-         }
- 
-         if (MASTER(cr) && do_log && !bFFscan)
-         {
-@@ -1176,16 +1233,39 @@
-             /* The coordinates (x) are shifted (to get whole molecules)
-              * in do_force.
-              * This is parallellized as well, and does communication too.
-              * Check comments in sim_util.c
-              */
-+ 
-+            /* PLUMED */
-+            plumedNeedsEnergy=0;
-+            if(plumedswitch){
-+              long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep);
-+              plumed_cmd(plumedmain,"setPositions",&state->x[mdatoms->start][0]);
-+              plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[mdatoms->start]);
-+              plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[mdatoms->start]);
-+              plumed_cmd(plumedmain,"setBox",&state->box[0][0]);
-+              plumed_cmd(plumedmain,"prepareCalc",NULL);
-+              plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
-+            }
-+            /* END PLUMED */
-             do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups,
-                      state->box, state->x, &state->hist,
-                      f, force_vir, mdatoms, enerd, fcd,
-                      state->lambda, graph,
-                      fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii,
--                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
-+                     (plumedNeedsEnergy? GMX_FORCE_ENERGY : 0) |(bNS ? GMX_FORCE_NS : 0) | force_flags);
-+            /* PLUMED */
-+            if(plumedswitch){
-+              if(plumedNeedsEnergy) plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]);
-+              plumed_cmd(plumedmain,"setForces",&f[mdatoms->start][0]);
-+              plumed_cmd(plumedmain,"setVirial",&force_vir[0][0]);
-+              plumed_cmd(plumedmain,"performCalc",NULL);
-+              if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
-+                 do_per_step(step,repl_ex_nst)) plumed_cmd(plumedmain,"GREX savePositions",NULL);
-+            }
-+            /* END PLUMED */
-         }
- 
-         GMX_BARRIER(cr->mpi_comm_mygroup);
- 
-         if (bTCR)
-EOF_EOF
-patch -u -l -b -F 5 --suffix=.preplumed "./src/kernel/mdrun.c" << \EOF_EOF
---- ./src/kernel/mdrun.c.preplumed
-+++ ./src/kernel/mdrun.c
-@@ -56,10 +56,16 @@
- #endif
- 
- /* afm stuf */
- #include "pull.h"
- 
-+/* PLUMED */
-+#include "../../Plumed.h"
-+int    plumedswitch;
-+plumed plumedmain;
-+/* END PLUMED */
-+
- int cmain(int argc, char *argv[])
- {
-     const char   *desc[] = {
-         "The [TT]mdrun[tt] program is the main computational chemistry engine",
-         "within GROMACS. Obviously, it performs Molecular Dynamics simulations,",
-@@ -399,10 +405,11 @@
-         { efLOG, "-rs",     "rotslabs", ffOPTWR },
-         { efLOG, "-rt",     "rottorque", ffOPTWR },
-         { efMTX, "-mtx",    "nm",       ffOPTWR },
-         { efNDX, "-dn",     "dipole",   ffOPTWR },
-         { efRND, "-multidir", NULL,      ffOPTRDMULT},
-+        { efDAT, "-plumed", "plumed",   ffOPTRD },   /* PLUMED */
-         { efDAT, "-membed", "membed",   ffOPTRD },
-         { efTOP, "-mp",     "membed",   ffOPTRD },
-         { efNDX, "-mn",     "membed",   ffOPTRD }
-     };
- #define NFILE asize(fnm)
-@@ -731,19 +738,50 @@
-     }
- 
-     ddxyz[XX] = (int)(realddxyz[XX] + 0.5);
-     ddxyz[YY] = (int)(realddxyz[YY] + 0.5);
-     ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5);
-+    /* PLUMED */
-+    plumedswitch=0;
-+    if (opt2bSet("-plumed",NFILE,fnm)) plumedswitch=1;
-+    if(plumedswitch){
-+      int plumed_is_there=0;
-+      int real_precision=sizeof(real);
-+      real energyUnits=1.0;
-+      real lengthUnits=1.0;
-+      real timeUnits=1.0;
-+  
-+  
-+      if(!plumed_installed()){
-+        gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable.");
-+      }
-+      plumedmain=plumed_create();
-+      plumed_cmd(plumedmain,"setRealPrecision",&real_precision);
-+      // this is not necessary for gromacs units:
-+      plumed_cmd(plumedmain,"setMDEnergyUnits",&energyUnits);
-+      plumed_cmd(plumedmain,"setMDLengthUnits",&lengthUnits);
-+      plumed_cmd(plumedmain,"setMDTimeUnits",&timeUnits);
-+      //
-+      plumed_cmd(plumedmain,"setPlumedDat",ftp2fn(efDAT,NFILE,fnm));
-+      plumedswitch=1;
-+    }
-+    /* END PLUMED */
- 
-     rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact,
-                   nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr,
-                   dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz,
-                   nbpu_opt[0],
-                   nsteps, nstepout, resetstep,
-                   nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed,
-                   pforce, cpt_period, max_hours, deviceOptions, Flags);
- 
-+    /* PLUMED */
-+    if(plumedswitch){
-+      plumed_finalize(plumedmain);
-+    }
-+    /* END PLUMED */
-+  
-     gmx_finalize_par();
- 
-     if (MULTIMASTER(cr))
-     {
-         thanx(stderr);
-EOF_EOF
-patch -u -l -b -F 5 --suffix=.preplumed "./src/kernel/repl_ex.c" << \EOF_EOF
---- ./src/kernel/repl_ex.c.preplumed
-+++ ./src/kernel/repl_ex.c
-@@ -51,10 +51,16 @@
- #include "names.h"
- #include "mvdata.h"
- #include "domdec.h"
- #include "partdec.h"
- 
-+/* PLUMED */
-+#include "../../Plumed.h"
-+extern int    plumedswitch;
-+extern plumed plumedmain;
-+/* END PLUMED */
-+
- #define PROBABILITYCUTOFF 100
- /* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
- 
- enum {
-     ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR
-@@ -111,18 +117,20 @@
- 
-     snew(qall, ms->nsim);
-     qall[re->repl] = q;
-     gmx_sum_sim(ms->nsim, qall, ms);
- 
--    bDiff = FALSE;
--    for (s = 1; s < ms->nsim; s++)
--    {
--        if (qall[s] != qall[0])
--        {
-+    /* PLUMED */
-+    //bDiff = FALSE;
-+    //for (s = 1; s < ms->nsim; s++)
-+    //{
-+    //    if (qall[s] != qall[0])
-+    //    {
-             bDiff = TRUE;
--        }
--    }
-+    //    }
-+    //}
-+    /* END PLUMED */
- 
-     if (bDiff)
-     {
-         /* Set the replica exchange type and quantities */
-         re->type = ere;
-@@ -255,10 +263,14 @@
-     for (i = 0; i < re->nrepl; i++)
-     {
-         re->ind[i] = i;
-     }
- 
-+    /* PLUMED */
-+    // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD)
-+    // in those cases replicas can share the same temperature.
-+    /*
-     if (re->type < ereENDSINGLE)
-     {
- 
-         for (i = 0; i < re->nrepl; i++)
-         {
-@@ -275,10 +287,12 @@
-                     gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
-                 }
-             }
-         }
-     }
-+    */
-+    /* END PLUMED */
- 
-     /* keep track of all the swaps, starting with the initial placement. */
-     snew(re->allswaps, re->nrepl);
-     for (i = 0; i < re->nrepl; i++)
-     {
-@@ -976,10 +990,14 @@
-     for (i = 0; i < re->nrepl; i++)
-     {
-         pind[i] = re->ind[i];
-     }
- 
-+    /* PLUMED */
-+    int plumed_test_exchange_pattern=0;
-+    /* END PLUMED */
-+
-     if (bMultiEx)
-     {
-         /* multiple random switch exchange */
-         for (i = 0; i < re->nex; i++)
-         {
-@@ -1045,19 +1063,56 @@
-     }
-     else
-     {
-         /* standard nearest neighbor replica exchange */
-         m = (step / re->nst) % 2;
-+        /* PLUMED */
-+        if(plumedswitch){
-+          int partner=re->repl;
-+          plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern);
-+          if(plumed_test_exchange_pattern>0){
-+            int *list;
-+            snew(list,re->nrepl);
-+            plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl));
-+            plumed_cmd(plumedmain,"getExchangesList",list);
-+            for(i=0; i<re->nrepl; i++) re->ind[i]=list[i];
-+            sfree(list);
-+          }
-+
-+          for(i=1; i<re->nrepl; i++) {
-+            if (i % 2 != m) continue;
-+            a = re->ind[i-1];
-+            b = re->ind[i];
-+            if(re->repl==a) partner=b;
-+            if(re->repl==b) partner=a;
-+          }
-+          plumed_cmd(plumedmain,"GREX setPartner",&partner);
-+          plumed_cmd(plumedmain,"GREX calculate",NULL);
-+          plumed_cmd(plumedmain,"GREX shareAllDeltaBias",NULL);
-+        }
-+        /* END PLUMED */
-         for (i = 1; i < re->nrepl; i++)
-         {
-             a = re->ind[i-1];
-             b = re->ind[i];
- 
-             bPrint = (re->repl == a || re->repl == b);
-             if (i % 2 == m)
-             {
-                 delta = calc_delta(fplog, bPrint, re, a, b, a, b);
-+                /* PLUMED */
-+                if(plumedswitch){
-+                  real adb,bdb,dplumed;
-+                  char buf[300];
-+                  sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb);
-+                  sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb);
-+                  dplumed=adb*re->beta[a]+bdb*re->beta[b];
-+                  delta+=dplumed;
-+                  if (bPrint)
-+                    fprintf(fplog,"dplumed = %10.3e  dE_Term = %10.3e (kT)\n",dplumed,delta);
-+                }
-+                /* END PLUMED */
-                 if (delta <= 0)
-                 {
-                     /* accepted */
-                     prob[i] = 1;
-                     bEx[i]  = TRUE;
-@@ -1077,15 +1132,26 @@
-                 }
-                 re->prob_sum[i] += prob[i];
- 
-                 if (bEx[i])
-                 {
-+                  /* PLUMED */
-+                  if(!plumed_test_exchange_pattern) {
-+                    /* standard neighbour swapping */
-                     /* swap these two */
-                     tmp       = pind[i-1];
-                     pind[i-1] = pind[i];
-                     pind[i]   = tmp;
-                     re->nexchange[i]++;  /* statistics for back compatibility */
-+                  } else {
-+                    /* alternative swapping patterns */
-+                    tmp       = pind[a];
-+                    pind[a]   = pind[b];
-+                    pind[b]   = tmp;
-+                    re->nexchange[i]++;  /* statistics for back compatibility */
-+                  }
-+                  /* END PLUMED */
-                 }
-             }
-             else
-             {
-                 prob[i] = -1;
-@@ -1097,10 +1163,19 @@
-         print_prob(fplog, "pr", re->nrepl, prob);
-         fprintf(fplog, "\n");
-         re->nattempt[m]++;
-     }
- 
-+    /* PLUMED */
-+    if(plumed_test_exchange_pattern>0) {
-+      for (i = 0; i < re->nrepl; i++)
-+      {
-+          re->ind[i] = i;
-+      }
-+    }
-+    /* END PLUMED */
-+
-     /* record which moves were made and accepted */
-     for (i = 0; i < re->nrepl; i++)
-     {
-         re->nmoves[re->ind[i]][pind[i]] += 1;
-         re->nmoves[pind[i]][re->ind[i]] += 1;
-@@ -1306,10 +1381,14 @@
-      * exchanges. */
-     /* Where each replica ends up after the exchange attempt(s). */
-     /* The order in which multiple exchanges will occur. */
-     gmx_bool bThisReplicaExchanged = FALSE;
- 
-+    /* PLUMED */
-+    if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",NULL);
-+    /* END PLUMED */
-+
-     if (MASTER(cr))
-     {
-         replica_id  = re->repl;
-         test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time);
-         prepare_to_do_exchange(fplog, re->destinations, replica_id, re->nrepl, &maxswap,
-@@ -1355,14 +1434,14 @@
-                 exchange_partner = re->order[replica_id][j];
- 
-                 if (exchange_partner != replica_id)
-                 {
-                     /* Exchange the global states between the master nodes */
--                    if (debug)
--                    {
--                        fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
--                    }
-+                    //if (debug)
-+                    //{
-+                        fprintf(fplog, "Exchanging %d with %d\n", replica_id, exchange_partner);
-+                    //}
-                     exchange_state(cr->ms, exchange_partner, state);
-                 }
-             }
-             /* For temperature-type replica exchange, we need to scale
-              * the velocities. */
-EOF_EOF
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/CMakeLists.txt b/patches/gromacs-4.6.3.diff/src/kernel/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3ed862a7cab99b767626ee93d3b2fdf298f0abf4
--- /dev/null
+++ b/patches/gromacs-4.6.3.diff/src/kernel/CMakeLists.txt
@@ -0,0 +1,197 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+
+include(${CMAKE_SOURCE_DIR}/Plumed.cmake) # PLUMED: presumably defines PLUMED_LOAD, used on the mdrun link line below - TODO confirm
+
+set(GMXPREPROCESS_SOURCES 
+    add_par.c       
+    calc_verletbuf.c
+    compute_io.c    
+    convparm.c      
+    gen_ad.c        
+    gen_vsite.c     
+    genhydro.c   
+    gpp_atomtype.c  
+    gpp_bond_atomtype.c     
+    h_db.c          
+    hackblock.c             
+    hizzie.c        
+    nm2type.c
+    pdb2top.c       
+    pgutil.c        
+    readir.c        
+    readpull.c      
+    readadress.c      
+    readrot.c
+    resall.c        
+    sorting.c       
+    specbond.c      
+    ter_db.c        
+    tomorse.c       
+    topdirs.c       
+    topexcl.c       
+    topio.c         
+    toppush.c       
+    topshake.c      
+    toputil.c       
+    tpbcmp.c        
+    vsite_parm.c    
+    fflibutil.c
+    xlate.c)
+
+set(MDRUN_SOURCES 
+    gctio.c    ionize.c runner.c
+    do_gct.c     repl_ex.c  xutils.c pme_loadbal.c
+    md.c         mdrun.c    genalg.c membed.c)
+
+add_library(gmxpreprocess ${GMXPREPROCESS_SOURCES})
+target_link_libraries(gmxpreprocess md)
+set_target_properties(gmxpreprocess PROPERTIES OUTPUT_NAME "gmxpreprocess${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}"
+    COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+
+
+if(GMX_GPU)
+    include_directories(${CMAKE_SOURCE_DIR}/src/gmxlib/gpu_utils)
+endif()
+
+if(GMX_OPENMM)
+    # Even though the OpenMM build has "moved to contrib", many things
+    # have be be done from within the scope of the CMakeLists.txt that
+    # builds its mdrun, and that is here
+    list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/src/contrib)
+    find_package(OpenMM)
+    include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+    include(${CMAKE_SOURCE_DIR}/src/contrib/BuildMdrunOpenMM.cmake)
+endif(GMX_OPENMM)
+
+if(GMX_GPU OR GMX_FORCE_CXX)
+    set_source_files_properties(main.c PROPERTIES LANGUAGE CXX)
+    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+        set_source_files_properties(main.c PROPERTIES COMPILE_FLAGS "-x c++")
+    endif()
+endif()
+
+if(GMX_FAHCORE)
+  add_library(fahcore ${MDRUN_SOURCES})
+else(GMX_FAHCORE)
+
+set(GMX_KERNEL_PROGRAMS
+    grompp tpbconv pdb2gmx g_protonate gmxdump g_x2top gmxcheck)
+if (NOT GMX_NO_QUOTES)
+  set(GMX_KERNEL_PROGRAMS ${GMX_KERNEL_PROGRAMS} g_luck)
+endif (NOT GMX_NO_QUOTES)
+
+
+foreach(PROGRAM ${GMX_KERNEL_PROGRAMS})
+    add_executable(${PROGRAM} ${PROGRAM}.c main.c)
+    if (NOT ${PROGRAM} STREQUAL "g_luck")
+        gmx_add_man_page(${PROGRAM})
+    endif()
+    target_link_libraries(${PROGRAM} gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS})
+    set_target_properties(${PROGRAM} PROPERTIES OUTPUT_NAME "${PROGRAM}${GMX_BINARY_SUFFIX}")
+endforeach()
+
+add_executable(mdrun ${MDRUN_SOURCES} main.c)
+gmx_add_man_page(mdrun)
+target_link_libraries(mdrun gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS} ${PLUMED_LOAD}) # PLUMED: PLUMED_LOAD is not set in this file; presumably it comes from the included Plumed.cmake
+set_target_properties(mdrun PROPERTIES OUTPUT_NAME "mdrun${GMX_BINARY_SUFFIX}" COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+
+if(GMX_OPENMM)
+    target_link_libraries(mdrun openmm_api_wrapper)
+endif()
+
+# Construct component groups for installation; note that a component may
+# belong to only one group
+foreach(PROGRAM ${GMX_KERNEL_PROGRAMS})
+    set(CPACK_COMPONENT_${PROGRAM}_GROUP tools)
+endforeach()
+set(CPACK_COMPONENT_MDRUN_GROUP mdrun)
+
+foreach(PROGRAM ${GMX_KERNEL_PROGRAMS} mdrun)
+    # Manage CPack component dependencies
+    set(CPACK_COMPONENT_${PROGRAM}_DEPENDS libraries libraries-gmxpreprocess)
+
+    # Create custom install-xxxx target
+    if (BUILD_SHARED_LIBS)
+    # If shared libraries are used, we need to install the libraries in
+    # addition to the mdrun binary.
+       add_custom_target(install-${PROGRAM}
+           COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries
+                   -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+           COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries-gmxpreprocess
+                   -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+           COMMAND ${CMAKE_COMMAND} -DCOMPONENT=${PROGRAM}
+                   -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+           COMMENT "Installing ${PROGRAM}")
+    else()
+       add_custom_target(install-${PROGRAM}
+           COMMAND ${CMAKE_COMMAND} -DCOMPONENT=${PROGRAM}
+                   -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+           COMMENT "Installing ${PROGRAM}")
+    endif()
+    add_dependencies(install-${PROGRAM} ${PROGRAM})
+
+    # Finally, trigger installation
+    install(
+      TARGETS ${PROGRAM}
+      COMPONENT ${PROGRAM}
+      DESTINATION ${BIN_INSTALL_DIR}
+      )
+endforeach()
+
+install(TARGETS gmxpreprocess DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries-gmxpreprocess)
+
+if (INSTALL_CUDART_LIB) #can be set manual by user
+    if (GMX_GPU)
+        foreach(CUDA_LIB ${CUDA_LIBRARIES})
+            string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
+            if(IS_CUDART) #libcuda should not be installed
+                #install also name-links (linker uses those)
+                file(GLOB CUDA_LIBS ${CUDA_LIB}*)
+                install(FILES ${CUDA_LIBS} DESTINATION
+                    ${LIB_INSTALL_DIR} COMPONENT libraries)
+            endif()
+        endforeach()
+    else()
+        message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU")
+    endif()
+endif ()
+endif(GMX_FAHCORE)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgmxpreprocess.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc @ONLY)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc
+        DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+        RENAME "libgmxpreprocess${GMX_LIBS_SUFFIX}.pc"
+        COMPONENT development)
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/CMakeLists.txt.preplumed b/patches/gromacs-4.6.3.diff/src/kernel/CMakeLists.txt.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..e2125333387440d266eb0e6e77f23e8adb635511
--- /dev/null
+++ b/patches/gromacs-4.6.3.diff/src/kernel/CMakeLists.txt.preplumed
@@ -0,0 +1,195 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+
+set(GMXPREPROCESS_SOURCES 
+    add_par.c       
+    calc_verletbuf.c
+    compute_io.c    
+    convparm.c      
+    gen_ad.c        
+    gen_vsite.c     
+    genhydro.c   
+    gpp_atomtype.c  
+    gpp_bond_atomtype.c     
+    h_db.c          
+    hackblock.c             
+    hizzie.c        
+    nm2type.c
+    pdb2top.c       
+    pgutil.c        
+    readir.c        
+    readpull.c      
+    readadress.c      
+    readrot.c
+    resall.c        
+    sorting.c       
+    specbond.c      
+    ter_db.c        
+    tomorse.c       
+    topdirs.c       
+    topexcl.c       
+    topio.c         
+    toppush.c       
+    topshake.c      
+    toputil.c       
+    tpbcmp.c        
+    vsite_parm.c    
+    fflibutil.c
+    xlate.c)
+
+set(MDRUN_SOURCES 
+    gctio.c    ionize.c runner.c
+    do_gct.c     repl_ex.c  xutils.c pme_loadbal.c
+    md.c         mdrun.c    genalg.c membed.c)
+
+add_library(gmxpreprocess ${GMXPREPROCESS_SOURCES})
+target_link_libraries(gmxpreprocess md)
+set_target_properties(gmxpreprocess PROPERTIES OUTPUT_NAME "gmxpreprocess${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}"
+    COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+
+
+if(GMX_GPU)
+    include_directories(${CMAKE_SOURCE_DIR}/src/gmxlib/gpu_utils)
+endif()
+
+if(GMX_OPENMM)
+    # Even though the OpenMM build has "moved to contrib", many things
+    # have be be done from within the scope of the CMakeLists.txt that
+    # builds its mdrun, and that is here
+    list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/src/contrib)
+    find_package(OpenMM)
+    include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+    include(${CMAKE_SOURCE_DIR}/src/contrib/BuildMdrunOpenMM.cmake)
+endif(GMX_OPENMM)
+
+if(GMX_GPU OR GMX_FORCE_CXX)
+    set_source_files_properties(main.c PROPERTIES LANGUAGE CXX)
+    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+        set_source_files_properties(main.c PROPERTIES COMPILE_FLAGS "-x c++")
+    endif()
+endif()
+
+if(GMX_FAHCORE)
+  add_library(fahcore ${MDRUN_SOURCES})
+else(GMX_FAHCORE)
+
+set(GMX_KERNEL_PROGRAMS
+    grompp tpbconv pdb2gmx g_protonate gmxdump g_x2top gmxcheck)
+if (NOT GMX_NO_QUOTES)
+  set(GMX_KERNEL_PROGRAMS ${GMX_KERNEL_PROGRAMS} g_luck)
+endif (NOT GMX_NO_QUOTES)
+
+
+foreach(PROGRAM ${GMX_KERNEL_PROGRAMS})
+    add_executable(${PROGRAM} ${PROGRAM}.c main.c)
+    if (NOT ${PROGRAM} STREQUAL "g_luck")
+        gmx_add_man_page(${PROGRAM})
+    endif()
+    target_link_libraries(${PROGRAM} gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS})
+    set_target_properties(${PROGRAM} PROPERTIES OUTPUT_NAME "${PROGRAM}${GMX_BINARY_SUFFIX}")
+endforeach()
+
+add_executable(mdrun ${MDRUN_SOURCES} main.c)
+gmx_add_man_page(mdrun)
+target_link_libraries(mdrun gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS})
+set_target_properties(mdrun PROPERTIES OUTPUT_NAME "mdrun${GMX_BINARY_SUFFIX}" COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+
+if(GMX_OPENMM)
+    target_link_libraries(mdrun openmm_api_wrapper)
+endif()
+
+# Construct component groups for installation; note that a component may
+# belong to only one group
+foreach(PROGRAM ${GMX_KERNEL_PROGRAMS})
+    set(CPACK_COMPONENT_${PROGRAM}_GROUP tools)
+endforeach()
+set(CPACK_COMPONENT_MDRUN_GROUP mdrun)
+
+foreach(PROGRAM ${GMX_KERNEL_PROGRAMS} mdrun)
+    # Manage CPack component dependencies
+    set(CPACK_COMPONENT_${PROGRAM}_DEPENDS libraries libraries-gmxpreprocess)
+
+    # Create custom install-xxxx target
+    if (BUILD_SHARED_LIBS)
+    # If shared libraries are used, we need to install the libraries in
+    # addition to the mdrun binary.
+       add_custom_target(install-${PROGRAM}
+           COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries
+                   -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+           COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries-gmxpreprocess
+                   -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+           COMMAND ${CMAKE_COMMAND} -DCOMPONENT=${PROGRAM}
+                   -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+           COMMENT "Installing ${PROGRAM}")
+    else()
+       add_custom_target(install-${PROGRAM}
+           COMMAND ${CMAKE_COMMAND} -DCOMPONENT=${PROGRAM}
+                   -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+           COMMENT "Installing ${PROGRAM}")
+    endif()
+    add_dependencies(install-${PROGRAM} ${PROGRAM})
+
+    # Finally, trigger installation
+    install(
+      TARGETS ${PROGRAM}
+      COMPONENT ${PROGRAM}
+      DESTINATION ${BIN_INSTALL_DIR}
+      )
+endforeach()
+
+install(TARGETS gmxpreprocess DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries-gmxpreprocess)
+
+if (INSTALL_CUDART_LIB) #can be set manual by user
+    if (GMX_GPU)
+        foreach(CUDA_LIB ${CUDA_LIBRARIES})
+            string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
+            if(IS_CUDART) #libcuda should not be installed
+                #install also name-links (linker uses those)
+                file(GLOB CUDA_LIBS ${CUDA_LIB}*)
+                install(FILES ${CUDA_LIBS} DESTINATION
+                    ${LIB_INSTALL_DIR} COMPONENT libraries)
+            endif()
+        endforeach()
+    else()
+        message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU")
+    endif()
+endif ()
+endif(GMX_FAHCORE)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgmxpreprocess.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc @ONLY)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc
+        DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+        RENAME "libgmxpreprocess${GMX_LIBS_SUFFIX}.pc"
+        COMPONENT development)
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/md.c b/patches/gromacs-4.6.3.diff/src/kernel/md.c
new file mode 100644
index 0000000000000000000000000000000000000000..187b2796a09a2cf80c82d9637db77093835ce358
--- /dev/null
+++ b/patches/gromacs-4.6.3.diff/src/kernel/md.c
@@ -0,0 +1,2340 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "sysstuff.h"
+#include "vec.h"
+#include "statutil.h"
+#include "vcm.h"
+#include "mdebin.h"
+#include "nrnb.h"
+#include "calcmu.h"
+#include "index.h"
+#include "vsite.h"
+#include "update.h"
+#include "ns.h"
+#include "trnio.h"
+#include "xtcio.h"
+#include "mdrun.h"
+#include "md_support.h"
+#include "md_logging.h"
+#include "confio.h"
+#include "network.h"
+#include "pull.h"
+#include "xvgr.h"
+#include "physics.h"
+#include "names.h"
+#include "xmdrun.h"
+#include "ionize.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "qmmm.h"
+#include "mpelogging.h"
+#include "domdec.h"
+#include "domdec_network.h"
+#include "partdec.h"
+#include "topsort.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "shellfc.h"
+#include "compute_io.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "txtdump.h"
+#include "string2.h"
+#include "pme_loadbal.h"
+#include "bondf.h"
+#include "membed.h"
+#include "types/nlistheuristics.h"
+#include "types/iteratedconstraints.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+/* PLUMED */
+#include "../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+/* END PLUMED */
+
+#ifdef GMX_LIB_MPI
+#include <mpi.h>
+#endif
+#ifdef GMX_THREAD_MPI
+#include "tmpi.h"
+#endif
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+static void reset_all_counters(FILE *fplog, t_commrec *cr,
+                               gmx_large_int_t step,
+                               gmx_large_int_t *step_rel, t_inputrec *ir,
+                               gmx_wallcycle_t wcycle, t_nrnb *nrnb,
+                               gmx_runtime_t *runtime,
+                               nbnxn_cuda_ptr_t cu_nbv)
+{
+    char sbuf[STEPSTRSIZE]; /* receives the step number formatted by gmx_step_str() */
+    /* Restart the performance accounting at `step`: announce the reset, reset the timers and
+     * counters, fold *step_rel into ir->init_step / ir->nsteps and set *step_rel back to 0. */
+    md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
+                  gmx_step_str(step, sbuf));
+
+    if (cu_nbv) /* CUDA timings are reset only when a handle was passed in */
+    {
+        nbnxn_cuda_reset_timings(cu_nbv);
+    }
+
+    wallcycle_stop(wcycle, ewcRUN); /* ewcRUN is stopped before the wallcycle reset ... */
+    wallcycle_reset_all(wcycle);
+    if (DOMAINDECOMP(cr))
+    {
+        reset_dd_statistics_counters(cr->dd);
+    }
+    init_nrnb(nrnb); /* presumably re-initialises the nrnb operation counters - confirm */
+    ir->init_step += *step_rel; /* steps done so far are added to the start step ... */
+    ir->nsteps    -= *step_rel; /* ... and removed from the number of steps to run */
+    *step_rel      = 0;         /* the relative step count restarts from zero */
+    wallcycle_start(wcycle, ewcRUN); /* ... and started again after all counters were reset */
+    runtime_start(runtime);
+    print_date_and_time(fplog, cr->nodeid, "Restarted time", runtime); /* written to fplog with the "Restarted time" label */
+}
+
+double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
+             const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
+             int nstglobalcomm,
+             gmx_vsite_t *vsite, gmx_constr_t constr,
+             int stepout, t_inputrec *ir,
+             gmx_mtop_t *top_global,
+             t_fcdata *fcd,
+             t_state *state_global,
+             t_mdatoms *mdatoms,
+             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+             gmx_edsam_t ed, t_forcerec *fr,
+             int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed,
+             real cpt_period, real max_hours,
+             const char *deviceOptions,
+             unsigned long Flags,
+             gmx_runtime_t *runtime)
+{
+    gmx_mdoutf_t   *outf;
+    gmx_large_int_t step, step_rel;
+    double          run_time;
+    double          t, t0, lam0[efptNR];
+    gmx_bool        bGStatEveryStep, bGStat, bCalcVir, bCalcEner;
+    gmx_bool        bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE,
+                    bFirstStep, bStateFromCP, bStateFromTPX, bInitStep, bLastStep,
+                    bBornRadii, bStartingFromCpt;
+    gmx_bool          bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
+    gmx_bool          do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE,
+                      bForceUpdate = FALSE, bCPT;
+    int               mdof_flags;
+    gmx_bool          bMasterState;
+    int               force_flags, cglo_flags;
+    tensor            force_vir, shake_vir, total_vir, tmp_vir, pres;
+    int               i, m;
+    t_trxstatus      *status;
+    rvec              mu_tot;
+    t_vcm            *vcm;
+    t_state          *bufstate = NULL;
+    matrix           *scale_tot, pcoupl_mu, M, ebox;
+    gmx_nlheur_t      nlh;
+    t_trxframe        rerun_fr;
+    gmx_repl_ex_t     repl_ex = NULL;
+    int               nchkpt  = 1;
+    gmx_localtop_t   *top;
+    t_mdebin         *mdebin = NULL;
+    df_history_t      df_history;
+    t_state          *state    = NULL;
+    rvec             *f_global = NULL;
+    int               n_xtc    = -1;
+    rvec             *x_xtc    = NULL;
+    gmx_enerdata_t   *enerd;
+    rvec             *f = NULL;
+    gmx_global_stat_t gstat;
+    gmx_update_t      upd   = NULL;
+    t_graph          *graph = NULL;
+    globsig_t         gs;
+    gmx_rng_t         mcrng = NULL;
+    gmx_bool          bFFscan;
+    gmx_groups_t     *groups;
+    gmx_ekindata_t   *ekind, *ekind_save;
+    gmx_shellfc_t     shellfc;
+    int               count, nconverged = 0;
+    real              timestep = 0;
+    double            tcount   = 0;
+    gmx_bool          bIonize  = FALSE;
+    gmx_bool          bTCR     = FALSE, bConverged = TRUE, bOK, bSumEkinhOld, bExchanged;
+    gmx_bool          bAppend;
+    gmx_bool          bResetCountersHalfMaxH = FALSE;
+    gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
+    gmx_bool          bUpdateDoLR;
+    real              mu_aver = 0, dvdl_constr;
+    int               a0, a1, gnx = 0, ii;
+    atom_id          *grpindex = NULL;
+    char             *grpname;
+    t_coupl_rec      *tcr     = NULL;
+    rvec             *xcopy   = NULL, *vcopy = NULL, *cbuf = NULL;
+    matrix            boxcopy = {{0}}, lastbox;
+    tensor            tmpvir;
+    real              fom, oldfom, veta_save, pcurr, scalevir, tracevir;
+    real              vetanew = 0;
+    int               lamnew  = 0;
+    /* for FEP */
+    int               nstfep;
+    real              rate;
+    double            cycles;
+    real              saved_conserved_quantity = 0;
+    real              last_ekin                = 0;
+    int               iter_i;
+    t_extmass         MassQ;
+    int             **trotter_seq;
+    char              sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
+    int               handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/
+    gmx_iterate_t     iterate;
+    gmx_large_int_t   multisim_nsteps = -1;                        /* number of steps to do  before first multisim
+                                                                      simulation stops. If equal to zero, don't
+                                                                      communicate any more between multisims.*/
+    /* PME load balancing data for GPU kernels */
+    pme_load_balancing_t pme_loadbal = NULL;
+    double               cycles_pmes;
+    gmx_bool             bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
+
+/* PLUMED */
+    int plumedNeedsEnergy;
+/* END PLUMED */
+
+#ifdef GMX_FAHCORE
+    /* Temporary addition for FAHCORE checkpointing */
+    int chkpt_ret;
+#endif
+
+    /* Check for special mdrun options */
+    bRerunMD = (Flags & MD_RERUN);
+    bIonize  = (Flags & MD_IONIZE);
+    bFFscan  = (Flags & MD_FFSCAN);
+    bAppend  = (Flags & MD_APPENDFILES);
+    if (Flags & MD_RESETCOUNTERSHALFWAY)
+    {
+        if (ir->nsteps > 0)
+        {
+            /* Signal to reset the counters half the simulation steps. */
+            wcycle_set_reset_counters(wcycle, ir->nsteps/2);
+        }
+        /* Signal to reset the counters halfway the simulation time. */
+        bResetCountersHalfMaxH = (max_hours > 0);
+    }
+
+    /* md-vv uses averaged full step velocities for T-control
+       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+    bVV = EI_VV(ir->eI);
+    if (bVV) /* to store the initial velocities while computing virial */
+    {
+        snew(cbuf, top_global->natoms);
+    }
+    /* all the iterative cases - only if there are constraints */
+    bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
+    gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
+                                          false in this step.  The correct value, true or false,
+                                          is set at each step, as it depends on the frequency of temperature
+                                          and pressure control.*/
+    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
+
+    if (bRerunMD)
+    {
+        /* Since we don't know if the frames read are related in any way,
+         * rebuild the neighborlist at every step.
+         */
+        ir->nstlist       = 1;
+        ir->nstcalcenergy = 1;
+        nstglobalcomm     = 1;
+    }
+
+    check_ir_old_tpx_versions(cr, fplog, ir, top_global);
+
+    nstglobalcomm   = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
+    bGStatEveryStep = (nstglobalcomm == 1);
+
+    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
+    {
+        fprintf(fplog,
+                "To reduce the energy communication with nstlist = -1\n"
+                "the neighbor list validity should not be checked at every step,\n"
+                "this means that exact integration is not guaranteed.\n"
+                "The neighbor list validity is checked after:\n"
+                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
+                "In most cases this will result in exact integration.\n"
+                "This reduces the energy communication by a factor of 2 to 3.\n"
+                "If you want less energy communication, set nstlist > 3.\n\n");
+    }
+
+    if (bRerunMD || bFFscan)
+    {
+        ir->nstxtcout = 0;
+    }
+    groups = &top_global->groups;
+
+    /* Initial values */
+    init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda,
+            &(state_global->fep_state), lam0,
+            nrnb, top_global, &upd,
+            nfile, fnm, &outf, &mdebin,
+            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, state_global, Flags);
+
+    clear_mat(total_vir);
+    clear_mat(pres);
+    /* Energy terms and groups */
+    snew(enerd, 1);
+    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+                  enerd);
+    if (DOMAINDECOMP(cr))
+    {
+        f = NULL;
+    }
+    else
+    {
+        snew(f, top_global->natoms);
+    }
+
+    /* lambda Monte carlo random number generator  */
+    if (ir->bExpanded)
+    {
+        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
+    }
+    /* copy the state into df_history */
+    copy_df_history(&df_history, &state_global->dfhist);
+
+    /* Kinetic energy data */
+    snew(ekind, 1);
+    init_ekindata(fplog, top_global, &(ir->opts), ekind);
+    /* needed for iteration of constraints */
+    snew(ekind_save, 1);
+    init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
+    /* Copy the cos acceleration to the groups struct */
+    ekind->cosacc.cos_accel = ir->cos_accel;
+
+    gstat = global_stat_init(ir);
+    debug_gmx();
+
+    /* Check for polarizable models and flexible constraints */
+    shellfc = init_shell_flexcon(fplog,
+                                 top_global, n_flexible_constraints(constr),
+                                 (ir->bContinuation ||
+                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
+                                 NULL : state_global->x);
+
+    if (DEFORM(*ir))
+    {
+#ifdef GMX_THREAD_MPI
+        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+#endif
+        set_deform_reference_box(upd,
+                                 deform_init_init_step_tpx,
+                                 deform_init_box_tpx);
+#ifdef GMX_THREAD_MPI
+        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+#endif
+    }
+
+    {
+        double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+        if ((io > 2000) && MASTER(cr))
+        {
+            fprintf(stderr,
+                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+                    io);
+        }
+    }
+
+    if (DOMAINDECOMP(cr))
+    {
+        top = dd_init_local_top(top_global);
+
+        snew(state, 1);
+        dd_init_local_state(cr->dd, state_global, state);
+
+        if (DDMASTER(cr->dd) && ir->nstfout)
+        {
+            snew(f_global, state_global->natoms);
+        }
+    }
+    else
+    {
+        if (PAR(cr))
+        {
+            /* Initialize the particle decomposition and split the topology */
+            top = split_system(fplog, top_global, ir, cr);
+
+            pd_cg_range(cr, &fr->cg0, &fr->hcg);
+            pd_at_range(cr, &a0, &a1);
+        }
+        else
+        {
+            top = gmx_mtop_generate_local_top(top_global, ir);
+
+            a0 = 0;
+            a1 = top_global->natoms;
+        }
+
+        forcerec_set_excl_load(fr, top, cr);
+
+        state    = partdec_init_local_state(cr, state_global);
+        f_global = f;
+
+        atoms2md(top_global, ir, 0, NULL, a0, a1-a0, mdatoms);
+
+        if (vsite)
+        {
+            set_vsite_top(vsite, top, mdatoms, cr);
+        }
+
+        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
+        {
+            graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE);
+        }
+
+        if (shellfc)
+        {
+            make_local_shells(cr, mdatoms, shellfc);
+        }
+
+        init_bonded_thread_force_reduction(fr, &top->idef);
+
+        if (ir->pull && PAR(cr))
+        {
+            dd_make_local_pull_groups(NULL, ir->pull, mdatoms);
+        }
+    }
+
+    if (DOMAINDECOMP(cr))
+    {
+        /* Distribute the charge groups over the nodes from the master node */
+        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
+                            state_global, top_global, ir,
+                            state, &f, mdatoms, top, fr,
+                            vsite, shellfc, constr,
+                            nrnb, wcycle, FALSE);
+
+    }
+
+    update_mdatoms(mdatoms, state->lambda[efptMASS]);
+
+    if (opt2bSet("-cpi", nfile, fnm))
+    {
+        bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr);
+    }
+    else
+    {
+        bStateFromCP = FALSE;
+    }
+
+    if (MASTER(cr))
+    {
+        if (bStateFromCP)
+        {
+            /* Update mdebin with energy history if appending to output files */
+            if (Flags & MD_APPENDFILES)
+            {
+                restore_energyhistory_from_state(mdebin, &state_global->enerhist);
+            }
+            else
+            {
+                /* We might have read an energy history from checkpoint,
+                 * free the allocated memory and reset the counts.
+                 */
+                done_energyhistory(&state_global->enerhist);
+                init_energyhistory(&state_global->enerhist);
+            }
+        }
+        /* Set the initial energy history in state by updating once */
+        update_energyhistory(&state_global->enerhist, mdebin);
+    }
+
+    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
+    {
+        /* Set the random state if we read a checkpoint file */
+        set_stochd_state(upd, state);
+    }
+
+    if (state->flags & (1<<estMC_RNG))
+    {
+        set_mc_state(mcrng, state);
+    }
+
+    /* Initialize constraints */
+    if (constr)
+    {
+        if (!DOMAINDECOMP(cr))
+        {
+            set_constraints(constr, top, ir, mdatoms, cr);
+        }
+    }
+
+    /* Check whether we have to do GCT (General Coupling Theory) stuff */
+    bTCR = ftp2bSet(efGCT, nfile, fnm);
+    if (bTCR)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Will do General Coupling Theory!\n");
+        }
+        gnx = top_global->mols.nr;
+        snew(grpindex, gnx);
+        for (i = 0; (i < gnx); i++)
+        {
+            grpindex[i] = i;
+        }
+    }
+
+    if (repl_ex_nst > 0)
+    {
+        /* We need to be sure replica exchange can only occur
+         * when the energies are current */
+        check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
+                        "repl_ex_nst", &repl_ex_nst);
+        /* This check needs to happen before inter-simulation
+         * signals are initialized, too */
+    }
+    if (repl_ex_nst > 0 && MASTER(cr))
+    {
+        repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
+                                        repl_ex_nst, repl_ex_nex, repl_ex_seed);
+    }
+
+    /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
+     * With perturbed charges with soft-core we should not change the cut-off.
+     */
+    if ((Flags & MD_TUNEPME) &&
+        EEL_PME(fr->eeltype) &&
+        ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
+        !(ir->efep != efepNO && mdatoms->nChargePerturbed > 0 && ir->fepvals->bScCoul) &&
+        !bRerunMD)
+    {
+        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
+        cycles_pmes = 0;
+        if (cr->duty & DUTY_PME)
+        {
+            /* Start tuning right away, as we can't measure the load */
+            bPMETuneRunning = TRUE;
+        }
+        else
+        {
+            /* Separate PME nodes, we can measure the PP/PME load balance */
+            bPMETuneTry = TRUE;
+        }
+    }
+
+    if (!ir->bContinuation && !bRerunMD)
+    {
+        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
+        {
+            /* Set the velocities of frozen particles to zero */
+            for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
+            {
+                for (m = 0; m < DIM; m++)
+                {
+                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+                    {
+                        state->v[i][m] = 0;
+                    }
+                }
+            }
+        }
+
+        if (constr)
+        {
+            /* Constrain the initial coordinates and velocities */
+            do_constrain_first(fplog, constr, ir, mdatoms, state, f,
+                               graph, cr, nrnb, fr, top, shake_vir);
+        }
+        if (vsite)
+        {
+            /* Construct the virtual sites for the initial configuration */
+            construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, NULL,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+        }
+    }
+
+    debug_gmx();
+
+    /* set free energy calculation frequency as the greatest common divisor of
+       nstdhdl, nstexpanded, and repl_ex_nst. NOTE(review): the bExpanded branch below passes nstdhdl again, not nstexpanded - confirm */
+    nstfep = ir->fepvals->nstdhdl;
+    if (ir->bExpanded)
+    {
+        nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl,nstfep);
+    }
+    if (repl_ex_nst > 0)
+    {
+        nstfep = gmx_greatest_common_divisor(repl_ex_nst,nstfep);
+    }
+
+    /* I'm assuming we need global communication the first time! MRS */
+    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
+                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
+                  | (bVV ? CGLO_PRESSURE : 0)
+                  | (bVV ? CGLO_CONSTRAINT : 0)
+                  | (bRerunMD ? CGLO_RERUNMD : 0)
+                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0));
+
+    bSumEkinhOld = FALSE;
+    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                    NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                    constr, NULL, FALSE, state->box,
+                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld, cglo_flags);
+    if (ir->eI == eiVVAK)
+    {
+        /* a second call to get the half step temperature initialized as well */
+        /* we do the same call as above, but turn the pressure off -- internally to
+           compute_globals, this is recognized as a velocity verlet half-step
+           kinetic energy calculation.  This minimized excess variables, but
+           perhaps loses some logic?*/
+
+        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                        NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                        constr, NULL, FALSE, state->box,
+                        top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                        cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE));
+    }
+
+    /* Calculate the initial half step temperature, and save the ekinh_old */
+    if (!(Flags & MD_STARTFROMCPT))
+    {
+        for (i = 0; (i < ir->opts.ngtc); i++)
+        {
+            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+        }
+    }
+    if (ir->eI != eiVV)
+    {
+        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
+                                     and there is no previous step */
+    }
+
+    /* if using an iterative algorithm, we need to create a working directory for the state. */
+    if (bIterativeCase)
+    {
+        bufstate = init_bufstate(state);
+    }
+    if (bFFscan)
+    {
+        snew(xcopy, state->natoms);
+        snew(vcopy, state->natoms);
+        copy_rvecn(state->x, xcopy, 0, state->natoms);
+        copy_rvecn(state->v, vcopy, 0, state->natoms);
+        copy_mat(state->box, boxcopy);
+    }
+
+    /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
+       temperature control */
+    trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
+
+    if (MASTER(cr))
+    {
+        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
+        {
+            fprintf(fplog,
+                    "RMS relative constraint deviation after constraining: %.2e\n",
+                    constr_rmsd(constr, FALSE));
+        }
+        if (EI_STATE_VELOCITY(ir->eI))
+        {
+            fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]);
+        }
+        if (bRerunMD)
+        {
+            fprintf(stderr, "starting md rerun '%s', reading coordinates from"
+                    " input trajectory '%s'\n\n",
+                    *(top_global->name), opt2fn("-rerun", nfile, fnm));
+            if (bVerbose)
+            {
+                fprintf(stderr, "Calculated time to finish depends on nsteps from "
+                        "run input file,\nwhich may not correspond to the time "
+                        "needed to process input trajectory.\n\n");
+            }
+        }
+        else
+        {
+            char tbuf[20];
+            fprintf(stderr, "starting mdrun '%s'\n",
+                    *(top_global->name));
+            if (ir->nsteps >= 0)
+            {
+                sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
+            }
+            else
+            {
+                sprintf(tbuf, "%s", "infinite");
+            }
+            if (ir->init_step > 0)
+            {
+                fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+                        gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
+                        gmx_step_str(ir->init_step, sbuf2),
+                        ir->init_step*ir->delta_t);
+            }
+            else
+            {
+                fprintf(stderr, "%s steps, %s ps.\n",
+                        gmx_step_str(ir->nsteps, sbuf), tbuf);
+            }
+        }
+        fprintf(fplog, "\n");
+    }
+
+    /* PLUMED */
+    if(plumedswitch){
+      if(cr->ms && cr->ms->nsim>1) {
+        if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters);
+        if(PAR(cr)){
+          if(DOMAINDECOMP(cr)) {
+            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all);
+          }else{
+            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim);
+          }
+        }
+        plumed_cmd(plumedmain,"GREX init",NULL);
+      }
+      if(PAR(cr)){
+        if(DOMAINDECOMP(cr)) {
+          plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
+        }else{
+          plumed_cmd(plumedmain,"setMPIComm",&cr->mpi_comm_mysim);
+        }
+      }
+      plumed_cmd(plumedmain,"setNatoms",&top_global->natoms);
+      plumed_cmd(plumedmain,"setMDEngine","gromacs");
+      plumed_cmd(plumedmain,"setLog",fplog);
+      real real_delta_t;
+      real_delta_t=ir->delta_t;
+      plumed_cmd(plumedmain,"setTimestep",&real_delta_t);
+      plumed_cmd(plumedmain,"init",NULL);
+
+      if(PAR(cr)){
+        if(DOMAINDECOMP(cr)) {
+          plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home);
+          plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex);
+        }else{
+          plumed_cmd(plumedmain,"setAtomsNlocal",&mdatoms->homenr);
+          plumed_cmd(plumedmain,"setAtomsContiguous",&mdatoms->start);
+        }
+      }
+    }
+    /* END PLUMED */
+
+    /* Set and write start time */
+    runtime_start(runtime);
+    print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime);
+    wallcycle_start(wcycle, ewcRUN);
+    if (fplog)
+    {
+        fprintf(fplog, "\n");
+    }
+
+    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
+#ifdef GMX_FAHCORE
+    chkpt_ret = fcCheckPointParallel( cr->nodeid,
+                                      NULL, 0);
+    if (chkpt_ret == 0)
+    {
+        gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
+    }
+#endif
+
+    debug_gmx();
+    /***********************************************************
+     *
+     *             Loop over MD steps
+     *
+     ************************************************************/
+
+    /* if rerunMD then read coordinates and velocities from input trajectory */
+    if (bRerunMD)
+    {
+        if (getenv("GMX_FORCE_UPDATE"))
+        {
+            bForceUpdate = TRUE;
+        }
+
+        rerun_fr.natoms = 0;
+        if (MASTER(cr))
+        {
+            bNotLastFrame = read_first_frame(oenv, &status,
+                                             opt2fn("-rerun", nfile, fnm),
+                                             &rerun_fr, TRX_NEED_X | TRX_READ_V);
+            if (rerun_fr.natoms != top_global->natoms)
+            {
+                gmx_fatal(FARGS,
+                          "Number of atoms in trajectory (%d) does not match the "
+                          "run input file (%d)\n",
+                          rerun_fr.natoms, top_global->natoms);
+            }
+            if (ir->ePBC != epbcNONE)
+            {
+                if (!rerun_fr.bBox)
+                {
+                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time);
+                }
+                if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong))
+                {
+                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time);
+                }
+            }
+        }
+
+        if (PAR(cr))
+        {
+            rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+        }
+
+        if (ir->ePBC != epbcNONE)
+        {
+            /* Set the shift vectors.
+             * Necessary here when have a static box different from the tpr box.
+             */
+            calc_shifts(rerun_fr.box, fr->shift_vec);
+        }
+    }
+
+    /* loop over MD steps or if rerunMD to end of input trajectory */
+    bFirstStep = TRUE;
+    /* Skip the first Nose-Hoover integration when we get the state from tpx */
+    bStateFromTPX    = !bStateFromCP;
+    bInitStep        = bFirstStep && (bStateFromTPX || bVV);
+    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
+    bLastStep        = FALSE;
+    bSumEkinhOld     = FALSE;
+    bExchanged       = FALSE;
+
+    init_global_signals(&gs, cr, ir, repl_ex_nst);
+
+    step     = ir->init_step;
+    step_rel = 0;
+
+    if (ir->nstlist == -1)
+    {
+        init_nlistheuristics(&nlh, bGStatEveryStep, step);
+    }
+
+    if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
+    {
+        /* check how many steps are left in other sims */
+        multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps);
+    }
+
+
+    /* and stop now if we should */
+    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
+                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
+    while (!bLastStep || (bRerunMD && bNotLastFrame))
+    {
+
+        wallcycle_start(wcycle, ewcSTEP);
+
+        GMX_MPE_LOG(ev_timestep1);
+
+        if (bRerunMD)
+        {
+            if (rerun_fr.bStep)
+            {
+                step     = rerun_fr.step;
+                step_rel = step - ir->init_step;
+            }
+            if (rerun_fr.bTime)
+            {
+                t = rerun_fr.time;
+            }
+            else
+            {
+                t = step;
+            }
+        }
+        else
+        {
+            bLastStep = (step_rel == ir->nsteps);
+            t         = t0 + step*ir->delta_t;
+        }
+
+        if (ir->efep != efepNO || ir->bSimTemp)
+        {
+            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
+               requiring different logic. */
+
+            set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
+            bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
+            bDoFEP       = (do_per_step(step, nstfep) && (ir->efep != efepNO));
+            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
+        }
+
+        if (bSimAnn)
+        {
+            update_annealing_target_temp(&(ir->opts), t);
+        }
+
+        if (bRerunMD)
+        {
+            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
+            {
+                for (i = 0; i < state_global->natoms; i++)
+                {
+                    copy_rvec(rerun_fr.x[i], state_global->x[i]);
+                }
+                if (rerun_fr.bV)
+                {
+                    for (i = 0; i < state_global->natoms; i++)
+                    {
+                        copy_rvec(rerun_fr.v[i], state_global->v[i]);
+                    }
+                }
+                else
+                {
+                    for (i = 0; i < state_global->natoms; i++)
+                    {
+                        clear_rvec(state_global->v[i]);
+                    }
+                    if (bRerunWarnNoV)
+                    {
+                        fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n"
+                                "         Ekin, temperature and pressure are incorrect,\n"
+                                "         the virial will be incorrect when constraints are present.\n"
+                                "\n");
+                        bRerunWarnNoV = FALSE;
+                    }
+                }
+            }
+            copy_mat(rerun_fr.box, state_global->box);
+            copy_mat(state_global->box, state->box);
+
+            if (vsite && (Flags & MD_RERUN_VSITE))
+            {
+                if (DOMAINDECOMP(cr))
+                {
+                    gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
+                }
+                if (graph)
+                {
+                    /* Following is necessary because the graph may get out of sync
+                     * with the coordinates if we only have every N'th coordinate set
+                     */
+                    mk_mshift(fplog, graph, fr->ePBC, state->box, state->x);
+                    shift_self(graph, state->box, state->x);
+                }
+                construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+                                 top->idef.iparams, top->idef.il,
+                                 fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+                if (graph)
+                {
+                    unshift_self(graph, state->box, state->x);
+                }
+            }
+        }
+
+        /* Stop Center of Mass motion */
+        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
+
+        /* Copy back starting coordinates in case we're doing a forcefield scan */
+        if (bFFscan)
+        {
+            for (ii = 0; (ii < state->natoms); ii++)
+            {
+                copy_rvec(xcopy[ii], state->x[ii]);
+                copy_rvec(vcopy[ii], state->v[ii]);
+            }
+            copy_mat(boxcopy, state->box);
+        }
+
+        if (bRerunMD)
+        {
+            /* for rerun MD do Neighbour Searching at every frame, unless nstlist == 0 (then only at the first) */
+            bNS      = (bFirstStep || ir->nstlist != 0);
+            bNStList = bNS;
+        }
+        else
+        {
+            /* Determine whether or not to do Neighbour Searching and LR */
+            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+
+            bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
+                   (ir->nstlist == -1 && nlh.nabnsb > 0));
+
+            if (bNS && ir->nstlist == -1)
+            {
+                set_nlistheuristics(&nlh, bFirstStep || bExchanged || bDoFEP, step);
+            }
+        }
+
+        /* check whether we should stop because another simulation has
+           stopped. */
+        if (MULTISIM(cr))
+        {
+            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&
+                 (multisim_nsteps != ir->nsteps) )
+            {
+                if (bNS)
+                {
+                    if (MASTER(cr))
+                    {
+                        fprintf(stderr,
+                                "Stopping simulation %d because another one has finished\n",
+                                cr->ms->sim);
+                    }
+                    bLastStep         = TRUE;
+                    gs.sig[eglsCHKPT] = 1;
+                }
+            }
+        }
+
+        /* < 0 means stop at next step, > 0 means stop at next NS step */
+        if ( (gs.set[eglsSTOPCOND] < 0) ||
+             ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) )
+        {
+            bLastStep = TRUE;
+        }
+
+        /* Determine whether or not to update the Born radii if doing GB */
+        bBornRadii = bFirstStep;
+        if (ir->implicit_solvent && (step % ir->nstgbradii == 0))
+        {
+            bBornRadii = TRUE;
+        }
+
+        do_log     = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep;
+        do_verbose = bVerbose &&
+            (step % stepout == 0 || bFirstStep || bLastStep);
+
+        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
+        {
+            if (bRerunMD)
+            {
+                bMasterState = TRUE;
+            }
+            else
+            {
+                bMasterState = FALSE;
+                /* Correct the new box if it is too skewed */
+                if (DYNAMIC_BOX(*ir))
+                {
+                    if (correct_box(fplog, step, state->box, graph))
+                    {
+                        bMasterState = TRUE;
+                    }
+                }
+                if (DOMAINDECOMP(cr) && bMasterState)
+                {
+                    dd_collect_state(cr->dd, state, state_global);
+                }
+            }
+
+            if (DOMAINDECOMP(cr))
+            {
+                /* Repartition the domain decomposition */
+                wallcycle_start(wcycle, ewcDOMDEC);
+                dd_partition_system(fplog, step, cr,
+                                    bMasterState, nstglobalcomm,
+                                    state_global, top_global, ir,
+                                    state, &f, mdatoms, top, fr,
+                                    vsite, shellfc, constr,
+                                    nrnb, wcycle,
+                                    do_verbose && !bPMETuneRunning);
+                wallcycle_stop(wcycle, ewcDOMDEC);
+                /* If using an iterative integrator, reallocate space to match the decomposition */
+
+                /* PLUMED */
+                if(plumedswitch){
+                  plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home);
+                  plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex);
+                }
+                /* END PLUMED */
+            }
+        }
+
+        if (MASTER(cr) && do_log && !bFFscan)
+        {
+            print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */
+        }
+
+        if (ir->efep != efepNO)
+        {
+            update_mdatoms(mdatoms, state->lambda[efptMASS]);
+        }
+
+        if ((bRerunMD && rerun_fr.bV) || bExchanged)
+        {
+
+            /* We need the kinetic energy at minus the half step for determining
+             * the full step kinetic energy and possibly for T-coupling.*/
+            /* This may not be quite working correctly yet . . . . */
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                            constr, NULL, FALSE, state->box,
+                            top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+        }
+        clear_mat(force_vir);
+
+        /* Ionize the atoms if necessary */
+        if (bIonize)
+        {
+            ionize(fplog, oenv, mdatoms, top_global, t, ir, state->x, state->v,
+                   mdatoms->start, mdatoms->start+mdatoms->homenr, state->box, cr);
+        }
+
+        /* Update force field in ffscan program */
+        if (bFFscan)
+        {
+            if (update_forcefield(fplog,
+                                  nfile, fnm, fr,
+                                  mdatoms->nr, state->x, state->box))
+            {
+                gmx_finalize_par();
+
+                exit(0);
+            }
+        }
+
+        GMX_MPE_LOG(ev_timestep2);
+
+        /* We write a checkpoint at this MD step when:
+         * either at an NS step when we signalled through gs,
+         * or at the last step (but not when we do not want confout),
+         * but never at the first step or with rerun.
+         */
+        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
+                 (bLastStep && (Flags & MD_CONFOUT))) &&
+                step > ir->init_step && !bRerunMD);
+        if (bCPT)
+        {
+            gs.set[eglsCHKPT] = 0;
+        }
+
+        /* Determine the energy and pressure:
+         * at nstcalcenergy steps and at energy output steps (set below).
+         */
+        if (EI_VV(ir->eI) && (!bInitStep))
+        {
+            /* for vv, the first half of the integration actually corresponds
+               to the previous step.  bCalcEner is only required to be evaluated on the 'next' step,
+               but the virial needs to be calculated on both the current step and the 'next' step. Future
+               reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
+
+            bCalcEner = do_per_step(step-1, ir->nstcalcenergy);
+            bCalcVir  = bCalcEner ||
+                (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+        }
+        else
+        {
+            bCalcEner = do_per_step(step, ir->nstcalcenergy);
+            bCalcVir  = bCalcEner ||
+                (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+        }
+
+        /* Do we need global communication ? */
+        bGStat = (bCalcVir || bCalcEner || bStopCM ||
+                  do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) ||
+                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
+
+        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
+
+        if (do_ene || do_log)
+        {
+            bCalcVir  = TRUE;
+            bCalcEner = TRUE;
+            bGStat    = TRUE;
+        }
+
+        /* these CGLO_ options remain the same throughout the iteration */
+        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
+                      (bGStat ? CGLO_GSTAT : 0)
+                      );
+
+        force_flags = (GMX_FORCE_STATECHANGED |
+                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
+                       GMX_FORCE_ALLFORCES |
+                       GMX_FORCE_SEPLRF |
+                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+                       (bDoFEP ? GMX_FORCE_DHDL : 0)
+                       );
+
+        if (fr->bTwinRange)
+        {
+            if (do_per_step(step, ir->nstcalclr))
+            {
+                force_flags |= GMX_FORCE_DO_LR;
+            }
+        }
+
+        if (shellfc)
+        {
+            /* Now is the time to relax the shells */
+            count = relax_shell_flexcon(fplog, cr, bVerbose, bFFscan ? step+1 : step,
+                                        ir, bNS, force_flags,
+                                        bStopCM, top, top_global,
+                                        constr, enerd, fcd,
+                                        state, f, force_vir, mdatoms,
+                                        nrnb, wcycle, graph, groups,
+                                        shellfc, fr, bBornRadii, t, mu_tot,
+                                        state->natoms, &bConverged, vsite,
+                                        outf->fp_field);
+            tcount += count;
+
+            if (bConverged)
+            {
+                nconverged++;
+            }
+        }
+        else
+        {
+            /* The coordinates (x) are shifted (to get whole molecules)
+             * in do_force.
+             * This is parallelized as well, and does communication too.
+             * Check comments in sim_util.c
+             */
+ 
+            /* PLUMED */
+            plumedNeedsEnergy=0;
+            if(plumedswitch){
+              long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep);
+              plumed_cmd(plumedmain,"setPositions",&state->x[mdatoms->start][0]);
+              plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[mdatoms->start]);
+              plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[mdatoms->start]);
+              plumed_cmd(plumedmain,"setBox",&state->box[0][0]);
+              plumed_cmd(plumedmain,"prepareCalc",NULL);
+              plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
+            }
+            /* END PLUMED */
+            do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups,
+                     state->box, state->x, &state->hist,
+                     f, force_vir, mdatoms, enerd, fcd,
+                     state->lambda, graph,
+                     fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii,
+                     (plumedNeedsEnergy? GMX_FORCE_ENERGY : 0) |(bNS ? GMX_FORCE_NS : 0) | force_flags);
+            /* PLUMED */
+            if(plumedswitch){
+              if(plumedNeedsEnergy) plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]);
+              plumed_cmd(plumedmain,"setForces",&f[mdatoms->start][0]);
+              plumed_cmd(plumedmain,"setVirial",&force_vir[0][0]);
+              plumed_cmd(plumedmain,"performCalc",NULL);
+              if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+                 do_per_step(step,repl_ex_nst)) plumed_cmd(plumedmain,"GREX savePositions",NULL);
+            }
+            /* END PLUMED */
+        }
+
+        GMX_BARRIER(cr->mpi_comm_mygroup);
+
+        if (bTCR)
+        {
+            mu_aver = calc_mu_aver(cr, state->x, mdatoms->chargeA,
+                                   mu_tot, &top_global->mols, mdatoms, gnx, grpindex);
+        }
+
+        if (bTCR && bFirstStep)
+        {
+            tcr = init_coupling(fplog, nfile, fnm, cr, fr, mdatoms, &(top->idef));
+            fprintf(fplog, "Done init_coupling\n");
+            fflush(fplog);
+        }
+
+        if (bVV && !bStartingFromCpt && !bRerunMD)
+        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
+        {
+            if (ir->eI == eiVV && bInitStep)
+            {
+                /* if using velocity verlet with full time step Ekin,
+                 * take the first half step only to compute the
+                 * virial for the first step. From there,
+                 * revert back to the initial coordinates
+                 * so that the input is actually the initial step.
+                 */
+                copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */
+            }
+            else
+            {
+                /* this is for NHC in the Ekin(t+dt/2) version of vv */
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
+            }
+
+            /* If we are using twin-range interactions where the long-range component
+             * is only evaluated every nstcalclr>1 steps, we should do a special update
+             * step to combine the long-range forces on these steps.
+             * For nstcalclr=1 this is not done, since the forces would have been added
+             * directly to the short-range forces already.
+             */
+            bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+            update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC,
+                          f, bUpdateDoLR, fr->f_twin, fcd,
+                          ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1,
+                          cr, nrnb, constr, &top->idef);
+
+            if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep)
+            {
+                gmx_iterate_init(&iterate, TRUE);
+            }
+            /* for iterations, we save these vectors, as we will be self-consistently iterating
+               the calculations */
+
+            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
+
+            /* save the state */
+            if (iterate.bIterationActive)
+            {
+                copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+            }
+
+            bFirstIterate = TRUE;
+            while (bFirstIterate || iterate.bIterationActive)
+            {
+                if (iterate.bIterationActive)
+                {
+                    copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+                    if (bFirstIterate && bTrotter)
+                    {
+                        /* The first time through, we need a decent first estimate
+                           of veta(t+dt) to compute the constraints.  Do
+                           this by computing the box volume part of the
+                           trotter integration at this time. Nothing else
+                           should be changed by this routine here.  If
+                           !(first time), we start with the previous value
+                           of veta.  */
+
+                        veta_save = state->veta;
+                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0);
+                        vetanew     = state->veta;
+                        state->veta = veta_save;
+                    }
+                }
+
+                bOK = TRUE;
+                if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
+                {
+                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+                                       state, fr->bMolPBC, graph, f,
+                                       &top->idef, shake_vir, NULL,
+                                       cr, nrnb, wcycle, upd, constr,
+                                       bInitStep, TRUE, bCalcVir, vetanew);
+
+                    if (!bOK && !bFFscan)
+                    {
+                        gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
+                    }
+
+                }
+                else if (graph)
+                {
+                    /* Need to unshift here if a do_force has been
+                       called in the previous step */
+                    unshift_self(graph, state->box, state->x);
+                }
+
+                /* if VV, compute the pressure and constraints */
+                /* For VV2, we strictly only need this if using pressure
+                 * control, but we really would like to have accurate pressures
+                 * printed out.
+                 * Think about ways around this in the future?
+                 * For now, keep this choice in comments.
+                 */
+                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+                /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+                bPres = TRUE;
+                bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+                if (bCalcEner && ir->eI == eiVVAK)  /*MRS:  7/9/2010 -- this still doesn't fix it?*/
+                {
+                    bSumEkinhOld = TRUE;
+                }
+                /* for vv, the first half of the integration actually corresponds to the previous step.
+                   So we need information from the last step in the first half of the integration */
+                if (bGStat || do_per_step(step-1, nstglobalcomm))
+                {
+                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                    constr, NULL, FALSE, state->box,
+                                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                                    cglo_flags
+                                    | CGLO_ENERGY
+                                    | (bTemp ? CGLO_TEMPERATURE : 0)
+                                    | (bPres ? CGLO_PRESSURE : 0)
+                                    | (bPres ? CGLO_CONSTRAINT : 0)
+                                    | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
+                                    | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+                                    | CGLO_SCALEEKIN
+                                    );
+                    /* explanation of above:
+                       a) We compute Ekin at the full time step
+                       if 1) we are using the AveVel Ekin, and it's not the
+                       initial step, or 2) if we are using AveEkin, but need the full
+                       time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+                       b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+                       EkinAveVel because it's needed for the pressure */
+                }
+                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+                if (!bInitStep)
+                {
+                    if (bTrotter)
+                    {
+                        m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
+                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
+                    }
+                    else
+                    {
+                        if (bExchanged)
+                        {
+
+                            /* We need the kinetic energy at minus the half step for determining
+                             * the full step kinetic energy and possibly for T-coupling.*/
+                            /* This may not be quite working correctly yet . . . . */
+                            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                                            constr, NULL, FALSE, state->box,
+                                            top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+                        }
+                    }
+                }
+
+                if (iterate.bIterationActive &&
+                    done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+                                   state->veta, &vetanew))
+                {
+                    break;
+                }
+                bFirstIterate = FALSE;
+            }
+
+            if (bTrotter && !bInitStep)
+            {
+                copy_mat(shake_vir, state->svir_prev);
+                copy_mat(force_vir, state->fvir_prev);
+                if (IR_NVT_TROTTER(ir) && ir->eI == eiVV)
+                {
+                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE, FALSE);
+                    enerd->term[F_EKIN] = trace(ekind->ekin);
+                }
+            }
+            /* if it's the initial step, we performed this first step just to get the constraint virial */
+            if (bInitStep && ir->eI == eiVV)
+            {
+                copy_rvecn(cbuf, state->v, 0, state->natoms);
+            }
+
+            GMX_MPE_LOG(ev_timestep1);
+        }
+
+        /* MRS -- now done iterating -- compute the conserved quantity */
+        if (bVV)
+        {
+            saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
+            if (ir->eI == eiVV)
+            {
+                last_ekin = enerd->term[F_EKIN];
+            }
+            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+            {
+                saved_conserved_quantity -= enerd->term[F_DISPCORR];
+            }
+            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
+            if (!bRerunMD)
+            {
+                sum_dhdl(enerd, state->lambda, ir->fepvals);
+            }
+        }
+
+        /* ########  END FIRST UPDATE STEP  ############## */
+        /* ########  If doing VV, we now have v(dt) ###### */
+        if (bDoExpanded)
+        {
+            /* perform extended ensemble sampling in lambda - we don't
+               actually move to the new state before outputting
+               statistics, but if performing simulated tempering, we
+               do update the velocities and the tau_t. */
+
+            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
+        }
+        /* ################## START TRAJECTORY OUTPUT ################# */
+
+        /* Now we have the energies and forces corresponding to the
+         * coordinates at time t. We must output all of this before
+         * the update.
+         * for RerunMD t is read from input trajectory
+         */
+        GMX_MPE_LOG(ev_output_start);
+
+        mdof_flags = 0;
+        if (do_per_step(step, ir->nstxout))
+        {
+            mdof_flags |= MDOF_X;
+        }
+        if (do_per_step(step, ir->nstvout))
+        {
+            mdof_flags |= MDOF_V;
+        }
+        if (do_per_step(step, ir->nstfout))
+        {
+            mdof_flags |= MDOF_F;
+        }
+        if (do_per_step(step, ir->nstxtcout))
+        {
+            mdof_flags |= MDOF_XTC;
+        }
+        if (bCPT)
+        {
+            mdof_flags |= MDOF_CPT;
+        }
+        ;
+
+#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
+        if (bLastStep)
+        {
+            /* Enforce writing positions and velocities at end of run */
+            mdof_flags |= (MDOF_X | MDOF_V);
+        }
+#endif
+#ifdef GMX_FAHCORE
+        if (MASTER(cr))
+        {
+            fcReportProgress( ir->nsteps, step );
+        }
+
+        /* sync bCPT and fc record-keeping */
+        if (bCPT && MASTER(cr))
+        {
+            fcRequestCheckPoint();
+        }
+#endif
+
+        if (mdof_flags != 0)
+        {
+            wallcycle_start(wcycle, ewcTRAJ);
+            if (bCPT)
+            {
+                if (state->flags & (1<<estLD_RNG))
+                {
+                    get_stochd_state(upd, state);
+                }
+                if (state->flags  & (1<<estMC_RNG))
+                {
+                    get_mc_state(mcrng, state);
+                }
+                if (MASTER(cr))
+                {
+                    if (bSumEkinhOld)
+                    {
+                        state_global->ekinstate.bUpToDate = FALSE;
+                    }
+                    else
+                    {
+                        update_ekinstate(&state_global->ekinstate, ekind);
+                        state_global->ekinstate.bUpToDate = TRUE;
+                    }
+                    update_energyhistory(&state_global->enerhist, mdebin);
+                    if (ir->efep != efepNO || ir->bSimTemp)
+                    {
+                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
+                                                                       structured so this isn't necessary.
+                                                                       Note this reassignment is only necessary
+                                                                       for single threads.*/
+                        copy_df_history(&state_global->dfhist, &df_history);
+                    }
+                }
+            }
+            write_traj(fplog, cr, outf, mdof_flags, top_global,
+                       step, t, state, state_global, f, f_global, &n_xtc, &x_xtc);
+            if (bCPT)
+            {
+                nchkpt++;
+                bCPT = FALSE;
+            }
+            debug_gmx();
+            if (bLastStep && step_rel == ir->nsteps &&
+                (Flags & MD_CONFOUT) && MASTER(cr) &&
+                !bRerunMD && !bFFscan)
+            {
+                /* x and v have been collected in write_traj,
+                 * because a checkpoint file will always be written
+                 * at the last step.
+                 */
+                fprintf(stderr, "\nWriting final coordinates.\n");
+                if (fr->bMolPBC)
+                {
+                    /* Make molecules whole only for confout writing */
+                    do_pbc_mtop(fplog, ir->ePBC, state->box, top_global, state_global->x);
+                }
+                write_sto_conf_mtop(ftp2fn(efSTO, nfile, fnm),
+                                    *top_global->name, top_global,
+                                    state_global->x, state_global->v,
+                                    ir->ePBC, state->box);
+                debug_gmx();
+            }
+            wallcycle_stop(wcycle, ewcTRAJ);
+        }
+        GMX_MPE_LOG(ev_output_finish);
+
+        /* kludge -- virial is lost with restart for NPT control. Must restore it from the state */
+        if (bStartingFromCpt && bVV)
+        {
+            copy_mat(state->svir_prev, shake_vir);
+            copy_mat(state->fvir_prev, force_vir);
+        }
+        /*  ################## END TRAJECTORY OUTPUT ################ */
+
+        /* Determine the wallclock run time up till now */
+        run_time = gmx_gettime() - (double)runtime->real;
+
+        /* Check whether everything is still all right */
+        if (((int)gmx_get_stop_condition() > handled_stop_condition)
+#ifdef GMX_THREAD_MPI
+            && MASTER(cr)
+#endif
+            )
+        {
+            /* this just makes gs.sig compatible with the hack
+               of sending signals around by MPI_Reduce together with
+               other floats */
+            if (gmx_get_stop_condition() == gmx_stop_cond_next_ns)
+            {
+                gs.sig[eglsSTOPCOND] = 1;
+            }
+            if (gmx_get_stop_condition() == gmx_stop_cond_next)
+            {
+                gs.sig[eglsSTOPCOND] = -1;
+            }
+            /* < 0 means stop at next step, > 0 means stop at next NS step */
+            if (fplog)
+            {
+                fprintf(fplog,
+                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+                        gmx_get_signal_name(),
+                        gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+                fflush(fplog);
+            }
+            fprintf(stderr,
+                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+                    gmx_get_signal_name(),
+                    gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+            fflush(stderr);
+            handled_stop_condition = (int)gmx_get_stop_condition();
+        }
+        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
+                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
+                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
+        {
+            /* Signal to terminate the run */
+            gs.sig[eglsSTOPCOND] = 1;
+            if (fplog)
+            {
+                fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+            }
+            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+        }
+
+        if (bResetCountersHalfMaxH && MASTER(cr) &&
+            run_time > max_hours*60.0*60.0*0.495)
+        {
+            gs.sig[eglsRESETCOUNTERS] = 1;
+        }
+
+        if (ir->nstlist == -1 && !bRerunMD)
+        {
+            /* When bGStatEveryStep=FALSE, global_stat is only called
+             * when we check the atom displacements, not at NS steps.
+             * This means that also the bonded interaction count check is not
+             * performed immediately after NS. Therefore a few MD steps could
+             * be performed with missing interactions.
+             * But wrong energies are never written to file,
+             * since energies are only written after global_stat
+             * has been called.
+             */
+            if (step >= nlh.step_nscheck)
+            {
+                nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs,
+                                                     nlh.scale_tot, state->x);
+            }
+            else
+            {
+                /* This is not necessarily true,
+                 * but step_nscheck is determined quite conservatively.
+                 */
+                nlh.nabnsb = 0;
+            }
+        }
+
+        /* In parallel we only have to check for checkpointing in steps
+         * where we do global communication,
+         *  otherwise the other nodes don't know.
+         */
+        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
+                           cpt_period >= 0 &&
+                           (cpt_period == 0 ||
+                            run_time >= nchkpt*cpt_period*60.0)) &&
+            gs.set[eglsCHKPT] == 0)
+        {
+            gs.sig[eglsCHKPT] = 1;
+        }
+
+        /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
+        if (EI_VV(ir->eI))
+        {
+            if (!bInitStep)
+            {
+                update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+            }
+            if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+            {
+                gmx_bool bIfRandomize;
+                bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
+                /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+                if (constr && bIfRandomize)
+                {
+                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+                                       state, fr->bMolPBC, graph, f,
+                                       &top->idef, tmp_vir, NULL,
+                                       cr, nrnb, wcycle, upd, constr,
+                                       bInitStep, TRUE, bCalcVir, vetanew);
+                }
+            }
+        }
+
+        if (bIterativeCase && do_per_step(step, ir->nstpcouple))
+        {
+            gmx_iterate_init(&iterate, TRUE);
+            /* for iterations, we save these vectors, as we will be redoing the calculations */
+            copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+        }
+
+        bFirstIterate = TRUE;
+        while (bFirstIterate || iterate.bIterationActive)
+        {
+            /* We now restore these vectors to redo the calculation with improved extended variables */
+            if (iterate.bIterationActive)
+            {
+                copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+            }
+
+            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
+               so scroll down for that logic */
+
+            /* #########   START SECOND UPDATE STEP ################# */
+            GMX_MPE_LOG(ev_update_start);
+            /* Box is changed in update() when we do pressure coupling,
+             * but we should still use the old box for energy corrections and when
+             * writing it to the energy file, so it matches the trajectory files for
+             * the same timestep above. Make a copy in a separate array.
+             */
+            copy_mat(state->box, lastbox);
+
+            bOK = TRUE;
+            dvdl_constr = 0;
+
+            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
+            {
+                wallcycle_start(wcycle, ewcUPDATE);
+                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+                if (bTrotter)
+                {
+                    if (iterate.bIterationActive)
+                    {
+                        if (bFirstIterate)
+                        {
+                            scalevir = 1;
+                        }
+                        else
+                        {
+                            /* we use a new value of scalevir to converge the iterations faster */
+                            scalevir = tracevir/trace(shake_vir);
+                        }
+                        msmul(shake_vir, scalevir, shake_vir);
+                        m_add(force_vir, shake_vir, total_vir);
+                        clear_mat(shake_vir);
+                    }
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+                    /* We can only do Berendsen coupling after we have summed
+                     * the kinetic energy or virial. Since this happens
+                     * in global_state after update, we should only do it at
+                     * step % nstlist = 1 with bGStatEveryStep=FALSE.
+                     */
+                }
+                else
+                {
+                    update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+                    update_pcouple(fplog, step, ir, state, pcoupl_mu, M, wcycle,
+                                   upd, bInitStep);
+                }
+
+                if (bVV)
+                {
+                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+                    /* velocity half-step update */
+                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                                  bUpdateDoLR, fr->f_twin, fcd,
+                                  ekind, M, wcycle, upd, FALSE, etrtVELOCITY2,
+                                  cr, nrnb, constr, &top->idef);
+                }
+
+                /* Above, initialize just copies ekinh into ekin,
+                 * it doesn't copy position (for VV),
+                 * and entire integrator for MD.
+                 */
+
+                if (ir->eI == eiVVAK)
+                {
+                    copy_rvecn(state->x, cbuf, 0, state->natoms);
+                }
+                bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+                update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                              bUpdateDoLR, fr->f_twin, fcd,
+                              ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+                wallcycle_stop(wcycle, ewcUPDATE);
+
+                update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state,
+                                   fr->bMolPBC, graph, f,
+                                   &top->idef, shake_vir, force_vir,
+                                   cr, nrnb, wcycle, upd, constr,
+                                   bInitStep, FALSE, bCalcVir, state->veta);
+
+                if (ir->eI == eiVVAK)
+                {
+                    /* erase F_EKIN and F_TEMP here? */
+                    /* just compute the kinetic energy at the half step to perform a trotter step */
+                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                    constr, NULL, FALSE, lastbox,
+                                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                                    cglo_flags | CGLO_TEMPERATURE
+                                    );
+                    wallcycle_start(wcycle, ewcUPDATE);
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+                    /* now we know the scaling, we can compute the positions again */
+                    copy_rvecn(cbuf, state->x, 0, state->natoms);
+
+                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                                  bUpdateDoLR, fr->f_twin, fcd,
+                                  ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+                    wallcycle_stop(wcycle, ewcUPDATE);
+
+                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+                    /* are the small terms in the shake_vir here due
+                     * to numerical errors, or are they important
+                     * physically? I'm thinking they are just errors, but not completely sure.
+                     * For now, will call without actually constraining, constr=NULL*/
+                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+                                       state, fr->bMolPBC, graph, f,
+                                       &top->idef, tmp_vir, force_vir,
+                                       cr, nrnb, wcycle, upd, NULL,
+                                       bInitStep, FALSE, bCalcVir,
+                                       state->veta);
+                }
+                if (!bOK && !bFFscan)
+                {
+                    gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
+                }
+
+                if (fr->bSepDVDL && fplog && do_log)
+                {
+                    fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl_constr);
+                }
+                if (bVV)
+                {
+                    /* this factor of 2 correction is necessary
+                       because half of the constraint force is removed
+                       in the vv step, so we have to double it.  See
+                       the Redmine issue #1255.  It is not yet clear
+                       if the factor of 2 is exact, or just a very
+                       good approximation, and this will be
+                       investigated.  The next step is to see if this
+                       can be done adding a dhdl contribution from the
+                       rattle step, but this is somewhat more
+                       complicated with the current code. Will be
+                       investigated, hopefully for 4.6.3. However,
+                       this current solution is much better than
+                       having it completely wrong.
+                    */
+                    enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+                }
+                else
+                {
+                    enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+                }
+            }
+            else if (graph)
+            {
+                /* Need to unshift here */
+                unshift_self(graph, state->box, state->x);
+            }
+
+            GMX_BARRIER(cr->mpi_comm_mygroup);
+            GMX_MPE_LOG(ev_update_finish);
+
+            if (vsite != NULL)
+            {
+                wallcycle_start(wcycle, ewcVSITECONSTR);
+                if (graph != NULL)
+                {
+                    shift_self(graph, state->box, state->x);
+                }
+                construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+                                 top->idef.iparams, top->idef.il,
+                                 fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+
+                if (graph != NULL)
+                {
+                    unshift_self(graph, state->box, state->x);
+                }
+                wallcycle_stop(wcycle, ewcVSITECONSTR);
+            }
+
+            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints  ############ */
+            /* With Leap-Frog we can skip compute_globals at
+             * non-communication steps, but we need to calculate
+             * the kinetic energy one step before communication.
+             */
+            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+            {
+                if (ir->nstlist == -1 && bFirstIterate)
+                {
+                    gs.sig[eglsNABNSB] = nlh.nabnsb;
+                }
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr,
+                                bFirstIterate ? &gs : NULL,
+                                (step_rel % gs.nstms == 0) &&
+                                (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+                                lastbox,
+                                top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                                cglo_flags
+                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+                                | (iterate.bIterationActive ? CGLO_ITERATE : 0)
+                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+                                | CGLO_CONSTRAINT
+                                );
+                if (ir->nstlist == -1 && bFirstIterate)
+                {
+                    nlh.nabnsb         = gs.set[eglsNABNSB];
+                    gs.set[eglsNABNSB] = 0;
+                }
+            }
+            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
+            /* #############  END CALC EKIN AND PRESSURE ################# */
+
+            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+               the virial that should probably be addressed eventually. state->veta has better properties,
+               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+
+            if (iterate.bIterationActive &&
+                done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+                               trace(shake_vir), &tracevir))
+            {
+                break;
+            }
+            bFirstIterate = FALSE;
+        }
+
+        if (!bVV || bRerunMD)
+        {
+            /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
+            sum_dhdl(enerd, state->lambda, ir->fepvals);
+        }
+        update_box(fplog, step, ir, mdatoms, state, graph, f,
+                   ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE);
+
+        /* ################# END UPDATE STEP 2 ################# */
+        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
+
+        /* The coordinates (x) were unshifted in update */
+        if (bFFscan && (shellfc == NULL || bConverged))
+        {
+            if (print_forcefield(fplog, enerd->term, mdatoms->homenr,
+                                 f, NULL, xcopy,
+                                 &(top_global->mols), mdatoms->massT, pres))
+            {
+                gmx_finalize_par();
+
+                fprintf(stderr, "\n");
+                exit(0);
+            }
+        }
+        if (!bGStat)
+        {
+            /* We will not sum ekinh_old,
+             * so signal that we still have to do it.
+             */
+            bSumEkinhOld = TRUE;
+        }
+
+        if (bTCR)
+        {
+            /* Only do GCT when the relaxation of shells (minimization) has converged,
+             * otherwise we might be coupling to bogus energies.
+             * In parallel we must always do this, because the other sims might
+             * update the FF.
+             */
+
+            /* Since this is called with the new coordinates state->x, I assume
+             * we want the new box state->box too. / EL 20040121
+             */
+            do_coupling(fplog, oenv, nfile, fnm, tcr, t, step, enerd->term, fr,
+                        ir, MASTER(cr),
+                        mdatoms, &(top->idef), mu_aver,
+                        top_global->mols.nr, cr,
+                        state->box, total_vir, pres,
+                        mu_tot, state->x, f, bConverged);
+            debug_gmx();
+        }
+
+        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
+
+        /* use the directly determined last velocity, not actually the averaged half steps */
+        if (bTrotter && ir->eI == eiVV)
+        {
+            enerd->term[F_EKIN] = last_ekin;
+        }
+        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+        if (bVV)
+        {
+            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+        }
+        else
+        {
+            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ);
+        }
+        /* Check for excessively large energies */
+        if (bIonize)
+        {
+#ifdef GMX_DOUBLE
+            real etot_max = 1e200;
+#else
+            real etot_max = 1e30;
+#endif
+            if (fabs(enerd->term[F_ETOT]) > etot_max)
+            {
+                fprintf(stderr, "Energy too large (%g), giving up\n",
+                        enerd->term[F_ETOT]);
+            }
+        }
+        /* #########  END PREPARING EDR OUTPUT  ###########  */
+
+        /* Time for performance */
+        if (((step % stepout) == 0) || bLastStep)
+        {
+            runtime_upd_proc(runtime);
+        }
+
+        /* Output stuff */
+        if (MASTER(cr))
+        {
+            gmx_bool do_dr, do_or;
+
+            if (fplog && do_log && bDoExpanded)
+            {
+                /* only needed if doing expanded ensemble */
+                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
+                                          &df_history, state->fep_state, ir->nstlog, step);
+            }
+            if (!(bStartingFromCpt && (EI_VV(ir->eI))))
+            {
+                if (bCalcEner)
+                {
+                    upd_mdebin(mdebin, bDoDHDL, TRUE,
+                               t, mdatoms->tmass, enerd, state,
+                               ir->fepvals, ir->expandedvals, lastbox,
+                               shake_vir, force_vir, total_vir, pres,
+                               ekind, mu_tot, constr);
+                }
+                else
+                {
+                    upd_mdebin_step(mdebin);
+                }
+
+                do_dr  = do_per_step(step, ir->nstdisreout);
+                do_or  = do_per_step(step, ir->nstorireout);
+
+                print_ebin(outf->fp_ene, do_ene, do_dr, do_or, do_log ? fplog : NULL,
+                           step, t,
+                           eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts));
+            }
+            if (ir->ePull != epullNO)
+            {
+                pull_print_output(ir->pull, step, t);
+            }
+
+            if (do_per_step(step, ir->nstlog))
+            {
+                if (fflush(fplog) != 0)
+                {
+                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+                }
+            }
+        }
+        if (bDoExpanded)
+        {
+            /* Have to do this part after outputting the logfile and the edr file */
+            state->fep_state = lamnew;
+            for (i = 0; i < efptNR; i++)
+            {
+                state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+            }
+        }
+        /* Remaining runtime */
+        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+        {
+            if (shellfc)
+            {
+                fprintf(stderr, "\n");
+            }
+            print_time(stderr, runtime, step, ir, cr);
+        }
+
+        /* Replica exchange */
+        bExchanged = FALSE;
+        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+            do_per_step(step, repl_ex_nst))
+        {
+            bExchanged = replica_exchange(fplog, cr, repl_ex,
+                                          state_global, enerd,
+                                          state, step, t);
+
+            if (bExchanged && DOMAINDECOMP(cr))
+            {
+                dd_partition_system(fplog, step, cr, TRUE, 1,
+                                    state_global, top_global, ir,
+                                    state, &f, mdatoms, top, fr,
+                                    vsite, shellfc, constr,
+                                    nrnb, wcycle, FALSE);
+            }
+        }
+
+        bFirstStep       = FALSE;
+        bInitStep        = FALSE;
+        bStartingFromCpt = FALSE;
+
+        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+        /* With all integrators, except VV, we need to retain the pressure
+         * at the current step for coupling at the next step.
+         */
+        if ((state->flags & (1<<estPRES_PREV)) &&
+            (bGStatEveryStep ||
+             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+        {
+            /* Store the pressure in t_state for pressure coupling
+             * at the next MD step.
+             */
+            copy_mat(pres, state->pres_prev);
+        }
+
+        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+
+        if ( (membed != NULL) && (!bLastStep) )
+        {
+            rescale_membed(step_rel, membed, state_global->x);
+        }
+
+        if (bRerunMD)
+        {
+            if (MASTER(cr))
+            {
+                /* read next frame from input trajectory */
+                bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
+            }
+
+            if (PAR(cr))
+            {
+                rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+            }
+        }
+
+        if (!bRerunMD || !rerun_fr.bStep)
+        {
+            /* increase the MD step number */
+            step++;
+            step_rel++;
+        }
+
+        cycles = wallcycle_stop(wcycle, ewcSTEP);
+        if (DOMAINDECOMP(cr) && wcycle)
+        {
+            dd_cycles_add(cr->dd, cycles, ddCyclStep);
+        }
+
+        if (bPMETuneRunning || bPMETuneTry)
+        {
+            /* PME grid + cut-off optimization with GPUs or PME nodes */
+
+            /* Count the total cycles over the last steps */
+            cycles_pmes += cycles;
+
+            /* We can only switch cut-off at NS steps */
+            if (step % ir->nstlist == 0)
+            {
+                /* PME grid + cut-off optimization with GPUs or PME nodes */
+                if (bPMETuneTry)
+                {
+                    if (DDMASTER(cr->dd))
+                    {
+                        /* PME node load is too high, start tuning */
+                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
+                    }
+                    dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
+
+                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
+                    {
+                        bPMETuneTry     = FALSE;
+                    }
+                }
+                if (bPMETuneRunning)
+                {
+                    /* init_step might not be a multiple of nstlist,
+                     * but the first cycle is always skipped anyhow.
+                     */
+                    bPMETuneRunning =
+                        pme_load_balance(pme_loadbal, cr,
+                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
+                                         fplog,
+                                         ir, state, cycles_pmes,
+                                         fr->ic, fr->nbv, &fr->pmedata,
+                                         step);
+
+                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
+                    fr->ewaldcoeff = fr->ic->ewaldcoeff;
+                    fr->rlist      = fr->ic->rlist;
+                    fr->rlistlong  = fr->ic->rlistlong;
+                    fr->rcoulomb   = fr->ic->rcoulomb;
+                    fr->rvdw       = fr->ic->rvdw;
+                }
+                cycles_pmes = 0;
+            }
+        }
+
+        if (step_rel == wcycle_get_reset_counters(wcycle) ||
+            gs.set[eglsRESETCOUNTERS] != 0)
+        {
+            /* Reset all the counters related to performance over the run */
+            reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, runtime,
+                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+            wcycle_set_reset_counters(wcycle, -1);
+            if (!(cr->duty & DUTY_PME))
+            {
+                /* Tell our PME node to reset its counters */
+                gmx_pme_send_resetcounters(cr, step);
+            }
+            /* Correct max_hours for the elapsed time */
+            max_hours                -= run_time/(60.0*60.0);
+            bResetCountersHalfMaxH    = FALSE;
+            gs.set[eglsRESETCOUNTERS] = 0;
+        }
+
+    }
+    /* End of main MD loop */
+    debug_gmx();
+
+    /* Stop the time */
+    runtime_end(runtime);
+
+    if (bRerunMD && MASTER(cr))
+    {
+        close_trj(status);
+    }
+
+    if (!(cr->duty & DUTY_PME))
+    {
+        /* Tell the PME only node to finish */
+        gmx_pme_send_finish(cr);
+    }
+
+    if (MASTER(cr))
+    {
+        if (ir->nstcalcenergy > 0 && !bRerunMD)
+        {
+            print_ebin(outf->fp_ene, FALSE, FALSE, FALSE, fplog, step, t,
+                       eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts));
+        }
+    }
+
+    done_mdoutf(outf);
+
+    debug_gmx();
+
+    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
+    {
+        fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
+        fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns);
+    }
+
+    if (pme_loadbal != NULL)
+    {
+        pme_loadbal_done(pme_loadbal, cr, fplog,
+                         fr->nbv != NULL && fr->nbv->bUseGPU);
+    }
+
+    if (shellfc && fplog)
+    {
+        fprintf(fplog, "Fraction of iterations that converged:           %.2f %%\n",
+                (nconverged*100.0)/step_rel);
+        fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n",
+                tcount/step_rel);
+    }
+
+    if (repl_ex_nst > 0 && MASTER(cr))
+    {
+        print_replica_exchange_statistics(fplog, repl_ex);
+    }
+
+    runtime->nsteps_done = step_rel;
+
+    return 0;
+}
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/md.c.preplumed b/patches/gromacs-4.6.3.diff/src/kernel/md.c.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..f82e6974ca5095ff0eb4b7268ddf9b63798ca64d
--- /dev/null
+++ b/patches/gromacs-4.6.3.diff/src/kernel/md.c.preplumed
@@ -0,0 +1,2260 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "sysstuff.h"
+#include "vec.h"
+#include "statutil.h"
+#include "vcm.h"
+#include "mdebin.h"
+#include "nrnb.h"
+#include "calcmu.h"
+#include "index.h"
+#include "vsite.h"
+#include "update.h"
+#include "ns.h"
+#include "trnio.h"
+#include "xtcio.h"
+#include "mdrun.h"
+#include "md_support.h"
+#include "md_logging.h"
+#include "confio.h"
+#include "network.h"
+#include "pull.h"
+#include "xvgr.h"
+#include "physics.h"
+#include "names.h"
+#include "xmdrun.h"
+#include "ionize.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "qmmm.h"
+#include "mpelogging.h"
+#include "domdec.h"
+#include "domdec_network.h"
+#include "partdec.h"
+#include "topsort.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "shellfc.h"
+#include "compute_io.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "txtdump.h"
+#include "string2.h"
+#include "pme_loadbal.h"
+#include "bondf.h"
+#include "membed.h"
+#include "types/nlistheuristics.h"
+#include "types/iteratedconstraints.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+#ifdef GMX_LIB_MPI
+#include <mpi.h>
+#endif
+#ifdef GMX_THREAD_MPI
+#include "tmpi.h"
+#endif
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+static void reset_all_counters(FILE *fplog, t_commrec *cr,
+                               gmx_large_int_t step,
+                               gmx_large_int_t *step_rel, t_inputrec *ir,
+                               gmx_wallcycle_t wcycle, t_nrnb *nrnb,
+                               gmx_runtime_t *runtime,
+                               nbnxn_cuda_ptr_t cu_nbv)
+{
+    char sbuf[STEPSTRSIZE]; /* receives the step number formatted by gmx_step_str() */
+
+    /* Reset all the counters related to performance over the run; first report at which step this happens */
+    md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
+                  gmx_step_str(step, sbuf));
+    /* The CUDA nonbonded timings are only reset when a cu_nbv handle was passed in */
+    if (cu_nbv)
+    {
+        nbnxn_cuda_reset_timings(cu_nbv);
+    }
+    /* Stop the ewcRUN cycle counter before resetting all cycle counts; it is started again below */
+    wallcycle_stop(wcycle, ewcRUN);
+    wallcycle_reset_all(wcycle);
+    if (DOMAINDECOMP(cr))
+    {
+        reset_dd_statistics_counters(cr->dd);
+    }
+    init_nrnb(nrnb);
+    ir->init_step += *step_rel; /* move the start step forward by the steps counted so far ... */
+    ir->nsteps    -= *step_rel; /* ... and take the same number off the step count, so their sum is unchanged */
+    *step_rel      = 0;         /* the caller's relative step counter restarts from zero */
+    wallcycle_start(wcycle, ewcRUN);
+    runtime_start(runtime);
+    print_date_and_time(fplog, cr->nodeid, "Restarted time", runtime);
+}
+
+double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
+             const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
+             int nstglobalcomm,
+             gmx_vsite_t *vsite, gmx_constr_t constr,
+             int stepout, t_inputrec *ir,
+             gmx_mtop_t *top_global,
+             t_fcdata *fcd,
+             t_state *state_global,
+             t_mdatoms *mdatoms,
+             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+             gmx_edsam_t ed, t_forcerec *fr,
+             int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed,
+             real cpt_period, real max_hours,
+             const char *deviceOptions,
+             unsigned long Flags,
+             gmx_runtime_t *runtime)
+{
+    gmx_mdoutf_t   *outf;
+    gmx_large_int_t step, step_rel;
+    double          run_time;
+    double          t, t0, lam0[efptNR];
+    gmx_bool        bGStatEveryStep, bGStat, bCalcVir, bCalcEner;
+    gmx_bool        bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE,
+                    bFirstStep, bStateFromCP, bStateFromTPX, bInitStep, bLastStep,
+                    bBornRadii, bStartingFromCpt;
+    gmx_bool          bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
+    gmx_bool          do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE,
+                      bForceUpdate = FALSE, bCPT;
+    int               mdof_flags;
+    gmx_bool          bMasterState;
+    int               force_flags, cglo_flags;
+    tensor            force_vir, shake_vir, total_vir, tmp_vir, pres;
+    int               i, m;
+    t_trxstatus      *status;
+    rvec              mu_tot;
+    t_vcm            *vcm;
+    t_state          *bufstate = NULL;
+    matrix           *scale_tot, pcoupl_mu, M, ebox;
+    gmx_nlheur_t      nlh;
+    t_trxframe        rerun_fr;
+    gmx_repl_ex_t     repl_ex = NULL;
+    int               nchkpt  = 1;
+    gmx_localtop_t   *top;
+    t_mdebin         *mdebin = NULL;
+    df_history_t      df_history;
+    t_state          *state    = NULL;
+    rvec             *f_global = NULL;
+    int               n_xtc    = -1;
+    rvec             *x_xtc    = NULL;
+    gmx_enerdata_t   *enerd;
+    rvec             *f = NULL;
+    gmx_global_stat_t gstat;
+    gmx_update_t      upd   = NULL;
+    t_graph          *graph = NULL;
+    globsig_t         gs;
+    gmx_rng_t         mcrng = NULL;
+    gmx_bool          bFFscan;
+    gmx_groups_t     *groups;
+    gmx_ekindata_t   *ekind, *ekind_save;
+    gmx_shellfc_t     shellfc;
+    int               count, nconverged = 0;
+    real              timestep = 0;
+    double            tcount   = 0;
+    gmx_bool          bIonize  = FALSE;
+    gmx_bool          bTCR     = FALSE, bConverged = TRUE, bOK, bSumEkinhOld, bExchanged;
+    gmx_bool          bAppend;
+    gmx_bool          bResetCountersHalfMaxH = FALSE;
+    gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
+    gmx_bool          bUpdateDoLR;
+    real              mu_aver = 0, dvdl_constr;
+    int               a0, a1, gnx = 0, ii;
+    atom_id          *grpindex = NULL;
+    char             *grpname;
+    t_coupl_rec      *tcr     = NULL;
+    rvec             *xcopy   = NULL, *vcopy = NULL, *cbuf = NULL;
+    matrix            boxcopy = {{0}}, lastbox;
+    tensor            tmpvir;
+    real              fom, oldfom, veta_save, pcurr, scalevir, tracevir;
+    real              vetanew = 0;
+    int               lamnew  = 0;
+    /* for FEP */
+    int               nstfep;
+    real              rate;
+    double            cycles;
+    real              saved_conserved_quantity = 0;
+    real              last_ekin                = 0;
+    int               iter_i;
+    t_extmass         MassQ;
+    int             **trotter_seq;
+    char              sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
+    int               handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/
+    gmx_iterate_t     iterate;
+    gmx_large_int_t   multisim_nsteps = -1;                        /* number of steps to do  before first multisim
+                                                                      simulation stops. If equal to zero, don't
+                                                                      communicate any more between multisims.*/
+    /* PME load balancing data for GPU kernels */
+    pme_load_balancing_t pme_loadbal = NULL;
+    double               cycles_pmes;
+    gmx_bool             bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
+
+#ifdef GMX_FAHCORE
+    /* Temporary addition for FAHCORE checkpointing */
+    int chkpt_ret;
+#endif
+
+    /* Check for special mdrun options */
+    bRerunMD = (Flags & MD_RERUN);
+    bIonize  = (Flags & MD_IONIZE);
+    bFFscan  = (Flags & MD_FFSCAN);
+    bAppend  = (Flags & MD_APPENDFILES);
+    if (Flags & MD_RESETCOUNTERSHALFWAY)
+    {
+        if (ir->nsteps > 0)
+        {
+            /* Signal to reset the counters half the simulation steps. */
+            wcycle_set_reset_counters(wcycle, ir->nsteps/2);
+        }
+        /* Signal to reset the counters halfway the simulation time. */
+        bResetCountersHalfMaxH = (max_hours > 0);
+    }
+
+    /* md-vv uses averaged full step velocities for T-control
+       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+    bVV = EI_VV(ir->eI);
+    if (bVV) /* to store the initial velocities while computing virial */
+    {
+        snew(cbuf, top_global->natoms);
+    }
+    /* all the iteratative cases - only if there are constraints */
+    bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
+    gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
+                                          false in this step.  The correct value, true or false,
+                                          is set at each step, as it depends on the frequency of temperature
+                                          and pressure control.*/
+    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
+
+    if (bRerunMD)
+    {
+        /* Since we don't know if the frames read are related in any way,
+         * rebuild the neighborlist at every step.
+         */
+        ir->nstlist       = 1;
+        ir->nstcalcenergy = 1;
+        nstglobalcomm     = 1;
+    }
+
+    check_ir_old_tpx_versions(cr, fplog, ir, top_global);
+
+    nstglobalcomm   = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
+    bGStatEveryStep = (nstglobalcomm == 1);
+
+    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
+    {
+        fprintf(fplog,
+                "To reduce the energy communication with nstlist = -1\n"
+                "the neighbor list validity should not be checked at every step,\n"
+                "this means that exact integration is not guaranteed.\n"
+                "The neighbor list validity is checked after:\n"
+                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
+                "In most cases this will result in exact integration.\n"
+                "This reduces the energy communication by a factor of 2 to 3.\n"
+                "If you want less energy communication, set nstlist > 3.\n\n");
+    }
+
+    if (bRerunMD || bFFscan)
+    {
+        ir->nstxtcout = 0;
+    }
+    groups = &top_global->groups;
+
+    /* Initial values */
+    init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda,
+            &(state_global->fep_state), lam0,
+            nrnb, top_global, &upd,
+            nfile, fnm, &outf, &mdebin,
+            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, state_global, Flags);
+
+    clear_mat(total_vir);
+    clear_mat(pres);
+    /* Energy terms and groups */
+    snew(enerd, 1);
+    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+                  enerd);
+    if (DOMAINDECOMP(cr))
+    {
+        f = NULL;
+    }
+    else
+    {
+        snew(f, top_global->natoms);
+    }
+
+    /* lambda Monte carlo random number generator  */
+    if (ir->bExpanded)
+    {
+        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
+    }
+    /* copy the state into df_history */
+    copy_df_history(&df_history, &state_global->dfhist);
+
+    /* Kinetic energy data */
+    snew(ekind, 1);
+    init_ekindata(fplog, top_global, &(ir->opts), ekind);
+    /* needed for iteration of constraints */
+    snew(ekind_save, 1);
+    init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
+    /* Copy the cos acceleration to the groups struct */
+    ekind->cosacc.cos_accel = ir->cos_accel;
+
+    gstat = global_stat_init(ir);
+    debug_gmx();
+
+    /* Check for polarizable models and flexible constraints */
+    shellfc = init_shell_flexcon(fplog,
+                                 top_global, n_flexible_constraints(constr),
+                                 (ir->bContinuation ||
+                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
+                                 NULL : state_global->x);
+
+    if (DEFORM(*ir))
+    {
+#ifdef GMX_THREAD_MPI
+        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+#endif
+        set_deform_reference_box(upd,
+                                 deform_init_init_step_tpx,
+                                 deform_init_box_tpx);
+#ifdef GMX_THREAD_MPI
+        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+#endif
+    }
+
+    {
+        double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+        if ((io > 2000) && MASTER(cr))
+        {
+            fprintf(stderr,
+                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+                    io);
+        }
+    }
+
+    if (DOMAINDECOMP(cr))
+    {
+        top = dd_init_local_top(top_global);
+
+        snew(state, 1);
+        dd_init_local_state(cr->dd, state_global, state);
+
+        if (DDMASTER(cr->dd) && ir->nstfout)
+        {
+            snew(f_global, state_global->natoms);
+        }
+    }
+    else
+    {
+        if (PAR(cr))
+        {
+            /* Initialize the particle decomposition and split the topology */
+            top = split_system(fplog, top_global, ir, cr);
+
+            pd_cg_range(cr, &fr->cg0, &fr->hcg);
+            pd_at_range(cr, &a0, &a1);
+        }
+        else
+        {
+            top = gmx_mtop_generate_local_top(top_global, ir);
+
+            a0 = 0;
+            a1 = top_global->natoms;
+        }
+
+        forcerec_set_excl_load(fr, top, cr);
+
+        state    = partdec_init_local_state(cr, state_global);
+        f_global = f;
+
+        atoms2md(top_global, ir, 0, NULL, a0, a1-a0, mdatoms);
+
+        if (vsite)
+        {
+            set_vsite_top(vsite, top, mdatoms, cr);
+        }
+
+        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
+        {
+            graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE);
+        }
+
+        if (shellfc)
+        {
+            make_local_shells(cr, mdatoms, shellfc);
+        }
+
+        init_bonded_thread_force_reduction(fr, &top->idef);
+
+        if (ir->pull && PAR(cr))
+        {
+            dd_make_local_pull_groups(NULL, ir->pull, mdatoms);
+        }
+    }
+
+    if (DOMAINDECOMP(cr))
+    {
+        /* Distribute the charge groups over the nodes from the master node */
+        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
+                            state_global, top_global, ir,
+                            state, &f, mdatoms, top, fr,
+                            vsite, shellfc, constr,
+                            nrnb, wcycle, FALSE);
+
+    }
+
+    update_mdatoms(mdatoms, state->lambda[efptMASS]);
+
+    if (opt2bSet("-cpi", nfile, fnm))
+    {
+        bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr);
+    }
+    else
+    {
+        bStateFromCP = FALSE;
+    }
+
+    if (MASTER(cr))
+    {
+        if (bStateFromCP)
+        {
+            /* Update mdebin with energy history if appending to output files */
+            if (Flags & MD_APPENDFILES)
+            {
+                restore_energyhistory_from_state(mdebin, &state_global->enerhist);
+            }
+            else
+            {
+                /* We might have read an energy history from checkpoint,
+                 * free the allocated memory and reset the counts.
+                 */
+                done_energyhistory(&state_global->enerhist);
+                init_energyhistory(&state_global->enerhist);
+            }
+        }
+        /* Set the initial energy history in state by updating once */
+        update_energyhistory(&state_global->enerhist, mdebin);
+    }
+
+    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
+    {
+        /* Set the random state if we read a checkpoint file */
+        set_stochd_state(upd, state);
+    }
+
+    if (state->flags & (1<<estMC_RNG))
+    {
+        set_mc_state(mcrng, state);
+    }
+
+    /* Initialize constraints */
+    if (constr)
+    {
+        if (!DOMAINDECOMP(cr))
+        {
+            set_constraints(constr, top, ir, mdatoms, cr);
+        }
+    }
+
+    /* Check whether we have to GCT stuff */
+    bTCR = ftp2bSet(efGCT, nfile, fnm);
+    if (bTCR)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Will do General Coupling Theory!\n");
+        }
+        gnx = top_global->mols.nr;
+        snew(grpindex, gnx);
+        for (i = 0; (i < gnx); i++)
+        {
+            grpindex[i] = i;
+        }
+    }
+
+    if (repl_ex_nst > 0)
+    {
+        /* We need to be sure replica exchange can only occur
+         * when the energies are current */
+        check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
+                        "repl_ex_nst", &repl_ex_nst);
+        /* This check needs to happen before inter-simulation
+         * signals are initialized, too */
+    }
+    if (repl_ex_nst > 0 && MASTER(cr))
+    {
+        repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
+                                        repl_ex_nst, repl_ex_nex, repl_ex_seed);
+    }
+
+    /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
+     * With perturbed charges with soft-core we should not change the cut-off.
+     */
+    if ((Flags & MD_TUNEPME) &&
+        EEL_PME(fr->eeltype) &&
+        ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
+        !(ir->efep != efepNO && mdatoms->nChargePerturbed > 0 && ir->fepvals->bScCoul) &&
+        !bRerunMD)
+    {
+        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
+        cycles_pmes = 0;
+        if (cr->duty & DUTY_PME)
+        {
+            /* Start tuning right away, as we can't measure the load */
+            bPMETuneRunning = TRUE;
+        }
+        else
+        {
+            /* Separate PME nodes, we can measure the PP/PME load balance */
+            bPMETuneTry = TRUE;
+        }
+    }
+
+    if (!ir->bContinuation && !bRerunMD)
+    {
+        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
+        {
+            /* Set the velocities of frozen particles to zero */
+            for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
+            {
+                for (m = 0; m < DIM; m++)
+                {
+                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+                    {
+                        state->v[i][m] = 0;
+                    }
+                }
+            }
+        }
+
+        if (constr)
+        {
+            /* Constrain the initial coordinates and velocities */
+            do_constrain_first(fplog, constr, ir, mdatoms, state, f,
+                               graph, cr, nrnb, fr, top, shake_vir);
+        }
+        if (vsite)
+        {
+            /* Construct the virtual sites for the initial configuration */
+            construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, NULL,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+        }
+    }
+
+    debug_gmx();
+
+    /* set free energy calculation frequency as the minimum
+       greatest common denominator of nstdhdl, nstexpanded, and repl_ex_nst*/
+    nstfep = ir->fepvals->nstdhdl;
+    if (ir->bExpanded)
+    {
+        nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl,nstfep);
+    }
+    if (repl_ex_nst > 0)
+    {
+        nstfep = gmx_greatest_common_divisor(repl_ex_nst,nstfep);
+    }
+
+    /* I'm assuming we need global communication the first time! MRS */
+    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
+                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
+                  | (bVV ? CGLO_PRESSURE : 0)
+                  | (bVV ? CGLO_CONSTRAINT : 0)
+                  | (bRerunMD ? CGLO_RERUNMD : 0)
+                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0));
+
+    bSumEkinhOld = FALSE;
+    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                    NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                    constr, NULL, FALSE, state->box,
+                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld, cglo_flags);
+    if (ir->eI == eiVVAK)
+    {
+        /* a second call to get the half step temperature initialized as well */
+        /* we do the same call as above, but turn the pressure off -- internally to
+           compute_globals, this is recognized as a velocity verlet half-step
+           kinetic energy calculation.  This minimized excess variables, but
+           perhaps loses some logic?*/
+
+        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                        NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                        constr, NULL, FALSE, state->box,
+                        top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                        cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE));
+    }
+
+    /* Calculate the initial half step temperature, and save the ekinh_old */
+    if (!(Flags & MD_STARTFROMCPT))
+    {
+        for (i = 0; (i < ir->opts.ngtc); i++)
+        {
+            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+        }
+    }
+    if (ir->eI != eiVV)
+    {
+        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
+                                     and there is no previous step */
+    }
+
+    /* if using an iterative algorithm, we need to create a working directory for the state. */
+    if (bIterativeCase)
+    {
+        bufstate = init_bufstate(state);
+    }
+    if (bFFscan)
+    {
+        snew(xcopy, state->natoms);
+        snew(vcopy, state->natoms);
+        copy_rvecn(state->x, xcopy, 0, state->natoms);
+        copy_rvecn(state->v, vcopy, 0, state->natoms);
+        copy_mat(state->box, boxcopy);
+    }
+
+    /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
+       temperature control */
+    trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
+
+    if (MASTER(cr))
+    {
+        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
+        {
+            fprintf(fplog,
+                    "RMS relative constraint deviation after constraining: %.2e\n",
+                    constr_rmsd(constr, FALSE));
+        }
+        if (EI_STATE_VELOCITY(ir->eI))
+        {
+            fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]);
+        }
+        if (bRerunMD)
+        {
+            fprintf(stderr, "starting md rerun '%s', reading coordinates from"
+                    " input trajectory '%s'\n\n",
+                    *(top_global->name), opt2fn("-rerun", nfile, fnm));
+            if (bVerbose)
+            {
+                fprintf(stderr, "Calculated time to finish depends on nsteps from "
+                        "run input file,\nwhich may not correspond to the time "
+                        "needed to process input trajectory.\n\n");
+            }
+        }
+        else
+        {
+            char tbuf[20];
+            fprintf(stderr, "starting mdrun '%s'\n",
+                    *(top_global->name));
+            if (ir->nsteps >= 0)
+            {
+                sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
+            }
+            else
+            {
+                sprintf(tbuf, "%s", "infinite");
+            }
+            if (ir->init_step > 0)
+            {
+                fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+                        gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
+                        gmx_step_str(ir->init_step, sbuf2),
+                        ir->init_step*ir->delta_t);
+            }
+            else
+            {
+                fprintf(stderr, "%s steps, %s ps.\n",
+                        gmx_step_str(ir->nsteps, sbuf), tbuf);
+            }
+        }
+        fprintf(fplog, "\n");
+    }
+
+    /* Set and write start time */
+    runtime_start(runtime);
+    print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime);
+    wallcycle_start(wcycle, ewcRUN);
+    if (fplog)
+    {
+        fprintf(fplog, "\n");
+    }
+
+    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
+#ifdef GMX_FAHCORE
+    chkpt_ret = fcCheckPointParallel( cr->nodeid,
+                                      NULL, 0);
+    if (chkpt_ret == 0)
+    {
+        gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
+    }
+#endif
+
+    debug_gmx();
+    /***********************************************************
+     *
+     *             Loop over MD steps
+     *
+     ************************************************************/
+
+    /* if rerunMD then read coordinates and velocities from input trajectory */
+    if (bRerunMD)
+    {
+        if (getenv("GMX_FORCE_UPDATE"))
+        {
+            bForceUpdate = TRUE;
+        }
+
+        rerun_fr.natoms = 0;
+        if (MASTER(cr))
+        {
+            bNotLastFrame = read_first_frame(oenv, &status,
+                                             opt2fn("-rerun", nfile, fnm),
+                                             &rerun_fr, TRX_NEED_X | TRX_READ_V);
+            if (rerun_fr.natoms != top_global->natoms)
+            {
+                gmx_fatal(FARGS,
+                          "Number of atoms in trajectory (%d) does not match the "
+                          "run input file (%d)\n",
+                          rerun_fr.natoms, top_global->natoms);
+            }
+            if (ir->ePBC != epbcNONE)
+            {
+                if (!rerun_fr.bBox)
+                {
+                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time);
+                }
+                if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong))
+                {
+                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time);
+                }
+            }
+        }
+
+        if (PAR(cr))
+        {
+            rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+        }
+
+        if (ir->ePBC != epbcNONE)
+        {
+            /* Set the shift vectors.
+             * Necessary here when have a static box different from the tpr box.
+             */
+            calc_shifts(rerun_fr.box, fr->shift_vec);
+        }
+    }
+
+    /* loop over MD steps or if rerunMD to end of input trajectory */
+    bFirstStep = TRUE;
+    /* Skip the first Nose-Hoover integration when we get the state from tpx */
+    bStateFromTPX    = !bStateFromCP;
+    bInitStep        = bFirstStep && (bStateFromTPX || bVV);
+    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
+    bLastStep        = FALSE;
+    bSumEkinhOld     = FALSE;
+    bExchanged       = FALSE;
+
+    init_global_signals(&gs, cr, ir, repl_ex_nst);
+
+    step     = ir->init_step;
+    step_rel = 0;
+
+    if (ir->nstlist == -1)
+    {
+        init_nlistheuristics(&nlh, bGStatEveryStep, step);
+    }
+
+    if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
+    {
+        /* check how many steps are left in other sims */
+        multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps);
+    }
+
+
+    /* and stop now if we should */
+    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
+                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
+    while (!bLastStep || (bRerunMD && bNotLastFrame))
+    {
+
+        wallcycle_start(wcycle, ewcSTEP);
+
+        GMX_MPE_LOG(ev_timestep1);
+
+        if (bRerunMD)
+        {
+            if (rerun_fr.bStep)
+            {
+                step     = rerun_fr.step;
+                step_rel = step - ir->init_step;
+            }
+            if (rerun_fr.bTime)
+            {
+                t = rerun_fr.time;
+            }
+            else
+            {
+                t = step;
+            }
+        }
+        else
+        {
+            bLastStep = (step_rel == ir->nsteps);
+            t         = t0 + step*ir->delta_t;
+        }
+
+        if (ir->efep != efepNO || ir->bSimTemp)
+        {
+            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
+               requiring different logic. */
+
+            set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
+            bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
+            bDoFEP       = (do_per_step(step, nstfep) && (ir->efep != efepNO));
+            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
+        }
+
+        if (bSimAnn)
+        {
+            update_annealing_target_temp(&(ir->opts), t);
+        }
+
+        if (bRerunMD)
+        {
+            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
+            {
+                for (i = 0; i < state_global->natoms; i++)
+                {
+                    copy_rvec(rerun_fr.x[i], state_global->x[i]);
+                }
+                if (rerun_fr.bV)
+                {
+                    for (i = 0; i < state_global->natoms; i++)
+                    {
+                        copy_rvec(rerun_fr.v[i], state_global->v[i]);
+                    }
+                }
+                else
+                {
+                    for (i = 0; i < state_global->natoms; i++)
+                    {
+                        clear_rvec(state_global->v[i]);
+                    }
+                    if (bRerunWarnNoV)
+                    {
+                        fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n"
+                                "         Ekin, temperature and pressure are incorrect,\n"
+                                "         the virial will be incorrect when constraints are present.\n"
+                                "\n");
+                        bRerunWarnNoV = FALSE;
+                    }
+                }
+            }
+            copy_mat(rerun_fr.box, state_global->box);
+            copy_mat(state_global->box, state->box);
+
+            if (vsite && (Flags & MD_RERUN_VSITE))
+            {
+                if (DOMAINDECOMP(cr))
+                {
+                    gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
+                }
+                if (graph)
+                {
+                    /* Following is necessary because the graph may get out of sync
+                     * with the coordinates if we only have every N'th coordinate set
+                     */
+                    mk_mshift(fplog, graph, fr->ePBC, state->box, state->x);
+                    shift_self(graph, state->box, state->x);
+                }
+                construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+                                 top->idef.iparams, top->idef.il,
+                                 fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+                if (graph)
+                {
+                    unshift_self(graph, state->box, state->x);
+                }
+            }
+        }
+
+        /* Stop Center of Mass motion */
+        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
+
+        /* Copy back starting coordinates in case we're doing a forcefield scan */
+        if (bFFscan)
+        {
+            for (ii = 0; (ii < state->natoms); ii++)
+            {
+                copy_rvec(xcopy[ii], state->x[ii]);
+                copy_rvec(vcopy[ii], state->v[ii]);
+            }
+            copy_mat(boxcopy, state->box);
+        }
+
+        if (bRerunMD)
+        {
+            /* for rerun MD always do Neighbour Searching */
+            bNS      = (bFirstStep || ir->nstlist != 0);
+            bNStList = bNS;
+        }
+        else
+        {
+            /* Determine whether or not to do Neighbour Searching and LR */
+            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+
+            bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
+                   (ir->nstlist == -1 && nlh.nabnsb > 0));
+
+            if (bNS && ir->nstlist == -1)
+            {
+                set_nlistheuristics(&nlh, bFirstStep || bExchanged || bDoFEP, step);
+            }
+        }
+
+        /* check whether we should stop because another simulation has
+           stopped. */
+        if (MULTISIM(cr))
+        {
+            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&
+                 (multisim_nsteps != ir->nsteps) )
+            {
+                if (bNS)
+                {
+                    if (MASTER(cr))
+                    {
+                        fprintf(stderr,
+                                "Stopping simulation %d because another one has finished\n",
+                                cr->ms->sim);
+                    }
+                    bLastStep         = TRUE;
+                    gs.sig[eglsCHKPT] = 1;
+                }
+            }
+        }
+
+        /* < 0 means stop at next step, > 0 means stop at next NS step */
+        if ( (gs.set[eglsSTOPCOND] < 0) ||
+             ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) )
+        {
+            bLastStep = TRUE;
+        }
+
+        /* Determine whether or not to update the Born radii if doing GB */
+        bBornRadii = bFirstStep;
+        if (ir->implicit_solvent && (step % ir->nstgbradii == 0))
+        {
+            bBornRadii = TRUE;
+        }
+
+        do_log     = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep;
+        do_verbose = bVerbose &&
+            (step % stepout == 0 || bFirstStep || bLastStep);
+
+        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
+        {
+            if (bRerunMD)
+            {
+                bMasterState = TRUE;
+            }
+            else
+            {
+                bMasterState = FALSE;
+                /* Correct the new box if it is too skewed */
+                if (DYNAMIC_BOX(*ir))
+                {
+                    if (correct_box(fplog, step, state->box, graph))
+                    {
+                        bMasterState = TRUE;
+                    }
+                }
+                if (DOMAINDECOMP(cr) && bMasterState)
+                {
+                    dd_collect_state(cr->dd, state, state_global);
+                }
+            }
+
+            if (DOMAINDECOMP(cr))
+            {
+                /* Repartition the domain decomposition */
+                wallcycle_start(wcycle, ewcDOMDEC);
+                dd_partition_system(fplog, step, cr,
+                                    bMasterState, nstglobalcomm,
+                                    state_global, top_global, ir,
+                                    state, &f, mdatoms, top, fr,
+                                    vsite, shellfc, constr,
+                                    nrnb, wcycle,
+                                    do_verbose && !bPMETuneRunning);
+                wallcycle_stop(wcycle, ewcDOMDEC);
+                /* If using an iterative integrator, reallocate space to match the decomposition */
+            }
+        }
+
+        if (MASTER(cr) && do_log && !bFFscan)
+        {
+            print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */
+        }
+
+        if (ir->efep != efepNO)
+        {
+            update_mdatoms(mdatoms, state->lambda[efptMASS]);
+        }
+
+        if ((bRerunMD && rerun_fr.bV) || bExchanged)
+        {
+
+            /* We need the kinetic energy at minus the half step for determining
+             * the full step kinetic energy and possibly for T-coupling.*/
+            /* This may not be quite working correctly yet . . . . */
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                            constr, NULL, FALSE, state->box,
+                            top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+        }
+        clear_mat(force_vir);
+
+        /* Ionize the atoms if necessary */
+        if (bIonize)
+        {
+            ionize(fplog, oenv, mdatoms, top_global, t, ir, state->x, state->v,
+                   mdatoms->start, mdatoms->start+mdatoms->homenr, state->box, cr);
+        }
+
+        /* Update force field in ffscan program */
+        if (bFFscan)
+        {
+            if (update_forcefield(fplog,
+                                  nfile, fnm, fr,
+                                  mdatoms->nr, state->x, state->box))
+            {
+                gmx_finalize_par();
+
+                exit(0);
+            }
+        }
+
+        GMX_MPE_LOG(ev_timestep2);
+
+        /* We write a checkpoint at this MD step when:
+         * either at an NS step when we signalled through gs,
+         * or at the last step (but not when we do not want confout),
+         * but never at the first step or with rerun.
+         */
+        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
+                 (bLastStep && (Flags & MD_CONFOUT))) &&
+                step > ir->init_step && !bRerunMD);
+        if (bCPT)
+        {
+            gs.set[eglsCHKPT] = 0;
+        }
+
+        /* Determine the energy and pressure:
+         * at nstcalcenergy steps and at energy output steps (set below).
+         */
+        if (EI_VV(ir->eI) && (!bInitStep))
+        {
+            /* for vv, the first half of the integration actually corresponds
+               to the previous step.  bCalcEner is only required to be evaluated on the 'next' step,
+               but the virial needs to be calculated on both the current step and the 'next' step. Future
+               reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
+
+            bCalcEner = do_per_step(step-1, ir->nstcalcenergy);
+            bCalcVir  = bCalcEner ||
+                (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+        }
+        else
+        {
+            bCalcEner = do_per_step(step, ir->nstcalcenergy);
+            bCalcVir  = bCalcEner ||
+                (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+        }
+
+        /* Do we need global communication ? */
+        bGStat = (bCalcVir || bCalcEner || bStopCM ||
+                  do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) ||
+                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
+
+        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
+
+        if (do_ene || do_log)
+        {
+            bCalcVir  = TRUE;
+            bCalcEner = TRUE;
+            bGStat    = TRUE;
+        }
+
+        /* these CGLO_ options remain the same throughout the iteration */
+        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
+                      (bGStat ? CGLO_GSTAT : 0)
+                      );
+
+        force_flags = (GMX_FORCE_STATECHANGED |
+                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
+                       GMX_FORCE_ALLFORCES |
+                       GMX_FORCE_SEPLRF |
+                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+                       (bDoFEP ? GMX_FORCE_DHDL : 0)
+                       );
+
+        if (fr->bTwinRange)
+        {
+            if (do_per_step(step, ir->nstcalclr))
+            {
+                force_flags |= GMX_FORCE_DO_LR;
+            }
+        }
+
+        if (shellfc)
+        {
+            /* Now is the time to relax the shells */
+            count = relax_shell_flexcon(fplog, cr, bVerbose, bFFscan ? step+1 : step,
+                                        ir, bNS, force_flags,
+                                        bStopCM, top, top_global,
+                                        constr, enerd, fcd,
+                                        state, f, force_vir, mdatoms,
+                                        nrnb, wcycle, graph, groups,
+                                        shellfc, fr, bBornRadii, t, mu_tot,
+                                        state->natoms, &bConverged, vsite,
+                                        outf->fp_field);
+            tcount += count;
+
+            if (bConverged)
+            {
+                nconverged++;
+            }
+        }
+        else
+        {
+            /* The coordinates (x) are shifted (to get whole molecules)
+             * in do_force.
+             * This is parallelized as well, and does communication too.
+             * Check comments in sim_util.c
+             */
+            do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups,
+                     state->box, state->x, &state->hist,
+                     f, force_vir, mdatoms, enerd, fcd,
+                     state->lambda, graph,
+                     fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii,
+                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
+        }
+
+        GMX_BARRIER(cr->mpi_comm_mygroup);
+
+        if (bTCR)
+        {
+            mu_aver = calc_mu_aver(cr, state->x, mdatoms->chargeA,
+                                   mu_tot, &top_global->mols, mdatoms, gnx, grpindex);
+        }
+
+        if (bTCR && bFirstStep)
+        {
+            tcr = init_coupling(fplog, nfile, fnm, cr, fr, mdatoms, &(top->idef));
+            fprintf(fplog, "Done init_coupling\n");
+            fflush(fplog);
+        }
+
+        if (bVV && !bStartingFromCpt && !bRerunMD)
+        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
+        {
+            if (ir->eI == eiVV && bInitStep)
+            {
+                /* if using velocity verlet with full time step Ekin,
+                 * take the first half step only to compute the
+                 * virial for the first step. From there,
+                 * revert back to the initial coordinates
+                 * so that the input is actually the initial step.
+                 */
+                copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */
+            }
+            else
+            {
+                /* this is for NHC in the Ekin(t+dt/2) version of vv */
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
+            }
+
+            /* If we are using twin-range interactions where the long-range component
+             * is only evaluated every nstcalclr>1 steps, we should do a special update
+             * step to combine the long-range forces on these steps.
+             * For nstcalclr=1 this is not done, since the forces would have been added
+             * directly to the short-range forces already.
+             */
+            bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+            update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC,
+                          f, bUpdateDoLR, fr->f_twin, fcd,
+                          ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1,
+                          cr, nrnb, constr, &top->idef);
+
+            if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep)
+            {
+                gmx_iterate_init(&iterate, TRUE);
+            }
+            /* for iterations, we save these vectors, as we will be self-consistently iterating
+               the calculations */
+
+            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
+
+            /* save the state */
+            if (iterate.bIterationActive)
+            {
+                copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+            }
+
+            bFirstIterate = TRUE;
+            while (bFirstIterate || iterate.bIterationActive)
+            {
+                if (iterate.bIterationActive)
+                {
+                    copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+                    if (bFirstIterate && bTrotter)
+                    {
+                        /* The first time through, we need a decent first estimate
+                           of veta(t+dt) to compute the constraints.  Do
+                           this by computing the box volume part of the
+                           trotter integration at this time. Nothing else
+                           should be changed by this routine here.  If
+                           !(first time), we start with the previous value
+                           of veta.  */
+
+                        veta_save = state->veta;
+                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0);
+                        vetanew     = state->veta;
+                        state->veta = veta_save;
+                    }
+                }
+
+                bOK = TRUE;
+                if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
+                {
+                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+                                       state, fr->bMolPBC, graph, f,
+                                       &top->idef, shake_vir, NULL,
+                                       cr, nrnb, wcycle, upd, constr,
+                                       bInitStep, TRUE, bCalcVir, vetanew);
+
+                    if (!bOK && !bFFscan)
+                    {
+                        gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
+                    }
+
+                }
+                else if (graph)
+                {
+                    /* Need to unshift here if a do_force has been
+                       called in the previous step */
+                    unshift_self(graph, state->box, state->x);
+                }
+
+                /* if VV, compute the pressure and constraints */
+                /* For VV2, we strictly only need this if using pressure
+                 * control, but we really would like to have accurate pressures
+                 * printed out.
+                 * Think about ways around this in the future?
+                 * For now, keep this choice in comments.
+                 */
+                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+                /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+                bPres = TRUE;
+                bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+                if (bCalcEner && ir->eI == eiVVAK)  /*MRS:  7/9/2010 -- this still doesn't fix it?*/
+                {
+                    bSumEkinhOld = TRUE;
+                }
+                /* for vv, the first half of the integration actually corresponds to the previous step.
+                   So we need information from the last step in the first half of the integration */
+                if (bGStat || do_per_step(step-1, nstglobalcomm))
+                {
+                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                    constr, NULL, FALSE, state->box,
+                                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                                    cglo_flags
+                                    | CGLO_ENERGY
+                                    | (bTemp ? CGLO_TEMPERATURE : 0)
+                                    | (bPres ? CGLO_PRESSURE : 0)
+                                    | (bPres ? CGLO_CONSTRAINT : 0)
+                                    | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
+                                    | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+                                    | CGLO_SCALEEKIN
+                                    );
+                    /* explanation of above:
+                       a) We compute Ekin at the full time step
+                       if 1) we are using the AveVel Ekin, and it's not the
+                       initial step, or 2) if we are using AveEkin, but need the full
+                       time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+                       b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+                       EkinAveVel because it's needed for the pressure */
+                }
+                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+                if (!bInitStep)
+                {
+                    if (bTrotter)
+                    {
+                        m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
+                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
+                    }
+                    else
+                    {
+                        if (bExchanged)
+                        {
+
+                            /* We need the kinetic energy at minus the half step for determining
+                             * the full step kinetic energy and possibly for T-coupling.*/
+                            /* This may not be quite working correctly yet . . . . */
+                            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                                            constr, NULL, FALSE, state->box,
+                                            top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+                        }
+                    }
+                }
+
+                if (iterate.bIterationActive &&
+                    done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+                                   state->veta, &vetanew))
+                {
+                    break;
+                }
+                bFirstIterate = FALSE;
+            }
+
+            if (bTrotter && !bInitStep)
+            {
+                copy_mat(shake_vir, state->svir_prev);
+                copy_mat(force_vir, state->fvir_prev);
+                if (IR_NVT_TROTTER(ir) && ir->eI == eiVV)
+                {
+                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE, FALSE);
+                    enerd->term[F_EKIN] = trace(ekind->ekin);
+                }
+            }
+            /* if it's the initial step, we performed this first step just to get the constraint virial */
+            if (bInitStep && ir->eI == eiVV)
+            {
+                copy_rvecn(cbuf, state->v, 0, state->natoms);
+            }
+
+            GMX_MPE_LOG(ev_timestep1);
+        }
+
+        /* MRS -- now done iterating -- compute the conserved quantity */
+        if (bVV)
+        {
+            saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
+            if (ir->eI == eiVV)
+            {
+                last_ekin = enerd->term[F_EKIN];
+            }
+            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+            {
+                saved_conserved_quantity -= enerd->term[F_DISPCORR];
+            }
+            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
+            if (!bRerunMD)
+            {
+                sum_dhdl(enerd, state->lambda, ir->fepvals);
+            }
+        }
+
+        /* ########  END FIRST UPDATE STEP  ############## */
+        /* ########  If doing VV, we now have v(dt) ###### */
+        if (bDoExpanded)
+        {
+            /* perform extended ensemble sampling in lambda - we don't
+               actually move to the new state before outputting
+               statistics, but if performing simulated tempering, we
+               do update the velocities and the tau_t. */
+
+            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
+        }
+        /* ################## START TRAJECTORY OUTPUT ################# */
+
+        /* Now we have the energies and forces corresponding to the
+         * coordinates at time t. We must output all of this before
+         * the update.
+         * for RerunMD t is read from input trajectory
+         */
+        GMX_MPE_LOG(ev_output_start);
+
+        mdof_flags = 0;
+        if (do_per_step(step, ir->nstxout))
+        {
+            mdof_flags |= MDOF_X;
+        }
+        if (do_per_step(step, ir->nstvout))
+        {
+            mdof_flags |= MDOF_V;
+        }
+        if (do_per_step(step, ir->nstfout))
+        {
+            mdof_flags |= MDOF_F;
+        }
+        if (do_per_step(step, ir->nstxtcout))
+        {
+            mdof_flags |= MDOF_XTC;
+        }
+        if (bCPT)
+        {
+            mdof_flags |= MDOF_CPT;
+        }
+        ;
+
+#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
+        if (bLastStep)
+        {
+            /* Enforce writing positions and velocities at end of run */
+            mdof_flags |= (MDOF_X | MDOF_V);
+        }
+#endif
+#ifdef GMX_FAHCORE
+        if (MASTER(cr))
+        {
+            fcReportProgress( ir->nsteps, step );
+        }
+
+        /* sync bCPT and fc record-keeping */
+        if (bCPT && MASTER(cr))
+        {
+            fcRequestCheckPoint();
+        }
+#endif
+
+        if (mdof_flags != 0)
+        {
+            wallcycle_start(wcycle, ewcTRAJ);
+            if (bCPT)
+            {
+                if (state->flags & (1<<estLD_RNG))
+                {
+                    get_stochd_state(upd, state);
+                }
+                if (state->flags  & (1<<estMC_RNG))
+                {
+                    get_mc_state(mcrng, state);
+                }
+                if (MASTER(cr))
+                {
+                    if (bSumEkinhOld)
+                    {
+                        state_global->ekinstate.bUpToDate = FALSE;
+                    }
+                    else
+                    {
+                        update_ekinstate(&state_global->ekinstate, ekind);
+                        state_global->ekinstate.bUpToDate = TRUE;
+                    }
+                    update_energyhistory(&state_global->enerhist, mdebin);
+                    if (ir->efep != efepNO || ir->bSimTemp)
+                    {
+                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
+                                                                       structured so this isn't necessary.
+                                                                       Note this reassignment is only necessary
+                                                                       for single threads.*/
+                        copy_df_history(&state_global->dfhist, &df_history);
+                    }
+                }
+            }
+            write_traj(fplog, cr, outf, mdof_flags, top_global,
+                       step, t, state, state_global, f, f_global, &n_xtc, &x_xtc);
+            if (bCPT)
+            {
+                nchkpt++;
+                bCPT = FALSE;
+            }
+            debug_gmx();
+            if (bLastStep && step_rel == ir->nsteps &&
+                (Flags & MD_CONFOUT) && MASTER(cr) &&
+                !bRerunMD && !bFFscan)
+            {
+                /* x and v have been collected in write_traj,
+                 * because a checkpoint file will always be written
+                 * at the last step.
+                 */
+                fprintf(stderr, "\nWriting final coordinates.\n");
+                if (fr->bMolPBC)
+                {
+                    /* Make molecules whole only for confout writing */
+                    do_pbc_mtop(fplog, ir->ePBC, state->box, top_global, state_global->x);
+                }
+                write_sto_conf_mtop(ftp2fn(efSTO, nfile, fnm),
+                                    *top_global->name, top_global,
+                                    state_global->x, state_global->v,
+                                    ir->ePBC, state->box);
+                debug_gmx();
+            }
+            wallcycle_stop(wcycle, ewcTRAJ);
+        }
+        GMX_MPE_LOG(ev_output_finish);
+
+        /* kludge -- virial is lost with restart for NPT control. Must restart */
+        if (bStartingFromCpt && bVV)
+        {
+            copy_mat(state->svir_prev, shake_vir);
+            copy_mat(state->fvir_prev, force_vir);
+        }
+        /*  ################## END TRAJECTORY OUTPUT ################ */
+
+        /* Determine the wallclock run time up till now */
+        run_time = gmx_gettime() - (double)runtime->real;
+
+        /* Check whether everything is still all right */
+        if (((int)gmx_get_stop_condition() > handled_stop_condition)
+#ifdef GMX_THREAD_MPI
+            && MASTER(cr)
+#endif
+            )
+        {
+            /* this just makes gs.sig compatible with the hack
+               of sending signals around by MPI_Reduce together with
+               other floats */
+            if (gmx_get_stop_condition() == gmx_stop_cond_next_ns)
+            {
+                gs.sig[eglsSTOPCOND] = 1;
+            }
+            if (gmx_get_stop_condition() == gmx_stop_cond_next)
+            {
+                gs.sig[eglsSTOPCOND] = -1;
+            }
+            /* < 0 means stop at next step, > 0 means stop at next NS step */
+            if (fplog)
+            {
+                fprintf(fplog,
+                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+                        gmx_get_signal_name(),
+                        gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+                fflush(fplog);
+            }
+            fprintf(stderr,
+                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+                    gmx_get_signal_name(),
+                    gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+            fflush(stderr);
+            handled_stop_condition = (int)gmx_get_stop_condition();
+        }
+        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
+                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
+                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
+        {
+            /* Signal to terminate the run */
+            gs.sig[eglsSTOPCOND] = 1;
+            if (fplog)
+            {
+                fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+            }
+            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+        }
+
+        if (bResetCountersHalfMaxH && MASTER(cr) &&
+            run_time > max_hours*60.0*60.0*0.495)
+        {
+            gs.sig[eglsRESETCOUNTERS] = 1;
+        }
+
+        if (ir->nstlist == -1 && !bRerunMD)
+        {
+            /* When bGStatEveryStep=FALSE, global_stat is only called
+             * when we check the atom displacements, not at NS steps.
+             * This means that also the bonded interaction count check is not
+             * performed immediately after NS. Therefore a few MD steps could
+             * be performed with missing interactions.
+             * But wrong energies are never written to file,
+             * since energies are only written after global_stat
+             * has been called.
+             */
+            if (step >= nlh.step_nscheck)
+            {
+                nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs,
+                                                     nlh.scale_tot, state->x);
+            }
+            else
+            {
+                /* This is not necessarily true,
+                 * but step_nscheck is determined quite conservatively.
+                 */
+                nlh.nabnsb = 0;
+            }
+        }
+
+        /* In parallel we only have to check for checkpointing in steps
+         * where we do global communication,
+         *  otherwise the other nodes don't know.
+         */
+        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
+                           cpt_period >= 0 &&
+                           (cpt_period == 0 ||
+                            run_time >= nchkpt*cpt_period*60.0)) &&
+            gs.set[eglsCHKPT] == 0)
+        {
+            gs.sig[eglsCHKPT] = 1;
+        }
+
+        /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
+        if (EI_VV(ir->eI))
+        {
+            if (!bInitStep)
+            {
+                update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+            }
+            if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+            {
+                gmx_bool bIfRandomize;
+                bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
+                /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+                if (constr && bIfRandomize)
+                {
+                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+                                       state, fr->bMolPBC, graph, f,
+                                       &top->idef, tmp_vir, NULL,
+                                       cr, nrnb, wcycle, upd, constr,
+                                       bInitStep, TRUE, bCalcVir, vetanew);
+                }
+            }
+        }
+
+        if (bIterativeCase && do_per_step(step, ir->nstpcouple))
+        {
+            gmx_iterate_init(&iterate, TRUE);
+            /* for iterations, we save these vectors, as we will be redoing the calculations */
+            copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+        }
+
+        bFirstIterate = TRUE;
+        while (bFirstIterate || iterate.bIterationActive)
+        {
+            /* We now restore these vectors to redo the calculation with improved extended variables */
+            if (iterate.bIterationActive)
+            {
+                copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+            }
+
+            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
+               so scroll down for that logic */
+
+            /* #########   START SECOND UPDATE STEP ################# */
+            GMX_MPE_LOG(ev_update_start);
+            /* Box is changed in update() when we do pressure coupling,
+             * but we should still use the old box for energy corrections and when
+             * writing it to the energy file, so it matches the trajectory files for
+             * the same timestep above. Make a copy in a separate array.
+             */
+            copy_mat(state->box, lastbox);
+
+            bOK = TRUE;
+            dvdl_constr = 0;
+
+            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
+            {
+                wallcycle_start(wcycle, ewcUPDATE);
+                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+                if (bTrotter)
+                {
+                    if (iterate.bIterationActive)
+                    {
+                        if (bFirstIterate)
+                        {
+                            scalevir = 1;
+                        }
+                        else
+                        {
+                            /* we use a new value of scalevir to converge the iterations faster */
+                            scalevir = tracevir/trace(shake_vir);
+                        }
+                        msmul(shake_vir, scalevir, shake_vir);
+                        m_add(force_vir, shake_vir, total_vir);
+                        clear_mat(shake_vir);
+                    }
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+                    /* We can only do Berendsen coupling after we have summed
+                     * the kinetic energy or virial. Since this happens
+                     * in global_state after update, we should only do it at
+                     * step % nstlist = 1 with bGStatEveryStep=FALSE.
+                     */
+                }
+                else
+                {
+                    update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+                    update_pcouple(fplog, step, ir, state, pcoupl_mu, M, wcycle,
+                                   upd, bInitStep);
+                }
+
+                if (bVV)
+                {
+                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+                    /* velocity half-step update */
+                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                                  bUpdateDoLR, fr->f_twin, fcd,
+                                  ekind, M, wcycle, upd, FALSE, etrtVELOCITY2,
+                                  cr, nrnb, constr, &top->idef);
+                }
+
+                /* Above, initialize just copies ekinh into ekin,
+                 * it doesn't copy position (for VV),
+                 * and entire integrator for MD.
+                 */
+
+                if (ir->eI == eiVVAK)
+                {
+                    copy_rvecn(state->x, cbuf, 0, state->natoms);
+                }
+                bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+                update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                              bUpdateDoLR, fr->f_twin, fcd,
+                              ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+                wallcycle_stop(wcycle, ewcUPDATE);
+
+                update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state,
+                                   fr->bMolPBC, graph, f,
+                                   &top->idef, shake_vir, force_vir,
+                                   cr, nrnb, wcycle, upd, constr,
+                                   bInitStep, FALSE, bCalcVir, state->veta);
+
+                if (ir->eI == eiVVAK)
+                {
+                    /* erase F_EKIN and F_TEMP here? */
+                    /* just compute the kinetic energy at the half step to perform a trotter step */
+                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                    constr, NULL, FALSE, lastbox,
+                                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                                    cglo_flags | CGLO_TEMPERATURE
+                                    );
+                    wallcycle_start(wcycle, ewcUPDATE);
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+                    /* now we know the scaling, we can compute the positions again */
+                    copy_rvecn(cbuf, state->x, 0, state->natoms);
+
+                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                                  bUpdateDoLR, fr->f_twin, fcd,
+                                  ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+                    wallcycle_stop(wcycle, ewcUPDATE);
+
+                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+                    /* are the small terms in the shake_vir here due
+                     * to numerical errors, or are they important
+                     * physically? I'm thinking they are just errors, but not completely sure.
+                     * For now, will call without actually constraining, constr=NULL*/
+                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+                                       state, fr->bMolPBC, graph, f,
+                                       &top->idef, tmp_vir, force_vir,
+                                       cr, nrnb, wcycle, upd, NULL,
+                                       bInitStep, FALSE, bCalcVir,
+                                       state->veta);
+                }
+                if (!bOK && !bFFscan)
+                {
+                    gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
+                }
+
+                if (fr->bSepDVDL && fplog && do_log)
+                {
+                    fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl_constr);
+                }
+                if (bVV)
+                {
+                    /* this factor of 2 correction is necessary
+                       because half of the constraint force is removed
+                       in the vv step, so we have to double it.  See
+                       the Redmine issue #1255.  It is not yet clear
+                       if the factor of 2 is exact, or just a very
+                       good approximation, and this will be
+                       investigated.  The next step is to see if this
+                       can be done adding a dhdl contribution from the
+                       rattle step, but this is somewhat more
+                       complicated with the current code. Will be
+                       investigated, hopefully for 4.6.3. However,
+                       this current solution is much better than
+                       having it completely wrong.
+                    */
+                    enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+                }
+                else
+                {
+                    enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+                }
+            }
+            else if (graph)
+            {
+                /* Need to unshift here */
+                unshift_self(graph, state->box, state->x);
+            }
+
+            GMX_BARRIER(cr->mpi_comm_mygroup);
+            GMX_MPE_LOG(ev_update_finish);
+
+            if (vsite != NULL)
+            {
+                wallcycle_start(wcycle, ewcVSITECONSTR);
+                if (graph != NULL)
+                {
+                    shift_self(graph, state->box, state->x);
+                }
+                construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+                                 top->idef.iparams, top->idef.il,
+                                 fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+
+                if (graph != NULL)
+                {
+                    unshift_self(graph, state->box, state->x);
+                }
+                wallcycle_stop(wcycle, ewcVSITECONSTR);
+            }
+
+            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints  ############ */
+            /* With Leap-Frog we can skip compute_globals at
+             * non-communication steps, but we need to calculate
+             * the kinetic energy one step before communication.
+             */
+            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+            {
+                if (ir->nstlist == -1 && bFirstIterate)
+                {
+                    gs.sig[eglsNABNSB] = nlh.nabnsb;
+                }
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr,
+                                bFirstIterate ? &gs : NULL,
+                                (step_rel % gs.nstms == 0) &&
+                                (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+                                lastbox,
+                                top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+                                cglo_flags
+                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+                                | (iterate.bIterationActive ? CGLO_ITERATE : 0)
+                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+                                | CGLO_CONSTRAINT
+                                );
+                if (ir->nstlist == -1 && bFirstIterate)
+                {
+                    nlh.nabnsb         = gs.set[eglsNABNSB];
+                    gs.set[eglsNABNSB] = 0;
+                }
+            }
+            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
+            /* #############  END CALC EKIN AND PRESSURE ################# */
+
+            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+               the virial that should probably be addressed eventually. state->veta has better properties,
+               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+
+            if (iterate.bIterationActive &&
+                done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+                               trace(shake_vir), &tracevir))
+            {
+                break;
+            }
+            bFirstIterate = FALSE;
+        }
+
+        if (!bVV || bRerunMD)
+        {
+            /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
+            sum_dhdl(enerd, state->lambda, ir->fepvals);
+        }
+        update_box(fplog, step, ir, mdatoms, state, graph, f,
+                   ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE);
+
+        /* ################# END UPDATE STEP 2 ################# */
+        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
+
+        /* The coordinates (x) were unshifted in update */
+        if (bFFscan && (shellfc == NULL || bConverged))
+        {
+            if (print_forcefield(fplog, enerd->term, mdatoms->homenr,
+                                 f, NULL, xcopy,
+                                 &(top_global->mols), mdatoms->massT, pres))
+            {
+                gmx_finalize_par();
+
+                fprintf(stderr, "\n");
+                exit(0);
+            }
+        }
+        if (!bGStat)
+        {
+            /* We will not sum ekinh_old,
+             * so signal that we still have to do it.
+             */
+            bSumEkinhOld = TRUE;
+        }
+
+        if (bTCR)
+        {
+            /* Only do GCT when the relaxation of shells (minimization) has converged,
+             * otherwise we might be coupling to bogus energies.
+             * In parallel we must always do this, because the other sims might
+             * update the FF.
+             */
+
+            /* Since this is called with the new coordinates state->x, I assume
+             * we want the new box state->box too. / EL 20040121
+             */
+            do_coupling(fplog, oenv, nfile, fnm, tcr, t, step, enerd->term, fr,
+                        ir, MASTER(cr),
+                        mdatoms, &(top->idef), mu_aver,
+                        top_global->mols.nr, cr,
+                        state->box, total_vir, pres,
+                        mu_tot, state->x, f, bConverged);
+            debug_gmx();
+        }
+
+        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
+
+        /* use the directly determined last velocity, not actually the averaged half steps */
+        if (bTrotter && ir->eI == eiVV)
+        {
+            enerd->term[F_EKIN] = last_ekin;
+        }
+        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+        if (bVV)
+        {
+            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+        }
+        else
+        {
+            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ);
+        }
+        /* Check for excessively large energies */
+        if (bIonize)
+        {
+#ifdef GMX_DOUBLE
+            real etot_max = 1e200;
+#else
+            real etot_max = 1e30;
+#endif
+            if (fabs(enerd->term[F_ETOT]) > etot_max)
+            {
+                fprintf(stderr, "Energy too large (%g), giving up\n",
+                        enerd->term[F_ETOT]);
+            }
+        }
+        /* #########  END PREPARING EDR OUTPUT  ###########  */
+
+        /* Time for performance */
+        if (((step % stepout) == 0) || bLastStep)
+        {
+            runtime_upd_proc(runtime);
+        }
+
+        /* Output stuff */
+        if (MASTER(cr))
+        {
+            gmx_bool do_dr, do_or;
+
+            if (fplog && do_log && bDoExpanded)
+            {
+                /* only needed if doing expanded ensemble */
+                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
+                                          &df_history, state->fep_state, ir->nstlog, step);
+            }
+            if (!(bStartingFromCpt && (EI_VV(ir->eI))))
+            {
+                if (bCalcEner)
+                {
+                    upd_mdebin(mdebin, bDoDHDL, TRUE,
+                               t, mdatoms->tmass, enerd, state,
+                               ir->fepvals, ir->expandedvals, lastbox,
+                               shake_vir, force_vir, total_vir, pres,
+                               ekind, mu_tot, constr);
+                }
+                else
+                {
+                    upd_mdebin_step(mdebin);
+                }
+
+                do_dr  = do_per_step(step, ir->nstdisreout);
+                do_or  = do_per_step(step, ir->nstorireout);
+
+                print_ebin(outf->fp_ene, do_ene, do_dr, do_or, do_log ? fplog : NULL,
+                           step, t,
+                           eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts));
+            }
+            if (ir->ePull != epullNO)
+            {
+                pull_print_output(ir->pull, step, t);
+            }
+
+            if (do_per_step(step, ir->nstlog))
+            {
+                if (fflush(fplog) != 0)
+                {
+                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+                }
+            }
+        }
+        if (bDoExpanded)
+        {
+            /* Have to do this part after outputting the logfile and the edr file */
+            state->fep_state = lamnew;
+            for (i = 0; i < efptNR; i++)
+            {
+                state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+            }
+        }
+        /* Remaining runtime */
+        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+        {
+            if (shellfc)
+            {
+                fprintf(stderr, "\n");
+            }
+            print_time(stderr, runtime, step, ir, cr);
+        }
+
+        /* Replica exchange */
+        bExchanged = FALSE;
+        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+            do_per_step(step, repl_ex_nst))
+        {
+            bExchanged = replica_exchange(fplog, cr, repl_ex,
+                                          state_global, enerd,
+                                          state, step, t);
+
+            if (bExchanged && DOMAINDECOMP(cr))
+            {
+                dd_partition_system(fplog, step, cr, TRUE, 1,
+                                    state_global, top_global, ir,
+                                    state, &f, mdatoms, top, fr,
+                                    vsite, shellfc, constr,
+                                    nrnb, wcycle, FALSE);
+            }
+        }
+
+        bFirstStep       = FALSE;
+        bInitStep        = FALSE;
+        bStartingFromCpt = FALSE;
+
+        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+        /* With all integrators, except VV, we need to retain the pressure
+         * at the current step for coupling at the next step.
+         */
+        if ((state->flags & (1<<estPRES_PREV)) &&
+            (bGStatEveryStep ||
+             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+        {
+            /* Store the pressure in t_state for pressure coupling
+             * at the next MD step.
+             */
+            copy_mat(pres, state->pres_prev);
+        }
+
+        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+
+        if ( (membed != NULL) && (!bLastStep) )
+        {
+            rescale_membed(step_rel, membed, state_global->x);
+        }
+
+        if (bRerunMD)
+        {
+            if (MASTER(cr))
+            {
+                /* read next frame from input trajectory */
+                bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
+            }
+
+            if (PAR(cr))
+            {
+                rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+            }
+        }
+
+        if (!bRerunMD || !rerun_fr.bStep)
+        {
+            /* increase the MD step number */
+            step++;
+            step_rel++;
+        }
+
+        cycles = wallcycle_stop(wcycle, ewcSTEP);
+        if (DOMAINDECOMP(cr) && wcycle)
+        {
+            dd_cycles_add(cr->dd, cycles, ddCyclStep);
+        }
+
+        if (bPMETuneRunning || bPMETuneTry)
+        {
+            /* PME grid + cut-off optimization with GPUs or PME nodes */
+
+            /* Count the total cycles over the last steps */
+            cycles_pmes += cycles;
+
+            /* We can only switch cut-off at NS steps */
+            if (step % ir->nstlist == 0)
+            {
+                /* PME grid + cut-off optimization with GPUs or PME nodes */
+                if (bPMETuneTry)
+                {
+                    if (DDMASTER(cr->dd))
+                    {
+                        /* PME node load is too high, start tuning */
+                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
+                    }
+                    dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
+
+                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
+                    {
+                        bPMETuneTry     = FALSE;
+                    }
+                }
+                if (bPMETuneRunning)
+                {
+                    /* init_step might not be a multiple of nstlist,
+                     * but the first cycle is always skipped anyhow.
+                     */
+                    bPMETuneRunning =
+                        pme_load_balance(pme_loadbal, cr,
+                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
+                                         fplog,
+                                         ir, state, cycles_pmes,
+                                         fr->ic, fr->nbv, &fr->pmedata,
+                                         step);
+
+                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
+                    fr->ewaldcoeff = fr->ic->ewaldcoeff;
+                    fr->rlist      = fr->ic->rlist;
+                    fr->rlistlong  = fr->ic->rlistlong;
+                    fr->rcoulomb   = fr->ic->rcoulomb;
+                    fr->rvdw       = fr->ic->rvdw;
+                }
+                cycles_pmes = 0;
+            }
+        }
+
+        if (step_rel == wcycle_get_reset_counters(wcycle) ||
+            gs.set[eglsRESETCOUNTERS] != 0)
+        {
+            /* Reset all the counters related to performance over the run */
+            reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, runtime,
+                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+            wcycle_set_reset_counters(wcycle, -1);
+            if (!(cr->duty & DUTY_PME))
+            {
+                /* Tell our PME node to reset its counters */
+                gmx_pme_send_resetcounters(cr, step);
+            }
+            /* Correct max_hours for the elapsed time */
+            max_hours                -= run_time/(60.0*60.0);
+            bResetCountersHalfMaxH    = FALSE;
+            gs.set[eglsRESETCOUNTERS] = 0;
+        }
+
+    }
+    /* End of main MD loop */
+    debug_gmx();
+
+    /* Stop the time */
+    runtime_end(runtime);
+
+    if (bRerunMD && MASTER(cr))
+    {
+        close_trj(status);
+    }
+
+    if (!(cr->duty & DUTY_PME))
+    {
+        /* Tell the PME only node to finish */
+        gmx_pme_send_finish(cr);
+    }
+
+    if (MASTER(cr))
+    {
+        if (ir->nstcalcenergy > 0 && !bRerunMD)
+        {
+            print_ebin(outf->fp_ene, FALSE, FALSE, FALSE, fplog, step, t,
+                       eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts));
+        }
+    }
+
+    done_mdoutf(outf);
+
+    debug_gmx();
+
+    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
+    {
+        fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
+        fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns);
+    }
+
+    if (pme_loadbal != NULL)
+    {
+        pme_loadbal_done(pme_loadbal, cr, fplog,
+                         fr->nbv != NULL && fr->nbv->bUseGPU);
+    }
+
+    if (shellfc && fplog)
+    {
+        fprintf(fplog, "Fraction of iterations that converged:           %.2f %%\n",
+                (nconverged*100.0)/step_rel);
+        fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n",
+                tcount/step_rel);
+    }
+
+    if (repl_ex_nst > 0 && MASTER(cr))
+    {
+        print_replica_exchange_statistics(fplog, repl_ex);
+    }
+
+    runtime->nsteps_done = step_rel;
+
+    return 0;
+}
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/mdrun.c b/patches/gromacs-4.6.3.diff/src/kernel/mdrun.c
new file mode 100644
index 0000000000000000000000000000000000000000..ba301007d08d9dcc0eeef4d656fb31b686e40b46
--- /dev/null
+++ b/patches/gromacs-4.6.3.diff/src/kernel/mdrun.c
@@ -0,0 +1,798 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "typedefs.h"
+#include "macros.h"
+#include "copyrite.h"
+#include "main.h"
+#include "statutil.h"
+#include "smalloc.h"
+#include "futil.h"
+#include "smalloc.h"
+#include "edsam.h"
+#include "mdrun.h"
+#include "xmdrun.h"
+#include "checkpoint.h"
+#ifdef GMX_THREAD_MPI
+#include "thread_mpi.h"
+#endif
+
+/* afm stuff */
+#include "pull.h"
+
+/* PLUMED */
+#include "../../Plumed.h"
+int    plumedswitch;
+plumed plumedmain;
+/* END PLUMED */
+
+int cmain(int argc, char *argv[])
+{
+    const char   *desc[] = {
+        "The [TT]mdrun[tt] program is the main computational chemistry engine",
+        "within GROMACS. Obviously, it performs Molecular Dynamics simulations,",
+        "but it can also perform Stochastic Dynamics, Energy Minimization,",
+        "test particle insertion or (re)calculation of energies.",
+        "Normal mode analysis is another option. In this case [TT]mdrun[tt]",
+        "builds a Hessian matrix from single conformation.",
+        "For usual Normal Modes-like calculations, make sure that",
+        "the structure provided is properly energy-minimized.",
+        "The generated matrix can be diagonalized by [TT]g_nmeig[tt].[PAR]",
+        "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])",
+        "and distributes the topology over nodes if needed.",
+        "[TT]mdrun[tt] produces at least four output files.",
+        "A single log file ([TT]-g[tt]) is written, unless the option",
+        "[TT]-seppot[tt] is used, in which case each node writes a log file.",
+        "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and",
+        "optionally forces.",
+        "The structure file ([TT]-c[tt]) contains the coordinates and",
+        "velocities of the last step.",
+        "The energy file ([TT]-e[tt]) contains energies, the temperature,",
+        "pressure, etc, a lot of these things are also printed in the log file.",
+        "Optionally coordinates can be written to a compressed trajectory file",
+        "([TT]-x[tt]).[PAR]",
+        "The option [TT]-dhdl[tt] is only used when free energy calculation is",
+        "turned on.[PAR]",
+        "A simulation can be run in parallel using two different parallelization",
+        "schemes: MPI parallelization and/or OpenMP thread parallelization.",
+        "The MPI parallelization uses multiple processes when [TT]mdrun[tt] is",
+        "compiled with a normal MPI library or threads when [TT]mdrun[tt] is",
+        "compiled with the GROMACS built-in thread-MPI library. OpenMP threads",
+        "are supported when mdrun is compiled with OpenMP. Full OpenMP support",
+        "is only available with the Verlet cut-off scheme, with the (older)",
+        "group scheme only PME-only processes can use OpenMP parallelization.",
+        "In all cases [TT]mdrun[tt] will by default try to use all the available",
+        "hardware resources. With a normal MPI library only the options",
+        "[TT]-ntomp[tt] (with the Verlet cut-off scheme) and [TT]-ntomp_pme[tt],",
+        "for PME-only processes, can be used to control the number of threads.",
+        "With thread-MPI there are additional options [TT]-nt[tt], which sets",
+        "the total number of threads, and [TT]-ntmpi[tt], which sets the number",
+        "of thread-MPI threads.",
+        "Note that using combined MPI+OpenMP parallelization is almost always",
+        "slower than single parallelization, except at the scaling limit, where",
+        "especially OpenMP parallelization of PME reduces the communication cost.",
+        "OpenMP-only parallelization is much faster than MPI-only parallelization",
+        "on a single CPU(-die). Since we currently don't have proper hardware",
+        "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only",
+        "automatically use OpenMP-only parallelization when you use up to 4",
+        "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16",
+        "threads with Intel Sandy Bridge or newer CPUs. Otherwise MPI-only",
+        "parallelization is used (except with GPUs, see below).",
+        "[PAR]",
+        "To quickly test the performance of the new Verlet cut-off scheme",
+        "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use",
+        "the [TT]-testverlet[tt] option. This should not be used for production,",
+        "since it can slightly modify potentials and it will remove charge groups",
+        "making analysis difficult, as the [TT].tpr[tt] file will still contain",
+        "charge groups. For production simulations it is highly recommended",
+        "to specify [TT]cutoff-scheme = Verlet[tt] in the [TT].mdp[tt] file.",
+        "[PAR]",
+        "With GPUs (only supported with the Verlet cut-off scheme), the number",
+        "of GPUs should match the number of MPI processes or MPI threads,",
+        "excluding PME-only processes/threads. With thread-MPI the number",
+        "of MPI threads will automatically be set to the number of GPUs detected.",
+        "When you want to use a subset of the available GPUs, you can use",
+        "the [TT]-gpu_id[tt] option, where GPU id's are passed as a string,",
+        "e.g. 02 for using GPUs 0 and 2. When you want different GPU id's",
+        "on different nodes of a compute cluster, use the GMX_GPU_ID environment",
+        "variable instead. The format for GMX_GPU_ID is identical to ",
+        "[TT]-gpu_id[tt], but an environment variable can have different values",
+        "on different nodes of a cluster.",
+        "[PAR]",
+        "When using PME with separate PME nodes or with a GPU, the two major",
+        "compute tasks, the non-bonded force calculation and the PME calculation",
+        "run on different compute resources. If this load is not balanced,",
+        "some of the resources will be idle part of time. With the Verlet",
+        "cut-off scheme this load is automatically balanced when the PME load",
+        "is too high (but not when it is too low). This is done by scaling",
+        "the Coulomb cut-off and PME grid spacing by the same amount. In the first",
+        "few hundred steps different settings are tried and the fastest is chosen",
+        "for the rest of the simulation. This does not affect the accuracy of",
+        "the results, but it does affect the decomposition of the Coulomb energy",
+        "into particle and mesh contributions. The auto-tuning can be turned off",
+        "with the option [TT]-notunepme[tt].",
+        "[PAR]",
+        "[TT]mdrun[tt] pins (sets affinity of) threads to specific cores,",
+        "when all (logical) cores on a compute node are used by [TT]mdrun[tt],",
+        "even when no multi-threading is used,",
+        "as this usually results in significantly better performance.",
+        "If the queuing systems or the OpenMP library pinned threads, we honor",
+        "this and don't pin again, even though the layout may be sub-optimal.",
+        "If you want to have [TT]mdrun[tt] override an already set thread affinity",
+        "or pin threads when using less cores, use [TT]-pin on[tt].",
+        "With SMT (simultaneous multithreading), e.g. Intel Hyper-Threading,",
+        "there are multiple logical cores per physical core.",
+        "The option [TT]-pinstride[tt] sets the stride in logical cores for",
+        "pinning consecutive threads. Without SMT, 1 is usually the best choice.",
+        "With Intel Hyper-Threading 2 is best when using half or less of the",
+        "logical cores, 1 otherwise. The default value of 0 do exactly that:",
+        "it minimizes the threads per logical core, to optimize performance.",
+        "If you want to run multiple mdrun jobs on the same physical node,"
+        "you should set [TT]-pinstride[tt] to 1 when using all logical cores.",
+        "When running multiple mdrun (or other) simulations on the same physical",
+        "node, some simulations need to start pinning from a non-zero core",
+        "to avoid overloading cores; with [TT]-pinoffset[tt] you can specify",
+        "the offset in logical cores for pinning.",
+        "[PAR]",
+        "When [TT]mdrun[tt] is started using MPI with more than 1 process",
+        "or with thread-MPI with more than 1 thread, MPI parallelization is used.",
+        "By default domain decomposition is used, unless the [TT]-pd[tt]",
+        "option is set, which selects particle decomposition.",
+        "[PAR]",
+        "With domain decomposition, the spatial decomposition can be set",
+        "with option [TT]-dd[tt]. By default [TT]mdrun[tt] selects a good decomposition.",
+        "The user only needs to change this when the system is very inhomogeneous.",
+        "Dynamic load balancing is set with the option [TT]-dlb[tt],",
+        "which can give a significant performance improvement,",
+        "especially for inhomogeneous systems. The only disadvantage of",
+        "dynamic load balancing is that runs are no longer binary reproducible,",
+        "but in most cases this is not important.",
+        "By default the dynamic load balancing is automatically turned on",
+        "when the measured performance loss due to load imbalance is 5% or more.",
+        "At low parallelization these are the only important options",
+        "for domain decomposition.",
+        "At high parallelization the options in the next two sections",
+        "could be important for increasing the performace.",
+        "[PAR]",
+        "When PME is used with domain decomposition, separate nodes can",
+        "be assigned to do only the PME mesh calculation;",
+        "this is computationally more efficient starting at about 12 nodes.",
+        "The number of PME nodes is set with option [TT]-npme[tt],",
+        "this can not be more than half of the nodes.",
+        "By default [TT]mdrun[tt] makes a guess for the number of PME",
+        "nodes when the number of nodes is larger than 11 or performance wise",
+        "not compatible with the PME grid x dimension.",
+        "But the user should optimize npme. Performance statistics on this issue",
+        "are written at the end of the log file.",
+        "For good load balancing at high parallelization, the PME grid x and y",
+        "dimensions should be divisible by the number of PME nodes",
+        "(the simulation will run correctly also when this is not the case).",
+        "[PAR]",
+        "This section lists all options that affect the domain decomposition.",
+        "[PAR]",
+        "Option [TT]-rdd[tt] can be used to set the required maximum distance",
+        "for inter charge-group bonded interactions.",
+        "Communication for two-body bonded interactions below the non-bonded",
+        "cut-off distance always comes for free with the non-bonded communication.",
+        "Atoms beyond the non-bonded cut-off are only communicated when they have",
+        "missing bonded interactions; this means that the extra cost is minor",
+        "and nearly indepedent of the value of [TT]-rdd[tt].",
+        "With dynamic load balancing option [TT]-rdd[tt] also sets",
+        "the lower limit for the domain decomposition cell sizes.",
+        "By default [TT]-rdd[tt] is determined by [TT]mdrun[tt] based on",
+        "the initial coordinates. The chosen value will be a balance",
+        "between interaction range and communication cost.",
+        "[PAR]",
+        "When inter charge-group bonded interactions are beyond",
+        "the bonded cut-off distance, [TT]mdrun[tt] terminates with an error message.",
+        "For pair interactions and tabulated bonds",
+        "that do not generate exclusions, this check can be turned off",
+        "with the option [TT]-noddcheck[tt].",
+        "[PAR]",
+        "When constraints are present, option [TT]-rcon[tt] influences",
+        "the cell size limit as well.",
+        "Atoms connected by NC constraints, where NC is the LINCS order plus 1,",
+        "should not be beyond the smallest cell size. A error message is",
+        "generated when this happens and the user should change the decomposition",
+        "or decrease the LINCS order and increase the number of LINCS iterations.",
+        "By default [TT]mdrun[tt] estimates the minimum cell size required for P-LINCS",
+        "in a conservative fashion. For high parallelization it can be useful",
+        "to set the distance required for P-LINCS with the option [TT]-rcon[tt].",
+        "[PAR]",
+        "The [TT]-dds[tt] option sets the minimum allowed x, y and/or z scaling",
+        "of the cells with dynamic load balancing. [TT]mdrun[tt] will ensure that",
+        "the cells can scale down by at least this factor. This option is used",
+        "for the automated spatial decomposition (when not using [TT]-dd[tt])",
+        "as well as for determining the number of grid pulses, which in turn",
+        "sets the minimum allowed cell size. Under certain circumstances",
+        "the value of [TT]-dds[tt] might need to be adjusted to account for",
+        "high or low spatial inhomogeneity of the system.",
+        "[PAR]",
+        "The option [TT]-gcom[tt] can be used to only do global communication",
+        "every n steps.",
+        "This can improve performance for highly parallel simulations",
+        "where this global communication step becomes the bottleneck.",
+        "For a global thermostat and/or barostat the temperature",
+        "and/or pressure will also only be updated every [TT]-gcom[tt] steps.",
+        "By default it is set to the minimum of nstcalcenergy and nstlist.[PAR]",
+        "With [TT]-rerun[tt] an input trajectory can be given for which ",
+        "forces and energies will be (re)calculated. Neighbor searching will be",
+        "performed for every frame, unless [TT]nstlist[tt] is zero",
+        "(see the [TT].mdp[tt] file).[PAR]",
+        "ED (essential dynamics) sampling and/or additional flooding potentials",
+        "are switched on by using the [TT]-ei[tt] flag followed by an [TT].edi[tt]",
+        "file. The [TT].edi[tt] file can be produced with the [TT]make_edi[tt] tool",
+        "or by using options in the essdyn menu of the WHAT IF program.",
+        "[TT]mdrun[tt] produces a [TT].xvg[tt] output file that",
+        "contains projections of positions, velocities and forces onto selected",
+        "eigenvectors.[PAR]",
+        "When user-defined potential functions have been selected in the",
+        "[TT].mdp[tt] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]",
+        "a formatted table with potential functions. The file is read from",
+        "either the current directory or from the [TT]GMXLIB[tt] directory.",
+        "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,",
+        "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with",
+        "normal Coulomb.",
+        "When pair interactions are present, a separate table for pair interaction",
+        "functions is read using the [TT]-tablep[tt] option.[PAR]",
+        "When tabulated bonded functions are present in the topology,",
+        "interaction functions are read using the [TT]-tableb[tt] option.",
+        "For each different tabulated interaction type the table file name is",
+        "modified in a different way: before the file extension an underscore is",
+        "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals",
+        "and finally the table number of the interaction type.[PAR]",
+        "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM",
+        "coordinates and forces when pulling is selected",
+        "in the [TT].mdp[tt] file.[PAR]",
+        "With [TT]-multi[tt] or [TT]-multidir[tt], multiple systems can be ",
+        "simulated in parallel.",
+        "As many input files/directories are required as the number of systems. ",
+        "The [TT]-multidir[tt] option takes a list of directories (one for each ",
+        "system) and runs in each of them, using the input/output file names, ",
+        "such as specified by e.g. the [TT]-s[tt] option, relative to these ",
+        "directories.",
+        "With [TT]-multi[tt], the system number is appended to the run input ",
+        "and each output filename, for instance [TT]topol.tpr[tt] becomes",
+        "[TT]topol0.tpr[tt], [TT]topol1.tpr[tt] etc.",
+        "The number of nodes per system is the total number of nodes",
+        "divided by the number of systems.",
+        "One use of this option is for NMR refinement: when distance",
+        "or orientation restraints are present these can be ensemble averaged",
+        "over all the systems.[PAR]",
+        "With [TT]-replex[tt] replica exchange is attempted every given number",
+        "of steps. The number of replicas is set with the [TT]-multi[tt] or ",
+        "[TT]-multidir[tt] option, described above.",
+        "All run input files should use a different coupling temperature,",
+        "the order of the files is not important. The random seed is set with",
+        "[TT]-reseed[tt]. The velocities are scaled and neighbor searching",
+        "is performed after every exchange.[PAR]",
+        "Finally some experimental algorithms can be tested when the",
+        "appropriate options have been given. Currently under",
+        "investigation are: polarizability and X-ray bombardments.",
+        "[PAR]",
+        "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed",
+        "a protein into a membrane. The data file should contain the options",
+        "that where passed to g_membed before. The [TT]-mn[tt] and [TT]-mp[tt]",
+        "both apply to this as well.",
+        "[PAR]",
+        "The option [TT]-pforce[tt] is useful when you suspect a simulation",
+        "crashes due to too large forces. With this option coordinates and",
+        "forces of atoms with a force larger than a certain value will",
+        "be printed to stderr.",
+        "[PAR]",
+        "Checkpoints containing the complete state of the system are written",
+        "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],",
+        "unless option [TT]-cpt[tt] is set to -1.",
+        "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to",
+        "make sure that a recent state of the system is always available,",
+        "even when the simulation is terminated while writing a checkpoint.",
+        "With [TT]-cpnum[tt] all checkpoint files are kept and appended",
+        "with the step number.",
+        "A simulation can be continued by reading the full state from file",
+        "with option [TT]-cpi[tt]. This option is intelligent in the way that",
+        "if no checkpoint file is found, Gromacs just assumes a normal run and",
+        "starts from the first step of the [TT].tpr[tt] file. By default the output",
+        "will be appending to the existing output files. The checkpoint file",
+        "contains checksums of all output files, such that you will never",
+        "loose data when some output files are modified, corrupt or removed.",
+        "There are three scenarios with [TT]-cpi[tt]:[PAR]",
+        "[TT]*[tt] no files with matching names are present: new output files are written[PAR]",
+        "[TT]*[tt] all files are present with names and checksums matching those stored",
+        "in the checkpoint file: files are appended[PAR]",
+        "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]",
+        "With [TT]-noappend[tt] new output files are opened and the simulation",
+        "part number is added to all output file names.",
+        "Note that in all cases the checkpoint file itself is not renamed",
+        "and will be overwritten, unless its name does not match",
+        "the [TT]-cpo[tt] option.",
+        "[PAR]",
+        "With checkpointing the output is appended to previously written",
+        "output files, unless [TT]-noappend[tt] is used or none of the previous",
+        "output files are present (except for the checkpoint file).",
+        "The integrity of the files to be appended is verified using checksums",
+        "which are stored in the checkpoint file. This ensures that output can",
+        "not be mixed up or corrupted due to file appending. When only some",
+        "of the previous output files are present, a fatal error is generated",
+        "and no old output files are modified and no new output files are opened.",
+        "The result with appending will be the same as from a single run.",
+        "The contents will be binary identical, unless you use a different number",
+        "of nodes or dynamic load balancing or the FFT library uses optimizations",
+        "through timing.",
+        "[PAR]",
+        "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint",
+        "file is written at the first neighbor search step where the run time",
+        "exceeds [TT]-maxh[tt]*0.99 hours.",
+        "[PAR]",
+        "When [TT]mdrun[tt] receives a TERM signal, it will set nsteps to the current",
+        "step plus one. When [TT]mdrun[tt] receives an INT signal (e.g. when ctrl+C is",
+        "pressed), it will stop after the next neighbor search step ",
+        "(with nstlist=0 at the next step).",
+        "In both cases all the usual output will be written to file.",
+        "When running with MPI, a signal to one of the [TT]mdrun[tt] processes",
+        "is sufficient, this signal should not be sent to mpirun or",
+        "the [TT]mdrun[tt] process that is the parent of the others.",
+        "[PAR]",
+        "When [TT]mdrun[tt] is started with MPI, it does not run niced by default."
+    };
+    t_commrec    *cr;
+    t_filenm      fnm[] = {
+        { efTPX, NULL,      NULL,       ffREAD },
+        { efTRN, "-o",      NULL,       ffWRITE },
+        { efXTC, "-x",      NULL,       ffOPTWR },
+        { efCPT, "-cpi",    NULL,       ffOPTRD },
+        { efCPT, "-cpo",    NULL,       ffOPTWR },
+        { efSTO, "-c",      "confout",  ffWRITE },
+        { efEDR, "-e",      "ener",     ffWRITE },
+        { efLOG, "-g",      "md",       ffWRITE },
+        { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
+        { efXVG, "-field",  "field",    ffOPTWR },
+        { efXVG, "-table",  "table",    ffOPTRD },
+        { efXVG, "-tabletf", "tabletf",    ffOPTRD },
+        { efXVG, "-tablep", "tablep",   ffOPTRD },
+        { efXVG, "-tableb", "table",    ffOPTRD },
+        { efTRX, "-rerun",  "rerun",    ffOPTRD },
+        { efXVG, "-tpi",    "tpi",      ffOPTWR },
+        { efXVG, "-tpid",   "tpidist",  ffOPTWR },
+        { efEDI, "-ei",     "sam",      ffOPTRD },
+        { efXVG, "-eo",     "edsam",    ffOPTWR },
+        { efGCT, "-j",      "wham",     ffOPTRD },
+        { efGCT, "-jo",     "bam",      ffOPTWR },
+        { efXVG, "-ffout",  "gct",      ffOPTWR },
+        { efXVG, "-devout", "deviatie", ffOPTWR },
+        { efXVG, "-runav",  "runaver",  ffOPTWR },
+        { efXVG, "-px",     "pullx",    ffOPTWR },
+        { efXVG, "-pf",     "pullf",    ffOPTWR },
+        { efXVG, "-ro",     "rotation", ffOPTWR },
+        { efLOG, "-ra",     "rotangles", ffOPTWR },
+        { efLOG, "-rs",     "rotslabs", ffOPTWR },
+        { efLOG, "-rt",     "rottorque", ffOPTWR },
+        { efMTX, "-mtx",    "nm",       ffOPTWR },
+        { efNDX, "-dn",     "dipole",   ffOPTWR },
+        { efRND, "-multidir", NULL,      ffOPTRDMULT},
+        { efDAT, "-plumed", "plumed",   ffOPTRD },   /* PLUMED */
+        { efDAT, "-membed", "membed",   ffOPTRD },
+        { efTOP, "-mp",     "membed",   ffOPTRD },
+        { efNDX, "-mn",     "membed",   ffOPTRD }
+    };
+#define NFILE asize(fnm)
+
+    /* Command line options ! */
+    gmx_bool      bCart         = FALSE;
+    gmx_bool      bPPPME        = FALSE;
+    gmx_bool      bPartDec      = FALSE;
+    gmx_bool      bDDBondCheck  = TRUE;
+    gmx_bool      bDDBondComm   = TRUE;
+    gmx_bool      bTunePME      = TRUE;
+    gmx_bool      bTestVerlet   = FALSE;
+    gmx_bool      bVerbose      = FALSE;
+    gmx_bool      bCompact      = TRUE;
+    gmx_bool      bSepPot       = FALSE;
+    gmx_bool      bRerunVSite   = FALSE;
+    gmx_bool      bIonize       = FALSE;
+    gmx_bool      bConfout      = TRUE;
+    gmx_bool      bReproducible = FALSE;
+
+    int           npme          = -1;
+    int           nmultisim     = 0;
+    int           nstglobalcomm = -1;
+    int           repl_ex_nst   = 0;
+    int           repl_ex_seed  = -1;
+    int           repl_ex_nex   = 0;
+    int           nstepout      = 100;
+    int           resetstep     = -1;
+    gmx_large_int_t nsteps      = -2; /* the value -2 means that the mdp option will be used */
+
+    rvec          realddxyz          = {0, 0, 0};
+    const char   *ddno_opt[ddnoNR+1] =
+    { NULL, "interleave", "pp_pme", "cartesian", NULL };
+    const char   *dddlb_opt[] =
+    { NULL, "auto", "no", "yes", NULL };
+    const char   *thread_aff_opt[threadaffNR+1] =
+    { NULL, "auto", "on", "off", NULL };
+    const char   *nbpu_opt[] =
+    { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL };
+    real          rdd                   = 0.0, rconstr = 0.0, dlb_scale = 0.8, pforce = -1;
+    char         *ddcsx                 = NULL, *ddcsy = NULL, *ddcsz = NULL;
+    real          cpt_period            = 15.0, max_hours = -1;
+    gmx_bool      bAppendFiles          = TRUE;
+    gmx_bool      bKeepAndNumCPT        = FALSE;
+    gmx_bool      bResetCountersHalfWay = FALSE;
+    output_env_t  oenv                  = NULL;
+    const char   *deviceOptions         = "";
+
+    gmx_hw_opt_t  hw_opt = {0, 0, 0, 0, threadaffSEL, 0, 0, NULL};
+
+    t_pargs       pa[] = {
+
+        { "-pd",      FALSE, etBOOL, {&bPartDec},
+          "Use particle decompostion" },
+        { "-dd",      FALSE, etRVEC, {&realddxyz},
+          "Domain decomposition grid, 0 is optimize" },
+        { "-ddorder", FALSE, etENUM, {ddno_opt},
+          "DD node order" },
+        { "-npme",    FALSE, etINT, {&npme},
+          "Number of separate nodes to be used for PME, -1 is guess" },
+        { "-nt",      FALSE, etINT, {&hw_opt.nthreads_tot},
+          "Total number of threads to start (0 is guess)" },
+        { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
+          "Number of thread-MPI threads to start (0 is guess)" },
+        { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
+          "Number of OpenMP threads per MPI process/thread to start (0 is guess)" },
+        { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
+          "Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" },
+        { "-pin",     FALSE, etENUM, {thread_aff_opt},
+          "Fix threads (or processes) to specific cores" },
+        { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset},
+          "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" },
+        { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
+          "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
+        { "-gpu_id",  FALSE, etSTR, {&hw_opt.gpu_id},
+          "List of GPU id's to use" },
+        { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
+          "Check for all bonded interactions with DD" },
+        { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm},
+          "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
+        { "-rdd",     FALSE, etREAL, {&rdd},
+          "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" },
+        { "-rcon",    FALSE, etREAL, {&rconstr},
+          "Maximum distance for P-LINCS (nm), 0 is estimate" },
+        { "-dlb",     FALSE, etENUM, {dddlb_opt},
+          "Dynamic load balancing (with DD)" },
+        { "-dds",     FALSE, etREAL, {&dlb_scale},
+          "Minimum allowed dlb scaling of the DD cell size" },
+        { "-ddcsx",   FALSE, etSTR, {&ddcsx},
+          "HIDDENThe DD cell sizes in x" },
+        { "-ddcsy",   FALSE, etSTR, {&ddcsy},
+          "HIDDENThe DD cell sizes in y" },
+        { "-ddcsz",   FALSE, etSTR, {&ddcsz},
+          "HIDDENThe DD cell sizes in z" },
+        { "-gcom",    FALSE, etINT, {&nstglobalcomm},
+          "Global communication frequency" },
+        { "-nb",      FALSE, etENUM, {&nbpu_opt},
+          "Calculate non-bonded interactions on" },
+        { "-tunepme", FALSE, etBOOL, {&bTunePME},
+          "Optimize PME load between PP/PME nodes or GPU/CPU" },
+        { "-testverlet", FALSE, etBOOL, {&bTestVerlet},
+          "Test the Verlet non-bonded scheme" },
+        { "-v",       FALSE, etBOOL, {&bVerbose},
+          "Be loud and noisy" },
+        { "-compact", FALSE, etBOOL, {&bCompact},
+          "Write a compact log file" },
+        { "-seppot",  FALSE, etBOOL, {&bSepPot},
+          "Write separate V and dVdl terms for each interaction type and node to the log file(s)" },
+        { "-pforce",  FALSE, etREAL, {&pforce},
+          "Print all forces larger than this (kJ/mol nm)" },
+        { "-reprod",  FALSE, etBOOL, {&bReproducible},
+          "Try to avoid optimizations that affect binary reproducibility" },
+        { "-cpt",     FALSE, etREAL, {&cpt_period},
+          "Checkpoint interval (minutes)" },
+        { "-cpnum",   FALSE, etBOOL, {&bKeepAndNumCPT},
+          "Keep and number checkpoint files" },
+        { "-append",  FALSE, etBOOL, {&bAppendFiles},
+          "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
+        { "-nsteps",  FALSE, etGMX_LARGE_INT, {&nsteps},
+          "Run this number of steps, overrides .mdp file option" },
+        { "-maxh",   FALSE, etREAL, {&max_hours},
+          "Terminate after 0.99 times this time (hours)" },
+        { "-multi",   FALSE, etINT, {&nmultisim},
+          "Do multiple simulations in parallel" },
+        { "-replex",  FALSE, etINT, {&repl_ex_nst},
+          "Attempt replica exchange periodically with this period (steps)" },
+        { "-nex",  FALSE, etINT, {&repl_ex_nex},
+          "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion).  -nex zero or not specified gives neighbor replica exchange." },
+        { "-reseed",  FALSE, etINT, {&repl_ex_seed},
+          "Seed for replica exchange, -1 is generate a seed" },
+        { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite},
+          "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
+        { "-ionize",  FALSE, etBOOL, {&bIonize},
+          "Do a simulation including the effect of an X-Ray bombardment on your system" },
+        { "-confout", FALSE, etBOOL, {&bConfout},
+          "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" },
+        { "-stepout", FALSE, etINT, {&nstepout},
+          "HIDDENFrequency of writing the remaining runtime" },
+        { "-resetstep", FALSE, etINT, {&resetstep},
+          "HIDDENReset cycle counters after these many time steps" },
+        { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
+          "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" }
+    };
+    gmx_edsam_t   ed;
+    unsigned long Flags, PCA_Flags;
+    ivec          ddxyz;
+    int           dd_node_order;
+    gmx_bool      bAddPart;
+    FILE         *fplog, *fpmulti;
+    int           sim_part, sim_part_fn;
+    const char   *part_suffix = ".part";
+    char          suffix[STRLEN];
+    int           rc;
+    char        **multidir = NULL;
+
+
+    cr = init_par(&argc, &argv);
+
+    if (MASTER(cr))
+    {
+        CopyRight(stderr, argv[0]);
+    }
+
+    PCA_Flags = (PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET));
+
+    /* Comment this in to do fexist calls only on master
+     * works not with rerun or tables at the moment
+     * also comment out the version of init_forcerec in md.c
+     * with NULL instead of opt2fn
+     */
+    /*
+       if (!MASTER(cr))
+       {
+       PCA_Flags |= PCA_NOT_READ_NODE;
+       }
+     */
+
+    parse_common_args(&argc, argv, PCA_Flags, NFILE, fnm, asize(pa), pa,
+                      asize(desc), desc, 0, NULL, &oenv);
+
+
+    /* we set these early because they might be used in init_multisystem()
+       Note that there is the potential for npme>nnodes until the number of
+       threads is set later on, if there's thread parallelization. That shouldn't
+       lead to problems. */
+    dd_node_order = nenum(ddno_opt);
+    cr->npmenodes = npme;
+
+    hw_opt.thread_affinity = nenum(thread_aff_opt);
+
+    /* now check the -multi and -multidir option */
+    if (opt2bSet("-multidir", NFILE, fnm))
+    {
+        int i;
+        if (nmultisim > 0)
+        {
+            gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive.");
+        }
+        nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm);
+    }
+
+
+    if (repl_ex_nst != 0 && nmultisim < 2)
+    {
+        gmx_fatal(FARGS, "Need at least two replicas for replica exchange (option -multi)");
+    }
+
+    if (repl_ex_nex < 0)
+    {
+        gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
+    }
+
+    if (nmultisim > 1)
+    {
+#ifndef GMX_THREAD_MPI
+        gmx_bool bParFn = (multidir == NULL);
+        init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
+#else
+        gmx_fatal(FARGS, "mdrun -multi is not supported with the thread library.Please compile GROMACS with MPI support");
+#endif
+    }
+
+    bAddPart = !bAppendFiles;
+
+    /* Check if there is ANY checkpoint file available */
+    sim_part    = 1;
+    sim_part_fn = sim_part;
+    if (opt2bSet("-cpi", NFILE, fnm))
+    {
+        if (bSepPot && bAppendFiles)
+        {
+            gmx_fatal(FARGS, "Output file appending is not supported with -seppot");
+        }
+
+        bAppendFiles =
+            read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE,
+                                                          fnm, cr),
+                                            &sim_part_fn, NULL, cr,
+                                            bAppendFiles, NFILE, fnm,
+                                            part_suffix, &bAddPart);
+        if (sim_part_fn == 0 && MULTIMASTER(cr))
+        {
+            fprintf(stdout, "No previous checkpoint file present, assuming this is a new run.\n");
+        }
+        else
+        {
+            sim_part = sim_part_fn + 1;
+        }
+
+        if (MULTISIM(cr) && MASTER(cr))
+        {
+            if (MULTIMASTER(cr))
+            {
+                /* Log file is not yet available, so if there's a
+                 * problem we can only write to stderr. */
+                fpmulti = stderr;
+            }
+            else
+            {
+                fpmulti = NULL;
+            }
+            check_multi_int(fpmulti, cr->ms, sim_part, "simulation part", TRUE);
+        }
+    }
+    else
+    {
+        bAppendFiles = FALSE;
+    }
+
+    if (!bAppendFiles)
+    {
+        sim_part_fn = sim_part;
+    }
+
+    if (bAddPart)
+    {
+        /* Rename all output files (except checkpoint files) */
+        /* create new part name first (zero-filled) */
+        sprintf(suffix, "%s%04d", part_suffix, sim_part_fn);
+
+        add_suffix_to_output_names(fnm, NFILE, suffix);
+        if (MULTIMASTER(cr))
+        {
+            fprintf(stdout, "Checkpoint file is from part %d, new output files will be suffixed '%s'.\n", sim_part-1, suffix);
+        }
+    }
+
+    Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0;
+    Flags = Flags | (bSepPot       ? MD_SEPPOT       : 0);
+    Flags = Flags | (bIonize       ? MD_IONIZE       : 0);
+    Flags = Flags | (bPartDec      ? MD_PARTDEC      : 0);
+    Flags = Flags | (bDDBondCheck  ? MD_DDBONDCHECK  : 0);
+    Flags = Flags | (bDDBondComm   ? MD_DDBONDCOMM   : 0);
+    Flags = Flags | (bTunePME      ? MD_TUNEPME      : 0);
+    Flags = Flags | (bTestVerlet   ? MD_TESTVERLET   : 0);
+    Flags = Flags | (bConfout      ? MD_CONFOUT      : 0);
+    Flags = Flags | (bRerunVSite   ? MD_RERUN_VSITE  : 0);
+    Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
+    Flags = Flags | (bAppendFiles  ? MD_APPENDFILES  : 0);
+    Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0);
+    Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0);
+    Flags = Flags | (sim_part > 1    ? MD_STARTFROMCPT : 0);
+    Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0);
+
+
+    /* We postpone opening the log file if we are appending, so we can
+       first truncate the old log file and append to the correct position
+       there instead.  */
+    if ((MASTER(cr) || bSepPot) && !bAppendFiles)
+    {
+        gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr,
+                     !bSepPot, Flags & MD_APPENDFILES, &fplog);
+        CopyRight(fplog, argv[0]);
+        please_cite(fplog, "Hess2008b");
+        please_cite(fplog, "Spoel2005a");
+        please_cite(fplog, "Lindahl2001a");
+        please_cite(fplog, "Berendsen95a");
+    }
+    else if (!MASTER(cr) && bSepPot)
+    {
+        gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr, !bSepPot, Flags, &fplog);
+    }
+    else
+    {
+        fplog = NULL;
+    }
+
+    ddxyz[XX] = (int)(realddxyz[XX] + 0.5);
+    ddxyz[YY] = (int)(realddxyz[YY] + 0.5);
+    ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5);
+    /* PLUMED */
+    /* Enable PLUMED only when opt2bSet() reports the -plumed option as set. */
+    plumedswitch = opt2bSet("-plumed", NFILE, fnm) ? 1 : 0;
+    if (plumedswitch)
+    {
+        /* Values handed to PLUMED by address through plumed_cmd(). */
+        int  real_precision = sizeof(real); /* size of "real" in bytes */
+        real energyUnits    = 1.0;
+        real lengthUnits    = 1.0;
+        real timeUnits      = 1.0;
+
+        if (!plumed_installed())
+        {
+            gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable.");
+        }
+        plumedmain = plumed_create();
+        plumed_cmd(plumedmain, "setRealPrecision", &real_precision);
+        /* this is not necessary for gromacs units: */
+        plumed_cmd(plumedmain, "setMDEnergyUnits", &energyUnits);
+        plumed_cmd(plumedmain, "setMDLengthUnits", &lengthUnits);
+        plumed_cmd(plumedmain, "setMDTimeUnits", &timeUnits);
+        /* NOTE(review): looked up by file type efDAT, not by "-plumed"; confirm fnm order. */
+        plumed_cmd(plumedmain, "setPlumedDat", ftp2fn(efDAT, NFILE, fnm));
+    }
+    /* END PLUMED */
+
+    rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact,
+                  nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr,
+                  dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz,
+                  nbpu_opt[0],
+                  nsteps, nstepout, resetstep,
+                  nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                  pforce, cpt_period, max_hours, deviceOptions, Flags);
+
+    /* PLUMED */
+    if(plumedswitch){ /* non-zero only if plumedmain was created before mdrunner() */
+      plumed_finalize(plumedmain);
+    }
+    /* END PLUMED */
+  
+    gmx_finalize_par();
+
+    if (MULTIMASTER(cr))
+    {
+        thanx(stderr);
+    }
+
+    /* Log file has to be closed in mdrunner if we are appending to it
+       (fplog not set here) */
+    if (MASTER(cr) && !bAppendFiles)
+    {
+        gmx_log_close(fplog);
+    }
+
+    return rc;
+}
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/mdrun.c.preplumed b/patches/gromacs-4.6.3.diff/src/kernel/mdrun.c.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..a5c0e6a6256def6a24b6b0bb702c5eef114b58c5
--- /dev/null
+++ b/patches/gromacs-4.6.3.diff/src/kernel/mdrun.c.preplumed
@@ -0,0 +1,760 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "typedefs.h"
+#include "macros.h"
+#include "copyrite.h"
+#include "main.h"
+#include "statutil.h"
+#include "smalloc.h"
+#include "futil.h"
+#include "smalloc.h"
+#include "edsam.h"
+#include "mdrun.h"
+#include "xmdrun.h"
+#include "checkpoint.h"
+#ifdef GMX_THREAD_MPI
+#include "thread_mpi.h"
+#endif
+
+/* afm stuf */
+#include "pull.h"
+
+int cmain(int argc, char *argv[])
+{
+    const char   *desc[] = {
+        "The [TT]mdrun[tt] program is the main computational chemistry engine",
+        "within GROMACS. Obviously, it performs Molecular Dynamics simulations,",
+        "but it can also perform Stochastic Dynamics, Energy Minimization,",
+        "test particle insertion or (re)calculation of energies.",
+        "Normal mode analysis is another option. In this case [TT]mdrun[tt]",
+        "builds a Hessian matrix from single conformation.",
+        "For usual Normal Modes-like calculations, make sure that",
+        "the structure provided is properly energy-minimized.",
+        "The generated matrix can be diagonalized by [TT]g_nmeig[tt].[PAR]",
+        "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])",
+        "and distributes the topology over nodes if needed.",
+        "[TT]mdrun[tt] produces at least four output files.",
+        "A single log file ([TT]-g[tt]) is written, unless the option",
+        "[TT]-seppot[tt] is used, in which case each node writes a log file.",
+        "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and",
+        "optionally forces.",
+        "The structure file ([TT]-c[tt]) contains the coordinates and",
+        "velocities of the last step.",
+        "The energy file ([TT]-e[tt]) contains energies, the temperature,",
+        "pressure, etc, a lot of these things are also printed in the log file.",
+        "Optionally coordinates can be written to a compressed trajectory file",
+        "([TT]-x[tt]).[PAR]",
+        "The option [TT]-dhdl[tt] is only used when free energy calculation is",
+        "turned on.[PAR]",
+        "A simulation can be run in parallel using two different parallelization",
+        "schemes: MPI parallelization and/or OpenMP thread parallelization.",
+        "The MPI parallelization uses multiple processes when [TT]mdrun[tt] is",
+        "compiled with a normal MPI library or threads when [TT]mdrun[tt] is",
+        "compiled with the GROMACS built-in thread-MPI library. OpenMP threads",
+        "are supported when mdrun is compiled with OpenMP. Full OpenMP support",
+        "is only available with the Verlet cut-off scheme, with the (older)",
+        "group scheme only PME-only processes can use OpenMP parallelization.",
+        "In all cases [TT]mdrun[tt] will by default try to use all the available",
+        "hardware resources. With a normal MPI library only the options",
+        "[TT]-ntomp[tt] (with the Verlet cut-off scheme) and [TT]-ntomp_pme[tt],",
+        "for PME-only processes, can be used to control the number of threads.",
+        "With thread-MPI there are additional options [TT]-nt[tt], which sets",
+        "the total number of threads, and [TT]-ntmpi[tt], which sets the number",
+        "of thread-MPI threads.",
+        "Note that using combined MPI+OpenMP parallelization is almost always",
+        "slower than single parallelization, except at the scaling limit, where",
+        "especially OpenMP parallelization of PME reduces the communication cost.",
+        "OpenMP-only parallelization is much faster than MPI-only parallelization",
+        "on a single CPU(-die). Since we currently don't have proper hardware",
+        "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only",
+        "automatically use OpenMP-only parallelization when you use up to 4",
+        "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16",
+        "threads with Intel Sandy Bridge or newer CPUs. Otherwise MPI-only",
+        "parallelization is used (except with GPUs, see below).",
+        "[PAR]",
+        "To quickly test the performance of the new Verlet cut-off scheme",
+        "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use",
+        "the [TT]-testverlet[tt] option. This should not be used for production,",
+        "since it can slightly modify potentials and it will remove charge groups",
+        "making analysis difficult, as the [TT].tpr[tt] file will still contain",
+        "charge groups. For production simulations it is highly recommended",
+        "to specify [TT]cutoff-scheme = Verlet[tt] in the [TT].mdp[tt] file.",
+        "[PAR]",
+        "With GPUs (only supported with the Verlet cut-off scheme), the number",
+        "of GPUs should match the number of MPI processes or MPI threads,",
+        "excluding PME-only processes/threads. With thread-MPI the number",
+        "of MPI threads will automatically be set to the number of GPUs detected.",
+        "When you want to use a subset of the available GPUs, you can use",
+        "the [TT]-gpu_id[tt] option, where GPU id's are passed as a string,",
+        "e.g. 02 for using GPUs 0 and 2. When you want different GPU id's",
+        "on different nodes of a compute cluster, use the GMX_GPU_ID environment",
+        "variable instead. The format for GMX_GPU_ID is identical to ",
+        "[TT]-gpu_id[tt], but an environment variable can have different values",
+        "on different nodes of a cluster.",
+        "[PAR]",
+        "When using PME with separate PME nodes or with a GPU, the two major",
+        "compute tasks, the non-bonded force calculation and the PME calculation",
+        "run on different compute resources. If this load is not balanced,",
+        "some of the resources will be idle part of time. With the Verlet",
+        "cut-off scheme this load is automatically balanced when the PME load",
+        "is too high (but not when it is too low). This is done by scaling",
+        "the Coulomb cut-off and PME grid spacing by the same amount. In the first",
+        "few hundred steps different settings are tried and the fastest is chosen",
+        "for the rest of the simulation. This does not affect the accuracy of",
+        "the results, but it does affect the decomposition of the Coulomb energy",
+        "into particle and mesh contributions. The auto-tuning can be turned off",
+        "with the option [TT]-notunepme[tt].",
+        "[PAR]",
+        "[TT]mdrun[tt] pins (sets affinity of) threads to specific cores,",
+        "when all (logical) cores on a compute node are used by [TT]mdrun[tt],",
+        "even when no multi-threading is used,",
+        "as this usually results in significantly better performance.",
+        "If the queuing systems or the OpenMP library pinned threads, we honor",
+        "this and don't pin again, even though the layout may be sub-optimal.",
+        "If you want to have [TT]mdrun[tt] override an already set thread affinity",
+        "or pin threads when using less cores, use [TT]-pin on[tt].",
+        "With SMT (simultaneous multithreading), e.g. Intel Hyper-Threading,",
+        "there are multiple logical cores per physical core.",
+        "The option [TT]-pinstride[tt] sets the stride in logical cores for",
+        "pinning consecutive threads. Without SMT, 1 is usually the best choice.",
+        "With Intel Hyper-Threading 2 is best when using half or less of the",
+        "logical cores, 1 otherwise. The default value of 0 do exactly that:",
+        "it minimizes the threads per logical core, to optimize performance.",
+        "If you want to run multiple mdrun jobs on the same physical node,"
+        "you should set [TT]-pinstride[tt] to 1 when using all logical cores.",
+        "When running multiple mdrun (or other) simulations on the same physical",
+        "node, some simulations need to start pinning from a non-zero core",
+        "to avoid overloading cores; with [TT]-pinoffset[tt] you can specify",
+        "the offset in logical cores for pinning.",
+        "[PAR]",
+        "When [TT]mdrun[tt] is started using MPI with more than 1 process",
+        "or with thread-MPI with more than 1 thread, MPI parallelization is used.",
+        "By default domain decomposition is used, unless the [TT]-pd[tt]",
+        "option is set, which selects particle decomposition.",
+        "[PAR]",
+        "With domain decomposition, the spatial decomposition can be set",
+        "with option [TT]-dd[tt]. By default [TT]mdrun[tt] selects a good decomposition.",
+        "The user only needs to change this when the system is very inhomogeneous.",
+        "Dynamic load balancing is set with the option [TT]-dlb[tt],",
+        "which can give a significant performance improvement,",
+        "especially for inhomogeneous systems. The only disadvantage of",
+        "dynamic load balancing is that runs are no longer binary reproducible,",
+        "but in most cases this is not important.",
+        "By default the dynamic load balancing is automatically turned on",
+        "when the measured performance loss due to load imbalance is 5% or more.",
+        "At low parallelization these are the only important options",
+        "for domain decomposition.",
+        "At high parallelization the options in the next two sections",
+        "could be important for increasing the performace.",
+        "[PAR]",
+        "When PME is used with domain decomposition, separate nodes can",
+        "be assigned to do only the PME mesh calculation;",
+        "this is computationally more efficient starting at about 12 nodes.",
+        "The number of PME nodes is set with option [TT]-npme[tt],",
+        "this can not be more than half of the nodes.",
+        "By default [TT]mdrun[tt] makes a guess for the number of PME",
+        "nodes when the number of nodes is larger than 11 or performance wise",
+        "not compatible with the PME grid x dimension.",
+        "But the user should optimize npme. Performance statistics on this issue",
+        "are written at the end of the log file.",
+        "For good load balancing at high parallelization, the PME grid x and y",
+        "dimensions should be divisible by the number of PME nodes",
+        "(the simulation will run correctly also when this is not the case).",
+        "[PAR]",
+        "This section lists all options that affect the domain decomposition.",
+        "[PAR]",
+        "Option [TT]-rdd[tt] can be used to set the required maximum distance",
+        "for inter charge-group bonded interactions.",
+        "Communication for two-body bonded interactions below the non-bonded",
+        "cut-off distance always comes for free with the non-bonded communication.",
+        "Atoms beyond the non-bonded cut-off are only communicated when they have",
+        "missing bonded interactions; this means that the extra cost is minor",
+        "and nearly indepedent of the value of [TT]-rdd[tt].",
+        "With dynamic load balancing option [TT]-rdd[tt] also sets",
+        "the lower limit for the domain decomposition cell sizes.",
+        "By default [TT]-rdd[tt] is determined by [TT]mdrun[tt] based on",
+        "the initial coordinates. The chosen value will be a balance",
+        "between interaction range and communication cost.",
+        "[PAR]",
+        "When inter charge-group bonded interactions are beyond",
+        "the bonded cut-off distance, [TT]mdrun[tt] terminates with an error message.",
+        "For pair interactions and tabulated bonds",
+        "that do not generate exclusions, this check can be turned off",
+        "with the option [TT]-noddcheck[tt].",
+        "[PAR]",
+        "When constraints are present, option [TT]-rcon[tt] influences",
+        "the cell size limit as well.",
+        "Atoms connected by NC constraints, where NC is the LINCS order plus 1,",
+        "should not be beyond the smallest cell size. A error message is",
+        "generated when this happens and the user should change the decomposition",
+        "or decrease the LINCS order and increase the number of LINCS iterations.",
+        "By default [TT]mdrun[tt] estimates the minimum cell size required for P-LINCS",
+        "in a conservative fashion. For high parallelization it can be useful",
+        "to set the distance required for P-LINCS with the option [TT]-rcon[tt].",
+        "[PAR]",
+        "The [TT]-dds[tt] option sets the minimum allowed x, y and/or z scaling",
+        "of the cells with dynamic load balancing. [TT]mdrun[tt] will ensure that",
+        "the cells can scale down by at least this factor. This option is used",
+        "for the automated spatial decomposition (when not using [TT]-dd[tt])",
+        "as well as for determining the number of grid pulses, which in turn",
+        "sets the minimum allowed cell size. Under certain circumstances",
+        "the value of [TT]-dds[tt] might need to be adjusted to account for",
+        "high or low spatial inhomogeneity of the system.",
+        "[PAR]",
+        "The option [TT]-gcom[tt] can be used to only do global communication",
+        "every n steps.",
+        "This can improve performance for highly parallel simulations",
+        "where this global communication step becomes the bottleneck.",
+        "For a global thermostat and/or barostat the temperature",
+        "and/or pressure will also only be updated every [TT]-gcom[tt] steps.",
+        "By default it is set to the minimum of nstcalcenergy and nstlist.[PAR]",
+        "With [TT]-rerun[tt] an input trajectory can be given for which ",
+        "forces and energies will be (re)calculated. Neighbor searching will be",
+        "performed for every frame, unless [TT]nstlist[tt] is zero",
+        "(see the [TT].mdp[tt] file).[PAR]",
+        "ED (essential dynamics) sampling and/or additional flooding potentials",
+        "are switched on by using the [TT]-ei[tt] flag followed by an [TT].edi[tt]",
+        "file. The [TT].edi[tt] file can be produced with the [TT]make_edi[tt] tool",
+        "or by using options in the essdyn menu of the WHAT IF program.",
+        "[TT]mdrun[tt] produces a [TT].xvg[tt] output file that",
+        "contains projections of positions, velocities and forces onto selected",
+        "eigenvectors.[PAR]",
+        "When user-defined potential functions have been selected in the",
+        "[TT].mdp[tt] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]",
+        "a formatted table with potential functions. The file is read from",
+        "either the current directory or from the [TT]GMXLIB[tt] directory.",
+        "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,",
+        "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with",
+        "normal Coulomb.",
+        "When pair interactions are present, a separate table for pair interaction",
+        "functions is read using the [TT]-tablep[tt] option.[PAR]",
+        "When tabulated bonded functions are present in the topology,",
+        "interaction functions are read using the [TT]-tableb[tt] option.",
+        "For each different tabulated interaction type the table file name is",
+        "modified in a different way: before the file extension an underscore is",
+        "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals",
+        "and finally the table number of the interaction type.[PAR]",
+        "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM",
+        "coordinates and forces when pulling is selected",
+        "in the [TT].mdp[tt] file.[PAR]",
+        "With [TT]-multi[tt] or [TT]-multidir[tt], multiple systems can be ",
+        "simulated in parallel.",
+        "As many input files/directories are required as the number of systems. ",
+        "The [TT]-multidir[tt] option takes a list of directories (one for each ",
+        "system) and runs in each of them, using the input/output file names, ",
+        "such as specified by e.g. the [TT]-s[tt] option, relative to these ",
+        "directories.",
+        "With [TT]-multi[tt], the system number is appended to the run input ",
+        "and each output filename, for instance [TT]topol.tpr[tt] becomes",
+        "[TT]topol0.tpr[tt], [TT]topol1.tpr[tt] etc.",
+        "The number of nodes per system is the total number of nodes",
+        "divided by the number of systems.",
+        "One use of this option is for NMR refinement: when distance",
+        "or orientation restraints are present these can be ensemble averaged",
+        "over all the systems.[PAR]",
+        "With [TT]-replex[tt] replica exchange is attempted every given number",
+        "of steps. The number of replicas is set with the [TT]-multi[tt] or ",
+        "[TT]-multidir[tt] option, described above.",
+        "All run input files should use a different coupling temperature,",
+        "the order of the files is not important. The random seed is set with",
+        "[TT]-reseed[tt]. The velocities are scaled and neighbor searching",
+        "is performed after every exchange.[PAR]",
+        "Finally some experimental algorithms can be tested when the",
+        "appropriate options have been given. Currently under",
+        "investigation are: polarizability and X-ray bombardments.",
+        "[PAR]",
+        "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed",
+        "a protein into a membrane. The data file should contain the options",
+        "that where passed to g_membed before. The [TT]-mn[tt] and [TT]-mp[tt]",
+        "both apply to this as well.",
+        "[PAR]",
+        "The option [TT]-pforce[tt] is useful when you suspect a simulation",
+        "crashes due to too large forces. With this option coordinates and",
+        "forces of atoms with a force larger than a certain value will",
+        "be printed to stderr.",
+        "[PAR]",
+        "Checkpoints containing the complete state of the system are written",
+        "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],",
+        "unless option [TT]-cpt[tt] is set to -1.",
+        "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to",
+        "make sure that a recent state of the system is always available,",
+        "even when the simulation is terminated while writing a checkpoint.",
+        "With [TT]-cpnum[tt] all checkpoint files are kept and appended",
+        "with the step number.",
+        "A simulation can be continued by reading the full state from file",
+        "with option [TT]-cpi[tt]. This option is intelligent in the way that",
+        "if no checkpoint file is found, Gromacs just assumes a normal run and",
+        "starts from the first step of the [TT].tpr[tt] file. By default the output",
+        "will be appending to the existing output files. The checkpoint file",
+        "contains checksums of all output files, such that you will never",
+        "loose data when some output files are modified, corrupt or removed.",
+        "There are three scenarios with [TT]-cpi[tt]:[PAR]",
+        "[TT]*[tt] no files with matching names are present: new output files are written[PAR]",
+        "[TT]*[tt] all files are present with names and checksums matching those stored",
+        "in the checkpoint file: files are appended[PAR]",
+        "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]",
+        "With [TT]-noappend[tt] new output files are opened and the simulation",
+        "part number is added to all output file names.",
+        "Note that in all cases the checkpoint file itself is not renamed",
+        "and will be overwritten, unless its name does not match",
+        "the [TT]-cpo[tt] option.",
+        "[PAR]",
+        "With checkpointing the output is appended to previously written",
+        "output files, unless [TT]-noappend[tt] is used or none of the previous",
+        "output files are present (except for the checkpoint file).",
+        "The integrity of the files to be appended is verified using checksums",
+        "which are stored in the checkpoint file. This ensures that output can",
+        "not be mixed up or corrupted due to file appending. When only some",
+        "of the previous output files are present, a fatal error is generated",
+        "and no old output files are modified and no new output files are opened.",
+        "The result with appending will be the same as from a single run.",
+        "The contents will be binary identical, unless you use a different number",
+        "of nodes or dynamic load balancing or the FFT library uses optimizations",
+        "through timing.",
+        "[PAR]",
+        "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint",
+        "file is written at the first neighbor search step where the run time",
+        "exceeds [TT]-maxh[tt]*0.99 hours.",
+        "[PAR]",
+        "When [TT]mdrun[tt] receives a TERM signal, it will set nsteps to the current",
+        "step plus one. When [TT]mdrun[tt] receives an INT signal (e.g. when ctrl+C is",
+        "pressed), it will stop after the next neighbor search step ",
+        "(with nstlist=0 at the next step).",
+        "In both cases all the usual output will be written to file.",
+        "When running with MPI, a signal to one of the [TT]mdrun[tt] processes",
+        "is sufficient, this signal should not be sent to mpirun or",
+        "the [TT]mdrun[tt] process that is the parent of the others.",
+        "[PAR]",
+        "When [TT]mdrun[tt] is started with MPI, it does not run niced by default."
+    };
+    t_commrec    *cr;
+    t_filenm      fnm[] = {
+        { efTPX, NULL,      NULL,       ffREAD },
+        { efTRN, "-o",      NULL,       ffWRITE },
+        { efXTC, "-x",      NULL,       ffOPTWR },
+        { efCPT, "-cpi",    NULL,       ffOPTRD },
+        { efCPT, "-cpo",    NULL,       ffOPTWR },
+        { efSTO, "-c",      "confout",  ffWRITE },
+        { efEDR, "-e",      "ener",     ffWRITE },
+        { efLOG, "-g",      "md",       ffWRITE },
+        { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
+        { efXVG, "-field",  "field",    ffOPTWR },
+        { efXVG, "-table",  "table",    ffOPTRD },
+        { efXVG, "-tabletf", "tabletf",    ffOPTRD },
+        { efXVG, "-tablep", "tablep",   ffOPTRD },
+        { efXVG, "-tableb", "table",    ffOPTRD },
+        { efTRX, "-rerun",  "rerun",    ffOPTRD },
+        { efXVG, "-tpi",    "tpi",      ffOPTWR },
+        { efXVG, "-tpid",   "tpidist",  ffOPTWR },
+        { efEDI, "-ei",     "sam",      ffOPTRD },
+        { efXVG, "-eo",     "edsam",    ffOPTWR },
+        { efGCT, "-j",      "wham",     ffOPTRD },
+        { efGCT, "-jo",     "bam",      ffOPTWR },
+        { efXVG, "-ffout",  "gct",      ffOPTWR },
+        { efXVG, "-devout", "deviatie", ffOPTWR },
+        { efXVG, "-runav",  "runaver",  ffOPTWR },
+        { efXVG, "-px",     "pullx",    ffOPTWR },
+        { efXVG, "-pf",     "pullf",    ffOPTWR },
+        { efXVG, "-ro",     "rotation", ffOPTWR },
+        { efLOG, "-ra",     "rotangles", ffOPTWR },
+        { efLOG, "-rs",     "rotslabs", ffOPTWR },
+        { efLOG, "-rt",     "rottorque", ffOPTWR },
+        { efMTX, "-mtx",    "nm",       ffOPTWR },
+        { efNDX, "-dn",     "dipole",   ffOPTWR },
+        { efRND, "-multidir", NULL,      ffOPTRDMULT},
+        { efDAT, "-membed", "membed",   ffOPTRD },
+        { efTOP, "-mp",     "membed",   ffOPTRD },
+        { efNDX, "-mn",     "membed",   ffOPTRD }
+    };
+#define NFILE asize(fnm)
+
+    /* Command line options ! */
+    gmx_bool      bCart         = FALSE;
+    gmx_bool      bPPPME        = FALSE;
+    gmx_bool      bPartDec      = FALSE;
+    gmx_bool      bDDBondCheck  = TRUE;
+    gmx_bool      bDDBondComm   = TRUE;
+    gmx_bool      bTunePME      = TRUE;
+    gmx_bool      bTestVerlet   = FALSE;
+    gmx_bool      bVerbose      = FALSE;
+    gmx_bool      bCompact      = TRUE;
+    gmx_bool      bSepPot       = FALSE;
+    gmx_bool      bRerunVSite   = FALSE;
+    gmx_bool      bIonize       = FALSE;
+    gmx_bool      bConfout      = TRUE;
+    gmx_bool      bReproducible = FALSE;
+
+    int           npme          = -1;
+    int           nmultisim     = 0;
+    int           nstglobalcomm = -1;
+    int           repl_ex_nst   = 0;
+    int           repl_ex_seed  = -1;
+    int           repl_ex_nex   = 0;
+    int           nstepout      = 100;
+    int           resetstep     = -1;
+    gmx_large_int_t nsteps      = -2; /* the value -2 means that the mdp option will be used */
+
+    rvec          realddxyz          = {0, 0, 0};
+    const char   *ddno_opt[ddnoNR+1] =
+    { NULL, "interleave", "pp_pme", "cartesian", NULL };
+    const char   *dddlb_opt[] =
+    { NULL, "auto", "no", "yes", NULL };
+    const char   *thread_aff_opt[threadaffNR+1] =
+    { NULL, "auto", "on", "off", NULL };
+    const char   *nbpu_opt[] =
+    { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL };
+    real          rdd                   = 0.0, rconstr = 0.0, dlb_scale = 0.8, pforce = -1;
+    char         *ddcsx                 = NULL, *ddcsy = NULL, *ddcsz = NULL;
+    real          cpt_period            = 15.0, max_hours = -1;
+    gmx_bool      bAppendFiles          = TRUE;
+    gmx_bool      bKeepAndNumCPT        = FALSE;
+    gmx_bool      bResetCountersHalfWay = FALSE;
+    output_env_t  oenv                  = NULL;
+    const char   *deviceOptions         = "";
+
+    gmx_hw_opt_t  hw_opt = {0, 0, 0, 0, threadaffSEL, 0, 0, NULL};
+
+    t_pargs       pa[] = {
+
+        { "-pd",      FALSE, etBOOL, {&bPartDec},
+          "Use particle decompostion" },
+        { "-dd",      FALSE, etRVEC, {&realddxyz},
+          "Domain decomposition grid, 0 is optimize" },
+        { "-ddorder", FALSE, etENUM, {ddno_opt},
+          "DD node order" },
+        { "-npme",    FALSE, etINT, {&npme},
+          "Number of separate nodes to be used for PME, -1 is guess" },
+        { "-nt",      FALSE, etINT, {&hw_opt.nthreads_tot},
+          "Total number of threads to start (0 is guess)" },
+        { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
+          "Number of thread-MPI threads to start (0 is guess)" },
+        { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
+          "Number of OpenMP threads per MPI process/thread to start (0 is guess)" },
+        { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
+          "Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" },
+        { "-pin",     FALSE, etENUM, {thread_aff_opt},
+          "Fix threads (or processes) to specific cores" },
+        { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset},
+          "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" },
+        { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
+          "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
+        { "-gpu_id",  FALSE, etSTR, {&hw_opt.gpu_id},
+          "List of GPU id's to use" },
+        { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
+          "Check for all bonded interactions with DD" },
+        { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm},
+          "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
+        { "-rdd",     FALSE, etREAL, {&rdd},
+          "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" },
+        { "-rcon",    FALSE, etREAL, {&rconstr},
+          "Maximum distance for P-LINCS (nm), 0 is estimate" },
+        { "-dlb",     FALSE, etENUM, {dddlb_opt},
+          "Dynamic load balancing (with DD)" },
+        { "-dds",     FALSE, etREAL, {&dlb_scale},
+          "Minimum allowed dlb scaling of the DD cell size" },
+        { "-ddcsx",   FALSE, etSTR, {&ddcsx},
+          "HIDDENThe DD cell sizes in x" },
+        { "-ddcsy",   FALSE, etSTR, {&ddcsy},
+          "HIDDENThe DD cell sizes in y" },
+        { "-ddcsz",   FALSE, etSTR, {&ddcsz},
+          "HIDDENThe DD cell sizes in z" },
+        { "-gcom",    FALSE, etINT, {&nstglobalcomm},
+          "Global communication frequency" },
+        { "-nb",      FALSE, etENUM, {&nbpu_opt},
+          "Calculate non-bonded interactions on" },
+        { "-tunepme", FALSE, etBOOL, {&bTunePME},
+          "Optimize PME load between PP/PME nodes or GPU/CPU" },
+        { "-testverlet", FALSE, etBOOL, {&bTestVerlet},
+          "Test the Verlet non-bonded scheme" },
+        { "-v",       FALSE, etBOOL, {&bVerbose},
+          "Be loud and noisy" },
+        { "-compact", FALSE, etBOOL, {&bCompact},
+          "Write a compact log file" },
+        { "-seppot",  FALSE, etBOOL, {&bSepPot},
+          "Write separate V and dVdl terms for each interaction type and node to the log file(s)" },
+        { "-pforce",  FALSE, etREAL, {&pforce},
+          "Print all forces larger than this (kJ/mol nm)" },
+        { "-reprod",  FALSE, etBOOL, {&bReproducible},
+          "Try to avoid optimizations that affect binary reproducibility" },
+        { "-cpt",     FALSE, etREAL, {&cpt_period},
+          "Checkpoint interval (minutes)" },
+        { "-cpnum",   FALSE, etBOOL, {&bKeepAndNumCPT},
+          "Keep and number checkpoint files" },
+        { "-append",  FALSE, etBOOL, {&bAppendFiles},
+          "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
+        { "-nsteps",  FALSE, etGMX_LARGE_INT, {&nsteps},
+          "Run this number of steps, overrides .mdp file option" },
+        { "-maxh",   FALSE, etREAL, {&max_hours},
+          "Terminate after 0.99 times this time (hours)" },
+        { "-multi",   FALSE, etINT, {&nmultisim},
+          "Do multiple simulations in parallel" },
+        { "-replex",  FALSE, etINT, {&repl_ex_nst},
+          "Attempt replica exchange periodically with this period (steps)" },
+        { "-nex",  FALSE, etINT, {&repl_ex_nex},
+          "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion).  -nex zero or not specified gives neighbor replica exchange." },
+        { "-reseed",  FALSE, etINT, {&repl_ex_seed},
+          "Seed for replica exchange, -1 is generate a seed" },
+        { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite},
+          "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
+        { "-ionize",  FALSE, etBOOL, {&bIonize},
+          "Do a simulation including the effect of an X-Ray bombardment on your system" },
+        { "-confout", FALSE, etBOOL, {&bConfout},
+          "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" },
+        { "-stepout", FALSE, etINT, {&nstepout},
+          "HIDDENFrequency of writing the remaining runtime" },
+        { "-resetstep", FALSE, etINT, {&resetstep},
+          "HIDDENReset cycle counters after these many time steps" },
+        { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
+          "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" }
+    };
+    gmx_edsam_t   ed;
+    unsigned long Flags, PCA_Flags;
+    ivec          ddxyz;
+    int           dd_node_order;
+    gmx_bool      bAddPart;
+    FILE         *fplog, *fpmulti;
+    int           sim_part, sim_part_fn;
+    const char   *part_suffix = ".part";
+    char          suffix[STRLEN];
+    int           rc;
+    char        **multidir = NULL;
+
+
+    cr = init_par(&argc, &argv);
+
+    if (MASTER(cr))
+    {
+        CopyRight(stderr, argv[0]);
+    }
+
+    PCA_Flags = (PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET));
+
+    /* Comment this in to do fexist calls only on master
+     * works not with rerun or tables at the moment
+     * also comment out the version of init_forcerec in md.c
+     * with NULL instead of opt2fn
+     */
+    /*
+       if (!MASTER(cr))
+       {
+       PCA_Flags |= PCA_NOT_READ_NODE;
+       }
+     */
+
+    parse_common_args(&argc, argv, PCA_Flags, NFILE, fnm, asize(pa), pa,
+                      asize(desc), desc, 0, NULL, &oenv);
+
+
+    /* we set these early because they might be used in init_multisystem()
+       Note that there is the potential for npme>nnodes until the number of
+       threads is set later on, if there's thread parallelization. That shouldn't
+       lead to problems. */
+    dd_node_order = nenum(ddno_opt);
+    cr->npmenodes = npme;
+
+    hw_opt.thread_affinity = nenum(thread_aff_opt);
+
+    /* now check the -multi and -multidir option */
+    if (opt2bSet("-multidir", NFILE, fnm))
+    {
+        int i;
+        if (nmultisim > 0)
+        {
+            gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive.");
+        }
+        nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm);
+    }
+
+
+    if (repl_ex_nst != 0 && nmultisim < 2)
+    {
+        gmx_fatal(FARGS, "Need at least two replicas for replica exchange (option -multi)");
+    }
+
+    if (repl_ex_nex < 0)
+    {
+        gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
+    }
+
+    if (nmultisim > 1)
+    {
+#ifndef GMX_THREAD_MPI
+        gmx_bool bParFn = (multidir == NULL);
+        init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
+#else
+        gmx_fatal(FARGS, "mdrun -multi is not supported with the thread library.Please compile GROMACS with MPI support");
+#endif
+    }
+
+    bAddPart = !bAppendFiles;
+
+    /* Check if there is ANY checkpoint file available */
+    sim_part    = 1;
+    sim_part_fn = sim_part;
+    if (opt2bSet("-cpi", NFILE, fnm))
+    {
+        if (bSepPot && bAppendFiles)
+        {
+            gmx_fatal(FARGS, "Output file appending is not supported with -seppot");
+        }
+
+        bAppendFiles =
+            read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE,
+                                                          fnm, cr),
+                                            &sim_part_fn, NULL, cr,
+                                            bAppendFiles, NFILE, fnm,
+                                            part_suffix, &bAddPart);
+        if (sim_part_fn == 0 && MULTIMASTER(cr))
+        {
+            fprintf(stdout, "No previous checkpoint file present, assuming this is a new run.\n");
+        }
+        else
+        {
+            sim_part = sim_part_fn + 1;
+        }
+
+        if (MULTISIM(cr) && MASTER(cr))
+        {
+            if (MULTIMASTER(cr))
+            {
+                /* Log file is not yet available, so if there's a
+                 * problem we can only write to stderr. */
+                fpmulti = stderr;
+            }
+            else
+            {
+                fpmulti = NULL;
+            }
+            check_multi_int(fpmulti, cr->ms, sim_part, "simulation part", TRUE);
+        }
+    }
+    else
+    {
+        bAppendFiles = FALSE;
+    }
+
+    if (!bAppendFiles)
+    {
+        sim_part_fn = sim_part;
+    }
+
+    if (bAddPart)
+    {
+        /* Rename all output files (except checkpoint files) */
+        /* create new part name first (zero-filled) */
+        sprintf(suffix, "%s%04d", part_suffix, sim_part_fn);
+
+        add_suffix_to_output_names(fnm, NFILE, suffix);
+        if (MULTIMASTER(cr))
+        {
+            fprintf(stdout, "Checkpoint file is from part %d, new output files will be suffixed '%s'.\n", sim_part-1, suffix);
+        }
+    }
+
+    Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0;
+    Flags = Flags | (bSepPot       ? MD_SEPPOT       : 0);
+    Flags = Flags | (bIonize       ? MD_IONIZE       : 0);
+    Flags = Flags | (bPartDec      ? MD_PARTDEC      : 0);
+    Flags = Flags | (bDDBondCheck  ? MD_DDBONDCHECK  : 0);
+    Flags = Flags | (bDDBondComm   ? MD_DDBONDCOMM   : 0);
+    Flags = Flags | (bTunePME      ? MD_TUNEPME      : 0);
+    Flags = Flags | (bTestVerlet   ? MD_TESTVERLET   : 0);
+    Flags = Flags | (bConfout      ? MD_CONFOUT      : 0);
+    Flags = Flags | (bRerunVSite   ? MD_RERUN_VSITE  : 0);
+    Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
+    Flags = Flags | (bAppendFiles  ? MD_APPENDFILES  : 0);
+    Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0);
+    Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0);
+    Flags = Flags | (sim_part > 1    ? MD_STARTFROMCPT : 0);
+    Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0);
+
+
+    /* We postpone opening the log file if we are appending, so we can
+       first truncate the old log file and append to the correct position
+       there instead.  */
+    if ((MASTER(cr) || bSepPot) && !bAppendFiles)
+    {
+        gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr,
+                     !bSepPot, Flags & MD_APPENDFILES, &fplog);
+        CopyRight(fplog, argv[0]);
+        please_cite(fplog, "Hess2008b");
+        please_cite(fplog, "Spoel2005a");
+        please_cite(fplog, "Lindahl2001a");
+        please_cite(fplog, "Berendsen95a");
+    }
+    else if (!MASTER(cr) && bSepPot)
+    {
+        gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr, !bSepPot, Flags, &fplog);
+    }
+    else
+    {
+        fplog = NULL;
+    }
+
+    ddxyz[XX] = (int)(realddxyz[XX] + 0.5);
+    ddxyz[YY] = (int)(realddxyz[YY] + 0.5);
+    ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5);
+
+    rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact,
+                  nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr,
+                  dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz,
+                  nbpu_opt[0],
+                  nsteps, nstepout, resetstep,
+                  nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                  pforce, cpt_period, max_hours, deviceOptions, Flags);
+
+    gmx_finalize_par();
+
+    if (MULTIMASTER(cr))
+    {
+        thanx(stderr);
+    }
+
+    /* Log file has to be closed in mdrunner if we are appending to it
+       (fplog not set here) */
+    if (MASTER(cr) && !bAppendFiles)
+    {
+        gmx_log_close(fplog);
+    }
+
+    return rc;
+}
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c b/patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c
new file mode 100644
index 0000000000000000000000000000000000000000..2e79c67ad53769d9b8df84b99b66ca2bfa18de30
--- /dev/null
+++ b/patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c
@@ -0,0 +1,1520 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "repl_ex.h"
+#include "network.h"
+#include "random.h"
+#include "smalloc.h"
+#include "physics.h"
+#include "copyrite.h"
+#include "macros.h"
+#include "vec.h"
+#include "names.h"
+#include "mvdata.h"
+#include "domdec.h"
+#include "partdec.h"
+
+/* PLUMED */
+#include "../../Plumed.h"
+extern int    plumedswitch;
+extern plumed plumedmain;
+/* END PLUMED */
+
+#define PROBABILITYCUTOFF 100
+/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
+
+enum {
+    ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR
+};
+const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"};
+/* erename is indexed by the enum above. ereENDSINGLE (end_single_marker) is not an exchange type itself:
+   it marks the end of the single-variable types, and every type after it exchanges several quantities */
+/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?;
+   Let's wait until we feel better about the pressure control methods giving exact ensembles.  Right now, we assume constant pressure  */
+
+typedef struct gmx_repl_ex /* replica-exchange state, allocated and filled by init_replica_exchange() */
+{
+    int      repl;        /* index of this replica (taken from ms->sim) */
+    int      nrepl;       /* number of replicas (taken from ms->nsim) */
+    real     temp;        /* reference temperature of temperature-coupling group 0 */
+    int      type;        /* exchange type, one of the ere* values; -1 until one is assigned */
+    real   **q;           /* q[ere][replica]: exchanged quantity, one row per single-variable type */
+    gmx_bool bNPT;        /* set for temperature exchange when pressure coupling is on */
+    real    *pres;        /* reference pressure per replica; only allocated when bNPT is set */
+    int     *ind;         /* replica order index; initialised to the identity */
+    int     *allswaps;    /* record of all swaps; starts as a copy of ind */
+    int      nst;         /* exchange interval, as passed to init_replica_exchange() */
+    int      nex;         /* number of exchanges, as passed to init_replica_exchange() */
+    int      seed;        /* random seed; generated on the master simulation when -1 is requested */
+    int      nattempt[2]; /* zeroed at init; presumably attempt counters - confirm against users */
+    real    *prob_sum;    /* nrepl entries; presumably accumulated probabilities - confirm */
+    int    **nmoves;      /* nrepl x nrepl entries; usage not visible in the init code */
+    int     *nexchange;   /* nrepl entries; presumably exchange counts - confirm */
+
+    /* these are helper arrays for replica exchange; allocated here so they
+       don't have to be allocated each time */
+    int      *destinations; /* nrepl entries */
+    int     **cyclic;       /* nrepl x nrepl entries */
+    int     **order;        /* nrepl x nrepl entries */
+    int      *tmpswap;      /* nrepl entries */
+    gmx_bool *incycle;      /* nrepl entries */
+    gmx_bool *bEx;          /* nrepl entries */
+
+    /* helper arrays to hold the quantities that are exchanged */
+    real  *prob;  /* nrepl entries */
+    real  *Epot;  /* nrepl entries */
+    real  *beta;  /* nrepl entries */
+    real  *Vol;   /* nrepl entries */
+    real **de;    /* nrepl x nrepl entries */
+
+} t_gmx_repl_ex;
+
+/* Collects q from every replica and stores the values as quantity ere in re.
+ * Returns whether the quantity takes part in the exchange. */
+static gmx_bool repl_quantity(FILE *fplog, const gmx_multisim_t *ms,
+                              struct gmx_repl_ex *re, int ere, real q)
+{
+    gmx_bool bDiff;
+    real    *qall;
+    int      s;
+
+    /* Only this replica's slot is filled in before the cross-simulation sum */
+    snew(qall, ms->nsim);
+    qall[re->repl] = q;
+    gmx_sum_sim(ms->nsim, qall, ms);
+
+    /* PLUMED */
+    /* The test for differing values between replicas is switched off:
+     * with PLUMED the replicas are allowed to share the same value. */
+    bDiff = TRUE;
+    /* END PLUMED */
+
+    if (bDiff)
+    {
+        /* Set the replica exchange type and quantities */
+        re->type = ere;
+        snew(re->q[ere], re->nrepl);
+
+        s = 0;
+        while (s < ms->nsim)
+        {
+            re->q[ere][s] = qall[s];
+            s++;
+        }
+    }
+    sfree(qall);
+    return bDiff;
+}
+
+gmx_repl_ex_t init_replica_exchange(FILE *fplog,                 /* log file; written to unconditionally */
+                                    const gmx_multisim_t *ms,    /* multi-simulation data; NULL or a single simulation is fatal */
+                                    const t_state *state,        /* only state->natoms is read */
+                                    const t_inputrec *ir,        /* run parameters: coupling, free-energy and pressure settings */
+                                    int nst, int nex, int init_seed) /* exchange interval, exchange count, seed (-1: generate one) */
+{   /* Allocates, fills in and returns the replica-exchange state for this simulation. */
+    real                temp, pres;
+    int                 i, j, k;
+    struct gmx_repl_ex *re;
+    gmx_bool            bTemp;
+    gmx_bool            bLambda = FALSE;
+
+    fprintf(fplog, "\nInitializing Replica Exchange\n");
+
+    if (ms == NULL || ms->nsim == 1)
+    {
+        gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?");
+    }
+
+    snew(re, 1);
+
+    re->repl     = ms->sim;
+    re->nrepl    = ms->nsim;
+    snew(re->q, ereENDSINGLE); /* one row per single-variable exchange type */
+
+    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
+
+    check_multi_int(fplog, ms, state->natoms, "the number of atoms", FALSE); /* presumably verifies the value agrees across simulations - confirm */
+    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
+    check_multi_large_int(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE);
+    check_multi_large_int(fplog, ms, (ir->init_step+nst-1)/nst, /* NOTE(review): assumes nst != 0 - confirm the caller guarantees it */
+                          "first exchange step: init_step/-replex", FALSE);
+    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
+    check_multi_int(fplog, ms, ir->opts.ngtc,
+                    "the number of temperature coupling groups", FALSE);
+    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
+    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
+    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
+
+    re->temp = ir->opts.ref_t[0]; /* group 0 defines the replica temperature */
+    for (i = 1; (i < ir->opts.ngtc); i++)
+    {
+        if (ir->opts.ref_t[i] != re->temp)
+        {
+            fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+            fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+        }
+    }
+
+    re->type = -1; /* repl_quantity() overwrites this when it registers a quantity */
+    bTemp    = repl_quantity(fplog, ms, re, ereTEMP, re->temp);
+    if (ir->efep != efepNO)
+    {
+        bLambda = repl_quantity(fplog, ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state);
+    }
+    if (re->type == -1)  /* nothing was assigned */
+    {
+        gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl);
+    }
+    if (bLambda && bTemp) /* both quantities registered: use the combined type */
+    {
+        re->type = ereTL;
+    }
+
+    if (bTemp)
+    {
+        please_cite(fplog, "Sugita1999a");
+        if (ir->epc != epcNO)
+        {
+            re->bNPT = TRUE;
+            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
+            please_cite(fplog, "Okabe2001a");
+        }
+        if (ir->etc == etcBERENDSEN) /* temperature exchange with this thermostat is refused */
+        {
+            gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
+                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
+        }
+    }
+    if (bLambda)
+    {
+        if (ir->fepvals->delta_lambda != 0)   /* check this? */
+        {
+            gmx_fatal(FARGS, "delta_lambda is not zero");
+        }
+    }
+    if (re->bNPT)
+    {
+        snew(re->pres, re->nrepl);
+        if (ir->epct == epctSURFACETENSION)
+        {
+            pres = ir->ref_p[ZZ][ZZ]; /* only the zz reference pressure is used in this case */
+        }
+        else
+        {
+            pres = 0;
+            j    = 0;
+            for (i = 0; i < DIM; i++) /* average ref_p over the dimensions with non-zero compressibility */
+            {
+                if (ir->compress[i][i] != 0)
+                {
+                    pres += ir->ref_p[i][i];
+                    j++;
+                }
+            }
+            pres /= j; /* NOTE(review): j stays 0 when no diagonal compressibility is set - confirm this cannot happen */
+        }
+        re->pres[re->repl] = pres; /* only this replica's slot is set before the cross-simulation sum */
+        gmx_sum_sim(re->nrepl, re->pres, ms);
+    }
+
+    /* Make an index for increasing replica order */
+    /* only makes sense if one or the other is varying, not both!
+       if both are varying, we trust the order the person gave. */
+    snew(re->ind, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->ind[i] = i;
+    }
+
+    /* PLUMED */
+    // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD)
+    // in those cases replicas can share the same temperature.
+    /*
+    if (re->type < ereENDSINGLE)
+    {
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = i+1; j < re->nrepl; j++)
+            {
+                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
+                {
+                    k          = re->ind[i];
+                    re->ind[i] = re->ind[j];
+                    re->ind[j] = k;
+                }
+                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
+                {
+                    gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
+                }
+            }
+        }
+    }
+    */
+    /* END PLUMED */
+
+    /* keep track of all the swaps, starting with the initial placement. */
+    snew(re->allswaps, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->allswaps[i] = re->ind[i];
+    }
+
+    switch (re->type) /* log the exchanged quantity of every replica, in ind order */
+    {
+        case ereTEMP:
+            fprintf(fplog, "\nReplica exchange in temperature\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereLAMBDA:
+            fprintf(fplog, "\nReplica exchange in lambda\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereTL:
+            fprintf(fplog, "\nReplica exchange in temperature and lambda state\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (re->bNPT)
+    {
+        fprintf(fplog, "\nRepl  p");
+        for (i = 0; i < re->nrepl; i++)
+        {
+            fprintf(fplog, " %5.2f", re->pres[re->ind[i]]);
+        }
+
+        for (i = 0; i < re->nrepl; i++) /* warn for every neighbouring pair whose pressure drops */
+        {
+            if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]]))
+            {
+                fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+                fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+            }
+        }
+    }
+    re->nst = nst;
+    if (init_seed == -1) /* no seed supplied: only the master simulation generates one */
+    {
+        if (MASTERSIM(ms))
+        {
+            re->seed = make_seed();
+        }
+        else
+        {
+            re->seed = 0;
+        }
+        gmx_sumi_sim(1, &(re->seed), ms); /* the others contributed 0; presumably leaves the master's seed everywhere */
+    }
+    else
+    {
+        re->seed = init_seed;
+    }
+    fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst);
+    fprintf(fplog, "\nReplica random seed: %d\n", re->seed);
+
+    re->nattempt[0] = 0;
+    re->nattempt[1] = 0;
+
+    snew(re->prob_sum, re->nrepl);
+    snew(re->nexchange, re->nrepl);
+    snew(re->nmoves, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->nmoves[i], re->nrepl);
+    }
+    fprintf(fplog, "Replica exchange information below: x=exchange, pr=probability\n");
+
+    /* allocate the helper arrays once here so we don't have to snew each time */
+
+    snew(re->destinations, re->nrepl);
+    snew(re->incycle, re->nrepl);
+    snew(re->tmpswap, re->nrepl);
+    snew(re->cyclic, re->nrepl);
+    snew(re->order, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->cyclic[i], re->nrepl);
+        snew(re->order[i], re->nrepl);
+    }
+    /* allocate the arrays storing the data for the replicas */
+    /* not all of these arrays needed in all cases, but they don't take
+       up much space, since the max size is nrepl**2 */
+    snew(re->prob, re->nrepl);
+    snew(re->bEx, re->nrepl);
+    snew(re->beta, re->nrepl);
+    snew(re->Vol, re->nrepl);
+    snew(re->Epot, re->nrepl);
+    snew(re->de, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->de[i], re->nrepl);
+    }
+    re->nex = nex;
+    return re;
+}
+
+/* Trades the n reals in v with partner b over ms->mpi_comm_masters: v is sent
+ * and then overwritten by the received values. A NULL v is skipped. */
+static void exchange_reals(const gmx_multisim_t *ms, int b, real *v, int n)
+{
+    real *buf;
+    int   k;
+
+    if (v == NULL)
+    {
+        return;
+    }
+    snew(buf, n);
+#ifdef GMX_MPI
+    {
+        /* Isend/Recv/Wait is used here in place of one MPI_Sendrecv call */
+        MPI_Request mpi_req;
+
+        MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                  ms->mpi_comm_masters, &mpi_req);
+        MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                 ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+        MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+    }
+#endif
+    /* Without GMX_MPI nothing is received and v takes buf's fresh contents */
+    for (k = 0; k < n; k++)
+    {
+        v[k] = buf[k];
+    }
+    sfree(buf);
+}
+
+
+/* Swap n ints in v with simulation b: our values are sent, then v is overwritten
+ * with the partner's. No-op when v is NULL. NOTE(review): without GMX_MPI nothing is
+ * received and v is overwritten from the scratch buffer as snew() left it - confirm. */
+static void exchange_ints(const gmx_multisim_t *ms, int b, int *v, int n)
+{
+    int        *buf;
+    int         k;
+#ifdef GMX_MPI
+    MPI_Request send_req;
+#endif
+
+    if (v == NULL)
+    {
+        return;
+    }
+    snew(buf, n);
+#ifdef GMX_MPI
+    /* Non-blocking send, blocking receive, then wait for the send to complete;
+     * the partner is expected to run the same sequence towards us. */
+    MPI_Isend(v, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0,
+              ms->mpi_comm_masters, &send_req);
+    MPI_Recv(buf, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0,
+             ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+    MPI_Wait(&send_req, MPI_STATUS_IGNORE);
+#endif
+    for (k = 0; k < n; k++)
+    {
+        v[k] = buf[k];
+    }
+    sfree(buf);
+}
+
+/* Swap n doubles in v with simulation b: our values are sent, then v is overwritten
+ * with the partner's. No-op when v is NULL. NOTE(review): without GMX_MPI nothing is
+ * received and v is overwritten from the scratch buffer as snew() left it - confirm. */
+static void exchange_doubles(const gmx_multisim_t *ms, int b, double *v, int n)
+{
+    double     *buf;
+    int         k;
+#ifdef GMX_MPI
+    MPI_Request send_req;
+#endif
+
+    if (v == NULL)
+    {
+        return;
+    }
+    snew(buf, n);
+#ifdef GMX_MPI
+    /* Non-blocking send, blocking receive, then wait for the send to complete;
+     * the partner is expected to run the same sequence towards us. */
+    MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+              ms->mpi_comm_masters, &send_req);
+    MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+             ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+    MPI_Wait(&send_req, MPI_STATUS_IGNORE);
+#endif
+    for (k = 0; k < n; k++)
+    {
+        v[k] = buf[k];
+    }
+    sfree(buf);
+}
+
+/* Swap n rvecs in v with simulation b: our values are sent, then v is overwritten
+ * with the partner's. No-op when v is NULL. NOTE(review): without GMX_MPI nothing is
+ * received and v is overwritten from the scratch buffer as snew() left it - confirm. */
+static void exchange_rvecs(const gmx_multisim_t *ms, int b, rvec *v, int n)
+{
+    rvec       *buf;
+    int         k;
+#ifdef GMX_MPI
+    MPI_Request send_req;
+#endif
+
+    if (v == NULL)
+    {
+        return;
+    }
+    snew(buf, n);
+#ifdef GMX_MPI
+    /* Non-blocking send, blocking receive, then wait for the send to complete;
+     * the partner is expected to run the same sequence towards us. */
+    MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+              ms->mpi_comm_masters, &send_req);
+    MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+             ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+    MPI_Wait(&send_req, MPI_STATUS_IGNORE);
+#endif
+    for (k = 0; k < n; k++)
+    {
+        copy_rvec(buf[k], v[k]);
+    }
+    sfree(buf);
+}
+
+/* Swap the t_state members listed below with simulation b; keep in sync with t_state. */
+static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state)
+{
+    const int n_tc_chain = state->ngtc * state->nhchainlength;
+    const int n_pc_chain = state->nnhpres * state->nhchainlength;
+
+    exchange_rvecs(ms, b, state->box, DIM);
+    exchange_rvecs(ms, b, state->box_rel, DIM);
+    exchange_rvecs(ms, b, state->boxv, DIM);
+    exchange_reals(ms, b, &(state->veta), 1);
+    exchange_reals(ms, b, &(state->vol0), 1);
+    exchange_rvecs(ms, b, state->svir_prev, DIM);
+    exchange_rvecs(ms, b, state->fvir_prev, DIM);
+    exchange_rvecs(ms, b, state->pres_prev, DIM);
+    exchange_doubles(ms, b, state->nosehoover_xi, n_tc_chain);
+    exchange_doubles(ms, b, state->nosehoover_vxi, n_tc_chain);
+    exchange_doubles(ms, b, state->nhpres_xi, n_pc_chain);
+    exchange_doubles(ms, b, state->nhpres_vxi, n_pc_chain);
+    exchange_doubles(ms, b, state->therm_integral, state->ngtc);
+    exchange_rvecs(ms, b, state->x, state->natoms);
+    exchange_rvecs(ms, b, state->v, state->natoms);
+    exchange_rvecs(ms, b, state->sd_X, state->natoms);
+}
+
+/* Copy the first n rvecs of s into d.
+ * A NULL destination is silently skipped.
+ */
+static void copy_rvecs(rvec *s, rvec *d, int n)
+{
+    int k;
+
+    for (k = 0; d != NULL && k < n; k++)
+    {
+        copy_rvec(s[k], d[k]);
+    }
+}
+
+/* Copy the first n doubles of s into d.
+ * A NULL destination is silently skipped.
+ */
+static void copy_doubles(const double *s, double *d, int n)
+{
+    int k;
+
+    for (k = 0; d != NULL && k < n; k++)
+    {
+        d[k] = s[k];
+    }
+}
+
+/* Copy the first n reals of s into d.
+ * A NULL destination is silently skipped.
+ */
+static void copy_reals(const real *s, real *d, int n)
+{
+    int k;
+
+    for (k = 0; d != NULL && k < n; k++)
+    {
+        d[k] = s[k];
+    }
+}
+
+/* Copy the first n ints of s into d.
+ * A NULL destination is silently skipped.
+ */
+static void copy_ints(const int *s, int *d, int n)
+{
+    int k;
+
+    for (k = 0; d != NULL && k < n; k++)
+    {
+        d[k] = s[k];
+    }
+}
+
+/* Copy box, coupling variables, x, v, sd_X, fep_state and lambda from state to
+ * state_local. When t_state changes, this code should be updated.
+ *
+ * Array members whose destination pointer in state_local is NULL are skipped
+ * by the copy_* helpers.
+ */
+static void copy_state_nonatomdata(t_state *state, t_state *state_local)
+{
+    const int n_tc_chain = state->ngtc * state->nhchainlength;
+    const int n_pc_chain = state->nnhpres * state->nhchainlength;
+
+    copy_rvecs(state->box, state_local->box, DIM);
+    copy_rvecs(state->box_rel, state_local->box_rel, DIM);
+    copy_rvecs(state->boxv, state_local->boxv, DIM);
+    state_local->veta = state->veta;
+    state_local->vol0 = state->vol0;
+    copy_rvecs(state->svir_prev, state_local->svir_prev, DIM);
+    copy_rvecs(state->fvir_prev, state_local->fvir_prev, DIM);
+    copy_rvecs(state->pres_prev, state_local->pres_prev, DIM);
+    copy_doubles(state->nosehoover_xi, state_local->nosehoover_xi, n_tc_chain);
+    copy_doubles(state->nosehoover_vxi, state_local->nosehoover_vxi, n_tc_chain);
+    copy_doubles(state->nhpres_xi, state_local->nhpres_xi, n_pc_chain);
+    copy_doubles(state->nhpres_vxi, state_local->nhpres_vxi, n_pc_chain);
+    copy_doubles(state->therm_integral, state_local->therm_integral, state->ngtc);
+    copy_rvecs(state->x, state_local->x, state->natoms);
+    copy_rvecs(state->v, state_local->v, state->natoms);
+    copy_rvecs(state->sd_X, state_local->sd_X, state->natoms);
+    state_local->fep_state = state->fep_state;
+    copy_reals(state->lambda, state_local->lambda, efptNR);
+}
+
+/* Multiply every velocity in state by fac.
+ * States without a velocity array are left untouched.
+ */
+static void scale_velocities(t_state *state, real fac)
+{
+    int atom;
+
+    for (atom = 0; state->v && atom < state->natoms; atom++)
+    {
+        svmul(fac, state->v[atom], state->v[atom]);
+    }
+}
+
+/* Collect x, and v/sd_X when allocated, with move_rvecs ahead of a replica exchange. */
+static void pd_collect_state(const t_commrec *cr, t_state *state)
+{
+    const int nshift = cr->nnodes - cr->npmenodes - 1;
+
+    if (debug)
+    {
+        fprintf(debug, "Collecting state before exchange\n");
+    }
+    move_rvecs(cr, FALSE, FALSE, GMX_LEFT, GMX_RIGHT, state->x, NULL, nshift, NULL);
+    if (state->v != NULL)
+    {
+        move_rvecs(cr, FALSE, FALSE, GMX_LEFT, GMX_RIGHT, state->v, NULL, nshift, NULL);
+    }
+    if (state->sd_X != NULL)
+    {
+        move_rvecs(cr, FALSE, FALSE, GMX_LEFT, GMX_RIGHT, state->sd_X, NULL, nshift, NULL);
+    }
+}
+
+/* Write the empirical transition matrix to fplog: entry (i,j) is
+ * nmoves[i][j]/(2*(nattempt[0]+nattempt[1])), or 0 when nmoves[i][j] is not
+ * positive. Each row is followed by its 0-based index; leg is unused.
+ */
+static void print_transition_matrix(FILE *fplog, const char *leg, int n, int **nmoves, int *nattempt)
+{
+    const int ntot = nattempt[0] + nattempt[1];
+    int       row, col;
+    float     frac;
+
+    fprintf(fplog, "\n");
+    fprintf(fplog, "Repl");
+    for (col = 0; col < n; col++)
+    {
+        fprintf(fplog, "    ");  /* put the title closer to the center */
+    }
+    fprintf(fplog, "Empirical Transition Matrix\n");
+
+    fprintf(fplog, "Repl");
+    for (col = 0; col < n; col++)
+    {
+        fprintf(fplog, "%8d", (col+1));
+    }
+    fprintf(fplog, "\n");
+
+    for (row = 0; row < n; row++)
+    {
+        fprintf(fplog, "Repl");
+        for (col = 0; col < n; col++)
+        {
+            frac = (nmoves[row][col] > 0) ? nmoves[row][col]/(2.0*ntot) : 0.0;
+            fprintf(fplog, "%8.4f", frac);
+        }
+        fprintf(fplog, "%3d\n", row);
+    }
+}
+
+/* Print ind[0..n-1] on one line; an 'x' before entry k marks bEx[k] (bEx may be NULL). */
+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx)
+{
+    int k = 0;
+    fprintf(fplog, "Repl %2s %2d", leg, ind[k]);
+    while (++k < n)
+    {
+        fprintf(fplog, " %c %2d", (bEx == NULL || !bEx[k]) ? ' ' : 'x', ind[k]);
+    }
+    fprintf(fplog, "\n");
+}
+
+/* Apply the permutation pind to the running order allswaps
+ * (allswaps[k] <- old allswaps[pind[k]], with tmpswap as scratch) and log both
+ * pind and the updated order to fplog. ind is not used.
+ */
+static void print_allswitchind(FILE *fplog, int n, int *ind, int *pind, int *allswaps, int *tmpswap)
+{
+    int k;
+
+    for (k = 0; k < n; k++)
+    {
+        tmpswap[k] = allswaps[k];
+    }
+
+    fprintf(fplog, "\nAccepted Exchanges:   ");
+    for (k = 0; k < n; k++)
+    {
+        allswaps[k] = tmpswap[pind[k]];
+        fprintf(fplog, "%d ", pind[k]);
+    }
+    fprintf(fplog, "\n");
+
+    /* the "Order After Exchange" is the state label corresponding to the configuration that
+       started in state listed in order, i.e.
+
+       3 0 1 2
+
+       means that the:
+       configuration starting in simulation 3 is now in simulation 0,
+       configuration starting in simulation 0 is now in simulation 1,
+       configuration starting in simulation 1 is now in simulation 2,
+       configuration starting in simulation 2 is now in simulation 3
+     */
+    fprintf(fplog, "Order After Exchange: ");
+    for (k = 0; k < n; k++)
+    {
+        fprintf(fplog, "%d ", allswaps[k]);
+    }
+    fprintf(fplog, "\n\n");
+}
+
+/* Print prob[1..n-1] with two decimals and no leading zero ("1.0" when the formatted
+ * value starts with '1'); entries that are not >= 0 are printed as blanks. */
+static void print_prob(FILE *fplog, const char *leg, int n, real *prob)
+{
+    char        buf[8];
+    const char *shown;
+    int         k;
+    fprintf(fplog, "Repl %2s ", leg);
+    for (k = 1; k < n; k++)
+    {
+        shown = "   ";
+        if (prob[k] >= 0)
+        {
+            sprintf(buf, "%4.2f", prob[k]);
+            shown = (buf[0] == '1') ? "1.0" : buf+1;
+        }
+        fprintf(fplog, "  %3s", shown);
+    }
+    fprintf(fplog, "\n");
+}
+
+/* Print count[1..n-1] (entry 0 is skipped) on one line of fplog, prefixed by leg. */
+static void print_count(FILE *fplog, const char *leg, int n, int *count)
+{
+    int k = 0;
+    fprintf(fplog, "Repl %2s ", leg);
+    while (++k < n)
+    {
+        fprintf(fplog, " %4d", count[k]);
+    }
+    fprintf(fplog, "\n");
+}
+
+/* Return delta (in kT) for swapping a and b; the callers accept the swap with probability exp(-delta) when delta > 0. */
+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp)
+{
+    real   ediff, dpV, delta = 0;
+    real  *Epot = re->Epot;
+    real  *Vol  = re->Vol;
+    real **de   = re->de;
+    real  *beta = re->beta;
+
+    /* Two cases; we are permuted and not.  In all cases, setting ap = a and bp = b will reduce
+       to the non permuted case */
+
+    switch (re->type)
+    {
+        case ereTEMP:
+            /*
+             * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439
+             */
+            ediff = Epot[b] - Epot[a];
+            delta = -(beta[bp] - beta[ap])*ediff;
+            break;
+        case ereLAMBDA:
+            /* two cases:  when we are permuted, and not.  */
+            /* non-permuted:
+               ediff =  E_new - E_old
+                     =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
+                     =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
+                     =  de[b][a] + de[a][b] */
+
+            /* permuted:
+               ediff =  E_new - E_old
+                     =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
+                     =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)]
+                     =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
+            /* but, in the current code implementation, we flip configurations, not indices . . .
+               So let's examine that.
+                     =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)]
+                     =  [H_b(x_ap) - H_a(x_ap)]  + [H_a(x_bp) - H_b(x_bp)]
+                     = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp]
+                     So, if we exchange b<=> bp and a<=> ap, we return to the same result.
+                     So the simple solution is to flip the
+                     position of perturbed and original indices in the tests.
+             */
+
+            ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
+            delta = ediff*beta[a]; /* assume all same temperature in this case */
+            break;
+        case ereTL:
+            /* not permuted:  */
+            /* delta =  reduced E_new - reduced E_old
+                     =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
+                     =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
+                     =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
+                        [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
+                     =  [beta_b dH_b(x_a) + [beta_a dH_a(x_b) +
+                        beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b))
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */
+            /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */
+            /* permuted (big breath!) */
+            /*   delta =  reduced E_new - reduced E_old
+                     =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                        - beta_bp H_a(x_a) + beta_ap H_a(x_a) + beta_bp H_a(x_a) - beta_ap H_a(x_a)
+                        - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                     =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
+                        [(beta_ap H_ap(x_b)  - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
+             + beta_bp H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                     =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
+                        [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
+             + beta_bp (H_a(x_a) - H_b(x_b))  - beta_ap (H_a(x_a) - H_b(x_b))
+                     =  ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b]  - beta_bp de[bp][b])
+             + (beta_bp-beta_ap)(H_a(x_a) - H_b(x_b))  */
+            delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]);
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (bPrint)
+    {
+        fprintf(fplog, "Repl %d <-> %d  dE_term = %10.3e (kT)\n", a, b, delta);
+    }
+    if (re->bNPT)
+    {
+        /* revisit the calculation for 5.0.  Might be some improvements. */
+        dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
+        if (bPrint)
+        {
+            /* the line ends at the newline; a stray 'b' used to follow it */
+            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
+        }
+        delta += dpV;
+    }
+    return delta;
+}
+
+/* Decide which replicas swap at this step and store the result in re->destinations.
+ * Gathers Vol/Epot/de over all simulations, then tries either re->nex random pair
+ * swaps (re->nex > 1) or alternating neighbour swaps, and updates the statistics. */
+static void
+test_for_replica_exchange(FILE                 *fplog,  /* receives the exchange report */
+                          const gmx_multisim_t *ms,     /* passed to gmx_sum_sim */
+                          struct gmx_repl_ex   *re,     /* bookkeeping, updated in place */
+                          gmx_enerdata_t       *enerd,  /* F_EPOT and enerpart_lambda of this replica */
+                          real                  vol,    /* volume of this replica, used when re->bNPT */
+                          gmx_large_int_t       step,   /* selects the even/odd neighbour pairs */
+                          real                  time)   /* only printed */
+{
+    int       m, i, j, a, b, ap, bp, i0, i1, tmp;
+    real      ediff = 0, delta = 0, dpV = 0;
+    gmx_bool  bPrint, bMultiEx;
+    gmx_bool *bEx      = re->bEx;
+    real     *prob     = re->prob;
+    int      *pind     = re->destinations; /* permuted index */
+    gmx_bool  bEpot    = FALSE;
+    gmx_bool  bDLambda = FALSE;
+    gmx_bool  bVol     = FALSE;
+
+    bMultiEx = (re->nex > 1);  /* multiple exchanges at each state */
+    fprintf(fplog, "Replica exchange at step " gmx_large_int_pfmt " time %g\n", step, time);
+    if (re->bNPT)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Vol[i] = 0;
+        }
+        bVol               = TRUE;
+        re->Vol[re->repl]  = vol;
+    }
+    if ((re->type == ereTEMP || re->type == ereTL))
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Epot[i] = 0;
+        }
+        bEpot              = TRUE;
+        re->Epot[re->repl] = enerd->term[F_EPOT];
+        /* temperatures of different states */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ);
+        }
+    }
+    else
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->temp*BOLTZ);  /* we have a single temperature */
+        }
+    }
+    if (re->type == ereLAMBDA || re->type == ereTL)
+    {
+        bDLambda = TRUE;
+        /* lambda differences. */
+        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
+           minus the energy of the jth simulation in the jth Hamiltonian */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->de[i][j] = 0;
+            }
+        }
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]);
+        }
+    }
+
+    /* now actually do the communication: every array holds only this replica's
+       entry (the rest is zero), so summing over the simulations fills it */
+    if (bVol)
+    {
+        gmx_sum_sim(re->nrepl, re->Vol, ms);
+    }
+    if (bEpot)
+    {
+        gmx_sum_sim(re->nrepl, re->Epot, ms);
+    }
+    if (bDLambda)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            gmx_sum_sim(re->nrepl, re->de[i], ms);
+        }
+    }
+
+    /* make a duplicate set of indices for shuffling */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        pind[i] = re->ind[i];
+    }
+
+    /* PLUMED */
+    int plumed_test_exchange_pattern=0;
+    /* END PLUMED */
+
+    if (bMultiEx)
+    {
+        /* multiple random switch exchange */
+        for (i = 0; i < re->nex; i++)
+        {
+            /* randomly select a pair  */
+            /* in theory, could reduce this by identifying only which switches had a non-negligible
+               probability of occurring (log p > -100) and only operate on those switches */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work than useful. */
+            i0 = (int)(re->nrepl*rando(&(re->seed)));
+            i1 = (int)(re->nrepl*rando(&(re->seed)));
+            if (i0 == i1)
+            {
+                i--;
+                continue;  /* self-exchange, back up and do it again */
+            }
+
+            a  = re->ind[i0]; /* what are the indices of these states? */
+            b  = re->ind[i1];
+            ap = pind[i0];
+            bp = pind[i1];
+
+            bPrint = FALSE; /* too noisy */
+            /* calculate the energy difference */
+            /* if the code changes to flip the STATES, rather than the configurations,
+               use the commented version of the code */
+            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+            delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
+            /* we actually only use the first space in the prob and bEx array,
+               since there are actually many switches between pairs. */
+            if (delta <= 0)
+            {
+                /* accepted */
+                prob[0] = 1;
+                bEx[0]  = TRUE;
+            }
+            else
+            {
+                if (delta > PROBABILITYCUTOFF)
+                {
+                    prob[0] = 0;
+                }
+                else
+                {
+                    prob[0] = exp(-delta);
+                }
+                /* roll a number to determine if accepted */
+                bEx[0] = (rando(&(re->seed)) < prob[0]);
+            }
+            re->prob_sum[0] += prob[0];
+
+            if (bEx[0])
+            {
+                /* swap the states */
+                tmp      = pind[i0];
+                pind[i0] = pind[i1];
+                pind[i1] = tmp;
+            }
+        }
+        re->nattempt[0]++;  /* keep track of total permutation trials here */
+        print_allswitchind(fplog, re->nrepl, re->ind, pind, re->allswaps, re->tmpswap);
+    }
+    else
+    {
+        /* standard nearest neighbor replica exchange */
+        m = (step / re->nst) % 2;
+        /* PLUMED */
+        if(plumedswitch){
+          int partner=re->repl;
+          plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern);
+          if(plumed_test_exchange_pattern>0){
+            int *list;
+            snew(list,re->nrepl);
+            plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl));
+            plumed_cmd(plumedmain,"getExchangesList",list);
+            for(i=0; i<re->nrepl; i++) re->ind[i]=list[i];
+            sfree(list);
+          }
+
+          for(i=1; i<re->nrepl; i++) {
+            if (i % 2 != m) continue;
+            a = re->ind[i-1];
+            b = re->ind[i];
+            if(re->repl==a) partner=b;
+            if(re->repl==b) partner=a;
+          }
+          plumed_cmd(plumedmain,"GREX setPartner",&partner);
+          plumed_cmd(plumedmain,"GREX calculate",NULL);
+          plumed_cmd(plumedmain,"GREX shareAllDeltaBias",NULL);
+        }
+        /* END PLUMED */
+        for (i = 1; i < re->nrepl; i++)
+        {
+            a = re->ind[i-1];
+            b = re->ind[i];
+
+            bPrint = (re->repl == a || re->repl == b);
+            if (i % 2 == m)
+            {
+                delta = calc_delta(fplog, bPrint, re, a, b, a, b);
+                /* PLUMED */
+                if(plumedswitch){
+                  real adb,bdb,dplumed;
+                  char buf[300];
+                  sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb);
+                  sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb);
+                  dplumed=adb*re->beta[a]+bdb*re->beta[b];
+                  delta+=dplumed;
+                  if (bPrint)
+                    fprintf(fplog,"dplumed = %10.3e  dE_Term = %10.3e (kT)\n",dplumed,delta);
+                }
+                /* END PLUMED */
+                if (delta <= 0)
+                {
+                    /* accepted */
+                    prob[i] = 1;
+                    bEx[i]  = TRUE;
+                }
+                else
+                {
+                    if (delta > PROBABILITYCUTOFF)
+                    {
+                        prob[i] = 0;
+                    }
+                    else
+                    {
+                        prob[i] = exp(-delta);
+                    }
+                    /* roll a number to determine if accepted */
+                    bEx[i] = (rando(&(re->seed)) < prob[i]);
+                }
+                re->prob_sum[i] += prob[i];
+
+                if (bEx[i])
+                {
+                  /* PLUMED */
+                  if(!plumed_test_exchange_pattern) {
+                    /* standard neighbour swapping */
+                    /* swap these two */
+                    tmp       = pind[i-1];
+                    pind[i-1] = pind[i];
+                    pind[i]   = tmp;
+                    re->nexchange[i]++;  /* statistics for back compatibility */
+                  } else {
+                    /* alternative swapping patterns: pind is indexed by the replica
+                       labels a and b taken from the PLUMED-supplied list */
+                    tmp       = pind[a];
+                    pind[a]   = pind[b];
+                    pind[b]   = tmp;
+                    re->nexchange[i]++;  /* statistics for back compatibility */
+                  }
+                  /* END PLUMED */
+                }
+            }
+            else
+            {
+                /* pair not tested at this parity: a negative prob marks "no attempt" */
+                prob[i] = -1;
+                bEx[i]  = FALSE;
+            }
+        }
+        /* print some statistics */
+        print_ind(fplog, "ex", re->nrepl, re->ind, bEx);
+        print_prob(fplog, "pr", re->nrepl, prob);
+        fprintf(fplog, "\n");
+        re->nattempt[m]++;
+    }
+
+    /* PLUMED: re->ind was overwritten with the PLUMED list above; reset it to identity */
+    if(plumed_test_exchange_pattern>0) {
+      for (i = 0; i < re->nrepl; i++)
+      {
+          re->ind[i] = i;
+      }
+    }
+    /* END PLUMED */
+
+    /* record which moves were made and accepted */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->nmoves[re->ind[i]][pind[i]] += 1;
+        re->nmoves[pind[i]][re->ind[i]] += 1;
+    }
+    fflush(fplog); /* make sure we can see what the last exchange was */
+}
+
+/* When the debug stream is open, dump the coordinates of every 10th atom to it.
+ * Otherwise nothing is written.
+ */
+static void write_debug_x(t_state *state)
+{
+    int atom;
+
+    for (atom = 0; debug && atom < state->natoms; atom += 10)
+    {
+        fprintf(debug, "dx %5d %10.5f %10.5f %10.5f\n", atom, state->x[atom][XX], state->x[atom][YY], state->x[atom][ZZ]);
+    }
+}
+
+/* Split the permutation destinations[] into cycles: cyclic[i] lists the cycle that
+ * starts at i (-1 terminated when shorter than nrepl); *nswap = longest cycle - 1. */
+static void
+cyclic_decomposition(FILE      *fplog,
+                     const int *destinations,
+                     int      **cyclic,
+                     gmx_bool  *incycle,
+                     const int  nrepl,
+                     int       *nswap)
+{
+    int i, j, c, p;
+    int maxlen = 1;
+    for (i = 0; i < nrepl; i++)
+    {
+        incycle[i] = FALSE;
+    }
+    for (i = 0; i < nrepl; i++)  /* one cycle for each replica */
+    {
+        if (incycle[i])
+        {
+            cyclic[i][0] = -1;
+            continue;
+        }
+        cyclic[i][0] = i;
+        incycle[i]   = TRUE;
+        c            = 1;
+        p            = i;
+        for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */
+        {
+            p = destinations[p];    /* start permuting */
+            if (p == i)
+            {
+                if (c < nrepl) /* a cycle through all nrepl replicas fills the row: no slot for -1 */
+                {
+                    cyclic[i][c] = -1;
+                }
+                if (c > maxlen)
+                {
+                    maxlen = c;
+                }
+                break; /* we've reached the original element, the cycle is complete, and we marked the end. */
+            }
+            else
+            {
+                cyclic[i][c] = p;  /* each permutation gives a new member of the cycle */
+                incycle[p]   = TRUE;
+                c++;
+            }
+        }
+    }
+    *nswap = maxlen - 1;
+
+    if (debug)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Cycle %d:", i);
+            for (j = 0; j < nrepl && cyclic[i][j] >= 0; j++)
+            {
+                fprintf(debug, "%2d", cyclic[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
+
+/* Turn the cycles in cyclic[][] into a per-round schedule: order[r][j] is the replica
+ * that r swaps with in round j, or r itself when it sits that round out.
+ * Entries of order are expected to be -1 on entry (see prepare_to_do_exchange).
+ * fplog is unused; the optional dump goes to the debug stream only. */
+static void
+compute_exchange_order(FILE     *fplog,
+                       int     **cyclic,
+                       int     **order,
+                       const int nrepl,
+                       const int maxswap)
+{
+    int i, j;
+
+    for (j = 0; j < maxswap; j++)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            if (cyclic[i][j+1] >= 0)
+            {
+                order[cyclic[i][j+1]][j] = cyclic[i][j];
+                order[cyclic[i][j]][j]   = cyclic[i][j+1];
+            }
+        }
+        for (i = 0; i < nrepl; i++)
+        {
+            if (order[i][j] < 0)
+            {
+                order[i][j] = i; /* if it's not exchanging, it should stay this round*/
+            }
+        }
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "Replica Exchange Order\n");
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Replica %d:", i);
+            for (j = 0; j < maxswap && order[i][j] >= 0; j++)
+            {
+                fprintf(debug, "%2d", order[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
+
+/* Work out how the accepted permutation has to be carried out.
+ *
+ * If destinations[] moves at least one replica, cyclic and order are reset to -1,
+ * the permutation is split into cycles, and the cycles are turned into rounds of
+ * pairwise swaps (*maxswap, order). *bThisReplicaExchanged becomes TRUE when
+ * replica_id has a partner other than itself in any round, and FALSE otherwise.
+ */
+static void
+prepare_to_do_exchange(FILE      *fplog,
+                       const int *destinations,
+                       const int  replica_id,
+                       const int  nrepl,
+                       int       *maxswap,
+                       int      **order,
+                       int      **cyclic,
+                       int       *incycle,
+                       gmx_bool  *bThisReplicaExchanged)
+{
+    int      r, round;
+    gmx_bool bAnyReplicaExchanged = FALSE;
+
+    *bThisReplicaExchanged = FALSE;
+
+    /* only mark as exchanged if the index has been shuffled */
+    for (r = 0; r < nrepl && !bAnyReplicaExchanged; r++)
+    {
+        bAnyReplicaExchanged = (destinations[r] != r);
+    }
+    if (!bAnyReplicaExchanged)
+    {
+        return;
+    }
+
+    /* reinitialize the placeholder arrays */
+    for (r = 0; r < nrepl; r++)
+    {
+        for (round = 0; round < nrepl; round++)
+        {
+            cyclic[r][round] = -1;
+            order[r][round]  = -1;
+        }
+    }
+
+    /* Identify the cyclic decomposition of the permutation (very
+     * fast if neighbor replica exchange). */
+    cyclic_decomposition(fplog, destinations, cyclic, incycle, nrepl, maxswap);
+
+    /* Now translate the decomposition into a replica exchange
+     * order at each step. */
+    compute_exchange_order(fplog, cyclic, order, nrepl, *maxswap);
+
+    /* Did this replica do any exchange at any point? */
+    for (round = 0; round < *maxswap && !*bThisReplicaExchanged; round++)
+    {
+        *bThisReplicaExchanged = (replica_id != order[replica_id][round]);
+    }
+}
+
+/* Attempt a replica exchange at this step. The master rank decides the swaps and
+ * trades the global state with its partner(s); the outcome is broadcast within the
+ * simulation when running in parallel. Returns whether this replica's state was
+ * exchanged (velocities are rescaled for the ereTEMP and ereTL exchange types). */
+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re,
+                          t_state *state, gmx_enerdata_t *enerd,
+                          t_state *state_local, gmx_large_int_t step, real time)
+{
+    int i, j;
+    int replica_id = 0;
+    int exchange_partner;
+    int maxswap = 0; /* number of rounds of exchanges needed to deal with any multiple exchanges */
+    gmx_bool bThisReplicaExchanged = FALSE;
+
+    /* PLUMED */
+    if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",NULL);
+    /* END PLUMED */
+
+    if (MASTER(cr))
+    {
+        replica_id  = re->repl;
+        test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time);
+        prepare_to_do_exchange(fplog, re->destinations, replica_id, re->nrepl, &maxswap,
+                               re->order, re->cyclic, re->incycle, &bThisReplicaExchanged);
+    }
+    /* Do intra-simulation broadcast so all processors belonging to
+     * each simulation know whether they need to participate in
+     * collecting the state. Otherwise, they might as well get on with
+     * the next thing to do. */
+    if (PAR(cr))
+    {
+#ifdef GMX_MPI
+        MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr),
+                  cr->mpi_comm_mygroup);
+#endif
+    }
+
+    if (bThisReplicaExchanged)
+    {
+        /* Exchange the states */
+
+        if (PAR(cr))
+        {
+            /* Collect the global state on the master node */
+            if (DOMAINDECOMP(cr))
+            {
+                dd_collect_state(cr->dd, state_local, state);
+            }
+            else
+            {
+                pd_collect_state(cr, state);
+            }
+        }
+
+        if (MASTER(cr))
+        {
+            /* There will be only one swap cycle with standard replica
+             * exchange, but there may be multiple swap cycles if we
+             * allow multiple swaps. */
+
+            for (j = 0; j < maxswap; j++)
+            {
+                exchange_partner = re->order[replica_id][j];
+
+                if (exchange_partner != replica_id)
+                {
+                    /* Exchange the global states between the master nodes */
+                    /* The "if (debug)" guard around this message is disabled,
+                     * so every exchange is reported in the log file. */
+                        fprintf(fplog, "Exchanging %d with %d\n", replica_id, exchange_partner);
+
+                    exchange_state(cr->ms, exchange_partner, state);
+                }
+            }
+            /* For temperature-type replica exchange, we need to scale
+             * the velocities. */
+            if (re->type == ereTEMP || re->type == ereTL)
+            {
+                scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]]));
+            }
+
+        }
+
+        /* With domain decomposition the global state is distributed later */
+        if (!DOMAINDECOMP(cr))
+        {
+            /* Copy the global state to the local state data structure */
+            copy_state_nonatomdata(state, state_local);
+
+            if (PAR(cr))
+            {
+                bcast_state(cr, state, FALSE);
+            }
+        }
+    }
+
+    return bThisReplicaExchanged;
+}
+
+/* Write the end-of-run replica exchange summary to fplog.
+ *
+ * When re->nex == 0 this prints the attempt counts, then for every
+ * neighbouring pair the average exchange probability, the number of
+ * accepted exchanges and the accepted fraction.
+ *
+ * re->prob is overwritten as scratch space while doing so.
+ * The empirical transition matrix is printed in all cases.
+ */
+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re)
+{
+    int k, ntry;
+
+    fprintf(fplog, "\nReplica exchange statistics\n");
+
+    if (re->nex == 0)
+    {
+        fprintf(fplog, "Repl  %d attempts, %d odd, %d even\n",
+                re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]);
+
+        /* pair k is attempted on steps of parity k%2; avoid dividing by zero */
+        fprintf(fplog, "Repl  average probabilities:\n");
+        for (k = 1; k < re->nrepl; k++)
+        {
+            ntry        = re->nattempt[k%2];
+            re->prob[k] = (ntry == 0) ? 0 : re->prob_sum[k]/ntry;
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        /* raw acceptance counters */
+        fprintf(fplog, "Repl  number of exchanges:\n");
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_count(fplog, "", re->nrepl, re->nexchange);
+
+        /* accepted exchanges divided by attempts, again per pair */
+        fprintf(fplog, "Repl  average number of exchanges:\n");
+        for (k = 1; k < re->nrepl; k++)
+        {
+            ntry        = re->nattempt[k%2];
+            re->prob[k] = (ntry == 0) ? 0 : ((real)re->nexchange[k])/ntry;
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        fprintf(fplog, "\n");
+    }
+    /* print the transition matrix */
+    print_transition_matrix(fplog, "", re->nrepl, re->nmoves, re->nattempt);
+}
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c.preplumed b/patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c.preplumed
new file mode 100644
index 0000000000000000000000000000000000000000..60cd71848a783e2395493ef9bd3b75d3fdfd9a95
--- /dev/null
+++ b/patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c.preplumed
@@ -0,0 +1,1441 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "repl_ex.h"
+#include "network.h"
+#include "random.h"
+#include "smalloc.h"
+#include "physics.h"
+#include "copyrite.h"
+#include "macros.h"
+#include "vec.h"
+#include "names.h"
+#include "mvdata.h"
+#include "domdec.h"
+#include "partdec.h"
+
+#define PROBABILITYCUTOFF 100
+/* A swap whose delta exceeds this cutoff gets probability 0: we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
+/* Replica exchange types; the values index erename[] and, below ereENDSINGLE, the rows of gmx_repl_ex.q */
+enum {
+    ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR
+};
+const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"};
+/* end_single_marker merely notes the end of single variable replica exchange. All types higher than
+   it are multiple replica exchange methods */
+/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?;
+   Let's wait until we feel better about the pressure control methods giving exact ensembles.  Right now, we assume constant pressure  */
+
+typedef struct gmx_repl_ex
+{
+    int      repl;        /* index of this simulation (ms->sim) */
+    int      nrepl;       /* number of simulations (ms->nsim) */
+    real     temp;        /* ir->opts.ref_t[0] of this simulation */
+    int      type;        /* ereTEMP, ereLAMBDA or ereTL; -1 until a varying quantity is found */
+    real   **q;           /* q[ere][s]: value of quantity ere in simulation s (rows below ereENDSINGLE) */
+    gmx_bool bNPT;        /* set when temperature varies and ir->epc != epcNO */
+    real    *pres;        /* reference pressure per simulation; allocated only when bNPT */
+    int     *ind;         /* simulation indices, sorted by increasing quantity for single-variable types */
+    int     *allswaps;    /* starts as a copy of ind; permuted by print_allswitchind */
+    int      nst;         /* logged as "Replica exchange interval" */
+    int      nex;         /* > 1 selects multiple random exchanges; 0 enables the per-replica statistics */
+    int      seed;        /* random seed; generated on the master simulation when init_seed == -1 */
+    int      nattempt[2]; /* attempt counters: [0] is reported as even, [1] as odd */
+    real    *prob_sum;    /* accumulated exchange probabilities */
+    int    **nmoves;      /* nrepl x nrepl counts behind the empirical transition matrix */
+    int     *nexchange;   /* counters printed as "number of exchanges" */
+
+    /* these are helper arrays for replica exchange; allocated here so they
+       don't have to be allocated each time */
+    int      *destinations; /* used as the permuted index (pind) when testing exchanges */
+    int     **cyclic;       /* nrepl x nrepl scratch (use not shown here - verify) */
+    int     **order;        /* nrepl x nrepl scratch (use not shown here - verify) */
+    int      *tmpswap;      /* scratch copy of allswaps in print_allswitchind */
+    gmx_bool *incycle;      /* nrepl flags (use not shown here - verify) */
+    gmx_bool *bEx;          /* whether an exchange was accepted */
+
+    /* helper arrays to hold the quantities that are exchanged */
+    real  *prob;  /* exchange probabilities; also scratch for the final statistics */
+    real  *Epot;  /* potential energy (enerd->term[F_EPOT]) per simulation */
+    real  *beta;  /* 1.0/(temperature*BOLTZ) per simulation */
+    real  *Vol;   /* volume per simulation (filled when bNPT) */
+    real **de;    /* de[i][j]: energy of simulation j in Hamiltonian i minus its energy in Hamiltonian j */
+
+} t_gmx_repl_ex;
+
+static gmx_bool repl_quantity(FILE *fplog, const gmx_multisim_t *ms,
+                              struct gmx_repl_ex *re, int ere, real q)
+{
+    /* Gathers quantity q from every simulation and, when the values are
+       not all equal, records them in re->q[ere] and sets re->type = ere.
+       Returns TRUE exactly in that case.  fplog is not used here. */
+    real    *gathered;
+    gmx_bool bVaries = FALSE;
+    int      sim;
+
+    /* each simulation fills in only its own slot; presumably snew
+       zero-fills, so gmx_sum_sim leaves the full list everywhere */
+    snew(gathered, ms->nsim);
+    gathered[re->repl] = q;
+    gmx_sum_sim(ms->nsim, gathered, ms);
+
+    for (sim = 1; sim < ms->nsim && !bVaries; sim++)
+    {
+        bVaries = (gathered[sim] != gathered[0]) ? TRUE : FALSE;
+    }
+
+    if (bVaries)
+    {
+        /* Set the replica exchange type and quantities */
+        re->type = ere;
+        snew(re->q[ere], re->nrepl);
+        for (sim = 0; sim < ms->nsim; sim++)
+        {
+            re->q[ere][sim] = gathered[sim];
+        }
+    }
+    sfree(gathered);
+    return bVaries;
+}
+
+gmx_repl_ex_t init_replica_exchange(FILE *fplog,
+                                    const gmx_multisim_t *ms,
+                                    const t_state *state,
+                                    const t_inputrec *ir,
+                                    int nst, int nex, int init_seed)
+{
+    real                temp, pres;
+    int                 i, j, k;
+    struct gmx_repl_ex *re;
+    gmx_bool            bTemp;
+    gmx_bool            bLambda = FALSE;
+    /* Cross-checks the simulations, finds what is exchanged, orders the replicas and returns the allocated gmx_repl_ex. */
+    fprintf(fplog, "\nInitializing Replica Exchange\n");
+
+    if (ms == NULL || ms->nsim == 1)
+    {
+        gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?");
+    }
+
+    snew(re, 1);
+
+    re->repl     = ms->sim;   /* index of this simulation */
+    re->nrepl    = ms->nsim;  /* total number of simulations */
+    snew(re->q, ereENDSINGLE); /* one row per single-variable exchange type */
+
+    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
+    /* settings compared across all simulations (check_multi_* are defined elsewhere) */
+    check_multi_int(fplog, ms, state->natoms, "the number of atoms", FALSE);
+    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
+    check_multi_large_int(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE);
+    check_multi_large_int(fplog, ms, (ir->init_step+nst-1)/nst,
+                          "first exchange step: init_step/-replex", FALSE);
+    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
+    check_multi_int(fplog, ms, ir->opts.ngtc,
+                    "the number of temperature coupling groups", FALSE);
+    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
+    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
+    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
+
+    re->temp = ir->opts.ref_t[0]; /* coupling group 0 defines the replica temperature */
+    for (i = 1; (i < ir->opts.ngtc); i++)
+    {
+        if (ir->opts.ref_t[i] != re->temp)
+        {
+            fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+            fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+        }
+    }
+
+    re->type = -1; /* stays -1 unless repl_quantity finds a varying quantity */
+    bTemp    = repl_quantity(fplog, ms, re, ereTEMP, re->temp);
+    if (ir->efep != efepNO)
+    {
+        bLambda = repl_quantity(fplog, ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state);
+    }
+    if (re->type == -1)  /* nothing was assigned */
+    {
+        gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl);
+    }
+    if (bLambda && bTemp) /* both vary: use the combined exchange type */
+    {
+        re->type = ereTL;
+    }
+
+    if (bTemp)
+    {
+        please_cite(fplog, "Sugita1999a");
+        if (ir->epc != epcNO)
+        {
+            re->bNPT = TRUE;
+            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
+            please_cite(fplog, "Okabe2001a");
+        }
+        if (ir->etc == etcBERENDSEN)
+        {
+            gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
+                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
+        }
+    }
+    if (bLambda)
+    {
+        if (ir->fepvals->delta_lambda != 0)   /* check this? */
+        {
+            gmx_fatal(FARGS, "delta_lambda is not zero");
+        }
+    }
+    if (re->bNPT) /* bNPT is only ever set above; presumably snew left it FALSE otherwise - verify */
+    {
+        snew(re->pres, re->nrepl);
+        if (ir->epct == epctSURFACETENSION)
+        {
+            pres = ir->ref_p[ZZ][ZZ];
+        }
+        else
+        {
+            /* average ref_p over the dimensions with non-zero compressibility */
+            pres = 0;
+            j    = 0;
+            for (i = 0; i < DIM; i++)
+            {
+                if (ir->compress[i][i] != 0)
+                {
+                    pres += ir->ref_p[i][i];
+                    j++;
+                }
+            }
+            pres /= j; /* NOTE(review): j is 0 when every compress[i][i] is 0 - confirm that cannot happen here */
+        }
+        re->pres[re->repl] = pres; /* own slot only; the sum below fills in the others */
+        gmx_sum_sim(re->nrepl, re->pres, ms);
+    }
+
+    /* Make an index for increasing replica order */
+    /* only makes sense if one or the other is varying, not both!
+       if both are varying, we trust the order the person gave. */
+    snew(re->ind, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->ind[i] = i;
+    }
+
+    if (re->type < ereENDSINGLE)
+    {
+        /* exchange sort of ind by the single varying quantity; two equal values are fatal */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = i+1; j < re->nrepl; j++)
+            {
+                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
+                {
+                    k          = re->ind[i];
+                    re->ind[i] = re->ind[j];
+                    re->ind[j] = k;
+                }
+                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
+                {
+                    gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
+                }
+            }
+        }
+    }
+
+    /* keep track of all the swaps, starting with the initial placement. */
+    snew(re->allswaps, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->allswaps[i] = re->ind[i];
+    }
+
+    switch (re->type) /* log the exchanged quantities in replica order */
+    {
+        case ereTEMP:
+            fprintf(fplog, "\nReplica exchange in temperature\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereLAMBDA:
+            fprintf(fplog, "\nReplica exchange in lambda\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        case ereTL:
+            fprintf(fplog, "\nReplica exchange in temperature and lambda state\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            for (i = 0; i < re->nrepl; i++)
+            {
+                fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]);
+            }
+            fprintf(fplog, "\n");
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (re->bNPT)
+    {
+        fprintf(fplog, "\nRepl  p");
+        for (i = 0; i < re->nrepl; i++)
+        {
+            fprintf(fplog, " %5.2f", re->pres[re->ind[i]]);
+        }
+
+        for (i = 0; i < re->nrepl; i++)
+        {
+            if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]]))
+            {
+                fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+                fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+            }
+        }
+    }
+    re->nst = nst;
+    if (init_seed == -1) /* -1 asks for a generated seed */
+    {
+        if (MASTERSIM(ms))
+        {
+            re->seed = make_seed();
+        }
+        else
+        {
+            re->seed = 0;
+        }
+        gmx_sumi_sim(1, &(re->seed), ms); /* all others contribute 0, so presumably every simulation ends up with the master's seed */
+    }
+    else
+    {
+        re->seed = init_seed;
+    }
+    fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst);
+    fprintf(fplog, "\nReplica random seed: %d\n", re->seed);
+
+    re->nattempt[0] = 0;
+    re->nattempt[1] = 0;
+
+    snew(re->prob_sum, re->nrepl);
+    snew(re->nexchange, re->nrepl);
+    snew(re->nmoves, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->nmoves[i], re->nrepl);
+    }
+    fprintf(fplog, "Replica exchange information below: x=exchange, pr=probability\n");
+
+    /* generate space for the helper functions so we don't have to snew each time */
+
+    snew(re->destinations, re->nrepl);
+    snew(re->incycle, re->nrepl);
+    snew(re->tmpswap, re->nrepl);
+    snew(re->cyclic, re->nrepl);
+    snew(re->order, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->cyclic[i], re->nrepl);
+        snew(re->order[i], re->nrepl);
+    }
+    /* allocate space for the functions storing the data for the replicas */
+    /* not all of these arrays needed in all cases, but they don't take
+       up much space, since the max size is nrepl**2 */
+    snew(re->prob, re->nrepl);
+    snew(re->bEx, re->nrepl);
+    snew(re->beta, re->nrepl);
+    snew(re->Vol, re->nrepl);
+    snew(re->Epot, re->nrepl);
+    snew(re->de, re->nrepl);
+    for (i = 0; i < re->nrepl; i++)
+    {
+        snew(re->de[i], re->nrepl);
+    }
+    re->nex = nex;
+    return re;
+}
+
+static void exchange_reals(const gmx_multisim_t *ms, int b, real *v, int n)
+{
+    /* Replaces the n reals in v by those received from rank MSRANK(ms, b). */
+    real *incoming;
+    int   k;
+
+    if (!v)
+    {
+        return; /* unallocated array: nothing to send or receive */
+    }
+
+    snew(incoming, n);
+#ifdef GMX_MPI
+    {
+        MPI_Request send_req;
+
+        /* non-blocking send of our data, blocking receive of the
+           partner's data, then wait for the send to complete */
+        MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                  ms->mpi_comm_masters, &send_req);
+        MPI_Recv(incoming, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+                 ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+        MPI_Wait(&send_req, MPI_STATUS_IGNORE);
+    }
+#endif
+    for (k = 0; k < n; k++)
+    {
+        v[k] = incoming[k];
+    }
+    sfree(incoming);
+}
+
+
+static void exchange_ints(const gmx_multisim_t *ms, int b, int *v, int n)
+{
+    /* Replaces the n ints in v by those received from rank MSRANK(ms, b). */
+    int *incoming;
+    int  k;
+
+    if (!v)
+    {
+        return; /* unallocated array: nothing to send or receive */
+    }
+
+    snew(incoming, n);
+#ifdef GMX_MPI
+    {
+        MPI_Request send_req;
+
+        /* non-blocking send of our data, blocking receive of the
+           partner's data, then wait for the send to complete */
+        MPI_Isend(v, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0,
+                  ms->mpi_comm_masters, &send_req);
+        MPI_Recv(incoming, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0,
+                 ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+        MPI_Wait(&send_req, MPI_STATUS_IGNORE);
+    }
+#endif
+    for (k = 0; k < n; k++)
+    {
+        v[k] = incoming[k];
+    }
+    sfree(incoming);
+}
+
+static void exchange_doubles(const gmx_multisim_t *ms, int b, double *v, int n)
+{
+    /* Replaces the n doubles in v by those received from rank MSRANK(ms, b). */
+    double *incoming;
+    int     k;
+
+    if (!v)
+    {
+        return; /* unallocated array: nothing to send or receive */
+    }
+
+    snew(incoming, n);
+#ifdef GMX_MPI
+    {
+        MPI_Request send_req;
+
+        /* non-blocking send of our data, blocking receive of the
+           partner's data, then wait for the send to complete */
+        MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+                  ms->mpi_comm_masters, &send_req);
+        MPI_Recv(incoming, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+                 ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+        MPI_Wait(&send_req, MPI_STATUS_IGNORE);
+    }
+#endif
+    for (k = 0; k < n; k++)
+    {
+        v[k] = incoming[k];
+    }
+    sfree(incoming);
+}
+
+static void exchange_rvecs(const gmx_multisim_t *ms, int b, rvec *v, int n)
+{
+    /* Replaces the n vectors in v by those received from rank MSRANK(ms, b). */
+    rvec *incoming;
+    int   k;
+
+    if (!v)
+    {
+        return; /* unallocated array: nothing to send or receive */
+    }
+
+    snew(incoming, n);
+#ifdef GMX_MPI
+    {
+        MPI_Request send_req;
+
+        /* non-blocking send of our data, blocking receive of the
+           partner's data, then wait for the send to complete */
+        MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+                  ms->mpi_comm_masters, &send_req);
+        MPI_Recv(incoming[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+                 ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+        MPI_Wait(&send_req, MPI_STATUS_IGNORE);
+    }
+#endif
+    for (k = 0; k < n; k++)
+    {
+        copy_rvec(incoming[k], v[k]);
+    }
+    sfree(incoming);
+}
+
+static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state)
+{
+    /* Swaps the members listed below with simulation b. When t_state changes, this code should be updated. */
+    int ngtc, nnhpres;
+    ngtc    = state->ngtc * state->nhchainlength;   /* length used for the nosehoover_* arrays */
+    nnhpres = state->nnhpres* state->nhchainlength; /* length used for the nhpres_* arrays */
+    exchange_rvecs(ms, b, state->box, DIM);         /* DIM row vectors each */
+    exchange_rvecs(ms, b, state->box_rel, DIM);
+    exchange_rvecs(ms, b, state->boxv, DIM);
+    exchange_reals(ms, b, &(state->veta), 1);       /* scalars go as 1-element arrays */
+    exchange_reals(ms, b, &(state->vol0), 1);
+    exchange_rvecs(ms, b, state->svir_prev, DIM);
+    exchange_rvecs(ms, b, state->fvir_prev, DIM);
+    exchange_rvecs(ms, b, state->pres_prev, DIM);
+    exchange_doubles(ms, b, state->nosehoover_xi, ngtc); /* the exchange_* helpers skip NULL arrays */
+    exchange_doubles(ms, b, state->nosehoover_vxi, ngtc);
+    exchange_doubles(ms, b, state->nhpres_xi, nnhpres);
+    exchange_doubles(ms, b, state->nhpres_vxi, nnhpres);
+    exchange_doubles(ms, b, state->therm_integral, state->ngtc); /* ngtc entries, no chain-length factor */
+    exchange_rvecs(ms, b, state->x, state->natoms); /* per-atom arrays */
+    exchange_rvecs(ms, b, state->v, state->natoms);
+    exchange_rvecs(ms, b, state->sd_X, state->natoms);
+}
+
+static void copy_rvecs(rvec *s, rvec *d, int n)
+{
+    /* Copies n vectors from s to d, one copy_rvec call per
+       element.  A NULL destination d is accepted and simply
+       results in no copy at all. */
+    int k;
+
+    for (k = 0; d != NULL && k < n; k++)
+    {
+        copy_rvec(s[k], d[k]);
+    }
+}
+
+static void copy_doubles(const double *s, double *d, int n)
+{
+    /* Copies n doubles from s to d, element by element.
+       A NULL destination d is accepted and simply results
+       in no copy at all. */
+    int k;
+
+    for (k = 0; d != NULL && k < n; k++)
+    {
+        d[k] = s[k];
+    }
+}
+
+static void copy_reals(const real *s, real *d, int n)
+{
+    /* Copies n reals from s to d, element by element.
+       A NULL destination d is accepted and simply results
+       in no copy at all. */
+    int k;
+
+    for (k = 0; d != NULL && k < n; k++)
+    {
+        d[k] = s[k];
+    }
+}
+
+static void copy_ints(const int *s, int *d, int n)
+{
+    /* Copies n ints from s to d, element by element.
+       A NULL destination d is accepted and simply results
+       in no copy at all. */
+    int k;
+
+    for (k = 0; d != NULL && k < n; k++)
+    {
+        d[k] = s[k];
+    }
+}
+
+#define scopy_rvecs(v, n)   copy_rvecs(state->v, state_local->v, n);
+#define scopy_doubles(v, n) copy_doubles(state->v, state_local->v, n);
+#define scopy_reals(v, n) copy_reals(state->v, state_local->v, n);
+#define scopy_ints(v, n)   copy_ints(state->v, state_local->v, n);
+/* The scopy_* macros copy member v from 'state' to 'state_local' (both must be in scope); each expansion already ends in ';'. */
+static void copy_state_nonatomdata(t_state *state, t_state *state_local)
+{
+    /* Copies the members listed below from state to state_local. When t_state changes, this code should be updated. */
+    int ngtc, nnhpres;
+    ngtc    = state->ngtc * state->nhchainlength;   /* length used for the nosehoover_* arrays */
+    nnhpres = state->nnhpres* state->nhchainlength; /* length used for the nhpres_* arrays */
+    scopy_rvecs(box, DIM);
+    scopy_rvecs(box_rel, DIM);
+    scopy_rvecs(boxv, DIM);
+    state_local->veta = state->veta; /* plain scalars are assigned directly */
+    state_local->vol0 = state->vol0;
+    scopy_rvecs(svir_prev, DIM);
+    scopy_rvecs(fvir_prev, DIM);
+    scopy_rvecs(pres_prev, DIM);
+    scopy_doubles(nosehoover_xi, ngtc); /* the copy_* helpers skip a NULL destination */
+    scopy_doubles(nosehoover_vxi, ngtc);
+    scopy_doubles(nhpres_xi, nnhpres);
+    scopy_doubles(nhpres_vxi, nnhpres);
+    scopy_doubles(therm_integral, state->ngtc); /* ngtc entries, no chain-length factor */
+    scopy_rvecs(x, state->natoms); /* per-atom arrays are copied here as well */
+    scopy_rvecs(v, state->natoms);
+    scopy_rvecs(sd_X, state->natoms);
+    copy_ints(&(state->fep_state), &(state_local->fep_state), 1); /* single int copied through the array helper */
+    scopy_reals(lambda, efptNR);
+}
+
+static void scale_velocities(t_state *state, real fac)
+{
+    /* Multiplies each of the state->natoms velocity vectors by
+       fac, in place.  A state without a velocity array
+       (state->v is NULL) is left untouched. */
+    int atom;
+
+    for (atom = 0; state->v && atom < state->natoms; atom++)
+    {
+        svmul(fac, state->v[atom], state->v[atom]);
+    }
+}
+
+static void pd_collect_state(const t_commrec *cr, t_state *state)
+{
+    int shift;
+    /* Passes state->x and, when allocated, state->v and state->sd_X through move_rvecs before an exchange. */
+    if (debug)
+    {
+        fprintf(debug, "Collecting state before exchange\n");
+    }
+    shift = cr->nnodes - cr->npmenodes - 1; /* node count minus PME nodes minus one */
+    move_rvecs(cr, FALSE, FALSE, GMX_LEFT, GMX_RIGHT, state->x, NULL, shift, NULL); /* x is moved unconditionally */
+    if (state->v)
+    {
+        move_rvecs(cr, FALSE, FALSE, GMX_LEFT, GMX_RIGHT, state->v, NULL, shift, NULL);
+    }
+    if (state->sd_X)
+    {
+        move_rvecs(cr, FALSE, FALSE, GMX_LEFT, GMX_RIGHT, state->sd_X, NULL, shift, NULL);
+    }
+}
+
+static void print_transition_matrix(FILE *fplog, const char *leg, int n, int **nmoves, int *nattempt)
+{
+    /* Prints the n x n table nmoves[row][col]/(2*attempts) to fplog,
+       where attempts = nattempt[0] + nattempt[1]; entries that are not
+       positive are printed as zero.  leg is not used. */
+    const int attempts = nattempt[0] + nattempt[1];
+    int       row, col;
+    float     fraction;
+
+    fprintf(fplog, "\nRepl");
+    for (col = 0; col < n; col++)
+    {
+        fprintf(fplog, "    ");  /* put the title closer to the center */
+    }
+    fprintf(fplog, "Empirical Transition Matrix\n");
+
+    /* header row with 1-based column numbers */
+    fprintf(fplog, "Repl");
+    for (col = 0; col < n; col++)
+    {
+        fprintf(fplog, "%8d", (col+1));
+    }
+    fprintf(fplog, "\n");
+
+    /* one row per replica index, closed by the 0-based row number */
+    for (row = 0; row < n; row++)
+    {
+        fprintf(fplog, "Repl");
+        for (col = 0; col < n; col++)
+        {
+            fraction = (nmoves[row][col] > 0) ? nmoves[row][col]/(2.0*attempts) : 0.0;
+            fprintf(fplog, "%8.4f", fraction);
+        }
+        fprintf(fplog, "%3d\n", row);
+    }
+}
+
+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx)
+{
+    /* One log line listing ind[]; an 'x' precedes each entry that bEx, when given, flags. */
+    int pos = 0;
+    fprintf(fplog, "Repl %2s %2d", leg, ind[pos]);
+    while (++pos < n)
+    {
+        fprintf(fplog, " %c %2d", (bEx == 0 || !bEx[pos]) ? ' ' : 'x', ind[pos]);
+    }
+    fprintf(fplog, "\n");
+}
+
+static void print_allswitchind(FILE *fplog, int n, int *ind, int *pind, int *allswaps, int *tmpswap)
+{
+    /* Permutes allswaps by pind, i.e. allswaps[i] becomes the old
+       allswaps[pind[i]], and logs pind followed by the updated
+       allswaps.  tmpswap is scratch space of length n; ind is not
+       used. */
+    int slot;
+
+    /* snapshot first, so the permutation reads only old values */
+    for (slot = 0; slot < n; slot++)
+    {
+        tmpswap[slot] = allswaps[slot];
+    }
+    for (slot = 0; slot < n; slot++)
+    {
+        allswaps[slot] = tmpswap[pind[slot]];
+    }
+
+    fprintf(fplog, "\nAccepted Exchanges:   ");
+    for (slot = 0; slot < n; slot++)
+    {
+        fprintf(fplog, "%d ", pind[slot]);
+    }
+
+    /* The "Order After Exchange" line holds, per simulation, the state
+       label of the configuration that it now contains, listed in
+       simulation order.  For example "3 0 1 2" means that the
+       configuration starting in simulation 3 is now in simulation 0,
+       the one starting in simulation 0 is now in simulation 1,
+       the one starting in simulation 1 is now in simulation 2 and
+       the one starting in simulation 2 is now in simulation 3. */
+    fprintf(fplog, "\nOrder After Exchange: ");
+    for (slot = 0; slot < n; slot++)
+    {
+        fprintf(fplog, "%d ", allswaps[slot]);
+    }
+    fprintf(fplog, "\n\n");
+}
+
+static void print_prob(FILE *fplog, const char *leg, int n, real *prob)
+{
+    int  i;
+    char buf[320]; /* "%4.2f" of the largest finite double needs 313 bytes; 8 overflowed from 10000 upwards */
+    /* Logs prob[1..n-1]: ".NN", or "1.0" when the formatted text starts with '1'; negative entries become blanks. */
+    fprintf(fplog, "Repl %2s ", leg);
+    for (i = 1; i < n; i++)
+    {
+        if (prob[i] >= 0)
+        {
+            sprintf(buf, "%4.2f", prob[i]);
+            fprintf(fplog, "  %3s", buf[0] == '1' ? "1.0" : buf+1); /* buf+1 drops the leading "0" of "0.NN" */
+        }
+        else
+        {
+            fprintf(fplog, "     ");
+        }
+    }
+    fprintf(fplog, "\n");
+}
+
+static void print_count(FILE *fplog, const char *leg, int n, int *count)
+{
+    int k = 0; /* entry 0 is never printed: output covers count[1] .. count[n-1] */
+
+    fprintf(fplog, "Repl %2s ", leg);
+    while (++k < n)
+    {
+        fprintf(fplog, " %4d", count[k]);
+    }
+    fprintf(fplog, "\n");
+}
+
+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp)
+{
+    /* Returns the exchange exponent delta (logged in units of kT) for the pair a, b; bPrint enables logging to fplog. */
+    real   ediff, dpV, delta = 0;
+    real  *Epot = re->Epot;
+    real  *Vol  = re->Vol;
+    real **de   = re->de;
+    real  *beta = re->beta;
+    /* a, b select Epot, Vol and the de columns; ap, bp select beta, pres and the de rows (lambda-only uses beta[a]) */
+    /* Two cases; we are permuted and not.  In all cases, setting ap = a and bp = b will reduce
+       to the non permuted case */
+
+    switch (re->type)
+    {
+        case ereTEMP:
+            /*
+             * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439
+             */
+            ediff = Epot[b] - Epot[a];
+            delta = -(beta[bp] - beta[ap])*ediff;
+            break;
+        case ereLAMBDA:
+            /* two cases:  when we are permuted, and not.  */
+            /* non-permuted:
+               ediff =  E_new - E_old
+                     =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
+                     =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
+                     =  de[b][a] + de[a][b] */
+
+            /* permuted:
+               ediff =  E_new - E_old
+                     =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
+                     =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
+                     =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)]
+                     =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
+            /* but, in the current code implementation, we flip configurations, not indices . . .
+               So let's examine that.
+                     =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)]
+                     =  [H_b(x_ap) - H_a(x_ap)]  + [H_a(x_bp) - H_b(x_pb)]
+                     = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp]
+                     So, if we exchange b<=> bp and a<=> ap, we return to the same result.
+                     So the simple solution is to flip the
+                     position of perturbed and original indices in the tests.
+             */
+
+            ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
+            delta = ediff*beta[a]; /* assume all same temperature in this case */
+            break;
+        case ereTL:
+            /* not permuted:  */
+            /* delta =  reduced E_new - reduced E_old
+                     =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
+                     =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
+                     =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
+                        [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
+                     =  [beta_b dH_b(x_a) + [beta_a dH_a(x_b) +
+                        beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b))
+                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */
+            /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */
+            /* permuted (big breath!) */
+            /*   delta =  reduced E_new - reduced E_old
+                     =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+                        - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a)
+                        - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                     =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
+                        [(beta_ap H_ap(x_b)  - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
+             + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+                     =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
+                        [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
+             + beta_pb (H_a(x_a) - H_b(x_b))  - beta_ap (H_a(x_a) - H_b(x_b))
+                     =  ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b]  - beta_bp de[bp][b])
+             + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b))  */
+            delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]);
+            break;
+        default:
+            gmx_incons("Unknown replica exchange quantity");
+    }
+    if (bPrint)
+    {
+        fprintf(fplog, "Repl %d <-> %d  dE_term = %10.3e (kT)\n", a, b, delta);
+    }
+    if (re->bNPT)
+    {
+        /* revisit the calculation for 5.0.  Might be some improvements. */
+        dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
+        if (bPrint)
+        {
+            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
+        }
+        delta += dpV; /* pressure-volume term is added to the energy term */
+    }
+    return delta;
+}
+
+static void
+test_for_replica_exchange(FILE                 *fplog,
+                          const gmx_multisim_t *ms,
+                          struct gmx_repl_ex   *re,
+                          gmx_enerdata_t       *enerd,
+                          real                  vol,
+                          gmx_large_int_t       step,
+                          real                  time)
+{
+    int       m, i, j, a, b, ap, bp, i0, i1, tmp;
+    real      ediff = 0, delta = 0, dpV = 0;
+    gmx_bool  bPrint, bMultiEx;
+    gmx_bool *bEx      = re->bEx;
+    real     *prob     = re->prob;
+    int      *pind     = re->destinations; /* permuted index */
+    gmx_bool  bEpot    = FALSE;
+    gmx_bool  bDLambda = FALSE;
+    gmx_bool  bVol     = FALSE;
+
+    bMultiEx = (re->nex > 1);  /* multiple exchanges at each state */
+    fprintf(fplog, "Replica exchange at step " gmx_large_int_pfmt " time %g\n", step, time);
+
+    if (re->bNPT)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Vol[i] = 0;
+        }
+        bVol               = TRUE;
+        re->Vol[re->repl]  = vol;
+    }
+    if ((re->type == ereTEMP || re->type == ereTL))
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->Epot[i] = 0;
+        }
+        bEpot              = TRUE;
+        re->Epot[re->repl] = enerd->term[F_EPOT];
+        /* temperatures of different states*/
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ);
+        }
+    }
+    else
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->beta[i] = 1.0/(re->temp*BOLTZ);  /* we have a single temperature */
+        }
+    }
+    if (re->type == ereLAMBDA || re->type == ereTL)
+    {
+        bDLambda = TRUE;
+        /* lambda differences. */
+        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
+           minus the energy of the jth simulation in the jth Hamiltonian */
+        for (i = 0; i < re->nrepl; i++)
+        {
+            for (j = 0; j < re->nrepl; j++)
+            {
+                re->de[i][j] = 0;
+            }
+        }
+        for (i = 0; i < re->nrepl; i++)
+        {
+            re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]);
+        }
+    }
+
+    /* now actually do the communication */
+    if (bVol)
+    {
+        gmx_sum_sim(re->nrepl, re->Vol, ms);
+    }
+    if (bEpot)
+    {
+        gmx_sum_sim(re->nrepl, re->Epot, ms);
+    }
+    if (bDLambda)
+    {
+        for (i = 0; i < re->nrepl; i++)
+        {
+            gmx_sum_sim(re->nrepl, re->de[i], ms);
+        }
+    }
+
+    /* make a duplicate set of indices for shuffling */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        pind[i] = re->ind[i];
+    }
+
+    if (bMultiEx)
+    {
+        /* multiple random switch exchange */
+        for (i = 0; i < re->nex; i++)
+        {
+            /* randomly select a pair  */
+            /* in theory, could reduce this by identifying only which switches had a nonneglibible
+               probability of occurring (log p > -100) and only operate on those switches */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work that useful. */
+            i0 = (int)(re->nrepl*rando(&(re->seed)));
+            i1 = (int)(re->nrepl*rando(&(re->seed)));
+            if (i0 == i1)
+            {
+                i--;
+                continue;  /* self-exchange, back up and do it again */
+            }
+
+            a  = re->ind[i0]; /* what are the indices of these states? */
+            b  = re->ind[i1];
+            ap = pind[i0];
+            bp = pind[i1];
+
+            bPrint = FALSE; /* too noisy */
+            /* calculate the energy difference */
+            /* if the code changes to flip the STATES, rather than the configurations,
+               use the commented version of the code */
+            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+            delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
+
+            /* we actually only use the first space in the prob and bEx array,
+               since there are actually many switches between pairs. */
+
+            if (delta <= 0)
+            {
+                /* accepted */
+                prob[0] = 1;
+                bEx[0]  = TRUE;
+            }
+            else
+            {
+                if (delta > PROBABILITYCUTOFF)
+                {
+                    prob[0] = 0;
+                }
+                else
+                {
+                    prob[0] = exp(-delta);
+                }
+                /* roll a number to determine if accepted */
+                bEx[0] = (rando(&(re->seed)) < prob[0]);
+            }
+            re->prob_sum[0] += prob[0];
+
+            if (bEx[0])
+            {
+                /* swap the states */
+                tmp      = pind[i0];
+                pind[i0] = pind[i1];
+                pind[i1] = tmp;
+            }
+        }
+        re->nattempt[0]++;  /* keep track of total permutation trials here */
+        print_allswitchind(fplog, re->nrepl, re->ind, pind, re->allswaps, re->tmpswap);
+    }
+    else
+    {
+        /* standard nearest neighbor replica exchange */
+        m = (step / re->nst) % 2;
+        for (i = 1; i < re->nrepl; i++)
+        {
+            a = re->ind[i-1];
+            b = re->ind[i];
+
+            bPrint = (re->repl == a || re->repl == b);
+            if (i % 2 == m)
+            {
+                delta = calc_delta(fplog, bPrint, re, a, b, a, b);
+                if (delta <= 0)
+                {
+                    /* accepted */
+                    prob[i] = 1;
+                    bEx[i]  = TRUE;
+                }
+                else
+                {
+                    if (delta > PROBABILITYCUTOFF)
+                    {
+                        prob[i] = 0;
+                    }
+                    else
+                    {
+                        prob[i] = exp(-delta);
+                    }
+                    /* roll a number to determine if accepted */
+                    bEx[i] = (rando(&(re->seed)) < prob[i]);
+                }
+                re->prob_sum[i] += prob[i];
+
+                if (bEx[i])
+                {
+                    /* swap these two */
+                    tmp       = pind[i-1];
+                    pind[i-1] = pind[i];
+                    pind[i]   = tmp;
+                    re->nexchange[i]++;  /* statistics for back compatibility */
+                }
+            }
+            else
+            {
+                prob[i] = -1;
+                bEx[i]  = FALSE;
+            }
+        }
+        /* print some statistics */
+        print_ind(fplog, "ex", re->nrepl, re->ind, bEx);
+        print_prob(fplog, "pr", re->nrepl, prob);
+        fprintf(fplog, "\n");
+        re->nattempt[m]++;
+    }
+
+    /* record which moves were made and accepted */
+    for (i = 0; i < re->nrepl; i++)
+    {
+        re->nmoves[re->ind[i]][pind[i]] += 1;
+        re->nmoves[pind[i]][re->ind[i]] += 1;
+    }
+    fflush(fplog); /* make sure we can see what the last exchange was */
+}
+
+static void write_debug_x(t_state *state)
+{
+    int atom = 0;
+
+    /* Print the x/y/z coordinates of every tenth atom of the state to
+     * the debug stream; nothing is written when debug is not set. */
+    while (debug && atom < state->natoms)
+    {
+        fprintf(debug, "dx %5d %10.5f %10.5f %10.5f\n", atom, state->x[atom][XX], state->x[atom][YY], state->x[atom][ZZ]);
+        atom += 10;
+    }
+}
+
+static void
+cyclic_decomposition(FILE      *fplog,         /* not used in this function */
+                     const int *destinations,  /* in: permutation, followed as p -> destinations[p] */
+                     int      **cyclic,        /* out: row i holds the cycle first reached from i (or starts with -1), ended by -1 */
+                     gmx_bool  *incycle,       /* scratch: TRUE once a replica has been placed in a cycle */
+                     const int  nrepl,         /* number of replicas; every loop here runs over 0..nrepl-1 */
+                     int       *nswap)         /* out: length of the longest cycle minus one */
+{
+    /* Decompose the permutation in destinations[] into its disjoint cycles. */
+    int i, j, c, p;
+    int maxlen = 1;              /* length of the longest cycle closed so far */
+    for (i = 0; i < nrepl; i++)
+    {
+        incycle[i] = FALSE;
+    }
+    for (i = 0; i < nrepl; i++)  /* one (possibly empty) row of cyclic[] per replica */
+    {
+        if (incycle[i])
+        {
+            cyclic[i][0] = -1;   /* i already belongs to an earlier cycle: leave its row empty */
+            continue;
+        }
+        cyclic[i][0] = i;
+        incycle[i]   = TRUE;
+        c            = 1;        /* number of members stored in row i so far */
+        p            = i;
+        for (j = 0; j < nrepl; j++) /* at most nrepl steps; a valid permutation closes the cycle (break) within the bound */
+        {
+            p = destinations[p];    /* follow the permutation one step */
+            if (p == i)
+            {
+                cyclic[i][c] = -1; /* NOTE(review): c == nrepl when one cycle spans every replica; confirm rows hold nrepl+1 ints */
+                if (c > maxlen)
+                {
+                    maxlen = c;
+                }
+                break; /* we've reached the original element, the cycle is complete, and we marked the end. */
+            }
+            else
+            {
+                cyclic[i][c] = p;  /* each step of the permutation gives a new member of the cycle */
+                incycle[p]   = TRUE;
+                c++;
+            }
+        }
+    }
+    *nswap = maxlen - 1;
+
+    if (debug) /* dump every row up to its first negative entry */
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Cycle %d:", i);
+            for (j = 0; j < nrepl; j++)
+            {
+                if (cyclic[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", cyclic[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
+
+static void
+compute_exchange_order(FILE     *fplog,    /* not used: the optional dump below goes to the debug stream */
+                       int     **cyclic,   /* in: one cycle per row, each ended by a negative entry */
+                       int     **order,    /* out: order[r][s] = partner of replica r in swap round s (r itself if idle) */
+                       const int nrepl,    /* number of replicas */
+                       const int maxswap)  /* number of swap rounds to fill in */
+{
+    int i, j;
+    /* Round j pairs members j and j+1 of every cycle; slots not written must still be negative (caller presets -1). */
+    for (j = 0; j < maxswap; j++)
+    {
+        for (i = 0; i < nrepl; i++)
+        {
+            if (cyclic[i][j+1] >= 0)
+            {
+                order[cyclic[i][j+1]][j] = cyclic[i][j];
+                order[cyclic[i][j]][j]   = cyclic[i][j+1];
+            }
+        }
+        for (i = 0; i < nrepl; i++)
+        {
+            if (order[i][j] < 0)
+            {
+                order[i][j] = i; /* if it's not exchanging, it should stay this round */
+            }
+        }
+    }
+
+    if (debug) /* the whole table goes to one stream, so labels and values stay together */
+    {
+        fprintf(debug, "Replica Exchange Order\n");
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Replica %d:", i);
+            for (j = 0; j < maxswap; j++)
+            {
+                if (order[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", order[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
+
+static void
+prepare_to_do_exchange(FILE      *fplog,
+                       const int *destinations,
+                       const int  replica_id,
+                       const int  nrepl,
+                       int       *maxswap,
+                       int      **order,
+                       int      **cyclic,
+                       int       *incycle,
+                       gmx_bool  *bThisReplicaExchanged)
+{
+    /* Work out, for one exchange attempt, the rounds of pairwise swaps
+     * that realise destinations[] and whether replica_id takes part.
+     *   maxswap                out: number of swap rounds (untouched if no
+     *                               replica moved)
+     *   order, cyclic, incycle work arrays filled in for the caller
+     *   bThisReplicaExchanged  out: TRUE if replica_id swaps in any round */
+    int row, col, swap_round;
+    int moved = 0; /* index of the first replica whose destination is not itself */
+
+    *bThisReplicaExchanged = FALSE;
+    while (moved < nrepl && destinations[moved] == moved)
+    {
+        moved++;
+    }
+    if (moved >= nrepl)
+    {
+        /* Nothing was shuffled: leave order, cyclic and *maxswap alone. */
+        return;
+    }
+
+    /* Mark every slot of both placeholder arrays as empty. */
+    for (row = 0; row < nrepl; row++)
+    {
+        for (col = 0; col < nrepl; col++)
+        {
+            cyclic[row][col] = -1;
+            order[row][col]  = -1;
+        }
+    }
+
+    /* Split the permutation into cycles; this also sets *maxswap. */
+    cyclic_decomposition(fplog, destinations, cyclic, incycle, nrepl, maxswap);
+
+    /* Turn the cycles into a partner for each replica in each round. */
+    compute_exchange_order(fplog, cyclic, order, nrepl, *maxswap);
+
+    /* This replica exchanged if any round pairs it with another replica. */
+    for (swap_round = 0; swap_round < *maxswap; swap_round++)
+    {
+        if (order[replica_id][swap_round] != replica_id)
+        {
+            *bThisReplicaExchanged = TRUE;
+            break;
+        }
+    }
+}
+
+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re,
+                          t_state *state, gmx_enerdata_t *enerd,
+                          t_state *state_local, gmx_large_int_t step, real time)
+{
+    int i, j; /* i is not used in this function */
+    int replica_id = 0;
+    int exchange_partner;
+    int maxswap = 0;
+    /* Attempt a replica exchange at this step; returns whether this
+     * replica's state was exchanged. maxswap is the number of rounds of
+     * exchanges needed to deal with any multiple exchanges (master only);
+     * re->destinations / re->order hold the outcome and the swap order. */
+    gmx_bool bThisReplicaExchanged = FALSE;
+
+    if (MASTER(cr)) /* only the master of each simulation decides about exchanges */
+    {
+        replica_id  = re->repl;
+        test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time);
+        prepare_to_do_exchange(fplog, re->destinations, replica_id, re->nrepl, &maxswap,
+                               re->order, re->cyclic, re->incycle, &bThisReplicaExchanged);
+    }
+    /* Do intra-simulation broadcast so all processors belonging to
+     * each simulation know whether they need to participate in
+     * collecting the state. Otherwise, they might as well get on with
+     * the next thing to do. */
+    if (PAR(cr))
+    {
+#ifdef GMX_MPI
+        MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr),
+                  cr->mpi_comm_mygroup); /* the flag is sent as raw bytes */
+#endif
+    }
+
+    if (bThisReplicaExchanged)
+    {
+        /* Exchange the states */
+
+        if (PAR(cr))
+        {
+            /* Collect the global state on the master node */
+            if (DOMAINDECOMP(cr))
+            {
+                dd_collect_state(cr->dd, state_local, state);
+            }
+            else
+            {
+                pd_collect_state(cr, state);
+            }
+        }
+
+        if (MASTER(cr))
+        {
+            /* There will be only one swap cycle with standard replica
+             * exchange, but there may be multiple swap cycles if we
+             * allow multiple swaps. */
+
+            for (j = 0; j < maxswap; j++)
+            {
+                exchange_partner = re->order[replica_id][j];
+
+                if (exchange_partner != replica_id) /* an entry equal to replica_id means: idle in round j */
+                {
+                    /* Exchange the global states between the master nodes */
+                    if (debug)
+                    {
+                        fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
+                    }
+                    exchange_state(cr->ms, exchange_partner, state);
+                }
+            }
+            /* For temperature-type replica exchange, scale the velocities by
+             * sqrt(q[ereTEMP][replica_id] / q[ereTEMP][destination of this replica]). */
+            if (re->type == ereTEMP || re->type == ereTL)
+            {
+                scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]]));
+            }
+
+        }
+
+        /* With domain decomposition the global state is distributed later */
+        if (!DOMAINDECOMP(cr))
+        {
+            /* Copy the global state to the local state data structure */
+            copy_state_nonatomdata(state, state_local);
+
+            if (PAR(cr))
+            {
+                bcast_state(cr, state, FALSE);
+            }
+        }
+    }
+
+    return bThisReplicaExchanged; /* set on the master and broadcast above when running in parallel */
+}
+
+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re)
+{
+    /* Write the accumulated replica-exchange statistics to fplog.
+     *
+     * The per-pair tables (average probability, number of exchanges and
+     * average number of exchanges) are printed only when re->nex is
+     * zero; the transition matrix is printed in every case.
+     *
+     * Side effect: re->prob[1..nrepl-1] is overwritten, it serves as
+     * scratch space for the averages that are printed. */
+    int pair;
+
+    fprintf(fplog, "\nReplica exchange statistics\n");
+
+    if (re->nex == 0)
+    {
+        fprintf(fplog, "Repl  %d attempts, %d odd, %d even\n",
+                re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]);
+
+        /* Summed probability of each pair divided by the attempts of
+         * matching parity; a parity that was never attempted yields 0
+         * instead of a division by zero. */
+        fprintf(fplog, "Repl  average probabilities:\n");
+        for (pair = 1; pair < re->nrepl; pair++)
+        {
+            re->prob[pair] = (re->nattempt[pair%2] == 0) ? 0 : re->prob_sum[pair]/re->nattempt[pair%2];
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        /* Raw exchange counters, one per pair. */
+        fprintf(fplog, "Repl  number of exchanges:\n");
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_count(fplog, "", re->nrepl, re->nexchange);
+
+        /* Exchange counters divided by the attempts, with the same guard. */
+        fprintf(fplog, "Repl  average number of exchanges:\n");
+        for (pair = 1; pair < re->nrepl; pair++)
+        {
+            re->prob[pair] = (re->nattempt[pair%2] == 0) ? 0 : ((real)re->nexchange[pair])/re->nattempt[pair%2];
+        }
+        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+        print_prob(fplog, "", re->nrepl, re->prob);
+
+        fprintf(fplog, "\n");
+    }
+
+    /* The transition matrix is reported whatever the value of re->nex. */
+    print_transition_matrix(fplog, "", re->nrepl, re->nmoves, re->nattempt);
+}
diff --git a/patches/patch.sh b/patches/patch.sh
index 082026b24f091509b1cc3c49d7fde72897bbdb64..fc77435c8b6ca6db9b5eff1b8972fcafa9b3c6a4 100755
--- a/patches/patch.sh
+++ b/patches/patch.sh
@@ -13,6 +13,8 @@ Actions (choose one):
                     print a list of available MD engines
   -s, --save
                     save, this needs *.preplumed files (*)
+  --save-originals
+                    same as save, but save also original files
   -n NEWENGINE, --new NEWENGINE
                     create a new patch named NEWENGINE (*)
 Options:
@@ -45,6 +47,10 @@ newpatch=
 
 multiple_actions=
 
+otherfiles=       # NOTE(review): not referenced in the lines visible here; confirm it is used
+save_originals=   # non-empty ("yes") makes the save action store original files instead of a single diff
+
+
 for option
 do
 
@@ -55,6 +61,7 @@ do
     (--help|-h)         echo "$MANUAL" ; exit ;;
     (--patch|-p)        test -n "$action" && multiple_actions=yes ; action=patch ;;
     (--save|-s)         test -n "$action" && multiple_actions=yes ; action=save ;;
+    (--save-originals)  test -n "$action" && multiple_actions=yes ; action=save ; save_originals=yes ;;
     (--revert|-R|-r)    test -n "$action" && multiple_actions=yes ; action=revert ;;
     (--list-engines|-l) test -n "$action" && multiple_actions=yes ; action=list ;;
     (--new=*)           test -n "$action" && multiple_actions=yes ; action=new ; newpatch="${prefix_option#--new=}" ;;
@@ -187,7 +194,7 @@ esac
 
 case "$action" in
   (patch)
-    if [ ! -f "$diff" ] ; then
+    if [ ! -e "$diff" ] ; then
       echo "ERROR: MD engine not supported (or mispelled)"
       exit
     fi
@@ -220,7 +227,24 @@ case "$action" in
     ln -s "$PLUMED_ROOT/src/wrapper/Plumed.h"
     ln -s "$PLUMED_ROOT/src/lib/Plumed.inc.$mode" Plumed.inc
     ln -s "$PLUMED_ROOT/src/lib/Plumed.cmake.$mode" Plumed.cmake
-    bash "$diff"
+
+    if [ -d "$diff" ]; then  # directory format: originals are stored, each diff is built on the fly
+      echo "Patching with on-the-fly diff from stored originals"
+      PREPLUMED=$(cd "$diff" && find . -name "*.preplumed" | sort)  # quoted path; empty list if cd fails
+      for bckfile in $PREPLUMED ; do  # relies on word splitting: stored paths must not contain whitespace
+        file="${bckfile%.preplumed}"
+        if test -e "$file" ; then  # the file has to exist in the tree that is being patched
+          diff -U 5 "$diff/$bckfile" "$diff/$file" --label="$bckfile" --label="$file" |
+          patch -u -l -b -F 5 --suffix=.preplumed "$file"
+        else
+          echo "ERROR: File $file is missing"
+        fi
+      done
+    else  # single-file format: the stored diff is itself a shell script that runs patch
+      echo "Patching with stored diff"
+      bash "$diff"
+    fi
+    
     if type -t plumed_after_patch 1>/dev/null ; then
       echo "Executing plumed_after_patch function"
       plumed_after_patch
@@ -250,13 +274,42 @@ case "$action" in
     echo "Saving your changes to $diff"
     echo "Preplumed files:"
     echo "$PREPLUMED"
-    test -e "$diff" && rm "$diff"
+    if [ -d "$diff" ] && [ -z "$save_originals" ]; then  # plain save on a patch currently stored in dir format
+      echo "This patch uses the dir format (originals are saved)"
+      echo "Are you sure you want to save the single diff?"
+      echo "Possible reasons to do it are:"
+      echo "* because of licence you do not want to store originals in plumed"
+      echo "* you do not want to do a big change on the diff files now"
+      answer=
+      PS3="Choose:"  # prompt string shown by the select loop below
+      select answer in single-diff originals ; do  # repeats until one of the two listed entries is chosen
+        if [[ -n "$answer" ]] ; then
+          break
+        else
+          echo "ERROR: choose in the list above or interrupt (^c)"
+        fi
+      done
+      if [ "$answer" = originals ] ; then
+        echo "saving originals"
+        save_originals=yes
+      else  # NOTE(review): also taken with an empty answer (e.g. select ending on end-of-input); confirm intended
+        echo "saving single diff file"
+      fi
+    fi
+    test -e "$diff" && rm -r "$diff"  # -r because the previous patch may be a directory; it is rebuilt below
     for bckfile in $PREPLUMED ; do
       file="${bckfile%.preplumed}"
       if test -e "$file" ; then
-        echo "patch -u -l -b -F 5 --suffix=.preplumed \"${file}\" << \\EOF_EOF" >> "$diff"
-        diff -U 5 "${bckfile}" "$file" --label="$bckfile" --label="$file" >> "$diff"
-        echo "EOF_EOF"                                                   >> "$diff"
+        if [ -z "$save_originals" ] ; then  # single-diff format: append one self-applying patch section
+          echo "patch -u -l -b -F 5 --suffix=.preplumed \"${file}\" << \\EOF_EOF" >> "$diff"
+          diff -U 5 "${bckfile}" "$file" --label="$bckfile" --label="$file" >> "$diff"
+          echo "EOF_EOF"                                                   >> "$diff"
+        else  # dir format: keep the modified file and its .preplumed copy under "$diff"
+          xx="$diff/$file"
+          mkdir -p "${xx%/*}"
+          cp "$file" "$xx"
+          cp "$bckfile" "$diff/$bckfile"
+        fi
       else
         echo "ERROR: File $file is missing"
       fi
@@ -270,7 +323,7 @@ cat <<EOF
 EOF
   ;;
   (revert)
-    if [ ! -f "$diff" ] ; then
+    if [ ! -e "$diff" ] ; then
       echo "ERROR: MD engine not supported (or mispelled)"
       exit
     fi