diff --git a/patches/gromacs-4.6.5.diff/src/kernel/CMakeLists.txt b/patches/gromacs-4.6.6.diff/src/kernel/CMakeLists.txt
similarity index 98%
rename from patches/gromacs-4.6.5.diff/src/kernel/CMakeLists.txt
rename to patches/gromacs-4.6.6.diff/src/kernel/CMakeLists.txt
index 3ed862a7cab99b767626ee93d3b2fdf298f0abf4..8e108b361e4e81da6bda32e85095a23533b89bf1 100644
--- a/patches/gromacs-4.6.5.diff/src/kernel/CMakeLists.txt
+++ b/patches/gromacs-4.6.6.diff/src/kernel/CMakeLists.txt
@@ -78,7 +78,7 @@ set(MDRUN_SOURCES
 
 add_library(gmxpreprocess ${GMXPREPROCESS_SOURCES})
 target_link_libraries(gmxpreprocess md)
-set_target_properties(gmxpreprocess PROPERTIES OUTPUT_NAME "gmxpreprocess${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}"
+set_target_properties(gmxpreprocess PROPERTIES OUTPUT_NAME "gmxpreprocess${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION}
     COMPILE_FLAGS "${OpenMP_C_FLAGS}")
 
 
diff --git a/patches/gromacs-4.6.5.diff/src/kernel/CMakeLists.txt.preplumed b/patches/gromacs-4.6.6.diff/src/kernel/CMakeLists.txt.preplumed
similarity index 98%
rename from patches/gromacs-4.6.5.diff/src/kernel/CMakeLists.txt.preplumed
rename to patches/gromacs-4.6.6.diff/src/kernel/CMakeLists.txt.preplumed
index e2125333387440d266eb0e6e77f23e8adb635511..fea8282ef4fade62c8a764455b21dae78f277236 100644
--- a/patches/gromacs-4.6.5.diff/src/kernel/CMakeLists.txt.preplumed
+++ b/patches/gromacs-4.6.6.diff/src/kernel/CMakeLists.txt.preplumed
@@ -76,7 +76,7 @@ set(MDRUN_SOURCES
 
 add_library(gmxpreprocess ${GMXPREPROCESS_SOURCES})
 target_link_libraries(gmxpreprocess md)
-set_target_properties(gmxpreprocess PROPERTIES OUTPUT_NAME "gmxpreprocess${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}"
+set_target_properties(gmxpreprocess PROPERTIES OUTPUT_NAME "gmxpreprocess${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION}
     COMPILE_FLAGS "${OpenMP_C_FLAGS}")
 
 
diff --git a/patches/gromacs-4.6.5.diff/src/kernel/md.c b/patches/gromacs-4.6.6.diff/src/kernel/md.c
similarity index 97%
rename from patches/gromacs-4.6.5.diff/src/kernel/md.c
rename to patches/gromacs-4.6.6.diff/src/kernel/md.c
index a96f3bcd40e5f62081515b9a7980411be94a632c..741986ee9998d46db3024f82590f55cdebd0d8cb 100644
--- a/patches/gromacs-4.6.5.diff/src/kernel/md.c
+++ b/patches/gromacs-4.6.6.diff/src/kernel/md.c
@@ -358,6 +358,25 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                  (ir->bContinuation ||
                                   (DOMAINDECOMP(cr) && !MASTER(cr))) ?
                                  NULL : state_global->x);
+    if (shellfc && ir->nstcalcenergy != 1)
+    {
+        gmx_fatal(FARGS, "You have nstcalcenergy set to a value (%d) that is different from 1.\nThis is not supported in combinations with shell particles.\nPlease make a new tpr file.", ir->nstcalcenergy);
+    }
+    if (shellfc && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "In order to run parallel simulations with shells you need to use the -pd flag to mdrun.");
+    }
+    if (shellfc && ir->eI == eiNM)
+    {
+        /* Currently shells don't work with Normal Modes */
+        gmx_fatal(FARGS, "Normal Mode analysis is not supported with shells.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n");
+    }
+
+    if (vsite && ir->eI == eiNM)
+    {
+        /* Currently virtual sites don't work with Normal Modes */
+        gmx_fatal(FARGS, "Normal Mode analysis is not supported with virtual sites.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n");
+    }
 
     if (DEFORM(*ir))
     {
@@ -726,10 +745,10 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
     /* PLUMED */
     if(plumedswitch){
-/* detect plumed API version */
+      /* detect plumed API version */
       int pversion=0;
       plumed_cmd(plumedmain,"getApiVersion",&pversion);
-/* setting kbT is only implemented with api>1) */
+      /* setting kbT is only implemented with api>1) */
       real kbT=ir->opts.ref_t[0]*BOLTZ;
       if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT);
 
@@ -771,14 +790,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     }
     /* END PLUMED */
 
-    /* Set and write start time */
+    print_start(fplog, cr, runtime, "mdrun");
     runtime_start(runtime);
-    print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime);
     wallcycle_start(wcycle, ewcRUN);
-    if (fplog)
-    {
-        fprintf(fplog, "\n");
-    }
 
     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 #ifdef GMX_FAHCORE
@@ -1314,11 +1328,15 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
              * step to combine the long-range forces on these steps.
              * For nstcalclr=1 this is not done, since the forces would have been added
              * directly to the short-range forces already.
+             *
+             * TODO Remove various aspects of VV+twin-range in master
+             * branch, because VV integrators did not ever support
+             * twin-range multiple time stepping with constraints.
              */
             bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
             update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC,
-                          f, bUpdateDoLR, fr->f_twin, fcd,
+                          f, bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
                           ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1,
                           cr, nrnb, constr, &top->idef);
 
@@ -1369,6 +1387,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                        cr, nrnb, wcycle, upd, constr,
                                        bInitStep, TRUE, bCalcVir, vetanew);
 
+                    if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
+                    {
+                        /* Correct the virial for multiple time stepping */
+                        m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
+                    }
+
                     if (!bOK && !bFFscan)
                     {
                         gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
@@ -1553,6 +1577,10 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             fcReportProgress( ir->nsteps, step );
         }
 
+#if defined(__native_client__)
+        fcCheckin(MASTER(cr));
+#endif
+
         /* sync bCPT and fc record-keeping */
         if (bCPT && MASTER(cr))
         {
@@ -1731,7 +1759,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
             {
                 gmx_bool bIfRandomize;
-                bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
+                bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr, DOMAINDECOMP(cr));
                 /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
                 if (constr && bIfRandomize)
                 {
@@ -1816,7 +1844,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
                     /* velocity half-step update */
                     update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                                  bUpdateDoLR, fr->f_twin, fcd,
+                                  bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
                                   ekind, M, wcycle, upd, FALSE, etrtVELOCITY2,
                                   cr, nrnb, constr, &top->idef);
                 }
@@ -1833,7 +1861,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
                 update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                              bUpdateDoLR, fr->f_twin, fcd,
+                              bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
                               ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
                 wallcycle_stop(wcycle, ewcUPDATE);
 
@@ -1843,6 +1871,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                    cr, nrnb, wcycle, upd, constr,
                                    bInitStep, FALSE, bCalcVir, state->veta);
 
+                if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
+                {
+                    /* Correct the virial for multiple time stepping */
+                    m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
+                }
+
                 if (ir->eI == eiVVAK)
                 {
                     /* erase F_EKIN and F_TEMP here? */
@@ -1861,7 +1895,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
                     update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                                  bUpdateDoLR, fr->f_twin, fcd,
+                                  bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
                                   ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
                     wallcycle_stop(wcycle, ewcUPDATE);
 
diff --git a/patches/gromacs-4.6.5.diff/src/kernel/md.c.preplumed b/patches/gromacs-4.6.6.diff/src/kernel/md.c.preplumed
similarity index 97%
rename from patches/gromacs-4.6.5.diff/src/kernel/md.c.preplumed
rename to patches/gromacs-4.6.6.diff/src/kernel/md.c.preplumed
index 81fca912a8746a7b6f297836762a0e5912f7ad86..4c4a88c174dbd8b47337a13fed1d23c3dadf4c76 100644
--- a/patches/gromacs-4.6.5.diff/src/kernel/md.c.preplumed
+++ b/patches/gromacs-4.6.6.diff/src/kernel/md.c.preplumed
@@ -347,6 +347,25 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                  (ir->bContinuation ||
                                   (DOMAINDECOMP(cr) && !MASTER(cr))) ?
                                  NULL : state_global->x);
+    if (shellfc && ir->nstcalcenergy != 1)
+    {
+        gmx_fatal(FARGS, "You have nstcalcenergy set to a value (%d) that is different from 1.\nThis is not supported in combinations with shell particles.\nPlease make a new tpr file.", ir->nstcalcenergy);
+    }
+    if (shellfc && DOMAINDECOMP(cr))
+    {
+        gmx_fatal(FARGS, "In order to run parallel simulations with shells you need to use the -pd flag to mdrun.");
+    }
+    if (shellfc && ir->eI == eiNM)
+    {
+        /* Currently shells don't work with Normal Modes */
+        gmx_fatal(FARGS, "Normal Mode analysis is not supported with shells.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n");
+    }
+
+    if (vsite && ir->eI == eiNM)
+    {
+        /* Currently virtual sites don't work with Normal Modes */
+        gmx_fatal(FARGS, "Normal Mode analysis is not supported with virtual sites.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n");
+    }
 
     if (DEFORM(*ir))
     {
@@ -713,14 +732,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         fprintf(fplog, "\n");
     }
 
-    /* Set and write start time */
+    print_start(fplog, cr, runtime, "mdrun");
     runtime_start(runtime);
-    print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime);
     wallcycle_start(wcycle, ewcRUN);
-    if (fplog)
-    {
-        fprintf(fplog, "\n");
-    }
 
     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 #ifdef GMX_FAHCORE
@@ -1222,11 +1236,15 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
              * step to combine the long-range forces on these steps.
              * For nstcalclr=1 this is not done, since the forces would have been added
              * directly to the short-range forces already.
+             *
+             * TODO Remove various aspects of VV+twin-range in master
+             * branch, because VV integrators did not ever support
+             * twin-range multiple time stepping with constraints.
              */
             bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
             update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC,
-                          f, bUpdateDoLR, fr->f_twin, fcd,
+                          f, bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
                           ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1,
                           cr, nrnb, constr, &top->idef);
 
@@ -1277,6 +1295,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                        cr, nrnb, wcycle, upd, constr,
                                        bInitStep, TRUE, bCalcVir, vetanew);
 
+                    if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
+                    {
+                        /* Correct the virial for multiple time stepping */
+                        m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
+                    }
+
                     if (!bOK && !bFFscan)
                     {
                         gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
@@ -1461,6 +1485,10 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             fcReportProgress( ir->nsteps, step );
         }
 
+#if defined(__native_client__)
+        fcCheckin(MASTER(cr));
+#endif
+
         /* sync bCPT and fc record-keeping */
         if (bCPT && MASTER(cr))
         {
@@ -1639,7 +1667,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
             {
                 gmx_bool bIfRandomize;
-                bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
+                bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr, DOMAINDECOMP(cr));
                 /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
                 if (constr && bIfRandomize)
                 {
@@ -1724,7 +1752,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
                     /* velocity half-step update */
                     update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                                  bUpdateDoLR, fr->f_twin, fcd,
+                                  bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
                                   ekind, M, wcycle, upd, FALSE, etrtVELOCITY2,
                                   cr, nrnb, constr, &top->idef);
                 }
@@ -1741,7 +1769,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
                 update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                              bUpdateDoLR, fr->f_twin, fcd,
+                              bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
                               ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
                 wallcycle_stop(wcycle, ewcUPDATE);
 
@@ -1751,6 +1779,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                    cr, nrnb, wcycle, upd, constr,
                                    bInitStep, FALSE, bCalcVir, state->veta);
 
+                if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
+                {
+                    /* Correct the virial for multiple time stepping */
+                    m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
+                }
+
                 if (ir->eI == eiVVAK)
                 {
                     /* erase F_EKIN and F_TEMP here? */
@@ -1769,7 +1803,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
                     update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                                  bUpdateDoLR, fr->f_twin, fcd,
+                                  bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
                                   ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
                     wallcycle_stop(wcycle, ewcUPDATE);
 
diff --git a/patches/gromacs-4.6.5.diff/src/kernel/mdrun.c b/patches/gromacs-4.6.6.diff/src/kernel/mdrun.c
similarity index 97%
rename from patches/gromacs-4.6.5.diff/src/kernel/mdrun.c
rename to patches/gromacs-4.6.6.diff/src/kernel/mdrun.c
index e57c646b6155a6cdfb11317d3d4a806658b1980a..ca3b6574d221018a65ba557f97c1d4d2116a7c5f 100644
--- a/patches/gromacs-4.6.5.diff/src/kernel/mdrun.c
+++ b/patches/gromacs-4.6.6.diff/src/kernel/mdrun.c
@@ -106,10 +106,16 @@ int cmain(int argc, char *argv[])
         "With thread-MPI there are additional options [TT]-nt[tt], which sets",
         "the total number of threads, and [TT]-ntmpi[tt], which sets the number",
         "of thread-MPI threads.",
-        "Note that using combined MPI+OpenMP parallelization is almost always",
-        "slower than single parallelization, except at the scaling limit, where",
-        "especially OpenMP parallelization of PME reduces the communication cost.",
-        "OpenMP-only parallelization is much faster than MPI-only parallelization",
+        "The number of OpenMP threads used by [TT]mdrun[tt] can also be set with",
+        "the standard environment variable, [TT]OMP_NUM_THREADS[tt].",
+        "The [TT]GMX_PME_NUM_THREADS[tt] environment variable can be used to specify",
+        "the number of threads used by the PME-only processes.[PAR]",
+        "Note that combined MPI+OpenMP parallelization is in many cases",
+        "slower than either on its own. However, at high parallelization, using the",
+        "combination is often beneficial as it reduces the number of domains and/or",
+        "the number of MPI ranks. (Less and larger domains can improve scaling,",
+        "with separate PME processes fewer MPI ranks reduces communication cost.)",
+        "OpenMP-only parallelization is typically faster than MPI-only parallelization",
         "on a single CPU(-die). Since we currently don't have proper hardware",
         "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only",
         "automatically use OpenMP-only parallelization when you use up to 4",
@@ -199,13 +205,16 @@ int cmain(int argc, char *argv[])
         "[PAR]",
         "When PME is used with domain decomposition, separate nodes can",
         "be assigned to do only the PME mesh calculation;",
-        "this is computationally more efficient starting at about 12 nodes.",
+        "this is computationally more efficient starting at about 12 nodes",
+        "or even fewer when OpenMP parallelization is used.",
         "The number of PME nodes is set with option [TT]-npme[tt],",
         "this can not be more than half of the nodes.",
         "By default [TT]mdrun[tt] makes a guess for the number of PME",
-        "nodes when the number of nodes is larger than 11 or performance wise",
-        "not compatible with the PME grid x dimension.",
-        "But the user should optimize npme. Performance statistics on this issue",
+        "nodes when the number of nodes is larger than 16. With GPUs,",
+        "PME nodes are not selected automatically, since the optimal setup",
+        "depends very much on the details of the hardware.",
+        "In all cases you might gain performance by optimizing [TT]-npme[tt].",
+        "Performance statistics on this issue",
         "are written at the end of the log file.",
         "For good load balancing at high parallelization, the PME grid x and y",
         "dimensions should be divisible by the number of PME nodes",
diff --git a/patches/gromacs-4.6.5.diff/src/kernel/mdrun.c.preplumed b/patches/gromacs-4.6.6.diff/src/kernel/mdrun.c.preplumed
similarity index 97%
rename from patches/gromacs-4.6.5.diff/src/kernel/mdrun.c.preplumed
rename to patches/gromacs-4.6.6.diff/src/kernel/mdrun.c.preplumed
index 2ccc6f67469c0aab5263ec48636818785073f661..eb30fc99c6c6c360231f0b321772261f79b386c0 100644
--- a/patches/gromacs-4.6.5.diff/src/kernel/mdrun.c.preplumed
+++ b/patches/gromacs-4.6.6.diff/src/kernel/mdrun.c.preplumed
@@ -100,10 +100,16 @@ int cmain(int argc, char *argv[])
         "With thread-MPI there are additional options [TT]-nt[tt], which sets",
         "the total number of threads, and [TT]-ntmpi[tt], which sets the number",
         "of thread-MPI threads.",
-        "Note that using combined MPI+OpenMP parallelization is almost always",
-        "slower than single parallelization, except at the scaling limit, where",
-        "especially OpenMP parallelization of PME reduces the communication cost.",
-        "OpenMP-only parallelization is much faster than MPI-only parallelization",
+        "The number of OpenMP threads used by [TT]mdrun[tt] can also be set with",
+        "the standard environment variable, [TT]OMP_NUM_THREADS[tt].",
+        "The [TT]GMX_PME_NUM_THREADS[tt] environment variable can be used to specify",
+        "the number of threads used by the PME-only processes.[PAR]",
+        "Note that combined MPI+OpenMP parallelization is in many cases",
+        "slower than either on its own. However, at high parallelization, using the",
+        "combination is often beneficial as it reduces the number of domains and/or",
+        "the number of MPI ranks. (Less and larger domains can improve scaling,",
+        "with separate PME processes fewer MPI ranks reduces communication cost.)",
+        "OpenMP-only parallelization is typically faster than MPI-only parallelization",
         "on a single CPU(-die). Since we currently don't have proper hardware",
         "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only",
         "automatically use OpenMP-only parallelization when you use up to 4",
@@ -193,13 +199,16 @@ int cmain(int argc, char *argv[])
         "[PAR]",
         "When PME is used with domain decomposition, separate nodes can",
         "be assigned to do only the PME mesh calculation;",
-        "this is computationally more efficient starting at about 12 nodes.",
+        "this is computationally more efficient starting at about 12 nodes",
+        "or even fewer when OpenMP parallelization is used.",
         "The number of PME nodes is set with option [TT]-npme[tt],",
         "this can not be more than half of the nodes.",
         "By default [TT]mdrun[tt] makes a guess for the number of PME",
-        "nodes when the number of nodes is larger than 11 or performance wise",
-        "not compatible with the PME grid x dimension.",
-        "But the user should optimize npme. Performance statistics on this issue",
+        "nodes when the number of nodes is larger than 16. With GPUs,",
+        "PME nodes are not selected automatically, since the optimal setup",
+        "depends very much on the details of the hardware.",
+        "In all cases you might gain performance by optimizing [TT]-npme[tt].",
+        "Performance statistics on this issue",
         "are written at the end of the log file.",
         "For good load balancing at high parallelization, the PME grid x and y",
         "dimensions should be divisible by the number of PME nodes",
diff --git a/patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c b/patches/gromacs-4.6.6.diff/src/kernel/repl_ex.c
similarity index 97%
rename from patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c
rename to patches/gromacs-4.6.6.diff/src/kernel/repl_ex.c
index 9fa6dc5cea8a4847f536dbe99c4e19fee97f75c8..7f27136acd73ef02c830fd8fa6dab6712bb27071 100644
--- a/patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c
+++ b/patches/gromacs-4.6.6.diff/src/kernel/repl_ex.c
@@ -278,6 +278,12 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog,
             {
                 if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
                 {
+                    gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s",
+                              i, j,
+                              erename[re->type],
+                              re->q[re->type][i], re->q[re->type][j],
+                              erename[re->type]);
+
                     k          = re->ind[i];
                     re->ind[i] = re->ind[j];
                     re->ind[j] = k;
@@ -891,7 +897,7 @@ static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int
         dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
         if (bPrint)
         {
-            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\nb", dpV, delta + dpV);
+            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
         }
         delta += dpV;
     }
@@ -918,7 +924,7 @@ test_for_replica_exchange(FILE                 *fplog,
     gmx_bool  bVol     = FALSE;
 
     bMultiEx = (re->nex > 1);  /* multiple exchanges at each state */
-    fprintf(fplog, "Replica exchange at step " gmx_large_int_pfmt " time %g\n", step, time);
+    fprintf(fplog, "Replica exchange at step " gmx_large_int_pfmt " time %.5f\n", step, time);
 
     if (re->bNPT)
     {
@@ -1313,13 +1319,9 @@ compute_exchange_order(FILE     *fplog,
 
 static void
 prepare_to_do_exchange(FILE      *fplog,
-                       const int *destinations,
+                       struct gmx_repl_ex *re,
                        const int  replica_id,
-                       const int  nrepl,
                        int       *maxswap,
-                       int      **order,
-                       int      **cyclic,
-                       int       *incycle,
                        gmx_bool  *bThisReplicaExchanged)
 {
     int i, j;
@@ -1328,9 +1330,9 @@ prepare_to_do_exchange(FILE      *fplog,
     gmx_bool bAnyReplicaExchanged = FALSE;
     *bThisReplicaExchanged = FALSE;
 
-    for (i = 0; i < nrepl; i++)
+    for (i = 0; i < re->nrepl; i++)
     {
-        if (destinations[i] != i)
+        if (re->destinations[i] != re->ind[i])
         {
             /* only mark as exchanged if the index has been shuffled */
             bAnyReplicaExchanged = TRUE;
@@ -1340,27 +1342,27 @@ prepare_to_do_exchange(FILE      *fplog,
     if (bAnyReplicaExchanged)
     {
         /* reinitialize the placeholder arrays */
-        for (i = 0; i < nrepl; i++)
+        for (i = 0; i < re->nrepl; i++)
         {
-            for (j = 0; j < nrepl; j++)
+            for (j = 0; j < re->nrepl; j++)
             {
-                cyclic[i][j] = -1;
-                order[i][j]  = -1;
+                re->cyclic[i][j] = -1;
+                re->order[i][j]  = -1;
             }
         }
 
         /* Identify the cyclic decomposition of the permutation (very
          * fast if neighbor replica exchange). */
-        cyclic_decomposition(fplog, destinations, cyclic, incycle, nrepl, maxswap);
+        cyclic_decomposition(fplog, re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap);
 
         /* Now translate the decomposition into a replica exchange
          * order at each step. */
-        compute_exchange_order(fplog, cyclic, order, nrepl, *maxswap);
+        compute_exchange_order(fplog, re->cyclic, re->order, re->nrepl, *maxswap);
 
         /* Did this replica do any exchange at any point? */
         for (j = 0; j < *maxswap; j++)
         {
-            if (replica_id != order[replica_id][j])
+            if (replica_id != re->order[replica_id][j])
             {
                 *bThisReplicaExchanged = TRUE;
                 break;
@@ -1391,8 +1393,7 @@ gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *
     {
         replica_id  = re->repl;
         test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time);
-        prepare_to_do_exchange(fplog, re->destinations, replica_id, re->nrepl, &maxswap,
-                               re->order, re->cyclic, re->incycle, &bThisReplicaExchanged);
+        prepare_to_do_exchange(fplog, re, replica_id, &maxswap, &bThisReplicaExchanged);
     }
     /* Do intra-simulation broadcast so all processors belonging to
      * each simulation know whether they need to participate in
@@ -1440,10 +1441,10 @@ gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *
                 if (exchange_partner != replica_id)
                 {
                     /* Exchange the global states between the master nodes */
-                    //if (debug)
-                    //{
-                        fprintf(fplog, "Exchanging %d with %d\n", replica_id, exchange_partner);
-                    //}
+                    if (debug)
+                    {
+                        fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
+                    }
                     exchange_state(cr->ms, exchange_partner, state);
                 }
             }
diff --git a/patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c.preplumed b/patches/gromacs-4.6.6.diff/src/kernel/repl_ex.c.preplumed
similarity index 97%
rename from patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c.preplumed
rename to patches/gromacs-4.6.6.diff/src/kernel/repl_ex.c.preplumed
index 3bbb3f94ef4b0304e48cc62c7960ac130487625a..0f094d4e267aa65df321b1da7e12e548db2e96c2 100644
--- a/patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c.preplumed
+++ b/patches/gromacs-4.6.6.diff/src/kernel/repl_ex.c.preplumed
@@ -266,6 +266,16 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog,
             {
                 if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
                 {
+                    /* Unordered replicas are supposed to work, but there
+                     * is still an issues somewhere.
+                     * Note that at this point still re->ind[i]=i.
+                     */
+                    gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s",
+                              i, j,
+                              erename[re->type],
+                              re->q[re->type][i], re->q[re->type][j],
+                              erename[re->type]);
+
                     k          = re->ind[i];
                     re->ind[i] = re->ind[j];
                     re->ind[j] = k;
@@ -877,7 +887,7 @@ static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int
         dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
         if (bPrint)
         {
-            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\nb", dpV, delta + dpV);
+            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
         }
         delta += dpV;
     }
@@ -904,7 +914,7 @@ test_for_replica_exchange(FILE                 *fplog,
     gmx_bool  bVol     = FALSE;
 
     bMultiEx = (re->nex > 1);  /* multiple exchanges at each state */
-    fprintf(fplog, "Replica exchange at step " gmx_large_int_pfmt " time %g\n", step, time);
+    fprintf(fplog, "Replica exchange at step " gmx_large_int_pfmt " time %.5f\n", step, time);
 
     if (re->bNPT)
     {
@@ -1238,13 +1248,9 @@ compute_exchange_order(FILE     *fplog,
 
 static void
 prepare_to_do_exchange(FILE      *fplog,
-                       const int *destinations,
+                       struct gmx_repl_ex *re,
                        const int  replica_id,
-                       const int  nrepl,
                        int       *maxswap,
-                       int      **order,
-                       int      **cyclic,
-                       int       *incycle,
                        gmx_bool  *bThisReplicaExchanged)
 {
     int i, j;
@@ -1253,9 +1259,9 @@ prepare_to_do_exchange(FILE      *fplog,
     gmx_bool bAnyReplicaExchanged = FALSE;
     *bThisReplicaExchanged = FALSE;
 
-    for (i = 0; i < nrepl; i++)
+    for (i = 0; i < re->nrepl; i++)
     {
-        if (destinations[i] != i)
+        if (re->destinations[i] != re->ind[i])
         {
             /* only mark as exchanged if the index has been shuffled */
             bAnyReplicaExchanged = TRUE;
@@ -1265,27 +1271,27 @@ prepare_to_do_exchange(FILE      *fplog,
     if (bAnyReplicaExchanged)
     {
         /* reinitialize the placeholder arrays */
-        for (i = 0; i < nrepl; i++)
+        for (i = 0; i < re->nrepl; i++)
         {
-            for (j = 0; j < nrepl; j++)
+            for (j = 0; j < re->nrepl; j++)
             {
-                cyclic[i][j] = -1;
-                order[i][j]  = -1;
+                re->cyclic[i][j] = -1;
+                re->order[i][j]  = -1;
             }
         }
 
         /* Identify the cyclic decomposition of the permutation (very
          * fast if neighbor replica exchange). */
-        cyclic_decomposition(fplog, destinations, cyclic, incycle, nrepl, maxswap);
+        cyclic_decomposition(fplog, re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap);
 
         /* Now translate the decomposition into a replica exchange
          * order at each step. */
-        compute_exchange_order(fplog, cyclic, order, nrepl, *maxswap);
+        compute_exchange_order(fplog, re->cyclic, re->order, re->nrepl, *maxswap);
 
         /* Did this replica do any exchange at any point? */
         for (j = 0; j < *maxswap; j++)
         {
-            if (replica_id != order[replica_id][j])
+            if (replica_id != re->order[replica_id][j])
             {
                 *bThisReplicaExchanged = TRUE;
                 break;
@@ -1312,8 +1318,7 @@ gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *
     {
         replica_id  = re->repl;
         test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time);
-        prepare_to_do_exchange(fplog, re->destinations, replica_id, re->nrepl, &maxswap,
-                               re->order, re->cyclic, re->incycle, &bThisReplicaExchanged);
+        prepare_to_do_exchange(fplog, re, replica_id, &maxswap, &bThisReplicaExchanged);
     }
     /* Do intra-simulation broadcast so all processors belonging to
      * each simulation know whether they need to participate in
diff --git a/patches/gromacs-4.6.5.diff/src/mdlib/force.c b/patches/gromacs-4.6.6.diff/src/mdlib/force.c
similarity index 100%
rename from patches/gromacs-4.6.5.diff/src/mdlib/force.c
rename to patches/gromacs-4.6.6.diff/src/mdlib/force.c
diff --git a/patches/gromacs-4.6.5.diff/src/mdlib/force.c.preplumed b/patches/gromacs-4.6.6.diff/src/mdlib/force.c.preplumed
similarity index 100%
rename from patches/gromacs-4.6.5.diff/src/mdlib/force.c.preplumed
rename to patches/gromacs-4.6.6.diff/src/mdlib/force.c.preplumed
diff --git a/patches/gromacs-4.6.5.diff/src/mdlib/minimize.c b/patches/gromacs-4.6.6.diff/src/mdlib/minimize.c
similarity index 99%
rename from patches/gromacs-4.6.5.diff/src/mdlib/minimize.c
rename to patches/gromacs-4.6.6.diff/src/mdlib/minimize.c
index 79f002127165bbd84e3c7dc6e9073f06ff6db335..f800e6225704db6ab644f108bac0ebab94e19380 100644
--- a/patches/gromacs-4.6.5.diff/src/mdlib/minimize.c
+++ b/patches/gromacs-4.6.6.diff/src/mdlib/minimize.c
@@ -67,6 +67,7 @@
 #include "force.h"
 #include "mdrun.h"
 #include "md_support.h"
+#include "sim_util.h"
 #include "domdec.h"
 #include "partdec.h"
 #include "trnio.h"
@@ -114,15 +115,11 @@ static void print_em_start(FILE *fplog, t_commrec *cr, gmx_runtime_t *runtime,
                            gmx_wallcycle_t wcycle,
                            const char *name)
 {
-    char buf[STRLEN];
-
     runtime_start(runtime);
-
-    sprintf(buf, "Started %s", name);
-    print_date_and_time(fplog, cr->nodeid, buf, NULL);
-
     wallcycle_start(wcycle, ewcRUN);
+    print_start(fplog, cr, runtime, name);
 }
+
 static void em_time_end(FILE *fplog, t_commrec *cr, gmx_runtime_t *runtime,
                         gmx_wallcycle_t wcycle)
 {
diff --git a/patches/gromacs-4.6.5.diff/src/mdlib/minimize.c.preplumed b/patches/gromacs-4.6.6.diff/src/mdlib/minimize.c.preplumed
similarity index 99%
rename from patches/gromacs-4.6.5.diff/src/mdlib/minimize.c.preplumed
rename to patches/gromacs-4.6.6.diff/src/mdlib/minimize.c.preplumed
index 5df3936cfbef96b8583e01a95ddd9e0b27c8efb0..8afe436f44640703d2c7c6a93bc90861ae5cd038 100644
--- a/patches/gromacs-4.6.5.diff/src/mdlib/minimize.c.preplumed
+++ b/patches/gromacs-4.6.6.diff/src/mdlib/minimize.c.preplumed
@@ -67,6 +67,7 @@
 #include "force.h"
 #include "mdrun.h"
 #include "md_support.h"
+#include "sim_util.h"
 #include "domdec.h"
 #include "partdec.h"
 #include "trnio.h"
@@ -108,15 +109,11 @@ static void print_em_start(FILE *fplog, t_commrec *cr, gmx_runtime_t *runtime,
                            gmx_wallcycle_t wcycle,
                            const char *name)
 {
-    char buf[STRLEN];
-
     runtime_start(runtime);
-
-    sprintf(buf, "Started %s", name);
-    print_date_and_time(fplog, cr->nodeid, buf, NULL);
-
     wallcycle_start(wcycle, ewcRUN);
+    print_start(fplog, cr, runtime, name);
 }
+
 static void em_time_end(FILE *fplog, t_commrec *cr, gmx_runtime_t *runtime,
                         gmx_wallcycle_t wcycle)
 {