diff --git a/patches/gromacs-4.6.3.config b/patches/gromacs-4.6.5.config
similarity index 100%
rename from patches/gromacs-4.6.3.config
rename to patches/gromacs-4.6.5.config
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/CMakeLists.txt b/patches/gromacs-4.6.5.diff/src/kernel/CMakeLists.txt
similarity index 100%
rename from patches/gromacs-4.6.3.diff/src/kernel/CMakeLists.txt
rename to patches/gromacs-4.6.5.diff/src/kernel/CMakeLists.txt
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/CMakeLists.txt.preplumed b/patches/gromacs-4.6.5.diff/src/kernel/CMakeLists.txt.preplumed
similarity index 100%
rename from patches/gromacs-4.6.3.diff/src/kernel/CMakeLists.txt.preplumed
rename to patches/gromacs-4.6.5.diff/src/kernel/CMakeLists.txt.preplumed
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/md.c b/patches/gromacs-4.6.5.diff/src/kernel/md.c
similarity index 98%
rename from patches/gromacs-4.6.3.diff/src/kernel/md.c
rename to patches/gromacs-4.6.5.diff/src/kernel/md.c
index 187b2796a09a2cf80c82d9637db77093835ce358..5e7f1e7a8e54652f51a1a2b81e23869fb344d4c4 100644
--- a/patches/gromacs-4.6.3.diff/src/kernel/md.c
+++ b/patches/gromacs-4.6.5.diff/src/kernel/md.c
@@ -187,7 +187,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     int               nchkpt  = 1;
     gmx_localtop_t   *top;
     t_mdebin         *mdebin = NULL;
-    df_history_t      df_history;
     t_state          *state    = NULL;
     rvec             *f_global = NULL;
     int               n_xtc    = -1;
@@ -340,14 +339,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         snew(f, top_global->natoms);
     }
 
-    /* lambda Monte carlo random number generator  */
-    if (ir->bExpanded)
-    {
-        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
-    }
-    /* copy the state into df_history */
-    copy_df_history(&df_history, &state_global->dfhist);
-
     /* Kinetic energy data */
     snew(ekind, 1);
     init_ekindata(fplog, top_global, &(ir->opts), ekind);
@@ -442,7 +433,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             make_local_shells(cr, mdatoms, shellfc);
         }
 
-        init_bonded_thread_force_reduction(fr, &top->idef);
+        setup_bonded_threading(fr, &top->idef);
 
         if (ir->pull && PAR(cr))
         {
@@ -472,6 +463,11 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         bStateFromCP = FALSE;
     }
 
+    if (ir->bExpanded)
+    {
+        init_expanded_ensemble(bStateFromCP,ir,&mcrng,&state->dfhist);
+    }
+
     if (MASTER(cr))
     {
         if (bStateFromCP)
@@ -908,7 +904,8 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
             bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
             bDoFEP       = (do_per_step(step, nstfep) && (ir->efep != efepNO));
-            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
+            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
+                            && (ir->bExpanded) && (step > 0) && (!bStartingFromCpt));
         }
 
         if (bSimAnn)
@@ -1495,7 +1492,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                statistics, but if performing simulated tempering, we
                do update the velocities and the tau_t. */
 
-            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
+            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, mcrng, state->v, mdatoms);
+            /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
+            copy_df_history(&state_global->dfhist,&state->dfhist);
         }
         /* ################## START TRAJECTORY OUTPUT ################# */
 
@@ -1574,14 +1573,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                         state_global->ekinstate.bUpToDate = TRUE;
                     }
                     update_energyhistory(&state_global->enerhist, mdebin);
-                    if (ir->efep != efepNO || ir->bSimTemp)
-                    {
-                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
-                                                                       structured so this isn't necessary.
-                                                                       Note this reassignment is only necessary
-                                                                       for single threads.*/
-                        copy_df_history(&state_global->dfhist, &df_history);
-                    }
                 }
             }
             write_traj(fplog, cr, outf, mdof_flags, top_global,
@@ -2084,7 +2075,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             {
                 /* only needed if doing expanded ensemble */
                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
-                                          &df_history, state->fep_state, ir->nstlog, step);
+                                          &state_global->dfhist, state->fep_state, ir->nstlog, step);
             }
             if (!(bStartingFromCpt && (EI_VV(ir->eI))))
             {
@@ -2123,13 +2114,11 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         }
         if (bDoExpanded)
         {
-            /* Have to do this part after outputting the logfile and the edr file */
+            /* Have to do this part _after_ outputting the logfile and the edr file */
+            /* Gets written into the state at the beginning of next loop*/
             state->fep_state = lamnew;
-            for (i = 0; i < efptNR; i++)
-            {
-                state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
-            }
         }
+
         /* Remaining runtime */
         if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
         {
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/md.c.preplumed b/patches/gromacs-4.6.5.diff/src/kernel/md.c.preplumed
similarity index 98%
rename from patches/gromacs-4.6.3.diff/src/kernel/md.c.preplumed
rename to patches/gromacs-4.6.5.diff/src/kernel/md.c.preplumed
index f82e6974ca5095ff0eb4b7268ddf9b63798ca64d..81fca912a8746a7b6f297836762a0e5912f7ad86 100644
--- a/patches/gromacs-4.6.3.diff/src/kernel/md.c.preplumed
+++ b/patches/gromacs-4.6.5.diff/src/kernel/md.c.preplumed
@@ -181,7 +181,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     int               nchkpt  = 1;
     gmx_localtop_t   *top;
     t_mdebin         *mdebin = NULL;
-    df_history_t      df_history;
     t_state          *state    = NULL;
     rvec             *f_global = NULL;
     int               n_xtc    = -1;
@@ -330,14 +329,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         snew(f, top_global->natoms);
     }
 
-    /* lambda Monte carlo random number generator  */
-    if (ir->bExpanded)
-    {
-        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
-    }
-    /* copy the state into df_history */
-    copy_df_history(&df_history, &state_global->dfhist);
-
     /* Kinetic energy data */
     snew(ekind, 1);
     init_ekindata(fplog, top_global, &(ir->opts), ekind);
@@ -432,7 +423,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             make_local_shells(cr, mdatoms, shellfc);
         }
 
-        init_bonded_thread_force_reduction(fr, &top->idef);
+        setup_bonded_threading(fr, &top->idef);
 
         if (ir->pull && PAR(cr))
         {
@@ -462,6 +453,11 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         bStateFromCP = FALSE;
     }
 
+    if (ir->bExpanded)
+    {
+        init_expanded_ensemble(bStateFromCP,ir,&mcrng,&state->dfhist);
+    }
+
     if (MASTER(cr))
     {
         if (bStateFromCP)
@@ -858,7 +854,8 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
             bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
             bDoFEP       = (do_per_step(step, nstfep) && (ir->efep != efepNO));
-            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
+            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
+                            && (ir->bExpanded) && (step > 0) && (!bStartingFromCpt));
         }
 
         if (bSimAnn)
@@ -1415,7 +1412,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                statistics, but if performing simulated tempering, we
                do update the velocities and the tau_t. */
 
-            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
+            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, mcrng, state->v, mdatoms);
+            /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
+            copy_df_history(&state_global->dfhist,&state->dfhist);
         }
         /* ################## START TRAJECTORY OUTPUT ################# */
 
@@ -1494,14 +1493,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                         state_global->ekinstate.bUpToDate = TRUE;
                     }
                     update_energyhistory(&state_global->enerhist, mdebin);
-                    if (ir->efep != efepNO || ir->bSimTemp)
-                    {
-                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
-                                                                       structured so this isn't necessary.
-                                                                       Note this reassignment is only necessary
-                                                                       for single threads.*/
-                        copy_df_history(&state_global->dfhist, &df_history);
-                    }
                 }
             }
             write_traj(fplog, cr, outf, mdof_flags, top_global,
@@ -2004,7 +1995,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             {
                 /* only needed if doing expanded ensemble */
                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
-                                          &df_history, state->fep_state, ir->nstlog, step);
+                                          &state_global->dfhist, state->fep_state, ir->nstlog, step);
             }
             if (!(bStartingFromCpt && (EI_VV(ir->eI))))
             {
@@ -2043,13 +2034,11 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         }
         if (bDoExpanded)
         {
-            /* Have to do this part after outputting the logfile and the edr file */
+            /* Have to do this part _after_ outputting the logfile and the edr file */
+            /* Gets written into the state at the beginning of next loop*/
             state->fep_state = lamnew;
-            for (i = 0; i < efptNR; i++)
-            {
-                state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
-            }
         }
+
         /* Remaining runtime */
         if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
         {
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/mdrun.c b/patches/gromacs-4.6.5.diff/src/kernel/mdrun.c
similarity index 96%
rename from patches/gromacs-4.6.3.diff/src/kernel/mdrun.c
rename to patches/gromacs-4.6.5.diff/src/kernel/mdrun.c
index ba301007d08d9dcc0eeef4d656fb31b686e40b46..e64c399efec380dc86c23f7a2d4e997233b2ef98 100644
--- a/patches/gromacs-4.6.3.diff/src/kernel/mdrun.c
+++ b/patches/gromacs-4.6.5.diff/src/kernel/mdrun.c
@@ -127,15 +127,20 @@ int cmain(int argc, char *argv[])
         "[PAR]",
         "With GPUs (only supported with the Verlet cut-off scheme), the number",
         "of GPUs should match the number of MPI processes or MPI threads,",
-        "excluding PME-only processes/threads. With thread-MPI the number",
+        "excluding PME-only processes/threads. With thread-MPI, unless set on the command line, the number",
         "of MPI threads will automatically be set to the number of GPUs detected.",
-        "When you want to use a subset of the available GPUs, you can use",
-        "the [TT]-gpu_id[tt] option, where GPU id's are passed as a string,",
-        "e.g. 02 for using GPUs 0 and 2. When you want different GPU id's",
-        "on different nodes of a compute cluster, use the GMX_GPU_ID environment",
-        "variable instead. The format for GMX_GPU_ID is identical to ",
-        "[TT]-gpu_id[tt], but an environment variable can have different values",
-        "on different nodes of a cluster.",
+        "To use a subset of the available GPUs, or to manually provide a mapping of",
+        "GPUs to PP ranks, you can use the [TT]-gpu_id[tt] option. The argument of [TT]-gpu_id[tt] is",
+        "a string of digits (without delimiter) representing device id-s of the GPUs to be used.",
+        "For example, \"[TT]02[tt]\" specifies using GPUs 0 and 2 in the first and second PP ranks per compute node",
+        "respectively. To select different sets of GPU-s",
+        "on different nodes of a compute cluster, use the [TT]GMX_GPU_ID[tt] environment",
+        "variable instead. The format for [TT]GMX_GPU_ID[tt] is identical to ",
+        "[TT]-gpu_id[tt], with the difference that an environment variable can have",
+        "different values on different compute nodes. Multiple MPI ranks on each node",
+        "can share GPUs. This is accomplished by specifying the id(s) of the GPU(s)",
+        "multiple times, e.g. \"[TT]0011[tt]\" for four ranks sharing two GPUs in this node.",
+        "This works within a single simulation, or a multi-simulation, with any form of MPI.",
         "[PAR]",
         "When using PME with separate PME nodes or with a GPU, the two major",
         "compute tasks, the non-bonded force calculation and the PME calculation",
@@ -458,7 +463,12 @@ int cmain(int argc, char *argv[])
     output_env_t  oenv                  = NULL;
     const char   *deviceOptions         = "";
 
-    gmx_hw_opt_t  hw_opt = {0, 0, 0, 0, threadaffSEL, 0, 0, NULL};
+    /* Non transparent initialization of a complex gmx_hw_opt_t struct.
+     * But unfortunately we are not allowed to call a function here,
+     * since declarations follow below.
+     */
+    gmx_hw_opt_t  hw_opt = { 0, 0, 0, 0, threadaffSEL, 0, 0,
+                             { NULL, FALSE, 0, NULL } };
 
     t_pargs       pa[] = {
 
@@ -484,8 +494,8 @@ int cmain(int argc, char *argv[])
           "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" },
         { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
           "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
-        { "-gpu_id",  FALSE, etSTR, {&hw_opt.gpu_id},
-          "List of GPU id's to use" },
+        { "-gpu_id",  FALSE, etSTR, {&hw_opt.gpu_opt.gpu_id},
+          "List of GPU device id-s to use, specifies the per-node PP rank to GPU mapping" },
         { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
           "Check for all bonded interactions with DD" },
         { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm},
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/mdrun.c.preplumed b/patches/gromacs-4.6.5.diff/src/kernel/mdrun.c.preplumed
similarity index 95%
rename from patches/gromacs-4.6.3.diff/src/kernel/mdrun.c.preplumed
rename to patches/gromacs-4.6.5.diff/src/kernel/mdrun.c.preplumed
index a5c0e6a6256def6a24b6b0bb702c5eef114b58c5..2ccc6f67469c0aab5263ec48636818785073f661 100644
--- a/patches/gromacs-4.6.3.diff/src/kernel/mdrun.c.preplumed
+++ b/patches/gromacs-4.6.5.diff/src/kernel/mdrun.c.preplumed
@@ -121,15 +121,20 @@ int cmain(int argc, char *argv[])
         "[PAR]",
         "With GPUs (only supported with the Verlet cut-off scheme), the number",
         "of GPUs should match the number of MPI processes or MPI threads,",
-        "excluding PME-only processes/threads. With thread-MPI the number",
+        "excluding PME-only processes/threads. With thread-MPI, unless set on the command line, the number",
         "of MPI threads will automatically be set to the number of GPUs detected.",
-        "When you want to use a subset of the available GPUs, you can use",
-        "the [TT]-gpu_id[tt] option, where GPU id's are passed as a string,",
-        "e.g. 02 for using GPUs 0 and 2. When you want different GPU id's",
-        "on different nodes of a compute cluster, use the GMX_GPU_ID environment",
-        "variable instead. The format for GMX_GPU_ID is identical to ",
-        "[TT]-gpu_id[tt], but an environment variable can have different values",
-        "on different nodes of a cluster.",
+        "To use a subset of the available GPUs, or to manually provide a mapping of",
+        "GPUs to PP ranks, you can use the [TT]-gpu_id[tt] option. The argument of [TT]-gpu_id[tt] is",
+        "a string of digits (without delimiter) representing device id-s of the GPUs to be used.",
+        "For example, \"[TT]02[tt]\" specifies using GPUs 0 and 2 in the first and second PP ranks per compute node",
+        "respectively. To select different sets of GPU-s",
+        "on different nodes of a compute cluster, use the [TT]GMX_GPU_ID[tt] environment",
+        "variable instead. The format for [TT]GMX_GPU_ID[tt] is identical to ",
+        "[TT]-gpu_id[tt], with the difference that an environment variable can have",
+        "different values on different compute nodes. Multiple MPI ranks on each node",
+        "can share GPUs. This is accomplished by specifying the id(s) of the GPU(s)",
+        "multiple times, e.g. \"[TT]0011[tt]\" for four ranks sharing two GPUs in this node.",
+        "This works within a single simulation, or a multi-simulation, with any form of MPI.",
         "[PAR]",
         "When using PME with separate PME nodes or with a GPU, the two major",
         "compute tasks, the non-bonded force calculation and the PME calculation",
@@ -451,7 +456,12 @@ int cmain(int argc, char *argv[])
     output_env_t  oenv                  = NULL;
     const char   *deviceOptions         = "";
 
-    gmx_hw_opt_t  hw_opt = {0, 0, 0, 0, threadaffSEL, 0, 0, NULL};
+    /* Non transparent initialization of a complex gmx_hw_opt_t struct.
+     * But unfortunately we are not allowed to call a function here,
+     * since declarations follow below.
+     */
+    gmx_hw_opt_t  hw_opt = { 0, 0, 0, 0, threadaffSEL, 0, 0,
+                             { NULL, FALSE, 0, NULL } };
 
     t_pargs       pa[] = {
 
@@ -477,8 +487,8 @@ int cmain(int argc, char *argv[])
           "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" },
         { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
           "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
-        { "-gpu_id",  FALSE, etSTR, {&hw_opt.gpu_id},
-          "List of GPU id's to use" },
+        { "-gpu_id",  FALSE, etSTR, {&hw_opt.gpu_opt.gpu_id},
+          "List of GPU device id-s to use, specifies the per-node PP rank to GPU mapping" },
         { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
           "Check for all bonded interactions with DD" },
         { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm},
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c b/patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c
similarity index 99%
rename from patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c
rename to patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c
index 2e79c67ad53769d9b8df84b99b66ca2bfa18de30..9fa6dc5cea8a4847f536dbe99c4e19fee97f75c8 100644
--- a/patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c
+++ b/patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c
@@ -1422,6 +1422,10 @@ gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *
                 pd_collect_state(cr, state);
             }
         }
+        else
+        {
+            copy_state_nonatomdata(state_local, state);
+        }
 
         if (MASTER(cr))
         {
diff --git a/patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c.preplumed b/patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c.preplumed
similarity index 99%
rename from patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c.preplumed
rename to patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c.preplumed
index 60cd71848a783e2395493ef9bd3b75d3fdfd9a95..3bbb3f94ef4b0304e48cc62c7960ac130487625a 100644
--- a/patches/gromacs-4.6.3.diff/src/kernel/repl_ex.c.preplumed
+++ b/patches/gromacs-4.6.5.diff/src/kernel/repl_ex.c.preplumed
@@ -1343,6 +1343,10 @@ gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *
                 pd_collect_state(cr, state);
             }
         }
+        else
+        {
+            copy_state_nonatomdata(state_local, state);
+        }
 
         if (MASTER(cr))
         {