diff --git a/patches/gromacs-5.0.diff/src/gromacs/CMakeLists.txt b/patches/gromacs-5.0.diff/src/gromacs/CMakeLists.txt
index 272bb55bb1cdcbb803523956d03356e5eec63fad..cc97aa80501c8b4ffcf508a7f9fb4df40033131f 100644
--- a/patches/gromacs-5.0.diff/src/gromacs/CMakeLists.txt
+++ b/patches/gromacs-5.0.diff/src/gromacs/CMakeLists.txt
@@ -61,11 +61,6 @@ if(GMX_USE_TNG)
         include_directories(${TNG_IO_INCLUDE_DIRS})
     endif()
     if(NOT GMX_EXTERNAL_TNG)
-        # TNG wants zlib if it is available
-        find_package(ZLIB QUIET)
-        include(gmxTestZLib)
-        gmx_test_zlib(HAVE_ZLIB)
-
         include(${CMAKE_SOURCE_DIR}/src/external/tng_io/BuildTNG.cmake)
         tng_get_source_list(TNG_SOURCES TNG_IO_DEFINITIONS)
         list(APPEND LIBGROMACS_SOURCES ${TNG_SOURCES})
@@ -73,11 +68,12 @@ if(GMX_USE_TNG)
 
         if (HAVE_ZLIB)
             list(APPEND GMX_EXTRA_LIBRARIES ${ZLIB_LIBRARIES})
+            include_directories(${ZLIB_INCLUDE_DIRS})
         endif()
     endif()
 else()
     # We still need to get tng/tng_io_fwd.h from somewhere!
-    include_directories(${CMAKE_SOURCE_DIR}/src/external/tng_io/include)
+    include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/tng_io/include)
 endif()
 
 add_subdirectory(gmxlib)
@@ -205,7 +201,11 @@ set_target_properties(libgromacs PROPERTIES
 # Only install the library in mdrun-only mode if it is actually necessary
 # for the binary
 if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS)
-    install(TARGETS libgromacs DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
+    install(TARGETS libgromacs
+        LIBRARY DESTINATION ${LIB_INSTALL_DIR}
+        RUNTIME DESTINATION ${BIN_INSTALL_DIR}
+        ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
+        COMPONENT libraries)
 endif()
 
 if (NOT GMX_BUILD_MDRUN_ONLY)
diff --git a/patches/gromacs-5.0.diff/src/gromacs/CMakeLists.txt.preplumed b/patches/gromacs-5.0.diff/src/gromacs/CMakeLists.txt.preplumed
index 2045cebcc6f06c8be921c8d9589e6701d6e6f004..6db37e2402e57b5ead038c6f4fe53bdebb24c2f1 100644
--- a/patches/gromacs-5.0.diff/src/gromacs/CMakeLists.txt.preplumed
+++ b/patches/gromacs-5.0.diff/src/gromacs/CMakeLists.txt.preplumed
@@ -59,11 +59,6 @@ if(GMX_USE_TNG)
         include_directories(${TNG_IO_INCLUDE_DIRS})
     endif()
     if(NOT GMX_EXTERNAL_TNG)
-        # TNG wants zlib if it is available
-        find_package(ZLIB QUIET)
-        include(gmxTestZLib)
-        gmx_test_zlib(HAVE_ZLIB)
-
         include(${CMAKE_SOURCE_DIR}/src/external/tng_io/BuildTNG.cmake)
         tng_get_source_list(TNG_SOURCES TNG_IO_DEFINITIONS)
         list(APPEND LIBGROMACS_SOURCES ${TNG_SOURCES})
@@ -71,11 +66,12 @@ if(GMX_USE_TNG)
 
         if (HAVE_ZLIB)
             list(APPEND GMX_EXTRA_LIBRARIES ${ZLIB_LIBRARIES})
+            include_directories(${ZLIB_INCLUDE_DIRS})
         endif()
     endif()
 else()
     # We still need to get tng/tng_io_fwd.h from somewhere!
-    include_directories(${CMAKE_SOURCE_DIR}/src/external/tng_io/include)
+    include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/tng_io/include)
 endif()
 
 add_subdirectory(gmxlib)
@@ -203,7 +199,11 @@ set_target_properties(libgromacs PROPERTIES
 # Only install the library in mdrun-only mode if it is actually necessary
 # for the binary
 if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS)
-    install(TARGETS libgromacs DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
+    install(TARGETS libgromacs
+        LIBRARY DESTINATION ${LIB_INSTALL_DIR}
+        RUNTIME DESTINATION ${BIN_INSTALL_DIR}
+        ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
+        COMPONENT libraries)
 endif()
 
 if (NOT GMX_BUILD_MDRUN_ONLY)
diff --git a/patches/gromacs-5.0.diff/src/gromacs/mdlib/force.c b/patches/gromacs-5.0.diff/src/gromacs/mdlib/force.c
index f4de01a74c8ac75b2efd99969a1be24cc204afa3..8227d5b18f07945a30fcb949b6047f16d9ff0efc 100644
--- a/patches/gromacs-5.0.diff/src/gromacs/mdlib/force.c
+++ b/patches/gromacs-5.0.diff/src/gromacs/mdlib/force.c
@@ -125,9 +125,11 @@ static void reduce_thread_forces(int n, rvec *f,
                                  int nthreads, f_thread_t *f_t)
 {
     int t, i;
+    int nthreads_loop gmx_unused;
 
     /* This reduction can run over any number of threads */
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static)
+    nthreads_loop = gmx_omp_nthreads_get(emntBonded);
+#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static)
     for (i = 0; i < n; i++)
     {
         for (t = 1; t < nthreads; t++)
@@ -547,14 +549,11 @@ void do_force_lowlevel(FILE       *fplog,   gmx_int64_t step,
 
                     ewald_LRcorrection(fr->excl_load[t], fr->excl_load[t+1],
                                        cr, t, fr,
-                                       md->chargeA,
-                                       md->nChargePerturbed ? md->chargeB : NULL,
-                                       md->sqrt_c6A,
-                                       md->nTypePerturbed ? md->sqrt_c6B : NULL,
-                                       md->sigmaA,
-                                       md->nTypePerturbed ? md->sigmaB : NULL,
-                                       md->sigma3A,
-                                       md->nTypePerturbed ? md->sigma3B : NULL,
+                                       md->chargeA, md->chargeB,
+                                       md->sqrt_c6A, md->sqrt_c6B,
+                                       md->sigmaA, md->sigmaB,
+                                       md->sigma3A, md->sigma3B,
+                                       md->nChargePerturbed || md->nTypePerturbed,
                                        ir->cutoff_scheme != ecutsVERLET,
                                        excl, x, bSB ? boxs : box, mu_tot,
                                        ir->ewald_geometry,
diff --git a/patches/gromacs-5.0.diff/src/gromacs/mdlib/force.c.preplumed b/patches/gromacs-5.0.diff/src/gromacs/mdlib/force.c.preplumed
index 632c2f3a4664ffb00ae72262bcdcf0c6e0dd905a..5230983cbe17802a374739a500948b343d4af8ba 100644
--- a/patches/gromacs-5.0.diff/src/gromacs/mdlib/force.c.preplumed
+++ b/patches/gromacs-5.0.diff/src/gromacs/mdlib/force.c.preplumed
@@ -117,9 +117,11 @@ static void reduce_thread_forces(int n, rvec *f,
                                  int nthreads, f_thread_t *f_t)
 {
     int t, i;
+    int nthreads_loop gmx_unused;
 
     /* This reduction can run over any number of threads */
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static)
+    nthreads_loop = gmx_omp_nthreads_get(emntBonded);
+#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static)
     for (i = 0; i < n; i++)
     {
         for (t = 1; t < nthreads; t++)
@@ -539,14 +541,11 @@ void do_force_lowlevel(FILE       *fplog,   gmx_int64_t step,
 
                     ewald_LRcorrection(fr->excl_load[t], fr->excl_load[t+1],
                                        cr, t, fr,
-                                       md->chargeA,
-                                       md->nChargePerturbed ? md->chargeB : NULL,
-                                       md->sqrt_c6A,
-                                       md->nTypePerturbed ? md->sqrt_c6B : NULL,
-                                       md->sigmaA,
-                                       md->nTypePerturbed ? md->sigmaB : NULL,
-                                       md->sigma3A,
-                                       md->nTypePerturbed ? md->sigma3B : NULL,
+                                       md->chargeA, md->chargeB,
+                                       md->sqrt_c6A, md->sqrt_c6B,
+                                       md->sigmaA, md->sigmaB,
+                                       md->sigma3A, md->sigma3B,
+                                       md->nChargePerturbed || md->nTypePerturbed,
                                        ir->cutoff_scheme != ecutsVERLET,
                                        excl, x, bSB ? boxs : box, mu_tot,
                                        ir->ewald_geometry,
diff --git a/patches/gromacs-5.0.diff/src/gromacs/mdlib/minimize.c b/patches/gromacs-5.0.diff/src/gromacs/mdlib/minimize.c
index 5b29bd7017b0c64e15a06db83d7522736770d75f..1c1d0b00b65950f501f59544af8de53dc91c85a8 100644
--- a/patches/gromacs-5.0.diff/src/gromacs/mdlib/minimize.c
+++ b/patches/gromacs-5.0.diff/src/gromacs/mdlib/minimize.c
@@ -317,7 +317,8 @@ void init_em(FILE *fplog, const char *title,
              gmx_vsite_t *vsite, gmx_constr_t constr,
              int nfile, const t_filenm fnm[],
              gmx_mdoutf_t *outf, t_mdebin **mdebin,
-             int imdport, unsigned long gmx_unused Flags)
+             int imdport, unsigned long gmx_unused Flags,
+             gmx_wallcycle_t wcycle)
 {
     int  i;
     real dvdl_constr;
@@ -434,7 +435,7 @@ void init_em(FILE *fplog, const char *title,
         *gstat = global_stat_init(ir);
     }
 
-    *outf = init_mdoutf(fplog, nfile, fnm, 0, cr, ir, top_global, NULL);
+    *outf = init_mdoutf(fplog, nfile, fnm, 0, cr, ir, top_global, NULL, wcycle);
 
     snew(*enerd, 1);
     init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
@@ -593,6 +594,7 @@ static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md,
     int      start, end;
     rvec    *x1, *x2;
     real     dvdl_constr;
+    int      nthreads gmx_unused;
 
     s1 = &ems1->s;
     s2 = &ems2->s;
@@ -630,7 +632,8 @@ static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md,
     x1 = s1->x;
     x2 = s2->x;
 
-#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
+    nthreads = gmx_omp_nthreads_get(emntUpdate);
+#pragma omp parallel num_threads(nthreads)
     {
         int gf, i, m;
 
@@ -1072,7 +1075,7 @@ double do_cg(FILE *fplog, t_commrec *cr,
     init_em(fplog, CG, cr, inputrec,
             state_global, top_global, s_min, &top, &f, &f_global,
             nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
-            nfile, fnm, &outf, &mdebin, imdport, Flags);
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
 
     /* Print to log file */
     print_em_start(fplog, cr, walltime_accounting, wcycle, CG);
@@ -1744,7 +1747,7 @@ double do_lbfgs(FILE *fplog, t_commrec *cr,
     init_em(fplog, LBFGS, cr, inputrec,
             state, top_global, &ems, &top, &f, &f_global,
             nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
-            nfile, fnm, &outf, &mdebin, imdport, Flags);
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
     /* Do_lbfgs is not completely updated like do_steep and do_cg,
      * so we free some memory again.
      */
@@ -2489,7 +2492,7 @@ double do_steep(FILE *fplog, t_commrec *cr,
     init_em(fplog, SD, cr, inputrec,
             state_global, top_global, s_try, &top, &f, &f_global,
             nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
-            nfile, fnm, &outf, &mdebin, imdport, Flags);
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
 
     /* Print to log file  */
     print_em_start(fplog, cr, walltime_accounting, wcycle, SD);
@@ -2741,7 +2744,7 @@ double do_nm(FILE *fplog, t_commrec *cr,
             state_global, top_global, state_work, &top,
             &f, &f_global,
             nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
-            nfile, fnm, &outf, NULL, imdport, Flags);
+            nfile, fnm, &outf, NULL, imdport, Flags, wcycle);
 
     natoms = top_global->natoms;
     snew(fneg, natoms);
diff --git a/patches/gromacs-5.0.diff/src/gromacs/mdlib/minimize.c.preplumed b/patches/gromacs-5.0.diff/src/gromacs/mdlib/minimize.c.preplumed
index ab48e44b42c6ad7274b2108f77e1a2b36dfcc733..69008f53fabcb3388951c01d7554d479c67ccdca 100644
--- a/patches/gromacs-5.0.diff/src/gromacs/mdlib/minimize.c.preplumed
+++ b/patches/gromacs-5.0.diff/src/gromacs/mdlib/minimize.c.preplumed
@@ -310,7 +310,8 @@ void init_em(FILE *fplog, const char *title,
              gmx_vsite_t *vsite, gmx_constr_t constr,
              int nfile, const t_filenm fnm[],
              gmx_mdoutf_t *outf, t_mdebin **mdebin,
-             int imdport, unsigned long gmx_unused Flags)
+             int imdport, unsigned long gmx_unused Flags,
+             gmx_wallcycle_t wcycle)
 {
     int  i;
     real dvdl_constr;
@@ -427,7 +428,7 @@ void init_em(FILE *fplog, const char *title,
         *gstat = global_stat_init(ir);
     }
 
-    *outf = init_mdoutf(fplog, nfile, fnm, 0, cr, ir, top_global, NULL);
+    *outf = init_mdoutf(fplog, nfile, fnm, 0, cr, ir, top_global, NULL, wcycle);
 
     snew(*enerd, 1);
     init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
@@ -549,6 +550,7 @@ static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md,
     int      start, end;
     rvec    *x1, *x2;
     real     dvdl_constr;
+    int      nthreads gmx_unused;
 
     s1 = &ems1->s;
     s2 = &ems2->s;
@@ -586,7 +588,8 @@ static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md,
     x1 = s1->x;
     x2 = s2->x;
 
-#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
+    nthreads = gmx_omp_nthreads_get(emntUpdate);
+#pragma omp parallel num_threads(nthreads)
     {
         int gf, i, m;
 
@@ -1000,7 +1003,7 @@ double do_cg(FILE *fplog, t_commrec *cr,
     init_em(fplog, CG, cr, inputrec,
             state_global, top_global, s_min, &top, &f, &f_global,
             nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
-            nfile, fnm, &outf, &mdebin, imdport, Flags);
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
 
     /* Print to log file */
     print_em_start(fplog, cr, walltime_accounting, wcycle, CG);
@@ -1672,7 +1675,7 @@ double do_lbfgs(FILE *fplog, t_commrec *cr,
     init_em(fplog, LBFGS, cr, inputrec,
             state, top_global, &ems, &top, &f, &f_global,
             nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
-            nfile, fnm, &outf, &mdebin, imdport, Flags);
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
     /* Do_lbfgs is not completely updated like do_steep and do_cg,
      * so we free some memory again.
      */
@@ -2417,7 +2420,7 @@ double do_steep(FILE *fplog, t_commrec *cr,
     init_em(fplog, SD, cr, inputrec,
             state_global, top_global, s_try, &top, &f, &f_global,
             nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
-            nfile, fnm, &outf, &mdebin, imdport, Flags);
+            nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle);
 
     /* Print to log file  */
     print_em_start(fplog, cr, walltime_accounting, wcycle, SD);
@@ -2669,7 +2672,7 @@ double do_nm(FILE *fplog, t_commrec *cr,
             state_global, top_global, state_work, &top,
             &f, &f_global,
             nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
-            nfile, fnm, &outf, NULL, imdport, Flags);
+            nfile, fnm, &outf, NULL, imdport, Flags, wcycle);
 
     natoms = top_global->natoms;
     snew(fneg, natoms);
diff --git a/patches/gromacs-5.0.diff/src/programs/mdrun/md.c b/patches/gromacs-5.0.diff/src/programs/mdrun/md.c
index 8b54282d6b65b83e79855ab5a90439e5c8c5b91d..481aec4c657a901d6a30b51a60c34a8ed4d143af 100644
--- a/patches/gromacs-5.0.diff/src/programs/mdrun/md.c
+++ b/patches/gromacs-5.0.diff/src/programs/mdrun/md.c
@@ -309,7 +309,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             &(state_global->fep_state), lam0,
             nrnb, top_global, &upd,
             nfile, fnm, &outf, &mdebin,
-            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags);
+            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags, wcycle);
 
     clear_mat(total_vir);
     clear_mat(pres);
@@ -1176,6 +1176,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         if (bVV && !bStartingFromCpt && !bRerunMD)
         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
         {
+            wallcycle_start(wcycle, ewcUPDATE);
             if (ir->eI == eiVV && bInitStep)
             {
                 /* if using velocity verlet with full time step Ekin,
@@ -1250,11 +1251,13 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 bOK = TRUE;
                 if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
                 {
+                    wallcycle_stop(wcycle, ewcUPDATE);
                     update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
                                        state, fr->bMolPBC, graph, f,
                                        &top->idef, shake_vir,
                                        cr, nrnb, wcycle, upd, constr,
                                        TRUE, bCalcVir, vetanew);
+                    wallcycle_start(wcycle, ewcUPDATE);
 
                     if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
                     {
@@ -1294,6 +1297,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                    So we need information from the last step in the first half of the integration */
                 if (bGStat || do_per_step(step-1, nstglobalcomm))
                 {
+                    wallcycle_stop(wcycle, ewcUPDATE);
                     compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
                                     wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
                                     constr, NULL, FALSE, state->box,
@@ -1314,6 +1318,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                        time step kinetic energy for the pressure (always true now, since we want accurate statistics).
                        b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
                        EkinAveVel because it's needed for the pressure */
+                    wallcycle_start(wcycle, ewcUPDATE);
                 }
                 /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
                 if (!bInitStep)
@@ -1327,7 +1332,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     {
                         if (bExchanged)
                         {
-
+                            wallcycle_stop(wcycle, ewcUPDATE);
                             /* We need the kinetic energy at minus the half step for determining
                              * the full step kinetic energy and possibly for T-coupling.*/
                             /* This may not be quite working correctly yet . . . . */
@@ -1336,6 +1341,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                             constr, NULL, FALSE, state->box,
                                             top_global, &bSumEkinhOld,
                                             CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+                            wallcycle_start(wcycle, ewcUPDATE);
                         }
                     }
                 }
@@ -1365,6 +1371,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             {
                 copy_rvecn(cbuf, state->v, 0, state->natoms);
             }
+            wallcycle_stop(wcycle, ewcUPDATE);
         }
 
         /* MRS -- now done iterating -- compute the conserved quantity */
@@ -1407,7 +1414,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
                                  ir, state, state_global, top_global, fr,
                                  outf, mdebin, ekind, f, f_global,
-                                 wcycle, &nchkpt,
+                                 &nchkpt,
                                  bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT),
                                  bSumEkinhOld);
         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
@@ -1989,6 +1996,21 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     }
                     dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
 
+                    if (bPMETuneRunning &&
+                        fr->nbv->bUseGPU && DOMAINDECOMP(cr) &&
+                        !(cr->duty & DUTY_PME))
+                    {
+                        /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+                         * With GPUs + separate PME ranks, we don't want DLB.
+                         * This could happen when we scan coarse grids and
+                         * it would then never be turned off again.
+                         * This would hurt performance at the final, optimal
+                         * grid spacing, where DLB almost never helps.
+                         * Also, DLB can limit the cut-off for PME tuning.
+                         */
+                        dd_dlb_set_lock(cr->dd, TRUE);
+                    }
+
                     if (bPMETuneRunning || step_rel > ir->nstlist*50)
                     {
                         bPMETuneTry     = FALSE;
@@ -2019,6 +2041,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     {
                         calc_enervirdiff(NULL, ir->eDispCorr, fr);
                     }
+
+                    if (!bPMETuneRunning &&
+                        DOMAINDECOMP(cr) &&
+                        dd_dlb_is_locked(cr->dd))
+                    {
+                        /* Unlock the DLB=auto, DLB is allowed to activate
+                         * (but we don't expect it to activate in most cases).
+                         */
+                        dd_dlb_set_lock(cr->dd, FALSE);
+                    }
                 }
                 cycles_pmes = 0;
             }
@@ -2049,6 +2081,10 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     /* End of main MD loop */
     debug_gmx();
 
+    /* Closing TNG files can include compressing data. Therefore it is good to do that
+     * before stopping the time measurements. */
+    mdoutf_tng_close(outf);
+
     /* Stop measuring walltime */
     walltime_accounting_end(walltime_accounting);
 
diff --git a/patches/gromacs-5.0.diff/src/programs/mdrun/md.c.preplumed b/patches/gromacs-5.0.diff/src/programs/mdrun/md.c.preplumed
index 5fb9b70118317957d50ee5ad803b94dd42aab242..3d98d597c7d0e9c009e02d816e631b3f36846a25 100644
--- a/patches/gromacs-5.0.diff/src/programs/mdrun/md.c.preplumed
+++ b/patches/gromacs-5.0.diff/src/programs/mdrun/md.c.preplumed
@@ -298,7 +298,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             &(state_global->fep_state), lam0,
             nrnb, top_global, &upd,
             nfile, fnm, &outf, &mdebin,
-            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags);
+            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags, wcycle);
 
     clear_mat(total_vir);
     clear_mat(pres);
@@ -1089,6 +1089,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         if (bVV && !bStartingFromCpt && !bRerunMD)
         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
         {
+            wallcycle_start(wcycle, ewcUPDATE);
             if (ir->eI == eiVV && bInitStep)
             {
                 /* if using velocity verlet with full time step Ekin,
@@ -1163,11 +1164,13 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 bOK = TRUE;
                 if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
                 {
+                    wallcycle_stop(wcycle, ewcUPDATE);
                     update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
                                        state, fr->bMolPBC, graph, f,
                                        &top->idef, shake_vir,
                                        cr, nrnb, wcycle, upd, constr,
                                        TRUE, bCalcVir, vetanew);
+                    wallcycle_start(wcycle, ewcUPDATE);
 
                     if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
                     {
@@ -1207,6 +1210,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                    So we need information from the last step in the first half of the integration */
                 if (bGStat || do_per_step(step-1, nstglobalcomm))
                 {
+                    wallcycle_stop(wcycle, ewcUPDATE);
                     compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
                                     wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
                                     constr, NULL, FALSE, state->box,
@@ -1227,6 +1231,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                        time step kinetic energy for the pressure (always true now, since we want accurate statistics).
                        b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
                        EkinAveVel because it's needed for the pressure */
+                    wallcycle_start(wcycle, ewcUPDATE);
                 }
                 /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
                 if (!bInitStep)
@@ -1240,7 +1245,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     {
                         if (bExchanged)
                         {
-
+                            wallcycle_stop(wcycle, ewcUPDATE);
                             /* We need the kinetic energy at minus the half step for determining
                              * the full step kinetic energy and possibly for T-coupling.*/
                             /* This may not be quite working correctly yet . . . . */
@@ -1249,6 +1254,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                             constr, NULL, FALSE, state->box,
                                             top_global, &bSumEkinhOld,
                                             CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+                            wallcycle_start(wcycle, ewcUPDATE);
                         }
                     }
                 }
@@ -1278,6 +1284,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             {
                 copy_rvecn(cbuf, state->v, 0, state->natoms);
             }
+            wallcycle_stop(wcycle, ewcUPDATE);
         }
 
         /* MRS -- now done iterating -- compute the conserved quantity */
@@ -1320,7 +1327,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
                                  ir, state, state_global, top_global, fr,
                                  outf, mdebin, ekind, f, f_global,
-                                 wcycle, &nchkpt,
+                                 &nchkpt,
                                  bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT),
                                  bSumEkinhOld);
         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
@@ -1902,6 +1909,21 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     }
                     dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
 
+                    if (bPMETuneRunning &&
+                        fr->nbv->bUseGPU && DOMAINDECOMP(cr) &&
+                        !(cr->duty & DUTY_PME))
+                    {
+                        /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+                         * With GPUs + separate PME ranks, we don't want DLB.
+                         * This could happen when we scan coarse grids and
+                         * it would then never be turned off again.
+                         * This would hurt performance at the final, optimal
+                         * grid spacing, where DLB almost never helps.
+                         * Also, DLB can limit the cut-off for PME tuning.
+                         */
+                        dd_dlb_set_lock(cr->dd, TRUE);
+                    }
+
                     if (bPMETuneRunning || step_rel > ir->nstlist*50)
                     {
                         bPMETuneTry     = FALSE;
@@ -1932,6 +1954,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     {
                         calc_enervirdiff(NULL, ir->eDispCorr, fr);
                     }
+
+                    if (!bPMETuneRunning &&
+                        DOMAINDECOMP(cr) &&
+                        dd_dlb_is_locked(cr->dd))
+                    {
+                        /* Unlock the DLB=auto, DLB is allowed to activate
+                         * (but we don't expect it to activate in most cases).
+                         */
+                        dd_dlb_set_lock(cr->dd, FALSE);
+                    }
                 }
                 cycles_pmes = 0;
             }
@@ -1962,6 +1994,10 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     /* End of main MD loop */
     debug_gmx();
 
+    /* Closing TNG files can include compressing data. Therefore it is good to do that
+     * before stopping the time measurements. */
+    mdoutf_tng_close(outf);
+
     /* Stop measuring walltime */
     walltime_accounting_end(walltime_accounting);