From 5fc445addb65a44c4a30fc8883affc364435ae86 Mon Sep 17 00:00:00 2001 From: Gareth Tribello <gareth.tribello@gmail.com> Date: Sat, 20 May 2017 16:02:33 +0100 Subject: [PATCH] Changed histogram so that it is calculated by default using Giovanni's formula for reweighted histogram [makedocs] --- CHANGES/v2.4.txt | 2 + regtest/analysis/rt-histo/plumed.dat | 4 +- .../analysis/rt-uweights-integral/plumed.dat | 2 +- regtest/analysis/rt0/plumed.dat | 9 ++-- .../multicolvar/rt-dens-average/plumed.dat | 4 +- src/analysis/Histogram.cpp | 44 ++++++++++++++++++- src/gridtools/ConvertToFES.cpp | 2 +- src/vesselbase/ActionWithAveraging.cpp | 14 ++++-- src/vesselbase/ActionWithAveraging.h | 4 +- src/vesselbase/AveragingVessel.cpp | 2 +- 10 files changed, 69 insertions(+), 18 deletions(-) diff --git a/CHANGES/v2.4.txt b/CHANGES/v2.4.txt index b7d6566ec..d4a44eda0 100644 --- a/CHANGES/v2.4.txt +++ b/CHANGES/v2.4.txt @@ -14,6 +14,8 @@ Changes from version 2.3 which are relevant for users: - The meaning of BIASFACTOR=1 in \ref METAD has been modified and can now be used to indicate unbiased simulations. Non-well-tempered metadynamics is BIASFACTOR=-1, which is the new default value. Notice that this has an implication on the biasfactor written in the HILLS file when doing non-well-tempered metadynamics. +- \ref HISTOGRAM : When using weights default is now to output histogram divided by number of frames from which data was taken. In addition the + UNORMALIZED flag has been replaced with the keyword NORMALIZATION, which can be set equal to true, false or ndata. - New modules: - A new EDS module have been included, contributed by Glen Hocky and Andrew White. diff --git a/regtest/analysis/rt-histo/plumed.dat b/regtest/analysis/rt-histo/plumed.dat index 8a99779ab..0c492f9bd 100755 --- a/regtest/analysis/rt-histo/plumed.dat +++ b/regtest/analysis/rt-histo/plumed.dat @@ -31,7 +31,7 @@ HISTOGRAM ... GRID_MAX=3.0 GRID_BIN=100 BANDWIDTH=0.1 - UNORMALIZED + NORMALIZATION=false LABEL=hA1u ... 
HISTOGRAM @@ -59,7 +59,7 @@ HISTOGRAM ... GRID_MAX=3.0 GRID_BIN=100 BANDWIDTH=0.1 - UNORMALIZED + NORMALIZATION=false LABEL=fhA1u ... HISTOGRAM diff --git a/regtest/analysis/rt-uweights-integral/plumed.dat b/regtest/analysis/rt-uweights-integral/plumed.dat index 4dbd98516..67c0f23b3 100644 --- a/regtest/analysis/rt-uweights-integral/plumed.dat +++ b/regtest/analysis/rt-uweights-integral/plumed.dat @@ -1,7 +1,7 @@ z1: ZANGLES GROUP=1-5 SWITCH={RATIONAL D_0=2.0 R_0=0.5 D_MAX=5.0} z2: ZXTORSIONS GROUP=1-5 SWITCH={RATIONAL D_0=2.0 R_0=0.5 D_MAX=5.0} -h: HISTOGRAM DATA=z1,z2 GRID_BIN=20,20 GRID_MIN=0,-pi GRID_MAX=pi,pi BANDWIDTH=0.2,0.2 UNORMALIZED +h: HISTOGRAM DATA=z1,z2 GRID_BIN=20,20 GRID_MIN=0,-pi GRID_MAX=pi,pi BANDWIDTH=0.2,0.2 NORMALIZATION=false iv: INTEGRATE_GRID GRID=h b: RESTRAINT ARG=iv AT=0.75 KAPPA=10.0 diff --git a/regtest/analysis/rt0/plumed.dat b/regtest/analysis/rt0/plumed.dat index db0eeb44f..c16c4b900 100755 --- a/regtest/analysis/rt0/plumed.dat +++ b/regtest/analysis/rt0/plumed.dat @@ -22,6 +22,7 @@ HISTOGRAM ... GRID_BIN=100 BANDWIDTH=0.1 LOGWEIGHTS=bias + NORMALIZATION=true LABEL=hB ... HISTOGRAM @@ -45,7 +46,7 @@ HISTOGRAM ... GRID_BIN=100 KERNEL=DISCRETE LOGWEIGHTS=bias - UNORMALIZED + NORMALIZATION=false LABEL=hD ... HISTOGRAM @@ -60,7 +61,7 @@ HISTOGRAM ... GRID_BIN=100 KERNEL=DISCRETE LOGWEIGHTS=bias-ht - UNORMALIZED + NORMALIZATION=false LABEL=hE ... HISTOGRAM @@ -76,7 +77,7 @@ HISTOGRAM ... LOGWEIGHTS=bias UPDATE_FROM=0 UPDATE_UNTIL=1 - UNORMALIZED + NORMALIZATION=false LABEL=hD1 ... HISTOGRAM @@ -91,7 +92,7 @@ HISTOGRAM ... LOGWEIGHTS=bias UPDATE_FROM=1 UPDATE_UNTIL=2 - UNORMALIZED + NORMALIZATION=false LABEL=hD2 ... 
HISTOGRAM diff --git a/regtest/multicolvar/rt-dens-average/plumed.dat b/regtest/multicolvar/rt-dens-average/plumed.dat index 30bba5a3c..873318024 100644 --- a/regtest/multicolvar/rt-dens-average/plumed.dat +++ b/regtest/multicolvar/rt-dens-average/plumed.dat @@ -3,7 +3,7 @@ UNITS NATURAL dens: COORDINATIONNUMBER SPECIESA=2 SPECIESB=3-5 SWITCH={RATIONAL D_0=1.1 R_0=0.001 D_MAX=2.0} # Print the average (whole trajectory) density with a stride of one -dens1: MULTICOLVARDENS DATA=dens ORIGIN=1 DIR=x NBINS=50 UNORMALIZED BANDWIDTH=0.2 +dens1: MULTICOLVARDENS DATA=dens ORIGIN=1 DIR=x NBINS=50 NORMALIZATION=false BANDWIDTH=0.2 DUMPGRID GRID=dens1 STRIDE=1 FILE=dens1 FMT=%8.4f # Print the average density with a stride of two DUMPGRID GRID=dens1 STRIDE=2 FILE=dens2 FMT=%8.4f @@ -15,7 +15,7 @@ dens1a: MULTICOLVARDENS DATA=dens ORIGIN=1 DIR=x NBINS=50 BANDWIDTH=0.2 DUMPGRID GRID=dens1a FILE=dens5a FMT=%8.4f # Print unormalized averages (over two frames) of the density -dens1b: MULTICOLVARDENS DATA=dens ORIGIN=1 DIR=x NBINS=50 CLEAR=1 UNORMALIZED BANDWIDTH=0.2 +dens1b: MULTICOLVARDENS DATA=dens ORIGIN=1 DIR=x NBINS=50 CLEAR=1 NORMALIZATION=false BANDWIDTH=0.2 DUMPGRID GRID=dens1b STRIDE=1 FILE=dens1b FMT=%8.4f # Print block averages (over two frames) of the density diff --git a/src/analysis/Histogram.cpp b/src/analysis/Histogram.cpp index 58e981598..fdbb41fb5 100644 --- a/src/analysis/Histogram.cpp +++ b/src/analysis/Histogram.cpp @@ -78,7 +78,46 @@ are replaced by dirac delta functions. When this method is used the final func density - it is instead a probability mass function as each element of the function tells you the value of an integral between two points on your grid rather than the value of a (continuous) function on a grid. -Additional material and examples can be also found in the tutorial \ref belfast-1. +Additional material and examples can be also found in the tutorials \ref belfast-1 and \ref lugano-1. 
+ +\par A note on block averaging and errors + +Some particularly important +issues related to the convergence of histograms and the estimation of error bars around the ensemble averages you calculate are covered in \ref trieste-2. +The technique for estimating error bars that is known as block averaging is introduced in this tutorial. The essence of this technique is that +the trajectory is split into a set of blocks and separate ensemble averages are calculated from each separate block of data. If \f$\{A_i\}\f$ is +the set of \f$N\f$ block averages that are obtained from this technique then the final error bar is calculated as: + +\f[ +\textrm{error} = \sqrt{ \frac{1}{N} \frac{1}{N-1} \sum_{i=1}^N (A_i - \langle A \rangle )^2 } \qquad \textrm{where} \qquad \langle A \rangle = \frac{1}{N} \sum_{i=1}^N A_i +\f] + +If the simulation is biased and reweighting is performed then life is a little more complex as each of the block averages should be calculated as a +weighted average. Furthermore, the weights should be taken into account when the final ensemble and error bars are calculated. As such the error should be: + +\f[ +\textrm{error} = \sqrt{ \frac{1}{N} \frac{\sum_{i=1}^N W_i }{\sum_{i=1}^N W_i - \sum_{i=1}^N W_i^2 / \sum_{i=1}^N W_i} \sum_{i=1}^N W_i (A_i - \langle A \rangle )^2 } +\f] + +where \f$W_i\f$ is the sum of all the weights for the \f$i\f$th block of data. + +If we wish to calculate a normalized histogram we must calculate ensemble averages from our biased simulation using: +\f[ + \langle H(x) \rangle = \frac{\sum_{t=1}^M w_t K( x - x_t,\sigma) }{\sum_{t=1}^M w_t} +\f] +where the sum runs over the trajectory, \f$w_t\f$ is the weight of the \f$t\f$th trajectory frame, \f$x_t\f$ is the value of the cv for the \f$t\f$th +trajectory frame and \f$K\f$ is a kernel function centered on \f$x_t\f$ with bandwidth \f$\sigma\f$. The quantity that is evaluated is the value of the +normalized histogram at point \f$x\f$. 
The following ensemble average will be calculated if you use the NORMALIZATION=true option in HISTOGRAM. +If the ensemble average is calculated in this way we must calculate the associated error bars from our block averages using the second of the expressions +above. + +A number of works have shown that when biased simulations are performed it is often better to calculate an unormalized estimate of the histogram using: +\f[ +\langle H(x) \rangle = \frac{1}{M} \sum_{t=1}^M w_t K( x - x_t,\sigma) +\f] +instead of the expression above. As such this is what is done by default in HISTOGRAM or if the NORMALIZATION=ndata option is used. +When the histogram is calculated in this second way the first of the two formula above can be used when calculating error bars from +block averages. \par Examples @@ -184,7 +223,8 @@ public: PLUMED_REGISTER_ACTION(Histogram,"HISTOGRAM") void Histogram::registerKeywords( Keywords& keys ) { - gridtools::ActionWithGrid::registerKeywords( keys ); keys.use("ARG"); + gridtools::ActionWithGrid::registerKeywords( keys ); keys.use("ARG"); keys.remove("NORMALIZATION"); + keys.add("compulsory","NORMALIZATION","ndata","This controls how the data is normalized it can be set equal to true, false or ndata. 
See above for an explanation"); keys.add("optional","DATA","input data from action with vessel and compute histogram"); keys.add("optional","VECTORS","input three dimsnional vectors for computing histogram"); keys.add("compulsory","GRID_MIN","the lower bounds for the grid"); diff --git a/src/gridtools/ConvertToFES.cpp b/src/gridtools/ConvertToFES.cpp index e28904b74..4e77f435b 100644 --- a/src/gridtools/ConvertToFES.cpp +++ b/src/gridtools/ConvertToFES.cpp @@ -81,7 +81,7 @@ void ConvertToFES::registerKeywords( Keywords& keys ) { ActionWithInputGrid::registerKeywords( keys ); keys.add("optional","TEMP","the temperature at which you are operating"); keys.remove("STRIDE"); keys.remove("KERNEL"); keys.remove("BANDWIDTH"); - keys.remove("LOGWEIGHTS"); keys.remove("CLEAR"); keys.remove("UNORMALIZED"); + keys.remove("LOGWEIGHTS"); keys.remove("CLEAR"); keys.remove("NORMALIZATION"); } ConvertToFES::ConvertToFES(const ActionOptions&ao): diff --git a/src/vesselbase/ActionWithAveraging.cpp b/src/vesselbase/ActionWithAveraging.cpp index a114cf7b1..185eb6294 100644 --- a/src/vesselbase/ActionWithAveraging.cpp +++ b/src/vesselbase/ActionWithAveraging.cpp @@ -33,7 +33,8 @@ void ActionWithAveraging::registerKeywords( Keywords& keys ) { keys.add("compulsory","CLEAR","0","the frequency with which to clear all the accumulated data. The default value " "of 0 implies that all the data will be used and that the grid will never be cleared"); keys.add("optional","LOGWEIGHTS","list of actions that calculates log weights that should be used to weight configurations when calculating averages"); - keys.addFlag("UNORMALIZED",false,"output the unaveraged quantity/quantities."); keys.remove("NUMERICAL_DERIVATIVES"); + keys.add("compulsory","NORMALIZATION","true","This controls how the data is normalized it can be set equal to true, false or ndata. 
The differences between these options are explained in the manual page for \\ref HISTOGRAM"); + keys.remove("NUMERICAL_DERIVATIVES"); } ActionWithAveraging::ActionWithAveraging( const ActionOptions& ao ): @@ -70,7 +71,13 @@ ActionWithAveraging::ActionWithAveraging( const ActionOptions& ao ): else log.printf(" weights are all equal to one\n"); requestArguments( arg ); } - if( keywords.exists("UNORMALIZED") ) parseFlag("UNORMALIZED",unormalised); + if( keywords.exists("NORMALIZATION") ){ + std::string normstr; parse("NORMALIZATION",normstr); + if( normstr=="true" ) normalization=t; + else if( normstr=="false" ) normalization=f; + else if( normstr=="ndata" ) normalization=ndata; + else error("invalid instruction for NORMALIZATION flag should be true, false, or ndata"); + } } void ActionWithAveraging::setAveragingAction( AveragingVessel* av_vessel, const bool& usetasks ) { @@ -112,7 +119,8 @@ void ActionWithAveraging::update() { // This the averaging if it is not done using task list else performOperations( true ); // Update the norm - if( myaverage ) myaverage->setNorm( cweight + myaverage->getNorm() ); + double normt = cweight; if( normalization==ndata ) normt = 1; + if( myaverage ) myaverage->setNorm( normt + myaverage->getNorm() ); // Finish the averaging finishAveraging(); // By resetting here we are ensuring that the grid will be cleared at the start of the next step diff --git a/src/vesselbase/ActionWithAveraging.h b/src/vesselbase/ActionWithAveraging.h index f9a7bbe3d..eaefa251b 100644 --- a/src/vesselbase/ActionWithAveraging.h +++ b/src/vesselbase/ActionWithAveraging.h @@ -56,7 +56,7 @@ private: /// The weights we are going to use for reweighting std::vector<Value*> weights; /// Are we accumulated the unormalized quantity - bool unormalised; + enum {t,f,ndata} normalization; protected: /// This ensures runAllTasks is used bool useRunAllTasks; @@ -103,7 +103,7 @@ std::vector<Value*> ActionWithAveraging::getArguments() { inline bool 
ActionWithAveraging::noNormalization() const { - return unormalised; + return normalization==f; } } diff --git a/src/vesselbase/AveragingVessel.cpp b/src/vesselbase/AveragingVessel.cpp index f44d22b24..93d364d2e 100644 --- a/src/vesselbase/AveragingVessel.cpp +++ b/src/vesselbase/AveragingVessel.cpp @@ -34,7 +34,7 @@ AveragingVessel::AveragingVessel( const vesselbase::VesselOptions& vo ): wascleared(true) { ActionWithAveraging* myav = dynamic_cast<ActionWithAveraging*>( getAction() ); - plumed_assert( myav ); unormalised = myav->unormalised; + plumed_assert( myav ); unormalised = myav->noNormalization(); } void AveragingVessel::finish( const std::vector<double>& buffer ) { -- GitLab