From 5ac98d18f9e4a30a7b06909b0c884d2db32eb3f1 Mon Sep 17 00:00:00 2001 From: Giovanni Bussi <giovanni.bussi@gmail.com> Date: Fri, 14 Feb 2014 16:31:17 +0100 Subject: [PATCH] Further optimization with openmp Now also looks for env vars: PLUMED_NUM_THREADS PLUMED_CACHELINE_SIZE (cherry picked from commit 9c8bb507446521ff03b4e6581a99c6d183a2ab54) Conflicts: src/core/Atoms.cpp --- src/colvar/CoordinationBase.cpp | 22 ++++++-- src/core/Atoms.cpp | 56 +++++++++++++++----- src/core/Atoms.h | 3 ++ src/core/MDAtoms.cpp | 86 +++++++++++++++++++++++------- src/core/MDAtoms.h | 4 ++ src/core/PlumedMain.cpp | 3 ++ src/core/PlumedMain.h | 11 ++++ src/tools/OpenMP.cpp | 24 +++++++++ src/tools/OpenMP.h | 93 +++++++++++++++++++++++++++++++++ 9 files changed, 265 insertions(+), 37 deletions(-) create mode 100644 src/tools/OpenMP.cpp create mode 100644 src/tools/OpenMP.h diff --git a/src/colvar/CoordinationBase.cpp b/src/colvar/CoordinationBase.cpp index 50dc36eef..a963b5102 100644 --- a/src/colvar/CoordinationBase.cpp +++ b/src/colvar/CoordinationBase.cpp @@ -22,6 +22,7 @@ #include "CoordinationBase.h" #include "tools/NeighborList.h" #include "tools/Communicator.h" +#include "tools/OpenMP.h" #include <string> @@ -149,7 +150,14 @@ void CoordinationBase::calculate() rank=comm.Get_rank(); } - for(unsigned int i=rank;i<nl->size();i+=stride) { // sum over close pairs +#pragma omp parallel num_threads(OpenMP::getNumThreads()) +{ + const unsigned nn=nl->size(); + std::vector<Vector> omp_deriv(getPositions().size()); + Tensor omp_virial; + +#pragma omp for reduction(+:ncoord) nowait + for(unsigned int i=rank;i<nn;i+=stride) { Vector distance; unsigned i0=nl->getClosePair(i).first; @@ -166,10 +174,16 @@ void CoordinationBase::calculate() double dfunc=0.; ncoord += pairing(distance.modulo2(), dfunc,i0,i1); - deriv[i0] = deriv[i0] + (-dfunc)*distance ; - deriv[i1] = deriv[i1] + dfunc*distance ; - virial=virial+(-dfunc)*Tensor(distance,distance); + omp_deriv[i0] = omp_deriv[i0] + (-dfunc)*distance ; + omp_deriv[i1] = omp_deriv[i1] + dfunc*distance ; + omp_virial=omp_virial+(-dfunc)*Tensor(distance,distance); } +#pragma omp critical + { + for(int i=0;i<getPositions().size();i++) deriv[i]+=omp_deriv[i]; + virial+=omp_virial; +} +} if(!serial){ comm.Sum(ncoord); diff --git a/src/core/Atoms.cpp b/src/core/Atoms.cpp index baafae52d..88b395da4 100644 --- a/src/core/Atoms.cpp +++ b/src/core/Atoms.cpp @@ -23,6 +23,7 @@ #include "ActionAtomistic.h" #include "MDAtoms.h" #include "PlumedMain.h" +#include "tools/OpenMP.h" #include "tools/Pbc.h" #include <algorithm> #include <iostream> @@ -52,6 +53,8 @@ Atoms::Atoms(PlumedMain&plumed): forcesHaveBeenSet(0), virialHasBeenSet(false), massAndChargeOK(false), + asyncSent(false), + atomsNeeded(false), plumed(plumed), naturalUnits(false), timestep(0.0), @@ -141,10 +144,11 @@ void Atoms::share(){ shareAll(); return; } - if(dd && int(gatindex.size())<natoms){ - for(unsigned i=0;i<actions.size();i++) if(actions[i]->isActive()) { + for(unsigned i=0;i<actions.size();i++) if(actions[i]->isActive()) { + if(dd && int(gatindex.size())<natoms){ unique.insert(actions[i]->getUnique().begin(),actions[i]->getUnique().end()); } + if(!actions[i]->getUnique().empty()) atomsNeeded=true; } share(unique); } @@ -153,13 +157,36 @@ void Atoms::shareAll(){ std::set<AtomNumber> unique; if(dd && int(gatindex.size())<natoms) for(int i=0;i<natoms;i++) unique.insert(AtomNumber::index(i)); + atomsNeeded=true; share(unique); } void Atoms::share(const std::set<AtomNumber>& unique){ plumed_assert( positionsHaveBeenSet==3 && massesHaveBeenSet ); + virial.zero(); + if(int(gatindex.size())==natoms){ +#pragma omp parallel for num_threads(OpenMP::getGoodNumThreads(forces)) + for(unsigned i=0;i<natoms;i++) forces[i].zero(); + } else { + for(unsigned i=0;i<gatindex.size();i++) forces[gatindex[i]].zero(); + } + for(unsigned i=getNatoms();i<positions.size();i++) forces[i].zero(); // virtual atoms + forceOnEnergy=0.0; mdatoms->getBox(box); - mdatoms->getPositions(gatindex,positions); + + if(!atomsNeeded) return; + + atomsNeeded=false; + + mdatoms->getMasses(gatindex,masses); + mdatoms->getCharges(gatindex,charges); + if(int(gatindex.size())==natoms){ +// faster version, which retrieves all atoms + mdatoms->getPositions(0,natoms,positions); + } else { +// version that picks only atoms that are available on this proc + mdatoms->getPositions(gatindex,positions); + } // how many double per atom should be scattered: int ndata=3; if(!massAndChargeOK){ @@ -167,6 +194,7 @@ void Atoms::share(const std::set<AtomNumber>& unique){ mdatoms->getCharges(gatindex,charges); mdatoms->getMasses(gatindex,masses); } + if(dd && int(gatindex.size())<natoms){ if(dd.async){ for(unsigned i=0;i<dd.mpi_request_positions.size();i++) dd.mpi_request_positions[i].wait(); @@ -187,6 +215,7 @@ void Atoms::share(const std::set<AtomNumber>& unique){ } } if(dd.async){ + asyncSent=true; dd.mpi_request_positions.resize(dd.Get_size()); dd.mpi_request_index.resize(dd.Get_size()); for(int i=0;i<dd.Get_size();i++){ @@ -218,10 +247,6 @@ void Atoms::share(const std::set<AtomNumber>& unique){ } } } - virial.zero(); - for(unsigned i=0;i<gatindex.size();i++) forces[gatindex[i]].zero(); - for(unsigned i=getNatoms();i<positions.size();i++) forces[i].zero(); // virtual atoms - forceOnEnergy=0.0; } void Atoms::wait(){ @@ -238,9 +263,10 @@ void Atoms::wait(){ if(collectEnergy) energy=md_energy; if(dd && int(gatindex.size())<natoms){ + if(collectEnergy) dd.Sum(&energy,1); // receive toBeReceived - Communicator::Status status; - if(dd.async){ + if(asyncSent){ + Communicator::Status status; int count=0; for(int i=0;i<dd.Get_size();i++){ dd.Recv(&dd.indexToBeReceived[count],dd.indexToBeReceived.size()-count,i,666,status); @@ -257,6 +283,7 @@ void Atoms::wait(){ charges[dd.indexToBeReceived[i]] =dd.positionsToBeReceived[ndata*i+4]; } } + asyncSent=false; } if(collectEnergy) dd.Sum(energy); } @@ -304,6 +331,7 @@ void Atoms::DomainDecomposition::enable(Communicator& c){ on=true; Set_comm(c.Get_comm()); async=Get_size()<10; +// async=false; } void Atoms::setAtomsNlocal(int n){ @@ -321,8 +349,10 @@ void Atoms::setAtomsGatindex(int*g){ ddStep=plumed.getStep(); plumed_massert( g || gatindex.size()==0, "NULL gatindex pointer with non-zero local atoms"); for(unsigned i=0;i<gatindex.size();i++) gatindex[i]=g[i]; - for(unsigned i=0;i<dd.g2l.size();i++) dd.g2l[i]=-1; - if(dd) for(unsigned i=0;i<gatindex.size();i++) dd.g2l[gatindex[i]]=i; + if(dd){ + for(unsigned i=0;i<dd.g2l.size();i++) dd.g2l[i]=-1; + for(unsigned i=0;i<gatindex.size();i++) dd.g2l[gatindex[i]]=i; + } } void Atoms::setAtomsContiguous(int start){ @@ -461,13 +491,13 @@ double Atoms::getMDKBoltzmann()const{ } void Atoms::getLocalPositions(std::vector<Vector>& localPositions){ - mdatoms->getPositions(gatindex,positions); localPositions.resize(gatindex.size()); - for(int i=0; i<gatindex.size();i++) localPositions[i] = positions[gatindex[i]]; + mdatoms->getLocalPositions(localPositions); } void Atoms::getLocalForces(std::vector<Vector>& localForces){ localForces.resize(gatindex.size()); +#pragma omp parallel for num_threads(OpenMP::getGoodNumThreads(localForces)) for(int i=0; i<gatindex.size(); i++) localForces[i] = forces[gatindex[i]]; } diff --git a/src/core/Atoms.h b/src/core/Atoms.h index 92e297979..843e77ecf 100644 --- a/src/core/Atoms.h +++ b/src/core/Atoms.h @@ -95,6 +95,9 @@ class Atoms std::vector<const ActionAtomistic*> actions; std::vector<int> gatindex; + bool asyncSent; + bool atomsNeeded; + class DomainDecomposition: public Communicator { diff --git a/src/core/MDAtoms.cpp b/src/core/MDAtoms.cpp index 26cb4a467..251060825 100644 --- a/src/core/MDAtoms.cpp +++ b/src/core/MDAtoms.cpp @@ -23,12 +23,17 @@ #include <string> #include "MDAtoms.h" #include "tools/Tools.h" +#include "tools/OpenMP.h" #include "tools/Exception.h" +#include "PlumedMain.h" + using namespace std; namespace PLMD { + PlumedMain*plumed; + /// Class containing the pointers to the MD data /// It is templated so that single and double precision versions coexist /// IT IS STILL UNDOCUMENTED. IT PROBABLY NEEDS A STRONG CLEANUP @@ -36,8 +41,9 @@ template <class T> class MDAtomsTyped: public MDAtomsBase { - double scalep,scalef; - double scaleb,scalev; + PlumedMain*plumed; + T scalep,scalef; + T scaleb,scalev; int stride; T *m; T *c; @@ -47,6 +53,8 @@ public MDAtomsBase T *virial; public: MDAtomsTyped(); +/// Link to a plumedMain object + void link(PlumedMain&p); void setm(void*m); void setc(void*m); void setBox(void*); @@ -64,6 +72,8 @@ public: } void getBox(Tensor &)const; void getPositions(const vector<int>&index,vector<Vector>&positions)const; + void getPositions(unsigned j,unsigned k,vector<Vector>&positions)const; + void getLocalPositions(std::vector<Vector>&p)const; void getMasses(const vector<int>&index,vector<double>&)const; void getCharges(const vector<int>&index,vector<double>&)const; void updateVirial(const Tensor&)const; @@ -92,22 +102,38 @@ void MDAtomsTyped<T>::getBox(Tensor&box)const{ template <class T> void MDAtomsTyped<T>::getPositions(const vector<int>&index,vector<Vector>&positions)const{ - if(positions.size()==index.size()){ -#pragma omp parallel for - for(unsigned i=0;i<index.size();++i){ - positions[index[i]][0]=px[stride*i]*scalep; - positions[index[i]][1]=py[stride*i]*scalep; - positions[index[i]][2]=pz[stride*i]*scalep; - } - } else { - for(unsigned i=0;i<index.size();++i){ - positions[index[i]][0]=px[stride*i]*scalep; - positions[index[i]][1]=py[stride*i]*scalep; - positions[index[i]][2]=pz[stride*i]*scalep; - } +// cannot be parallelized with omp because access to positions is not ordered + for(unsigned i=0;i<index.size();++i){ + positions[index[i]][0]=px[stride*i]*scalep; + positions[index[i]][1]=py[stride*i]*scalep; + positions[index[i]][2]=pz[stride*i]*scalep; + } +} + +template <class T> +void MDAtomsTyped<T>::getPositions(unsigned j,unsigned k,vector<Vector>&positions)const{ + unsigned nt=1; + if(k>j) nt=OpenMP::getGoodNumThreads(&positions[j],(k-j)); +#pragma omp parallel for num_threads(nt) + for(unsigned i=j;i<k;++i){ + positions[i][0]=px[stride*i]*scalep; + positions[i][1]=py[stride*i]*scalep; + positions[i][2]=pz[stride*i]*scalep; + } +} + + +template <class T> +void MDAtomsTyped<T>::getLocalPositions(vector<Vector>&positions)const{ +#pragma omp parallel for num_threads(OpenMP::getGoodNumThreads(positions)) + for(unsigned i=0;i<positions.size();++i){ + positions[i][0]=px[stride*i]*scalep; + positions[i][1]=py[stride*i]*scalep; + positions[i][2]=pz[stride*i]*scalep; } } + template <class T> void MDAtomsTyped<T>::getMasses(const vector<int>&index,vector<double>&masses)const{ if(m) for(unsigned i=0;i<index.size();++i) masses[index[i]]=m[i]; @@ -127,18 +153,32 @@ void MDAtomsTyped<T>::updateVirial(const Tensor&virial)const{ template <class T> void MDAtomsTyped<T>::updateForces(const vector<int>&index,const vector<Vector>&forces){ -#pragma omp parallel for + unsigned ntx=1,nty=1,ntz=1; + if(index.size()>0) ntx=OpenMP::getGoodNumThreads(&fx[0],index.size()); + if(index.size()>0) nty=OpenMP::getGoodNumThreads(&fy[0],index.size()); + if(index.size()>0) ntz=OpenMP::getGoodNumThreads(&fz[0],index.size()); + unsigned nt=ntx; + if(nty<nt) nt=nty; + if(ntz<nt) nt=ntz; +#pragma omp parallel for num_threads(nt) for(unsigned i=0;i<index.size();++i){ - fx[stride*i]+=T(scalef*forces[index[i]][0]); - fy[stride*i]+=T(scalef*forces[index[i]][1]); - fz[stride*i]+=T(scalef*forces[index[i]][2]); + fx[stride*i]+=scalef*T(forces[index[i]][0]); + fy[stride*i]+=scalef*T(forces[index[i]][1]); + fz[stride*i]+=scalef*T(forces[index[i]][2]); } } template <class T> void MDAtomsTyped<T>::rescaleForces(const vector<int>&index,double factor){ if(virial) for(unsigned i=0;i<3;i++)for(unsigned j=0;j<3;j++) virial[3*i+j]*=T(factor); -#pragma omp parallel for + unsigned ntx=1,nty=1,ntz=1; + if(index.size()>0) ntx=OpenMP::getGoodNumThreads(&fx[0],index.size()); + if(index.size()>0) nty=OpenMP::getGoodNumThreads(&fy[0],index.size()); + if(index.size()>0) ntz=OpenMP::getGoodNumThreads(&fz[0],index.size()); + unsigned nt=ntx; + if(nty<nt) nt=nty; + if(ntz<nt) nt=ntz; +#pragma omp parallel for num_threads(nt) for(unsigned i=0;i<index.size();++i){ fx[stride*i]*=T(factor); fy[stride*i]*=T(factor); @@ -215,6 +255,7 @@ void MDAtomsTyped<T>::setc(void*c){ template <class T> MDAtomsTyped<T>::MDAtomsTyped(): + plumed(NULL), scalep(1.0), scalef(1.0), scaleb(1.0), @@ -244,5 +285,10 @@ MDAtomsBase* MDAtomsBase::create(unsigned p){ return NULL; } +template <class T> +void MDAtomsTyped<T>::link(PlumedMain&p){ + plumed=&p; +} + } diff --git a/src/core/MDAtoms.h b/src/core/MDAtoms.h index 20c040043..520f3bd2f 100644 --- a/src/core/MDAtoms.h +++ b/src/core/MDAtoms.h @@ -81,12 +81,16 @@ public: /// Retrieve selected positions. /// The operation is done in such a way that p[index[i]] is equal to the coordinates of atom i virtual void getPositions(const std::vector<int>&index,std::vector<Vector>&p)const=0; +/// Retrieve all atom positions from index i to index j. + virtual void getPositions(unsigned i,unsigned j,std::vector<Vector>&p)const=0; /// Retrieve selected masses. /// The operation is done in such a way that m[index[i]] is equal to the mass of atom i virtual void getMasses(const std::vector<int>&index,std::vector<double>&m)const=0; /// Retrieve selected charges. /// The operation is done in such a way that c[index[i]] is equal to the charge of atom i virtual void getCharges(const std::vector<int>&index,std::vector<double>&c)const=0; +/// Retrieve local positions. + virtual void getLocalPositions(std::vector<Vector>&p)const=0; /// Increment the virial by an amount v virtual void updateVirial(const Tensor&v)const=0; /// Increment the force on selected atoms. diff --git a/src/core/PlumedMain.cpp b/src/core/PlumedMain.cpp index 21c72035c..a6c4d0cf5 100644 --- a/src/core/PlumedMain.cpp +++ b/src/core/PlumedMain.cpp @@ -21,6 +21,7 @@ +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ #include "PlumedMain.h" #include "tools/Tools.h" +#include "tools/OpenMP.h" #include <cstring> #include "ActionPilot.h" #include "ActionWithValue.h" @@ -467,6 +468,8 @@ void PlumedMain::init(){ log.printf("Molecular dynamics engine: %s\n",MDEngine.c_str()); log.printf("Precision of reals: %d\n",atoms.getRealPrecision()); log.printf("Running over %d %s\n",comm.Get_size(),(comm.Get_size()>1?"nodes":"node")); + log<<"Number of threads: "<<OpenMP::getNumThreads()<<"\n"; + log<<"Cache line size: "<<OpenMP::getCachelineSize()<<"\n"; log.printf("Number of atoms: %d\n",atoms.getNatoms()); if(grex) log.printf("GROMACS-like replica exchange is on\n"); log.printf("File suffix: %s\n",getSuffix().c_str()); diff --git a/src/core/PlumedMain.h b/src/core/PlumedMain.h index 659b05ecb..8198f07e1 100644 --- a/src/core/PlumedMain.h +++ b/src/core/PlumedMain.h @@ -153,6 +153,17 @@ public: /// word list command std::map<std::string, int> word_map; +/// Get number of threads that can be used by openmp + unsigned getNumThreads()const; + +/// Get a reasonable number of threads so as to access to an array of size s located at x + template<typename T> + unsigned getGoodNumThreads(const T*x,unsigned s)const; + +/// Get a reasonable number of threads so as to access to vector v; + template<typename T> + unsigned getGoodNumThreads(const std::vector<T> & v)const; + public: PlumedMain(); // this is to access to WithCmd versions of cmd (allowing overloading of a virtual method) diff --git a/src/tools/OpenMP.cpp b/src/tools/OpenMP.cpp new file mode 100644 index 000000000..a808650df --- /dev/null +++ b/src/tools/OpenMP.cpp @@ -0,0 +1,24 @@ + +#include "OpenMP.h" +#include "Tools.h" +#include <cstdlib> + +namespace PLMD{ + + +void OpenMP::init(){ + if(std::getenv("PLUMED_NUM_THREADS")) Tools::convert(std::getenv("PLUMED_NUM_THREADS"),numThreads); + if(std::getenv("PLUMED_CACHELINE_SIZE")) Tools::convert(std::getenv("PLUMED_CACHELINE_SIZE"),cachelineSize); + initialized=true; +} + +bool OpenMP::initialized=false; + +unsigned OpenMP::numThreads=1; + +unsigned OpenMP::cachelineSize=512; + +} + + + diff --git a/src/tools/OpenMP.h b/src/tools/OpenMP.h new file mode 100644 index 000000000..9545d084b --- /dev/null +++ b/src/tools/OpenMP.h @@ -0,0 +1,93 @@ +/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + Copyright (c) 2013 The plumed team + (see the PEOPLE file at the root of the distribution for a list of names) + + See http://www.plumed-code.org for more information. + + This file is part of plumed, version 2.0. + + plumed is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + plumed is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with plumed. If not, see <http://www.gnu.org/licenses/>. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ +#ifndef __PLUMED_tools_OpenMP_h +#define __PLUMED_tools_OpenMP_h + +#include <vector> + +namespace PLMD{ + +class OpenMP{ + +/// Number of threads that can be used by openmp +static unsigned numThreads; +/// Empirical cacheline size (to be used to estimate correct number of threads) +static unsigned cachelineSize; + +static bool initialized; + +static void init(); + +public: + +/// Get number of threads that can be used by openMP +static unsigned getNumThreads(); + +/// get cacheline size +static unsigned getCachelineSize(); + +/// Get a reasonable number of threads so as to access to an array of size s located at x +template<typename T> +static unsigned getGoodNumThreads(const T*x,unsigned s); + +/// Get a reasonable number of threads so as to access to vector v; +template<typename T> +static unsigned getGoodNumThreads(const std::vector<T> & v); + +}; + +inline +unsigned OpenMP::getCachelineSize(){ + if(!initialized) init(); + return cachelineSize; +} + +inline +unsigned OpenMP::getNumThreads(){ + if(!initialized) init(); + return numThreads; +} + +template<typename T> +unsigned OpenMP::getGoodNumThreads(const T*x,unsigned n){ + if(!initialized) init(); + unsigned long p=(unsigned long) x; +// a factor two is necessary since there is no guarantee that x is aligned +// to cache line boundary + unsigned m=n/(2*cachelineSize*sizeof(T)); + if(m>numThreads) m=numThreads; + if(m==0) m=1; + return m; +} + + +template<typename T> +unsigned OpenMP::getGoodNumThreads(const std::vector<T> & v){ + if(!initialized) init(); + if(v.size()==0) return 0; + else return getGoodNumThreads(&v[0],v.size()); +} + + +} + +#endif -- GitLab