n2p2 - A neural network potential package
Training.h
Go to the documentation of this file.
1// n2p2 - A neural network potential package
2// Copyright (C) 2018 Andreas Singraber (University of Vienna)
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program. If not, see <https://www.gnu.org/licenses/>.
16
17#ifndef TRAINING_H
18#define TRAINING_H
19
20#include "Atom.h"
21#include "Dataset.h"
22#include "Stopwatch.h"
23#include "Updater.h"
24#include <cstddef> // std::size_t
25#include <fstream> // std::ofstream
26#include <map> // std::map
27#include <random> // std::mt19937_64
28#include <string> // std::string
29#include <vector> // std::vector
30
31namespace nnp
32{
33
35class Training : public Dataset
36{
37public:
40 {
46 UT_LM
47 };
48
56 {
63 //PM_DATASET,
64 /* Parallel gradient computation, update on rank 0.
65 *
66 * Data set is distributed via MPI, each tasks selects energy/force
67 * update candidates, and computes errors and gradients, which are
68 * collected on rank 0. Weight update is carried out on rank 0 and new
69 * weights are redistributed to all tasks.
70 */
81 };
82
85 {
92 };
93
96 {
101 };
102
105 {
112 };
113
115 Training();
117 ~Training();
122 void selectSets();
126 void writeSetsToFiles();
129 void initializeWeights();
135 = US_COMBINED);
140 void setStage(std::size_t stage);
146 void setupTraining();
149 std::vector<
150 std::string> setupNumericDerivCheck();
161 void calculateError(
162 std::map<std::string,
163 std::pair<
164 std::string, std::string>> const fileNames);
170 void calculateErrorEpoch();
173 void printHeader();
176 void printEpoch();
182 void writeWeights(
183 std::string const& nnName,
184 std::string const& fileNameFormat) const;
187 void writeWeightsEpoch() const;
193 void writeLearningCurve(bool append,
194 std::string const fileName
195 = "learning-curve.out") const;
202 std::string const& nnName,
203 std::string const& fileName) const;
206 void writeNeuronStatisticsEpoch() const;
215 void writeUpdaterStatus(bool append,
216 std::string const fileNameFormat
217 = "updater.%03zu.out") const;
222 void sortUpdateCandidates(std::string const& property);
227 void shuffleUpdateCandidates(std::string const& property);
230 void checkSelectionMode();
233 void loop();
236 void setEpochSchedule();
241 void update(std::string const& property);
252 double getSingleWeight(std::size_t element,
253 std::size_t index);
263 void setSingleWeight(std::size_t element,
264 std::size_t index,
265 double value);
275 std::vector<
276 std::vector<double> > calculateWeightDerivatives(Structure* structure);
288 std::vector<
289 std::vector<double> > calculateWeightDerivatives(Structure* structure,
290 std::size_t atom,
291 std::size_t component);
296 void setTrainingLogFileName(std::string fileName);
303 std::size_t getNumConnections(std::string id = "short") const;
310 std::vector<
311 std::size_t> getNumConnectionsPerElement(
312 std::string id = "short") const;
319 std::vector<
320 std::size_t> getConnectionOffsets(std::string id = "short") const;
329 void dPdc(std::string property,
330 Structure& structure,
331 std::vector<std::vector<double>>& dEdc);
341 void dPdcN(
342 std::string property,
343 Structure& structure,
344 std::vector<std::vector<double>>& dEdc,
345 double delta = 1.0E-4);
346
347private:
350 {
352 std::size_t s;
354 std::size_t a;
356 std::size_t c;
358 double error;
359
361 bool operator<(UpdateCandidate const& rhs) const {
362 return this->error > rhs.error;
363 }
364 };
365
367 struct Property
368 {
370 Property(std::string const& property);
371
373 std::string property;
375 std::string displayMetric;
377 std::string tiny;
379 std::string plural;
383 std::size_t numTrainPatterns;
385 std::size_t numTestPatterns;
387 std::size_t taskBatchSize;
389 std::size_t writeCompEvery;
391 std::size_t writeCompAlways;
397 std::size_t countUpdates;
399 std::size_t numUpdates;
401 std::size_t patternsPerUpdate;
405 std::size_t numErrorsGlobal;
411 std::vector<int> errorsPerTask;
413 std::vector<int> offsetPerTask;
415 std::vector<std::string> errorMetrics;
417 std::map<
418 std::string, double> errorTrain;
420 std::map<
421 std::string, double> errorTest;
423 std::vector<UpdateCandidate> updateCandidates;
425 std::vector<
426 std::vector<int>> weightsPerTask;
428 std::vector<
429 std::vector<int>> offsetJacobian;
431 std::vector<
432 std::vector<double>> error;
434 std::vector<
435 std::vector<double>> jacobian;
437 std::map<
439 };
440
442 struct PropertyMap : std::map<std::string, Property>
443 {
445 Property& operator[](std::string const& key) {return this->at(key);}
447 Property const& operator[](std::string const& key) const
448 {
449 return this->at(key);
450 }
452 bool exists(std::string const& key)
453 {
454 return (this->find(key) != this->end());
455 }
456 };
457
479 std::size_t stage;
481 std::size_t numUpdaters;
483 std::size_t numEpochs;
485 std::size_t epoch;
487 std::size_t writeWeightsEvery;
495 std::size_t countUpdates;
497 std::size_t numWeights;
503 std::string nnId;
505 std::ofstream trainingLog;
507 std::vector<int> epochSchedule;
509 std::vector<std::size_t> numWeightsPerUpdater;
511 std::vector<std::size_t> weightsOffset;
513 std::vector<std::string> pk;
514#ifndef N2P2_FULL_SFD_MEMORY
517 std::vector<double> dGdxia;
518#endif
520 std::vector<
521 std::vector<double> > weights;
523 std::vector<Updater*> updaters;
525 std::map<
526 std::string, Stopwatch> sw;
528 std::mt19937_64 rngNew;
530 std::mt19937_64 rngGlobalNew;
533
534
539 bool advance() const;
542 void getWeights();
545 void setWeights();
554 void addTrainingLogEntry(int proc,
555 std::size_t il,
556 double f,
557 std::size_t isg,
558 std::size_t is);
569 void addTrainingLogEntry(int proc,
570 std::size_t il,
571 double f,
572 std::size_t isg,
573 std::size_t is,
574 std::size_t ia,
575 std::size_t ic);
585 void addTrainingLogEntry(int proc,
586 std::size_t il,
587 double f,
588 std::size_t isg,
589 std::size_t is,
590 std::size_t ia);
591#ifndef N2P2_FULL_SFD_MEMORY
610 void collectDGdxia(Atom const& atom,
611 std::size_t indexAtom,
612 std::size_t indexComponent);
613#endif
618 void randomizeNeuralNetworkWeights(std::string const& type);
623 void setupSelectionMode(std::string const& property);
628 void setupFileOutput(std::string const& type);
633 void setupUpdatePlan(std::string const& property);
638 void allocateArrays(std::string const& property);
644 void writeTimingData(bool append,
645 std::string const fileName = "timing.out");
646};
647
648}
649
650#endif
Collect and process large data sets.
Definition: Dataset.h:35
Implements a simple stopwatch on different platforms.
Definition: Stopwatch.h:39
Training methods.
Definition: Training.h:36
void setTrainingLogFileName(std::string fileName)
Set training log file name.
Definition: Training.cpp:2842
UpdaterType updaterType
Updater type used.
Definition: Training.h:459
std::vector< double > dGdxia
Derivative of symmetry functions with respect to one specific atom coordinate.
Definition: Training.h:517
std::size_t epoch
Current epoch.
Definition: Training.h:485
std::vector< std::size_t > numWeightsPerUpdater
Number of weights per updater.
Definition: Training.h:509
std::size_t writeWeightsAlways
Up to this epoch weights are written every epoch.
Definition: Training.h:489
void randomizeNeuralNetworkWeights(std::string const &type)
Randomly initialize specificy neural network weights.
Definition: Training.cpp:3176
std::size_t numWeights
Total number of weights.
Definition: Training.h:497
bool useForces
Use forces for training.
Definition: Training.h:471
std::size_t writeWeightsEvery
Write weights every this many epochs.
Definition: Training.h:487
void setupTraining()
General training settings and setup of weight update routine.
Definition: Training.cpp:703
std::mt19937_64 rngGlobalNew
Global random number generator.
Definition: Training.h:530
std::string nnId
ID of neural network the training is working on.
Definition: Training.h:503
void allocateArrays(std::string const &property)
Allocate error and Jacobian arrays for given property.
Definition: Training.cpp:3515
void writeWeightsEpoch() const
Write weights to files during training loop.
Definition: Training.cpp:1576
std::ofstream trainingLog
Training log file.
Definition: Training.h:505
bool hasUpdaters
If this rank performs weight updates.
Definition: Training.h:467
void checkSelectionMode()
Check if selection mode should be changed.
Definition: Training.cpp:1961
std::string trainingLogFileName
File name for training log.
Definition: Training.h:501
JacobianMode jacobianMode
Jacobian mode used.
Definition: Training.h:463
void selectSets()
Randomly select training and test set structures.
Definition: Training.cpp:81
JacobianMode
Jacobian matrix preparation mode.
Definition: Training.h:85
@ JM_SUM
No Jacobian, sum up contributions from update candidates.
Definition: Training.h:87
@ JM_TASK
Prepare one Jacobian entry for each task, sum up within tasks.
Definition: Training.h:89
@ JM_FULL
Prepare full Jacobian matrix.
Definition: Training.h:91
std::size_t countUpdates
Update counter (for all training quantities together).
Definition: Training.h:495
std::map< std::string, Stopwatch > sw
Stopwatches for timing overview.
Definition: Training.h:526
void setupSelectionMode(std::string const &property)
Set selection mode for specific training property.
Definition: Training.cpp:3235
void dPdc(std::string property, Structure &structure, std::vector< std::vector< double > > &dEdc)
Compute derivatives of property with respect to weights.
Definition: Training.cpp:2884
void printHeader()
Print training loop header on screen.
Definition: Training.cpp:1455
std::mt19937_64 rngNew
Per-task random number generator.
Definition: Training.h:528
PropertyMap p
Actual training properties.
Definition: Training.h:532
std::vector< Updater * > updaters
Weight updater (combined or for each element).
Definition: Training.h:523
std::size_t writeNeuronStatisticsAlways
Up to this epoch neuron statistics are written every epoch.
Definition: Training.h:493
UpdaterType
Type of update routine.
Definition: Training.h:40
@ UT_GD
Simple gradient descent methods.
Definition: Training.h:42
@ UT_KF
Kalman filter-based methods.
Definition: Training.h:44
@ UT_LM
Levenberg-Marquardt algorithm.
Definition: Training.h:46
double forceWeight
Force update weight.
Definition: Training.h:499
void update(std::string const &property)
Perform one update.
Definition: Training.cpp:2096
void dataSetNormalization()
Apply normalization based on initial weights prediction.
Definition: Training.cpp:398
ParallelMode
Training parallelization mode.
Definition: Training.h:56
@ PM_TRAIN_RK0
No training parallelization, only data set distribution.
Definition: Training.h:71
@ PM_TRAIN_ALL
Parallel gradient computation, update on each task.
Definition: Training.h:80
bool writeTrainingLog
Whether training log file is written.
Definition: Training.h:477
std::size_t numUpdaters
Number of updaters (depends on update strategy).
Definition: Training.h:481
bool advance() const
Check if training loop should be continued.
Definition: Training.cpp:3038
~Training()
Destructor, updater vector needs to be cleaned.
Definition: Training.cpp:63
void sortUpdateCandidates(std::string const &property)
Sort update candidates with descending RMSE.
Definition: Training.cpp:1892
void writeNeuronStatisticsEpoch() const
Write neuron statistics during training loop.
Definition: Training.cpp:1807
SelectionMode
How update candidates are selected during Training.
Definition: Training.h:105
@ SM_RANDOM
Select candidates randomly.
Definition: Training.h:107
@ SM_THRESHOLD
Select candidates randomly with RMSE above threshold.
Definition: Training.h:111
@ SM_SORT
Sort candidates according to their RMSE and pick worst first.
Definition: Training.h:109
std::size_t stage
Training stage.
Definition: Training.h:479
void collectDGdxia(Atom const &atom, std::size_t indexAtom, std::size_t indexComponent)
Collect derivative of symmetry functions with repect to one atom's coordinate.
Definition: Training.cpp:3138
void setWeights()
Set weights in neural network class.
Definition: Training.cpp:3068
void loop()
Execute main training loop.
Definition: Training.cpp:1994
void printEpoch()
Print preferred error metric and timing information on screen.
Definition: Training.cpp:1513
std::vector< std::size_t > weightsOffset
Offset of each element's weights in combined array.
Definition: Training.h:511
std::size_t numEpochs
Number of epochs requested.
Definition: Training.h:483
void setupFileOutput(std::string const &type)
Set file output intervals for properties and other quantities.
Definition: Training.cpp:3343
std::size_t getNumConnections(std::string id="short") const
Get total number of NN connections.
Definition: Training.cpp:2849
void addTrainingLogEntry(int proc, std::size_t il, double f, std::size_t isg, std::size_t is)
Write energy update data to training log file.
Definition: Training.cpp:3093
void writeTimingData(bool append, std::string const fileName="timing.out")
Write timing data for all clocks.
Definition: Training.cpp:3553
std::size_t writeNeuronStatisticsEvery
Write neuron statistics every this many epochs.
Definition: Training.h:491
void getWeights()
Get weights from neural network class.
Definition: Training.cpp:3044
void dPdcN(std::string property, Structure &structure, std::vector< std::vector< double > > &dEdc, double delta=1.0E-4)
Compute numeric derivatives of property with respect to weights.
Definition: Training.cpp:2956
std::vector< std::string > setupNumericDerivCheck()
Set up numeric weight derivatives check.
Definition: Training.cpp:1170
void setSingleWeight(std::size_t element, std::size_t index, double value)
Set a single weight value.
Definition: Training.cpp:2744
void shuffleUpdateCandidates(std::string const &property)
Shuffle update candidates.
Definition: Training.cpp:1922
double getSingleWeight(std::size_t element, std::size_t index)
Get a single weight value.
Definition: Training.cpp:2737
void initializeWeights()
Initialize weights for all elements.
Definition: Training.cpp:259
void initializeWeightsMemory(UpdateStrategy updateStrategy=US_COMBINED)
Initialize weights vector according to update strategy.
Definition: Training.cpp:314
bool repeatedEnergyUpdates
After force update perform energy update for corresponding structure.
Definition: Training.h:473
UpdateStrategy updateStrategy
Update strategy used.
Definition: Training.h:465
void calculateNeighborLists()
Calculate neighbor lists for all structures.
Definition: Training.cpp:1206
ParallelMode parallelMode
Parallelization mode used.
Definition: Training.h:461
void writeNeuronStatistics(std::string const &nnName, std::string const &fileName) const
Write neuron statistics collected since last invocation.
Definition: Training.cpp:1708
std::vector< std::size_t > getConnectionOffsets(std::string id="short") const
Get offsets of NN connections for each element.
Definition: Training.cpp:2871
std::vector< int > epochSchedule
Update schedule epoch (false = energy update, true = force update).
Definition: Training.h:507
void writeUpdaterStatus(bool append, std::string const fileNameFormat="updater.%03zu.out") const
Write updater information to file.
Definition: Training.cpp:1857
UpdateStrategy
Update strategies available for Training.
Definition: Training.h:96
@ US_COMBINED
One combined updater for all elements.
Definition: Training.h:98
@ US_ELEMENT
Separate updaters for individual elements.
Definition: Training.h:100
std::vector< std::size_t > getNumConnectionsPerElement(std::string id="short") const
Get number of NN connections for each element.
Definition: Training.cpp:2860
void resetNeuronStatistics()
Reset neuron statistics for all elements.
Definition: Training.cpp:1847
std::vector< std::vector< double > > calculateWeightDerivatives(Structure *structure)
Calculate derivatives of energy with respect to weights.
Definition: Training.cpp:2753
void calculateError(std::map< std::string, std::pair< std::string, std::string > > const fileNames)
Calculate error metrics for all structures.
Definition: Training.cpp:1246
void writeLearningCurve(bool append, std::string const fileName="learning-curve.out") const
Write current RMSEs and epoch information to file.
Definition: Training.cpp:1595
void calculateErrorEpoch()
Calculate error metrics per epoch for all structures with file names used in training loop.
Definition: Training.cpp:1425
std::vector< std::string > pk
Vector of actually used training properties.
Definition: Training.h:513
void setEpochSchedule()
Select energies/forces schedule for one epoch.
Definition: Training.cpp:1933
bool hasStructures
If this rank holds structure information.
Definition: Training.h:469
std::vector< std::vector< double > > weights
Neural network weights and biases for each element.
Definition: Training.h:521
bool freeMemory
Free symmetry function memory after calculation.
Definition: Training.h:475
void writeSetsToFiles()
Write training and test set to separate files (train.data and test.data, same format as input....
Definition: Training.cpp:197
void writeWeights(std::string const &nnName, std::string const &fileNameFormat) const
Write weights to files (one file for each element).
Definition: Training.cpp:1559
Training()
Constructor.
Definition: Training.cpp:37
void setStage(std::size_t stage)
Set training stage (if multiple stages are needed for NNP type).
Definition: Training.cpp:361
void setupUpdatePlan(std::string const &property)
Set up how often properties are updated.
Definition: Training.cpp:3405
Definition: Atom.h:28
Storage for a single atom.
Definition: Atom.h:32
Storage for one atomic configuration.
Definition: Structure.h:34
Map of all training properties.
Definition: Training.h:443
Property const & operator[](std::string const &key) const
Overload [] operator to simplify access (const version).
Definition: Training.h:447
Property & operator[](std::string const &key)
Overload [] operator to simplify access.
Definition: Training.h:445
bool exists(std::string const &key)
Check if property is present.
Definition: Training.h:452
Specific training quantity (e.g. energies, forces, charges).
Definition: Training.h:368
std::size_t numErrorsGlobal
Global number of errors per update.
Definition: Training.h:405
std::size_t numTrainPatterns
Number of training patterns in set.
Definition: Training.h:383
std::vector< int > errorsPerTask
Errors per task for each update.
Definition: Training.h:411
std::size_t writeCompAlways
Up to this epoch comparison is written every epoch.
Definition: Training.h:391
std::string property
Copy of identifier within Property map.
Definition: Training.h:373
SelectionMode selectionMode
Selection mode for update candidates.
Definition: Training.h:381
std::size_t numUpdates
Number of desired updates per epoch.
Definition: Training.h:399
double rmseThreshold
RMSE threshold for update candidates.
Definition: Training.h:409
std::string plural
Plural string of property;.
Definition: Training.h:379
double epochFraction
Desired update fraction per epoch.
Definition: Training.h:407
std::size_t patternsPerUpdate
Patterns used per update.
Definition: Training.h:401
std::size_t taskBatchSize
Batch size for each MPI task.
Definition: Training.h:387
std::vector< std::vector< double > > error
Global error vector (per updater).
Definition: Training.h:432
std::string displayMetric
Error metric for display.
Definition: Training.h:375
std::string tiny
Tiny abbreviation string for property.
Definition: Training.h:377
std::vector< std::string > errorMetrics
Error metrics available for this property.
Definition: Training.h:415
Property(std::string const &property)
Constructor.
Definition: Training.cpp:3650
std::vector< std::vector< int > > offsetJacobian
Stride for Jacobians per task per updater.
Definition: Training.h:429
std::size_t numTestPatterns
Number of training patterns in set.
Definition: Training.h:385
std::size_t countUpdates
Counter for updates per epoch.
Definition: Training.h:397
std::size_t writeCompEvery
Write comparison every this many epochs.
Definition: Training.h:389
std::vector< std::vector< int > > weightsPerTask
Weights per task per updater.
Definition: Training.h:426
std::vector< int > offsetPerTask
Offset for combined error per task.
Definition: Training.h:413
std::vector< UpdateCandidate > updateCandidates
Vector with indices of training patterns.
Definition: Training.h:423
std::map< std::string, double > errorTrain
Current error metrics of training patterns.
Definition: Training.h:418
std::vector< std::vector< double > > jacobian
Global Jacobian (per updater).
Definition: Training.h:435
std::size_t patternsPerUpdateGlobal
Patterns used per update (summed over all MPI tasks).
Definition: Training.h:403
std::map< std::string, double > errorTest
Current error metrics of test patterns.
Definition: Training.h:421
std::map< std::size_t, SelectionMode > selectionModeSchedule
Schedule for varying selection mode.
Definition: Training.h:438
std::size_t rmseThresholdTrials
Maximum trials for SM_THRESHOLD selection mode.
Definition: Training.h:395
std::size_t posUpdateCandidates
Current position in update candidate list (SM_SORT).
Definition: Training.h:393
Contains location of one update candidate (energy or force).
Definition: Training.h:350
bool operator<(UpdateCandidate const &rhs) const
Overload < operator to sort in descending order.
Definition: Training.h:361
std::size_t a
Atom index (only used for force candidates).
Definition: Training.h:354
double error
Absolute value of error with respect to reference value.
Definition: Training.h:358
std::size_t c
Component index (x,y,z -> 0,1,2, only used for force candidates).
Definition: Training.h:356
std::size_t s
Structure index.
Definition: Training.h:352