n2p2 - A neural network potential package
Training.cpp
// n2p2 - A neural network potential package
// Copyright (C) 2018 Andreas Singraber (University of Vienna)
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

#include "Training.h"
#include "GradientDescent.h"
#include "KalmanFilter.h"
#include "NeuralNetwork.h"
#include "utility.h"
#include "mpi-extra.h"
#ifdef _OPENMP
#include <omp.h>
#endif
#include <algorithm> // std::sort, std::fill, std::find
#include <cmath>     // fabs
#include <cstdlib>   // atoi
#include <gsl/gsl_rng.h>
#include <limits>    // std::numeric_limits
#include <stdexcept> // std::runtime_error, std::range_error
#include <utility>   // std::piecewise_construct, std::forward_as_tuple

using namespace std;
using namespace nnp;
Training::Training() : Dataset(),
                       updaterType                (UT_GD          ),
                       parallelMode               (PM_TRAIN_RK0   ),
                       jacobianMode               (JM_SUM         ),
                       updateStrategy             (US_COMBINED    ),
                       hasUpdaters                (false          ),
                       hasStructures              (false          ),
                       useForces                  (false          ),
                       repeatedEnergyUpdates      (false          ),
                       freeMemory                 (false          ),
                       writeTrainingLog           (false          ),
                       stage                      (0              ),
                       numUpdaters                (0              ),
                       numEpochs                  (0              ),
                       epoch                      (0              ),
                       writeWeightsEvery          (0              ),
                       writeWeightsAlways         (0              ),
                       writeNeuronStatisticsEvery (0              ),
                       writeNeuronStatisticsAlways(0              ),
                       numWeights                 (0              ),
                       forceWeight                (0.0            ),
                       trainingLogFileName        ("train-log.out")
{
    sw["setup"].start();
}

Training::~Training()
{
    for (vector<Updater*>::iterator it = updaters.begin();
         it != updaters.end(); ++it)
    {
        if (updaterType == UT_GD)
        {
            delete dynamic_cast<GradientDescent*>(*it);
        }
        else if (updaterType == UT_KF)
        {
            delete dynamic_cast<KalmanFilter*>(*it);
        }
    }

    if (trainingLog.is_open()) trainingLog.close();
}

void Training::selectSets()
{
    log << "\n";
    log << "*** DEFINE TRAINING/TEST SETS ***********"
           "**************************************\n";
    log << "\n";

    vector<string> pCheck = {"force", "charge"};
    bool check = false;
    for (auto k : pk)
    {
        check |= (find(pCheck.begin(), pCheck.end(), k) != pCheck.end());
    }
    vector<size_t> numAtomsPerElement(numElements, 0);

    double testSetFraction = atof(settings["test_fraction"].c_str());
    log << strpr("Desired test set ratio      : %f\n", testSetFraction);
    if (structures.size() > 0) hasStructures = true;
    else hasStructures = false;

    string k;
    for (size_t i = 0; i < structures.size(); ++i)
    {
        Structure& s = structures.at(i);
        // Only select set if not already determined.
        if (s.sampleType == Structure::ST_UNKNOWN)
        {
            double const r = gsl_rng_uniform(rng);
            if (r < testSetFraction) s.sampleType = Structure::ST_TEST;
            else s.sampleType = Structure::ST_TRAINING;
        }
        if (s.sampleType == Structure::ST_TEST)
        {
            size_t const& na = s.numAtoms;
            k = "energy"; if (p.exists(k)) p[k].numTestPatterns++;
            k = "force";  if (p.exists(k)) p[k].numTestPatterns += 3 * na;
            k = "charge"; if (p.exists(k)) p[k].numTestPatterns += na;
        }
        else if (s.sampleType == Structure::ST_TRAINING)
        {
            for (size_t j = 0; j < numElements; ++j)
            {
                numAtomsPerElement.at(j) += s.numAtomsPerElement.at(j);
            }

            k = "energy";
            if (p.exists(k))
            {
                p[k].numTrainPatterns++;
                p[k].updateCandidates.push_back(UpdateCandidate());
                p[k].updateCandidates.back().s = i;
            }
            k = "force";
            if (p.exists(k))
            {
                p[k].numTrainPatterns += 3 * s.numAtoms;
                for (vector<Atom>::const_iterator it = s.atoms.begin();
                     it != s.atoms.end(); ++it)
                {
                    for (size_t j = 0; j < 3; ++j)
                    {
                        p[k].updateCandidates.push_back(UpdateCandidate());
                        p[k].updateCandidates.back().s = i;
                        p[k].updateCandidates.back().a = it->index;
                        p[k].updateCandidates.back().c = j;
                    }
                }
            }
            k = "charge";
            if (p.exists(k))
            {
                p[k].numTrainPatterns += s.numAtoms;
                for (vector<Atom>::const_iterator it = s.atoms.begin();
                     it != s.atoms.end(); ++it)
                {
                    p[k].updateCandidates.push_back(UpdateCandidate());
                    p[k].updateCandidates.back().s = i;
                    p[k].updateCandidates.back().a = it->index;
                }
            }
        }
        else
        {
            log << strpr("WARNING: Structure %zu not assigned to either "
                         "training or test set.\n", s.index);
        }
    }
    for (size_t i = 0; i < numElements; ++i)
    {
        if (hasStructures && (numAtomsPerElement.at(i) == 0))
        {
            log << strpr("WARNING: Process %d has no atoms of element "
                         "%zu (%2s).\n",
                         myRank,
                         i,
                         elementMap[i].c_str());
        }
    }
    for (auto k : pk)
    {
        MPI_Allreduce(MPI_IN_PLACE, &(p[k].numTrainPatterns), 1, MPI_SIZE_T, MPI_SUM, comm);
        MPI_Allreduce(MPI_IN_PLACE, &(p[k].numTestPatterns) , 1, MPI_SIZE_T, MPI_SUM, comm);
        double sum = p[k].numTrainPatterns + p[k].numTestPatterns;
        log << "Training/test split of data set for property \"" + k + "\":\n";
        log << strpr("- Total patterns    : %.0f\n", sum);
        log << strpr("- Training patterns : %zu\n", p[k].numTrainPatterns);
        log << strpr("- Test patterns     : %zu\n", p[k].numTestPatterns);
        log << strpr("- Test set fraction : %f\n", p[k].numTestPatterns / sum);
    }

    log << "*****************************************"
           "**************************************\n";

    return;
}

void Training::writeSetsToFiles()
{
    log << "\n";
    log << "*** WRITE TRAINING/TEST SETS ************"
           "**************************************\n";
    log << "\n";

    string fileName = strpr("train.data.%04d", myRank);
    ofstream fileTrain;
    fileTrain.open(fileName.c_str());
    if (!fileTrain.is_open())
    {
        throw runtime_error(strpr("ERROR: Could not open file %s\n",
                                  fileName.c_str()));
    }
    fileName = strpr("test.data.%04d", myRank);
    ofstream fileTest;
    fileTest.open(fileName.c_str());
    if (!fileTest.is_open())
    {
        throw runtime_error(strpr("ERROR: Could not open file %s\n",
                                  fileName.c_str()));
    }
    for (vector<Structure>::iterator it = structures.begin();
         it != structures.end(); ++it)
    {
        // Energy offsets are already subtracted at this point.
        // Here, we quickly add them again to provide consistent data sets.
        addEnergyOffset(*it);
        if (it->sampleType == Structure::ST_TRAINING)
        {
            it->writeToFile(&fileTrain);
        }
        else if (it->sampleType == Structure::ST_TEST)
        {
            it->writeToFile(&fileTest);
        }
        // Subtract energy offsets again.
        removeEnergyOffset(*it);
    }
    fileTrain.flush();
    fileTrain.close();
    fileTest.flush();
    fileTest.close();
    MPI_Barrier(comm);
    if (myRank == 0)
    {
        log << "Writing training/test set to files:\n";
        log << " - train.data\n";
        log << " - test.data\n";
        fileName = "train.data";
        combineFiles(fileName);
        fileName = "test.data";
        combineFiles(fileName);
    }

    log << "*****************************************"
           "**************************************\n";

    return;
}

void Training::initializeWeights()
{
    log << "\n";
    log << "*** WEIGHT INITIALIZATION ***************"
           "**************************************\n";
    log << "\n";

    if (settings.keywordExists("nguyen_widrow_weights_short") &&
        settings.keywordExists("precondition_weights"))
    {
        throw runtime_error("ERROR: Nguyen-Widrow and preconditioning weights"
                            " initialization are incompatible\n");
    }

    // Charge NN.
    if (nnpType == NNPType::SHORT_CHARGE_NN)
    {
        log << "Setting up charge neural networks:\n";
        if ((stage == 1 && settings.keywordExists("use_old_weights_charge")) ||
            (stage == 2))
        {
            log << "Reading old weights from files.\n";
            readNeuralNetworkWeights("charge", "weightse.%03zu.data");
        }
        else randomizeNeuralNetworkWeights("charge");
    }

    // Short-range NN.
    if (nnpType == NNPType::SHORT_ONLY)
    {
        log << "Setting up short-range neural networks:\n";
        if (settings.keywordExists("use_old_weights_short"))
        {
            log << "Reading old weights from files.\n";
            readNeuralNetworkWeights("short", "weights.%03zu.data");
        }
        else randomizeNeuralNetworkWeights("short");
    }
    else if (nnpType == NNPType::SHORT_CHARGE_NN && stage == 2)
    {
        log << "Setting up short-range neural networks:\n";
        if (settings.keywordExists("use_old_weights_short"))
        {
            log << "Reading old weights from files.\n";
            readNeuralNetworkWeights("short", "weights.%03zu.data");
        }
        else randomizeNeuralNetworkWeights("short");
    }

    log << "*****************************************"
           "**************************************\n";

    return;
}

void Training::initializeWeightsMemory(UpdateStrategy updateStrategy)
{
    this->updateStrategy = updateStrategy;
    numWeights = 0;
    if (updateStrategy == US_COMBINED)
    {
        log << strpr("Combined updater for all elements selected: "
                     "UpdateStrategy::US_COMBINED (%d)\n", updateStrategy);
        numUpdaters = 1;
        log << strpr("Number of weight updaters    : %zu\n", numUpdaters);
        for (size_t i = 0; i < numElements; ++i)
        {
            weightsOffset.push_back(numWeights);
            numWeights += elements.at(i).neuralNetworks.at(nnId)
                          .getNumConnections();
        }
        weights.resize(numUpdaters);
        weights.at(0).resize(numWeights, 0.0);
        numWeightsPerUpdater.push_back(numWeights);
        log << strpr("Total fit parameters         : %zu\n", numWeights);
    }
    else if (updateStrategy == US_ELEMENT)
    {
        log << strpr("Separate updaters for elements selected: "
                     "UpdateStrategy::US_ELEMENT (%d)\n", updateStrategy);
        numUpdaters = numElements;
        log << strpr("Number of weight updaters    : %zu\n", numUpdaters);
        weights.resize(numUpdaters);
        for (size_t i = 0; i < numUpdaters; ++i)
        {
            size_t n = elements.at(i).neuralNetworks.at(nnId)
                       .getNumConnections();
            weights.at(i).resize(n, 0.0);
            numWeightsPerUpdater.push_back(n);
            log << strpr("Fit parameters for element %2s: %zu\n",
                         elements.at(i).getSymbol().c_str(),
                         n);
        }
    }
    else
    {
        throw runtime_error("ERROR: Unknown update strategy.\n");
    }

    return;
}

void Training::setStage(size_t stage)
{
    this->stage = stage;

    // NNP of type SHORT_ONLY requires:
    // * "energy" (optionally: "force")
    if (nnpType == NNPType::SHORT_ONLY)
    {
        pk.push_back("energy");
        if (settings.keywordExists("use_short_forces")) pk.push_back("force");
    }
    // NNP of type SHORT_CHARGE_NN requires:
    // * stage 1: "charge"
    // * stage 2: "energy" (optionally: "force")
    else if (nnpType == NNPType::SHORT_CHARGE_NN)
    {
        if (stage == 1) pk.push_back("charge");
        else if (stage == 2)
        {
            pk.push_back("energy");
            if (settings.keywordExists("use_short_forces"))
            {
                pk.push_back("force");
            }
        }
    }

    // Initialize all training properties which will be used.
    auto initP = [this](string key) {p.emplace(piecewise_construct,
                                               forward_as_tuple(key),
                                               forward_as_tuple(key));
                                    };
    for (auto k : pk) initP(k);

    return;
}

void Training::dataSetNormalization()
{
    log << "\n";
    log << "*** DATA SET NORMALIZATION **************"
           "**************************************\n";
    log << "\n";

    log << "Computing statistics from reference data and initial "
           "prediction...\n";
    log << "\n";

    bool useForcesLocal = settings.keywordExists("use_short_forces");

    if (nnpType == NNPType::SHORT_CHARGE_NN && stage == 1)
    {
        throw runtime_error("ERROR: Normalization of charges not yet "
                            "implemented.\n");
    }
    writeWeights("short", "weights.%03zu.norm");
    if (nnpType == NNPType::SHORT_CHARGE_NN)
    {
        writeWeights("charge", "weightse.%03zu.norm");
    }

    ofstream fileEvsV;
    fileEvsV.open(strpr("evsv.dat.%04d", myRank).c_str());
    if (myRank == 0)
    {
        // File header.
        vector<string> title;
        vector<string> colName;
        vector<string> colInfo;
        vector<size_t> colSize;
        title.push_back("Energy vs. volume comparison.");
        colSize.push_back(16);
        colName.push_back("V_atom");
        colInfo.push_back("Volume per atom.");
        colSize.push_back(16);
        colName.push_back("Eref_atom");
        colInfo.push_back("Reference energy per atom.");
        colSize.push_back(10);
        colName.push_back("N");
        colInfo.push_back("Number of atoms.");
        colSize.push_back(16);
        colName.push_back("V");
        colInfo.push_back("Volume of structure.");
        colSize.push_back(16);
        colName.push_back("Eref");
        colInfo.push_back("Reference energy of structure.");
        colSize.push_back(16);
        colName.push_back("Eref_offset");
        colInfo.push_back("Reference energy of structure (including offset).");
        appendLinesToFile(fileEvsV,
                          createFileHeader(title, colSize, colName, colInfo));
    }

    size_t numAtomsTotal = 0;
    size_t numStructures = 0;
    double meanEnergyPerAtomRef  = 0.0;
    double meanEnergyPerAtomNnp  = 0.0;
    double sigmaEnergyPerAtomRef = 0.0;
    double sigmaEnergyPerAtomNnp = 0.0;
    double meanForceRef  = 0.0;
    double meanForceNnp  = 0.0;
    double sigmaForceRef = 0.0;
    double sigmaForceNnp = 0.0;
    for (auto& s : structures)
    {
        // File output for evsv.dat.
        fileEvsV << strpr("%16.8E %16.8E %10zu %16.8E %16.8E %16.8E\n",
                          s.volume / s.numAtoms,
                          s.energyRef / s.numAtoms,
                          s.numAtoms,
                          s.volume,
                          s.energyRef,
                          getEnergyWithOffset(s, true));
        s.calculateNeighborList(maxCutoffRadius);
#ifdef N2P2_NO_SF_GROUPS
        calculateSymmetryFunctions(s, useForcesLocal);
#else
        calculateSymmetryFunctionGroups(s, useForcesLocal);
#endif
        calculateAtomicNeuralNetworks(s, useForcesLocal);
        calculateEnergy(s);
        if (useForcesLocal) calculateForces(s);
        s.clearNeighborList();

        numStructures++;
        numAtomsTotal += s.numAtoms;
        meanEnergyPerAtomRef += s.energyRef / s.numAtoms;
        meanEnergyPerAtomNnp += s.energy    / s.numAtoms;
        for (auto& a : s.atoms)
        {
            meanForceRef += a.fRef[0] + a.fRef[1] + a.fRef[2];
            meanForceNnp += a.f   [0] + a.f   [1] + a.f   [2];
        }
    }

    fileEvsV.flush();
    fileEvsV.close();
    MPI_Barrier(MPI_COMM_WORLD);
    log << "Writing energy/atom vs. volume/atom data to \"evsv.dat\".\n";
    if (myRank == 0) combineFiles("evsv.dat");
    MPI_Allreduce(MPI_IN_PLACE, &numStructures       , 1, MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(MPI_IN_PLACE, &numAtomsTotal       , 1, MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(MPI_IN_PLACE, &meanEnergyPerAtomRef, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(MPI_IN_PLACE, &meanEnergyPerAtomNnp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(MPI_IN_PLACE, &meanForceRef        , 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(MPI_IN_PLACE, &meanForceNnp        , 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    meanEnergyPerAtomRef /= numStructures;
    meanEnergyPerAtomNnp /= numStructures;
    meanForceRef /= 3 * numAtomsTotal;
    meanForceNnp /= 3 * numAtomsTotal;
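    // Second pass: accumulate squared deviations from the global means; the
    // sums are MPI-reduced below and converted to sample standard deviations
    // (N - 1 denominator).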
    for (auto const& s : structures)
    {
        double ediffRef = s.energyRef / s.numAtoms - meanEnergyPerAtomRef;
        double ediffNnp = s.energy    / s.numAtoms - meanEnergyPerAtomNnp;
        sigmaEnergyPerAtomRef += ediffRef * ediffRef;
        sigmaEnergyPerAtomNnp += ediffNnp * ediffNnp;
        for (auto const& a : s.atoms)
        {
            double fdiffRef = a.fRef[0] - meanForceRef;
            double fdiffNnp = a.f   [0] - meanForceNnp;
            sigmaForceRef += fdiffRef * fdiffRef;
            sigmaForceNnp += fdiffNnp * fdiffNnp;
            fdiffRef = a.fRef[1] - meanForceRef;
            fdiffNnp = a.f   [1] - meanForceNnp;
            sigmaForceRef += fdiffRef * fdiffRef;
            sigmaForceNnp += fdiffNnp * fdiffNnp;
            fdiffRef = a.fRef[2] - meanForceRef;
            fdiffNnp = a.f   [2] - meanForceNnp;
            sigmaForceRef += fdiffRef * fdiffRef;
            sigmaForceNnp += fdiffNnp * fdiffNnp;
        }
    }
    MPI_Allreduce(MPI_IN_PLACE, &sigmaEnergyPerAtomRef, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(MPI_IN_PLACE, &sigmaEnergyPerAtomNnp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(MPI_IN_PLACE, &sigmaForceRef        , 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(MPI_IN_PLACE, &sigmaForceNnp        , 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    sigmaEnergyPerAtomRef = sqrt(sigmaEnergyPerAtomRef / (numStructures - 1));
    sigmaEnergyPerAtomNnp = sqrt(sigmaEnergyPerAtomNnp / (numStructures - 1));
    sigmaForceRef = sqrt(sigmaForceRef / (3 * numAtomsTotal - 1));
    sigmaForceNnp = sqrt(sigmaForceNnp / (3 * numAtomsTotal - 1));
    log << "\n";
    log << strpr("Total number of structures : %zu\n", numStructures);
    log << strpr("Total number of atoms      : %zu\n", numAtomsTotal);
    log << "----------------------------------\n";
    log << "Reference data statistics:\n";
    log << "----------------------------------\n";
    log << strpr("Mean/sigma energy per atom : %16.8E / %16.8E\n",
                 meanEnergyPerAtomRef,
                 sigmaEnergyPerAtomRef);
    log << strpr("Mean/sigma force           : %16.8E / %16.8E\n",
                 meanForceRef,
                 sigmaForceRef);
    log << "----------------------------------\n";
    log << "Initial NNP prediction statistics:\n";
    log << "----------------------------------\n";
    log << strpr("Mean/sigma energy per atom : %16.8E / %16.8E\n",
                 meanEnergyPerAtomNnp,
                 sigmaEnergyPerAtomNnp);
    log << strpr("Mean/sigma force           : %16.8E / %16.8E\n",
                 meanForceNnp,
                 sigmaForceNnp);
    log << "----------------------------------\n";
    // Now set conversion quantities of Mode class.
    if (settings["normalize_data_set"] == "stats-only")
    {
        log << "Data set statistics computation completed, now making up\n";
        log << "for the initially skipped normalization setup...\n";
        log << "\n";
        setupNormalization(false);
    }
    else if (settings["normalize_data_set"] == "ref")
    {
        log << "Normalization based on standard deviation of reference data "
               "selected:\n";
        log << "\n";
        log << "    mean(e_ref) = 0, sigma(e_ref) = sigma(F_ref) = 1\n";
        log << "\n";
        meanEnergy = meanEnergyPerAtomRef;
        convEnergy = 1.0 / sigmaEnergyPerAtomRef;
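        // Forces transform with convEnergy / convLength, so picking
        // convLength = sigma(F_ref) / sigma(e_ref) also rescales the
        // reference forces to unit standard deviation.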
        if (useForcesLocal) convLength = sigmaForceRef / sigmaEnergyPerAtomRef;
        else convLength = 1.0;
        normalize = true;
    }
    else if (settings["normalize_data_set"] == "force")
    {
        if (!useForcesLocal)
        {
            throw runtime_error("ERROR: Selected normalization mode only "
                                "possible when forces are available.\n");
        }
        log << "Normalization based on standard deviation of reference forces "
               "and their\n";
        log << "initial prediction selected:\n";
        log << "\n";
        log << "    mean(e_ref) = 0, sigma(F_NNP) = sigma(F_ref) = 1\n";
        log << "\n";
        meanEnergy = meanEnergyPerAtomRef;
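        // convEnergy / convLength = 1 / sigma(F_ref) rescales the reference
        // forces to unit spread; convLength = sigma(F_NNP) is the choice
        // intended to match the initial prediction spread (see log above).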
        convEnergy = sigmaForceNnp / sigmaForceRef;
        convLength = sigmaForceNnp;
        normalize = true;
    }
    else
    {
        throw runtime_error("ERROR: Unknown data set normalization mode.\n");
    }

    if (settings["normalize_data_set"] != "stats-only")
    {
        log << "Final conversion data:\n";
        log << strpr("Mean ref. energy per atom = %24.16E\n", meanEnergy);
        log << strpr("Conversion factor energy  = %24.16E\n", convEnergy);
        log << strpr("Conversion factor length  = %24.16E\n", convLength);
        log << "----------------------------------\n";
    }

    if ((myRank == 0) &&
        (settings["normalize_data_set"] != "stats-only"))
    {
        log << "\n";
        log << "Writing backup of original settings file to "
               "\"input.nn.bak\".\n";
        ofstream fileSettings;
        fileSettings.open("input.nn.bak");
        writeSettingsFile(&fileSettings);
        fileSettings.close();

        log << "\n";
        log << "Writing normalization data to settings file \"input.nn\".\n";
        string n1 = strpr("mean_energy %24.16E # nnp-train\n",
                          meanEnergyPerAtomRef);
        string n2 = strpr("conv_energy %24.16E # nnp-train\n",
                          convEnergy);
        string n3 = strpr("conv_length %24.16E # nnp-train\n",
                          convLength);
        // Check for existing normalization header and record line numbers
        // to replace.
        auto lines = settings.getSettingsLines();
        map<size_t, string> replace;
        for (size_t i = 0; i < lines.size(); ++i)
        {
            vector<string> sl = split(lines.at(i));
            if (sl.size() > 0)
            {
                if (sl.at(0) == "mean_energy") replace[i] = n1;
                if (sl.at(0) == "conv_energy") replace[i] = n2;
                if (sl.at(0) == "conv_length") replace[i] = n3;
            }
        }
        if (!replace.empty())
        {
            log << "WARNING: Preexisting normalization data was found and "
                   "replaced in original \"input.nn\" file.\n";
        }

        fileSettings.open("input.nn");
        if (replace.empty())
        {
            fileSettings << "#########################################"
                            "######################################\n";
            fileSettings << "# DATA SET NORMALIZATION\n";
            fileSettings << "#########################################"
                            "######################################\n";
            fileSettings << n1;
            fileSettings << n2;
            fileSettings << n3;
            fileSettings << "#########################################"
                            "######################################\n";
            fileSettings << "\n";
        }
        settings.writeSettingsFile(&fileSettings, replace);
        fileSettings.close();
    }

    // Now make up for left-out normalization setup, need to repeat entire
    // symmetry function setup.
    log << "\n";
    if (normalize)
    {
        log << "Silently repeating symmetry function setup...\n";
        log.silent = true;
        for (auto& e : elements) e.clearSymmetryFunctions();
        setupSymmetryFunctions();
#ifndef N2P2_FULL_SFD_MEMORY
        setupSymmetryFunctionMemory(false);
#endif
#ifndef N2P2_NO_SF_CACHE
        setupSymmetryFunctionCache();
#endif
#ifndef N2P2_NO_SF_GROUPS
        setupSymmetryFunctionGroups();
#endif
        setupSymmetryFunctionScaling();
        setupSymmetryFunctionStatistics(false, false, false, false);
        log.silent = false;
    }

    log << "*****************************************"
           "**************************************\n";

    return;
}

void Training::setupTraining()
{
    log << "\n";
    log << "*** SETUP: TRAINING *********************"
           "**************************************\n";
    log << "\n";

    if (nnpType == NNPType::SHORT_CHARGE_NN)
    {
        log << strpr("Running stage %zu of training: ", stage);
        if (stage == 1) log << "charge NN fitting.\n";
        else if (stage == 2) log << "short-range NN fitting.\n";
        else throw runtime_error("\nERROR: Unknown training stage.\n");
    }

    if (nnpType == NNPType::SHORT_ONLY ||
        (nnpType == NNPType::SHORT_CHARGE_NN && stage == 2))
    {
        nnId = "short";
        useForces = settings.keywordExists("use_short_forces");
        if (useForces)
        {
            log << "Forces will be used for training.\n";
            if (settings.keywordExists("force_weight"))
            {
                forceWeight = atof(settings["force_weight"].c_str());
            }
            else
            {
                log << "WARNING: Force weight not set, using default value.\n";
                forceWeight = 1.0;
            }
            log << strpr("Force update weight: %10.2E\n", forceWeight);
        }
        else
        {
            log << "Only energies will be used for training.\n";
        }
    }
    else if (nnpType == NNPType::SHORT_CHARGE_NN && stage == 1)
    {
        nnId = "charge";
    }
    log << "Training will act on \"" << nnId << "\" neural networks.\n";

    if (settings.keywordExists("main_error_metric"))
    {
        string k;
        if (settings["main_error_metric"] == "RMSEpa")
        {
            k = "energy"; if (p.exists(k)) p[k].displayMetric = "RMSEpa";
            k = "force";  if (p.exists(k)) p[k].displayMetric = "RMSE";
            k = "charge"; if (p.exists(k)) p[k].displayMetric = "RMSE";
        }
        else if (settings["main_error_metric"] == "RMSE")
        {
            k = "energy"; if (p.exists(k)) p[k].displayMetric = "RMSE";
            k = "force";  if (p.exists(k)) p[k].displayMetric = "RMSE";
            k = "charge"; if (p.exists(k)) p[k].displayMetric = "RMSE";
        }
        else if (settings["main_error_metric"] == "MAEpa")
        {
            k = "energy"; if (p.exists(k)) p[k].displayMetric = "MAEpa";
            k = "force";  if (p.exists(k)) p[k].displayMetric = "MAE";
            k = "charge"; if (p.exists(k)) p[k].displayMetric = "MAE";
        }
        else if (settings["main_error_metric"] == "MAE")
        {
            k = "energy"; if (p.exists(k)) p[k].displayMetric = "MAE";
            k = "force";  if (p.exists(k)) p[k].displayMetric = "MAE";
            k = "charge"; if (p.exists(k)) p[k].displayMetric = "MAE";
        }
        else
        {
            throw runtime_error("ERROR: Unknown error metric.\n");
        }
    }
    else
    {
        string k;
        k = "energy"; if (p.exists(k)) p[k].displayMetric = "RMSEpa";
        k = "force";  if (p.exists(k)) p[k].displayMetric = "RMSE";
        k = "charge"; if (p.exists(k)) p[k].displayMetric = "RMSE";
    }

    updaterType = (UpdaterType)atoi(settings["updater_type"].c_str());
    if (updaterType == UT_GD)
    {
        log << strpr("Weight update via gradient descent selected: "
                     "updaterType::UT_GD (%d)\n",
                     updaterType);
    }
    else if (updaterType == UT_KF)
    {
        log << strpr("Weight update via Kalman filter selected: "
                     "updaterType::UT_KF (%d)\n",
                     updaterType);
    }
    else if (updaterType == UT_LM)
    {
        throw runtime_error("ERROR: LM algorithm not yet implemented.\n");
        log << strpr("Weight update via Levenberg-Marquardt algorithm "
                     "selected: updaterType::UT_LM (%d)\n",
                     updaterType);
    }
    else
    {
        throw runtime_error("ERROR: Unknown updater type.\n");
    }

    parallelMode = (ParallelMode)atoi(settings["parallel_mode"].c_str());
    //if (parallelMode == PM_DATASET)
    //{
    //    log << strpr("Serial training selected: "
    //                 "ParallelMode::PM_DATASET (%d)\n",
    //                 parallelMode);
    //}
    if (parallelMode == PM_TRAIN_RK0)
    {
        log << strpr("Parallel training (rank 0 updates) selected: "
                     "ParallelMode::PM_TRAIN_RK0 (%d)\n",
                     parallelMode);
    }
    else if (parallelMode == PM_TRAIN_ALL)
    {
        log << strpr("Parallel training (all ranks update) selected: "
                     "ParallelMode::PM_TRAIN_ALL (%d)\n",
                     parallelMode);
    }
    else
    {
        throw runtime_error("ERROR: Unknown parallelization mode.\n");
    }

    jacobianMode = (JacobianMode)atoi(settings["jacobian_mode"].c_str());
    if (jacobianMode == JM_SUM)
    {
        log << strpr("Gradient summation only selected: "
                     "JacobianMode::JM_SUM (%d)\n", jacobianMode);
        log << "No Jacobian matrix is stored; gradients of all training "
               "candidates are summed up instead.\n";
    }
    else if (jacobianMode == JM_TASK)
    {
        log << strpr("Per-task Jacobian selected: "
                     "JacobianMode::JM_TASK (%d)\n",
                     jacobianMode);
        log << "One Jacobian matrix row per MPI task is stored; within each "
               "task gradients are summed up.\n";
    }
    else if (jacobianMode == JM_FULL)
    {
        log << strpr("Full Jacobian selected: "
                     "JacobianMode::JM_FULL (%d)\n",
                     jacobianMode);
        log << "Each update candidate generates one Jacobian matrix "
               "row entry.\n";
    }
    else
    {
        throw runtime_error("ERROR: Unknown Jacobian mode.\n");
    }

    if (updaterType == UT_GD && jacobianMode != JM_SUM)
    {
        throw runtime_error("ERROR: Gradient descent methods can only be "
                            "combined with Jacobian mode JM_SUM.\n");
    }

    updateStrategy = (UpdateStrategy)atoi(settings["update_strategy"].c_str());
    // This section is pushed into a separate function because it is also
    // needed for testing purposes.
    initializeWeightsMemory(updateStrategy);
    // Now it is possible to fill the weights arrays with weight parameters
    // from the neural network.
    getWeights();

    // Set up update candidate selection modes.
    setupSelectionMode("all");
    for (auto k : pk) setupSelectionMode(k);

    log << "-----------------------------------------"
           "--------------------------------------\n";
    repeatedEnergyUpdates = settings.keywordExists("repeated_energy_update");
    if (repeatedEnergyUpdates)
    {
        throw runtime_error("ERROR: Repeated energy updates are not correctly"
                            " implemented at the moment.\n");
        //log << "After each force update an energy update for the\n";
        //log << "corresponding structure will be performed.\n";
    }

    freeMemory = !(settings.keywordExists("memorize_symfunc_results"));
    if (freeMemory)
    {
        log << "Symmetry function memory is cleared after each calculation.\n";
    }
    else
    {
        log << "Symmetry function memory is reused (HIGH MEMORY USAGE!).\n";
    }

    numEpochs = (size_t)atoi(settings["epochs"].c_str());
    log << strpr("Training will be stopped after %zu epochs.\n", numEpochs);

    // Set up how often comparison output files should be written.
    for (auto k : pk) setupFileOutput(k);
    // Set up how often weight files should be written.
    setupFileOutput("weights_epoch");
    // Set up how often neuron statistics files should be written.
    setupFileOutput("neuronstats");

    // Prepare training log header.
    writeTrainingLog = settings.keywordExists("write_trainlog");
    if (writeTrainingLog && myRank == 0)
    {
        if (nnpType == NNPType::SHORT_CHARGE_NN)
        {
            trainingLogFileName += strpr(".stage-%zu", stage);
        }
        log << strpr("Training log with update information will be written to:"
                     " %s.\n", trainingLogFileName.c_str());
        trainingLog.open(trainingLogFileName.c_str());

        // File header.
        vector<string> title;
        vector<string> colName;
        vector<string> colInfo;
        vector<size_t> colSize;
        title.push_back("Detailed information on each weight update.");
        colSize.push_back(3);
        colName.push_back("U");
        colInfo.push_back("Update type (E = energy, F = force, Q = charge).");
        colSize.push_back(5);
        colName.push_back("epoch");
        colInfo.push_back("Current training epoch.");
        colSize.push_back(10);
        colName.push_back("count");
        colInfo.push_back("Update counter (multiple lines with identical count"
                          " for multi-streaming!).");
        colSize.push_back(5);
        colName.push_back("proc");
        colInfo.push_back("MPI process providing this update candidate.");
        colSize.push_back(3);
        colName.push_back("tl");
        colInfo.push_back("Threshold loop counter.");
        colSize.push_back(10);
        colName.push_back("rmse_frac");
        colInfo.push_back("Update candidate's error divided by this "
                          "epoch's RMSE.");
        colSize.push_back(10);
        colName.push_back("s_ind_g");
        colInfo.push_back("Global structure index.");
        colSize.push_back(5);
        colName.push_back("s_ind");
        colInfo.push_back("Local structure index on this MPI process.");
        colSize.push_back(5);
        colName.push_back("a_ind");
        colInfo.push_back("Atom index.");
        colSize.push_back(2);
        colName.push_back("c");
        colInfo.push_back("Force component (0 = x, 1 = y, 2 = z).");
        appendLinesToFile(trainingLog,
                          createFileHeader(title, colSize, colName, colInfo));
    }

    // Compute number of updates and properties per update.
    log << "-----------------------------------------"
           "--------------------------------------\n";
    for (auto k : pk) setupUpdatePlan(k);
    if (p.exists("energy") && p.exists("force"))
    {
        Property& pe = p["energy"];
        Property& pf = p["force"];
        log << strpr("Energy to force ratio                     : "
                     "  1 : %5.1f\n",
                     static_cast<double>(
                         pf.numUpdates * pf.patternsPerUpdateGlobal)
                     / (pe.numUpdates * pe.patternsPerUpdateGlobal));
        log << strpr("Energy to force percentages               : "
                     "%5.1f%% : %5.1f%%\n",
                     pe.numUpdates * pe.patternsPerUpdateGlobal * 100.0 /
                     (pe.numUpdates * pe.patternsPerUpdateGlobal
                      + pf.numUpdates * pf.patternsPerUpdateGlobal),
                     pf.numUpdates * pf.patternsPerUpdateGlobal * 100.0 /
                     (pe.numUpdates * pe.patternsPerUpdateGlobal
                      + pf.numUpdates * pf.patternsPerUpdateGlobal));
    }
    double totalUpdates = 0.0;
    for (auto k : pk) totalUpdates += p[k].numUpdates;
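    // Kept as a double: only used below to scale the Kalman decay parameters
    // qtau/etatau by the projected number of updates per epoch.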
    log << "-----------------------------------------"
           "--------------------------------------\n";

    // Allocate error and Jacobian arrays.
    for (auto k : pk) allocateArrays(k);
    log << "-----------------------------------------"
           "--------------------------------------\n";

    // Set up new C++11 random number generator (TODO: move it!).
    rngGlobalNew.seed(gsl_rng_get(rngGlobal));
    rngNew.seed(gsl_rng_get(rng));
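    // Seeding the C++11 engines from the already seeded GSL generators keeps
    // every random stream reproducible from the single input seed.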

    // Updater setup.
    GradientDescent::DescentType descentType = GradientDescent::DT_FIXED;
    if (updaterType == UT_GD)
    {
        descentType = (GradientDescent::DescentType)
                      atoi(settings["gradient_type"].c_str());
    }
    KalmanFilter::KalmanType kalmanType = KalmanFilter::KT_STANDARD;
    if (updaterType == UT_KF)
    {
        kalmanType = (KalmanFilter::KalmanType)
                     atoi(settings["kalman_type"].c_str());
    }

    for (size_t i = 0; i < numUpdaters; ++i)
    {
        if ( (myRank == 0) || (parallelMode == PM_TRAIN_ALL) )
        {
            if (updaterType == UT_GD)
            {
                updaters.push_back(
                    (Updater*)new GradientDescent(numWeightsPerUpdater.at(i),
                                                  descentType));
            }
            else if (updaterType == UT_KF)
            {
                updaters.push_back(
                    (Updater*)new KalmanFilter(numWeightsPerUpdater.at(i),
                                               kalmanType));
            }
            updaters.back()->setState(&(weights.at(i).front()));
            updaters.back()->setupTiming(strpr("wupd%zu", i));
            updaters.back()->resetTimingLoop();
        }
    }
    if (updaters.size() > 0) hasUpdaters = true;
    else hasUpdaters = false;

    if (hasUpdaters && updaterType == UT_GD)
    {
        if (descentType == GradientDescent::DT_FIXED)
        {
            double const eta = atof(settings["gradient_eta"].c_str());
            for (size_t i = 0; i < numUpdaters; ++i)
            {
                GradientDescent* u =
                    dynamic_cast<GradientDescent*>(updaters.at(i));
                u->setParametersFixed(eta);
            }
        }
        if (descentType == GradientDescent::DT_ADAM)
        {
            double const eta   = atof(settings["gradient_adam_eta"].c_str());
            double const beta1 = atof(settings["gradient_adam_beta1"].c_str());
            double const beta2 = atof(settings["gradient_adam_beta2"].c_str());
            double const eps   = atof(settings["gradient_adam_epsilon"].c_str());
            for (size_t i = 0; i < numUpdaters; ++i)
            {
                GradientDescent* u =
                    dynamic_cast<GradientDescent*>(updaters.at(i));
                u->setParametersAdam(eta, beta1, beta2, eps);
            }
        }
    }
    else if (hasUpdaters && updaterType == UT_KF)
    {
        if (kalmanType == KalmanFilter::KT_STANDARD)
        {
            double const epsilon = atof(settings["kalman_epsilon"].c_str());
            double const q0      = atof(settings["kalman_q0"     ].c_str());
            double const qtau    = atof(settings["kalman_qtau"   ].c_str())
                                   / totalUpdates;
            log << "qtau is divided by number "
                   "of projected updates per epoch.\n";
            double const qmin    = atof(settings["kalman_qmin"   ].c_str());
            double const eta0    = atof(settings["kalman_eta"    ].c_str());
            double etatau = 1.0;
            double etamax = eta0;
            if (settings.keywordExists("kalman_etatau") &&
                settings.keywordExists("kalman_etamax"))
            {
                etatau = atof(settings["kalman_etatau"].c_str())
                         / totalUpdates;
                log << "etatau is divided by number "
                       "of projected updates per epoch.\n";
                etamax = atof(settings["kalman_etamax"].c_str());
            }
            for (size_t i = 0; i < updaters.size(); ++i)
            {
                KalmanFilter* u = dynamic_cast<KalmanFilter*>(updaters.at(i));
                u->setParametersStandard(epsilon,
                                         q0,
                                         qtau,
                                         qmin,
                                         eta0,
                                         etatau,
                                         etamax);
            }
        }
        else if (kalmanType == KalmanFilter::KT_FADINGMEMORY)
        {
            double const epsilon = atof(settings["kalman_epsilon"].c_str());
            double const q0      = atof(settings["kalman_q0"     ].c_str());
            double const qtau    = atof(settings["kalman_qtau"   ].c_str())
                                   / totalUpdates;
            log << "qtau is divided by number "
                   "of projected updates per epoch.\n";
            double const qmin   = atof(settings["kalman_qmin"].c_str());
            double const lambda =
                           atof(settings["kalman_lambda_short"].c_str());
            //double const nu =
            //    pow(atof(settings["kalman_nue_short"].c_str()), numProcs);
            //log << "nu is exponentiated with the number of streams.\n";
            double const nu = atof(settings["kalman_nue_short"].c_str());
            for (size_t i = 0; i < updaters.size(); ++i)
            {
                KalmanFilter* u = dynamic_cast<KalmanFilter*>(updaters.at(i));
                u->setParametersFadingMemory(epsilon,
                                             q0,
                                             qtau,
                                             qmin,
                                             lambda,
                                             nu);
            }
        }
    }

    log << "-----------------------------------------"
           "--------------------------------------\n";
    for (size_t i = 0; i < updaters.size(); ++i)
    {
        if (updateStrategy == US_COMBINED)
        {
            log << strpr("Combined weight updater:\n");
        }
        else if (updateStrategy == US_ELEMENT)
        {
            log << strpr("Weight updater for element %2s :\n",
                         elements.at(i).getSymbol().c_str());
        }
        log << "-----------------------------------------"
               "--------------------------------------\n";
        log << updaters.at(i)->info();
        if (updaterType == UT_KF)
        {
            log << "Note: During training loop the actual observation\n";
            log << "      size corresponds to error vector size:\n";
            for (auto k : pk)
            {
                log << strpr("sizeObservation = %zu (%s updates)\n",
                             p[k].error.at(i).size(), k.c_str());
            }
        }
        log << "-----------------------------------------"
               "--------------------------------------\n";
    }

    log << strpr("TIMING Finished setup: %.2f seconds.\n",
                 sw["setup"].stop());
    log << "*****************************************"
           "**************************************\n";

    return;
}

vector<string> Training::setupNumericDerivCheck()
{
    log << "\n";
    log << "*** SETUP WEIGHT DERIVATIVES CHECK ******"
           "**************************************\n";
    log << "\n";

    log << "Weight derivatives will be checked for these properties:\n";
    for (auto k : pk) log << " - " + p[k].plural + "\n";
    log << "\n";

    if (nnpType == NNPType::SHORT_ONLY)
    {
        nnId = "short";
        readNeuralNetworkWeights(nnId, "weights.%03zu.data");
    }
    else if (nnpType == NNPType::SHORT_CHARGE_NN && stage == 1)
    {
        nnId = "charge";
        readNeuralNetworkWeights(nnId, "weightse.%03zu.data");
    }
    else if (nnpType == NNPType::SHORT_CHARGE_NN && stage == 2)
    {
        nnId = "short";
        readNeuralNetworkWeights("charge", "weightse.%03zu.data");
        readNeuralNetworkWeights(nnId, "weights.%03zu.data");
    }
    getWeights();

    log << "*****************************************"
           "**************************************\n";

    return pk;
}

void Training::calculateNeighborLists()
{
    sw["nl"].start();
    log << "\n";
    log << "*** CALCULATE NEIGHBOR LISTS ************"
           "**************************************\n";
    log << "\n";

#ifdef _OPENMP
    int num_threads = omp_get_max_threads();
    omp_set_num_threads(1);
    log << strpr("Temporarily disabling OpenMP parallelization: %d threads.\n",
                 omp_get_max_threads());
#endif
    log << "Calculating neighbor lists for all structures.\n";
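    // Internal lengths carry a factor of convLength when normalization is
    // active, so the cutoff is reported in physical units for comparison
    // with the input data.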
    double maxCutoffRadiusPhys = maxCutoffRadius;
    if (normalize) maxCutoffRadiusPhys = maxCutoffRadius / convLength;
    log << strpr("Cutoff radius for neighbor lists: %f\n",
                 maxCutoffRadiusPhys);
    for (vector<Structure>::iterator it = structures.begin();
         it != structures.end(); ++it)
    {
        it->calculateNeighborList(maxCutoffRadius);
    }
#ifdef _OPENMP
    omp_set_num_threads(num_threads);
    log << strpr("Restoring OpenMP parallelization: max. %d threads.\n",
                 omp_get_max_threads());
#endif

    log << "-----------------------------------------"
           "--------------------------------------\n";
    log << strpr("TIMING Finished neighbor lists: %.2f seconds.\n",
                 sw["nl"].stop());
    log << "*****************************************"
           "**************************************\n";

    return;
}

void Training::calculateError(
    map<string, pair<string, string>> const fileNames)
{
#ifdef _OPENMP
    int num_threads = omp_get_max_threads();
    omp_set_num_threads(1);
#endif
    vector<string> write;
    for (auto i : fileNames)
    {
        if (i.second.first.size() == 0 || i.second.second.size() == 0)
        {
            throw runtime_error("ERROR: No filename provided for comparison "
                                "files.\n");
        }
        write.push_back(i.first);
    }
    auto doWrite = [&write](string key){
                       return find(write.begin(),
                                   write.end(),
                                   key) != write.end();
                   };

    map<string, size_t> countTrain;
    map<string, size_t> countTest;
    for (auto k : pk) countTrain[k] = 0;
    for (auto k : pk) countTest[k]  = 0;
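    // One pattern counter, error-metric map and (optional) comparison file
    // per property, kept separately for the training and test sets.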

    map<string, ofstream> filesTrain;
    map<string, ofstream> filesTest;

    // Reset current error metrics.
    for (auto k : pk)
    {
        for (auto& m : p[k].errorTrain) m.second = 0.0;
        for (auto& m : p[k].errorTest)  m.second = 0.0;
    }

    for (auto k : write)
    {
        filesTrain[k].open(strpr("%s.%04d",
                                 fileNames.at(k).first.c_str(),
                                 myRank).c_str());
        filesTest[k].open(strpr("%s.%04d",
                                fileNames.at(k).second.c_str(),
                                myRank).c_str());
        // File header.
        vector<string> header;
        if (myRank == 0)
        {
            vector<string> title;
            vector<string> colName;
            vector<string> colInfo;
            vector<size_t> colSize;
            if (k == "energy")
            {
                title.push_back("Energy comparison.");
                colSize.push_back(10);
                colName.push_back("index");
                colInfo.push_back("Structure index.");
                colSize.push_back(16);
                colName.push_back("Eref");
                colInfo.push_back("Reference potential energy per atom "
                                  "(training units).");
                colSize.push_back(16);
                colName.push_back("Ennp");
                colInfo.push_back("NNP potential energy per atom "
                                  "(training units).");
            }
            else if (k == "force")
            {
                title.push_back("Force comparison.");
                colSize.push_back(10);
                colName.push_back("index_s");
                colInfo.push_back("Structure index.");
                colSize.push_back(10);
                colName.push_back("index_a");
                colInfo.push_back("Atom index (x, y, z components in "
                                  "consecutive lines).");
                colSize.push_back(16);
                colName.push_back("Fref");
                colInfo.push_back("Reference force (training units).");
                colSize.push_back(16);
                colName.push_back("Fnnp");
                colInfo.push_back("NNP force (training units).");
            }
            else if (k == "charge")
            {
                title.push_back("Charge comparison.");
                colSize.push_back(10);
                colName.push_back("index_s");
                colInfo.push_back("Structure index.");
                colSize.push_back(10);
                colName.push_back("index_a");
                colInfo.push_back("Atom index.");
                colSize.push_back(16);
                colName.push_back("Qref");
                colInfo.push_back("Reference charge.");
                colSize.push_back(16);
                colName.push_back("Qnnp");
                colInfo.push_back("NNP charge.");
            }
            header = createFileHeader(title, colSize, colName, colInfo);
            appendLinesToFile(filesTrain.at(k), header);
            appendLinesToFile(filesTest.at(k), header);
        }
    }

    for (vector<Structure>::iterator it = structures.begin();
         it != structures.end(); ++it)
    {
#ifdef N2P2_NO_SF_GROUPS
        calculateSymmetryFunctions((*it), useForces);
#else
        calculateSymmetryFunctionGroups((*it), useForces);
#endif
        calculateAtomicNeuralNetworks((*it), useForces);
        calculateEnergy((*it));
        if (useForces) calculateForces((*it));
        for (auto k : pk)
        {
            map<string, double>* error = nullptr;
            size_t*              count = nullptr;
            ofstream*            file  = nullptr;
            if (it->sampleType == Structure::ST_TRAINING)
            {
                error = &(p[k].errorTrain);
                count = &(countTrain.at(k));
                if (doWrite(k)) file = &(filesTrain.at(k));
            }
            else if (it->sampleType == Structure::ST_TEST)
            {
                error = &(p[k].errorTest);
                count = &(countTest.at(k));
                if (doWrite(k)) file = &(filesTest.at(k));
            }

            it->updateError(k, *error, *count);
            if (doWrite(k))
            {
                if (k == "energy") (*file) << it->getEnergyLine();
                else if (k == "force")
                {
                    for (auto l : it->getForcesLines()) (*file) << l;
                }
                else if (k == "charge")
                {
                    for (auto l : it->getChargesLines()) (*file) << l;
                }
            }
        }
        if (freeMemory) it->freeAtoms(true);
    }

    for (auto k : pk)
    {
        collectError(k, p[k].errorTrain, countTrain.at(k));
        collectError(k, p[k].errorTest, countTest.at(k));
        if (doWrite(k))
        {
            filesTrain.at(k).close();
            filesTest.at(k).close();
            MPI_Barrier(comm);
            if (myRank == 0)
            {
                combineFiles(fileNames.at(k).first);
                combineFiles(fileNames.at(k).second);
            }
        }
    }

#ifdef _OPENMP
    omp_set_num_threads(num_threads);
#endif

    return;
}

void Training::calculateErrorEpoch()
{
    // Check whether property comparison files should be written for
    // this epoch.
    map<string, pair<string, string>> fileNames;

    for (auto const& ip : p)
    {
        string const&   k = ip.first;  // key
        Property const& d = ip.second; // data
        if (d.writeCompEvery > 0 &&
            (epoch % d.writeCompEvery == 0 || epoch <= d.writeCompAlways))
        {
            string middle;
            if      (k == "energy") middle = "points";
            else if (k == "force" ) middle = "forces";
            else if (k == "charge") middle = "charges";
            fileNames[k] = make_pair(strpr("train%s.%06zu.out",
                                           middle.c_str(), epoch),
                                     strpr("test%s.%06zu.out",
                                           middle.c_str(), epoch));
        }
    }

    // Calculate errors and write comparison files.
    calculateError(fileNames);

    return;
}

void Training::printHeader()
{
    string metric = "?";
    string peratom = "";

    log << "The training loop output covers different errors, update and\n";
    log << "timing information. The following quantities are organized\n";
    log << "according to the matrix scheme below:\n";
    log << "-------------------------------------------------------------------\n";
    log << "ep ........ Epoch.\n";
    for (auto k : pk)
    {
        string const& pmetric = p[k].displayMetric;
        if (pmetric.find("RMSE") != pmetric.npos) metric = "RMSE";
        else if (pmetric.find("MAE") != pmetric.npos) metric = "MAE";
        if (pmetric.find("pa") != pmetric.npos) peratom = " per atom";
        else peratom = "";
        log << p[k].tiny << "_count ... Number of " << k << " updates.\n";
        log << p[k].tiny << "_train ... " << metric << " of training "
            << p[k].plural << peratom << ".\n";
        log << p[k].tiny << "_test .... " << metric << " of test "
            << p[k].plural << peratom << ".\n";
        //log << p[k].tiny << "_time ........ Time for " << k << " updates "
        //    << "(seconds).\n";
        log << p[k].tiny << "_pt ...... Percentage of time for " << k <<
               " updates w.r.t. t_train.\n";
    }
    log << "count ..... Total number of updates.\n";
    log << "train ..... Percentage of time for training.\n";
    log << "error ..... Percentage of time for error calculation.\n";
    log << "other ..... Percentage of time for other purposes.\n";
    log << "epoch ..... Total time for this epoch (seconds).\n";
    log << "total ..... Total time for all epochs (seconds).\n";
    log << "-------------------------------------------------------------------\n";
    for (auto k : pk)
    {
        log << strpr("%-6s", k.c_str())
            << strpr(" %5s", "ep")
            << strpr(" %7s", (p[k].tiny + "_count").c_str())
            << strpr(" %11s", (p[k].tiny + "_train").c_str())
            << strpr(" %11s", (p[k].tiny + "_test").c_str())
            << strpr(" %5s", (p[k].tiny + "_pt").c_str())
            << "\n";
    }
    log << strpr("%-6s", "timing")
        << strpr(" %5s", "ep")
        << strpr(" %7s", "count")
        << strpr(" %5s", "train")
        << strpr(" %5s", "error")
        << strpr(" %5s", "other")
        << strpr(" %9s", "epoch")
        << strpr(" %9s", "total")
        << "\n";
    log << "-------------------------------------------------------------------\n";

    return;
}

void Training::printEpoch()
{
    double timeLoop  = sw["loop"].getLoop();
    double timeTrain = sw["train"].getLoop();
    size_t totalUpdates = 0;
    for (auto k : pk)
    {
        totalUpdates += p[k].countUpdates;
        double timeProp = sw[k].getLoop();
        string caps = k;
        for (auto& c : caps) c = toupper(c);
        log << strpr("%-6s", caps.c_str());
        log << strpr(" %5zu", epoch);
        log << strpr(" %7zu", p[k].countUpdates);
        if (normalize && (k != "charge"))
        {
            log << strpr(" %11.5E %11.5E",
                         physical(k, p[k].errorTrain.at(p[k].displayMetric)),
                         physical(k, p[k].errorTest.at(p[k].displayMetric)));
        }
        else
        {
            log << strpr(" %11.5E %11.5E",
                         p[k].errorTrain.at(p[k].displayMetric),
                         p[k].errorTest.at(p[k].displayMetric));
        }
        if (epoch == 0) log << strpr(" %5.1f", 0.0);
        else log << strpr(" %5.1f", timeProp / timeTrain * 100.0);
        log << "\n";
    }
    double timeOther = timeLoop;
    timeOther -= sw["error"].getLoop();
    timeOther -= sw["train"].getLoop();
    log << strpr("%-6s", "TIMING");
    log << strpr(" %5zu", epoch);
    log << strpr(" %7zu", totalUpdates);
    log << strpr(" %5.1f", sw["train"].getLoop() / timeLoop * 100.0);
    log << strpr(" %5.1f", sw["error"].getLoop() / timeLoop * 100.0);
    log << strpr(" %5.1f", timeOther / timeLoop * 100.0);
    log << strpr(" %9.2f", sw["loop"].getLoop());
    log << strpr(" %9.2f", sw["loop"].getTotal());
    log << "\n";

    return;
}

void Training::writeWeights(string const& nnName,
                            string const& fileNameFormat) const
{
    ofstream file;

    for (size_t i = 0; i < numElements; ++i)
    {
        string fileName = strpr(fileNameFormat.c_str(),
                                elements.at(i).getAtomicNumber());
        file.open(fileName.c_str());
        elements.at(i).neuralNetworks.at(nnName).writeConnections(file);
        file.close();
    }

    return;
}

void Training::writeWeightsEpoch() const
{
    if (writeWeightsEvery > 0 &&
        (epoch % writeWeightsEvery == 0 || epoch <= writeWeightsAlways))
    {
        if (nnpType == NNPType::SHORT_ONLY ||
            (nnpType == NNPType::SHORT_CHARGE_NN && stage == 2))
        {
            writeWeights("short", strpr("weights.%%03zu.%06d.out", epoch));
        }
        else if (nnpType == NNPType::SHORT_CHARGE_NN && stage == 1)
        {
            writeWeights("charge", strpr("weightse.%%03zu.%06d.out", epoch));
        }
    }

    return;
}

void Training::writeLearningCurve(bool append, string const fileName) const
{
    ofstream file;
    string fileNameActual = fileName;
    if (nnpType == NNPType::SHORT_CHARGE_NN)
    {
        fileNameActual += strpr(".stage-%zu", stage);
    }

    if (append) file.open(fileNameActual.c_str(), ofstream::app);
    else
    {
        file.open(fileNameActual.c_str());

        // File header.
        vector<string> title;
        vector<string> colName;
        vector<string> colInfo;
        vector<size_t> colSize;
        if (nnpType == NNPType::SHORT_ONLY ||
            (nnpType == NNPType::SHORT_CHARGE_NN && stage == 2))
        {
            title.push_back("Learning curves for energies and forces.");
        }
        else if (nnpType == NNPType::SHORT_CHARGE_NN && stage == 1)
        {
            title.push_back("Learning curves for charges.");
        }
        colSize.push_back(10);
        colName.push_back("epoch");
        colInfo.push_back("Current epoch.");

        map<string, string> text;
        text["RMSEpa"] = "RMSE of %s %s per atom";
        text["RMSE"]   = "RMSE of %s %s";
        text["MAEpa"]  = "MAE of %s %s per atom";
        text["MAE"]    = "MAE of %s %s";

        for (auto k : pk)
        {
            for (auto m : p[k].errorMetrics)
            {
                colSize.push_back(16);
                colName.push_back(m + "_" + p[k].tiny + "train_pu");
                colInfo.push_back(strpr(
                                      (text[m] + " (physical units)").c_str(),
                                      "training",
                                      p[k].plural.c_str()));
                colSize.push_back(16);
                colName.push_back(m + "_" + p[k].tiny + "test_pu");
                colInfo.push_back(strpr(
                                      (text[m] + " (physical units)").c_str(),
                                      "test",
                                      p[k].plural.c_str()));
            }
        }
        if (normalize)
        {
            for (auto k : pk)
            {
                // Internal units only for energies and forces.
                if (!(k == "energy" || k == "force")) continue;
                for (auto m : p[k].errorMetrics)
                {
                    colSize.push_back(16);
                    colName.push_back(m + "_" + p[k].tiny + "train_iu");
                    colInfo.push_back(strpr(
                                        (text[m] + " (training units)").c_str(),
                                        "training",
                                        p[k].plural.c_str()));
                    colSize.push_back(16);
                    colName.push_back(m + "_" + p[k].tiny + "test_iu");
                    colInfo.push_back(strpr(
                                        (text[m] + " (training units)").c_str(),
                                        "test",
                                        p[k].plural.c_str()));
                }
            }
        }
        appendLinesToFile(file,
                          createFileHeader(title, colSize, colName, colInfo));
    }

    file << strpr("%10zu", epoch);
    if (normalize)
    {
        for (auto k : pk)
        {
            if (!(k == "energy" || k == "force")) continue;
            for (auto m : p[k].errorMetrics)
            {
                file << strpr(" %16.8E %16.8E",
                              physical(k, p[k].errorTrain.at(m)),
                              physical(k, p[k].errorTest.at(m)));
            }
        }
    }
    for (auto k : pk)
    {
        for (auto m : p[k].errorMetrics)
        {
            file << strpr(" %16.8E %16.8E",
                          p[k].errorTrain.at(m),
                          p[k].errorTest.at(m));
        }
    }
    file << "\n";
    file.flush();
    file.close();

    return;
}

void Training::writeNeuronStatistics(string const& nnName,
                                     string const& fileName) const
{
    ofstream file;
    if (myRank == 0)
    {
        file.open(fileName.c_str());

        // File header.
        vector<string> title;
        vector<string> colName;
        vector<string> colInfo;
        vector<size_t> colSize;
        title.push_back("Statistics for individual neurons of network \""
                        + nnName + "\" gathered during RMSE calculation.");
        colSize.push_back(10);
        colName.push_back("element");
        colInfo.push_back("Element index.");
        colSize.push_back(10);
        colName.push_back("neuron");
        colInfo.push_back("Neuron number.");
        colSize.push_back(10);
        colName.push_back("count");
        colInfo.push_back("Number of neuron value computations.");
        colSize.push_back(16);
        colName.push_back("min");
        colInfo.push_back("Minimum neuron value encountered.");
        colSize.push_back(16);
        colName.push_back("max");
        colInfo.push_back("Maximum neuron value encountered.");
        colSize.push_back(16);
        colName.push_back("mean");
        colInfo.push_back("Mean neuron value.");
        colSize.push_back(16);
        colName.push_back("sigma");
        colInfo.push_back("Standard deviation of neuron value.");
        appendLinesToFile(file,
                          createFileHeader(title, colSize, colName, colInfo));
    }

    for (size_t i = 0; i < numElements; ++i)
    {
        size_t n = elements.at(i).neuralNetworks.at(nnName).getNumNeurons();
        vector<long>   count(n, 0);
        vector<double> min  (n, 0.0);
        vector<double> max  (n, 0.0);
        vector<double> mean (n, 0.0);
        vector<double> sigma(n, 0.0);
        elements.at(i).neuralNetworks.at(nnName).
            getNeuronStatistics(&(count.front()),
                                &(min.front()),
                                &(max.front()),
                                &(mean.front()),
                                &(sigma.front()));
        // Collect statistics from all processors on proc 0.
        if (myRank == 0)
        {
            MPI_Reduce(MPI_IN_PLACE, &(count.front()), n, MPI_LONG  , MPI_SUM, 0, comm);
            MPI_Reduce(MPI_IN_PLACE, &(min.front())  , n, MPI_DOUBLE, MPI_MIN, 0, comm);
            MPI_Reduce(MPI_IN_PLACE, &(max.front())  , n, MPI_DOUBLE, MPI_MAX, 0, comm);
            MPI_Reduce(MPI_IN_PLACE, &(mean.front()) , n, MPI_DOUBLE, MPI_SUM, 0, comm);
            MPI_Reduce(MPI_IN_PLACE, &(sigma.front()), n, MPI_DOUBLE, MPI_SUM, 0, comm);
        }
        else
        {
            MPI_Reduce(&(count.front()), &(count.front()), n, MPI_LONG  , MPI_SUM, 0, comm);
            MPI_Reduce(&(min.front())  , &(min.front())  , n, MPI_DOUBLE, MPI_MIN, 0, comm);
            MPI_Reduce(&(max.front())  , &(max.front())  , n, MPI_DOUBLE, MPI_MAX, 0, comm);
            MPI_Reduce(&(mean.front()) , &(mean.front()) , n, MPI_DOUBLE, MPI_SUM, 0, comm);
            MPI_Reduce(&(sigma.front()), &(sigma.front()), n, MPI_DOUBLE, MPI_SUM, 0, comm);
        }
        if (myRank == 0)
        {
            for (size_t j = 0; j < n; ++j)
            {
                size_t m = count.at(j);
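                // After the reductions, mean holds the sum of neuron values
                // and sigma the sum of squared values; combine them into the
                // sample standard deviation
                // sqrt((m * sum(x^2) - sum(x)^2) / (m * (m - 1))).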
                sigma.at(j) = sqrt((m * sigma.at(j) - mean.at(j) * mean.at(j))
                            / (m * (m - 1)));
                mean.at(j) /= m;
                file << strpr("%10zu %10zu %10ld %16.8E %16.8E %16.8E %16.8E\n",
                              i + 1,
                              j + 1,
                              count[j],
                              min[j],
                              max[j],
                              mean[j],
                              sigma[j]);
            }
        }
    }

    if (myRank == 0)
    {
        file.close();
    }

    return;
}

void Training::writeNeuronStatisticsEpoch() const
{
    vector<pair<string, string>> nnInfo; // NN name and file name.

    if (writeNeuronStatisticsEvery > 0 &&
        (epoch % writeNeuronStatisticsEvery == 0 ||
         epoch <= writeNeuronStatisticsAlways))
    {
        if (nnpType == NNPType::SHORT_ONLY)
        {
            nnInfo.push_back(
                make_pair("short", strpr("neuron-stats.%06zu.out", epoch)));
        }
        else if (nnpType == NNPType::SHORT_CHARGE_NN)
        {
            if (stage == 1)
            {
                nnInfo.push_back(
                    make_pair("charge",
                              strpr("neuron-stats.%s.%06zu.out.stage-%zu",
                                    "charge", epoch, stage)));
            }
            else if (stage == 2)
            {
                nnInfo.push_back(
                    make_pair("charge",
                              strpr("neuron-stats.%s.%06zu.out.stage-%zu",
                                    "charge", epoch, stage)));
                nnInfo.push_back(
                    make_pair("short",
                              strpr("neuron-stats.%s.%06zu.out.stage-%zu",
                                    "short", epoch, stage)));
            }
        }
        for (auto const& i : nnInfo) writeNeuronStatistics(i.first, i.second);
    }

    return;
}

void Training::resetNeuronStatistics()
{
    for (vector<Element>::iterator it = elements.begin();
         it != elements.end(); ++it)
    {
        for (auto& nn : it->neuralNetworks) nn.second.resetNeuronStatistics();
    }
    return;
}

void Training::writeUpdaterStatus(bool         append,
                                  string const fileNameFormat) const
{
    ofstream file;
    string fileNameFormatActual = fileNameFormat;
    if (nnpType == NNPType::SHORT_CHARGE_NN)
    {
        fileNameFormatActual += strpr(".stage-%zu", stage);
    }

    for (size_t i = 0; i < numUpdaters; ++i)
    {
        string fileName;
        if (updateStrategy == US_COMBINED)
        {
            fileName = strpr(fileNameFormatActual.c_str(), 0);
        }
        else if (updateStrategy == US_ELEMENT)
        {
            fileName = strpr(fileNameFormatActual.c_str(),
                             elements.at(i).getAtomicNumber());
        }
        if (append) file.open(fileName.c_str(), ofstream::app);
        else
        {
            file.open(fileName.c_str());
            appendLinesToFile(file, updaters.at(i)->statusHeader());
        }
        file << updaters.at(i)->status(epoch);
        file.close();
    }

    return;
}

void Training::sortUpdateCandidates(string const& property)
{
    // Update error for all structures.
    for (auto& uc : p[property].updateCandidates)
    {
        if (property == "energy")
        {
            Structure const& s = structures.at(uc.s);
            uc.error = fabs((s.energyRef - s.energy) / s.numAtoms);
        }
        else if (property == "force")
        {
            Atom const& a = structures.at(uc.s).atoms.at(uc.a);
            uc.error = fabs(a.fRef[uc.c] - a.f[uc.c]);
        }
        else if (property == "charge")
        {
            Atom const& a = structures.at(uc.s).atoms.at(uc.a);
            uc.error = fabs(a.chargeRef - a.charge);
        }
    }
1913 // Sort update candidates list.
1914 sort(p[property].updateCandidates.begin(),
1915 p[property].updateCandidates.end());
1916 // Reset current position.
1917 p[property].posUpdateCandidates = 0;
1918
1919 return;
1920}
1921
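Aside (illustrative sketch, not part of Training.cpp): sortUpdateCandidates relies on UpdateCandidate's comparison operator; under SM_SORT the intent is that the patterns with the largest current error are drawn first. The same selection order with an explicit comparator (struct and values hypothetical):

#include <algorithm>
#include <cstddef>
#include <vector>

struct Candidate { std::size_t s; double error; };

int main()
{
    std::vector<Candidate> cand = {{0, 0.1}, {1, 0.7}, {2, 0.3}};
    std::sort(cand.begin(), cand.end(),
              [](Candidate const& a, Candidate const& b)
              { return a.error > b.error; }); // descending error
    // cand order is now 1, 2, 0; training starts with the worst pattern.
    return 0;
}
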
1922void Training::shuffleUpdateCandidates(string const& property)
1923{
1924 shuffle(p[property].updateCandidates.begin(),
1925 p[property].updateCandidates.end(),
1926 rngNew);
1927 // Reset current position.
1928 p[property].posUpdateCandidates = 0;
1929
1930 return;
1931}
1932
1933void Training::setEpochSchedule()
1934{
1935 // Clear epoch schedule.
1936 epochSchedule.clear();
1937 vector<int>(epochSchedule).swap(epochSchedule);
1938
1939 // Grow schedule vector by each property's number of desired updates.
1940 // Fill this array looping in reverse direction for backward compatibility.
1941 //for (size_t i = 0; i < pk.size(); ++i)
1942 for (int i = pk.size() - 1; i >= 0; --i)
1943 {
1944 epochSchedule.insert(epochSchedule.end(), p[pk.at(i)].numUpdates, i);
1945 }
1946
1947 // Return if there is only a single training property.
1948 if (pk.size() == 1) return;
1949
1950 // Now shuffle the schedule to get a random sequence.
1951 shuffle(epochSchedule.begin(), epochSchedule.end(), rngGlobalNew);
1952
1953 //for (size_t i = 0; i < epochSchedule.size(); ++i)
1954 //{
1955 // log << strpr("%zu %zu\n", i, epochSchedule.at(i));
1956 //}
1957
1958 return;
1959}
1960
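Aside (illustrative sketch, not part of Training.cpp): the epoch schedule is simply a shuffled multiset of property indices, one entry per planned update, so that e.g. energy and force updates interleave randomly over the epoch. A sketch with a fixed seed standing in for rngGlobalNew:

#include <algorithm>
#include <random>
#include <vector>

int main()
{
    std::vector<std::size_t> numUpdates = {3, 9}; // e.g. energy, force
    std::vector<int> schedule;
    for (int i = static_cast<int>(numUpdates.size()) - 1; i >= 0; --i)
    {
        schedule.insert(schedule.end(), numUpdates.at(i), i);
    }
    std::mt19937_64 rng(1234);
    std::shuffle(schedule.begin(), schedule.end(), rng);
    // schedule now holds 3 zeros randomly mixed among 9 ones.
    return 0;
}
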
1961void Training::checkSelectionMode()
1962{
1963 for (auto k : pk)
1964 {
1965 if (p[k].selectionModeSchedule.find(epoch)
1966 != p[k].selectionModeSchedule.end())
1967 {
1968 p[k].selectionMode = p[k].selectionModeSchedule[epoch];
1969 if (epoch != 0)
1970 {
1971 string message = "INFO Switching selection mode for "
1972 "property \"" + k + "\" to ";
1973 if (p[k].selectionMode == SM_RANDOM)
1974 {
1975 message += strpr("SM_RANDOM (%d).\n", p[k].selectionMode);
1976 }
1977 else if (p[k].selectionMode == SM_SORT)
1978 {
1979 message += strpr("SM_SORT (%d).\n", p[k].selectionMode);
1980 }
1981 else if (p[k].selectionMode == SM_THRESHOLD)
1982 {
1983 message += strpr("SM_THRESHOLD (%d).\n",
1984 p[k].selectionMode);
1985 }
1986 log << message;
1987 }
1988 }
1989 }
1990
1991 return;
1992}
1993
1994void Training::loop()
1995{
1996 sw["loop"].start();
1997 log << "\n";
1998 log << "*** TRAINING LOOP ***********************"
1999 "**************************************\n";
2000 log << "\n";
2001 printHeader();
2002
2003 // Calculate initial RMSE and write comparison files.
2004 sw["error"].start();
2005    calculateErrorEpoch();
2006    sw["error"].stop();
2007
2008 // Write initial weights to files.
2009 if (myRank == 0) writeWeightsEpoch();
2010
2011 // Write learning curve.
2012 if (myRank == 0) writeLearningCurve(false);
2013
2014 // Write updater status to file.
2015 if (myRank == 0) writeUpdaterStatus(false);
2016
2017 // Write neuron statistics.
2018    writeNeuronStatisticsEpoch();
2019
2020 // Print timing information.
2021 sw["loop"].stop();
2022 printEpoch();
2023
2024 // Check if training should be continued.
2025 while (advance())
2026 {
2027 sw["loop"].start();
2028
2029 // Increment epoch counter.
2030 epoch++;
2031 log << "------\n";
2032
2033 // Reset update counters.
2034 for (auto k : pk) p[k].countUpdates = 0;
2035
2036 // Check if selection mode should be changed in this epoch.
2037        checkSelectionMode();
2038
2039 // Sort or shuffle update candidates.
2040 for (auto k : pk)
2041 {
2042 if (p[k].selectionMode == SM_SORT) sortUpdateCandidates(k);
2043            else shuffleUpdateCandidates(k);
2044        }
2045
2046 // Determine epoch update schedule.
2047        setEpochSchedule();
2048
2049 // Perform property updates according to schedule.
2050 sw["train"].start();
2051 for (auto i : epochSchedule)
2052 {
2053 string property = pk.at(i);
2054 update(property);
2055 p[property].countUpdates++;
2056 }
2057 sw["train"].stop();
2058
2059 // Reset neuron statistics.
2060        resetNeuronStatistics();
2061
2062 // Calculate errors and write comparison files.
2063 sw["error"].start();
2064        calculateErrorEpoch();
2065        sw["error"].stop();
2066
2067 // Write weights to files.
2068 if (myRank == 0) writeWeightsEpoch();
2069
2070 // Append to learning curve.
2071 if (myRank == 0) writeLearningCurve(true);
2072
2073 // Write updater status to file.
2074 if (myRank == 0) writeUpdaterStatus(true);
2075
2076 // Write neuron statistics.
2078
2079 // Print error overview and timing information.
2080 sw["loop"].stop();
2081 printEpoch();
2082
2083 if (myRank == 0) writeTimingData(epoch != 1);
2084 }
2085
2086 log << "-----------------------------------------"
2087 "--------------------------------------\n";
2088 log << strpr("TIMING Training loop finished: %.2f seconds.\n",
2089 sw["loop"].getTotal());
2090 log << "*****************************************"
2091 "**************************************\n";
2092
2093 return;
2094}
2095
2096void Training::update(string const& property)
2097{
2098 // Shortcuts.
2099 string const& k = property; // Property key.
2100 Property& pu = p[k]; // Update property.
2101 // Start watch for error and jacobian computation, reset loop timer if
2102 // first update in this epoch.
2103 bool newLoop = pu.countUpdates == 0;
2104 sw[k].start(newLoop);
2105 sw[k + "_err"].start(newLoop);
2106
2107#ifdef _OPENMP
2108 int num_threads = omp_get_max_threads();
2109 omp_set_num_threads(1);
2110#endif
2111
2112    ///////////////////////////////////////////////////////////////////////
2113    // PART 1: Find update candidate, compute error fractions and derivatives
2114    ///////////////////////////////////////////////////////////////////////
2115
2116 size_t batchSize = pu.taskBatchSize;
2117 if (batchSize == 0) batchSize = pu.patternsPerUpdate;
2118 bool derivatives = false;
2119 if (k == "force") derivatives = true;
2120
2121 vector<size_t> thresholdLoopCount(batchSize, 0);
2122 vector<double> currentRmseFraction(batchSize, 0.0);
2123 vector<UpdateCandidate*> currentUpdateCandidates(batchSize, NULL);
2124 for (size_t i = 0; i < numUpdaters; ++i)
2125 {
2126 fill(pu.error.at(i).begin(), pu.error.at(i).end(), 0.0);
2127 fill(pu.jacobian.at(i).begin(), pu.jacobian.at(i).end(), 0.0);
2128 }
2129
2130 // Loop over (mini-)batch size.
2131 for (size_t b = 0; b < batchSize; ++b)
2132 {
2133 UpdateCandidate* c = NULL; // Actual current update candidate.
2134 size_t indexBest = 0; // Index of best update candidate so far.
2135 double rmseFractionBest = 0.0; // RMSE of best update candidate so far.
2136
2137 // For SM_THRESHOLD need to loop until candidate's RMSE is above
2138 // threshold. Other modes don't loop here.
2139 size_t trials = 1;
2140 if (pu.selectionMode == SM_THRESHOLD) trials = pu.rmseThresholdTrials;
2141 size_t il = 0;
2142 for (il = 0; il < trials; ++il)
2143 {
2144 // Restart position index if necessary.
2145 if (pu.posUpdateCandidates >= pu.updateCandidates.size())
2146 {
2147 pu.posUpdateCandidates = 0;
2148 }
2149
2150 //log << strpr("pos %zu b %zu size %zu\n", pu.posUpdateCandidates, b, currentUpdateCandidates.size());
2151 // Set current update candidate.
2152 c = &(pu.updateCandidates.at(pu.posUpdateCandidates));
2153 // Keep update candidates (for logging later).
2154 currentUpdateCandidates.at(b) = c;
2155 // Shortcut for current structure.
2156 Structure& s = structures.at(c->s);
2157 // Calculate symmetry functions (if results are already stored
2158 // these functions will return immediately).
2159#ifdef N2P2_NO_SF_GROUPS
2160 calculateSymmetryFunctions(s, derivatives);
2161#else
2162 calculateSymmetryFunctionGroups(s, derivatives);
2163#endif
2164 // For SM_THRESHOLD calculate RMSE of update candidate.
2165 if (pu.selectionMode == SM_THRESHOLD)
2166 {
2167 if (k == "energy")
2168 {
2169                    if (nnpType == NNPType::SHORT_ONLY)
2170                    {
2171 calculateAtomicNeuralNetworks(s, derivatives);
2172 calculateEnergy(s);
2173 currentRmseFraction.at(b) =
2174 fabs(s.energyRef - s.energy)
2175 / (s.numAtoms * pu.errorTrain.at("RMSEpa"));
2176 }
2177 // Assume stage 2.
2178                    else if (nnpType == NNPType::SHORT_CHARGE_NN)
2179                    {
2180 // TODO: Reuse already present charge-NN data and
2181 // compute only short-NN energy contributions.
2182 throw runtime_error("ERROR: Not implemented.\n");
2183 }
2184 }
2185 else if (k == "force")
2186 {
2187                    if (nnpType == NNPType::SHORT_ONLY)
2188                    {
2189 calculateAtomicNeuralNetworks(s, derivatives);
2190 calculateForces(s);
2191 Atom const& a = s.atoms.at(c->a);
2192 currentRmseFraction.at(b) =
2193 fabs(a.fRef[c->c] - a.f[c->c])
2194 / pu.errorTrain.at("RMSE");
2195 }
2196 // Assume stage 2.
2197                    else if (nnpType == NNPType::SHORT_CHARGE_NN)
2198                    {
2199 // TODO: Reuse already present charge-NN data and
2200 // compute only short-NN force contributions.
2201 throw runtime_error("ERROR: Not implemented.\n");
2202 }
2203 }
2204 else if (k == "charge")
2205 {
2206 // Assume NNPType::SHORT_CHARGE_NN stage 1.
2207 // Compute only charge-NN
2208 Atom& a = s.atoms.at(c->a);
2209 NeuralNetwork& nn =
2210 elements.at(a.element).neuralNetworks.at("charge");
2211 nn.setInput(&(a.G.front()));
2212 nn.propagate();
2213 nn.getOutput(&(a.charge));
2214 currentRmseFraction.at(b) = fabs(a.chargeRef - a.charge)
2215 / pu.errorTrain.at("RMSE");
2216 }
2217 // If force RMSE is above threshold stop loop immediately.
2218 if (currentRmseFraction.at(b) > pu.rmseThreshold)
2219 {
2220 // Increment position in update candidate list.
2221                    pu.posUpdateCandidates++;
2222                    break;
2223 }
2224 // If loop continues, free memory and remember best candidate
2225 // so far.
2226 if (freeMemory)
2227 {
2228 s.freeAtoms(true);
2229 }
2230 if (currentRmseFraction.at(b) > rmseFractionBest)
2231 {
2232 rmseFractionBest = currentRmseFraction.at(b);
2233 indexBest = pu.posUpdateCandidates;
2234 }
2235 // Increment position in update candidate list.
2236                pu.posUpdateCandidates++;
2237            }
2238 // Break loop for all selection modes but SM_THRESHOLD.
2239 else if (pu.selectionMode == SM_RANDOM ||
2240 pu.selectionMode == SM_SORT)
2241 {
2242 // Increment position in update candidate list.
2243                pu.posUpdateCandidates++;
2244                break;
2245 }
2246 }
2247 thresholdLoopCount.at(b) = il;
2248
2249 // If loop was not stopped because of a proper update candidate found
2250 // (RMSE above threshold) use best candidate during iteration.
2251 if (pu.selectionMode == SM_THRESHOLD && il == trials)
2252 {
2253 // Set best candidate.
2254            c = &(pu.updateCandidates.at(indexBest));
2255            currentUpdateCandidates.at(b) = c;
2256 currentRmseFraction.at(b) = rmseFractionBest;
2257 // Need to calculate the symmetry functions again, maybe results
2258 // were not stored.
2259 Structure& s = structures.at(c->s);
2260#ifdef N2P2_NO_SF_GROUPS
2261 calculateSymmetryFunctions(s, derivatives);
2262#else
2263 calculateSymmetryFunctionGroups(s, derivatives);
2264#endif
2265 }
2266
2267        ///////////////////////////////////////////////////////////////////
2268        // PART 2: Compute error vector and Jacobian
2269        ///////////////////////////////////////////////////////////////////
2270
2271 Structure& s = structures.at(c->s);
2272 // Temporary storage for derivative contributions of atoms (dXdc stores
2273        // dEdc, dFdc or dQdc for energy, force or charge update, respectively).
2274 vector<vector<double>> dXdc;
2275 dXdc.resize(numElements);
2276 for (size_t i = 0; i < numElements; ++i)
2277 {
2278 size_t n = elements.at(i).neuralNetworks.at(nnId)
2279 .getNumConnections();
2280 dXdc.at(i).resize(n, 0.0);
2281 }
2282 // Precalculate offset in Jacobian array.
2283 size_t iu = 0;
2284 vector<size_t> offset(numElements, 0);
2285 for (size_t i = 0; i < numElements; ++i)
2286 {
2287 if (updateStrategy == US_ELEMENT) iu = i;
2288 else iu = 0;
2289            if (parallelMode == PM_TRAIN_ALL && jacobianMode != JM_SUM)
2290            {
2291 offset.at(i) += pu.offsetPerTask.at(myRank)
2292 * numWeightsPerUpdater.at(iu);
2293 //log << strpr("%zu os 1: %zu ", i, offset.at(i));
2294 }
2295 if (jacobianMode == JM_FULL)
2296 {
2297 offset.at(i) += b * numWeightsPerUpdater.at(iu);
2298 //log << strpr("%zu os 2: %zu ", i, offset.at(i));
2299 }
2300            if (updateStrategy == US_COMBINED)
2301            {
2302 offset.at(i) += weightsOffset.at(i);
2303 //log << strpr("%zu os 3: %zu", i, offset.at(i));
2304 }
2305 //log << strpr(" %zu final os: %zu\n", i, offset.at(i));
2306 }
2307 // Now compute Jacobian.
2308 if (k == "energy")
2309 {
2310            if (nnpType == NNPType::SHORT_ONLY)
2311            {
2312 // Loop over atoms and calculate atomic energy contributions.
2313 for (vector<Atom>::iterator it = s.atoms.begin();
2314 it != s.atoms.end(); ++it)
2315 {
2316 size_t i = it->element;
2317 NeuralNetwork& nn = elements.at(i).neuralNetworks.at(nnId);
2318 nn.setInput(&((it->G).front()));
2319 nn.propagate();
2320 nn.getOutput(&(it->energy));
2321 // Compute derivative of output node with respect to all
2322 // neural network connections (weights + biases).
2323 nn.calculateDEdc(&(dXdc.at(i).front()));
2324 // Finally sum up Jacobian.
2325 if (updateStrategy == US_ELEMENT) iu = i;
2326 else iu = 0;
2327 for (size_t j = 0; j < dXdc.at(i).size(); ++j)
2328 {
2329 pu.jacobian.at(iu).at(offset.at(i) + j) +=
2330 dXdc.at(i).at(j);
2331 }
2332 }
2333 }
2334            else if (nnpType == NNPType::SHORT_CHARGE_NN)
2335            {
2336 throw runtime_error("ERROR: Not implemented.\n");
2337 }
2338 }
2339 else if (k == "force")
2340 {
2341            if (nnpType == NNPType::SHORT_ONLY)
2342            {
2343 // Loop over atoms and calculate atomic energy contributions.
2344 for (vector<Atom>::iterator it = s.atoms.begin();
2345 it != s.atoms.end(); ++it)
2346 {
2347 // For force update save derivative of symmetry function
2348 // with respect to coordinate.
2349#ifndef N2P2_FULL_SFD_MEMORY
2350 collectDGdxia((*it), c->a, c->c);
2351#else
2352 it->collectDGdxia(c->a, c->c);
2353#endif
2354 size_t i = it->element;
2355 NeuralNetwork& nn = elements.at(i).neuralNetworks.at(nnId);
2356 nn.setInput(&((it->G).front()));
2357 nn.propagate();
2358 if (derivatives) nn.calculateDEdG(&((it->dEdG).front()));
2359 nn.getOutput(&(it->energy));
2360 // Compute derivative of output node with respect to all
2361 // neural network connections (weights + biases).
2362#ifndef N2P2_FULL_SFD_MEMORY
2363 nn.calculateDFdc(&(dXdc.at(i).front()),
2364 &(dGdxia.front()));
2365#else
2366 nn.calculateDFdc(&(dXdc.at(i).front()),
2367 &(it->dGdxia.front()));
2368#endif
2369 // Finally sum up Jacobian.
2370 if (updateStrategy == US_ELEMENT) iu = i;
2371 else iu = 0;
2372 for (size_t j = 0; j < dXdc.at(i).size(); ++j)
2373 {
2374 pu.jacobian.at(iu).at(offset.at(i) + j) +=
2375 dXdc.at(i).at(j);
2376 }
2377 }
2378
2379 }
2380            else if (nnpType == NNPType::SHORT_CHARGE_NN)
2381            {
2382 throw runtime_error("ERROR: Not implemented.\n");
2383 }
2384 }
2385 else if (k == "charge")
2386 {
2387 // Assume NNPType::SHORT_CHARGE_NN stage 1.
2388 // Shortcut to selected atom.
2389 Atom& a = s.atoms.at(c->a);
2390 size_t i = a.element;
2391 NeuralNetwork& nn = elements.at(i).neuralNetworks.at(nnId);
2392 nn.setInput(&(a.G.front()));
2393 nn.propagate();
2394 nn.getOutput(&(a.charge));
2395 // Compute derivative of output node with respect to all
2396 // neural network connections (weights + biases).
2397 nn.calculateDEdc(&(dXdc.at(i).front()));
2398 // Finally sum up Jacobian.
2399 if (updateStrategy == US_ELEMENT) iu = i;
2400 else iu = 0;
2401 for (size_t j = 0; j < dXdc.at(i).size(); ++j)
2402 {
2403 pu.jacobian.at(iu).at(offset.at(i) + j) +=
2404 dXdc.at(i).at(j);
2405 }
2406 }
2407
2408 // Sum up total potential energy or calculate force.
2409 if (k == "energy")
2410 {
2411 calculateEnergy(s);
2412 currentRmseFraction.at(b) = fabs(s.energyRef - s.energy)
2413 / (s.numAtoms
2414 * pu.errorTrain.at("RMSEpa"));
2415 }
2416 else if (k == "force")
2417 {
2418 calculateForces(s);
2419 Atom const& a = s.atoms.at(c->a);
2420 currentRmseFraction.at(b) = fabs(a.fRef[c->c] - a.f[c->c])
2421 / pu.errorTrain.at("RMSE");
2422 }
2423 else if (k == "charge")
2424 {
2425 Atom const& a = s.atoms.at(c->a);
2426 currentRmseFraction.at(b) = fabs(a.chargeRef - a.charge)
2427 / pu.errorTrain.at("RMSE");
2428 }
2429
2430 // Now symmetry function memory is not required any more for this
2431 // update.
2432 if (freeMemory) s.freeAtoms(true);
2433
2434 // Precalculate offset in error array.
2435 size_t offset2 = 0;
2436        if (parallelMode == PM_TRAIN_ALL && jacobianMode != JM_SUM)
2437        {
2438 offset2 += pu.offsetPerTask.at(myRank);
2439 //log << strpr("os 4: %zu ", offset2);
2440 }
2441 if (jacobianMode == JM_FULL)
2442 {
2443 offset2 += b;
2444 //log << strpr("os 5: %zu ", offset2);
2445 }
2446 //log << strpr(" final os: %zu\n", offset2);
2447
2448
2449 // Compute error vector (depends on update strategy).
2450        if (updateStrategy == US_COMBINED)
2451        {
2452 if (k == "energy")
2453 {
2454 pu.error.at(0).at(offset2) += s.energyRef - s.energy;
2455 }
2456 else if (k == "force")
2457 {
2458 Atom const& a = s.atoms.at(c->a);
2459 pu.error.at(0).at(offset2) += a.fRef[c->c] - a.f[c->c];
2460 }
2461 else if (k == "charge")
2462 {
2463 Atom const& a = s.atoms.at(c->a);
2464 pu.error.at(0).at(offset2) += a.chargeRef - a.charge;
2465 }
2466 }
2467 else if (updateStrategy == US_ELEMENT)
2468 {
2469 for (size_t i = 0; i < numUpdaters; ++i)
2470 {
2471 if (k == "energy")
2472 {
2473 pu.error.at(i).at(offset2) += (s.energyRef - s.energy)
2474 * s.numAtomsPerElement.at(i)
2475 / s.numAtoms;
2476 }
2477 else if (k == "force")
2478 {
2479 Atom const& a = s.atoms.at(c->a);
2480 pu.error.at(i).at(offset2) += (a.fRef[c->c] - a.f[c->c])
2481 * a.numNeighborsPerElement.at(i)
2482 / a.numNeighbors;
2483 }
2484 else if (k == "charge")
2485 {
2486 Atom const& a = s.atoms.at(c->a);
2487 pu.error.at(i).at(offset2) += a.chargeRef - a.charge;
2488 }
2489 }
2490 }
2491 }
2492
2493 // Apply force update weight to error and Jacobian.
2494 if (k == "force")
2495 {
2496 for (size_t i = 0; i < numUpdaters; ++i)
2497 {
2498 for (size_t j = 0; j < pu.error.at(i).size(); ++j)
2499 {
2500 pu.error.at(i).at(j) *= forceWeight;
2501 }
2502 for (size_t j = 0; j < pu.jacobian.at(i).size(); ++j)
2503 {
2504 pu.jacobian.at(i).at(j) *= forceWeight;
2505 }
2506 }
2507 }
2508 sw[k + "_err"].stop();
2509
2510    ///////////////////////////////////////////////////////////////////////
2511    // PART 3: Communicate error and Jacobian.
2512    ///////////////////////////////////////////////////////////////////////
2513
2514 sw[k + "_com"].start(newLoop);
2515 if (jacobianMode == JM_SUM)
2516 {
2517        if (parallelMode == PM_TRAIN_RK0)
2518        {
2519 for (size_t i = 0; i < numUpdaters; ++i)
2520 {
2521 if (myRank == 0) MPI_Reduce(MPI_IN_PLACE , &(pu.error.at(i).front()), 1, MPI_DOUBLE, MPI_SUM, 0, comm);
2522 else MPI_Reduce(&(pu.error.at(i).front()), &(pu.error.at(i).front()), 1, MPI_DOUBLE, MPI_SUM, 0, comm);
2523 if (myRank == 0) MPI_Reduce(MPI_IN_PLACE , &(pu.jacobian.at(i).front()), numWeightsPerUpdater.at(i), MPI_DOUBLE, MPI_SUM, 0, comm);
2524 else MPI_Reduce(&(pu.jacobian.at(i).front()), &(pu.jacobian.at(i).front()), numWeightsPerUpdater.at(i), MPI_DOUBLE, MPI_SUM, 0, comm);
2525 }
2526 }
2527 else if (parallelMode == PM_TRAIN_ALL)
2528 {
2529 for (size_t i = 0; i < numUpdaters; ++i)
2530 {
2531 MPI_Allreduce(MPI_IN_PLACE, &(pu.error.at(i).front()), 1, MPI_DOUBLE, MPI_SUM, comm);
2532 MPI_Allreduce(MPI_IN_PLACE, &(pu.jacobian.at(i).front()), numWeightsPerUpdater.at(i), MPI_DOUBLE, MPI_SUM, comm);
2533 }
2534 }
2535 }
2536 else if (jacobianMode == JM_TASK)
2537 {
2538        if (parallelMode == PM_TRAIN_RK0)
2539        {
2540 for (size_t i = 0; i < numUpdaters; ++i)
2541 {
2542 if (myRank == 0) MPI_Gather(MPI_IN_PLACE , 1, MPI_DOUBLE, &(pu.error.at(i).front()), 1, MPI_DOUBLE, 0, comm);
2543 else MPI_Gather(&(pu.error.at(i).front()), 1, MPI_DOUBLE, NULL , 1, MPI_DOUBLE, 0, comm);
2544 if (myRank == 0) MPI_Gather(MPI_IN_PLACE , numWeightsPerUpdater.at(i), MPI_DOUBLE, &(pu.jacobian.at(i).front()), numWeightsPerUpdater.at(i), MPI_DOUBLE, 0, comm);
2545 else MPI_Gather(&(pu.jacobian.at(i).front()), numWeightsPerUpdater.at(i), MPI_DOUBLE, NULL , numWeightsPerUpdater.at(i), MPI_DOUBLE, 0, comm);
2546 }
2547 }
2548 else if (parallelMode == PM_TRAIN_ALL)
2549 {
2550 for (size_t i = 0; i < numUpdaters; ++i)
2551 {
2552 MPI_Allgather(MPI_IN_PLACE, 1, MPI_DOUBLE, &(pu.error.at(i).front()), 1, MPI_DOUBLE, comm);
2553 MPI_Allgather(MPI_IN_PLACE, numWeightsPerUpdater.at(i), MPI_DOUBLE, &(pu.jacobian.at(i).front()), numWeightsPerUpdater.at(i), MPI_DOUBLE, comm);
2554 }
2555 }
2556 }
2557 else if (jacobianMode == JM_FULL)
2558 {
2559        if (parallelMode == PM_TRAIN_RK0)
2560        {
2561 for (size_t i = 0; i < numUpdaters; ++i)
2562 {
2563 if (myRank == 0) MPI_Gatherv(MPI_IN_PLACE , 0 , MPI_DOUBLE, &(pu.error.at(i).front()), &(pu.errorsPerTask.front()), &(pu.offsetPerTask.front()), MPI_DOUBLE, 0, comm);
2564 else MPI_Gatherv(&(pu.error.at(i).front()), pu.errorsPerTask.at(myRank), MPI_DOUBLE, NULL , NULL , NULL , MPI_DOUBLE, 0, comm);
2565 if (myRank == 0) MPI_Gatherv(MPI_IN_PLACE , 0 , MPI_DOUBLE, &(pu.jacobian.at(i).front()), &(pu.weightsPerTask.at(i).front()), &(pu.offsetJacobian.at(i).front()), MPI_DOUBLE, 0, comm);
2566 else MPI_Gatherv(&(pu.jacobian.at(i).front()), pu.weightsPerTask.at(i).at(myRank), MPI_DOUBLE, NULL , NULL , NULL , MPI_DOUBLE, 0, comm);
2567 }
2568 }
2569 else if (parallelMode == PM_TRAIN_ALL)
2570 {
2571 for (size_t i = 0; i < numUpdaters; ++i)
2572 {
2573 MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DOUBLE, &(pu.error.at(i).front()), &(pu.errorsPerTask.front()), &(pu.offsetPerTask.front()), MPI_DOUBLE, comm);
2574 MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DOUBLE, &(pu.jacobian.at(i).front()), &(pu.weightsPerTask.at(i).front()), &(pu.offsetJacobian.at(i).front()), MPI_DOUBLE, comm);
2575 }
2576 }
2577 }
2578 sw[k + "_com"].stop();
2579
2580    ///////////////////////////////////////////////////////////////////////
2581    // PART 4: Perform weight update and apply new weights.
2582    ///////////////////////////////////////////////////////////////////////
2583
2584 sw[k + "_upd"].start(newLoop);
2585#ifdef _OPENMP
2586 omp_set_num_threads(num_threads);
2587#endif
2588 // Loop over all updaters.
2589 for (size_t i = 0; i < updaters.size(); ++i)
2590 {
2591 updaters.at(i)->setError(&(pu.error.at(i).front()),
2592 pu.error.at(i).size());
2593 updaters.at(i)->setJacobian(&(pu.jacobian.at(i).front()),
2594 pu.error.at(i).size());
2595 if (updaterType == UT_KF)
2596 {
2597 KalmanFilter* kf = dynamic_cast<KalmanFilter*>(updaters.at(i));
2598 kf->setSizeObservation(pu.error.at(i).size());
2599 }
2600 updaters.at(i)->update();
2601 }
2602 countUpdates++;
2603
2604 // Redistribute weights to all MPI tasks.
2605    if (parallelMode == PM_TRAIN_RK0)
2606    {
2607 for (size_t i = 0; i < numUpdaters; ++i)
2608 {
2609 MPI_Bcast(&(weights.at(i).front()), weights.at(i).size(), MPI_DOUBLE, 0, comm);
2610 }
2611 }
2612
2613 // Set new weights in neural networks.
2614 setWeights();
2615 sw[k + "_upd"].stop();
2616
2617    ///////////////////////////////////////////////////////////////////////
2618    // PART 5: Communicate candidates and RMSE fractions and write log.
2619    ///////////////////////////////////////////////////////////////////////
2620
2621 sw[k + "_log"].start(newLoop);
2622 if (writeTrainingLog)
2623 {
2624 vector<int> procUpdateCandidate;
2625 vector<size_t> indexStructure;
2626 vector<size_t> indexStructureGlobal;
2627 vector<size_t> indexAtom;
2628 vector<size_t> indexCoordinate;
2629
2630 vector<int> currentUpdateCandidatesPerTask;
2631 vector<int> currentUpdateCandidatesOffset;
2632 int myCurrentUpdateCandidates = currentUpdateCandidates.size();
2633
2634 if (myRank == 0)
2635 {
2636 currentUpdateCandidatesPerTask.resize(numProcs, 0);
2637 currentUpdateCandidatesPerTask.at(0) = myCurrentUpdateCandidates;
2638 }
2639 if (myRank == 0) MPI_Gather(MPI_IN_PLACE , 1, MPI_INT, &(currentUpdateCandidatesPerTask.front()), 1, MPI_INT, 0, comm);
2640 else MPI_Gather(&(myCurrentUpdateCandidates), 1, MPI_INT, NULL , 1, MPI_INT, 0, comm);
2641
2642 if (myRank == 0)
2643 {
2644 int totalUpdateCandidates = 0;
2645 for (size_t i = 0; i < currentUpdateCandidatesPerTask.size(); ++i)
2646 {
2647 currentUpdateCandidatesOffset.push_back(totalUpdateCandidates);
2648 totalUpdateCandidates += currentUpdateCandidatesPerTask.at(i);
2649 }
2650 procUpdateCandidate.resize(totalUpdateCandidates, 0);
2651 indexStructure.resize(totalUpdateCandidates, 0);
2652 indexStructureGlobal.resize(totalUpdateCandidates, 0);
2653 indexAtom.resize(totalUpdateCandidates, 0);
2654 indexCoordinate.resize(totalUpdateCandidates, 0);
2655            // Increase size of these vectors (only rank 0).
2656            currentRmseFraction.resize(totalUpdateCandidates, 0.0);
2657            thresholdLoopCount.resize(totalUpdateCandidates, 0);
2658 }
2659 else
2660 {
2661 procUpdateCandidate.resize(myCurrentUpdateCandidates, 0);
2662 indexStructure.resize(myCurrentUpdateCandidates, 0);
2663 indexStructureGlobal.resize(myCurrentUpdateCandidates, 0);
2664 indexAtom.resize(myCurrentUpdateCandidates, 0);
2665 indexCoordinate.resize(myCurrentUpdateCandidates, 0);
2666 }
2667 for (int i = 0; i < myCurrentUpdateCandidates; ++i)
2668 {
2669 procUpdateCandidate.at(i) = myRank;
2670 UpdateCandidate& c = *(currentUpdateCandidates.at(i));
2671 indexStructure.at(i) = c.s;
2672 indexStructureGlobal.at(i) = structures.at(c.s).index;
2673 indexAtom.at(i) = c.a;
2674 indexCoordinate.at(i) = c.c;
2675 }
2676 if (myRank == 0)
2677 {
2678 MPI_Gatherv(MPI_IN_PLACE, 0, MPI_DOUBLE, &(currentRmseFraction.front()) , &(currentUpdateCandidatesPerTask.front()), &(currentUpdateCandidatesOffset.front()), MPI_DOUBLE, 0, comm);
2679 MPI_Gatherv(MPI_IN_PLACE, 0, MPI_SIZE_T, &(thresholdLoopCount.front()) , &(currentUpdateCandidatesPerTask.front()), &(currentUpdateCandidatesOffset.front()), MPI_SIZE_T, 0, comm);
2680 MPI_Gatherv(MPI_IN_PLACE, 0, MPI_INT , &(procUpdateCandidate.front()) , &(currentUpdateCandidatesPerTask.front()), &(currentUpdateCandidatesOffset.front()), MPI_INT , 0, comm);
2681 MPI_Gatherv(MPI_IN_PLACE, 0, MPI_SIZE_T, &(indexStructure.front()) , &(currentUpdateCandidatesPerTask.front()), &(currentUpdateCandidatesOffset.front()), MPI_SIZE_T, 0, comm);
2682 MPI_Gatherv(MPI_IN_PLACE, 0, MPI_SIZE_T, &(indexStructureGlobal.front()), &(currentUpdateCandidatesPerTask.front()), &(currentUpdateCandidatesOffset.front()), MPI_SIZE_T, 0, comm);
2683 MPI_Gatherv(MPI_IN_PLACE, 0, MPI_SIZE_T, &(indexAtom.front()) , &(currentUpdateCandidatesPerTask.front()), &(currentUpdateCandidatesOffset.front()), MPI_SIZE_T, 0, comm);
2684 MPI_Gatherv(MPI_IN_PLACE, 0, MPI_SIZE_T, &(indexCoordinate.front()) , &(currentUpdateCandidatesPerTask.front()), &(currentUpdateCandidatesOffset.front()), MPI_SIZE_T, 0, comm);
2685 }
2686 else
2687 {
2688 MPI_Gatherv(&(currentRmseFraction.front()) , myCurrentUpdateCandidates, MPI_DOUBLE, NULL, NULL, NULL, MPI_DOUBLE, 0, comm);
2689 MPI_Gatherv(&(thresholdLoopCount.front()) , myCurrentUpdateCandidates, MPI_SIZE_T, NULL, NULL, NULL, MPI_SIZE_T, 0, comm);
2690 MPI_Gatherv(&(procUpdateCandidate.front()) , myCurrentUpdateCandidates, MPI_INT , NULL, NULL, NULL, MPI_INT , 0, comm);
2691 MPI_Gatherv(&(indexStructure.front()) , myCurrentUpdateCandidates, MPI_SIZE_T, NULL, NULL, NULL, MPI_SIZE_T, 0, comm);
2692 MPI_Gatherv(&(indexStructureGlobal.front()), myCurrentUpdateCandidates, MPI_SIZE_T, NULL, NULL, NULL, MPI_SIZE_T, 0, comm);
2693 MPI_Gatherv(&(indexAtom.front()) , myCurrentUpdateCandidates, MPI_SIZE_T, NULL, NULL, NULL, MPI_SIZE_T, 0, comm);
2694 MPI_Gatherv(&(indexCoordinate.front()) , myCurrentUpdateCandidates, MPI_SIZE_T, NULL, NULL, NULL, MPI_SIZE_T, 0, comm);
2695 }
2696
2697 if (myRank == 0)
2698 {
2699 for (size_t i = 0; i < procUpdateCandidate.size(); ++i)
2700 {
2701 if (k == "energy")
2702 {
2703 addTrainingLogEntry(procUpdateCandidate.at(i),
2704 thresholdLoopCount.at(i),
2705 currentRmseFraction.at(i),
2706 indexStructureGlobal.at(i),
2707 indexStructure.at(i));
2708 }
2709 else if (k == "force")
2710 {
2711 addTrainingLogEntry(procUpdateCandidate.at(i),
2712 thresholdLoopCount.at(i),
2713 currentRmseFraction.at(i),
2714 indexStructureGlobal.at(i),
2715 indexStructure.at(i),
2716 indexAtom.at(i),
2717 indexCoordinate.at(i));
2718 }
2719 else if (k == "charge")
2720 {
2721 addTrainingLogEntry(procUpdateCandidate.at(i),
2722 thresholdLoopCount.at(i),
2723 currentRmseFraction.at(i),
2724 indexStructureGlobal.at(i),
2725 indexStructure.at(i),
2726 indexAtom.at(i));
2727 }
2728 }
2729 }
2730 }
2731 sw[k + "_log"].stop();
2732 sw[k].stop();
2733
2734 return;
2735}
2736
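Aside (illustrative sketch, not part of Training.cpp): stripped of MPI communication and bookkeeping, one update assembles an error vector e (reference minus prediction) and a Jacobian J of prediction derivatives, then hands both to the updater, which moves the weights. For a plain gradient-descent updater minimizing ½‖e‖² the step is w ← w + η Jᵀe; the GradientDescent and KalmanFilter classes used above implement more elaborate variants of this idea. Minimal sketch (all names hypothetical):

#include <cstddef>
#include <vector>

// One gradient-descent step. J is stored row-major with
// J[k * n + j] = d(prediction k)/d(weight j), e[k] = reference_k - prediction_k.
void gdStep(std::vector<double>&       w,
            std::vector<double> const& e,
            std::vector<double> const& J,
            double                     eta)
{
    std::size_t const n = w.size();
    for (std::size_t k = 0; k < e.size(); ++k)
    {
        for (std::size_t j = 0; j < n; ++j)
        {
            w[j] += eta * J[k * n + j] * e[k];
        }
    }
}
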
2737double Training::getSingleWeight(size_t element, size_t index)
2738{
2739 getWeights();
2740
2741 return weights.at(element).at(index);
2742}
2743
2744void Training::setSingleWeight(size_t element, size_t index, double value)
2745{
2746 weights.at(element).at(index) = value;
2747 setWeights();
2748
2749 return;
2750}
2751
2752vector<
2753vector<double> > Training::calculateWeightDerivatives(Structure* structure)
2754{
2755 Structure& s = *structure;
2756#ifdef N2P2_NO_SF_GROUPS
2757    calculateSymmetryFunctions(s, false);
2758#else
2759    calculateSymmetryFunctionGroups(s, false);
2760#endif
2761
2762 vector<vector<double> > dEdc;
2763 vector<vector<double> > dedc;
2764 dEdc.resize(numElements);
2765 dedc.resize(numElements);
2766 for (size_t i = 0; i < numElements; ++i)
2767 {
2768 size_t n = elements.at(i).neuralNetworks.at("short")
2769 .getNumConnections();
2770 dEdc.at(i).resize(n, 0.0);
2771 dedc.at(i).resize(n, 0.0);
2772 }
2773 for (vector<Atom>::iterator it = s.atoms.begin();
2774 it != s.atoms.end(); ++it)
2775 {
2776 size_t i = it->element;
2777 NeuralNetwork& nn = elements.at(i).neuralNetworks.at("short");
2778 nn.setInput(&((it->G).front()));
2779 nn.propagate();
2780 nn.getOutput(&(it->energy));
2781 nn.calculateDEdc(&(dedc.at(i).front()));
2782 for (size_t j = 0; j < dedc.at(i).size(); ++j)
2783 {
2784 dEdc.at(i).at(j) += dedc.at(i).at(j);
2785 }
2786 }
2787
2788 return dEdc;
2789}
2790
2791// Doxygen requires namespace prefix for arguments...
2792vector<
2793vector<double> > Training::calculateWeightDerivatives(Structure*  structure,
2794                                                      std::size_t atom,
2795 std::size_t component)
2796{
2797 Structure& s = *structure;
2798#ifdef N2P2_NO_SF_GROUPS
2799    calculateSymmetryFunctions(s, true);
2800#else
2801    calculateSymmetryFunctionGroups(s, true);
2802#endif
2803
2804 vector<vector<double> > dFdc;
2805 vector<vector<double> > dfdc;
2806 dFdc.resize(numElements);
2807 dfdc.resize(numElements);
2808 for (size_t i = 0; i < numElements; ++i)
2809 {
2810 size_t n = elements.at(i).neuralNetworks.at("short")
2811 .getNumConnections();
2812 dFdc.at(i).resize(n, 0.0);
2813 dfdc.at(i).resize(n, 0.0);
2814 }
2815 for (vector<Atom>::iterator it = s.atoms.begin();
2816 it != s.atoms.end(); ++it)
2817 {
2818#ifndef N2P2_FULL_SFD_MEMORY
2819 collectDGdxia((*it), atom, component);
2820#else
2821 it->collectDGdxia(atom, component);
2822#endif
2823 size_t i = it->element;
2824 NeuralNetwork& nn = elements.at(i).neuralNetworks.at("short");
2825 nn.setInput(&((it->G).front()));
2826 nn.propagate();
2827 nn.getOutput(&(it->energy));
2828#ifndef N2P2_FULL_SFD_MEMORY
2829 nn.calculateDFdc(&(dfdc.at(i).front()), &(dGdxia.front()));
2830#else
2831 nn.calculateDFdc(&(dfdc.at(i).front()), &(it->dGdxia.front()));
2832#endif
2833 for (size_t j = 0; j < dfdc.at(i).size(); ++j)
2834 {
2835 dFdc.at(i).at(j) += dfdc.at(i).at(j);
2836 }
2837 }
2838
2839 return dFdc;
2840}
2841
2842void Training::setTrainingLogFileName(string fileName)
2843{
2844 trainingLogFileName = fileName;
2845
2846 return;
2847}
2848
2849size_t Training::getNumConnections(string id) const
2850{
2851 size_t n = 0;
2852 for (auto const& e : elements)
2853 {
2854 n += e.neuralNetworks.at(id).getNumConnections();
2855 }
2856
2857 return n;
2858}
2859
2860vector<size_t> Training::getNumConnectionsPerElement(string id) const
2861{
2862 vector<size_t> npe;
2863 for (auto const& e : elements)
2864 {
2865 npe.push_back(e.neuralNetworks.at(id).getNumConnections());
2866 }
2867
2868 return npe;
2869}
2870
2871vector<size_t> Training::getConnectionOffsets(string id) const
2872{
2873 vector<size_t> offset;
2874 size_t n = 0;
2875 for (auto const& e : elements)
2876 {
2877 offset.push_back(n);
2878 n += e.neuralNetworks.at(id).getNumConnections();
2879 }
2880
2881 return offset;
2882}
2883
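Aside (illustrative sketch, not part of Training.cpp): getConnectionOffsets is an exclusive prefix sum over the per-element connection counts, so element e's weights occupy [offset[e], offset[e] + n_e) in a combined weight vector. Worked example with hypothetical counts:

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<std::size_t> perElement = {120, 95, 240};
    std::vector<std::size_t> offset;
    std::size_t n = 0;
    for (std::size_t c : perElement)
    {
        offset.push_back(n); // running sum *before* adding this element
        n += c;
    }
    std::printf("%zu %zu %zu (total %zu)\n",
                offset.at(0), offset.at(1), offset.at(2), n); // 0 120 215 (455)
    return 0;
}
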
2884void Training::dPdc(string property,
2885 Structure& structure,
2886 vector<vector<double>>& dPdc)
2887{
2888 auto npe = getNumConnectionsPerElement();
2889 auto off = getConnectionOffsets();
2890 dPdc.clear();
2891
2892 if (property == "energy")
2893 {
2894 dPdc.resize(1);
2895 dPdc.at(0).resize(getNumConnections(), 0.0);
2896 for (auto const& a : structure.atoms)
2897 {
2898 size_t e = a.element;
2899 NeuralNetwork& nn = elements.at(e).neuralNetworks.at(nnId);
2900 nn.setInput(a.G.data());
2901 nn.propagate();
2902 vector<double> tmp(npe.at(e), 0.0);
2903 nn.calculateDEdc(tmp.data());
2904 for (size_t j = 0; j < tmp.size(); ++j)
2905 {
2906 dPdc.at(0).at(off.at(e) + j) += tmp.at(j);
2907 }
2908 }
2909 }
2910 else if (property == "force")
2911 {
2912 dPdc.resize(3 * structure.numAtoms);
2913 size_t count = 0;
2914 for (size_t ia = 0; ia < structure.numAtoms; ++ia)
2915 {
2916 for (size_t ixyz = 0; ixyz < 3; ++ixyz)
2917 {
2918 dPdc.at(count).resize(getNumConnections(), 0.0);
2919 for (auto& a : structure.atoms)
2920 {
2921#ifndef N2P2_FULL_SFD_MEMORY
2922 collectDGdxia(a, ia, ixyz);
2923#else
2924 a.collectDGdxia(ia, ixyz);
2925#endif
2926 size_t e = a.element;
2927 NeuralNetwork& nn = elements.at(e).neuralNetworks.at(nnId);
2928 nn.setInput(a.G.data());
2929 nn.propagate();
2930 nn.calculateDEdG(a.dEdG.data());
2931 nn.getOutput(&(a.energy));
2932 vector<double> tmp(npe.at(e), 0.0);
2933#ifndef N2P2_FULL_SFD_MEMORY
2934 nn.calculateDFdc(tmp.data(), dGdxia.data());
2935#else
2936 nn.calculateDFdc(tmp.data(), a.dGdxia.data());
2937#endif
2938 for (size_t j = 0; j < tmp.size(); ++j)
2939 {
2940 dPdc.at(count).at(off.at(e) + j) += tmp.at(j);
2941 }
2942 }
2943 count++;
2944 }
2945 }
2946 }
2947 else
2948 {
2949 throw runtime_error("ERROR: Weight derivatives not implemented for "
2950 "property \"" + property + "\".\n");
2951 }
2952
2953 return;
2954}
2955
2956void Training::dPdcN(string property,
2957 Structure& structure,
2958 vector<vector<double>>& dPdc,
2959 double delta)
2960{
2961 auto npe = getNumConnectionsPerElement();
2962 auto off = getConnectionOffsets();
2963 dPdc.clear();
2964
2965 if (property == "energy")
2966 {
2967 dPdc.resize(1);
2968 for (size_t ie = 0; ie < numElements; ++ie)
2969 {
2970 for (size_t ic = 0; ic < npe.at(ie); ++ic)
2971 {
2972 size_t const o = off.at(ie) + ic;
2973 double const w = weights.at(0).at(o);
2974
2975 weights.at(0).at(o) += delta;
2976 setWeights();
2977 calculateAtomicNeuralNetworks(structure, false);
2978 calculateEnergy(structure);
2979 double energyHigh = structure.energy;
2980
2981 weights.at(0).at(o) -= 2.0 * delta;
2982 setWeights();
2983 calculateAtomicNeuralNetworks(structure, false);
2984 calculateEnergy(structure);
2985 double energyLow = structure.energy;
2986
2987 dPdc.at(0).push_back((energyHigh - energyLow) / (2.0 * delta));
2988 weights.at(0).at(o) = w;
2989 }
2990 }
2991 }
2992 else if (property == "force")
2993 {
2994 size_t count = 0;
2995 dPdc.resize(3 * structure.numAtoms);
2996 for (size_t ia = 0; ia < structure.numAtoms; ++ia)
2997 {
2998 for (size_t ixyz = 0; ixyz < 3; ++ixyz)
2999 {
3000 for (size_t ie = 0; ie < numElements; ++ie)
3001 {
3002 for (size_t ic = 0; ic < npe.at(ie); ++ic)
3003 {
3004 size_t const o = off.at(ie) + ic;
3005 double const w = weights.at(0).at(o);
3006
3007 weights.at(0).at(o) += delta;
3008 setWeights();
3009 calculateAtomicNeuralNetworks(structure, true);
3010 calculateForces(structure);
3011 double forceHigh = structure.atoms.at(ia).f[ixyz];
3012
3013 weights.at(0).at(o) -= 2.0 * delta;
3014 setWeights();
3015 calculateAtomicNeuralNetworks(structure, true);
3016 calculateForces(structure);
3017 double forceLow = structure.atoms.at(ia).f[ixyz];
3018
3019 dPdc.at(count).push_back((forceHigh - forceLow)
3020 / (2.0 * delta));
3021 weights.at(0).at(o) = w;
3022 }
3023 }
3024 count++;
3025 }
3026 }
3027 }
3028 else
3029 {
3030 throw runtime_error("ERROR: Numeric weight derivatives not "
3031 "implemented for property \""
3032 + property + "\".\n");
3033 }
3034
3035 return;
3036}
3037
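Aside (illustrative sketch, not part of Training.cpp): dPdcN above checks the analytic derivatives with a symmetric difference quotient, f'(w) ≈ (f(w + δ) − f(w − δ)) / (2δ), perturbing one connection at a time and restoring it afterwards. The generic pattern:

#include <cmath>
#include <functional>

// Central difference; truncation error is O(delta^2).
double centralDiff(std::function<double(double)> const& f,
                   double x,
                   double delta)
{
    return (f(x + delta) - f(x - delta)) / (2.0 * delta);
}

int main()
{
    double d = centralDiff([](double x) { return x * x * x; }, 2.0, 1.0e-5);
    return (std::fabs(d - 12.0) < 1.0e-6) ? 0 : 1; // d/dx x^3 = 3x^2 = 12 at x = 2
}
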
3038bool Training::advance() const
3039{
3040 if (epoch < numEpochs) return true;
3041 else return false;
3042}
3043
3044void Training::getWeights()
3045{
3046    if (updateStrategy == US_COMBINED)
3047    {
3048 size_t pos = 0;
3049 for (size_t i = 0; i < numElements; ++i)
3050 {
3051 NeuralNetwork const& nn = elements.at(i).neuralNetworks.at(nnId);
3052 nn.getConnections(&(weights.at(0).at(pos)));
3053 pos += nn.getNumConnections();
3054 }
3055 }
3056 else if (updateStrategy == US_ELEMENT)
3057 {
3058 for (size_t i = 0; i < numElements; ++i)
3059 {
3060 NeuralNetwork const& nn = elements.at(i).neuralNetworks.at(nnId);
3061 nn.getConnections(&(weights.at(i).front()));
3062 }
3063 }
3064
3065 return;
3066}
3067
3068void Training::setWeights()
3069{
3070    if (updateStrategy == US_COMBINED)
3071    {
3072 size_t pos = 0;
3073 for (size_t i = 0; i < numElements; ++i)
3074 {
3075 NeuralNetwork& nn = elements.at(i).neuralNetworks.at(nnId);
3076 nn.setConnections(&(weights.at(0).at(pos)));
3077 pos += nn.getNumConnections();
3078 }
3079 }
3080 else if (updateStrategy == US_ELEMENT)
3081 {
3082 for (size_t i = 0; i < numElements; ++i)
3083 {
3084 NeuralNetwork& nn = elements.at(i).neuralNetworks.at(nnId);
3085 nn.setConnections(&(weights.at(i).front()));
3086 }
3087 }
3088
3089 return;
3090}
3091
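Aside (illustrative sketch, not part of Training.cpp): with US_COMBINED all element networks share one flat weight vector, and getWeights/setWeights walk it with a running position, as above. The copy-out half of that pattern with a stand-in Net type (hypothetical):

#include <cstddef>
#include <cstring>
#include <vector>

struct Net // stand-in for nnp::NeuralNetwork
{
    std::vector<double> c; // connections (weights + biases)
    std::size_t getNumConnections() const { return c.size(); }
    void getConnections(double* out) const
    {
        std::memcpy(out, c.data(), c.size() * sizeof(double));
    }
};

int main()
{
    std::vector<Net> nets = {{{1, 2}}, {{3, 4, 5}}};
    std::vector<double> combined(5);
    std::size_t pos = 0;
    for (auto const& nn : nets) // getWeights() analogue
    {
        nn.getConnections(&combined.at(pos));
        pos += nn.getNumConnections();
    }
    // combined = {1, 2, 3, 4, 5}
    return 0;
}
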
3092// Doxygen requires namespace prefix for arguments...
3093void Training::addTrainingLogEntry(int         proc,
3094                                   std::size_t il,
3095 double f,
3096 std::size_t isg,
3097 std::size_t is)
3098{
3099 string s = strpr(" E %5zu %10zu %5d %3zu %10.2E %10zu %5zu\n",
3100 epoch, countUpdates, proc, il + 1, f, isg, is);
3101 trainingLog << s;
3102
3103 return;
3104}
3105
3106// Doxygen requires namespace prefix for arguments...
3107void Training::addTrainingLogEntry(int         proc,
3108                                   std::size_t il,
3109 double f,
3110 std::size_t isg,
3111 std::size_t is,
3112 std::size_t ia,
3113 std::size_t ic)
3114{
3115 string s = strpr(" F %5zu %10zu %5d %3zu %10.2E %10zu %5zu %5zu %2zu\n",
3116 epoch, countUpdates, proc, il + 1, f, isg, is, ia, ic);
3117 trainingLog << s;
3118
3119 return;
3120}
3121
3122// Doxygen requires namespace prefix for arguments...
3123void Training::addTrainingLogEntry(int         proc,
3124                                   std::size_t il,
3125 double f,
3126 std::size_t isg,
3127 std::size_t is,
3128 std::size_t ia)
3129{
3130 string s = strpr(" Q %5zu %10zu %5d %3zu %10.2E %10zu %5zu %5zu\n",
3131 epoch, countUpdates, proc, il + 1, f, isg, is, ia);
3132 trainingLog << s;
3133
3134 return;
3135}
3136
3137#ifndef N2P2_FULL_SFD_MEMORY
3138void Training::collectDGdxia(Atom const& atom,
3139                             size_t      indexAtom,
3140 size_t indexComponent)
3141{
3142 size_t const nsf = atom.numSymmetryFunctions;
3143
3144 // Reset dGdxia array.
3145 dGdxia.clear();
3146 vector<double>(dGdxia).swap(dGdxia);
3147 dGdxia.resize(nsf, 0.0);
3148
3149 vector<vector<size_t> > const& tableFull
3150 = elements.at(atom.element).getSymmetryFunctionTable();
3151
3152 for (size_t i = 0; i < atom.numNeighbors; i++)
3153 {
3154 if (atom.neighbors[i].index == indexAtom)
3155 {
3156 Atom::Neighbor const& n = atom.neighbors[i];
3157 vector<size_t> const& table = tableFull.at(n.element);
3158 for (size_t j = 0; j < n.dGdr.size(); ++j)
3159 {
3160 dGdxia[table.at(j)] += n.dGdr[j][indexComponent];
3161 }
3162 }
3163 }
3164 if (atom.index == indexAtom)
3165 {
3166 for (size_t i = 0; i < nsf; ++i)
3167 {
3168 dGdxia[i] += atom.dGdr[i][indexComponent];
3169 }
3170 }
3171
3172 return;
3173}
3174#endif
3175
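Aside (illustrative sketch, not part of Training.cpp): collectDGdxia gathers, for one fixed Cartesian coordinate x of atom i, the derivatives dG_j/dx from the atom's own and its neighbors' stored blocks; the force component then follows from the chain rule F = −Σ_j (∂E/∂G_j)(∂G_j/∂x). The final contraction, as a sketch:

#include <cstddef>
#include <vector>

// Chain rule for one coordinate: combine dE/dG with the collected dG/dx.
double dEdx(std::vector<double> const& dEdG,
            std::vector<double> const& dGdx)
{
    double d = 0.0;
    for (std::size_t j = 0; j < dEdG.size(); ++j)
    {
        d += dEdG.at(j) * dGdx.at(j);
    }
    return d; // the force component is the negative of this sum
}
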
3176void Training::randomizeNeuralNetworkWeights(string const& type)
3177{
3178 string keywordNW = "";
3179 if (type == "short" ) keywordNW = "nguyen_widrow_weights_short";
3180 else if (type == "charge") keywordNW = "nguyen_widrow_weights_charge";
3181 else
3182 {
3183 throw runtime_error("ERROR: Unknown neural network type.\n");
3184 }
3185
3186 double minWeights = atof(settings["weights_min"].c_str());
3187 double maxWeights = atof(settings["weights_max"].c_str());
3188 log << strpr("Initial weights selected randomly in interval "
3189 "[%f, %f).\n", minWeights, maxWeights);
3190 vector<double> w;
3191 for (size_t i = 0; i < numElements; ++i)
3192 {
3193 NeuralNetwork& nn = elements.at(i).neuralNetworks.at(type);
3194 w.resize(nn.getNumConnections(), 0);
3195 for (size_t j = 0; j < w.size(); ++j)
3196 {
3197 w.at(j) = minWeights + gsl_rng_uniform(rngGlobal)
3198 * (maxWeights - minWeights);
3199 }
3200 nn.setConnections(&(w.front()));
3201 }
3202 if (settings.keywordExists(keywordNW))
3203 {
3204 log << "Weights modified according to Nguyen Widrow scheme.\n";
3205 for (vector<Element>::iterator it = elements.begin();
3206 it != elements.end(); ++it)
3207 {
3208 NeuralNetwork& nn = it->neuralNetworks.at(type);
3209            nn.modifyConnections(NeuralNetwork::MS_NGUYENWIDROW);
3210        }
3211 }
3212 else if (settings.keywordExists("precondition_weights"))
3213 {
3214 throw runtime_error("ERROR: Preconditioning of weights not yet"
3215 " implemented.\n");
3216 }
3217 else
3218 {
3219        log << "Weights modified according to Glorot Bengio scheme.\n";
3220 //log << "Weights connected to output layer node set to zero.\n";
3221 log << "Biases set to zero.\n";
3222 for (vector<Element>::iterator it = elements.begin();
3223 it != elements.end(); ++it)
3224 {
3225 NeuralNetwork& nn = it->neuralNetworks.at(type);
3226            nn.modifyConnections(NeuralNetwork::MS_GLOROTBENGIO);
3227            //nn->modifyConnections(NeuralNetwork::MS_ZEROOUTPUTWEIGHTS);
3228            nn.modifyConnections(NeuralNetwork::MS_ZEROBIAS);
3229        }
3230 }
3231
3232 return;
3233}
3234
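Aside (illustrative sketch, not part of Training.cpp): the initialization above draws every connection uniformly from [weights_min, weights_max) before an optional Nguyen-Widrow or Glorot-Bengio modification. An equivalent draw using the standard library instead of GSL:

#include <random>
#include <vector>

int main()
{
    double const wMin = -1.0; // weights_min analogue
    double const wMax =  1.0; // weights_max analogue
    std::mt19937_64 rng(42);  // stand-in for the global, equally seeded RNG
    std::uniform_real_distribution<double> u(wMin, wMax); // samples [wMin, wMax)
    std::vector<double> w(100);
    for (auto& x : w) x = u(rng);
    return 0;
}
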
3235void Training::setupSelectionMode(string const& property)
3236{
3237 bool all = (property == "all");
3238 bool isProperty = (find(pk.begin(), pk.end(), property) != pk.end());
3239 if (!(all || isProperty))
3240 {
3241 throw runtime_error("ERROR: Unknown property for selection mode"
3242 " setup.\n");
3243 }
3244
3245 if (all)
3246 {
3247 if (!(settings.keywordExists("selection_mode") ||
3248 settings.keywordExists("rmse_threshold") ||
3249 settings.keywordExists("rmse_threshold_trials"))) return;
3250 log << "Global selection mode settings:\n";
3251 }
3252 else
3253 {
3254 if (!(settings.keywordExists("selection_mode_" + property) ||
3255 settings.keywordExists("rmse_threshold_" + property) ||
3256 settings.keywordExists("rmse_threshold_trials_"
3257 + property))) return;
3258 log << "Selection mode settings specific to property \""
3259 << property << "\":\n";
3260 }
3261 string keyword;
3262 if (all) keyword = "selection_mode";
3263 else keyword = "selection_mode_" + property;
3264
3265 if (settings.keywordExists(keyword))
3266 {
3267 map<size_t, SelectionMode> schedule;
3268 vector<string> args = split(settings[keyword]);
3269 if (args.size() % 2 != 1)
3270 {
3271 throw runtime_error("ERROR: Incorrect selection mode format.\n");
3272 }
3273 schedule[0] = (SelectionMode)atoi(args.at(0).c_str());
3274 for (size_t i = 1; i < args.size(); i = i + 2)
3275 {
3276 schedule[(size_t)atoi(args.at(i).c_str())] =
3277 (SelectionMode)atoi(args.at(i + 1).c_str());
3278 }
3279 for (map<size_t, SelectionMode>::const_iterator it = schedule.begin();
3280 it != schedule.end(); ++it)
3281 {
3282 log << strpr("- Selection mode starting with epoch %zu:\n",
3283 it->first);
3284 if (it->second == SM_RANDOM)
3285 {
3286 log << strpr(" Random selection of update candidates: "
3287 "SelectionMode::SM_RANDOM (%d)\n", it->second);
3288 }
3289 else if (it->second == SM_SORT)
3290 {
3291 log << strpr(" Update candidates selected according to error: "
3292 "SelectionMode::SM_SORT (%d)\n", it->second);
3293 }
3294 else if (it->second == SM_THRESHOLD)
3295 {
3296 log << strpr(" Update candidates chosen randomly above RMSE "
3297 "threshold: SelectionMode::SM_THRESHOLD (%d)\n",
3298 it->second);
3299 }
3300 else
3301 {
3302 throw runtime_error("ERROR: Unknown selection mode.\n");
3303 }
3304 }
3305 if (all)
3306 {
3307 for (auto& i : p)
3308 {
3309 i.second.selectionModeSchedule = schedule;
3310 i.second.selectionMode = schedule[0];
3311 }
3312 }
3313 else
3314 {
3315 p[property].selectionModeSchedule = schedule;
3316 p[property].selectionMode = schedule[0];
3317 }
3318 }
3319
3320 if (all) keyword = "rmse_threshold";
3321 else keyword = "rmse_threshold_" + property;
3322 if (settings.keywordExists(keyword))
3323 {
3324 double t = atof(settings[keyword].c_str());
3325 log << strpr("- RMSE selection threshold: %.2f * RMSE\n", t);
3326 if (all) for (auto& i : p) i.second.rmseThreshold = t;
3327 else p[property].rmseThreshold = t;
3328 }
3329
3330 if (all) keyword = "rmse_threshold_trials";
3331 else keyword = "rmse_threshold_trials_" + property;
3332 if (settings.keywordExists(keyword))
3333 {
3334 size_t t = atoi(settings[keyword].c_str());
3335 log << strpr("- RMSE selection trials : %zu\n", t);
3336 if (all) for (auto& i : p) i.second.rmseThresholdTrials = t;
3337 else p[property].rmseThresholdTrials = t;
3338 }
3339
3340 return;
3341}
3342
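Aside (illustrative sketch, not part of Training.cpp): the selection_mode keyword takes an odd number of tokens, an initial mode followed by epoch/mode pairs, and is parsed into a map keyed by starting epoch, as above. A standalone version of that parse (values hypothetical):

#include <cstddef>
#include <cstdlib>
#include <map>
#include <string>
#include <vector>

int main()
{
    // E.g. "selection_mode 2 10 0 20 2": start in mode 2,
    // switch to mode 0 at epoch 10 and back to mode 2 at epoch 20.
    std::vector<std::string> args = {"2", "10", "0", "20", "2"};
    std::map<std::size_t, int> schedule;
    schedule[0] = std::atoi(args.at(0).c_str());
    for (std::size_t i = 1; i < args.size(); i += 2)
    {
        schedule[std::atoi(args.at(i).c_str())]
            = std::atoi(args.at(i + 1).c_str());
    }
    return 0; // schedule = {0: 2, 10: 0, 20: 2}
}
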
3343void Training::setupFileOutput(string const& type)
3344{
3345 string keyword = "write_";
3346 bool isProperty = (find(pk.begin(), pk.end(), type) != pk.end());
3347 if (type == "energy" ) keyword += "trainpoints";
3348 else if (type == "force" ) keyword += "trainforces";
3349 else if (type == "charge" ) keyword += "traincharges";
3350 else if (type == "weights_epoch") keyword += type;
3351 else if (type == "neuronstats" ) keyword += type;
3352 else
3353 {
3354 throw runtime_error("ERROR: Invalid type for file output setup.\n");
3355 }
3356
3357    // Check how often these output files should be written.
3358 if (settings.keywordExists(keyword))
3359 {
3360 size_t* writeEvery = nullptr;
3361 size_t* writeAlways = nullptr;
3362 string message;
3363 if (isProperty)
3364 {
3365 writeEvery = &(p[type].writeCompEvery);
3366 writeAlways = &(p[type].writeCompAlways);
3367 message = "Property \"" + type + "\" comparison";
3368 message.at(0) = toupper(message.at(0));
3369 }
3370 else if (type == "weights_epoch")
3371 {
3372 writeEvery = &writeWeightsEvery;
3373 writeAlways = &writeWeightsAlways;
3374 message = "Weight";
3375 }
3376 else if (type == "neuronstats")
3377 {
3378 writeEvery = &writeNeuronStatisticsEvery;
3379 writeAlways = &writeNeuronStatisticsAlways;
3380 message = "Neuron statistics";
3381 }
3382
3383 *writeEvery = 1;
3384 vector<string> v = split(reduce(settings[keyword]));
3385 if (v.size() == 1) *writeEvery = (size_t)atoi(v.at(0).c_str());
3386 else if (v.size() == 2)
3387 {
3388 *writeEvery = (size_t)atoi(v.at(0).c_str());
3389 *writeAlways = (size_t)atoi(v.at(1).c_str());
3390 }
3391 log << strpr((message
3392 + " files will be written every %zu epochs.\n").c_str(),
3393 *writeEvery);
3394 if (*writeAlways > 0)
3395 {
3396 log << strpr((message
3397 + " files will always be written up to epoch "
3398 "%zu.\n").c_str(), *writeAlways);
3399 }
3400 }
3401
3402 return;
3403}
3404
3405void Training::setupUpdatePlan(string const& property)
3406{
3407 bool isProperty = (find(pk.begin(), pk.end(), property) != pk.end());
3408 if (!isProperty)
3409 {
3410 throw runtime_error("ERROR: Unknown property for update plan"
3411 " setup.\n");
3412 }
3413
3414 // Actual property modified here.
3415 Property& pa = p[property];
3416 string keyword = property + "_fraction";
3417
3418    // Override force fraction if keyword "force_energy_ratio" is provided.
3419 if (property == "force" &&
3420 p.exists("energy") &&
3421 settings.keywordExists("force_energy_ratio"))
3422 {
3423 double const ratio = atof(settings["force_energy_ratio"].c_str());
3424 if (settings.keywordExists(keyword))
3425 {
3426 log << "WARNING: Given force fraction is ignored because "
3427 "force/energy ratio is provided.\n";
3428 }
3429 log << strpr("Desired force/energy update ratio : %.6f\n",
3430 ratio);
3431 log << "----------------------------------------------\n";
3432 pa.epochFraction = (p["energy"].numTrainPatterns * ratio)
3433 / p["force"].numTrainPatterns;
3434 }
3435 // Default action = read "<property>_fraction" keyword.
3436 else
3437 {
3438 pa.epochFraction = atof(settings[keyword].c_str());
3439 }
3440
3441 keyword = "task_batch_size_" + property;
3442 pa.taskBatchSize = (size_t)atoi(settings[keyword].c_str());
3443 if (pa.taskBatchSize == 0)
3444 {
3445        pa.patternsPerUpdate =
3446            static_cast<size_t>(pa.updateCandidates.size() * pa.epochFraction);
3447 pa.numUpdates = 1;
3448 }
3449 else
3450 {
3451        pa.patternsPerUpdate = pa.taskBatchSize;
3452        pa.numUpdates =
3453 static_cast<size_t>((pa.numTrainPatterns * pa.epochFraction)
3454 / pa.taskBatchSize / numProcs);
3455 }
3456    pa.patternsPerUpdateGlobal = pa.patternsPerUpdate;
3457    MPI_Allreduce(MPI_IN_PLACE, &(pa.patternsPerUpdateGlobal), 1, MPI_SIZE_T, MPI_SUM, comm);
3458 pa.errorsPerTask.resize(numProcs, 0);
3459 if (jacobianMode == JM_FULL)
3460 {
3461 pa.errorsPerTask.at(myRank) = static_cast<int>(pa.patternsPerUpdate);
3462 }
3463 else
3464 {
3465 pa.errorsPerTask.at(myRank) = 1;
3466 }
3467 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, &(pa.errorsPerTask.front()), 1, MPI_INT, comm);
3468 if (jacobianMode == JM_FULL)
3469 {
3470 pa.weightsPerTask.resize(numUpdaters);
3471 for (size_t i = 0; i < numUpdaters; ++i)
3472 {
3473 pa.weightsPerTask.at(i).resize(numProcs, 0);
3474 for (int j = 0; j < numProcs; ++j)
3475 {
3476 pa.weightsPerTask.at(i).at(j) = pa.errorsPerTask.at(j)
3477 * numWeightsPerUpdater.at(i);
3478 }
3479 }
3480 }
3481 pa.numErrorsGlobal = 0;
3482 for (size_t i = 0; i < pa.errorsPerTask.size(); ++i)
3483 {
3484 pa.offsetPerTask.push_back(pa.numErrorsGlobal);
3485 pa.numErrorsGlobal += pa.errorsPerTask.at(i);
3486 }
3487 pa.offsetJacobian.resize(numUpdaters);
3488 for (size_t i = 0; i < numUpdaters; ++i)
3489 {
3490 for (size_t j = 0; j < pa.offsetPerTask.size(); ++j)
3491 {
3492 pa.offsetJacobian.at(i).push_back(pa.offsetPerTask.at(j) *
3493 numWeightsPerUpdater.at(i));
3494 }
3495 }
3496 log << "Update plan for property \"" + property + "\":\n";
3497 log << strpr("- Per-task batch size : %zu\n",
3498 pa.taskBatchSize);
3499 log << strpr("- Fraction of patterns used per epoch : %.6f\n",
3500 pa.epochFraction);
3501 if (pa.numUpdates == 0)
3502 {
3503        log << "WARNING: No updates are planned for this property.\n";
3504 }
3505 log << strpr("- Updates per epoch : %zu\n",
3506 pa.numUpdates);
3507 log << strpr("- Patterns used per update (rank %3d / global) : "
3508 "%10zu / %zu\n",
3509                 myRank, pa.patternsPerUpdate, pa.patternsPerUpdateGlobal);
3510    log << "----------------------------------------------\n";
3511
3512 return;
3513}
3514
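Aside (illustrative sketch, not part of Training.cpp): with a nonzero task batch size the number of updates per epoch follows directly from numUpdates = numTrainPatterns * epochFraction / (taskBatchSize * numProcs). Worked example with hypothetical values:

#include <cstddef>
#include <cstdio>

int main()
{
    std::size_t numTrainPatterns = 12000;
    double      epochFraction    = 0.3; // e.g. a force_fraction analogue
    std::size_t taskBatchSize    = 10;
    int         numProcs         = 4;
    std::size_t numUpdates       = static_cast<std::size_t>(
        (numTrainPatterns * epochFraction) / taskBatchSize / numProcs);
    std::printf("%zu updates per epoch\n", numUpdates); // 90
    return 0;
}
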
3515void Training::allocateArrays(string const& property)
3516{
3517 bool isProperty = (find(pk.begin(), pk.end(), property) != pk.end());
3518 if (!isProperty)
3519 {
3520 throw runtime_error("ERROR: Unknown property for array allocation.\n");
3521 }
3522
3523 log << "Allocating memory for " + property +
3524 " error vector and Jacobian.\n";
3525 Property& pa = p[property];
3526 pa.error.resize(numUpdaters);
3527 pa.jacobian.resize(numUpdaters);
3528 for (size_t i = 0; i < numUpdaters; ++i)
3529 {
3530 size_t size = 1;
3531 if (( parallelMode == PM_TRAIN_ALL ||
3532 (parallelMode == PM_TRAIN_RK0 && myRank == 0)) &&
3533            jacobianMode != JM_SUM)
3534        {
3535 size *= pa.numErrorsGlobal;
3536 }
3537 else if ((parallelMode == PM_TRAIN_RK0 && myRank != 0) &&
3538                 jacobianMode != JM_SUM)
3539        {
3540 size *= pa.errorsPerTask.at(myRank);
3541 }
3542 pa.error.at(i).resize(size, 0.0);
3543 pa.jacobian.at(i).resize(size * numWeightsPerUpdater.at(i), 0.0);
3544 log << strpr("Updater %3zu:\n", i);
3545 log << strpr(" - Error size: %zu\n", pa.error.at(i).size());
3546 log << strpr(" - Jacobian size: %zu\n", pa.jacobian.at(i).size());
3547 }
3548 log << "----------------------------------------------\n";
3549
3550 return;
3551}
3552
3553void Training::writeTimingData(bool append, string const fileName)
3554{
3555 ofstream file;
3556 string fileNameActual = fileName;
3557    if (nnpType == NNPType::SHORT_CHARGE_NN)
3558    {
3559 fileNameActual += strpr(".stage-%zu", stage);
3560 }
3561
3562 vector<string> sub = {"_err", "_com", "_upd", "_log"};
3563 if (append) file.open(fileNameActual.c_str(), ofstream::app);
3564 else
3565 {
3566 file.open(fileNameActual.c_str());
3567
3568 // File header.
3569 vector<string> title;
3570 vector<string> colName;
3571 vector<string> colInfo;
3572 vector<size_t> colSize;
3573 title.push_back("Timing data for training loop.");
3574 colSize.push_back(10);
3575 colName.push_back("epoch");
3576 colInfo.push_back("Current epoch.");
3577 colSize.push_back(11);
3578 colName.push_back("train");
3579 colInfo.push_back("Time for training.");
3580 colSize.push_back(7);
3581 colName.push_back("ptrain");
3582 colInfo.push_back("Time for training (percentage of loop).");
3583 colSize.push_back(11);
3584 colName.push_back("error");
3585 colInfo.push_back("Time for error calculation.");
3586 colSize.push_back(7);
3587 colName.push_back("perror");
3588 colInfo.push_back("Time for error calculation (percentage of loop).");
3589 colSize.push_back(11);
3590 colName.push_back("epoch");
3591 colInfo.push_back("Time for this epoch.");
3592 colSize.push_back(11);
3593 colName.push_back("total");
3594 colInfo.push_back("Total time for all epochs.");
3595 for (auto k : pk)
3596 {
3597 colSize.push_back(11);
3598 colName.push_back(p[k].tiny + "train");
3599 colInfo.push_back("");
3600 colSize.push_back(7);
3601 colName.push_back(p[k].tiny + "ptrain");
3602 colInfo.push_back("");
3603 }
3604 for (auto s : sub)
3605 {
3606 for (auto k : pk)
3607 {
3608 colSize.push_back(11);
3609 colName.push_back(p[k].tiny + s);
3610 colInfo.push_back("");
3611 colSize.push_back(7);
3612 colName.push_back(p[k].tiny + "p" + s);
3613 colInfo.push_back("");
3614 }
3615 }
3616 appendLinesToFile(file,
3617 createFileHeader(title, colSize, colName, colInfo));
3618 }
3619
3620 double timeLoop = sw["loop"].getLoop();
3621 file << strpr("%10zu", epoch);
3622 file << strpr(" %11.3E", sw["train"].getLoop());
3623 file << strpr(" %7.3f", sw["train"].getLoop() / timeLoop);
3624 file << strpr(" %11.3E", sw["error"].getLoop());
3625 file << strpr(" %7.3f", sw["error"].getLoop() / timeLoop);
3626 file << strpr(" %11.3E", timeLoop);
3627 file << strpr(" %11.3E", sw["loop"].getTotal());
3628
3629 for (auto k : pk)
3630 {
3631 file << strpr(" %11.3E", sw[k].getLoop());
3632 file << strpr(" %7.3f", sw[k].getLoop() / sw["train"].getLoop());
3633 }
3634 for (auto s : sub)
3635 {
3636 for (auto k : pk)
3637 {
3638 file << strpr(" %11.3E", sw[k + s].getLoop());
3639 file << strpr(" %7.3f", sw[k + s].getLoop() / sw[k].getLoop());
3640 }
3641 }
3642 file << "\n";
3643
3644 file.flush();
3645 file.close();
3646
3647 return;
3648}
3649
3650Training::Property::Property(string const& property) :
3651 property (property ),
3652 displayMetric ("" ),
3653 tiny ("" ),
3654 plural ("" ),
3655 selectionMode (SM_RANDOM),
3656 numTrainPatterns (0 ),
3657 numTestPatterns (0 ),
3658 taskBatchSize (0 ),
3659 writeCompEvery (0 ),
3660 writeCompAlways (0 ),
3661 posUpdateCandidates (0 ),
3662 rmseThresholdTrials (0 ),
3663 countUpdates (0 ),
3664 numUpdates (0 ),
3665 patternsPerUpdate (0 ),
3666 patternsPerUpdateGlobal(0 ),
3667 numErrorsGlobal (0 ),
3668 epochFraction (0.0 ),
3669 rmseThreshold (0.0 )
3670{
3671 if (property == "energy")
3672 {
3673 tiny = "E";
3674 plural = "energies";
3675 errorMetrics = {"RMSEpa", "RMSE", "MAEpa", "MAE"};
3676 }
3677 else if (property == "force")
3678 {
3679 tiny = "F";
3680 plural = "forces";
3681 errorMetrics = {"RMSE", "MAE"};
3682 }
3683 else if (property == "charge")
3684 {
3685 tiny = "Q";
3686 plural = "charges";
3687 errorMetrics = {"RMSE", "MAE"};
3688 }
3689 else
3690 {
3691 throw runtime_error("ERROR: Unknown training property.\n");
3692 }
3693
3694 // Set up error metrics
3695 for (auto m : errorMetrics)
3696 {
3697 errorTrain[m] = 0.0;
3698 errorTest[m] = 0.0;
3699 }
3701}
Collect and process large data sets.
Definition: Dataset.h:35
std::size_t numStructures
Total number of structures in dataset.
Definition: Dataset.h:203
gsl_rng * rng
GSL random number generator (different seed for each MPI process).
Definition: Dataset.h:209
MPI_Comm comm
Global MPI communicator.
Definition: Dataset.h:207
int numProcs
Total number of MPI processors.
Definition: Dataset.h:201
std::vector< Structure > structures
All structures in this dataset.
Definition: Dataset.h:195
void combineFiles(std::string filePrefix) const
Combine individual MPI proc files to one.
Definition: Dataset.cpp:1742
int myRank
My process ID.
Definition: Dataset.h:199
gsl_rng * rngGlobal
Global GSL random number generator (equal seed for each MPI process).
Definition: Dataset.h:211
void collectError(std::string const &property, std::map< std::string, double > &error, std::size_t &count) const
Collect error metrics of a property over all MPI procs.
Definition: Dataset.cpp:1710
std::size_t atomicNumber(std::size_t index) const
Get atomic number from element index.
Definition: ElementMap.h:145
Weight updates based on simple gradient descent methods.
void setParametersFixed(double const eta)
Set parameters for fixed step gradient descent algorithm.
void setParametersAdam(double const eta, double const beta1, double const beta2, double const epsilon)
Set parameters for Adam algorithm.
DescentType
Enumerate different gradient descent variants.
@ DT_ADAM
Adaptive moment estimation (Adam).
@ DT_FIXED
Fixed step size.
Implementation of the Kalman filter method.
Definition: KalmanFilter.h:32
void setParametersStandard(double const epsilon, double const q0, double const qtau, double const qmin, double const eta0, double const etatau, double const etamax)
Set parameters for standard Kalman filter.
void setParametersFadingMemory(double const epsilon, double const q0, double const qtau, double const qmin, double const lambda, double const nu)
Set parameters for fading memory Kalman filter.
KalmanType
Enumerate different Kalman filter types.
Definition: KalmanFilter.h:36
@ KT_STANDARD
Regular Kalman filter.
Definition: KalmanFilter.h:38
@ KT_FADINGMEMORY
Kalman filtering with fading memory modification.
Definition: KalmanFilter.h:40
void setSizeObservation(std::size_t const sizeObservation)
Set observation vector size.
bool silent
Completely silence output.
Definition: Log.h:87
bool normalize
Definition: Mode.h:514
double convEnergy
Definition: Mode.h:523
void setupNormalization(bool standalone=true)
Set up normalization.
Definition: Mode.cpp:218
ElementMap elementMap
Global element map, populated by setupElementMap().
Definition: Mode.h:508
NNPType nnpType
Definition: Mode.h:513
void addEnergyOffset(Structure &structure, bool ref=true)
Add atomic energy offsets to reference energy.
Definition: Mode.cpp:1461
double convLength
Definition: Mode.h:524
virtual void setupSymmetryFunctionCache(bool verbose=false)
Set up symmetry function cache.
Definition: Mode.cpp:804
double maxCutoffRadius
Definition: Mode.h:520
@ SHORT_CHARGE_NN
Short range NNP with additional charge NN (M. Bircher).
@ SHORT_ONLY
Short range NNP only.
void readNeuralNetworkWeights(std::string const &type, std::string const &fileName)
Read in weights for a specific type of neural network.
Definition: Mode.cpp:1699
double meanEnergy
Definition: Mode.h:522
double physical(std::string const &property, double value) const
Undo normalization for a given property.
Definition: Mode.cpp:1549
virtual void setupSymmetryFunctionGroups()
Set up symmetry function groups.
Definition: Mode.cpp:662
std::vector< Element > elements
Definition: Mode.h:528
std::size_t numElements
Definition: Mode.h:517
void calculateAtomicNeuralNetworks(Structure &structure, bool const derivatives)
Calculate a single atomic neural network for a given atom and nn type.
Definition: Mode.cpp:1310
void calculateEnergy(Structure &structure) const
Calculate potential energy for a given structure.
Definition: Mode.cpp:1366
void calculateSymmetryFunctionGroups(Structure &structure, bool const derivatives)
Calculate all symmetry function groups for all atoms in given structure.
Definition: Mode.cpp:1230
virtual void setupSymmetryFunctionScaling(std::string const &fileName="scaling.data")
Set up symmetry function scaling from file.
Definition: Mode.cpp:532
void removeEnergyOffset(Structure &structure, bool ref=true)
Remove atomic energy offsets from reference energy.
Definition: Mode.cpp:1480
Log log
Global log file.
Definition: Mode.h:510
double getEnergyWithOffset(Structure const &structure, bool ref=true) const
Add atomic energy offsets and return energy.
Definition: Mode.cpp:1512
Settings settings
Definition: Mode.h:525
virtual void setupSymmetryFunctions()
Set up all symmetry functions.
Definition: Mode.cpp:435
void calculateSymmetryFunctions(Structure &structure, bool const derivatives)
Calculate all symmetry functions for all atoms in given structure.
Definition: Mode.cpp:1150
void calculateForces(Structure &structure) const
Calculate forces for all atoms in given structure.
Definition: Mode.cpp:1392
void setupSymmetryFunctionStatistics(bool collectStatistics, bool collectExtrapolationWarnings, bool writeExtrapolationWarnings, bool stopOnExtrapolationWarnings)
Set up symmetry function statistics collection.
Definition: Mode.cpp:923
void setupSymmetryFunctionMemory(bool verbose=false)
Extract required memory dimensions for symmetry function derivatives.
Definition: Mode.cpp:714
void writeSettingsFile(std::ofstream *const &file) const
Write complete settings file.
Definition: Mode.cpp:1650
This class implements a feed-forward neural network.
Definition: NeuralNetwork.h:29
int getNumConnections() const
Return total number of connections.
void setInput(double const *const &input) const
Set neural network input layer node values.
void modifyConnections(ModificationScheme modificationScheme)
Change connections according to a given modification scheme.
void setConnections(double const *const &connections)
Set neural network weights and biases.
void calculateDFdc(double *dFdc, double const *const &dGdxyz) const
Calculate "second" derivative of output with respect to connections.
@ MS_ZEROBIAS
Set all bias values to zero.
Definition: NeuralNetwork.h:60
@ MS_GLOROTBENGIO
Normalize connections according to Glorot and Bengio.
Definition: NeuralNetwork.h:88
@ MS_NGUYENWIDROW
Initialize connections according to Nguyen-Widrow scheme.
void getConnections(double *connections) const
Get neural network weights and biases.
void propagate()
Propagate input information through all layers.
void calculateDEdc(double *dEdc) const
Calculate derivative of output neuron with respect to connections.
void calculateDEdG(double *dEdG) const
Calculate derivative of output neuron with respect to input neurons.
void getOutput(double *output) const
Get neural network output layer node values.
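
The members above combine into a simple forward pass. A sketch, assuming nn is an already constructed NeuralNetwork for one element and G holds that atom's scaled symmetry function values:

#include <vector>
#include "NeuralNetwork.h"

using namespace nnp;

// Sketch of one atomic NN evaluation: input -> propagate -> output.
double forwardPass(NeuralNetwork& nn, std::vector<double> const& G)
{
    nn.setInput(G.data()); // copy symmetry function values into input layer
    nn.propagate();        // feed-forward through all hidden layers
    double energy = 0.0;
    nn.getOutput(&energy); // single output neuron: atomic energy contribution
    return energy;
}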
void writeSettingsFile(std::ofstream *const &file, std::map< std::size_t, std::string > const &replacements={}) const
Write complete settings file.
Definition: Settings.cpp:274
bool keywordExists(std::string const &keyword, bool exact=false) const
Check if keyword is present in settings file.
Definition: Settings.cpp:177
std::vector< std::string > getSettingsLines() const
Get complete settings file.
Definition: Settings.cpp:244
void setTrainingLogFileName(std::string fileName)
Set training log file name.
Definition: Training.cpp:2842
UpdaterType updaterType
Updater type used.
Definition: Training.h:459
std::vector< double > dGdxia
Derivative of symmetry functions with respect to one specific atom coordinate.
Definition: Training.h:517
std::size_t epoch
Current epoch.
Definition: Training.h:485
std::vector< std::size_t > numWeightsPerUpdater
Number of weights per updater.
Definition: Training.h:509
std::size_t writeWeightsAlways
Up to this epoch weights are written every epoch.
Definition: Training.h:489
void randomizeNeuralNetworkWeights(std::string const &type)
Randomly initialize specific neural network weights.

Definition: Training.cpp:3176
std::size_t numWeights
Total number of weights.
Definition: Training.h:497
bool useForces
Use forces for training.
Definition: Training.h:471
std::size_t writeWeightsEvery
Write weights every this many epochs.
Definition: Training.h:487
void setupTraining()
General training settings and setup of weight update routine.
Definition: Training.cpp:703
std::mt19937_64 rngGlobalNew
Global random number generator.
Definition: Training.h:530
std::string nnId
ID of neural network the training is working on.
Definition: Training.h:503
void allocateArrays(std::string const &property)
Allocate error and Jacobian arrays for given property.
Definition: Training.cpp:3515
void writeWeightsEpoch() const
Write weights to files during training loop.
Definition: Training.cpp:1576
std::ofstream trainingLog
Training log file.
Definition: Training.h:505
bool hasUpdaters
If this rank performs weight updates.
Definition: Training.h:467
void checkSelectionMode()
Check if selection mode should be changed.
Definition: Training.cpp:1961
std::string trainingLogFileName
File name for training log.
Definition: Training.h:501
JacobianMode jacobianMode
Jacobian mode used.
Definition: Training.h:463
void selectSets()
Randomly select training and test set structures.
Definition: Training.cpp:81
JacobianMode
Jacobian matrix preparation mode.
Definition: Training.h:85
@ JM_SUM
No Jacobian, sum up contributions from update candidates.
Definition: Training.h:87
@ JM_TASK
Prepare one Jacobian entry for each task, sum up within tasks.
Definition: Training.h:89
@ JM_FULL
Prepare full Jacobian matrix.
Definition: Training.h:91
std::size_t countUpdates
Update counter (for all training quantities together).
Definition: Training.h:495
std::map< std::string, Stopwatch > sw
Stopwatches for timing overview.
Definition: Training.h:526
void setupSelectionMode(std::string const &property)
Set selection mode for specific training property.
Definition: Training.cpp:3235
void dPdc(std::string property, Structure &structure, std::vector< std::vector< double > > &dEdc)
Compute derivatives of property with respect to weights.
Definition: Training.cpp:2884
void printHeader()
Print training loop header on screen.
Definition: Training.cpp:1455
std::mt19937_64 rngNew
Per-task random number generator.
Definition: Training.h:528
PropertyMap p
Actual training properties.
Definition: Training.h:532
std::vector< Updater * > updaters
Weight updater (combined or for each element).
Definition: Training.h:523
std::size_t writeNeuronStatisticsAlways
Up to this epoch neuron statistics are written every epoch.
Definition: Training.h:493
UpdaterType
Type of update routine.
Definition: Training.h:40
@ UT_GD
Simple gradient descent methods.
Definition: Training.h:42
@ UT_KF
Kalman filter-based methods.
Definition: Training.h:44
@ UT_LM
Levenberg-Marquardt algorithm.
Definition: Training.h:46
double forceWeight
Force update weight.
Definition: Training.h:499
void update(std::string const &property)
Perform one update.
Definition: Training.cpp:2096
void dataSetNormalization()
Apply normalization based on initial weights prediction.
Definition: Training.cpp:398
ParallelMode
Training parallelization mode.
Definition: Training.h:56
@ PM_TRAIN_RK0
No training parallelization, only data set distribution.
Definition: Training.h:71
@ PM_TRAIN_ALL
Parallel gradient computation, update on each task.
Definition: Training.h:80
bool writeTrainingLog
Whether training log file is written.
Definition: Training.h:477
std::size_t numUpdaters
Number of updaters (depends on update strategy).
Definition: Training.h:481
bool advance() const
Check if training loop should be continued.
Definition: Training.cpp:3038
~Training()
Destructor, updater vector needs to be cleaned.
Definition: Training.cpp:63
void sortUpdateCandidates(std::string const &property)
Sort update candidates by descending RMSE.
Definition: Training.cpp:1892
void writeNeuronStatisticsEpoch() const
Write neuron statistics during training loop.
Definition: Training.cpp:1807
SelectionMode
How update candidates are selected during Training.
Definition: Training.h:105
@ SM_RANDOM
Select candidates randomly.
Definition: Training.h:107
@ SM_THRESHOLD
Select candidates randomly with RMSE above threshold.
Definition: Training.h:111
@ SM_SORT
Sort candidates according to their RMSE and pick worst first.
Definition: Training.h:109
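
A control-flow sketch of the SM_THRESHOLD mode: random candidates are drawn until one with an error above the threshold is found, or the trial budget is exhausted. Here candidateRmse is a hypothetical stand-in for the per-candidate errors n2p2 stores with its update candidates; the vector is assumed non-empty.

#include <cstddef>
#include <random>
#include <vector>

std::size_t selectThreshold(std::vector<double> const& candidateRmse,
                            double const rmseThreshold,
                            std::size_t const rmseThresholdTrials,
                            std::mt19937_64& rng)
{
    std::uniform_int_distribution<std::size_t> dist(0, candidateRmse.size() - 1);
    std::size_t chosen = dist(rng);
    for (std::size_t trial = 1; trial < rmseThresholdTrials
         && candidateRmse.at(chosen) <= rmseThreshold; ++trial)
    {
        chosen = dist(rng); // redraw while below the threshold
    }
    return chosen; // last draw wins if no candidate exceeded the threshold
}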
std::size_t stage
Training stage.
Definition: Training.h:479
void collectDGdxia(Atom const &atom, std::size_t indexAtom, std::size_t indexComponent)
Collect derivative of symmetry functions with respect to one atom's coordinate.

Definition: Training.cpp:3138
void setWeights()
Set weights in neural network class.
Definition: Training.cpp:3068
void loop()
Execute main training loop.
Definition: Training.cpp:1994
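
An abbreviated driver sketch using only Training members documented on this page. The ordering is one plausible sequence under that assumption; a real driver (cf. the nnp-train tool shipped with n2p2) performs additional Mode/Dataset setup (settings file, symmetry functions, structure distribution, MPI) first, and its source is the authoritative reference for the exact call order.

#include "Training.h"

using namespace nnp;

int main()
{
    Training training;
    // ... MPI, settings, symmetry function and data set setup omitted ...
    training.setStage(1);             // only relevant for multi-stage NNP types
    training.selectSets();            // random training/test split
    training.writeSetsToFiles();      // train.data / test.data
    training.setupTraining();         // updaters, weights, update plan
    training.initializeWeights();
    training.calculateNeighborLists();
    training.loop();                  // main epoch loop
    training.writeTimingData(false);  // timing.out
    return 0;
}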
void printEpoch()
Print preferred error metric and timing information on screen.
Definition: Training.cpp:1513
std::vector< std::size_t > weightsOffset
Offset of each element's weights in combined array.
Definition: Training.h:511
std::size_t numEpochs
Number of epochs requested.
Definition: Training.h:483
void setupFileOutput(std::string const &type)
Set file output intervals for properties and other quantities.
Definition: Training.cpp:3343
std::size_t getNumConnections(std::string id="short") const
Get total number of NN connections.
Definition: Training.cpp:2849
void addTrainingLogEntry(int proc, std::size_t il, double f, std::size_t isg, std::size_t is)
Write energy update data to training log file.
Definition: Training.cpp:3093
void writeTimingData(bool append, std::string const fileName="timing.out")
Write timing data for all clocks.
Definition: Training.cpp:3553
std::size_t writeNeuronStatisticsEvery
Write neuron statistics every this many epochs.
Definition: Training.h:491
void getWeights()
Get weights from neural network class.
Definition: Training.cpp:3044
void dPdcN(std::string property, Structure &structure, std::vector< std::vector< double > > &dEdc, double delta=1.0E-4)
Compute numeric derivatives of property with respect to weights.
Definition: Training.cpp:2956
std::vector< std::string > setupNumericDerivCheck()
Set up numeric weight derivatives check.
Definition: Training.cpp:1170
void setSingleWeight(std::size_t element, std::size_t index, double value)
Set a single weight value.
Definition: Training.cpp:2744
void shuffleUpdateCandidates(std::string const &property)
Shuffle update candidates.
Definition: Training.cpp:1922
double getSingleWeight(std::size_t element, std::size_t index)
Get a single weight value.
Definition: Training.cpp:2737
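
getSingleWeight()/setSingleWeight() make a manual central-difference check easy. A fragment, written as if inside a Training member function and mirroring what dPdcN() does internally; el, idx and the Structure s (with symmetry functions already computed) are assumptions of this example.

// Central-difference estimate of dE/dc for weight (el, idx).
double const delta = 1.0E-4;
double const c0 = getSingleWeight(el, idx);

setSingleWeight(el, idx, c0 + delta);
calculateAtomicNeuralNetworks(s, false);
calculateEnergy(s);
double const ePlus = s.energy;

setSingleWeight(el, idx, c0 - delta);
calculateAtomicNeuralNetworks(s, false);
calculateEnergy(s);
double const eMinus = s.energy;

setSingleWeight(el, idx, c0); // restore the original weight
double const dEdc = (ePlus - eMinus) / (2.0 * delta);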
void initializeWeights()
Initialize weights for all elements.
Definition: Training.cpp:259
void initializeWeightsMemory(UpdateStrategy updateStrategy=US_COMBINED)
Initialize weights vector according to update strategy.
Definition: Training.cpp:314
bool repeatedEnergyUpdates
After force update perform energy update for corresponding structure.
Definition: Training.h:473
UpdateStrategy updateStrategy
Update strategy used.
Definition: Training.h:465
void calculateNeighborLists()
Calculate neighbor lists for all structures.
Definition: Training.cpp:1206
ParallelMode parallelMode
Parallelization mode used.
Definition: Training.h:461
void writeNeuronStatistics(std::string const &nnName, std::string const &fileName) const
Write neuron statistics collected since last invocation.
Definition: Training.cpp:1708
std::vector< std::size_t > getConnectionOffsets(std::string id="short") const
Get offsets of NN connections for each element.
Definition: Training.cpp:2871
std::vector< int > epochSchedule
Update schedule for one epoch (false = energy update, true = force update).
Definition: Training.h:507
void writeUpdaterStatus(bool append, std::string const fileNameFormat="updater.%03zu.out") const
Write updater information to file.
Definition: Training.cpp:1857
UpdateStrategy
Update strategies available for Training.
Definition: Training.h:96
@ US_COMBINED
One combined updater for all elements.
Definition: Training.h:98
@ US_ELEMENT
Separate updaters for individual elements.
Definition: Training.h:100
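
The update strategy fixes how many updaters exist and how the weight arrays are partitioned. A two-line fragment, written as if inside Training setup code:

initializeWeightsMemory(US_COMBINED); // numUpdaters == 1: all elements' weights
                                      // concatenated, per-element offsets kept
                                      // in weightsOffset
initializeWeightsMemory(US_ELEMENT);  // numUpdaters == numElements: one
                                      // independent weight block per element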
std::vector< std::size_t > getNumConnectionsPerElement(std::string id="short") const
Get number of NN connections for each element.
Definition: Training.cpp:2860
void resetNeuronStatistics()
Reset neuron statistics for all elements.
Definition: Training.cpp:1847
std::vector< std::vector< double > > calculateWeightDerivatives(Structure *structure)
Calculate derivatives of energy with respect to weights.
Definition: Training.cpp:2753
void calculateError(std::map< std::string, std::pair< std::string, std::string > > const fileNames)
Calculate error metrics for all structures.
Definition: Training.cpp:1246
void writeLearningCurve(bool append, std::string const fileName="learning-curve.out") const
Write current RMSEs and epoch information to file.
Definition: Training.cpp:1595
void calculateErrorEpoch()
Calculate per-epoch error metrics for all structures, using the file names from the training loop.
Definition: Training.cpp:1425
std::vector< std::string > pk
Vector of actually used training properties.
Definition: Training.h:513
void setEpochSchedule()
Select energies/forces schedule for one epoch.
Definition: Training.cpp:1933
bool hasStructures
If this rank holds structure information.
Definition: Training.h:469
std::vector< std::vector< double > > weights
Neural network weights and biases for each element.
Definition: Training.h:521
bool freeMemory
Free symmetry function memory after calculation.
Definition: Training.h:475
void writeSetsToFiles()
Write training and test set to separate files (train.data and test.data, same format as input.data).
Definition: Training.cpp:197
void writeWeights(std::string const &nnName, std::string const &fileNameFormat) const
Write weights to files (one file for each element).
Definition: Training.cpp:1559
void setStage(std::size_t stage)
Set training stage (if multiple stages are needed for NNP type).
Definition: Training.cpp:361
void setupUpdatePlan(std::string const &property)
Set up how often properties are updated.
Definition: Training.cpp:3405
Base class for different weight update methods.
Definition: Updater.h:32
#define MPI_SIZE_T
Definition: mpi-extra.h:22
Definition: Atom.h:28
string strpr(const char *format,...)
String version of printf function.
Definition: utility.cpp:90
vector< string > createFileHeader(vector< string > const &title, vector< size_t > const &colSize, vector< string > const &colName, vector< string > const &colInfo, char const &commentChar)
Definition: utility.cpp:104
vector< string > split(string const &input, char delimiter)
Split string at each delimiter.
Definition: utility.cpp:33
string reduce(string const &line, string const &whitespace, string const &fill)
Replace multiple whitespaces with fill.
Definition: utility.cpp:60
void appendLinesToFile(ofstream &file, vector< string > const lines)
Append multiple lines of strings to open file stream.
Definition: utility.cpp:218
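
A short usage sketch for the string helpers above, combining reduce(), split() and strpr() in the way n2p2's parsing code typically chains them; the sample input line is an assumption.

#include <string>
#include <vector>
#include "utility.h"

using namespace std;
using namespace nnp;

int main()
{
    string line = "  1.0   2.0\t3.0 ";
    // Collapse runs of whitespace/tabs to single spaces.
    string clean = reduce(line, " \t", " ");
    // Tokenize at spaces.
    vector<string> cols = split(clean, ' ');
    // printf-style formatting into a std::string.
    string msg = strpr("read %zu columns\n", cols.size());
    return 0;
}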
double d
Definition: nnp-cutoff.cpp:34
Struct to store information on neighbor atoms.
Definition: Atom.h:35
std::size_t element
Element index of neighbor atom.
Definition: Atom.h:41
std::vector< Vec3D > dGdr
Derivatives of symmetry functions with respect to neighbor coordinates.
Definition: Atom.h:59
Storage for a single atom.
Definition: Atom.h:32
std::vector< Neighbor > neighbors
Neighbor array (maximum number defined in macros.h).

Definition: Atom.h:148
std::size_t numSymmetryFunctions
Number of symmetry functions used to describe the atom environment.
Definition: Atom.h:110
Vec3D f
Force vector calculated by neural network.
Definition: Atom.h:120
double charge
Atomic charge determined by neural network.
Definition: Atom.h:114
std::size_t index
Index number of this atom.
Definition: Atom.h:98
Vec3D fRef
Reference force vector from data set.
Definition: Atom.h:122
std::vector< Vec3D > dGdr
Derivative of symmetry functions with respect to this atom's coordinates.
Definition: Atom.h:146
std::size_t element
Element index of this atom.
Definition: Atom.h:104
std::vector< double > G
Symmetry function values.
Definition: Atom.h:134
double chargeRef
Atomic reference charge.
Definition: Atom.h:116
std::vector< std::size_t > numNeighborsPerElement
Number of neighbors per element.
Definition: Atom.h:126
std::size_t numNeighbors
Total number of neighbors.
Definition: Atom.h:106
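
The paired members f and fRef support per-atom force error accumulation. A sketch, assuming Vec3D provides const component access via operator[]:

#include <cstddef>
#include "Atom.h"

using namespace nnp;

// Squared force error of one atom: NN force (f) vs. reference force (fRef).
double squaredForceError(Atom const& atom)
{
    double se = 0.0;
    for (std::size_t c = 0; c < 3; ++c)
    {
        double const diff = atom.fRef[c] - atom.f[c];
        se += diff * diff;
    }
    return se;
}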
Storage for one atomic configuration.
Definition: Structure.h:34
@ ST_TRAINING
Structure is part of the training set.
Definition: Structure.h:44
@ ST_UNKNOWN
Sample type not assigned yet.
Definition: Structure.h:41
@ ST_TEST
Structure is part of the test set.
Definition: Structure.h:50
std::vector< std::size_t > numAtomsPerElement
Number of atoms of each element in this structure.
Definition: Structure.h:94
std::size_t index
Index number of this structure.
Definition: Structure.h:66
SampleType sampleType
Sample type (training or test set).
Definition: Structure.h:86
void freeAtoms(bool all)
Free symmetry function memory for all atoms, see free() in Atom class.
Definition: Structure.cpp:531
double energy
Potential energy determined by neural network.
Definition: Structure.h:76
double energyRef
Reference potential energy.
Definition: Structure.h:78
std::size_t numAtoms
Total number of atoms present in this structure.
Definition: Structure.h:68
std::vector< Atom > atoms
Vector of all atoms in this structure.
Definition: Structure.h:96
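
Since Training::selectSets() stamps each structure with a SampleType, the resulting split can be recovered by a simple scan; a minimal sketch:

#include <cstddef>
#include <vector>
#include "Structure.h"

using namespace nnp;

// Count the training/test split produced by Training::selectSets().
void countSplit(std::vector<Structure> const& structures,
                std::size_t& nTrain, std::size_t& nTest)
{
    nTrain = 0;
    nTest  = 0;
    for (Structure const& s : structures)
    {
        if      (s.sampleType == Structure::ST_TRAINING) nTrain++;
        else if (s.sampleType == Structure::ST_TEST)     nTest++;
    }
}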
bool exists(std::string const &key)
Check if property is present.
Definition: Training.h:452
Specific training quantity (e.g. energies, forces, charges).
Definition: Training.h:368
std::size_t numErrorsGlobal
Global number of errors per update.
Definition: Training.h:405
std::size_t numTrainPatterns
Number of training patterns in set.
Definition: Training.h:383
std::vector< int > errorsPerTask
Errors per task for each update.
Definition: Training.h:411
std::string property
Copy of identifier within Property map.
Definition: Training.h:373
SelectionMode selectionMode
Selection mode for update candidates.
Definition: Training.h:381
std::size_t numUpdates
Number of desired updates per epoch.
Definition: Training.h:399
double rmseThreshold
RMSE threshold for update candidates.
Definition: Training.h:409
std::string plural
Plural string of property.
Definition: Training.h:379
double epochFraction
Desired update fraction per epoch.
Definition: Training.h:407
std::size_t patternsPerUpdate
Patterns used per update.
Definition: Training.h:401
std::size_t taskBatchSize
Batch size for each MPI task.
Definition: Training.h:387
std::vector< std::vector< double > > error
Global error vector (per updater).
Definition: Training.h:432
std::string displayMetric
Error metric for display.
Definition: Training.h:375
std::string tiny
Tiny abbreviation string for property.
Definition: Training.h:377
std::vector< std::string > errorMetrics
Error metrics available for this property.
Definition: Training.h:415
Property(std::string const &property)
Constructor.
Definition: Training.cpp:3650
std::vector< std::vector< int > > offsetJacobian
Stride for Jacobians per task per updater.
Definition: Training.h:429
std::size_t countUpdates
Counter for updates per epoch.
Definition: Training.h:397
std::vector< std::vector< int > > weightsPerTask
Weights per task per updater.
Definition: Training.h:426
std::vector< int > offsetPerTask
Offset for combined error per task.
Definition: Training.h:413
std::vector< UpdateCandidate > updateCandidates
Vector with indices of training patterns.
Definition: Training.h:423
std::map< std::string, double > errorTrain
Current error metrics of training patterns.
Definition: Training.h:418
std::vector< std::vector< double > > jacobian
Global Jacobian (per updater).
Definition: Training.h:435
std::size_t patternsPerUpdateGlobal
Patterns used per update (summed over all MPI tasks).
Definition: Training.h:403
std::map< std::string, double > errorTest
Current error metrics of test patterns.
Definition: Training.h:421
std::size_t rmseThresholdTrials
Maximum trials for SM_THRESHOLD selection mode.
Definition: Training.h:395
std::size_t posUpdateCandidates
Current position in update candidate list (SM_SORT).
Definition: Training.h:393
Contains location of one update candidate (energy or force).
Definition: Training.h:350
std::size_t a
Atom index (only used for force candidates).
Definition: Training.h:354
std::size_t c
Component index (x,y,z -> 0,1,2, only used for force candidates).
Definition: Training.h:356
std::size_t s
Structure index.
Definition: Training.h:352
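
The three indices above locate a candidate inside the data set. A fragment, written as if inside a Training member function (so that structures, p and UpdateCandidate are in scope); for an energy candidate only s is meaningful, while force candidates additionally use a and c.

// Resolve the first force update candidate to its reference data.
UpdateCandidate const& uc = p["force"].updateCandidates.at(0);
Structure const& s = structures.at(uc.s); // Dataset::structures
Atom const& a = s.atoms.at(uc.a);         // atom uc.a within that structure
double const fRef = a.fRef[uc.c];         // component uc.c (0=x, 1=y, 2=z)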