RDKit
Open-source cheminformatics and machine learning.
Embedder.h
Go to the documentation of this file.
1//
2// Copyright (C) 2004-2017 Greg Landrum and Rational Discovery LLC
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10
11#include <RDGeneral/export.h>
12#ifndef RD_EMBEDDER_H_GUARD
13#define RD_EMBEDDER_H_GUARD
14
15#include <map>
16#include <utility>
17#include <Geometry/point.h>
18#include <GraphMol/ROMol.h>
19#include <boost/shared_ptr.hpp>
21
22namespace RDKit {
23namespace DGeomHelpers {
24
36};
37
38//! Parameter object for controlling embedding
39/*!
40 numConfs Number of conformations to be generated
41 numThreads Sets the number of threads to use (more than one thread
42 will only be used if the RDKit was built with multithread
43 support) If set to zero, the max supported by the system will
44 be used.
45 maxIterations Max. number of times the embedding will be tried if
46 coordinates are not obtained successfully. The default
47 value is 10x the number of atoms.
48 randomSeed provides a seed for the random number generator (so that
49 the same coordinates can be obtained for a
50 molecule on multiple runs) If -1, the
51 RNG will not be seeded.
52 clearConfs Clear all existing conformations on the molecule
53 useRandomCoords Start the embedding from random coordinates instead of
54 using eigenvalues of the distance matrix.
55 boxSizeMult Determines the size of the box that is used for
56 random coordinates. If this is a positive number, the
57 side length will equal the largest element of the distance
58 matrix times \c boxSizeMult. If this is a negative number,
59 the side length will equal \c -boxSizeMult (i.e. independent
60 of the elements of the distance matrix).
61 randNegEig Picks coordinates at random when an embedding process produces
62 negative eigenvalues
63 numZeroFail Fail embedding if we find this many or more zero eigenvalues
64 (within a tolerance)
65 pruneRmsThresh Retain only the conformations out of 'numConfs' after
66 embedding that are at least this far apart from each other.
67 RMSD is computed on the heavy atoms.
68 Pruning is greedy; i.e. the first embedded conformation is
69 retained and from then on only those that are at least
70 \c pruneRmsThresh away from already
71 retained conformations are kept. The pruning is done
72 after embedding and bounds violation minimization.
73 No pruning by default.
74 coordMap a map of int to Point3D, between atom IDs and
75 their locations. If this container is provided, the
76 coordinates are used to set distance constraints on the
77 embedding. The resulting conformer(s) should have distances
78 between the specified atoms that reproduce those between the
79 points in \c coordMap. Because the embedding produces a
80 molecule in an arbitrary reference frame, an alignment step
81 is required to actually reproduce the provided coordinates.
82 optimizerForceTol set the tolerance on forces in the DGeom optimizer
83 (this shouldn't normally be altered in client code).
84 ignoreSmoothingFailures try to embed the molecule even if triangle bounds
85 smoothing fails
86 enforceChirality enforce the correct chirality if chiral centers are present
87 useExpTorsionAnglePrefs impose experimental torsion-angle preferences
88 useBasicKnowledge impose "basic knowledge" terms such as flat
89 aromatic rings, ketones, etc.
90 ETversion version of the experimental torsion-angle preferences
91 verbose print output of experimental torsion-angle preferences
92 basinThresh set the basin threshold for the DGeom force field,
93 (this shouldn't normally be altered in client code).
94 onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
95 boundsMat custom bound matrix to specify upper and lower bounds of atom
96 pairs
97 embedFragmentsSeparately embed each fragment of molecule in turn
98 useSmallRingTorsions optional torsions to improve small ring conformer
99 sampling
100 useMacrocycleTorsions optional torsions to improve macrocycle conformer
101 sampling
102 useMacrocycle14config whether the 1-4 distance bounds heuristics for
103 macrocycles are used
104 CPCI custom coulombic interactions between atom pairs
105 callback void pointer to a function for reporting progress,
106 will be called with the current iteration number.
107 forceTransAmides constrain amide bonds to be trans.
108 useSymmetryForPruning use molecule symmetry when doing the RMSD pruning.
109 NOTE that for reasons of computational efficiency,
110 setting this will also set onlyHeavyAtomsForRMS to
111 true.
112 trackFailures keep track of which checks during the embedding process fail
113 failures if trackFailures is true, this is used to track the number
114 of times each embedding check fails
115*/
117 unsigned int maxIterations{0};  // max. embedding attempts; the parameter list gives the default as 10x the number of atoms (0 presumably selects that - confirm)
118 int numThreads{1};  // threads to use; 0 means the max supported by the system
119 int randomSeed{-1};  // RNG seed; -1 leaves the RNG unseeded
120 bool clearConfs{true};  // clear all existing conformations on the molecule
121 bool useRandomCoords{false};  // start from random coordinates instead of distance-matrix eigenvalues
122 double boxSizeMult{2.0};  // random-coordinate box: >0 scales the largest distance-matrix element, <0 gives side length -boxSizeMult
123 bool randNegEig{true};  // pick coordinates at random when embedding produces negative eigenvalues
124 unsigned int numZeroFail{1};  // fail if this many or more zero eigenvalues are found
125 const std::map<int, RDGeom::Point3D> *coordMap{nullptr};  // optional atom ID -> location map used to set distance constraints
126 double optimizerForceTol{1e-3};  // force tolerance of the DGeom optimizer
127 bool ignoreSmoothingFailures{false};  // try to embed even if triangle bounds smoothing fails
128 bool enforceChirality{true};  // enforce the correct chirality at chiral centers
129 bool useExpTorsionAnglePrefs{false};  // impose experimental torsion-angle preferences
130 bool useBasicKnowledge{false};  // impose "basic knowledge" terms (flat aromatic rings, ketones, etc.)
131 bool verbose{false};  // print output of the experimental torsion-angle preferences
132 double basinThresh{5.0};  // basin threshold of the DGeom force field
133 double pruneRmsThresh{-1.0};  // minimum RMSD between retained conformers; the parameter list states there is no pruning by default
134 bool onlyHeavyAtomsForRMS{false};  // only use heavy atoms when doing RMS filtering
135 unsigned int ETversion{1};  // version of the experimental torsion-angle preferences
136 boost::shared_ptr<const DistGeom::BoundsMatrix> boundsMat;  // custom bounds matrix (upper/lower bounds of atom pairs)
137 bool embedFragmentsSeparately{true};  // embed each fragment of the molecule in turn
138 bool useSmallRingTorsions{false};  // optional torsions to improve small ring conformer sampling
139 bool useMacrocycleTorsions{false};  // optional torsions to improve macrocycle conformer sampling
140 bool useMacrocycle14config{false};  // use the 1-4 distance bounds heuristics for macrocycles
141 std::shared_ptr<std::map<std::pair<unsigned int, unsigned int>, double>> CPCI;  // custom coulombic interactions between atom pairs (keys presumably atom-index pairs - confirm)
142 void (*callback)(unsigned int);  // progress-reporting function; called with the current iteration number
143 bool forceTransAmides{true};  // constrain amide bonds to be trans
144 bool useSymmetryForPruning{true};  // use molecule symmetry in RMSD pruning; per the parameter list this also sets onlyHeavyAtomsForRMS to true
145 double boundsMatForceScaling{1.0};  // NOTE(review): not described in the parameter list above; presumably a scaling factor tied to the bounds matrix - confirm
146 bool trackFailures{false};  // keep track of which checks fail during embedding
147 std::vector<unsigned int> failures;  // when trackFailures is true, counts how often each embedding check fails
148
149 EmbedParameters() : boundsMat(nullptr), CPCI(nullptr), callback(nullptr) {}  // null bounds matrix, CPCI map and callback; other members keep their in-class initializers
151 unsigned int maxIterations, int numThreads, int randomSeed,
152 bool clearConfs, bool useRandomCoords, double boxSizeMult,
153 bool randNegEig, unsigned int numZeroFail,
154 const std::map<int, RDGeom::Point3D> *coordMap, double optimizerForceTol,
155 bool ignoreSmoothingFailures, bool enforceChirality,
156 bool useExpTorsionAnglePrefs, bool useBasicKnowledge, bool verbose,
157 double basinThresh, double pruneRmsThresh, bool onlyHeavyAtomsForRMS,
158 unsigned int ETversion = 1,
159 const DistGeom::BoundsMatrix *boundsMat = nullptr,
160 bool embedFragmentsSeparately = true, bool useSmallRingTorsions = false,
161 bool useMacrocycleTorsions = false, bool useMacrocycle14config = false,
162 std::shared_ptr<std::map<std::pair<unsigned int, unsigned int>, double>>
163 CPCI = nullptr,
164 void (*callback)(unsigned int) = nullptr)
165 : maxIterations(maxIterations),
166 numThreads(numThreads),
167 randomSeed(randomSeed),
168 clearConfs(clearConfs),
169 useRandomCoords(useRandomCoords),
170 boxSizeMult(boxSizeMult),
171 randNegEig(randNegEig),
172 numZeroFail(numZeroFail),
173 coordMap(coordMap),
174 optimizerForceTol(optimizerForceTol),
175 ignoreSmoothingFailures(ignoreSmoothingFailures),
176 enforceChirality(enforceChirality),
177 useExpTorsionAnglePrefs(useExpTorsionAnglePrefs),
178 useBasicKnowledge(useBasicKnowledge),
179 verbose(verbose),
180 basinThresh(basinThresh),
181 pruneRmsThresh(pruneRmsThresh),
182 onlyHeavyAtomsForRMS(onlyHeavyAtomsForRMS),
183 ETversion(ETversion),
184 boundsMat(boundsMat),
185 embedFragmentsSeparately(embedFragmentsSeparately),
186 useSmallRingTorsions(useSmallRingTorsions),
187 useMacrocycleTorsions(useMacrocycleTorsions),
188 useMacrocycle14config(useMacrocycle14config),
189 CPCI(std::move(CPCI)),
190 callback(callback) {}
191};
192
193//! update parameters from a JSON string
195 EmbedParameters &params, const std::string &json);
196
197//! Embed multiple conformations for a molecule
199 unsigned int numConfs,
200 EmbedParameters &params);
201inline INT_VECT EmbedMultipleConfs(ROMol &mol, unsigned int numConfs,  // convenience overload: hands the conformer IDs back by value
202 EmbedParameters &params) {
203 INT_VECT res;  // receives the conformer IDs
204 EmbedMultipleConfs(mol, res, numConfs, params);  // delegate to the overload that takes an INT_VECT output argument
205 return res;
206}
207
208//! Compute an embedding (in 3D) for the specified molecule using Distance
209/// Geometry; returns the first conformer ID reported by EmbedMultipleConfs, or -1 if none is reported
210inline int EmbedMolecule(ROMol &mol, EmbedParameters &params) {
211 INT_VECT confIds;
212 EmbedMultipleConfs(mol, confIds, 1, params);  // request exactly one conformation
213
214 int res;
215 if (confIds.size()) {
216 res = confIds[0];
217 } else {
218 res = -1;  // no conformer ID came back
219 }
220 return res;
221}
222
223//! Compute an embedding (in 3D) for the specified molecule using Distance
224/// Geometry
225/*!
226 The following operations are performed (in order) here:
227 -# Build a distance bounds matrix based on the topology, including 1-5
228 distances but not VDW scaling
229 -# Triangle smooth this bounds matrix
230 -# If step 2 fails - repeat step 1, this time without 1-5 bounds and with vdW
231 scaling, and repeat step 2
232 -# Pick a distance matrix at random using the bounds matrix
233 -# Compute initial coordinates from the distance matrix
234 -# Repeat steps 3 and 4 until maxIterations is reached or embedding is
235 successful
236 -# Adjust initial coordinates by minimizing a Distance Violation error
237 function
238 **NOTE**: if the molecule has multiple fragments, they will be embedded
239 separately,
240 this means that they will likely occupy the same region of space.
241 \param mol Molecule of interest
242 \param maxIterations Max. number of times the embedding will be tried if
243 coordinates are not obtained successfully. The default
244 value is 10x the number of atoms.
245 \param seed provides a seed for the random number generator (so that
246 the same coordinates can be obtained for a molecule on
247 multiple runs). If negative, the RNG will not be seeded.
248 \param clearConfs Clear all existing conformations on the molecule
249 \param useRandomCoords Start the embedding from random coordinates instead of
250 using eigenvalues of the distance matrix.
251 \param boxSizeMult Determines the size of the box that is used for
252 random coordinates. If this is a positive number, the
253 side length will equal the largest element of the
254 distance matrix times \c boxSizeMult. If this is a
255 negative number, the side length will equal
256 \c -boxSizeMult (i.e. independent of the elements of the
257 distance matrix).
258 \param randNegEig Picks coordinates at random when an embedding process
259 produces negative eigenvalues
260 \param numZeroFail Fail embedding if we find this many or more zero
261 eigenvalues (within a tolerance)
262 \param coordMap a map of int to Point3D, between atom IDs and
263 their locations. If this container is provided, the
264 coordinates are used to set distance constraints on the
265 embedding. The resulting conformer(s) should have distances
266 between the specified atoms that reproduce those between the
267 points in \c coordMap. Because the embedding produces a
268 molecule in an arbitrary reference frame, an alignment step
269 is required to actually reproduce the provided coordinates.
270 \param optimizerForceTol set the tolerance on forces in the distgeom optimizer
271 (this shouldn't normally be altered in client code).
272 \param ignoreSmoothingFailures try to embed the molecule even if triangle
273 bounds smoothing fails
274 \param enforceChirality enforce the correct chirality if chiral centers are
275 present
276 \param useExpTorsionAnglePrefs impose experimental torsion-angle preferences
277 \param useBasicKnowledge impose "basic knowledge" terms such as flat
278 aromatic rings, ketones, etc.
279 \param verbose print output of experimental torsion-angle preferences
280 \param basinThresh set the basin threshold for the DGeom force field,
281 (this shouldn't normally be altered in client code).
282 \param onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
283 \param ETversion version of torsion preferences to use
284 \param useSmallRingTorsions optional torsions to improve small ring
285 conformer sampling
286
287 \param useMacrocycleTorsions optional torsions to improve macrocycle
288 conformer sampling
289 \param useMacrocycle14config whether the 1-4 distance bounds heuristics for macrocycles are used
290 \return ID of the conformation added to the molecule, -1 if the embedding failed
291*/
292inline int EmbedMolecule(  // legacy positional-argument overload; packs its arguments into an EmbedParameters and forwards
293 ROMol &mol, unsigned int maxIterations = 0, int seed = -1,
294 bool clearConfs = true, bool useRandomCoords = false,
295 double boxSizeMult = 2.0, bool randNegEig = true,
296 unsigned int numZeroFail = 1,
297 const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
298 double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
299 bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
300 bool useBasicKnowledge = false, bool verbose = false,
301 double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
302 unsigned int ETversion = 1, bool useSmallRingTorsions = false,
303 bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
304 EmbedParameters params(
305 maxIterations, 1, seed, clearConfs, useRandomCoords, boxSizeMult,  // the literal 1 is numThreads
306 randNegEig, numZeroFail, coordMap, optimizerForceTol,
307 ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
308 useBasicKnowledge, verbose, basinThresh, -1.0, onlyHeavyAtomsForRMS,  // the literal -1.0 is pruneRmsThresh
309 ETversion, nullptr, true, useSmallRingTorsions, useMacrocycleTorsions,  // nullptr is boundsMat, true is embedFragmentsSeparately
310 useMacrocycle14config);
311 return EmbedMolecule(mol, params);  // resolves to the (ROMol &, EmbedParameters &) overload
312}
313
314//! Embed multiple conformations for a molecule
315/*!
316 This is kind of equivalent to calling EmbedMolecule multiple times - just that
317 the bounds
318 matrix is computed only once from the topology
319 **NOTE**: if the molecule has multiple fragments, they will be embedded
320 separately,
321 this means that they will likely occupy the same region of space.
322 \param mol Molecule of interest
323 \param res Used to return the resulting conformer ids
324 \param numConfs Number of conformations to be generated
325 \param numThreads Sets the number of threads to use (more than one thread
326 will only be used if the RDKit was built with
327 multithread
328 support). If set to zero, the max supported by the
329 system
330 will be used.
331 \param maxIterations Max. number of times the embedding will be tried if
332 coordinates are not obtained successfully. The default
333 value is 10x the number of atoms.
334 \param seed provides a seed for the random number generator (so that
335 the same coordinates can be obtained for a molecule on
336 multiple runs). If negative, the RNG will not be seeded.
337 \param clearConfs Clear all existing conformations on the molecule
338 \param useRandomCoords Start the embedding from random coordinates instead of
339 using eigenvalues of the distance matrix.
340 \param boxSizeMult Determines the size of the box that is used for
341 random coordinates. If this is a positive number, the
342 side length will equal the largest element of the
343 distance matrix times \c boxSizeMult. If this is a
344 negative number, the side length will equal
345 \c -boxSizeMult (i.e. independent of the elements of the
346 distance matrix).
347 \param randNegEig Picks coordinates at random when an embedding process
348 produces negative eigenvalues
349 \param numZeroFail Fail embedding if we find this many or more zero
350 eigenvalues (within a tolerance)
351 \param pruneRmsThresh Retain only the conformations out of 'numConfs' after
352 embedding that are at least this far apart from each
353 other. RMSD is computed on the heavy atoms.
354 Pruning is greedy; i.e. the first embedded conformation
355 is retained and from then on only those that are at
356 least
357 pruneRmsThresh away from already retained conformations
358 are kept. The pruning is done after embedding and
359 bounds violation minimization. No pruning by default.
360 \param coordMap a map of int to Point3D, between atom IDs and
361 their locations. If this container is provided, the
362 coordinates are used to set distance constraints on the
363 embedding. The resulting conformer(s) should have distances
364 between the specified atoms that reproduce those between the
365 points in \c coordMap. Because the embedding produces a
366 molecule in an arbitrary reference frame, an alignment step
367 is required to actually reproduce the provided coordinates.
368 \param optimizerForceTol set the tolerance on forces in the DGeom optimizer
369 (this shouldn't normally be altered in client code).
370 \param ignoreSmoothingFailures try to embed the molecule even if triangle
371 bounds smoothing fails
372 \param enforceChirality enforce the correct chirality if chiral centers are
373 present
374 \param useExpTorsionAnglePrefs impose experimental torsion-angle preferences
375 \param useBasicKnowledge impose "basic knowledge" terms such as flat
376 aromatic rings, ketones, etc.
377 \param verbose print output of experimental torsion-angle preferences
378 \param basinThresh set the basin threshold for the DGeom force field,
379 (this shouldn't normally be altered in client code).
380 \param onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
381 \param ETversion version of torsion preferences to use
382 \param useSmallRingTorsions optional torsions to improve small ring
383 conformer sampling
384
385 \param useMacrocycleTorsions optional torsions to improve macrocycle
386 conformer sampling
387 \param useMacrocycle14config whether the 1-4 distance bounds heuristics for macrocycles are used
388
389*/
391 ROMol &mol, INT_VECT &res, unsigned int numConfs = 10, int numThreads = 1,
392 unsigned int maxIterations = 30, int seed = -1, bool clearConfs = true,
393 bool useRandomCoords = false, double boxSizeMult = 2.0,
394 bool randNegEig = true, unsigned int numZeroFail = 1,
395 double pruneRmsThresh = -1.0,
396 const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
397 double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
398 bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
399 bool useBasicKnowledge = false, bool verbose = false,
400 double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
401 unsigned int ETversion = 1, bool useSmallRingTorsions = false,
402 bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
403 EmbedParameters params(
404 maxIterations, numThreads, seed, clearConfs, useRandomCoords, boxSizeMult,
405 randNegEig, numZeroFail, coordMap, optimizerForceTol,
406 ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
407 useBasicKnowledge, verbose, basinThresh, pruneRmsThresh,
408 onlyHeavyAtomsForRMS, ETversion, nullptr, true, useSmallRingTorsions,
409 useMacrocycleTorsions, useMacrocycle14config);
410 EmbedMultipleConfs(mol, res, numConfs, params);
411};
412//! \overload
414 ROMol &mol, unsigned int numConfs = 10, unsigned int maxIterations = 30,
415 int seed = -1, bool clearConfs = true, bool useRandomCoords = false,
416 double boxSizeMult = 2.0, bool randNegEig = true,
417 unsigned int numZeroFail = 1, double pruneRmsThresh = -1.0,
418 const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
419 double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
420 bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
421 bool useBasicKnowledge = false, bool verbose = false,
422 double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
423 unsigned int ETversion = 1, bool useSmallRingTorsions = false,
424 bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
425 EmbedParameters params(
426 maxIterations, 1, seed, clearConfs, useRandomCoords, boxSizeMult,
427 randNegEig, numZeroFail, coordMap, optimizerForceTol,
428 ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
429 useBasicKnowledge, verbose, basinThresh, pruneRmsThresh,
430 onlyHeavyAtomsForRMS, ETversion, nullptr, true, useSmallRingTorsions,
431 useMacrocycleTorsions, useMacrocycle14config);
432 INT_VECT res;
433 EmbedMultipleConfs(mol, res, numConfs, params);
434 return res;
435};
436
437//! Parameters corresponding to Sereina Riniker's KDG approach
438RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters KDG;
439//! Parameters corresponding to Sereina Riniker's ETDG approach
440RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETDG;
441//! Parameters corresponding to Sereina Riniker's ETKDG approach
442RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDG;
443//! Parameters corresponding to Sereina Riniker's ETKDG approach - version 2
444RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDGv2;
445//! Parameters corresponding to the improved ETKDG by Wang, Witek, Landrum and Riniker
446//! (10.1021/acs.jcim.0c00025) - the macrocycle part
447RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDGv3;
448//! Parameters corresponding to the improved ETKDG by Wang, Witek, Landrum and Riniker
449//! (10.1021/acs.jcim.0c00025) - the small ring part
450RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters srETKDGv3;
451} // namespace DGeomHelpers
452} // namespace RDKit
453
454#endif
Defines the primary molecule class ROMol as well as associated typedefs.
Class to store the distance bound.
Definition: BoundsMatrix.h:28
#define RDKIT_DISTGEOMHELPERS_EXPORT
Definition: export.h:121
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDGv2
Parameters corresponding to Sereina Riniker's ETKDG approach - version 2.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETDG
Parameters corresponding to Sereina Riniker's ETDG approach.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDGv3
RDKIT_DISTGEOMHELPERS_EXPORT void updateEmbedParametersFromJSON(EmbedParameters &params, const std::string &json)
update parameters from a JSON string
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDG
Parameters corresponding to Sereina Riniker's ETKDG approach.
RDKIT_DISTGEOMHELPERS_EXPORT void EmbedMultipleConfs(ROMol &mol, INT_VECT &res, unsigned int numConfs, EmbedParameters &params)
Embed multiple conformations for a molecule.
int EmbedMolecule(ROMol &mol, EmbedParameters &params)
Definition: Embedder.h:210
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters srETKDGv3
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters KDG
Parameters corresponding to Sereina Riniker's KDG approach.
const uint32_t seed
Definition: MHFP.h:29
Std stuff.
Definition: Abbreviations.h:19
std::vector< int > INT_VECT
Definition: types.h:279
Parameter object for controlling embedding.
Definition: Embedder.h:116
EmbedParameters(unsigned int maxIterations, int numThreads, int randomSeed, bool clearConfs, bool useRandomCoords, double boxSizeMult, bool randNegEig, unsigned int numZeroFail, const std::map< int, RDGeom::Point3D > *coordMap, double optimizerForceTol, bool ignoreSmoothingFailures, bool enforceChirality, bool useExpTorsionAnglePrefs, bool useBasicKnowledge, bool verbose, double basinThresh, double pruneRmsThresh, bool onlyHeavyAtomsForRMS, unsigned int ETversion=1, const DistGeom::BoundsMatrix *boundsMat=nullptr, bool embedFragmentsSeparately=true, bool useSmallRingTorsions=false, bool useMacrocycleTorsions=false, bool useMacrocycle14config=false, std::shared_ptr< std::map< std::pair< unsigned int, unsigned int >, double > > CPCI=nullptr, void(*callback)(unsigned int)=nullptr)
Definition: Embedder.h:150
std::vector< unsigned int > failures
Definition: Embedder.h:147
boost::shared_ptr< const DistGeom::BoundsMatrix > boundsMat
Definition: Embedder.h:136
std::shared_ptr< std::map< std::pair< unsigned int, unsigned int >, double > > CPCI
Definition: Embedder.h:141