RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SmilesWrite.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_SMILESWRITE_H_012020
12#define RD_SMILESWRITE_H_012020
13
14#include <string>
15#include <vector>
16#include <memory>
17#include <cstdint>
18#include <limits>
19
20#include <boost/shared_ptr.hpp>
21
22namespace RDKit {
23class Atom;
24class Bond;
25class ROMol;
26
27typedef std::vector<boost::shared_ptr<ROMol>> MOL_SPTR_VECT;
28
30 bool doIsomericSmiles =
31 true; /**< include stereochemistry and isotope information */
32 bool doKekule = false; /**< kekulize the molecule before generating the SMILES
33 and output single/double bonds. NOTE that the output
34 is not canonical and that this will throw an
35 exception if the molecule cannot be kekulized. */
36 bool canonical = true; /**< generate canonical SMILES */
37 bool cleanStereo = true; /**< clean up stereo */
38 bool allBondsExplicit = false; /**< include symbols for all bonds */
39 bool allHsExplicit = false; /**< provide hydrogen counts for every atom */
40 bool doRandom = false; /**< randomize the output order. The resulting SMILES
41 is not canonical */
42 int rootedAtAtom = -1; /**< make sure the SMILES starts at the specified
43 atom. The resulting SMILES is not canonical */
44 bool includeDativeBonds =
45 true; /**< include the RDKit extension for dative bonds. Otherwise dative
46 bonds will be written as single bonds*/
47 bool ignoreAtomMapNumbers = false; /**< If true, ignores any atom map numbers
48 when canonicalizing the molecule */
49};
50
51namespace SmilesWrite {
52
53#define CXSMILESFIELDS_ENUM_ITEMS \
54 CXSMILESFIELDS_ENUM_ITEM(CX_NONE, 0) \
55 CXSMILESFIELDS_ENUM_ITEM(CX_ATOM_LABELS, 1 << 0) \
56 CXSMILESFIELDS_ENUM_ITEM(CX_MOLFILE_VALUES, 1 << 1) \
57 CXSMILESFIELDS_ENUM_ITEM(CX_COORDS, 1 << 2) \
58 CXSMILESFIELDS_ENUM_ITEM(CX_RADICALS, 1 << 3) \
59 CXSMILESFIELDS_ENUM_ITEM(CX_ATOM_PROPS, 1 << 4) \
60 CXSMILESFIELDS_ENUM_ITEM(CX_LINKNODES, 1 << 5) \
61 CXSMILESFIELDS_ENUM_ITEM(CX_ENHANCEDSTEREO, 1 << 6) \
62 CXSMILESFIELDS_ENUM_ITEM(CX_SGROUPS, 1 << 7) \
63 CXSMILESFIELDS_ENUM_ITEM(CX_POLYMER, 1 << 8) \
64 CXSMILESFIELDS_ENUM_ITEM(CX_BOND_CFG, 1 << 9) \
65 CXSMILESFIELDS_ENUM_ITEM(CX_BOND_ATROPISOMER, 1 << 10) \
66 CXSMILESFIELDS_ENUM_ITEM(CX_COORDINATE_BONDS, 1 << 11) \
67 CXSMILESFIELDS_ENUM_ITEM(CX_ALL, 0x7fffffff) \
68 CXSMILESFIELDS_ENUM_ITEM(CX_ALL_BUT_COORDS, CX_ALL ^ CX_COORDS)
69
70#define CXSMILESFIELDS_ENUM_ITEM(k, v) k = (v),
72#undef CXSMILESFIELDS_ENUM_ITEM
73#define CXSMILESFIELDS_STD_MAP_ITEM(k) {#k, SmilesWrite::CXSmilesFields::k},
74#define CXSMILESFIELDS_ENUM_ITEM(k, v) CXSMILESFIELDS_STD_MAP_ITEM(k)
75#define CXSMILESFIELDS_ITEMS_MAP \
76 std::map<std::string, SmilesWrite::CXSmilesFields> { \
77 CXSMILESFIELDS_ENUM_ITEMS \
78 }
79
80//! \brief returns the cxsmiles data for a molecule
82 const ROMol &mol, std::uint32_t flags = CXSmilesFields::CX_ALL);
83
84//! \brief returns the cxsmiles data for a vector of molecules
86 const std::vector<ROMol *> &mols, std::uint32_t flags);
87
88//! \brief returns true if the atom number is in the SMILES organic subset
90
91//! \brief returns the SMILES for an atom
92/*!
93 \param atom : the atom to work with
94 \param ps : the parameters controlling the SMILES generation
95*/
97 const SmilesWriteParams &ps);
98
99//! \brief returns the SMILES for an atom
100/*!
101 \param atom : the atom to work with
102 \param doKekule : we're doing kekulized smiles (e.g. don't use
103 lower case for the atom label)
104 \param bondIn : the bond we came into the atom on (unused)
105 \param allHsExplicit : if true, hydrogen counts will be provided for every
106 atom.
107 \param isomericSmiles : if true, isomeric SMILES will be generated
108*/
109inline std::string GetAtomSmiles(const Atom *atom, bool doKekule = false,
110 const Bond * = nullptr,
111 bool allHsExplicit = false,
112 bool isomericSmiles = true) {
113 // RDUNUSED_PARAM(bondIn);
116 ps.doKekule = doKekule;
117 ps.allHsExplicit = allHsExplicit;
118 return GetAtomSmiles(atom, ps);
119};
120
121//! \brief returns the SMILES for a bond
122/*!
123 \param bond : the bond to work with
124 \param ps : the parameters controlling the SMILES generation
125 \param atomToLeftIdx : the index of the atom preceding \c bond
126 in the SMILES
127*/
129 const SmilesWriteParams &ps,
130 int atomToLeftIdx = -1);
131//! \brief returns the SMILES for a bond
132/*!
133 \param bond : the bond to work with
134 \param atomToLeftIdx : the index of the atom preceding \c bond
135 in the SMILES
136 \param doKekule : we're doing kekulized smiles (e.g. write out
137 bond orders for aromatic bonds)
138 \param allBondsExplicit : if true, symbols will be included for all bonds.
139*/
140inline std::string GetBondSmiles(const Bond *bond, int atomToLeftIdx = -1,
141 bool doKekule = false,
142 bool allBondsExplicit = false) {
144 ps.doKekule = doKekule;
145 ps.allBondsExplicit = allBondsExplicit;
146 ps.doIsomericSmiles = false;
147 return GetBondSmiles(bond, ps, atomToLeftIdx);
148};
149
150namespace detail {
152 const ROMol &mol, const SmilesWriteParams &params, bool doingCXSmiles);
153}
154
155} // namespace SmilesWrite
156
157//! \brief returns canonical SMILES for a molecule
159 const ROMol &mol, const SmilesWriteParams &params);
160
161//! \brief returns SMILES for a molecule, canonical by default
162/*!
163 \param mol : the molecule in question.
164 \param doIsomericSmiles : include stereochemistry and isotope information
165 in the SMILES
166
167 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) NOTE that
168 this will throw an exception if the molecule cannot be kekulized.
169
170 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
171 The resulting SMILES is not, of course, canonical.
172 \param canonical : if false, no attempt will be made to canonicalize the
173 SMILES
174 \param allBondsExplicit : if true, symbols will be included for all bonds.
175 \param allHsExplicit : if true, hydrogen counts will be provided for every
176 atom.
177 \param doRandom : if true, the first atom in the SMILES string will be
178 selected at random and the SMILES string will not be canonical
179 \param ignoreAtomMapNumbers : if true, ignores any atom map numbers when
180 canonicalizing the molecule
181 */
182inline std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles = true,
183 bool doKekule = false, int rootedAtAtom = -1,
184 bool canonical = true,
185 bool allBondsExplicit = false,
186 bool allHsExplicit = false,
187 bool doRandom = false,
188 bool ignoreAtomMapNumbers = false) {
190 ps.doIsomericSmiles = doIsomericSmiles;
191 ps.doKekule = doKekule;
192 ps.rootedAtAtom = rootedAtAtom;
193 ps.canonical = canonical;
194 ps.allBondsExplicit = allBondsExplicit;
195 ps.allHsExplicit = allHsExplicit;
196 ps.doRandom = doRandom;
197 ps.ignoreAtomMapNumbers = ignoreAtomMapNumbers;
198 return MolToSmiles(mol, ps);
199};
200
201//! \brief returns a vector of random SMILES for a molecule (may contain
202//! duplicates)
203/*!
204 \param mol : the molecule in question.
205 \param numSmiles : the number of SMILES to return
206 \param randomSeed : if >0, will be used to seed the random number generator
207 \param doIsomericSmiles : include stereochemistry and isotope information
208 in the SMILES
209 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
210 \param allBondsExplicit : if true, symbols will be included for all bonds.
211 \param allHsExplicit : if true, hydrogen counts will be provided for every
212 atom.
213 */
215 const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed = 0,
216 bool doIsomericSmiles = true, bool doKekule = false,
217 bool allBondsExplicit = false, bool allHsExplicit = false);
218
219//! \brief returns canonical SMILES for part of a molecule
221 const ROMol &mol, const SmilesWriteParams &params,
222 const std::vector<int> &atomsToUse,
223 const std::vector<int> *bondsToUse = nullptr,
224 const std::vector<std::string> *atomSymbols = nullptr,
225 const std::vector<std::string> *bondSymbols = nullptr);
226
227//! \brief returns canonical SMILES for part of a molecule
228/*!
229 \param mol : the molecule in question.
230 \param atomsToUse : indices of the atoms in the fragment
231 \param bondsToUse : indices of the bonds in the fragment. If this is not
232 provided,
233 all bonds between the atoms in atomsToUse will be included
234 \param atomSymbols : symbols to use for the atoms in the output SMILES
235 \param bondSymbols : symbols to use for the bonds in the output SMILES
236 \param doIsomericSmiles : include stereochemistry and isotope information
237 in the SMILES
238 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
239 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
240 The resulting SMILES is not, of course, canonical.
241 \param canonical : if false, no attempt will be made to canonicalize the
242 SMILES
243 \param allBondsExplicit : if true, symbols will be included for all bonds.
244 \param allHsExplicit : if true, hydrogen counts will be provided for every
245 atom.
246 \param doRandom : generate a randomized smiles string by randomly choosing
247 the priority to follow in the DFS traversal. [default false]
248
249 \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
250
251 */
252inline std::string MolFragmentToSmiles(
253 const ROMol &mol, const std::vector<int> &atomsToUse,
254 const std::vector<int> *bondsToUse = nullptr,
255 const std::vector<std::string> *atomSymbols = nullptr,
256 const std::vector<std::string> *bondSymbols = nullptr,
257 bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
258 bool canonical = true, bool allBondsExplicit = false,
259 bool allHsExplicit = false) {
261 ps.doIsomericSmiles = doIsomericSmiles;
262 ps.doKekule = doKekule;
263 ps.rootedAtAtom = rootedAtAtom;
264 ps.canonical = canonical;
265 ps.allBondsExplicit = allBondsExplicit;
266 ps.allHsExplicit = allHsExplicit;
267 return MolFragmentToSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
269}
270
271#define RESTOREBONDDIROPTION_ENUM_ITEMS \
272 RESTOREBONDDIROPTION_ENUM_ITEM(RestoreBondDirOptionTrue, \
273 0) /*!< DO restore bond dirs */ \
274 RESTOREBONDDIROPTION_ENUM_ITEM(RestoreBondDirOptionClear, \
275 1) /*!< clear all bond dir information */
276
277#define RESTOREBONDDIROPTION_ENUM_ITEM(k, v) k = v,
279#undef RESTOREBONDDIROPTION_ENUM_ITEM
280#define RESTOREBONDDIROPTION_STD_MAP_ITEM(k) {#k, k},
281#define RESTOREBONDDIROPTION_ENUM_ITEM(k, v) \
282 RESTOREBONDDIROPTION_STD_MAP_ITEM(k)
283#define RESTOREBONDDIROPTION_ITEMS_MAP \
284 std::map<std::string, RestoreBondDirOption> { \
285 RESTOREBONDDIROPTION_ENUM_ITEMS \
286 }
287
288//! \brief returns canonical CXSMILES for a molecule
290 const ROMol &mol, const SmilesWriteParams &ps,
291 std::uint32_t flags = SmilesWrite::CXSmilesFields::CX_ALL,
293
294//! \brief returns canonical CXSMILES for a molecule
295/*!
296 \param mol : the molecule in question.
297 \param doIsomericSmiles : include stereochemistry and isotope information
298 in the SMILES
299 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
300 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
301 The resulting SMILES is not, of course, canonical.
302 \param canonical : if false, no attempt will be made to canonicalize the
303 SMILES
304 \param allBondsExplicit : if true, symbols will be included for all bonds.
305 \param allHsExplicit : if true, hydrogen counts will be provided for every
306 atom.
307 \param doRandom : generate a randomized smiles string by randomly choosing
308 the priority to follow in the DFS traversal. [default false]
309 */
310inline std::string MolToCXSmiles(const ROMol &mol, bool doIsomericSmiles = true,
311 bool doKekule = false, int rootedAtAtom = -1,
312 bool canonical = true,
313 bool allBondsExplicit = false,
314 bool allHsExplicit = false,
315 bool doRandom = false) {
317 ps.doIsomericSmiles = doIsomericSmiles;
318 ps.doKekule = doKekule;
319 ps.rootedAtAtom = rootedAtAtom;
320 ps.canonical = canonical;
321 ps.allBondsExplicit = allBondsExplicit;
322 ps.allHsExplicit = allHsExplicit;
323 ps.doRandom = doRandom;
324 return MolToCXSmiles(mol, ps, SmilesWrite::CXSmilesFields::CX_ALL);
325};
326
327//! \brief returns canonical CXSMILES for part of a molecule
329 const ROMol &mol, const SmilesWriteParams &params,
330 const std::vector<int> &atomsToUse,
331 const std::vector<int> *bondsToUse = nullptr,
332 const std::vector<std::string> *atomSymbols = nullptr,
333 const std::vector<std::string> *bondSymbols = nullptr);
334
335//! \brief returns canonical CXSMILES for part of a molecule
336/*!
337 \param mol : the molecule in question.
338 \param atomsToUse : indices of the atoms in the fragment
339 \param bondsToUse : indices of the bonds in the fragment. If this is not
340 provided,
341 all bonds between the atoms in atomsToUse will be included
342 \param atomSymbols : symbols to use for the atoms in the output SMILES
343 \param bondSymbols : symbols to use for the bonds in the output SMILES
344 \param doIsomericSmiles : include stereochemistry and isotope information
345 in the SMILES
346 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
347 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
348 The resulting SMILES is not, of course, canonical.
349 \param canonical : if false, no attempt will be made to canonicalize the
350 SMILES
351 \param allBondsExplicit : if true, symbols will be included for all bonds.
352 \param allHsExplicit : if true, hydrogen counts will be provided for every
353 atom.
354
355 \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
356
357 */
358inline std::string MolFragmentToCXSmiles(
359 const ROMol &mol, const std::vector<int> &atomsToUse,
360 const std::vector<int> *bondsToUse = nullptr,
361 const std::vector<std::string> *atomSymbols = nullptr,
362 const std::vector<std::string> *bondSymbols = nullptr,
363 bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
364 bool canonical = true, bool allBondsExplicit = false,
365 bool allHsExplicit = false) {
367 ps.doIsomericSmiles = doIsomericSmiles;
368 ps.doKekule = doKekule;
369 ps.rootedAtAtom = rootedAtAtom;
370 ps.canonical = canonical;
371 ps.allBondsExplicit = allBondsExplicit;
372 ps.allHsExplicit = allHsExplicit;
373 return MolFragmentToCXSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
375}
376
378 const std::string &details_json);
380 const char *details_json);
383 const std::string &details_json);
386 const char *details_json);
387
388} // namespace RDKit
389#endif
The class for representing atoms.
Definition Atom.h:75
class for representing a bond
Definition Bond.h:47
#define RDKIT_SMILESPARSE_EXPORT
Definition export.h:497
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params, bool doingCXSmiles)
RDKIT_SMILESPARSE_EXPORT std::string GetAtomSmiles(const Atom *atom, const SmilesWriteParams &ps)
returns the SMILES for an atom
RDKIT_SMILESPARSE_EXPORT bool inOrganicSubset(int atomicNumber)
returns true if the atom number is in the SMILES organic subset
RDKIT_SMILESPARSE_EXPORT std::string GetBondSmiles(const Bond *bond, const SmilesWriteParams &ps, int atomToLeftIdx=-1)
returns the SMILES for a bond
RDKIT_SMILESPARSE_EXPORT std::string getCXExtensions(const ROMol &mol, std::uint32_t flags=CXSmilesFields::CX_ALL)
returns the cxsmiles data for a molecule
Std stuff.
RDKIT_SMILESPARSE_EXPORT std::vector< std::string > MolToRandomSmilesVect(const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed=0, bool doIsomericSmiles=true, bool doKekule=false, bool allBondsExplicit=false, bool allHsExplicit=false)
returns a vector of random SMILES for a molecule (may contain duplicates)
void updateSmilesWriteParamsFromJSON(SmilesWriteParams &params, const std::string &details_json)
bool rdvalue_is(const RDValue_cast_t)
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical SMILES for part of a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToCXSmiles(const ROMol &mol, const SmilesWriteParams &ps, std::uint32_t flags=SmilesWrite::CXSmilesFields::CX_ALL, RestoreBondDirOption restoreBondDirs=RestoreBondDirOptionClear)
returns canonical CXSMILES for a molecule
void updateCXSmilesFieldsFromJSON(SmilesWrite::CXSmilesFields &cxSmilesFields, RestoreBondDirOption &restoreBondDirs, const std::string &details_json)
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToCXSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical CXSMILES for part of a molecule
std::vector< boost::shared_ptr< ROMol > > MOL_SPTR_VECT
RestoreBondDirOption
@ RESTOREBONDDIROPTION_ENUM_ITEMS