Functional Group Filter
Here is a problem I have come across recently:
Given a list of SMILES, is there a tool or a way of recognising which functional groups each SMILES contains?
Here is the list of functional groups that I would like to examine:
- Acid (carboxylate)
- Alcohol
- Alkene
- Anhydride
- Amide
- 1y Amide
- 2y Amide
- 3y Amide
- Amine
- 1y Amine
- 2y Amine
- 3y Amine
- Heteroaromatic Amines
- Anilines
- Nitrogen Bases
- Aromatic
- Carbonate
- Ester
- Ether
- Ketone/Aldehyde
- Halogen
- Chloro
- Fluoro
- Hydrocarbon
- Nitrile
- Nitro
- Phosphorus Containing
- Sulfur Containing
- Sulfide
- Sulfoxide
- Sulfite
- Sulfone
- Urea
- Silicone
- Other
RDKit offers some of this functionality through its rdkit.Chem.Fragments module. Although it covers many of the groups in the above collection, it is not a comprehensive list.
As always, if something is not available, I like to have a go at building it myself.
I ended up defining 37 functions - one for each of the above groups (and a couple more). I was only interested in whether a functional group is present or not, not in how many times it occurs. As the code for each functional group filter is long, I will first demo the use, and provide the code at the end.
I hope you find this functionality useful and sensible. If you spot any errors or improvements, please contact me.
First let’s import the required modules:
# pandas and numpy are always a must
import pandas as pd
import numpy as np
# rdkit for sensible cheminformatics
from rdkit import Chem
from rdkit.Chem import Fragments
Let’s define a function that will be taking a pandas dataframe with a column containing SMILES and returning a table (dataframe) with functional group names (as columns) and whether each row entry has that functional group:
def ApplyFGroupsFunctions(data, smiles_col='SMILES'):
    """Build a table of functional-group flags for every SMILES in *data*.

    Args:
        data: pandas DataFrame holding the molecules.
        smiles_col: name of the column in *data* that contains the SMILES
            strings (defaults to 'SMILES').

    Returns:
        A new DataFrame with one column per functional group, each cell
        being 'Y' or 'N'. The result carries the same index as *data*, so
        it can be joined back with ``pd.concat([data, result], axis=1)``
        even when *data* has been filtered or re-indexed.
    """
    # Column name -> recognition function. Insertion order fixes the column
    # order of the returned table. The names are looked up when this
    # function is called, so the detectors may be defined later in the file.
    detectors = {
        "Alcohol": AlcoholGroup,
        "Alkene": AlkeneGroup,
        "Alkyne": AlkyneGroup,
        "Anhydride": AnhydrideGroup,
        "Aromatic": AromaticGroup,
        "Amide": AmideGroup,
        "Amine": AmineGroup,
        "Aniline": AnilineGroup,
        "BasicN": BasicNGroup,
        "Bromo": BromoGroup,
        "Carbonyl": CarbonylGroup,
        "Carbonate": CarbonateGroup,
        "Chloro": ChloroGroup,
        "Ester": EsterGroup,
        "Ether": EtherGroup,
        "Fluoro": FluoroGroup,
        "Halogen": HalogenGroup,
        "HetArAmine": HetArAmineGroup,
        "Hydrocarbon": HydrocarbonGroup,
        "Iodo": IodoGroup,
        "Ketone/Aldehude": KetAlGroup,
        "Nitrile": NitrileGroup,
        "Nitro": NitroGroup,
        "Phosporus": PhosporusGroup,
        "PrimaryAmide": PrimaryAmideGroup,
        "PrimaryAmine": PrimaryAmineGroup,
        "SecAmine": SecAmineGroup,
        "SecAmide": SecAmideGroup,
        "Silicone": SiliconeGroup,
        "Sulfide": SulfideGroup,
        "Sulfone": SulfoneGroup,
        "Sulfur": SulfurGroup,
        "Sulfite": SulfiteGroup,
        "Sulfoxide": SulfoxideGroup,
        "TertAmine": TertAmineGroup,
        "TertAmide": TertAmideGroup,
        "Urea": UreaGroup,
    }
    smiles_values = data[smiles_col]
    fgroup_cols = {
        column: [detect(smi) for smi in smiles_values]
        for column, detect in detectors.items()
    }
    # Reuse the caller's index; a fresh 0..n-1 index would misalign rows
    # when the result is concatenated column-wise with *data*.
    return pd.DataFrame(fgroup_cols, index=data.index)
Now let’s define a dataframe with a few molecules to test the above:
# Example molecules; smiles[i] is the structure for names[i].
smiles = ['CC(C)=O', 'CC(=O)N(C)C', 'CC#N', 'CC(=O)OCC',
          # isopropyl acetate: the ester oxygen carries an isopropyl group
          'CC(=O)OC(C)C', 'C(=O)(OC(C)C)OC', 'C(=O)(NC)NC',
          'O1CCN(C)CC1', 'C1=CC=CC2=C1C=CN=C2', 'C1=CC=CC2=C1C(=O)OC2=O',
          'C1=CC=C(C4=C1C2CC(C)(N(C(N2C)=O)CCCC3=CC=C(C=C3)OC)O4)F',
          'C1C(C(N1C2=CC=C(C(=C2)C(F)(F)F)C#N)=O)C(C)=O',
          'CN1C[C@@H](C=C2[C@H]1CC3=CNC4=CC=CC2=C34)C(=O)O', 'CN(C)CCC1=CNC2=C1C(=CC=C2)OP(=O)(O)O']
names = ['Acetone', 'DMF', 'Acetonitrile', 'Ethyl Acetate',
         'Isopropyl Acetate', 'Methyl Isopropyl Carbonate', 'Bis(Methyl) Urea',
         'N-Methyl Morpholine', 'Isoquinoline', 'Aromatic Anhydride',
         'Chromopynone Analogue', 'Lactam', 'Lysergic acid', 'Psylocybin']
# Two-column table: molecule name and its SMILES
data = pd.DataFrame({'Names': names, 'SMILES': smiles})
data
Names | SMILES | |
---|---|---|
0 | Acetone | CC(C)=O |
1 | DMF | CC(=O)N(C)C |
2 | Acetonitrile | CC#N |
3 | Ethyl Acetate | CC(=O)OCC |
4 | Isopropyl Acetate | CC(=O)OCC |
5 | Methyl Isopropyl Carbonate | C(=O)(OC(C)C)OC |
6 | Bis(Methyl) Urea | C(=O)(NC)NC |
7 | N-Methyl Morpholine | O1CCN(C)CC1 |
8 | Isoquinoline | C1=CC=CC2=C1C=CN=C2 |
9 | Aromatic Anhydride | C1=CC=CC2=C1C(=O)OC2=O |
10 | Chromopynone Analogue | C1=CC=C(C4=C1C2CC(C)(N(C(N2C)=O)CCCC3=CC=C(C=C... |
11 | Lactam | C1C(C(N1C2=CC=C(C(=C2)C(F)(F)F)C#N)=O)C(C)=O |
12 | Lysergic acid | CN1C[C@@H](C=C2[C@H]1CC3=CNC4=CC=CC2=C34)C(=O)O |
13 | Psylocybin | CN(C)CCC1=CNC2=C1C(=CC=C2)OP(=O)(O)O |
# Compute the functional-group flags for every molecule in the table
new_cols = ApplyFGroupsFunctions(data)
# Append the flag columns to the right of the original columns
data = pd.concat(objs=[data, new_cols], axis="columns")
# Preview the first rows to confirm the new columns are there
data.head()
Names | SMILES | Alcohol | Alkene | Alkyne | Anhydride | Aromatic | Amide | Amine | Aniline | ... | SecAmide | Silicone | Sulfide | Sulfone | Sulfur | Sulfite | Sulfoxide | TertAmine | TertAmide | Urea | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Acetone | CC(C)=O | N | N | N | N | N | N | N | N | ... | N | N | N | N | N | N | N | N | N | N |
1 | DMF | CC(=O)N(C)C | N | N | N | N | N | Y | N | N | ... | N | N | N | N | N | N | N | Y | Y | N |
2 | Acetonitrile | CC#N | N | N | N | N | N | N | N | N | ... | N | N | N | N | N | N | N | Y | N | N |
3 | Ethyl Acetate | CC(=O)OCC | N | N | N | N | N | N | N | N | ... | N | N | N | N | N | N | N | N | N | N |
4 | Isopropyl Acetate | CC(=O)OCC | N | N | N | N | N | N | N | N | ... | N | N | N | N | N | N | N | N | N | N |
5 rows × 39 columns
That is lovely!
Let’s see if we can use it to slice the data appropriately:
# Boolean row selector: True where the molecule was flagged as aromatic
mask = data['Aromatic'].eq('Y')
selected = data.loc[mask]
selected
Names | SMILES | Alcohol | Alkene | Alkyne | Anhydride | Aromatic | Amide | Amine | Aniline | ... | SecAmide | Silicone | Sulfide | Sulfone | Sulfur | Sulfite | Sulfoxide | TertAmine | TertAmide | Urea | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
8 | Isoquinoline | C1=CC=CC2=C1C=CN=C2 | N | N | N | N | Y | N | Y | N | ... | N | N | N | N | N | N | N | Y | N | N |
9 | Aromatic Anhydride | C1=CC=CC2=C1C(=O)OC2=O | N | N | N | Y | Y | N | N | N | ... | N | N | N | N | N | N | N | N | N | N |
10 | Chromopynone Analogue | C1=CC=C(C4=C1C2CC(C)(N(C(N2C)=O)CCCC3=CC=C(C=C... | N | N | N | N | Y | Y | N | N | ... | N | N | N | N | N | N | N | Y | Y | Y |
11 | Lactam | C1C(C(N1C2=CC=C(C(=C2)C(F)(F)F)C#N)=O)C(C)=O | N | N | N | N | Y | Y | N | Y | ... | N | N | N | N | N | N | N | Y | N | N |
12 | Lysergic acid | CN1C[C@@H](C=C2[C@H]1CC3=CNC4=CC=CC2=C34)C(=O)O | N | Y | N | N | Y | N | Y | N | ... | N | N | N | N | N | N | N | Y | N | N |
13 | Psylocybin | CN(C)CCC1=CNC2=C1C(=CC=C2)OP(=O)(O)O | N | N | N | N | Y | N | Y | N | ... | N | N | N | N | N | N | N | Y | N | N |
6 rows × 39 columns
Let’s visualise these molecules to check that each of them has an aromatic group:
# The Draw submodule is not loaded by `from rdkit import Chem`; import it
# explicitly so the grid-drawing call below can be resolved.
from rdkit.Chem import Draw

# One caption per molecule, in the same order as the drawings
legend = list(selected['Names'])
mols = [Chem.MolFromSmiles(smi) for smi in selected['SMILES']]
img = Draw.MolsToGridImage(mols=mols, legends=legend)
img
That’s nice to see.
I hope you find this functionality useful and sensible. If you spot any errors or improvements, please contact me.
As always the code can also be found on Github here.
Here is the code for the Functional Group Recognition Functions:
Functional_Groups_Functions
### --- DEFINE FUNCTIONAL GROUP RECOGNITION FUNCTIONS ---
def AlcoholGroup(smi):
    """Return "Y" if RDKit finds a hydroxyl fragment in *smi*, else "N".

    The decision is based on the sum of the RDKit fragment counts
    fr_Al_OH and fr_Ar_OH for the parsed molecule.
    """
    mol = Chem.MolFromSmiles(smi)
    counters = (Chem.Fragments.fr_Al_OH, Chem.Fragments.fr_Ar_OH)
    hydroxyl_hits = sum(count(mol) for count in counters)
    return "Y" if hydroxyl_hits > 0 else "N"
def AmideGroup(smi):
    """Return "Y" if RDKit's fr_amide count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    amide_hits = Chem.Fragments.fr_amide(mol)
    return "Y" if amide_hits > 0 else "N"
def PrimaryAmideGroup(smi):
    """Return "Y" if RDKit's fr_priamide count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    primary_amide_hits = Chem.Fragments.fr_priamide(mol)
    return "Y" if primary_amide_hits > 0 else "N"
def AmineGroup(smi):
    """Return "Y" if *smi* appears to contain an amine nitrogen, else "N".

    The RDKit fragment counts fr_Ar_NH, fr_NH2, fr_NH1 and fr_NH0 are added
    up, and the counts of nitrogens that are not amines (fr_amide,
    fr_nitrile, fr_Imine) are subtracted. A positive remainder is reported
    as "Y".

    Args:
        smi: SMILES string of the molecule.

    Returns:
        "Y" or "N".
    """
    mol = Chem.MolFromSmiles(smi)
    nitrogen_hits = (
        Chem.Fragments.fr_Ar_NH(mol)
        + Chem.Fragments.fr_NH2(mol)
        + Chem.Fragments.fr_NH1(mol)
        + Chem.Fragments.fr_NH0(mol)
    )
    # Each excluded class is subtracted exactly once. Subtracting the amide
    # count twice would cancel a genuine amine for every amide present
    # (e.g. an amino-amide would be reported as "N").
    non_amine_hits = (
        Chem.Fragments.fr_amide(mol)       # not an amide
        + Chem.Fragments.fr_nitrile(mol)   # not a nitrile
        + Chem.Fragments.fr_Imine(mol)     # not an imine
    )
    return "Y" if nitrogen_hits - non_amine_hits > 0 else "N"
def PrimaryAmineGroup(smi):
    """Return "Y" if RDKit's fr_NH2 count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    nh2_hits = Chem.Fragments.fr_NH2(mol)
    return "Y" if nh2_hits > 0 else "N"
def SecAmineGroup(smi):
    """Return "Y" if RDKit's fr_NH1 count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    nh1_hits = Chem.Fragments.fr_NH1(mol)
    return "Y" if nh1_hits > 0 else "N"
def TertAmineGroup(smi):
    """Return "Y" if RDKit's fr_NH0 count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    nh0_hits = Chem.Fragments.fr_NH0(mol)
    return "Y" if nh0_hits > 0 else "N"
def HetArAmineGroup(smi):
    """Return "Y" if any nitrogen-heteroaromatic fragment is found in *smi*.

    Sums the RDKit fragment counts fr_Ar_N, fr_Nhpyrrole, fr_imidazole,
    fr_oxazole, fr_pyridine, fr_tetrazole and fr_thiazole; a positive
    total gives "Y", otherwise "N".
    """
    mol = Chem.MolFromSmiles(smi)
    counters = (
        Chem.Fragments.fr_Ar_N,
        Chem.Fragments.fr_Nhpyrrole,
        Chem.Fragments.fr_imidazole,
        Chem.Fragments.fr_oxazole,
        Chem.Fragments.fr_pyridine,
        Chem.Fragments.fr_tetrazole,
        Chem.Fragments.fr_thiazole,
        # more groups should go here - need to check
    )
    hetero_hits = sum(count(mol) for count in counters)
    return "Y" if hetero_hits > 0 else "N"
def AnilineGroup(smi):
    """Return "Y" if RDKit's fr_aniline count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    aniline_hits = Chem.Fragments.fr_aniline(mol)
    return "Y" if aniline_hits > 0 else "N"
def BasicNGroup(smi):
    """Return "Y" if *smi* contains any of the nitrogen groups checked here.

    Combines the RDKit fragment counts fr_piperdine, fr_piperzine,
    fr_oxime, fr_morpholine, fr_amidine, fr_Imine and fr_guanido with the
    verdicts of the amine, primary/secondary/tertiary amine,
    heteroaromatic amine and aniline checks from this file.
    """
    mol = Chem.MolFromSmiles(smi)
    fragment_counters = (
        Chem.Fragments.fr_piperdine,
        Chem.Fragments.fr_piperzine,
        Chem.Fragments.fr_oxime,
        Chem.Fragments.fr_morpholine,
        Chem.Fragments.fr_amidine,
        Chem.Fragments.fr_Imine,
        Chem.Fragments.fr_guanido,
    )
    total = sum(count(mol) for count in fragment_counters)

    # Every amine-type check is evaluated (list, not generator) and together
    # they contribute at most one hit.
    amine_checks = (AmineGroup, PrimaryAmineGroup, SecAmineGroup,
                    TertAmineGroup, HetArAmineGroup, AnilineGroup)
    verdicts = [check(smi) == "Y" for check in amine_checks]
    total += int(any(verdicts))

    return "Y" if total > 0 else "N"
def EsterGroup(smi):
    """Return "Y" if *smi* has ester hits left after discounting anhydrides.

    RDKit's fr_ester also fires on anhydrides, so two ester hits are
    removed for every anhydride substructure match before the decision.
    """
    mol = Chem.MolFromSmiles(smi)
    ester_hits = Chem.Fragments.fr_ester(mol)  # this also hits anhydrides
    anhydride_query = Chem.MolFromSmarts('[CX3](=[OX1])[OX2][CX3](=[OX1])')
    anhydride_matches = mol.GetSubstructMatches(anhydride_query)
    # two esters are counted per anhydride
    remaining = ester_hits - 2 * len(anhydride_matches)
    return "Y" if remaining > 0 else "N"
def EtherGroup(smi):
    """Return "Y" if *smi* has more C-O-C matches than ester hits, else "N".

    The C-O-C SMARTS also matches ester oxygens, so RDKit's fr_ester count
    is subtracted from the number of matches before the decision.
    """
    mol = Chem.MolFromSmiles(smi)
    coc_query = Chem.MolFromSmarts('[OD2]([#6])[#6]')  # this also hits esters
    coc_matches = mol.GetSubstructMatches(coc_query)
    remaining = len(coc_matches) - Chem.Fragments.fr_ester(mol)  # not an ester
    return "Y" if remaining > 0 else "N"
def KetAlGroup(smi):
    """Return "Y" if a ketone or aldehyde fragment is found in *smi*.

    Sums the RDKit fragment counts fr_ketone_Topliss, fr_ketone and
    fr_aldehyde; a positive total gives "Y", otherwise "N".
    """
    mol = Chem.MolFromSmiles(smi)
    counters = (
        Chem.Fragments.fr_ketone_Topliss,
        Chem.Fragments.fr_ketone,
        Chem.Fragments.fr_aldehyde,
    )
    carbonyl_hits = sum(count(mol) for count in counters)
    return "Y" if carbonyl_hits > 0 else "N"
def CarbonylGroup(smi):
    """Return "Y" if RDKit's fr_C_O_noCOO count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    carbonyl_hits = Chem.Fragments.fr_C_O_noCOO(mol)
    return "Y" if carbonyl_hits > 0 else "N"
def HalogenGroup(smi):
    """Return "Y" if a halogen fragment is found in *smi*, else "N".

    Sums the RDKit fragment counts fr_alkyl_halide and fr_halogen.
    """
    mol = Chem.MolFromSmiles(smi)
    counters = (Chem.Fragments.fr_alkyl_halide, Chem.Fragments.fr_halogen)
    halogen_hits = sum(count(mol) for count in counters)
    return "Y" if halogen_hits > 0 else "N"
def ChloroGroup(smi):
    """Return "Y" if the text 'Cl' occurs in the SMILES string, else "N"."""
    return "Y" if 'Cl' in smi else "N"
def BromoGroup(smi):
    """Return "Y" if the text 'Br' occurs in the SMILES string, else "N"."""
    return "Y" if 'Br' in smi else "N"
def FluoroGroup(smi):
    """Return "Y" if the SMILES string contains a fluorine atom, else "N".

    A plain substring test for 'F' would also fire on bracket atoms such
    as [Fe] or [Fr], so bracket atoms are read by their full element symbol.

    Args:
        smi: SMILES string of the molecule.

    Returns:
        "Y" or "N".
    """
    import re

    # Bracket atoms may be any element: capture the symbol that follows the
    # optional isotope number (e.g. 'Fe' from '[Fe+2]', 'F' from '[18F]').
    bracket_symbols = re.findall(r'\[\d*([A-Z][a-z]?|[a-z]{1,2})', smi)
    # Outside brackets only the organic subset is allowed, so a bare 'F'
    # there is always fluorine.
    unbracketed = re.sub(r'\[[^\]]*\]', '', smi)
    if 'F' in unbracketed or 'F' in bracket_symbols:
        return "Y"
    return "N"
def IodoGroup(smi):
    """Return "Y" if the SMILES string contains an iodine atom, else "N".

    A plain substring test for 'I' would also fire on bracket atoms such
    as [In] or [Ir], so bracket atoms are read by their full element symbol.

    Args:
        smi: SMILES string of the molecule.

    Returns:
        "Y" or "N".
    """
    import re

    # Bracket atoms may be any element: capture the symbol that follows the
    # optional isotope number (e.g. 'Ir' from '[Ir+3]', 'I' from '[125I]').
    bracket_symbols = re.findall(r'\[\d*([A-Z][a-z]?|[a-z]{1,2})', smi)
    # Outside brackets only the organic subset is allowed, so a bare 'I'
    # there is always iodine.
    unbracketed = re.sub(r'\[[^\]]*\]', '', smi)
    if 'I' in unbracketed or 'I' in bracket_symbols:
        return "Y"
    return "N"
def SiliconeGroup(smi):
    """Return "Y" if the text 'Si' occurs in the SMILES string, else "N"."""
    return "Y" if 'Si' in smi else "N"
def PhosporusGroup(smi):
    """Return "Y" if the SMILES string contains a phosphorus atom, else "N".

    A plain substring test for 'P' would also fire on bracket atoms such
    as [Pt], [Pd] or [Pb], and would miss aromatic phosphorus written as
    lowercase 'p'. Bracket atoms are therefore read by their full element
    symbol, and both the aliphatic and aromatic spellings are accepted.

    Args:
        smi: SMILES string of the molecule.

    Returns:
        "Y" or "N".
    """
    import re

    # Bracket atoms may be any element: capture the symbol that follows the
    # optional isotope number (e.g. 'Pt' from '[Pt+2]', 'P' from '[PH4+]').
    bracket_symbols = re.findall(r'\[\d*([A-Z][a-z]?|[a-z]{1,2})', smi)
    # Outside brackets only the organic subset is allowed, so a bare 'P'
    # or 'p' there is always phosphorus.
    unbracketed = re.sub(r'\[[^\]]*\]', '', smi)
    in_plain_text = 'P' in unbracketed or 'p' in unbracketed
    in_brackets = 'P' in bracket_symbols or 'p' in bracket_symbols
    if in_plain_text or in_brackets:
        return "Y"
    return "N"
def SulfurGroup(smi):
    """Return "Y" if the SMILES string contains a sulfur atom, else "N".

    Testing for the substring 'S' while rejecting any string that contains
    'Si' has three problems: a molecule with both sulfur and silicon is
    reported as "N", bracket atoms such as [Se] or [Sn] are reported as
    "Y", and aromatic sulfur written as lowercase 's' (e.g. thiophene) is
    missed. Bracket atoms are therefore read by their full element symbol,
    and both the aliphatic and aromatic spellings are accepted.

    Args:
        smi: SMILES string of the molecule.

    Returns:
        "Y" or "N".
    """
    import re

    # Bracket atoms may be any element: capture the symbol that follows the
    # optional isotope number (e.g. 'Si' from '[Si]', 'S' from '[S@@]',
    # 'se' from '[se]').
    bracket_symbols = re.findall(r'\[\d*([A-Z][a-z]?|[a-z]{1,2})', smi)
    # Outside brackets only the organic subset is allowed, so a bare 'S'
    # or 's' there is always sulfur.
    unbracketed = re.sub(r'\[[^\]]*\]', '', smi)
    in_plain_text = 'S' in unbracketed or 's' in unbracketed
    in_brackets = 'S' in bracket_symbols or 's' in bracket_symbols
    if in_plain_text or in_brackets:
        return "Y"
    return "N"
def HydrocarbonGroup(smi):
    """Return "Y" if RDKit's fr_unbrch_alkane count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    alkane_hits = Chem.Fragments.fr_unbrch_alkane(mol)
    return "Y" if alkane_hits > 0 else "N"
def NitrileGroup(smi):
    """Return "Y" if RDKit's fr_nitrile count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    nitrile_hits = Chem.Fragments.fr_nitrile(mol)
    return "Y" if nitrile_hits > 0 else "N"
def NitroGroup(smi):
    """Return "Y" if RDKit's fr_nitro count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    nitro_hits = Chem.Fragments.fr_nitro(mol)
    return "Y" if nitro_hits > 0 else "N"
def SulfideGroup(smi):
    """Return "Y" if RDKit's fr_sulfide count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    sulfide_hits = Chem.Fragments.fr_sulfide(mol)
    return "Y" if sulfide_hits > 0 else "N"
def SulfoneGroup(smi):
    """Return "Y" if RDKit's fr_sulfone count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    sulfone_hits = Chem.Fragments.fr_sulfone(mol)
    return "Y" if sulfone_hits > 0 else "N"
def UreaGroup(smi):
    """Return "Y" if RDKit's fr_urea count for *smi* is positive, else "N"."""
    mol = Chem.MolFromSmiles(smi)
    urea_hits = Chem.Fragments.fr_urea(mol)
    return "Y" if urea_hits > 0 else "N"
#define new functions that use SMARTS
def AlkeneGroup(smi):
    """Return 'Y' if the SMARTS 'C=C' matches the molecule in *smi*, else 'N'."""
    mol = Chem.MolFromSmiles(smi)
    query = Chem.MolFromSmarts('C=C')
    hits = mol.GetSubstructMatches(query)
    return 'Y' if len(hits) > 0 else 'N'
def AlkyneGroup(smi):
    """Return 'Y' if the SMARTS '[CX2]#[CX2]' matches the molecule in *smi*, else 'N'."""
    mol = Chem.MolFromSmiles(smi)
    query = Chem.MolFromSmarts('[CX2]#[CX2]')
    hits = mol.GetSubstructMatches(query)
    return 'Y' if len(hits) > 0 else 'N'
def AnhydrideGroup(smi):
    """Return 'Y' if the C(=O)-O-C(=O) SMARTS matches the molecule in *smi*, else 'N'."""
    mol = Chem.MolFromSmiles(smi)
    query = Chem.MolFromSmarts('[CX3](=[OX1])[OX2][CX3](=[OX1])')
    hits = mol.GetSubstructMatches(query)
    return 'Y' if len(hits) > 0 else 'N'
def SecAmideGroup(smi):
    """Return 'Y' if *smi* contains a secondary amide, else 'N'.

    The pattern is C-NH-C(=O)-C: a three-connected nitrogen carrying one
    hydrogen, bonded to a carbonyl carbon (which itself carries a carbon)
    and to one further carbon substituent.

    Args:
        smi: SMILES string of the molecule.

    Returns:
        'Y' or 'N'.
    """
    # The N-substituent is written as [#6] (any carbon). Restricting it to
    # [CX3] would reject the usual N-alkyl (four-connected) and N-aryl
    # (aromatic) secondary amides and leave essentially only imides.
    sec_amide_smarts = '[#6][NX3;H1][CX3](=[OX1])[#6]'
    mol = Chem.MolFromSmiles(smi)
    pattern = Chem.MolFromSmarts(sec_amide_smarts)
    matches = mol.GetSubstructMatches(pattern)
    return 'Y' if len(matches) > 0 else 'N'
def TertAmideGroup(smi):
    """Return 'Y' if the tertiary-amide SMARTS matches the molecule in *smi*, else 'N'.

    The pattern requires a three-connected nitrogen singly bonded to two
    four-connected carbons and to a three-connected carbon that is double
    bonded to oxygen.
    """
    mol = Chem.MolFromSmiles(smi)
    query = Chem.MolFromSmarts('[NX3](-[CX4])(-[CX4])-[CX3]=[O]')
    hits = mol.GetSubstructMatches(query)
    return 'Y' if len(hits) > 0 else 'N'
def AromaticGroup(smi):
    """Return 'Y' if at least one ring of the molecule is fully aromatic, else 'N'.

    A ring counts as aromatic when every bond in it is flagged aromatic by
    RDKit (approach taken from the RDKit Cookbook:
    https://www.rdkit.org/docs/Cookbook.html#identify-aromatic-rings).
    """
    mol = Chem.MolFromSmiles(smi)
    ring_info = mol.GetRingInfo()
    # One boolean per ring: True when all of that ring's bonds are aromatic
    ring_is_aromatic = [
        all(mol.GetBondWithIdx(bond_idx).GetIsAromatic() for bond_idx in bond_ring)
        for bond_ring in ring_info.BondRings()
    ]
    return 'Y' if any(ring_is_aromatic) else 'N'
def CarbonateGroup(smi):
    """Return 'Y' if the O-C(=O)-O SMARTS matches the molecule in *smi*, else 'N'."""
    mol = Chem.MolFromSmiles(smi)
    query = Chem.MolFromSmarts('[CX3](=[O])(-[O])-[O]')
    hits = mol.GetSubstructMatches(query)
    return 'Y' if len(hits) > 0 else 'N'
def SulfoxideGroup(smi):
    """Return 'Y' if the C-S(=O)-C SMARTS matches the molecule in *smi*, else 'N'.

    The sulfur must have exactly three explicit connections: the doubly
    bonded oxygen and two singly bonded aliphatic carbons.
    """
    mol = Chem.MolFromSmiles(smi)
    query = Chem.MolFromSmarts('[S;D3](=[O])(-[C])-[C]')
    hits = mol.GetSubstructMatches(query)
    return 'Y' if len(hits) > 0 else 'N'
def SulfiteGroup(smi):
    """Return 'Y' if *smi* contains a sulfite group, O-S(=O)-O, else 'N'.

    The pattern is a three-connected sulfur double bonded to one oxygen
    and singly bonded to two further oxygens, each of which is either
    two-connected (ester or acid oxygen) or a negatively charged terminal
    oxygen (sulfite salt).

    Args:
        smi: SMILES string of the molecule.

    Returns:
        'Y' or 'N'.
    """
    # A two-connected sulfur between two carbons is a sulfide (thioether),
    # not a sulfite, so the match is anchored on the S(=O)(O)O core instead.
    sulfite_smarts = '[SX3](=[OX1])([OX2,OX1-])[OX2,OX1-]'
    mol = Chem.MolFromSmiles(smi)
    pattern = Chem.MolFromSmarts(sulfite_smarts)
    matches = mol.GetSubstructMatches(pattern)
    return 'Y' if len(matches) > 0 else 'N'