diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..4cb2858 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,25 @@ +name: Linting + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + lint: + name: Lint + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ "3.7", "3.10" ] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: pip install tox + - name: Check code quality with flake8 + run: tox -e flake8 diff --git a/README.md b/README.md index 4cb40e1..4678e66 100644 --- a/README.md +++ b/README.md @@ -1002,3 +1002,11 @@ Generation of analogues of the specified molecule and unbiased library enumerati * ``enumerationInstance.getReconstructedMols(mol)`` - fragment provided molecule (should be RdKit molecule object and not smiles). Return two dictionary - *allSyntheticPathways* and *allSynthons* obtained during fragmentation. +# Development + +This code is auto-formated with [`black`](https://github.com/psf/black) and [`isort`](https://pycqa.github.io/isort/) and checked with continuous integration to ensure code formatting has been properly applied. Please run the following in all pull requests: + +```shell +$ pip install tox +$ tox +``` diff --git a/SyntOn_BBScaffoldGeneration.py b/SyntOn_BBScaffoldGeneration.py index db06242..54b4303 100644 --- a/SyntOn_BBScaffoldGeneration.py +++ b/SyntOn_BBScaffoldGeneration.py @@ -1,9 +1,13 @@ -import sys,os +import os +import sys + from rdkit import Chem + srcPath = os.path.split(os.path.realpath(__file__))[0] sys.path.insert(1, srcPath) -from src.UsefulFunctions import * from src.SyntOn_BBs import * +from src.UsefulFunctions import * + def main(args): with open(args.output + "_Scaffolds.smi", "w") as out: @@ -21,7 +25,9 @@ def main(args): out.write(" " + scaffold + "\n") else: out.write(" linearMolecule\n") - scaffoldsCountSorted = {r: scaffoldsCount[r] for r in sorted(scaffoldsCount, key=scaffoldsCount.get, reverse=True)} + scaffoldsCountSorted = { + r: scaffoldsCount[r] for r in sorted(scaffoldsCount, key=scaffoldsCount.get, reverse=True) + } scaffoldsCount.clear() with open(args.output + "_scaffoldsCounts.smi", "w") as outCounts: for scaffold in scaffoldsCountSorted: @@ -30,41 +36,49 @@ def main(args): cumSum = 0 TotalCompNumb = sum(scaffoldsCountSorted.values()) TotalScaffNumb = len(scaffoldsCountSorted) - for ind,scaff in enumerate(scaffoldsCountSorted): + for ind, scaff in enumerate(scaffoldsCountSorted): cumSum += scaffoldsCountSorted[scaff] - outCumPer.write(str(int(round((ind + 1) / TotalScaffNumb * 100))) + " " + str( - int(round(cumSum / TotalCompNumb * 100))) + "\n") + outCumPer.write( + str(int(round((ind + 1) / TotalScaffNumb * 100))) + + " " + + str(int(round(cumSum / TotalCompNumb * 100))) + + "\n" + ) scaffoldsCountSorted.clear() scaffoldPlot(args.output + "_cumulativeprecentage.smi", args.output) - def scaffoldPlot(cumPercentageFile, outName): from matplotlib import pyplot as plt from numpy import genfromtxt - Data = genfromtxt(cumPercentageFile, delimiter=' ', names=['x', 'y']) + + Data = genfromtxt(cumPercentageFile, delimiter=" ", names=["x", "y"]) fig, ax = plt.subplots() - ax.tick_params(axis='both', which='major', labelsize=12) - plt.plot(Data['x'], Data['y'], color="darkgreen") + ax.tick_params(axis="both", which="major", labelsize=12) + plt.plot(Data["x"], Data["y"], color="darkgreen") plt.ylim(ymin=0, ymax=100) plt.xlim(xmin=0, xmax=100) - plt.ylabel("Fraction of BBs, %", fontweight='bold', fontsize=14) - plt.xlabel("Fraction of scaffolds, %", fontweight='bold', fontsize=14) - plt.title("Cumulative Scaffold Frequency Plot", fontweight='bold', fontsize=14) + plt.ylabel("Fraction of BBs, %", fontweight="bold", fontsize=14) + plt.xlabel("Fraction of scaffolds, %", fontweight="bold", fontsize=14) + plt.title("Cumulative Scaffold Frequency Plot", fontweight="bold", fontsize=14) plt.savefig("Scaffolds_FreqPlot_" + outName + ".png") -if __name__ == '__main__': +if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description="BBs Scaffold analysis. Generates meaningful BBs scaffolds after removing ring-containing leaving and protective groups. Count scaffolds occurrence in the provided collection of BBs, and construct cumulative scaffold frequency plot.", - epilog="Code implementation: Yuliana Zabolotna, Alexandre Varnek\n" - " Laboratoire de Chémoinformatique, Université de Strasbourg.\n\n" - "Knowledge base (SMARTS library): Dmitriy M.Volochnyuk, Sergey V.Ryabukhin, Kostiantyn Gavrylenko, Olexandre Oksiuta\n" - " Institute of Organic Chemistry, National Academy of Sciences of Ukraine\n" - " Kyiv National Taras Shevchenko University\n" - "2021 Strasbourg, Kiev", - prog="SyntOn_BBScaffoldGeneration", formatter_class=argparse.RawTextHelpFormatter) + + parser = argparse.ArgumentParser( + description="BBs Scaffold analysis. Generates meaningful BBs scaffolds after removing ring-containing leaving and protective groups. Count scaffolds occurrence in the provided collection of BBs, and construct cumulative scaffold frequency plot.", + epilog="Code implementation: Yuliana Zabolotna, Alexandre Varnek\n" + " Laboratoire de Chémoinformatique, Université de Strasbourg.\n\n" + "Knowledge base (SMARTS library): Dmitriy M.Volochnyuk, Sergey V.Ryabukhin, Kostiantyn Gavrylenko, Olexandre Oksiuta\n" + " Institute of Organic Chemistry, National Academy of Sciences of Ukraine\n" + " Kyiv National Taras Shevchenko University\n" + "2021 Strasbourg, Kiev", + prog="SyntOn_BBScaffoldGeneration", + formatter_class=argparse.RawTextHelpFormatter, + ) parser.add_argument("-i", "--input", type=str, help="Input BBs file.") parser.add_argument("-o", "--output", type=str, help="Output files suffix name.") args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/SyntOn_BulkFragmentationEnumerationAndAnaloguesDesign.py b/SyntOn_BulkFragmentationEnumerationAndAnaloguesDesign.py index 84e7238..4c45439 100644 --- a/SyntOn_BulkFragmentationEnumerationAndAnaloguesDesign.py +++ b/SyntOn_BulkFragmentationEnumerationAndAnaloguesDesign.py @@ -1,55 +1,92 @@ +import datetime +import os +import random +import re +import resource +import sys +import time import xml.etree.ElementTree as ET +from collections import Counter +from concurrent.futures import ProcessPoolExecutor +from functools import partial +from multiprocessing import Process, Queue + from rdkit import Chem, DataStructs -from rdkit.Chem import rdChemReactions as Reactions from rdkit.Chem import AddHs, AllChem -from rdkit.Chem.MolStandardize import rdMolStandardize -from rdkit.Chem.rdmolops import * -from rdkit.Chem.rdMolDescriptors import CalcNumRings +from rdkit.Chem import rdChemReactions as Reactions +from rdkit.Chem.Crippen import MolLogP from rdkit.Chem.Descriptors import * +from rdkit.Chem.MolStandardize import rdMolStandardize from rdkit.Chem.rdMolDescriptors import * -from rdkit.Chem.Crippen import MolLogP -import datetime, os, time, random, re, resource, sys -from multiprocessing import Process, Queue -from concurrent.futures import ProcessPoolExecutor -from functools import partial -from collections import Counter +from rdkit.Chem.rdMolDescriptors import CalcNumRings +from rdkit.Chem.rdmolops import * + srcPath = os.path.split(os.path.realpath(__file__))[0] sys.path.insert(1, srcPath) -from src.UsefulFunctions import * from src.SyntOn import * +from src.UsefulFunctions import * -def main(inp, SynthLibrary, outDir, simTh, strictAvailabilityMode, nCores=-1, analoguesLibGen=False, - Ro2Filtration=False, fragmentationMode="use_all", reactionsToWorkWith = "R1-R13", MaxNumberOfStages=5, - maxNumberOfReactionCentersPerFragment=3, desiredNumberOfNewMols = 1000, enumerationMode=False, MWupperTh=1000, - MWlowerTh=100): +def main( + inp, + SynthLibrary, + outDir, + simTh, + strictAvailabilityMode, + nCores=-1, + analoguesLibGen=False, + Ro2Filtration=False, + fragmentationMode="use_all", + reactionsToWorkWith="R1-R13", + MaxNumberOfStages=5, + maxNumberOfReactionCentersPerFragment=3, + desiredNumberOfNewMols=1000, + enumerationMode=False, + MWupperTh=1000, + MWlowerTh=100, +): SmilesToIgnore = ["*C(C)C", "*C(=O)C", "*C=O", "*[V]C=O", "*[V]C(C)C", "*[V]C(=O)C"] if simTh == -1: simBBselection = False else: simBBselection = True - SyntOnfragmentor = fragmentation(fragmentationMode=fragmentationMode, reactionsToWorkWith=reactionsToWorkWith, - maxNumberOfReactionCentersPerFragment=maxNumberOfReactionCentersPerFragment, - MaxNumberOfStages = MaxNumberOfStages, FragmentsToIgnore=SmilesToIgnore, - SynthLibrary=SynthLibrary, FindAnaloguesOfMissingSynthons=simBBselection, - Ro2SynthonsFiltration=Ro2Filtration) - if analoguesLibGen and nCores==-1: + SyntOnfragmentor = fragmentation( + fragmentationMode=fragmentationMode, + reactionsToWorkWith=reactionsToWorkWith, + maxNumberOfReactionCentersPerFragment=maxNumberOfReactionCentersPerFragment, + MaxNumberOfStages=MaxNumberOfStages, + FragmentsToIgnore=SmilesToIgnore, + SynthLibrary=SynthLibrary, + FindAnaloguesOfMissingSynthons=simBBselection, + Ro2SynthonsFiltration=Ro2Filtration, + ) + if analoguesLibGen and nCores == -1: for molNumb, line in enumerate(open(inp)): if line.strip(): Smiles = line.strip().split()[0] - analoguesLibraryGeneration((Smiles, molNumb+1), SyntOnfragmentor, outDir, simTh = simTh, - strictAvailabilityMode=strictAvailabilityMode, - desiredNumberOfNewMols=desiredNumberOfNewMols) + analoguesLibraryGeneration( + (Smiles, molNumb + 1), + SyntOnfragmentor, + outDir, + simTh=simTh, + strictAvailabilityMode=strictAvailabilityMode, + desiredNumberOfNewMols=desiredNumberOfNewMols, + ) elif analoguesLibGen: Smiles_molNumb_List = [] for molNumb, line in enumerate(open(inp)): if line.strip(): Smiles = line.strip().split()[0] - Smiles_molNumb_List.append((Smiles, molNumb+1)) - fixed_analogsGenerationFunction = partial(analoguesLibraryGeneration, outDir=outDir, simTh=simTh, - SyntOnfragmentor=SyntOnfragmentor, strictAvailabilityMode=strictAvailabilityMode, - desiredNumberOfNewMols=desiredNumberOfNewMols) + Smiles_molNumb_List.append((Smiles, molNumb + 1)) + fixed_analogsGenerationFunction = partial( + analoguesLibraryGeneration, + outDir=outDir, + simTh=simTh, + SyntOnfragmentor=SyntOnfragmentor, + strictAvailabilityMode=strictAvailabilityMode, + desiredNumberOfNewMols=desiredNumberOfNewMols, + ) nCores = args.nCores finalLog = [] with ProcessPoolExecutor(max_workers=nCores) as executor: @@ -64,12 +101,23 @@ def main(inp, SynthLibrary, outDir, simTh, strictAvailabilityMode, nCores=-1, an Smiles = line.strip().split()[0] allReagentSets, allSynthons = fragmentMolecule(Smiles, SyntOnfragmentor, simTh) - if allReagentSets and len(allReagentSets)>1: + if allReagentSets and len(allReagentSets) > 1: fsynthonsAfterOneCut = getShortestSyntheticPathways(allReagentSets) - shortestSynthesis = findShortestSynthPathWithAvailableBBlib(fsynthonsAfterOneCut, showAll=False, - firstLaunch = True) - outOnePath.write(line.strip() + " " + ".".join([x.smiles for x in shortestSynthesis[0].participatingSynthon]) + " " +shortestSynthesis[0].name + - " " + str(shortestSynthesis[0].reagentsNumber) + " " + str(shortestSynthesis[0].availabilityRate) + " ") + shortestSynthesis = findShortestSynthPathWithAvailableBBlib( + fsynthonsAfterOneCut, showAll=False, firstLaunch=True + ) + outOnePath.write( + line.strip() + + " " + + ".".join([x.smiles for x in shortestSynthesis[0].participatingSynthon]) + + " " + + shortestSynthesis[0].name + + " " + + str(shortestSynthesis[0].reagentsNumber) + + " " + + str(shortestSynthesis[0].availabilityRate) + + " " + ) outOnePath.flush() availableSynthons = {} notAvaialableSynthons = [] @@ -79,7 +127,12 @@ def main(inp, SynthLibrary, outDir, simTh, strictAvailabilityMode, nCores=-1, an else: notAvaialableSynthons.append(synth.smiles) notAvaialableSynthons = list(set(notAvaialableSynthons)) - outOnePath.write("AvailableSynthons:" + "|".join([synth + "->" + availableSynthons[synth] for synth in availableSynthons])) + outOnePath.write( + "AvailableSynthons:" + + "|".join( + [synth + "->" + availableSynthons[synth] for synth in availableSynthons] + ) + ) outOnePath.flush() if notAvaialableSynthons: outOnePath.write(" NotAvailableSynthons:" + "|".join(notAvaialableSynthons)) @@ -97,9 +150,13 @@ def main(inp, SynthLibrary, outDir, simTh, strictAvailabilityMode, nCores=-1, an notAvaialableSynthons.append(synth.smiles) availableSynthons = list(set(availableSynthons)) notAvaialableSynthons = list(set(notAvaialableSynthons)) - outAllSynthons.write(line.strip() +" AvailableSynthons:" + ".".join(availableSynthons)) + outAllSynthons.write( + line.strip() + " AvailableSynthons:" + ".".join(availableSynthons) + ) if notAvaialableSynthons: - outAllSynthons.write(" notAvaialableSynthons:" + ".".join(notAvaialableSynthons)) + outAllSynthons.write( + " notAvaialableSynthons:" + ".".join(notAvaialableSynthons) + ) outAllSynthons.write("\n") outAllSynthons.flush() shortestSynthesis.clear() @@ -123,82 +180,172 @@ def main(inp, SynthLibrary, outDir, simTh, strictAvailabilityMode, nCores=-1, an sline = line.strip() if sline and "*" not in sline: synthons.append(sline.split()[0]) - enumerator = enumeration(outDir=outDir, Synthons=list(set(synthons)), - reactionSMARTS=reactionForReconstruction, maxNumberOfReactedSynthons=MaxNumberOfStages+1, - MWupperTh=MWupperTh, MWlowerTh = MWlowerTh, - desiredNumberOfNewMols = desiredNumberOfNewMols, nCores = nCores) + enumerator = enumeration( + outDir=outDir, + Synthons=list(set(synthons)), + reactionSMARTS=reactionForReconstruction, + maxNumberOfReactedSynthons=MaxNumberOfStages + 1, + MWupperTh=MWupperTh, + MWlowerTh=MWlowerTh, + desiredNumberOfNewMols=desiredNumberOfNewMols, + nCores=nCores, + ) reconstructedMols = enumerator.getReconstructedMols(allowedToRunSubprocesses=True) -if __name__ == '__main__': + + +if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description="Compound fragmentaitiona and analogues generation.", - epilog="_________________________________________________________________________________________________________________________" - "\n\nCode implementation: Yuliana Zabolotna, Alexandre Varnek\n" - " Laboratoire de Chémoinformatique, Université de Strasbourg.\n\n" - "Knowledge base (SMARTS library): Dmitriy M.Volochnyuk, Sergey V.Ryabukhin, Kostiantyn Gavrylenko, Olexandre Oksiuta\n" - " Institute of Organic Chemistry, National Academy of Sciences of Ukraine\n" - " Kyiv National Taras Shevchenko University\n" - "2021 Strasbourg, Kiev", - prog="SyntOn_BulkFragmentationEnumerationAndAnaloguesDesign", formatter_class=argparse.RawTextHelpFormatter) + + parser = argparse.ArgumentParser( + description="Compound fragmentaitiona and analogues generation.", + epilog="_________________________________________________________________________________________________________________________" + "\n\nCode implementation: Yuliana Zabolotna, Alexandre Varnek\n" + " Laboratoire de Chémoinformatique, Université de Strasbourg.\n\n" + "Knowledge base (SMARTS library): Dmitriy M.Volochnyuk, Sergey V.Ryabukhin, Kostiantyn Gavrylenko, Olexandre Oksiuta\n" + " Institute of Organic Chemistry, National Academy of Sciences of Ukraine\n" + " Kyiv National Taras Shevchenko University\n" + "2021 Strasbourg, Kiev", + prog="SyntOn_BulkFragmentationEnumerationAndAnaloguesDesign", + formatter_class=argparse.RawTextHelpFormatter, + ) parser.add_argument("-i", "--input", type=str, help="input file") parser.add_argument("-oD", "--outDir", type=str, help="Output directory to write analogues.") - parser.add_argument("--SynthLibrary", type=str, default=None, help="Library of available synthons. Generated from avaialable BBs using SyntOn_BBsBulkClassificationAndSynthonization.py") - parser.add_argument("--nCores", default=-1, type=int, help="Number of CPUs available for parallelization.") - parser.add_argument("--simTh", default=-1, type=float, help="Similarity threshold for BB analogues search. " - "If not specified, only positional variational approach will be used for BBs search") - parser.add_argument("--analoguesLibGen", action="store_true", help="Generate library of analogues from input mol") - parser.add_argument("--strictAvailabilityMode", action="store_true", help="Only fully synthesizable analogues are generated. " - "Alternatively, unavailable synthons resulted from compound fragmentation" - " will still be used for its analogues generation.") - parser.add_argument("--Ro2Filtration", action="store_true", help="Filter input synthons library by Ro2 (MW <= 200, logP <= 2, H-bond donors count <= 2 and H-bond acceptors count <= 4)") - parser.add_argument("--fragmentationMode", default="use_all", type=str, help="Mode of fragmentation (defines how the reaction list is specified)" - "\nPossible options: use_all, include_only, exclude_some, one_by_one" - "\n(default: use_all)") - parser.add_argument("--reactionsToWorkWith", default="R1-R13", type=str, help="List of RiDs to be used." - "\n(default: R1-R13 (all reactions) ") - parser.add_argument("--desiredNumberOfNewMols", default=1000, type=int, - help="Desired number of new compounds to be generated (in case of anaogues generation - number of analogues per compound)." - "\n(default: 1000)") + parser.add_argument( + "--SynthLibrary", + type=str, + default=None, + help="Library of available synthons. Generated from avaialable BBs using SyntOn_BBsBulkClassificationAndSynthonization.py", + ) + parser.add_argument( + "--nCores", + default=-1, + type=int, + help="Number of CPUs available for parallelization.", + ) + parser.add_argument( + "--simTh", + default=-1, + type=float, + help="Similarity threshold for BB analogues search. " + "If not specified, only positional variational approach will be used for BBs search", + ) + parser.add_argument( + "--analoguesLibGen", + action="store_true", + help="Generate library of analogues from input mol", + ) + parser.add_argument( + "--strictAvailabilityMode", + action="store_true", + help="Only fully synthesizable analogues are generated. " + "Alternatively, unavailable synthons resulted from compound fragmentation" + " will still be used for its analogues generation.", + ) + parser.add_argument( + "--Ro2Filtration", + action="store_true", + help="Filter input synthons library by Ro2 (MW <= 200, logP <= 2, H-bond donors count <= 2 and H-bond acceptors count <= 4)", + ) + parser.add_argument( + "--fragmentationMode", + default="use_all", + type=str, + help="Mode of fragmentation (defines how the reaction list is specified)" + "\nPossible options: use_all, include_only, exclude_some, one_by_one" + "\n(default: use_all)", + ) + parser.add_argument( + "--reactionsToWorkWith", + default="R1-R13", + type=str, + help="List of RiDs to be used." "\n(default: R1-R13 (all reactions) ", + ) + parser.add_argument( + "--desiredNumberOfNewMols", + default=1000, + type=int, + help="Desired number of new compounds to be generated (in case of anaogues generation - number of analogues per compound)." + "\n(default: 1000)", + ) - parser.add_argument("--MaxNumberOfStages", default=5, type=int, help="Maximal number of stages during fragmentation or enumeration." - "\n(default: 5)") - parser.add_argument("--maxNumberOfReactionCentersPerFragment", default=3, type=int, help="Maximal number of reaction centers per fragment." - "\n(default: 3)") - parser.add_argument("--enumerationMode", action="store_true", help="Enumerate library using input synthons") - parser.add_argument("--MWupperTh", default=1000, type=int, - help="Maximum molecular weight allowed for generated compounds." - "\n(default: 1000)") - parser.add_argument("--MWlowerTh", default=100, type=int, - help="Minimum molecular weight allowed for generated compounds." - "\n(default: 100)") + parser.add_argument( + "--MaxNumberOfStages", + default=5, + type=int, + help="Maximal number of stages during fragmentation or enumeration." "\n(default: 5)", + ) + parser.add_argument( + "--maxNumberOfReactionCentersPerFragment", + default=3, + type=int, + help="Maximal number of reaction centers per fragment." "\n(default: 3)", + ) + parser.add_argument( + "--enumerationMode", + action="store_true", + help="Enumerate library using input synthons", + ) + parser.add_argument( + "--MWupperTh", + default=1000, + type=int, + help="Maximum molecular weight allowed for generated compounds." "\n(default: 1000)", + ) + parser.add_argument( + "--MWlowerTh", + default=100, + type=int, + help="Minimum molecular weight allowed for generated compounds." "\n(default: 100)", + ) args = parser.parse_args() if args.nCores == -1 or args.analoguesLibGen or args.enumerationMode: - main(args.input, args.SynthLibrary, args.outDir, args.simTh, args.strictAvailabilityMode, args.nCores, - args.analoguesLibGen, args.Ro2Filtration, fragmentationMode=args.fragmentationMode, - reactionsToWorkWith = args.reactionsToWorkWith, - MaxNumberOfStages=args.MaxNumberOfStages, - maxNumberOfReactionCentersPerFragment=args.maxNumberOfReactionCentersPerFragment, - desiredNumberOfNewMols=args.desiredNumberOfNewMols,enumerationMode=args.enumerationMode, - MWupperTh=args.MWupperTh, MWlowerTh=args.MWlowerTh) + main( + args.input, + args.SynthLibrary, + args.outDir, + args.simTh, + args.strictAvailabilityMode, + args.nCores, + args.analoguesLibGen, + args.Ro2Filtration, + fragmentationMode=args.fragmentationMode, + reactionsToWorkWith=args.reactionsToWorkWith, + MaxNumberOfStages=args.MaxNumberOfStages, + maxNumberOfReactionCentersPerFragment=args.maxNumberOfReactionCentersPerFragment, + desiredNumberOfNewMols=args.desiredNumberOfNewMols, + enumerationMode=args.enumerationMode, + MWupperTh=args.MWupperTh, + MWlowerTh=args.MWlowerTh, + ) else: wc = countLines(args.input) - if wc= simTh : + if ( + self.marks == synthLib[synth]["marks"] + and synth != self.smiles + and refMarksVallences == synthLib[synth]["marksVallences"] + ): + if ( + simTh != -1 + and DataStructs.TanimotoSimilarity(queryFP, synthLib[synth]["fp_b"]) >= simTh + ): self.bbAnalogues[synth] = synthLib[synth]["BBs"] else: if self.AtomNumbers == 0: self.AtomNumbers = refMol.GetNumAtoms() - if synthLib[synth]["n_atoms"] <= self.AtomNumbers + 1 and synthLib[synth]["n_atoms"] >= self.AtomNumbers - 1: - refList = [i for i in self.smiles if i.isalpha() and i!="H"] - qList = [i for i in synth if i.isalpha() and i!="H"] + if ( + synthLib[synth]["n_atoms"] <= self.AtomNumbers + 1 + and synthLib[synth]["n_atoms"] >= self.AtomNumbers - 1 + ): + refList = [i for i in self.smiles if i.isalpha() and i != "H"] + qList = [i for i in synth if i.isalpha() and i != "H"] qList_refList = list((Counter(qList) - Counter(refList)).elements()) refList_qList = list((Counter(refList) - Counter(qList)).elements()) - if synthLib[synth]["n_atoms"] == self.AtomNumbers and sorted(refList_qList + qList_refList) == ['c', 'n']: + if synthLib[synth]["n_atoms"] == self.AtomNumbers and sorted( + refList_qList + qList_refList + ) == ["c", "n"]: refMolRings = CalcNumRings(refMol) if synthLib[synth]["n_rings"] == refMolRings: self.bbAnalogues[synth] = synthLib[synth]["BBs"] @@ -68,7 +115,11 @@ def searchForSynthonAnalogues(self, synthLib: dict, simTh=-1): refMolRings = CalcNumRings(refMol) if synthLib[synth]["n_rings"] == refMolRings: self.bbAnalogues[synth] = synthLib[synth]["BBs"] - elif qList_refList and len(qList_refList) == 1 and qList_refList[0] in posAnaloguesScreeningAtomsAllowed: + elif ( + qList_refList + and len(qList_refList) == 1 + and qList_refList[0] in posAnaloguesScreeningAtomsAllowed + ): refMolRings = CalcNumRings(refMol) if synthLib[synth]["n_rings"] == refMolRings: analogMol = Chem.MolFromSmiles(synth) @@ -77,7 +128,11 @@ def searchForSynthonAnalogues(self, synthLib: dict, simTh=-1): analogMol.GetRingInfo().AtomRings() if analogMol.HasSubstructMatch(refMol): self.bbAnalogues[synth] = synthLib[synth]["BBs"] - elif refList_qList and len(refList_qList) == 0 and refList_qList[0] in posAnaloguesScreeningAtomsAllowed: + elif ( + refList_qList + and len(refList_qList) == 0 + and refList_qList[0] in posAnaloguesScreeningAtomsAllowed + ): refMolRings = CalcNumRings(refMol) if synthLib[synth]["n_rings"] == refMolRings: analogMol = Chem.MolFromSmiles(synth) @@ -89,15 +144,22 @@ def searchForSynthonAnalogues(self, synthLib: dict, simTh=-1): def printSynthonInfo(self): print("\n__________________________________________________________") - print(" Synthon Information" ) + print(" Synthon Information") print("__________________________________________________________\n") print("Synthon: " + self.smiles) if self.SynthLibProvided: if self.correspondingBB: print("Available. Corresponding BBs: " + self.correspondingBB) elif self.bbAnalogues: - print("Synthon was not found in provided library of building blocks. " + str(len(self.bbAnalogues)) + " analog(s) has/have been found") - print("BB analogues:\n" + "\n".join([x + " " + self.bbAnalogues[x] for x in self.bbAnalogues])) + print( + "Synthon was not found in provided library of building blocks. " + + str(len(self.bbAnalogues)) + + " analog(s) has/have been found" + ) + print( + "BB analogues:\n" + + "\n".join([x + " " + self.bbAnalogues[x] for x in self.bbAnalogues]) + ) else: print("Synthon was not found in provided library of building blocks") if self.directParents: @@ -105,7 +167,9 @@ def printSynthonInfo(self): else: print("Parent synthons: - ") if self.directChildren: - print("Children synthons: " + ".".join(list(set([x.smiles for x in self.directChildren])))) + print( + "Children synthons: " + ".".join(list(set([x.smiles for x in self.directChildren]))) + ) else: print("Children synthons: - ") @@ -127,10 +191,19 @@ def Ro2Filtration(self): else: return True + class syntheticPathway: - def __init__(self, name , synthPathwayReactions, reagentsNumber, cutLevel, directParentsSynthPathways=None, SynthLibProvided=False): + def __init__( + self, + name, + synthPathwayReactions, + reagentsNumber, + cutLevel, + directParentsSynthPathways=None, + SynthLibProvided=False, + ): self.name = name - self.participatingSynthon = [] + self.participatingSynthon = [] self.directChildrenSynthPathways = [] self.directParentsSynthPathways = [] if directParentsSynthPathways: @@ -147,7 +220,7 @@ def checkAvailability(self, SynthLib: dict, simTh=-1, FindAnaloguesOfMissingSynt availableAtomCounts = 0 allAtoms = 0 for synth in self.participatingSynthon: - if synth.AtomNumbers==0: + if synth.AtomNumbers == 0: synth.AtomNumbers = Chem.MolFromSmiles(synth.smiles).GetNumAtoms() allAtoms += synth.AtomNumbers if synth.smiles in SynthLib: @@ -158,80 +231,141 @@ def checkAvailability(self, SynthLib: dict, simTh=-1, FindAnaloguesOfMissingSynt self.availabilityRate = round(availableAtomCounts / allAtoms, 2) if self.directChildrenSynthPathways: for comb in self.directChildrenSynthPathways: - if comb.availabilityRate==0: + if comb.availabilityRate == 0: comb.checkAvailability(SynthLib, simTh, FindAnaloguesOfMissingSynthons) def printShortReagentSetInfo(self): - if self.name=="zeroCombin": + if self.name == "zeroCombin": return if self.SynthLibProvided: - print(self.name + " " + ".".join([x.smiles for x in self.participatingSynthon]) + " " + "Availability rate (% of atoms of fragmented molecule coming from available synthons): " + str(self.availabilityRate)) + print( + self.name + + " " + + ".".join([x.smiles for x in self.participatingSynthon]) + + " " + + "Availability rate (% of atoms of fragmented molecule coming from available synthons): " + + str(self.availabilityRate) + ) else: print(self.name + " " + ".".join([x.smiles for x in self.participatingSynthon])) def printDetailedReagentsSetInfo(self): - if self.name!="zeroCombin": + if self.name != "zeroCombin": print("\n**********************************************************") - print("Reagent set Information " + self.name ) + print("Reagent set Information " + self.name) print("**********************************************************") print("Reactions: " + "->".join(self.synthPathwayReactions)) print("Required Synthons: " + ".".join([x.smiles for x in self.participatingSynthon])) print("Number of reagents: " + str(self.reagentsNumber)) print("Number of stages: " + str(self.cutLevel)) if self.SynthLibProvided: - print("Availability rate (% of atoms of fragmented molecule coming from available synthons): " + str(self.availabilityRate)) + print( + "Availability rate (% of atoms of fragmented molecule coming from available synthons): " + + str(self.availabilityRate) + ) if self.directChildrenSynthPathways: print("Children reagent sets: ") for comb in self.directChildrenSynthPathways: - print("Reactions: " + comb.name + " " + "->".join(comb.synthPathwayReactions) + " ||| " + "Participating synthons: " + ".".join([x.smiles for x in comb.participatingSynthon])) + print( + "Reactions: " + + comb.name + + " " + + "->".join(comb.synthPathwayReactions) + + " ||| " + + "Participating synthons: " + + ".".join([x.smiles for x in comb.participatingSynthon]) + ) if self.directParentsSynthPathways: print("Parent reagent sets: ") for comb in self.directParentsSynthPathways: - print("Reactions: " + comb.name + " " + "->".join(comb.synthPathwayReactions) + " ||| " + "Participating synthons: " + ".".join([x.smiles for x in comb.participatingSynthon])) + print( + "Reactions: " + + comb.name + + " " + + "->".join(comb.synthPathwayReactions) + + " ||| " + + "Participating synthons: " + + ".".join([x.smiles for x in comb.participatingSynthon]) + ) for synth in self.participatingSynthon: synth.printSynthonInfo() - def getSynthonsForAnaloguesGeneration(self, SynthLib, simTh, strictAvailabilityMode = True): + def getSynthonsForAnaloguesGeneration(self, SynthLib, simTh, strictAvailabilityMode=True): synthonsDict = {} SynthonsForAnaloguesSynthesis = set() if not strictAvailabilityMode: for ind, synth in enumerate(self.participatingSynthon): if "Reagent_" + str(ind + 1) not in synthonsDict: - synthonsDict["Reagent_" + str(ind + 1)] = { "synthons": [], "bivalentN": synth.bivalentN } + synthonsDict["Reagent_" + str(ind + 1)] = { + "synthons": [], + "bivalentN": synth.bivalentN, + } synthonsDict["Reagent_" + str(ind + 1)]["synthons"].append(synth.smiles) if not synth.correspondingBB: SynthonsForAnaloguesSynthesis.add(synth.smiles + " missingBB originalBB\n") else: - SynthonsForAnaloguesSynthesis.add(synth.smiles + " " + synth.correspondingBB + " originalBB\n") + SynthonsForAnaloguesSynthesis.add( + synth.smiles + " " + synth.correspondingBB + " originalBB\n" + ) if not synth.bbAnalogues and synth.correspondingBB: synth.searchForSynthonAnalogues(SynthLib, simTh) if synth.bbAnalogues: for analog in synth.bbAnalogues: - SynthonsForAnaloguesSynthesis.add(analog + " " + synth.bbAnalogues[analog] + " " + synth.smiles + " analog\n") + SynthonsForAnaloguesSynthesis.add( + analog + + " " + + synth.bbAnalogues[analog] + + " " + + synth.smiles + + " analog\n" + ) synthonsDict["Reagent_" + str(ind + 1)]["synthons"].append(analog) return synthonsDict, SynthonsForAnaloguesSynthesis else: for ind, synth in enumerate(self.participatingSynthon): if "Reagent_" + str(ind + 1) not in synthonsDict: - synthonsDict["Reagent_" + str(ind + 1)] = { "synthons": [], "bivalentN": synth.bivalentN } + synthonsDict["Reagent_" + str(ind + 1)] = { + "synthons": [], + "bivalentN": synth.bivalentN, + } if not synth.bbAnalogues and synth.correspondingBB: synth.searchForSynthonAnalogues(SynthLib, simTh) if synth.correspondingBB: - SynthonsForAnaloguesSynthesis.add(synth.smiles + " " + synth.correspondingBB + " originalBB\n") + SynthonsForAnaloguesSynthesis.add( + synth.smiles + " " + synth.correspondingBB + " originalBB\n" + ) synthonsDict["Reagent_" + str(ind + 1)]["synthons"].append(synth.smiles) if synth.bbAnalogues: for analog in synth.bbAnalogues: - SynthonsForAnaloguesSynthesis.add(analog + " " + synth.bbAnalogues[analog] + " " + synth.smiles + " analog\n") + SynthonsForAnaloguesSynthesis.add( + analog + + " " + + synth.bbAnalogues[analog] + + " " + + synth.smiles + + " analog\n" + ) synthonsDict["Reagent_" + str(ind + 1)]["synthons"].append(analog) if len(synthonsDict["Reagent_" + str(ind + 1)]["synthons"]) == 0: return None, None return synthonsDict, SynthonsForAnaloguesSynthesis + class enumeration: - def __init__(self, outDir, Synthons = None, reactionSMARTS = None, maxNumberOfReactedSynthons=6, MWupperTh=None, MWlowerTh=None, - desiredNumberOfNewMols = 1000, nCores=1, analoguesEnumeration=False): - if analoguesEnumeration and Synthons!=None: + def __init__( + self, + outDir, + Synthons=None, + reactionSMARTS=None, + maxNumberOfReactedSynthons=6, + MWupperTh=None, + MWlowerTh=None, + desiredNumberOfNewMols=1000, + nCores=1, + analoguesEnumeration=False, + ): + if analoguesEnumeration and Synthons != None: self.__Synthons = None self.__monoFuncBB = None self.__poliFuncBB = None @@ -243,11 +377,11 @@ def __init__(self, outDir, Synthons = None, reactionSMARTS = None, maxNumberOfRe self.__poliFuncBB = [] self.__biFuncBB = [] self.__maxNumberOfReactedSynthons = maxNumberOfReactedSynthons - self.__MWfiltration=False + self.__MWfiltration = False if MWupperTh: self.__MWupperTh = MWupperTh self.__MWfiltration = True - if MWlowerTh: + if MWlowerTh: self.__MWlowerTh = MWlowerTh self.__MWfiltration = True self.__reactions = [] @@ -258,31 +392,54 @@ def __init__(self, outDir, Synthons = None, reactionSMARTS = None, maxNumberOfRe self.__genNonUniqMols = 0 self.results = set() self.allReconstructedMols = [] - self.__marksCombinations = {'C:10': ['N:20', 'O:20', 'C:20', 'c:20', 'n:20', 'S:20'], - 'c:10': ['N:20', 'O:20', 'C:20', 'c:20', 'n:20', 'S:20'], - 'c:20': ['N:11', 'C:10', 'c:10'], 'C:20': ['C:10', 'c:10'], - 'c:21': ['N:20', 'O:20', 'n:20'], 'C:21': ['N:20', 'n:20'], - 'N:20': ['C:10', 'c:10', 'C:21', 'c:21', 'S:10'], 'N:11': ['c:20'], - 'n:20': ['C:10', 'c:10', 'C:21', 'c:21'], 'O:20': ['C:10', 'c:10', 'c:21'], - 'S:20': ['C:10', 'c:10'], 'S:10': ['N:20'], 'C:30': ['C:40', 'N:40'], - 'C:40': ['C:30'], 'C:50': ['C:50'], 'C:70': ['C:60', 'c:60'], - 'c:60':['C:70'], 'C:60': ['C:70'], 'N:40': ['C:30'] } - + self.__marksCombinations = { + "C:10": ["N:20", "O:20", "C:20", "c:20", "n:20", "S:20"], + "c:10": ["N:20", "O:20", "C:20", "c:20", "n:20", "S:20"], + "c:20": ["N:11", "C:10", "c:10"], + "C:20": ["C:10", "c:10"], + "c:21": ["N:20", "O:20", "n:20"], + "C:21": ["N:20", "n:20"], + "N:20": ["C:10", "c:10", "C:21", "c:21", "S:10"], + "N:11": ["c:20"], + "n:20": ["C:10", "c:10", "C:21", "c:21"], + "O:20": ["C:10", "c:10", "c:21"], + "S:20": ["C:10", "c:10"], + "S:10": ["N:20"], + "C:30": ["C:40", "N:40"], + "C:40": ["C:30"], + "C:50": ["C:50"], + "C:70": ["C:60", "c:60"], + "c:60": ["C:70"], + "C:60": ["C:70"], + "N:40": ["C:30"], + } - def getReconstructedMols(self, allowedToRunSubprocesses=False, randomSeed=None, seed = (0,0), mainRun = True): + def getReconstructedMols( + self, allowedToRunSubprocesses=False, randomSeed=None, seed=(0, 0), mainRun=True + ): pat = re.compile("\[\w*:\w*\]") - if randomSeed==None: + if randomSeed == None: seed, randomSeed = self.__getRandomSeed(seed) if randomSeed: allowedMarks = [] - for key in [Chem.MolToSmiles(randomSeed, canonical=True)[m.start() + 1] + ":" + Chem.MolToSmiles(randomSeed, canonical=True)[m.end() - 3:m.end() - 1] - for m in re.finditer(pat, Chem.MolToSmiles(randomSeed, canonical=True))]: + for key in [ + Chem.MolToSmiles(randomSeed, canonical=True)[m.start() + 1] + + ":" + + Chem.MolToSmiles(randomSeed, canonical=True)[m.end() - 3 : m.end() - 1] + for m in re.finditer(pat, Chem.MolToSmiles(randomSeed, canonical=True)) + ]: allowedMarks.extend(self.__marksCombinations[key]) allowedMarks = set(allowedMarks) Pool = [] for partner in self.__poliFuncBB + self.__biFuncBB + self.__monoFuncBB: - partnerMarks = set([Chem.MolToSmiles(partner, canonical=True)[m.start() + 1] + ":" + Chem.MolToSmiles(partner, canonical=True)[m.end() - 3:m.end() - 1] - for m in re.finditer(pat, Chem.MolToSmiles(partner, canonical=True))]) + partnerMarks = set( + [ + Chem.MolToSmiles(partner, canonical=True)[m.start() + 1] + + ":" + + Chem.MolToSmiles(partner, canonical=True)[m.end() - 3 : m.end() - 1] + for m in re.finditer(pat, Chem.MolToSmiles(partner, canonical=True)) + ] + ) if allowedMarks.intersection(partnerMarks): if allowedToRunSubprocesses: Pool, nAlive = self.__countAndMergeActiveThreads(Pool) @@ -290,14 +447,30 @@ def getReconstructedMols(self, allowedToRunSubprocesses=False, randomSeed=None, time.sleep(30) Pool, nAlive = self.__countAndMergeActiveThreads(Pool) queue = Queue() - process = Process(target=self.__molReconsrtuction, args=(randomSeed, partner, 1, None, queue,)) + process = Process( + target=self.__molReconsrtuction, + args=( + randomSeed, + partner, + 1, + None, + queue, + ), + ) process.start() - Pool.append([process, queue, False]) # [the process, queue, was it already joined] + Pool.append( + [process, queue, False] + ) # [the process, queue, was it already joined] if self.__genNonUniqMols >= self.__desiredNumberOfNewMols: break else: - self.results.update(self.__molReconsrtuction(randomSeed, partner, 1, queue=None)) - print("Number of so far reconstructed unique molecules = " + str(len(self.results))) + self.results.update( + self.__molReconsrtuction(randomSeed, partner, 1, queue=None) + ) + print( + "Number of so far reconstructed unique molecules = " + + str(len(self.results)) + ) if len(self.results) >= self.__desiredNumberOfNewMols: break if allowedToRunSubprocesses: @@ -308,20 +481,30 @@ def getReconstructedMols(self, allowedToRunSubprocesses=False, randomSeed=None, whileCount = 0 while mainRun: - if not allowedToRunSubprocesses and len(self.results) >= self.__desiredNumberOfNewMols : + if not allowedToRunSubprocesses and len(self.results) >= self.__desiredNumberOfNewMols: break - elif allowedToRunSubprocesses and self.__genNonUniqMols >= self.__desiredNumberOfNewMols : + elif ( + allowedToRunSubprocesses and self.__genNonUniqMols >= self.__desiredNumberOfNewMols + ): break whileCount += 1 seed = list(seed) - seed[0] +=1 + seed[0] += 1 seed = tuple(seed) seed, randomSeed = self.__getRandomSeed(seed) if randomSeed is None: break - self.results.update(self.getReconstructedMols(allowedToRunSubprocesses, randomSeed, seed, mainRun = False)) + self.results.update( + self.getReconstructedMols(allowedToRunSubprocesses, randomSeed, seed, mainRun=False) + ) d_names, f_names, main_dir = listDir(self.__outDir) - with open(os.path.join(self.__outDir, "FinalOut_allEnumeratedCompounds_DuplicatesCanBePresent.smi"), "ab") as out: + with open( + os.path.join( + self.__outDir, + "FinalOut_allEnumeratedCompounds_DuplicatesCanBePresent.smi", + ), + "ab", + ) as out: for file in f_names: if "temp_" in file: with open(os.path.join(self.__outDir, file), "rb") as f: @@ -336,27 +519,48 @@ def AnaloguesGeneration(self): reaction = self.__reactions[rid] for ind1 in range(1, len(self.__Synthons) + 1): for ind2 in range(ind1 + 1, len(self.__Synthons) + 1): - SynthonsetPartner = 'Reagent_' + str(ind2) - Synthonset = 'Reagent_' + str(ind1) - if self.__checkBB_reactionCombination(self.__Synthons[Synthonset][0], self.__Synthons[SynthonsetPartner][0], reaction): + SynthonsetPartner = "Reagent_" + str(ind2) + Synthonset = "Reagent_" + str(ind1) + if self.__checkBB_reactionCombination( + self.__Synthons[Synthonset][0], + self.__Synthons[SynthonsetPartner][0], + reaction, + ): for firstBB in self.__Synthons[Synthonset]: for secondBB in self.__Synthons[SynthonsetPartner]: SynthonsetsUsed = set() SynthonsetsUsed.add(Synthonset) SynthonsetsUsed.add(SynthonsetPartner) - self.results.update(self.__molAnaloguesLibEnumeration(reagent=firstBB, partner=secondBB, - numberOfBBalreadyReacted=1, SynthonsetsUsed=SynthonsetsUsed, reactionToUse = rid)) + self.results.update( + self.__molAnaloguesLibEnumeration( + reagent=firstBB, + partner=secondBB, + numberOfBBalreadyReacted=1, + SynthonsetsUsed=SynthonsetsUsed, + reactionToUse=rid, + ) + ) if len(self.results) >= self.__desiredNumberOfNewMols: return self.results return self.results else: - print("Separate reconstructur should be evocken for Molecule analogues generation " - "from available BBs (set analoguesEnumeration=True) and for " - "simple molecules enumeration from the list of BBs (set analoguesEnumeration=False).") + print( + "Separate reconstructur should be evocken for Molecule analogues generation " + "from available BBs (set analoguesEnumeration=True) and for " + "simple molecules enumeration from the list of BBs (set analoguesEnumeration=False)." + ) exit() - def __molAnaloguesLibEnumeration(self, reagent, partner, numberOfBBalreadyReacted, - SynthonsetsUsed, reactionToUse, usedReactions = None, firstLaunch=True): + def __molAnaloguesLibEnumeration( + self, + reagent, + partner, + numberOfBBalreadyReacted, + SynthonsetsUsed, + reactionToUse, + usedReactions=None, + firstLaunch=True, + ): if not usedReactions: usedReactions = [] allProducts = set() @@ -378,20 +582,35 @@ def __molAnaloguesLibEnumeration(self, reagent, partner, numberOfBBalreadyReacte if rid not in newUsedReactions: reaction = self.__reactions[rid] for SynthonsetPartner in self.__Synthons: - if SynthonsetPartner not in SynthonsetsUsed and self.__checkBB_reactionCombination(prod, - self.__Synthons[SynthonsetPartner][0], reaction): + if ( + SynthonsetPartner not in SynthonsetsUsed + and self.__checkBB_reactionCombination( + prod, + self.__Synthons[SynthonsetPartner][0], + reaction, + ) + ): for secondBB in self.__Synthons[SynthonsetPartner]: newSynthonsetsUsed = set() for i in SynthonsetsUsed: newSynthonsetsUsed.add(i) newSynthonsetsUsed.add(SynthonsetPartner) - subResults = self.__molAnaloguesLibEnumeration(reagent=prod, - partner=secondBB, numberOfBBalreadyReacted=numberOfBBalreadyReacted +1, - SynthonsetsUsed=newSynthonsetsUsed, reactionToUse=rid, usedReactions = newUsedReactions, - firstLaunch=False) + subResults = self.__molAnaloguesLibEnumeration( + reagent=prod, + partner=secondBB, + numberOfBBalreadyReacted=numberOfBBalreadyReacted + + 1, + SynthonsetsUsed=newSynthonsetsUsed, + reactionToUse=rid, + usedReactions=newUsedReactions, + firstLaunch=False, + ) if subResults: allProducts.update(subResults) - if len(allProducts)>=self.__desiredNumberOfNewMols and firstLaunch: + if ( + len(allProducts) >= self.__desiredNumberOfNewMols + and firstLaunch + ): return list(allProducts) elif not functionality and numberOfBBalreadyReacted + 1 >= len(self.__Synthons): allProducts.add(prodSMILES) @@ -410,12 +629,19 @@ def __checkBB_reactionCombination(self, reagent, partner, reaction): else: return False - def __molReconsrtuction(self, reagent, partner, numberOfBBalreadyReacted, alreadyUsedReactionsInds=None, queue=None): + def __molReconsrtuction( + self, + reagent, + partner, + numberOfBBalreadyReacted, + alreadyUsedReactionsInds=None, + queue=None, + ): if not alreadyUsedReactionsInds: alreadyUsedReactionsInds = set() pat = re.compile("\[\w*:\w*\]") allProducts = [] - for id,reaction in enumerate(self.__reactions): + for id, reaction in enumerate(self.__reactions): if id in alreadyUsedReactionsInds: continue products = reaction.RunReactants((reagent, partner)) @@ -431,46 +657,103 @@ def __molReconsrtuction(self, reagent, partner, numberOfBBalreadyReacted, alrea prod.GetRingInfo().NumRings() if functionality: allowedMarks = [] - for key in [prodSMILES[m.start() + 1] + ":" + prodSMILES[m.end() - 3:m.end() - 1] for m in - re.finditer(pat, prodSMILES)]: + for key in [ + prodSMILES[m.start() + 1] + + ":" + + prodSMILES[m.end() - 3 : m.end() - 1] + for m in re.finditer(pat, prodSMILES) + ]: allowedMarks.extend(self.__marksCombinations[key]) allowedMarks = set(allowedMarks) - if numberOfBBalreadyReacted + 1 <= self.__maxNumberOfReactedSynthons - functionality: - if numberOfBBalreadyReacted + 1 == self.__maxNumberOfReactedSynthons - functionality: + if ( + numberOfBBalreadyReacted + 1 + <= self.__maxNumberOfReactedSynthons - functionality + ): + if ( + numberOfBBalreadyReacted + 1 + == self.__maxNumberOfReactedSynthons - functionality + ): for newPartner in self.__monoFuncBB: - partnerMarks = set([ - Chem.MolToSmiles(newPartner, canonical=True)[m.start() + 1] + ":" + Chem.MolToSmiles(newPartner, canonical=True)[ - m.end() - 3:m.end() - 1] for m in re.finditer(pat, Chem.MolToSmiles(newPartner, canonical=True))]) + partnerMarks = set( + [ + Chem.MolToSmiles(newPartner, canonical=True)[ + m.start() + 1 + ] + + ":" + + Chem.MolToSmiles(newPartner, canonical=True)[ + m.end() - 3 : m.end() - 1 + ] + for m in re.finditer( + pat, + Chem.MolToSmiles(newPartner, canonical=True), + ) + ] + ) if allowedMarks.intersection(partnerMarks): alreadyUsedReactionsInds.add(id) - subResults = self.__molReconsrtuction(prod, newPartner, numberOfBBalreadyReacted+1, - alreadyUsedReactionsInds=alreadyUsedReactionsInds) + subResults = self.__molReconsrtuction( + prod, + newPartner, + numberOfBBalreadyReacted + 1, + alreadyUsedReactionsInds=alreadyUsedReactionsInds, + ) if subResults: allProducts.extend(subResults) - elif functionality>3: + elif functionality > 3: for newPartner in self.__monoFuncBB + self.__biFuncBB: - partnerMarks = set([ - Chem.MolToSmiles(newPartner, canonical=True)[m.start() + 1] + ":" + Chem.MolToSmiles( - newPartner, canonical=True)[m.end() - 3:m.end() - 1] - for m in re.finditer(pat, Chem.MolToSmiles(newPartner, canonical=True))]) + partnerMarks = set( + [ + Chem.MolToSmiles(newPartner, canonical=True)[ + m.start() + 1 + ] + + ":" + + Chem.MolToSmiles(newPartner, canonical=True)[ + m.end() - 3 : m.end() - 1 + ] + for m in re.finditer( + pat, + Chem.MolToSmiles(newPartner, canonical=True), + ) + ] + ) if allowedMarks.intersection(partnerMarks): alreadyUsedReactionsInds.add(id) - subResults = self.__molReconsrtuction(prod, newPartner, numberOfBBalreadyReacted+1, - alreadyUsedReactionsInds=alreadyUsedReactionsInds) + subResults = self.__molReconsrtuction( + prod, + newPartner, + numberOfBBalreadyReacted + 1, + alreadyUsedReactionsInds=alreadyUsedReactionsInds, + ) if subResults: allProducts.extend(subResults) allProducts = list(set(allProducts)) else: - for newPartner in self.__monoFuncBB + self.__biFuncBB + self.__poliFuncBB: - partnerMarks = set([ - Chem.MolToSmiles(newPartner, canonical=True)[m.start() + 1] + ":" + Chem.MolToSmiles( - newPartner, canonical=True)[ - m.end() - 3:m.end() - 1] - for m in re.finditer(pat, Chem.MolToSmiles(newPartner, canonical=True))]) + for newPartner in ( + self.__monoFuncBB + self.__biFuncBB + self.__poliFuncBB + ): + partnerMarks = set( + [ + Chem.MolToSmiles(newPartner, canonical=True)[ + m.start() + 1 + ] + + ":" + + Chem.MolToSmiles(newPartner, canonical=True)[ + m.end() - 3 : m.end() - 1 + ] + for m in re.finditer( + pat, + Chem.MolToSmiles(newPartner, canonical=True), + ) + ] + ) if allowedMarks.intersection(partnerMarks): alreadyUsedReactionsInds.add(id) - subResults = self.__molReconsrtuction(prod, newPartner, numberOfBBalreadyReacted+1, - alreadyUsedReactionsInds=alreadyUsedReactionsInds) + subResults = self.__molReconsrtuction( + prod, + newPartner, + numberOfBBalreadyReacted + 1, + alreadyUsedReactionsInds=alreadyUsedReactionsInds, + ) if subResults: allProducts.extend(subResults) allProducts = list(set(allProducts)) @@ -503,7 +786,9 @@ def __perpSynthonsAndReactions(self, Synthons, reactionSMARTS, analoguesEnumerat self.__Synthons = dict() for subset in Synthons: bbsList = [Chem.MolFromSmiles(x) for x in Synthons[subset]["synthons"]] - self.__Synthons[subset] = self.__PrepMolForReconstruction(bbsList, Synthons[subset]["bivalentN"]) + self.__Synthons[subset] = self.__PrepMolForReconstruction( + bbsList, Synthons[subset]["bivalentN"] + ) else: monoFuncBB = [] biFuncBB = [] @@ -516,17 +801,29 @@ def __perpSynthonsAndReactions(self, Synthons, reactionSMARTS, analoguesEnumerat else: poliFuncBB.append(bb) if monoFuncBB == None: - raise ValueError("No monofunctional building blocks are provided for enumeration") + raise ValueError( + "No monofunctional building blocks are provided for enumeration" + ) else: - self.__monoFuncBB = self.__PrepMolForReconstruction([Chem.MolFromSmiles(x) for x in monoFuncBB]) + self.__monoFuncBB = self.__PrepMolForReconstruction( + [Chem.MolFromSmiles(x) for x in monoFuncBB] + ) random.shuffle(self.__monoFuncBB) if biFuncBB: - self.__biFuncBB = self.__PrepMolForReconstruction([Chem.MolFromSmiles(x) for x in biFuncBB]) + self.__biFuncBB = self.__PrepMolForReconstruction( + [Chem.MolFromSmiles(x) for x in biFuncBB] + ) random.shuffle(self.__biFuncBB) if poliFuncBB: - self.__poliFuncBB = self.__PrepMolForReconstruction([Chem.MolFromSmiles(x) for x in poliFuncBB]) + self.__poliFuncBB = self.__PrepMolForReconstruction( + [Chem.MolFromSmiles(x) for x in poliFuncBB] + ) random.shuffle(self.__poliFuncBB) - self.__Synthons = (self.__poliFuncBB, self.__biFuncBB, self.__monoFuncBB) + self.__Synthons = ( + self.__poliFuncBB, + self.__biFuncBB, + self.__monoFuncBB, + ) if reactionSMARTS == None: raise ValueError("No reactions are provided for enumeration") else: @@ -535,7 +832,7 @@ def __perpSynthonsAndReactions(self, Synthons, reactionSMARTS, analoguesEnumerat reaction = Reactions.ReactionFromSmarts(react) self.__reactions.append(reaction) - def __PrepMolForReconstruction(sefl, molList, bivalentN=False): # list of Mol objects + def __PrepMolForReconstruction(sefl, molList, bivalentN=False): # list of Mol objects newList = [] labels = [10, 20, 30, 40, 50, 60, 70, 21, 11] atomsForMarking = [23, 74, 72, 104, 105, 106, 107, 108, 109] @@ -581,7 +878,7 @@ def __PrepMolForReconstruction(sefl, molList, bivalentN=False): # list of Mol ob break mol = RemoveHs(mol) newList.append(mol) - return newList # list of Mol objects + return newList # list of Mol objects def __getRandomSeed(self, seed): randomSeed = None @@ -608,7 +905,10 @@ def __countAndMergeActiveThreads(self, Pool): # self.results.extend(p[1].get()) # self.results = list(set(self.results)) self.__genNonUniqMols += len(reconstructedMols) - print("Number of so far reconstructed molecules (may contain duplicates) = " + str(self.__genNonUniqMols)) + print( + "Number of so far reconstructed molecules (may contain duplicates) = " + + str(self.__genNonUniqMols) + ) d_names, f_names, main_dir = listDir(self.__outDir) n = 0 for file in f_names: @@ -626,13 +926,30 @@ def __countAndMergeActiveThreads(self, Pool): Pool.pop(i) return Pool, nAlive -class fragmentation: - def __init__(self, fragmentationMode="use_all", reactionsToWorkWith = "R1-R13", maxNumberOfReactionCentersPerFragment = 3, - MaxNumberOfStages = 5, FragmentsToIgnore = None, - setupFile = os.path.join(os.path.split(os.path.split(os.path.realpath(__file__))[0])[0], "config" , "Setup.xml"), - macroCycleSetupFile = os.path.join(os.path.split(os.path.split(os.path.realpath(__file__))[0])[0], "config" , "SetupForMacrocycles.xml"), - FindAnaloguesOfMissingSynthons = False, parsedSynthLib = False, SynthLibrary=None, Ro2SynthonsFiltration = False): +class fragmentation: + def __init__( + self, + fragmentationMode="use_all", + reactionsToWorkWith="R1-R13", + maxNumberOfReactionCentersPerFragment=3, + MaxNumberOfStages=5, + FragmentsToIgnore=None, + setupFile=os.path.join( + os.path.split(os.path.split(os.path.realpath(__file__))[0])[0], + "config", + "Setup.xml", + ), + macroCycleSetupFile=os.path.join( + os.path.split(os.path.split(os.path.realpath(__file__))[0])[0], + "config", + "SetupForMacrocycles.xml", + ), + FindAnaloguesOfMissingSynthons=False, + parsedSynthLib=False, + SynthLibrary=None, + Ro2SynthonsFiltration=False, + ): max_rec = 0x100000 # May segfault without this line. 0x100 is a guess at the size of each stack frame. resource.setrlimit(resource.RLIMIT_STACK, [0x100 * max_rec, resource.RLIM_INFINITY]) @@ -648,19 +965,38 @@ def __init__(self, fragmentationMode="use_all", reactionsToWorkWith = "R1-R13", self.__setupFile = setupFile self.__reactionSetup, self.__reactionsToWorkWith = self.__getSetup(reactionsToWorkWith) self.__macroCycleSetupFile = macroCycleSetupFile - self.__reactionMacroCycleSetup, self.__macroCyclicReactionsToWorkWith = self.__getMacroCycleSetup(reactionsToWorkWith.replace("R", "MR")) - forbiddenMarks = [{'[N:11]', '[c:10]'}, {'[N:11]', '[c:20]'}, {'[N:11]', '[O:20]'},{'[N:11]', '[C:30]'}, - {'[N:11]', '[C:10]'} , {'[N:11]', '[N:11]'}, {'[N:11]', '[c:70]'}, - {'[N:11]', '[C:40]'}, {'[N:11]', '[C:50]'}, {'[N:11]', '[S:10]'}, - {'[c:11]', '[C:20]'}, {'[c:70]', '[c:20]'}, - {'[c:21]', '[C:60]'},{'[c:21]', '[N:11]'}, {'[c:20]', '[c:21]'}, {'[c:70]', '[C:21]'}, {'[c:20]', '[C:21]'}] + ( + self.__reactionMacroCycleSetup, + self.__macroCyclicReactionsToWorkWith, + ) = self.__getMacroCycleSetup(reactionsToWorkWith.replace("R", "MR")) + forbiddenMarks = [ + {"[N:11]", "[c:10]"}, + {"[N:11]", "[c:20]"}, + {"[N:11]", "[O:20]"}, + {"[N:11]", "[C:30]"}, + {"[N:11]", "[C:10]"}, + {"[N:11]", "[N:11]"}, + {"[N:11]", "[c:70]"}, + {"[N:11]", "[C:40]"}, + {"[N:11]", "[C:50]"}, + {"[N:11]", "[S:10]"}, + {"[c:11]", "[C:20]"}, + {"[c:70]", "[c:20]"}, + {"[c:21]", "[C:60]"}, + {"[c:21]", "[N:11]"}, + {"[c:20]", "[c:21]"}, + {"[c:70]", "[C:21]"}, + {"[c:20]", "[C:21]"}, + ] self.forbiddenMarks = set([frozenset(x) for x in forbiddenMarks]) if SynthLibrary: self.FindAnaloguesOfMissingSynthons = FindAnaloguesOfMissingSynthons if parsedSynthLib: self.SynthLib = SynthLibrary else: - print("Processing BB library. It may take a few minutes, depending on the library size") + print( + "Processing BB library. It may take a few minutes, depending on the library size" + ) self.SynthLib = self.__readSynthLib(SynthLibrary, Ro2SynthonsFiltration) else: self.SynthLib = None @@ -681,23 +1017,35 @@ def cutWithHierarchyStorred(self, mol): if allSyntheticPathways[comb].cutLevel == 1: for synth in allSyntheticPathways[comb].participatingSynthon: if synth != allSynthons["InitMol"]: - self.__cutOneSynthonHierarchically(allSynthons[synth.smiles], allSyntheticPathways[comb], - allSynthons, allSyntheticPathways, cutLevel=2) - if self.__fragmentationMode == "one_by_one" and allSyntheticPathways!=startingCombintaions: + self.__cutOneSynthonHierarchically( + allSynthons[synth.smiles], + allSyntheticPathways[comb], + allSynthons, + allSyntheticPathways, + cutLevel=2, + ) + if ( + self.__fragmentationMode == "one_by_one" + and allSyntheticPathways != startingCombintaions + ): break - return allSyntheticPathways,allSynthons + return allSyntheticPathways, allSynthons def getReactionForReconstruction(self, reactionList=None): reactionForReconstruction = [] if reactionList: for key in reactionList: if "R14" not in key: - reactionForReconstruction.append(self.__reactionSetup[key]["ReconstructionSMARTS"]) + reactionForReconstruction.append( + self.__reactionSetup[key]["ReconstructionSMARTS"] + ) else: for key in self.__reactionSetup: if "R14" not in key: - reactionForReconstruction.append(self.__reactionSetup[key]["ReconstructionSMARTS"]) + reactionForReconstruction.append( + self.__reactionSetup[key]["ReconstructionSMARTS"] + ) return reactionForReconstruction def __readSynthLib(self, SynthLib, Ro2Filtration): @@ -714,19 +1062,32 @@ def __readSynthLib(self, SynthLib, Ro2Filtration): LogP = MolLogP(mol) HDC = CalcNumHBD(mol) HAC = CalcNumHBA(mol) - if MolW>200 or LogP > 2 or HDC > 2 or HAC > 4: + if MolW > 200 or LogP > 2 or HDC > 2 or HAC > 4: continue availableSynthons[sline.split()[0]] = {} availableSynthons[sline.split()[0]]["BBs"] = sline.split()[1] if self.FindAnaloguesOfMissingSynthons: - availableSynthons[sline.split()[0]]["fp_b"] = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048) + availableSynthons[sline.split()[0]][ + "fp_b" + ] = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048) availableSynthons[sline.split()[0]]["n_atoms"] = mol.GetNumAtoms() availableSynthons[sline.split()[0]]["n_rings"] = CalcNumRings(mol) availableSynthons[sline.split()[0]]["marks"] = sorted( - [sline.split()[0][m.start():m.start() + 2] + sline.split()[0][m.end() - 4:m.end()] for m in - re.finditer(pat, sline.split()[0])]) - availableSynthons[sline.split()[0]]["marksVallences"] = "+".join(sorted([atom.GetSymbol() + ":" + - str(atom.GetTotalDegree()) for atom in mol.GetAtoms() if atom.GetAtomMapNum() != 0])) + [ + sline.split()[0][m.start() : m.start() + 2] + + sline.split()[0][m.end() - 4 : m.end()] + for m in re.finditer(pat, sline.split()[0]) + ] + ) + availableSynthons[sline.split()[0]]["marksVallences"] = "+".join( + sorted( + [ + atom.GetSymbol() + ":" + str(atom.GetTotalDegree()) + for atom in mol.GetAtoms() + if atom.GetAtomMapNum() != 0 + ] + ) + ) print("Lib BB reading time:") print(datetime.datetime.now() - fragBegTime) return availableSynthons @@ -747,9 +1108,12 @@ def __getSetup(self, reactionsToWorkWith): for block in reactionsToWorkWith.split(";"): reaction_list.append(self.__getReactionList(block, allReactions)) elif self.__fragmentationMode == "exclude_some": - reaction_list = [reaction for reaction in allReactions if - reaction not in self.__getReactionList(reactionsToWorkWith, allReactions)] - #reaction_list -> Ids of reactions that were specified by user to be used for the fragmentation + reaction_list = [ + reaction + for reaction in allReactions + if reaction not in self.__getReactionList(reactionsToWorkWith, allReactions) + ] + # reaction_list -> Ids of reactions that were specified by user to be used for the fragmentation return reactionSetup, reaction_list def __getMacroCycleSetup(self, reactionsToWorkWith): @@ -762,15 +1126,21 @@ def __getMacroCycleSetup(self, reactionsToWorkWith): if self.__fragmentationMode == "use_all": reaction_list = allReactions[:-1] elif self.__fragmentationMode == "include_only" or self.__fragmentationMode == "one_by_one": - reaction_list = self.__getReactionList(reactionsToWorkWith, allReactions, MacroCycles=True) + reaction_list = self.__getReactionList( + reactionsToWorkWith, allReactions, MacroCycles=True + ) elif self.__fragmentationMode == "block by block": reaction_list = [] for block in self.__macroCyclicReactionsToWorkWith.split(";"): reaction_list.append(self.__getReactionList(block, allReactions, MacroCycles=True)) elif self.__fragmentationMode == "exclude_some": - reaction_list = [reaction for reaction in allReactions if - reaction not in self.__getReactionList(reactionsToWorkWith, allReactions, MacroCycles=True)] - #reaction_list -> Ids of reactions that were specified by user to be used for the fragmentation + reaction_list = [ + reaction + for reaction in allReactions + if reaction + not in self.__getReactionList(reactionsToWorkWith, allReactions, MacroCycles=True) + ] + # reaction_list -> Ids of reactions that were specified by user to be used for the fragmentation return reactionSetup, reaction_list def __getReactionSMARTS(self, N_SyntOn_setup: ET.Element): @@ -779,13 +1149,15 @@ def __getReactionSMARTS(self, N_SyntOn_setup: ET.Element): if child.tag == "AvailableReactions": for ch in child: for subCh in ch: - if subCh.get('SMARTS'): + if subCh.get("SMARTS"): reactionSetup[subCh.tag] = {} - reactionSetup[subCh.tag]["Name"] = subCh.get('name') - reactionSetup[subCh.tag]["SMARTS"] = subCh.get('SMARTS') - reactionSetup[subCh.tag]["Labels"] = subCh.get('Labels') - reactionSetup[subCh.tag]["ReconstructionSMARTS"] = subCh.get('ReconstructionReaction') - allReactions = list(reactionSetup.keys()) #allReactions -> list of all available reactions + reactionSetup[subCh.tag]["Name"] = subCh.get("name") + reactionSetup[subCh.tag]["SMARTS"] = subCh.get("SMARTS") + reactionSetup[subCh.tag]["Labels"] = subCh.get("Labels") + reactionSetup[subCh.tag]["ReconstructionSMARTS"] = subCh.get( + "ReconstructionReaction" + ) + allReactions = list(reactionSetup.keys()) # allReactions -> list of all available reactions return allReactions, reactionSetup def __getReactionList(self, reactList: str, allReactions: list, MacroCycles=False) -> list: @@ -821,7 +1193,7 @@ def __getReactionList(self, reactList: str, allReactions: list, MacroCycles=Fals reaction_list.extend(allReactions[li:ui]) return reaction_list - def __firstMolCut(self, mol ): + def __firstMolCut(self, mol): mol.UpdatePropertyCache() Chem.GetSymmSSSR(mol) rings = mol.GetRingInfo().AtomRings() @@ -838,15 +1210,19 @@ def __firstMolCut(self, mol ): reactionSetup = self.__reactionSetup allSynthons = {} allSyntheticPathways = {} - allSyntheticPathways["zeroCombin"] = syntheticPathway(name="zeroCombin", reagentsNumber=0, - synthPathwayReactions=None, cutLevel=0) - allSynthons["InitMol"] = synthon(Chem.MolToSmiles(mol, canonical=True), - syntheticPathway={allSyntheticPathways["zeroCombin"]}, cutLevel=0) + allSyntheticPathways["zeroCombin"] = syntheticPathway( + name="zeroCombin", reagentsNumber=0, synthPathwayReactions=None, cutLevel=0 + ) + allSynthons["InitMol"] = synthon( + Chem.MolToSmiles(mol, canonical=True), + syntheticPathway={allSyntheticPathways["zeroCombin"]}, + cutLevel=0, + ) allSyntheticPathways["zeroCombin"].participatingSynthon.append(allSynthons["InitMol"]) cutLevel = 1 cutCount = 1 - for ind,rId in enumerate(reactionsToWorkWith): - cuttingRule = Reactions.ReactionFromSmarts(reactionSetup[rId]['SMARTS']) + for ind, rId in enumerate(reactionsToWorkWith): + cuttingRule = Reactions.ReactionFromSmarts(reactionSetup[rId]["SMARTS"]) products = cuttingRule.RunReactants((mol,)) if products: setsCount = len(products) @@ -857,7 +1233,10 @@ def __firstMolCut(self, mol ): synthonsProdSet = [] Ignore = False for product in prodSet: - if product.GetNumHeavyAtoms() < 3 or Chem.MolToSmiles(product, canonical=True) in self.__SmilesToIgnore: + if ( + product.GetNumHeavyAtoms() < 3 + or Chem.MolToSmiles(product, canonical=True) in self.__SmilesToIgnore + ): Ignore = True break if Ignore: @@ -865,18 +1244,32 @@ def __firstMolCut(self, mol ): # print(rId) for product in prodSet: - Labels = reactionSetup[rId]['Labels'] + Labels = reactionSetup[rId]["Labels"] for lablesSet in Labels.split("|"): if macroCycle: - labledSynthon =self.__getMacroCycleLabledSmiles(product, lablesSet.split(";"), cutCount=None) + labledSynthon = self.__getMacroCycleLabledSmiles( + product, lablesSet.split(";"), cutCount=None + ) else: - labledSynthon = self.__getLabledSmiles(product, lablesSet.split(";"), cutCount=None) - if not labledSynthon or labledSynthon.count(":") > self.__maxNumberOfReactionCentersPerFragment or ( - CalcNumRings(Chem.MolFromSmiles(labledSynthon)) == 0 and - labledSynthon.count(":") > (Chem.MolFromSmiles(labledSynthon).GetNumHeavyAtoms()+1) / 3) or \ - (CalcNumRings(Chem.MolFromSmiles(labledSynthon)) != 0 and labledSynthon.count( - ":") > (Chem.MolFromSmiles(labledSynthon).GetNumHeavyAtoms()+1) / 2): - #if labledSynthon and labledSynthon.count(":") > self.__maxNumberOfReactionCentersPerFragment: + labledSynthon = self.__getLabledSmiles( + product, lablesSet.split(";"), cutCount=None + ) + if ( + not labledSynthon + or labledSynthon.count(":") + > self.__maxNumberOfReactionCentersPerFragment + or ( + CalcNumRings(Chem.MolFromSmiles(labledSynthon)) == 0 + and labledSynthon.count(":") + > (Chem.MolFromSmiles(labledSynthon).GetNumHeavyAtoms() + 1) / 3 + ) + or ( + CalcNumRings(Chem.MolFromSmiles(labledSynthon)) != 0 + and labledSynthon.count(":") + > (Chem.MolFromSmiles(labledSynthon).GetNumHeavyAtoms() + 1) / 2 + ) + ): + # if labledSynthon and labledSynthon.count(":") > self.__maxNumberOfReactionCentersPerFragment: Ignore = True break if labledSynthon: @@ -888,37 +1281,60 @@ def __firstMolCut(self, mol ): reagSetKey = ".".join(synthonsProdSet) if reagSetKey not in allSyntheticPathways: if self.SynthLib: - allSyntheticPathways[reagSetKey] = syntheticPathway(name=reagSetName, - reagentsNumber=len(prodSet), synthPathwayReactions=[reactionSetup[rId]['Name']], - cutLevel=1, SynthLibProvided = True) + allSyntheticPathways[reagSetKey] = syntheticPathway( + name=reagSetName, + reagentsNumber=len(prodSet), + synthPathwayReactions=[reactionSetup[rId]["Name"]], + cutLevel=1, + SynthLibProvided=True, + ) else: - allSyntheticPathways[reagSetKey] = syntheticPathway(name=reagSetName, - reagentsNumber=len(prodSet), synthPathwayReactions=[reactionSetup[rId]['Name']], - cutLevel=1) - allSyntheticPathways[reagSetKey].directParentsSynthPathways.append(allSyntheticPathways["zeroCombin"]) + allSyntheticPathways[reagSetKey] = syntheticPathway( + name=reagSetName, + reagentsNumber=len(prodSet), + synthPathwayReactions=[reactionSetup[rId]["Name"]], + cutLevel=1, + ) + allSyntheticPathways[reagSetKey].directParentsSynthPathways.append( + allSyntheticPathways["zeroCombin"] + ) else: continue for synth in synthonsProdSet: if synth not in allSynthons: if self.SynthLib: - allSynthons[synth] = synthon(synth, - syntheticPathway=[allSyntheticPathways[reagSetKey]], - cutLevel=cutLevel, SynthLibProvided = True) + allSynthons[synth] = synthon( + synth, + syntheticPathway=[allSyntheticPathways[reagSetKey]], + cutLevel=cutLevel, + SynthLibProvided=True, + ) else: - allSynthons[synth] = synthon(synth, syntheticPathway=[allSyntheticPathways[reagSetKey]], - cutLevel=cutLevel) - elif allSyntheticPathways[reagSetKey] not in set(allSynthons[synth].syntheticPathway): - allSynthons[synth].syntheticPathway.append(allSyntheticPathways[reagSetKey]) - allSyntheticPathways[reagSetKey].participatingSynthon.append(allSynthons[synth]) + allSynthons[synth] = synthon( + synth, + syntheticPathway=[allSyntheticPathways[reagSetKey]], + cutLevel=cutLevel, + ) + elif allSyntheticPathways[reagSetKey] not in set( + allSynthons[synth].syntheticPathway + ): + allSynthons[synth].syntheticPathway.append( + allSyntheticPathways[reagSetKey] + ) + allSyntheticPathways[reagSetKey].participatingSynthon.append( + allSynthons[synth] + ) if self.__fragmentationMode == "one_by_one": allSynthons[synth].rIdsToGetIt.append(ind) - #allSyntheticPathways[reagSetKey].printDetailedReagentsSetInfo() + # allSyntheticPathways[reagSetKey].printDetailedReagentsSetInfo() if self.__fragmentationMode == "one_by_one": return allSynthons, allSyntheticPathways, ind return allSynthons, allSyntheticPathways - def __cutOneSynthonHierarchically(self, synthOld, oldComb, allSynthons, allSyntheticPathways, cutLevel): + def __cutOneSynthonHierarchically( + self, synthOld, oldComb, allSynthons, allSyntheticPathways, cutLevel + ): if synthOld.smiles == "CC(Oc1cc(-c2c(C[NH:20]C)nn(C)c2C#N)cnc1N)c1cc(F)ccc1[CH:10]=O": print("*************************************************") mol = Chem.MolFromSmiles(synthOld.smiles) @@ -927,10 +1343,10 @@ def __cutOneSynthonHierarchically(self, synthOld, oldComb, allSynthons, allSynth mol.GetRingInfo().NumRings() synthOld.AtomNumbers = mol.GetNumAtoms() successfulCut = False - for ind,rId in enumerate(self.__reactionsToWorkWith): + for ind, rId in enumerate(self.__reactionsToWorkWith): if self.__fragmentationMode == "one_by_one" and ind < synthOld.rIdsToGetIt[-1]: continue - cuttingRule = Reactions.ReactionFromSmarts(self.__reactionSetup[rId]['SMARTS']) + cuttingRule = Reactions.ReactionFromSmarts(self.__reactionSetup[rId]["SMARTS"]) products = cuttingRule.RunReactants((mol,)) if products: setsCount = len(products) @@ -939,40 +1355,70 @@ def __cutOneSynthonHierarchically(self, synthOld, oldComb, allSynthons, allSynth synthonsProdSet = [] Ignore = False for product in prodSet: - if product.GetNumHeavyAtoms() < 3 or ("[V]" in Chem.MolToSmiles(product, canonical=True) and product.GetNumHeavyAtoms() < 4)\ - or Chem.MolToSmiles(product, canonical=True) in self.__SmilesToIgnore: + if ( + product.GetNumHeavyAtoms() < 3 + or ( + "[V]" in Chem.MolToSmiles(product, canonical=True) + and product.GetNumHeavyAtoms() < 4 + ) + or Chem.MolToSmiles(product, canonical=True) in self.__SmilesToIgnore + ): Ignore = True break if Ignore: continue for product in prodSet: - Labels = self.__reactionSetup[rId]['Labels'] + Labels = self.__reactionSetup[rId]["Labels"] for lablesSet in Labels.split("|"): - labledSynthon = self.__getLabledSmiles(product, lablesSet.split(";"), cutCount=None) - if Chem.MolToSmiles(product) == "*c1c(C[NH:20]C)nn(C)c1C#N" or Chem.MolToSmiles(product) == "*c1cnc(N)c(OC(C)c2cc(F)ccc2[CH:10]=O)c1": + labledSynthon = self.__getLabledSmiles( + product, lablesSet.split(";"), cutCount=None + ) + if ( + Chem.MolToSmiles(product) == "*c1c(C[NH:20]C)nn(C)c1C#N" + or Chem.MolToSmiles(product) + == "*c1cnc(N)c(OC(C)c2cc(F)ccc2[CH:10]=O)c1" + ): print("#################################") print(labledSynthon) if not labledSynthon: print(Chem.MolToSmiles(mol, canonical=True)) print(".".join([Chem.MolToSmiles(x, canonical=True) for x in prodSet])) - elif labledSynthon.count(":") > self.__maxNumberOfReactionCentersPerFragment or ( - CalcNumRings(Chem.MolFromSmiles(labledSynthon)) == 0 and - labledSynthon.count(":") > (Chem.MolFromSmiles(labledSynthon).GetNumHeavyAtoms() +1) / 3) or \ - (CalcNumRings(Chem.MolFromSmiles(labledSynthon)) != 0 and labledSynthon.count( - ":") > (Chem.MolFromSmiles(labledSynthon).GetNumHeavyAtoms() +1) / 2): - #if labledSynthon and labledSynthon.count(":") > self.__maxNumberOfReactionCentersPerFragment: - if Chem.MolToSmiles(product) == "*c1c(C[NH:20]C)nn(C)c1C#N" or Chem.MolToSmiles( - product) == "*c1cnc(N)c(OC(C)c2cc(F)ccc2[CH:10]=O)c1": + elif ( + labledSynthon.count(":") > self.__maxNumberOfReactionCentersPerFragment + or ( + CalcNumRings(Chem.MolFromSmiles(labledSynthon)) == 0 + and labledSynthon.count(":") + > (Chem.MolFromSmiles(labledSynthon).GetNumHeavyAtoms() + 1) / 3 + ) + or ( + CalcNumRings(Chem.MolFromSmiles(labledSynthon)) != 0 + and labledSynthon.count(":") + > (Chem.MolFromSmiles(labledSynthon).GetNumHeavyAtoms() + 1) / 2 + ) + ): + # if labledSynthon and labledSynthon.count(":") > self.__maxNumberOfReactionCentersPerFragment: + if ( + Chem.MolToSmiles(product) == "*c1c(C[NH:20]C)nn(C)c1C#N" + or Chem.MolToSmiles(product) + == "*c1cnc(N)c(OC(C)c2cc(F)ccc2[CH:10]=O)c1" + ): print("Ignore1") Ignore = True break pat = re.compile("\[\w*:\w*\]") marks = frozenset( - [labledSynthon[m.start():m.start() + 2] + labledSynthon[m.end() - 4:m.end()] for m in - re.finditer(pat, labledSynthon)]) + [ + labledSynthon[m.start() : m.start() + 2] + + labledSynthon[m.end() - 4 : m.end()] + for m in re.finditer(pat, labledSynthon) + ] + ) if marks in self.forbiddenMarks: - if Chem.MolToSmiles(product) == "*c1c(C[NH:20]C)nn(C)c1C#N" or Chem.MolToSmiles( - product) == "*c1cnc(N)c(OC(C)c2cc(F)ccc2[CH:10]=O)c1": + if ( + Chem.MolToSmiles(product) == "*c1c(C[NH:20]C)nn(C)c1C#N" + or Chem.MolToSmiles(product) + == "*c1cnc(N)c(OC(C)c2cc(F)ccc2[CH:10]=O)c1" + ): print("Ignore2") Ignore = True break @@ -986,21 +1432,34 @@ def __cutOneSynthonHierarchically(self, synthOld, oldComb, allSynthons, allSynth allMarks = [] oldMarksPresentInNewSynthons = [] for labledSynthon in synthonsProdSet: - marksNew = sorted([labledSynthon[m.start():m.start() + 2] + labledSynthon[m.end() - 4:m.end()] for m - in re.finditer(pat, labledSynthon)]) - allMarks.extend([labledSynthon[m.start():m.start() + 2] + labledSynthon[m.end() - 4:m.end()] for m - in re.finditer(pat, labledSynthon)]) + marksNew = sorted( + [ + labledSynthon[m.start() : m.start() + 2] + + labledSynthon[m.end() - 4 : m.end()] + for m in re.finditer(pat, labledSynthon) + ] + ) + allMarks.extend( + [ + labledSynthon[m.start() : m.start() + 2] + + labledSynthon[m.end() - 4 : m.end()] + for m in re.finditer(pat, labledSynthon) + ] + ) for m in synthOld.marks: if m in marksNew: oldMarksPresentInNewSynthons.append(m) - if len(marksNew) == len(synthOld.marks) and marksNew != synthOld.marks: + if ( + len(marksNew) == len(synthOld.marks) + and marksNew != synthOld.marks + ): marksCheckPrev.append(0) else: marksCheckPrev.append(1) if sorted(oldMarksPresentInNewSynthons) == synthOld.marks: marksCheckPrev.append(1) if 1 not in set(marksCheckPrev): - if labledSynthon=="Cn1nc(C[NH:20]C)[cH:21]c1C#N": + if labledSynthon == "Cn1nc(C[NH:20]C)[cH:21]c1C#N": print(synthOld.marks) print(marksNew) print("Continue 1") @@ -1008,92 +1467,143 @@ def __cutOneSynthonHierarchically(self, synthOld, oldComb, allSynthons, allSynth else: allNewMarks = set(allMarks) - checkTotalMarks = [1 for mark in synthOld.marks if mark not in allNewMarks] + checkTotalMarks = [ + 1 for mark in synthOld.marks if mark not in allNewMarks + ] if checkTotalMarks: if labledSynthon == "Cn1nc(C[NH:20]C)[cH:21]c1C#N": print("Continue 2") continue successfulCut = True reagSetName1 = oldComb.name.split("|") - reagSetName1.append(rId + "_" + str(i)) + reagSetName1.append(rId + "_" + str(i)) reagSetName1.sort() reagSetName = "|".join(reagSetName1) reagSetKeyList = [] reagSetKeyList.extend(synthonsProdSet) - reagSetKeyList.extend([synth.smiles for synth in oldComb.participatingSynthon if synthOld != synth]) + reagSetKeyList.extend( + [ + synth.smiles + for synth in oldComb.participatingSynthon + if synthOld != synth + ] + ) reagSetKeyList.sort() reagSetKey = ".".join(reagSetKeyList) if reagSetKey not in allSyntheticPathways: if self.SynthLib: - allSyntheticPathways[reagSetKey] = syntheticPathway(name = reagSetName, - reagentsNumber=len(prodSet)+oldComb.reagentsNumber -1, - synthPathwayReactions=oldComb.synthPathwayReactions, - directParentsSynthPathways=[oldComb], cutLevel=cutLevel, SynthLibProvided = True) + allSyntheticPathways[reagSetKey] = syntheticPathway( + name=reagSetName, + reagentsNumber=len(prodSet) + oldComb.reagentsNumber - 1, + synthPathwayReactions=oldComb.synthPathwayReactions, + directParentsSynthPathways=[oldComb], + cutLevel=cutLevel, + SynthLibProvided=True, + ) else: - allSyntheticPathways[reagSetKey] = syntheticPathway(name=reagSetName, - reagentsNumber=len( - prodSet) + oldComb.reagentsNumber - 1, - synthPathwayReactions=oldComb.synthPathwayReactions, - directParentsSynthPathways=[oldComb], - cutLevel=cutLevel) - if self.__reactionSetup[rId]['Name'] not in set(allSyntheticPathways[reagSetKey].synthPathwayReactions): - allSyntheticPathways[reagSetKey].synthPathwayReactions.append(self.__reactionSetup[rId]['Name']) - oldComb.directChildrenSynthPathways.append(allSyntheticPathways[reagSetKey]) - elif oldComb.name != reagSetName and allSyntheticPathways[reagSetKey] not in oldComb.directParentsSynthPathways: - allSyntheticPathways[reagSetKey].directParentsSynthPathways.append(oldComb) - oldComb.directChildrenSynthPathways.append(allSyntheticPathways[reagSetKey]) + allSyntheticPathways[reagSetKey] = syntheticPathway( + name=reagSetName, + reagentsNumber=len(prodSet) + oldComb.reagentsNumber - 1, + synthPathwayReactions=oldComb.synthPathwayReactions, + directParentsSynthPathways=[oldComb], + cutLevel=cutLevel, + ) + if self.__reactionSetup[rId]["Name"] not in set( + allSyntheticPathways[reagSetKey].synthPathwayReactions + ): + allSyntheticPathways[reagSetKey].synthPathwayReactions.append( + self.__reactionSetup[rId]["Name"] + ) + oldComb.directChildrenSynthPathways.append( + allSyntheticPathways[reagSetKey] + ) + elif ( + oldComb.name != reagSetName + and allSyntheticPathways[reagSetKey] + not in oldComb.directParentsSynthPathways + ): + allSyntheticPathways[reagSetKey].directParentsSynthPathways.append( + oldComb + ) + oldComb.directChildrenSynthPathways.append( + allSyntheticPathways[reagSetKey] + ) continue else: continue for synth in synthonsProdSet: if synth not in allSynthons: if self.SynthLib: - allSynthons[synth] = synthon(synth, - syntheticPathway=[allSyntheticPathways[reagSetKey]], - cutLevel=cutLevel, SynthLibProvided = True) + allSynthons[synth] = synthon( + synth, + syntheticPathway=[allSyntheticPathways[reagSetKey]], + cutLevel=cutLevel, + SynthLibProvided=True, + ) else: - allSynthons[synth] = synthon(synth, - syntheticPathway=[allSyntheticPathways[reagSetKey]], cutLevel=cutLevel) + allSynthons[synth] = synthon( + synth, + syntheticPathway=[allSyntheticPathways[reagSetKey]], + cutLevel=cutLevel, + ) - elif allSyntheticPathways[reagSetKey] not in set(allSynthons[synth].syntheticPathway): - allSynthons[synth].syntheticPathway.append(allSyntheticPathways[reagSetKey]) + elif allSyntheticPathways[reagSetKey] not in set( + allSynthons[synth].syntheticPathway + ): + allSynthons[synth].syntheticPathway.append( + allSyntheticPathways[reagSetKey] + ) - allSyntheticPathways[reagSetKey].participatingSynthon.append(allSynthons[synth]) + allSyntheticPathways[reagSetKey].participatingSynthon.append( + allSynthons[synth] + ) allSynthons[synth].directParents.append(synthOld) synthOld.directChildren.append(allSynthons[synth]) if self.__fragmentationMode == "one_by_one": allSynthons[synth].rIdsToGetIt.append(ind) parentMarksList = [m[0] for m in re.finditer(pat, synthOld.smiles)] kidsMarksList = [m[0] for m in re.finditer(pat, "".join(synthonsProdSet))] - if '[NH2:20]' in kidsMarksList and parentMarksList.count('[NH:20]') == kidsMarksList.count('[NH:20]') + 1: + if ( + "[NH2:20]" in kidsMarksList + and parentMarksList.count("[NH:20]") + == kidsMarksList.count("[NH:20]") + 1 + ): for synthToMark in synthonsProdSet: - if '[NH2:20]' in synthToMark: + if "[NH2:20]" in synthToMark: allSynthons[synthToMark].bivalentN = True for oldSynth in oldComb.participatingSynthon: if oldSynth != synthOld: - allSyntheticPathways[reagSetKey].participatingSynthon.append(oldSynth) + allSyntheticPathways[reagSetKey].participatingSynthon.append( + oldSynth + ) if cutLevel < self.__MaxNumberOfStages: for synth in allSyntheticPathways[reagSetKey].participatingSynthon: - self.__cutOneSynthonHierarchically(allSynthons[synth.smiles], - allSyntheticPathways[reagSetKey], allSynthons, - allSyntheticPathways, cutLevel + 1) - #allSyntheticPathways[reagSetKey].printDetailedReagentsSetInfo() + self.__cutOneSynthonHierarchically( + allSynthons[synth.smiles], + allSyntheticPathways[reagSetKey], + allSynthons, + allSyntheticPathways, + cutLevel + 1, + ) + # allSyntheticPathways[reagSetKey].printDetailedReagentsSetInfo() if self.__fragmentationMode == "one_by_one" and successfulCut: break - def __getMacroCycleLabledSmiles(self, productMolecule: Chem.rdchem.Mol, Labels:list, cutCount): + def __getMacroCycleLabledSmiles(self, productMolecule: Chem.rdchem.Mol, Labels: list, cutCount): productSmiles = Chem.MolToSmiles(productMolecule, canonical=True) for label in Labels: if productSmiles.find(label.split("->")[0]) != -1: labeledSmiles = checkLable(productSmiles, label) if labeledSmiles: if "*" in labeledSmiles: - labeledSmiles = self.__getMacroCycleLabledSmiles(Chem.MolFromSmiles(labeledSmiles), Labels, cutCount) + labeledSmiles = self.__getMacroCycleLabledSmiles( + Chem.MolFromSmiles(labeledSmiles), Labels, cutCount + ) return labeledSmiles print("WARNING! No lable were assigned to the smiles: " + productSmiles) return False - def __getLabledSmiles(self, productMolecule: Chem.rdchem.Mol, Labels:list, cutCount): + def __getLabledSmiles(self, productMolecule: Chem.rdchem.Mol, Labels: list, cutCount): productSmiles = Chem.MolToSmiles(productMolecule, canonical=True) for label in Labels: for sublabel in label.split(","): @@ -1104,6 +1614,7 @@ def __getLabledSmiles(self, productMolecule: Chem.rdchem.Mol, Labels:list, cutCo print("WARNING! No lable were assigned to the smiles: " + productSmiles) return False + def fragmentMolecule(smiles, SyntOnfragmentor, simTh=-1): mol = readMol(smiles) if mol: @@ -1117,14 +1628,34 @@ def fragmentMolecule(smiles, SyntOnfragmentor, simTh=-1): if SyntOnfragmentor.SynthLib != None: firstSynthons = getShortestSyntheticPathways(allSyntheticPathways) for comb in firstSynthons: - comb.checkAvailability(SyntOnfragmentor.SynthLib, simTh, SyntOnfragmentor.FindAnaloguesOfMissingSynthons) + comb.checkAvailability( + SyntOnfragmentor.SynthLib, + simTh, + SyntOnfragmentor.FindAnaloguesOfMissingSynthons, + ) return allSyntheticPathways, allSynthons else: - return None,None + return None, None + -def analoguesLibraryGeneration(Smiles_molNameTuple, SyntOnfragmentor, outDir, simTh=-1, strictAvailabilityMode=False, desiredNumberOfNewMols=1000): - with open(os.path.join(outDir, "SynthonsForAnalogsGenerationForMol" + str(Smiles_molNameTuple[1]) + ".smi"), "w") as outSynthons: - allSyntheticPathways, allSynthons = fragmentMolecule(Smiles_molNameTuple[0], SyntOnfragmentor, simTh=simTh) +def analoguesLibraryGeneration( + Smiles_molNameTuple, + SyntOnfragmentor, + outDir, + simTh=-1, + strictAvailabilityMode=False, + desiredNumberOfNewMols=1000, +): + with open( + os.path.join( + outDir, + "SynthonsForAnalogsGenerationForMol" + str(Smiles_molNameTuple[1]) + ".smi", + ), + "w", + ) as outSynthons: + allSyntheticPathways, allSynthons = fragmentMolecule( + Smiles_molNameTuple[0], SyntOnfragmentor, simTh=simTh + ) """fsynthonsAfterOneCut = getShortestSyntheticPathways(allSyntheticPathways) shortestSynthesis = findShortestSynthPathWithAvailableSynthLib(fsynthonsAfterOneCut, showAll=False, firstLaunch=True)""" @@ -1137,53 +1668,98 @@ def analoguesLibraryGeneration(Smiles_molNameTuple, SyntOnfragmentor, outDir, si CompletePath = True if "MR" in comb.name: continue - synthonsDict, SynthonsForAnaloguesSynthesisLocal = comb.getSynthonsForAnaloguesGeneration(SyntOnfragmentor.SynthLib, - simTh, strictAvailabilityMode = strictAvailabilityMode) + ( + synthonsDict, + SynthonsForAnaloguesSynthesisLocal, + ) = comb.getSynthonsForAnaloguesGeneration( + SyntOnfragmentor.SynthLib, + simTh, + strictAvailabilityMode=strictAvailabilityMode, + ) if synthonsDict and SynthonsForAnaloguesSynthesisLocal: - outSynthons.write("****************************************** " + comb.name + " ******************************************\n") + outSynthons.write( + "****************************************** " + + comb.name + + " ******************************************\n" + ) outSynthons.writelines(SynthonsForAnaloguesSynthesisLocal) - reactionsUsedInFragmentationReactions = [rid.split("_")[0] for rid in comb.name.split("|")] - reactionForReconstruction = SyntOnfragmentor.getReactionForReconstruction(reactionsUsedInFragmentationReactions) - enumerator = enumeration(outDir=outDir, Synthons=synthonsDict, - reactionSMARTS=reactionForReconstruction, maxNumberOfReactedSynthons=len(synthonsDict), - desiredNumberOfNewMols=desiredNumberOfNewMols, nCores=1, analoguesEnumeration=True) - #reconstructedMols.update(enumerator.newAnaloguesGeneration()) + reactionsUsedInFragmentationReactions = [ + rid.split("_")[0] for rid in comb.name.split("|") + ] + reactionForReconstruction = SyntOnfragmentor.getReactionForReconstruction( + reactionsUsedInFragmentationReactions + ) + enumerator = enumeration( + outDir=outDir, + Synthons=synthonsDict, + reactionSMARTS=reactionForReconstruction, + maxNumberOfReactedSynthons=len(synthonsDict), + desiredNumberOfNewMols=desiredNumberOfNewMols, + nCores=1, + analoguesEnumeration=True, + ) + # reconstructedMols.update(enumerator.newAnaloguesGeneration()) reconstructedMols.update(enumerator.AnaloguesGeneration()) - if len(reconstructedMols)>=desiredNumberOfNewMols: + if len(reconstructedMols) >= desiredNumberOfNewMols: break - #print(comb.name) + # print(comb.name) if not CompletePath: synthonsAfterOneCut = getShortestSyntheticPathways(allSyntheticPathways) - shortestSynthesis = findShortestSynthPathWithAvailableBBlib(synthonsAfterOneCut, showAll=False, - firstLaunch=True) + shortestSynthesis = findShortestSynthPathWithAvailableBBlib( + synthonsAfterOneCut, showAll=False, firstLaunch=True + ) for comb in shortestSynthesis: if comb.availabilityRate == 1.0: if "MR" in comb.name: continue - #comb.printDetailedReagentsSetInfo() - synthonsDict, SynthonsForAnaloguesSynthesisLocal = comb.getSynthonsForAnaloguesGeneration( - SyntOnfragmentor.SynthLib, simTh, strictAvailabilityMode=strictAvailabilityMode) - #comb.printDetailedReagentsSetInfo() + # comb.printDetailedReagentsSetInfo() + ( + synthonsDict, + SynthonsForAnaloguesSynthesisLocal, + ) = comb.getSynthonsForAnaloguesGeneration( + SyntOnfragmentor.SynthLib, + simTh, + strictAvailabilityMode=strictAvailabilityMode, + ) + # comb.printDetailedReagentsSetInfo() if synthonsDict and SynthonsForAnaloguesSynthesisLocal: outSynthons.write( - "****************************************** " + comb.name + " ******************************************\n") + "****************************************** " + + comb.name + + " ******************************************\n" + ) outSynthons.writelines(SynthonsForAnaloguesSynthesisLocal) - reactionsUsedInFragmentationReactions = [rid.split("_")[0] for rid in comb.name.split("|")] - reactionForReconstruction = SyntOnfragmentor.getReactionForReconstruction( - reactionsUsedInFragmentationReactions) - enumerator = enumeration(outDir=outDir, Synthons=synthonsDict, - reactionSMARTS=reactionForReconstruction, - maxNumberOfReactedSynthons=len(synthonsDict), - desiredNumberOfNewMols=1000000, nCores=1, - analoguesEnumeration=True) + reactionsUsedInFragmentationReactions = [ + rid.split("_")[0] for rid in comb.name.split("|") + ] + reactionForReconstruction = ( + SyntOnfragmentor.getReactionForReconstruction( + reactionsUsedInFragmentationReactions + ) + ) + enumerator = enumeration( + outDir=outDir, + Synthons=synthonsDict, + reactionSMARTS=reactionForReconstruction, + maxNumberOfReactedSynthons=len(synthonsDict), + desiredNumberOfNewMols=1000000, + nCores=1, + analoguesEnumeration=True, + ) # reconstructedMols.update(enumerator.newAnaloguesGeneration()) reconstructedMols.update(enumerator.AnaloguesGeneration()) - with open(os.path.join(outDir, "AnalogsForMol" + str(Smiles_molNameTuple[1]) + ".smi"), "w") as out: + with open( + os.path.join(outDir, "AnalogsForMol" + str(Smiles_molNameTuple[1]) + ".smi"), + "w", + ) as out: for recMolsSmiles in reconstructedMols: out.write(recMolsSmiles + "\n") -def findShortestSynthPathWithAvailableBBlib(firstSynthons:list, allCombSynthons = None, firstLaunch=True, showAll=True): + +def findShortestSynthPathWithAvailableBBlib( + firstSynthons: list, allCombSynthons=None, firstLaunch=True, showAll=True +): max_rec = 0x100000 # May segfault without this line. 0x100 is a guess at the size of each stack frame. resource.setrlimit(resource.RLIMIT_STACK, [0x100 * max_rec, resource.RLIM_INFINITY]) @@ -1194,9 +1770,11 @@ def findShortestSynthPathWithAvailableBBlib(firstSynthons:list, allCombSynthons for comb in firstSynthons: allCombSynthons.append(comb) if comb.directChildrenSynthPathways: - findShortestSynthPathWithAvailableBBlib(comb.directChildrenSynthPathways, allCombSynthons, firstLaunch=False) + findShortestSynthPathWithAvailableBBlib( + comb.directChildrenSynthPathways, allCombSynthons, firstLaunch=False + ) if firstLaunch: - #allCombSynthons.sort(key=lambda x: (x.availabilityRate, x.reagentsNumber*-1), reverse=True) + # allCombSynthons.sort(key=lambda x: (x.availabilityRate, x.reagentsNumber*-1), reverse=True) allCombSynthons.sort(key=lambda x: (x.availabilityRate, x.reagentsNumber), reverse=True) rate = allCombSynthons[0].availabilityRate reagents = allCombSynthons[0].reagentsNumber @@ -1208,17 +1786,23 @@ def findShortestSynthPathWithAvailableBBlib(firstSynthons:list, allCombSynthons for comb in allCombSynthons: if comb.availabilityRate == rate and comb.reagentsNumber == reagents: for savedComb in shortestAvailablePathways: - if set([x.smiles for x in comb.participatingSynthon]) == set([x.smiles for x in savedComb.participatingSynthon]): + if set([x.smiles for x in comb.participatingSynthon]) == set( + [x.smiles for x in savedComb.participatingSynthon] + ): add = False break if add: shortestAvailablePathways.append(comb) - #else: - #savedComb.synthPathwayReactions.extend(comb.synthPathwayReactions) - #savedComb.synthPathwayReactions = list(set(savedComb.synthPathwayReactions)) - print(str(len(shortestAvailablePathways)) + " equivalent synthetic pathway(s) have been found.") + # else: + # savedComb.synthPathwayReactions.extend(comb.synthPathwayReactions) + # savedComb.synthPathwayReactions = list(set(savedComb.synthPathwayReactions)) + print( + str(len(shortestAvailablePathways)) + + " equivalent synthetic pathway(s) have been found." + ) return shortestAvailablePathways + def getShortestSyntheticPathways(allSyntheticPathways): firstSynthons = [] for comb in allSyntheticPathways: @@ -1226,12 +1810,13 @@ def getShortestSyntheticPathways(allSyntheticPathways): firstSynthons.append(allSyntheticPathways[comb]) return firstSynthons + def getLongestSyntheticPathways(allSyntheticPathways): leafs = [] for comb in allSyntheticPathways: if not allSyntheticPathways[comb].directChildrenSynthPathways and comb != "zeroCombin": - #print(".................----------------------------------........................") - #print(str(allSyntheticPathways[comb].name) + " " + ".".join([x.smiles for x in allSyntheticPathways[comb].participatingSynthon])) + # print(".................----------------------------------........................") + # print(str(allSyntheticPathways[comb].name) + " " + ".".join([x.smiles for x in allSyntheticPathways[comb].participatingSynthon])) leafs.append(allSyntheticPathways[comb]) return leafs diff --git a/src/SyntOn_BBs.py b/src/SyntOn_BBs.py index 08c17e0..000e90c 100644 --- a/src/SyntOn_BBs.py +++ b/src/SyntOn_BBs.py @@ -1,26 +1,47 @@ -import re,os,sys +import os +import re +import sys import xml.etree.ElementTree as ET + from rdkit import Chem from rdkit.Chem import rdChemReactions as Reactions from rdkit.Chem.Scaffolds.MurckoScaffold import * + srcPath = os.path.split(os.path.realpath(__file__))[0] sys.path.insert(1, srcPath) from SyntOn_Classifier import BBClassifier from UsefulFunctions import * -def mainSynthonsGenerator(initSmi, keepPG=False, Classes=None, returnDict=False, returnBoolAndDict=False): - solventsToIgnore = ["OC(=O)C(=O)O", "CC(=O)O", "OS(=O)(=O)O", "[O-]Cl(=O)(=O)=O", "OP(=O)(O)O", "OC(=O)C(F)(F)F", - "OS(=O)(=O)C(F)(F)F", "OC(=O)O", "[O-]S(=O)(=O)C(F)(F)F", "OC=O", "OC(=O)/C=C\C(=O)O", "[O-]C(=O)C(F)(F)F", - "OC(=O)/C=C/C(=O)O"] - canonicalSmilesOfSolventsToIgnore = set([Chem.MolToSmiles(Chem.MolFromSmiles(x), canonical=True) for x in solventsToIgnore]) +def mainSynthonsGenerator( + initSmi, keepPG=False, Classes=None, returnDict=False, returnBoolAndDict=False +): + solventsToIgnore = [ + "OC(=O)C(=O)O", + "CC(=O)O", + "OS(=O)(=O)O", + "[O-]Cl(=O)(=O)=O", + "OP(=O)(O)O", + "OC(=O)C(F)(F)F", + "OS(=O)(=O)C(F)(F)F", + "OC(=O)O", + "[O-]S(=O)(=O)C(F)(F)F", + "OC=O", + "OC(=O)/C=C\C(=O)O", + "[O-]C(=O)C(F)(F)F", + "OC(=O)/C=C/C(=O)O", + ] + canonicalSmilesOfSolventsToIgnore = set( + [Chem.MolToSmiles(Chem.MolFromSmiles(x), canonical=True) for x in solventsToIgnore] + ) initMol = readMol(initSmi) query = Chem.MolFromSmarts( - "[#6]-[#6]-[#8]-[#6].[#6]-[#8]-[#6](-[#6])=O.[#6]-[#8]-[#6](-[#6])=O.[#6]-[#8]-[#6](-[#6])=O") + "[#6]-[#6]-[#8]-[#6].[#6]-[#8]-[#6](-[#6])=O.[#6]-[#8]-[#6](-[#6])=O.[#6]-[#8]-[#6](-[#6])=O" + ) if initMol == None or initMol.HasSubstructMatch(query): finalSynthon = {} azoles = False - elif len(initSmi.split(".")) > 1: # case of input mixtures + elif len(initSmi.split(".")) > 1: # case of input mixtures finalSynthon = {} azoles = False for smi in initSmi.split("."): @@ -36,8 +57,14 @@ def mainSynthonsGenerator(initSmi, keepPG=False, Classes=None, returnDict=False, else: if Classes == None: AllClasses = BBClassifier(mol=initMol) - Classes = [clas for clas in AllClasses if "MedChemHighlights" not in clas and "DEL" not in clas] - BBmarks = os.path.join(os.path.split(os.path.split(os.path.realpath(__file__))[0])[0], "config" , "BB_Marks.xml") + Classes = [ + clas for clas in AllClasses if "MedChemHighlights" not in clas and "DEL" not in clas + ] + BBmarks = os.path.join( + os.path.split(os.path.split(os.path.realpath(__file__))[0])[0], + "config", + "BB_Marks.xml", + ) tree = ET.parse(BBmarks) BB_Marks = tree.getroot() MarksSetup = __getReactionSMARTS(BB_Marks) @@ -53,7 +80,13 @@ def mainSynthonsGenerator(initSmi, keepPG=False, Classes=None, returnDict=False, if "Bifunctional" in Cls or "Trifunctional" in Cls: polyfunc = True polyfuncName.append(Cls) - if not ("Nboc" in Cls or "Ncbz" in Cls or "Nfmoc" in Cls or "Ester" in Cls or "TFAc" in Cls): + if not ( + "Nboc" in Cls + or "Ncbz" in Cls + or "Nfmoc" in Cls + or "Ester" in Cls + or "TFAc" in Cls + ): keepSynthonsWithPG = True break while ind < len(Classes): @@ -65,14 +98,23 @@ def mainSynthonsGenerator(initSmi, keepPG=False, Classes=None, returnDict=False, if polyfunc and "Bifunctional" not in Classes[ind]: ignoreThisClass = False for subName in [y for x in polyfuncName for y in x.split("_")]: - if subName in Classes[ind] and "Bifunctional_NbnDi_Amines" not in polyfuncName: + if ( + subName in Classes[ind] + and "Bifunctional_NbnDi_Amines" not in polyfuncName + ): ignoreThisClass = True ind += 1 break if ignoreThisClass: continue for mol in molsToWorkWith: - synthons = __synthonsAssignement(Classes[ind], molsToWorkWith[mol], mol, MarksSetup, keepSynthonsWithPG) + synthons = __synthonsAssignement( + Classes[ind], + molsToWorkWith[mol], + mol, + MarksSetup, + keepSynthonsWithPG, + ) if synthons: for synth in synthons: if synth not in synthonsAfterMonofuncClasses: @@ -95,10 +137,16 @@ def mainSynthonsGenerator(initSmi, keepPG=False, Classes=None, returnDict=False, finalSynthon[synth].update(synthonsAfterMonofuncClasses[synth]) if polyfunc: for i, ind in enumerate(polyfuncInd): - if i < len(polyfuncInd) - 1 : + if i < len(polyfuncInd) - 1: extraMols = {} for mol in molsToWorkWith: - synthons = __synthonsAssignement(Classes[ind], molsToWorkWith[mol], mol, MarksSetup, keepSynthonsWithPG) + synthons = __synthonsAssignement( + Classes[ind], + molsToWorkWith[mol], + mol, + MarksSetup, + keepSynthonsWithPG, + ) if synthons: for synth in synthons: if synth not in finalSynthon: @@ -145,9 +193,9 @@ def mainSynthonsGenerator(initSmi, keepPG=False, Classes=None, returnDict=False, if not finalSynthon and "Esters_Esters" in Classes: ReactionLIST = "[C;$(C(=O)[#6]):1][O:2]>>*[C;+0:1]|[O;!R;$(O(C(=O)[#6])[CX4,c]):1][C;$(C(=O)):2]>>*[O;+0:1]" LabelList = "*C->C:10,*[13C]->13C:10,*[13CH]->13C:10|*O->O:20" - finalSynthon = __NormalSynthonsGenerator(LabelList, ReactionLIST, - set(), "Esters_Esters", initMol, - func=1) + finalSynthon = __NormalSynthonsGenerator( + LabelList, ReactionLIST, set(), "Esters_Esters", initMol, func=1 + ) if "Ketones_Ketones" in Classes: newSynthToAdd = {} for synthon in finalSynthon: @@ -156,7 +204,9 @@ def mainSynthonsGenerator(initSmi, keepPG=False, Classes=None, returnDict=False, newClasses = BBClassifier(mol=synthMol) for cls in newClasses: if "Alcohols" in cls: - nAzoles, nFinalSynthon = mainSynthonsGenerator( synthon, keepPG, [cls], returnBoolAndDict=True) + nAzoles, nFinalSynthon = mainSynthonsGenerator( + synthon, keepPG, [cls], returnBoolAndDict=True + ) for newSynth in nFinalSynthon: if newSynth not in finalSynthon and newSynth not in newSynthToAdd: newSynthToAdd[newSynth] = set() @@ -170,47 +220,95 @@ def mainSynthonsGenerator(initSmi, keepPG=False, Classes=None, returnDict=False, elif returnBoolAndDict: return azoles, finalSynthon else: - print("\n\n\n___________________________________________________________________________________________________") - print("All generated synthons (" + str(len(finalSynthon)) +"): " + ".".join([x for x in finalSynthon])) + print( + "\n\n\n___________________________________________________________________________________________________" + ) + print( + "All generated synthons (" + + str(len(finalSynthon)) + + "): " + + ".".join([x for x in finalSynthon]) + ) print("Col1-Synton Col2-RespectiveBBsClass") for synth in finalSynthon: print(synth + " " + "+".join(finalSynthon[synth])) -def __synthonsAssignement(CurrentClass, PreviousClasses, molSmi, MarksSetup, keepSynthonsWithPG=True): - additionalBifuncClasses = ["Aminoacids_N-AliphaticAmino_Acid", "Aminoacids_N-AromaticAmino_Acid", "Reagents_DiAmines"] - - PGBifunctional = ["Bifunctional_Acid_Ester","Bifunctional_Acid_Nitro", "Bifunctional_Aldehyde_Ester", "Bifunctional_Amine_Ester", - "Bifunctional_Ester_Isocyanates", "Bifunctional_Ester_SO2X", "Bifunctional_Aldehyde_Nitro", - - "Bifunctional_NbocAmino_Acid", "Bifunctional_NcbzAmino_Acid", "Bifunctional_Isothiocyanates_Acid", "Bifunctional_NfmocAmino_Acid", - "Bifunctional_Aldehyde_Nboc", "Bifunctional_NTFAcAmino_Acid", - - "Bifunctional_Boronics_Ncbz", "Bifunctional_Boronics_Nfmoc", - - "Bifunctional_NbnDi_Amines", "Bifunctional_NbocDi_Amines", 'Bifunctional_NcbzDi_Amines', "Bifunctional_NfmocDi_Amines", - "Bifunctional_NTFAcDi_Amines", 'Bifunctional_Di_Amines_NotherCarbamates', - 'Trifunctional_Acid_Aldehyde_Nitro', "Trifunctional_Acid_ArylHalide_Ester", - "Trifunctional_Acid_ArylHalide_Nitro", 'Trifunctional_Amines_ArylHalide_Nitro', - "Trifunctional_NbocAmino_Acid_AlkyneCH", "Trifunctional_NbocAmino_Acid_ArylHalide", - "Trifunctional_NfmocAmino_Acid_AlkyneCH", "Trifunctional_NfmocAmino_Acid_ArylHalide"] - - __FirstReactionAsPreparation = ["Bifunctional_Acid_Aldehyde", "Bifunctional_Aldehyde_ArylHalide", - "Bifunctional_Aldehyde_SO2X", "Bifunctional_Boronics_Acid", - "Bifunctional_Boronics_Aldehyde", "Bifunctional_Hydroxy_Aldehyde", - - 'Trifunctional_Acid_Aldehyde_ArylHalide', "Trifunctional_Acid_Aldehyde_Acetylenes", - 'Trifunctional_Acid_Aldehyde_Nitro', 'Trifunctional_Amines_ArylHalide_Nitro', - "Trifunctional_NbocAmino_Acid_AlkyneCH", "Trifunctional_NfmocAmino_Acid_AlkyneCH", - "Trifunctional_Di_Esters_Amino"] - - PolymerReagents = ["Reagents_PoliOxiranes", "Esters_PoliEsters", "Reagents_PoliIsocyanates", "SulfonylHalides_Poli_Sulfonylhalides"] - - trifuncClassesWithTwoPGs = ['Trifunctional_Acid_Ester_Nitro', "Trifunctional_NbocAmino_Acid_Ester", - "Trifunctional_NbocAmino_Acid_Nitro", "Trifunctional_Amines_Nboc_Ester", - "Trifunctional_Nboc_NCbz_Amino_Acid", "Trifunctional_Nboc_Nfmoc_Amino_Acid", - "Trifunctional_NfmocAmino_Acid_Ester", "Trifunctional_NfmocAmino_Acid_Nitro", - "Trifunctional_Di_Esters_Amino"] +def __synthonsAssignement( + CurrentClass, PreviousClasses, molSmi, MarksSetup, keepSynthonsWithPG=True +): + additionalBifuncClasses = [ + "Aminoacids_N-AliphaticAmino_Acid", + "Aminoacids_N-AromaticAmino_Acid", + "Reagents_DiAmines", + ] + + PGBifunctional = [ + "Bifunctional_Acid_Ester", + "Bifunctional_Acid_Nitro", + "Bifunctional_Aldehyde_Ester", + "Bifunctional_Amine_Ester", + "Bifunctional_Ester_Isocyanates", + "Bifunctional_Ester_SO2X", + "Bifunctional_Aldehyde_Nitro", + "Bifunctional_NbocAmino_Acid", + "Bifunctional_NcbzAmino_Acid", + "Bifunctional_Isothiocyanates_Acid", + "Bifunctional_NfmocAmino_Acid", + "Bifunctional_Aldehyde_Nboc", + "Bifunctional_NTFAcAmino_Acid", + "Bifunctional_Boronics_Ncbz", + "Bifunctional_Boronics_Nfmoc", + "Bifunctional_NbnDi_Amines", + "Bifunctional_NbocDi_Amines", + "Bifunctional_NcbzDi_Amines", + "Bifunctional_NfmocDi_Amines", + "Bifunctional_NTFAcDi_Amines", + "Bifunctional_Di_Amines_NotherCarbamates", + "Trifunctional_Acid_Aldehyde_Nitro", + "Trifunctional_Acid_ArylHalide_Ester", + "Trifunctional_Acid_ArylHalide_Nitro", + "Trifunctional_Amines_ArylHalide_Nitro", + "Trifunctional_NbocAmino_Acid_AlkyneCH", + "Trifunctional_NbocAmino_Acid_ArylHalide", + "Trifunctional_NfmocAmino_Acid_AlkyneCH", + "Trifunctional_NfmocAmino_Acid_ArylHalide", + ] + + __FirstReactionAsPreparation = [ + "Bifunctional_Acid_Aldehyde", + "Bifunctional_Aldehyde_ArylHalide", + "Bifunctional_Aldehyde_SO2X", + "Bifunctional_Boronics_Acid", + "Bifunctional_Boronics_Aldehyde", + "Bifunctional_Hydroxy_Aldehyde", + "Trifunctional_Acid_Aldehyde_ArylHalide", + "Trifunctional_Acid_Aldehyde_Acetylenes", + "Trifunctional_Acid_Aldehyde_Nitro", + "Trifunctional_Amines_ArylHalide_Nitro", + "Trifunctional_NbocAmino_Acid_AlkyneCH", + "Trifunctional_NfmocAmino_Acid_AlkyneCH", + "Trifunctional_Di_Esters_Amino", + ] + + PolymerReagents = [ + "Reagents_PoliOxiranes", + "Esters_PoliEsters", + "Reagents_PoliIsocyanates", + "SulfonylHalides_Poli_Sulfonylhalides", + ] + + trifuncClassesWithTwoPGs = [ + "Trifunctional_Acid_Ester_Nitro", + "Trifunctional_NbocAmino_Acid_Ester", + "Trifunctional_NbocAmino_Acid_Nitro", + "Trifunctional_Amines_Nboc_Ester", + "Trifunctional_Nboc_NCbz_Amino_Acid", + "Trifunctional_Nboc_Nfmoc_Amino_Acid", + "Trifunctional_NfmocAmino_Acid_Ester", + "Trifunctional_NfmocAmino_Acid_Nitro", + "Trifunctional_Di_Esters_Amino", + ] if CurrentClass in trifuncClassesWithTwoPGs or "Trifunctional" in CurrentClass: func = 3 @@ -221,18 +319,22 @@ def __synthonsAssignement(CurrentClass, PreviousClasses, molSmi, MarksSetup, kee if CurrentClass in __FirstReactionAsPreparation: firstReactionAsPrep = True else: - firstReactionAsPrep=False + firstReactionAsPrep = False if CurrentClass in trifuncClassesWithTwoPGs: - twoPGs=True + twoPGs = True else: - twoPGs=False + twoPGs = False labledSynthons = {} mol = readMol(molSmi) if CurrentClass in PolymerReagents: MolsToWorkWith = {Chem.MolToSmiles(mol, canonical=True): PreviousClasses} - for i in range(len(MarksSetup[CurrentClass]['Labels'].split("|"))): - synthons = __SynthonsGeneratorsForPolymerReagents(MarksSetup[CurrentClass]['Labels'].split("|")[i], - MarksSetup[CurrentClass]['SMARTS'].split("|")[i], CurrentClass, MolsToWorkWith) + for i in range(len(MarksSetup[CurrentClass]["Labels"].split("|"))): + synthons = __SynthonsGeneratorsForPolymerReagents( + MarksSetup[CurrentClass]["Labels"].split("|")[i], + MarksSetup[CurrentClass]["SMARTS"].split("|")[i], + CurrentClass, + MolsToWorkWith, + ) if synthons: for synth in synthons: if synth not in labledSynthons: @@ -241,17 +343,36 @@ def __synthonsAssignement(CurrentClass, PreviousClasses, molSmi, MarksSetup, kee labledSynthons[synth].update(synthons[synth]) return labledSynthons elif CurrentClass in PGBifunctional or twoPGs: - synthons = __ProtectiveGroupRemoval(MarksSetup[CurrentClass]['Labels'], MarksSetup[CurrentClass]['SMARTS'], - mol, keepSynthonsWithPG, - firstReactionAsPrep, func, PreviousClasses, CurrentClass, twoPGs) + synthons = __ProtectiveGroupRemoval( + MarksSetup[CurrentClass]["Labels"], + MarksSetup[CurrentClass]["SMARTS"], + mol, + keepSynthonsWithPG, + firstReactionAsPrep, + func, + PreviousClasses, + CurrentClass, + twoPGs, + ) elif firstReactionAsPrep: - synthons = __FirstReactionAsPrep(MarksSetup[CurrentClass]['Labels'], MarksSetup[CurrentClass]['SMARTS'], - PreviousClasses, CurrentClass, mol, func) + synthons = __FirstReactionAsPrep( + MarksSetup[CurrentClass]["Labels"], + MarksSetup[CurrentClass]["SMARTS"], + PreviousClasses, + CurrentClass, + mol, + func, + ) else: - synthons = __NormalSynthonsGenerator(MarksSetup[CurrentClass]['Labels'], MarksSetup[CurrentClass]['SMARTS'], - PreviousClasses, CurrentClass, mol, - func=func) + synthons = __NormalSynthonsGenerator( + MarksSetup[CurrentClass]["Labels"], + MarksSetup[CurrentClass]["SMARTS"], + PreviousClasses, + CurrentClass, + mol, + func=func, + ) if synthons: for synth in synthons: if synth not in labledSynthons: @@ -261,21 +382,27 @@ def __synthonsAssignement(CurrentClass, PreviousClasses, molSmi, MarksSetup, kee return labledSynthons + def __azolesSynthonPostGeneration(labledSynthons): pat = re.compile("\[\w*:\w*\]") Class = "nHAzoles_nHAzoles" additionalSynthons = {} maxMark = 0 for molSmiles in labledSynthons: - marksPrevious = [molSmiles[m.start():m.start() + 2] + molSmiles[m.end() - 4:m.end()] for m in - re.finditer(pat, molSmiles)] - if len(marksPrevious)>maxMark: - maxMark=len(marksPrevious) + marksPrevious = [ + molSmiles[m.start() : m.start() + 2] + molSmiles[m.end() - 4 : m.end()] + for m in re.finditer(pat, molSmiles) + ] + if len(marksPrevious) > maxMark: + maxMark = len(marksPrevious) for molSmiles in labledSynthons: query = Chem.MolFromSmarts("[nHr5;!$(nc=O)]") mol = readMol(molSmiles) - marksPrevious = [molSmiles[m.start():m.start() + 2] + molSmiles[m.end() - 4:m.end()] for m in re.finditer(pat, molSmiles)] - if len(marksPrevious)==maxMark and mol.HasSubstructMatch(query): + marksPrevious = [ + molSmiles[m.start() : m.start() + 2] + molSmiles[m.end() - 4 : m.end()] + for m in re.finditer(pat, molSmiles) + ] + if len(marksPrevious) == maxMark and mol.HasSubstructMatch(query): cuttingRule = Reactions.ReactionFromSmarts("[nH;r5:1]>>*[n:1]") label = "*n->n:20" products = cuttingRule.RunReactants((mol,)) @@ -284,9 +411,11 @@ def __azolesSynthonPostGeneration(labledSynthons): labledSynthon = __getBBLabledSmiles(product, label) if marksPrevious: for synth in labledSynthon: - marksNew = [synth[m.start():m.start() + 2] + synth[m.end() - 4:m.end()] for m in - re.finditer(pat, synth)] - if len(marksNew)>len(marksPrevious): + marksNew = [ + synth[m.start() : m.start() + 2] + synth[m.end() - 4 : m.end()] + for m in re.finditer(pat, synth) + ] + if len(marksNew) > len(marksPrevious): if synth not in additionalSynthons: additionalSynthons[synth] = labledSynthons[molSmiles].copy() else: @@ -301,17 +430,23 @@ def __azolesSynthonPostGeneration(labledSynthons): additionalSynthons[synth].add(Class) return additionalSynthons + def __generateBiacideSynthonForTrifunctional(labledSynthons, Class): additionalSynthons = {} for molSmiles in labledSynthons: query = Chem.MolFromSmarts( - "[O;$(O=C([#6])[OD1])].[O;$(O([CH3])C([#6])=O),$(O([CH2][CH3])C([#6])=O),$(O([CH2]c1[cH][cH][cH][cH][cH]1)C([#6])=O),$(O(C([CH3])([CH3])[CH3])C([#6])=O),$(O([CH2][CH]=[CH2])C([#6])=O)]") + "[O;$(O=C([#6])[OD1])].[O;$(O([CH3])C([#6])=O),$(O([CH2][CH3])C([#6])=O),$(O([CH2]c1[cH][cH][cH][cH][cH]1)C([#6])=O),$(O(C([CH3])([CH3])[CH3])C([#6])=O),$(O([CH2][CH]=[CH2])C([#6])=O)]" + ) mol = readMol(molSmiles) pat = re.compile("\[\w*:\w*\]") - marksPrevious = [molSmiles[m.start():m.start() + 2] + molSmiles[m.end() - 4:m.end()] for m in re.finditer(pat, molSmiles)] + marksPrevious = [ + molSmiles[m.start() : m.start() + 2] + molSmiles[m.end() - 4 : m.end()] + for m in re.finditer(pat, molSmiles) + ] if mol.HasSubstructMatch(query): cuttingRule = Reactions.ReactionFromSmarts( - "[O;$(O(C)C([#6])=O):1][C;$([CH3]),$([CH2][CH3]),$([CH2]c1[cH][cH][cH][cH][cH]1),$(C([CH3])([CH3])[CH3]),$([CH2][CH]=[CH2]):2]>>[OH:1]") + "[O;$(O(C)C([#6])=O):1][C;$([CH3]),$([CH2][CH3]),$([CH2]c1[cH][cH][cH][cH][cH]1),$(C([CH3])([CH3])[CH3]),$([CH2][CH]=[CH2]):2]>>[OH:1]" + ) label = "No" products = cuttingRule.RunReactants((mol,)) for productSet in products: @@ -319,9 +454,11 @@ def __generateBiacideSynthonForTrifunctional(labledSynthons, Class): labledSynthon = __getBBLabledSmiles(product, label) if marksPrevious: for synth in labledSynthon: - marksNew = [synth[m.start():m.start() + 2] + synth[m.end() - 4:m.end()] for m in - re.finditer(pat, synth)] - if len(marksNew)>len(marksPrevious): + marksNew = [ + synth[m.start() : m.start() + 2] + synth[m.end() - 4 : m.end()] + for m in re.finditer(pat, synth) + ] + if len(marksNew) > len(marksPrevious): if synth not in additionalSynthons: additionalSynthons[synth] = labledSynthons[molSmiles].copy() else: @@ -336,14 +473,26 @@ def __generateBiacideSynthonForTrifunctional(labledSynthons, Class): additionalSynthons[synth].add(Class) return additionalSynthons -def __SynthonsGeneratorsForPolymerReagents(Label, rule, Class, MolsToWorkWith, finalSynthons=None, firstLaunch=True, Deprotection=False): - if finalSynthons==None: + +def __SynthonsGeneratorsForPolymerReagents( + Label, + rule, + Class, + MolsToWorkWith, + finalSynthons=None, + firstLaunch=True, + Deprotection=False, +): + if finalSynthons == None: finalSynthons = {} newMolsToWorkWith = {} cuttingRule = Reactions.ReactionFromSmarts(rule) for molSmiles in MolsToWorkWith: pat = re.compile("\[\w*:\w*\]") - marksPrevious = [molSmiles[m.start():m.start() + 2] + molSmiles[m.end() - 4:m.end()] for m in re.finditer(pat, molSmiles)] + marksPrevious = [ + molSmiles[m.start() : m.start() + 2] + molSmiles[m.end() - 4 : m.end()] + for m in re.finditer(pat, molSmiles) + ] products = cuttingRule.RunReactants((Chem.MolFromSmiles(molSmiles),)) if not products and not firstLaunch: if molSmiles not in finalSynthons: @@ -356,8 +505,10 @@ def __SynthonsGeneratorsForPolymerReagents(Label, rule, Class, MolsToWorkWith, f labledSynthons = __getBBLabledSmiles(product, Label) if marksPrevious and not Deprotection: for synth in labledSynthons: - marksNew = [synth[m.start():m.start() + 2] + synth[m.end() - 4:m.end()] for m in - re.finditer(pat, synth)] + marksNew = [ + synth[m.start() : m.start() + 2] + synth[m.end() - 4 : m.end()] + for m in re.finditer(pat, synth) + ] if len(marksNew) > len(marksPrevious): if synth not in newMolsToWorkWith: newMolsToWorkWith[synth] = MolsToWorkWith[molSmiles].copy() @@ -372,41 +523,86 @@ def __SynthonsGeneratorsForPolymerReagents(Label, rule, Class, MolsToWorkWith, f newMolsToWorkWith[synth].update(MolsToWorkWith[molSmiles]) newMolsToWorkWith[synth].add(Class) if newMolsToWorkWith: - __SynthonsGeneratorsForPolymerReagents(Label, rule, Class, newMolsToWorkWith, finalSynthons, firstLaunch=False, - Deprotection=Deprotection) + __SynthonsGeneratorsForPolymerReagents( + Label, + rule, + Class, + newMolsToWorkWith, + finalSynthons, + firstLaunch=False, + Deprotection=Deprotection, + ) if firstLaunch: return finalSynthons -def __ProtectiveGroupRemoval(LabelsLIST, ReactionLIST, mol, keepSynthonsWithPG, firstReactionAsPrep, func, PreviousClasses, - CurrentClass, twoPGs=False): + +def __ProtectiveGroupRemoval( + LabelsLIST, + ReactionLIST, + mol, + keepSynthonsWithPG, + firstReactionAsPrep, + func, + PreviousClasses, + CurrentClass, + twoPGs=False, +): LabelsLISTBeforePGRemoval = LabelsLIST.split("|No|")[0] - ReactionLISTBeforePGRemoval = ReactionLIST.split("|")[:len(LabelsLISTBeforePGRemoval.split("|"))] + ReactionLISTBeforePGRemoval = ReactionLIST.split("|")[ + : len(LabelsLISTBeforePGRemoval.split("|")) + ] firstStopInd = len(LabelsLISTBeforePGRemoval.split("|")) finalSynthons = {} if firstReactionAsPrep: - synthonsBeforeFirstPGremoval = __FirstReactionAsPrep(LabelsLISTBeforePGRemoval, "|".join(ReactionLISTBeforePGRemoval), - PreviousClasses, CurrentClass, mol, func) + synthonsBeforeFirstPGremoval = __FirstReactionAsPrep( + LabelsLISTBeforePGRemoval, + "|".join(ReactionLISTBeforePGRemoval), + PreviousClasses, + CurrentClass, + mol, + func, + ) else: - if "Ester" in CurrentClass and "Acid" in CurrentClass and func==3 and not twoPGs: - synthonsBeforeFirstPGremoval = __NormalSynthonsGenerator(LabelsLISTBeforePGRemoval, - "|".join(ReactionLISTBeforePGRemoval), - PreviousClasses, CurrentClass, mol, func=3) - elif (func==3 and twoPGs) or func==2: - - synthonsBeforeFirstPGremoval = __NormalSynthonsGenerator(LabelsLISTBeforePGRemoval, "|".join(ReactionLISTBeforePGRemoval), - PreviousClasses, CurrentClass, mol, func=1) - - elif func==3: - synthonsBeforeFirstPGremoval = __NormalSynthonsGenerator(LabelsLISTBeforePGRemoval, - "|".join(ReactionLISTBeforePGRemoval), - PreviousClasses, CurrentClass, mol, func=2) + if "Ester" in CurrentClass and "Acid" in CurrentClass and func == 3 and not twoPGs: + synthonsBeforeFirstPGremoval = __NormalSynthonsGenerator( + LabelsLISTBeforePGRemoval, + "|".join(ReactionLISTBeforePGRemoval), + PreviousClasses, + CurrentClass, + mol, + func=3, + ) + elif (func == 3 and twoPGs) or func == 2: + + synthonsBeforeFirstPGremoval = __NormalSynthonsGenerator( + LabelsLISTBeforePGRemoval, + "|".join(ReactionLISTBeforePGRemoval), + PreviousClasses, + CurrentClass, + mol, + func=1, + ) + + elif func == 3: + synthonsBeforeFirstPGremoval = __NormalSynthonsGenerator( + LabelsLISTBeforePGRemoval, + "|".join(ReactionLISTBeforePGRemoval), + PreviousClasses, + CurrentClass, + mol, + func=2, + ) if CurrentClass == "Trifunctional_Di_Esters_Amino": - SynthonsWithoutPG = __SynthonsGeneratorsForPolymerReagents(LabelsLIST.split("|")[2], - ReactionLIST.split("|")[2], CurrentClass, - synthonsBeforeFirstPGremoval, Deprotection=True) + SynthonsWithoutPG = __SynthonsGeneratorsForPolymerReagents( + LabelsLIST.split("|")[2], + ReactionLIST.split("|")[2], + CurrentClass, + synthonsBeforeFirstPGremoval, + Deprotection=True, + ) if SynthonsWithoutPG: for synth in SynthonsWithoutPG: @@ -414,9 +610,12 @@ def __ProtectiveGroupRemoval(LabelsLIST, ReactionLIST, mol, keepSynthonsWithPG, finalSynthons[synth] = SynthonsWithoutPG[synth].copy() else: finalSynthons[synth].update(SynthonsWithoutPG[synth]) - lastSynthons = __SynthonsGeneratorsForPolymerReagents(LabelsLIST.split("|")[3], - ReactionLIST.split("|")[3], - CurrentClass, SynthonsWithoutPG) + lastSynthons = __SynthonsGeneratorsForPolymerReagents( + LabelsLIST.split("|")[3], + ReactionLIST.split("|")[3], + CurrentClass, + SynthonsWithoutPG, + ) if lastSynthons: for synth in lastSynthons: if synth not in finalSynthons: @@ -425,7 +624,11 @@ def __ProtectiveGroupRemoval(LabelsLIST, ReactionLIST, mol, keepSynthonsWithPG, finalSynthons[synth].update(lastSynthons[synth]) return finalSynthons PGremovalRule = ReactionLIST.split("|")[firstStopInd] - if keepSynthonsWithPG or PGremovalRule =="[N;+0,+1;$([N+](=O)([#6])[O-]),$(N(=O)([#6])=O):1](=[O:2])=,-[O;+0,-1:3]>>[NH2,+0:1]": + if ( + keepSynthonsWithPG + or PGremovalRule + == "[N;+0,+1;$([N+](=O)([#6])[O-]),$(N(=O)([#6])=O):1](=[O:2])=,-[O;+0,-1:3]>>[NH2,+0:1]" + ): for synth in synthonsBeforeFirstPGremoval: if synth not in finalSynthons: finalSynthons[synth] = synthonsBeforeFirstPGremoval[synth].copy() @@ -435,7 +638,13 @@ def __ProtectiveGroupRemoval(LabelsLIST, ReactionLIST, mol, keepSynthonsWithPG, PGlable = LabelsLIST.split("|")[firstStopInd] SynthonsWithoutPG = {} for smi in synthonsBeforeFirstPGremoval: - if func==3 and not twoPGs and smi.count(":")<2 and "Ester" not in CurrentClass and "AlkyneCH" not in CurrentClass: #this force algorythm to use for the PG removal step only synthons where all unprotected functional groups has been transformed into synthons + if ( + func == 3 + and not twoPGs + and smi.count(":") < 2 + and "Ester" not in CurrentClass + and "AlkyneCH" not in CurrentClass + ): # this force algorythm to use for the PG removal step only synthons where all unprotected functional groups has been transformed into synthons continue products = cuttingRule.RunReactants((Chem.MolFromSmiles(smi),)) for productSet in products: @@ -447,24 +656,41 @@ def __ProtectiveGroupRemoval(LabelsLIST, ReactionLIST, mol, keepSynthonsWithPG, SynthonsWithoutPG[synth].update(PreviousClasses) SynthonsWithoutPG[synth].add(CurrentClass) LabelsLISTBetweenPGRemoval = LabelsLIST.split("|No|")[1] - ReactionLISTBetweenPGRemoval = ReactionLIST.split("|")[len(LabelsLISTBeforePGRemoval.split("|")) + 1:len( - LabelsLISTBeforePGRemoval.split("|")) + len(LabelsLISTBetweenPGRemoval.split("|")) + 1] + ReactionLISTBetweenPGRemoval = ReactionLIST.split("|")[ + len(LabelsLISTBeforePGRemoval.split("|")) + + 1 : len(LabelsLISTBeforePGRemoval.split("|")) + + len(LabelsLISTBetweenPGRemoval.split("|")) + + 1 + ] synthonsBetweenPGremoval = SynthonsWithoutPG.copy() for newSynthon in SynthonsWithoutPG: - newSynthonsBetweenPGremoval = __NormalSynthonsGenerator(LabelsLISTBetweenPGRemoval, - "|".join(ReactionLISTBetweenPGRemoval), - PreviousClasses, CurrentClass, Chem.MolFromSmiles(newSynthon), func=1) + newSynthonsBetweenPGremoval = __NormalSynthonsGenerator( + LabelsLISTBetweenPGRemoval, + "|".join(ReactionLISTBetweenPGRemoval), + PreviousClasses, + CurrentClass, + Chem.MolFromSmiles(newSynthon), + func=1, + ) if newSynthonsBetweenPGremoval: for synth in newSynthonsBetweenPGremoval: if synth not in synthonsBetweenPGremoval: synthonsBetweenPGremoval[synth] = newSynthonsBetweenPGremoval[synth].copy() else: synthonsBetweenPGremoval[synth].update(newSynthonsBetweenPGremoval[synth]) - if len(LabelsLIST.split("|No|"))==3: - secondStopInd = len(LabelsLISTBeforePGRemoval.split("|")) + len(LabelsLISTBetweenPGRemoval.split("|")) + 1 + if len(LabelsLIST.split("|No|")) == 3: + secondStopInd = ( + len(LabelsLISTBeforePGRemoval.split("|")) + + len(LabelsLISTBetweenPGRemoval.split("|")) + + 1 + ) PGremovalRule = ReactionLIST.split("|")[secondStopInd] - if len(LabelsLIST.split("|No|"))==2 or keepSynthonsWithPG or \ - PGremovalRule =="[N;+0,+1;$([N+](=O)([#6])[O-]),$(N(=O)([#6])=O):1](=[O:2])=,-[O;+0,-1:3]>>[NH2,+0:1]": + if ( + len(LabelsLIST.split("|No|")) == 2 + or keepSynthonsWithPG + or PGremovalRule + == "[N;+0,+1;$([N+](=O)([#6])[O-]),$(N(=O)([#6])=O):1](=[O:2])=,-[O;+0,-1:3]>>[NH2,+0:1]" + ): for synth in SynthonsWithoutPG: if synth not in finalSynthons: finalSynthons[synth] = SynthonsWithoutPG[synth].copy() @@ -475,7 +701,7 @@ def __ProtectiveGroupRemoval(LabelsLIST, ReactionLIST, mol, keepSynthonsWithPG, finalSynthons[synth] = synthonsBetweenPGremoval[synth].copy() else: finalSynthons[synth].update(synthonsBetweenPGremoval[synth]) - if len(LabelsLIST.split("|No|"))==3: + if len(LabelsLIST.split("|No|")) == 3: cuttingRule = Reactions.ReactionFromSmarts(PGremovalRule) PGlable = LabelsLIST.split("|")[secondStopInd] SynthonsWithout2PG = {} @@ -497,11 +723,22 @@ def __ProtectiveGroupRemoval(LabelsLIST, ReactionLIST, mol, keepSynthonsWithPG, else: finalSynthons[synth].update(SynthonsWithout2PG[synth]) LabelsLast = LabelsLIST.split("|No|")[2] - ReactionLast = ReactionLIST.split("|")[len(LabelsLISTBeforePGRemoval.split("|")) + 1 + len(LabelsLISTBetweenPGRemoval.split("|"))+1:] + ReactionLast = ReactionLIST.split("|")[ + len(LabelsLISTBeforePGRemoval.split("|")) + + 1 + + len(LabelsLISTBetweenPGRemoval.split("|")) + + 1 : + ] for newSynthon in SynthonsWithout2PG: - lastSynthons = __NormalSynthonsGenerator(LabelsLast,"|".join(ReactionLast), - PreviousClasses, CurrentClass, Chem.MolFromSmiles(newSynthon), func=1) - #lastSynthons.extend(__FirstReactionAsPrep(LabelsLast, "|".join(ReactionLast),Chem.MolFromSmiles(newSynthon), func=1, Class=Class) + lastSynthons = __NormalSynthonsGenerator( + LabelsLast, + "|".join(ReactionLast), + PreviousClasses, + CurrentClass, + Chem.MolFromSmiles(newSynthon), + func=1, + ) + # lastSynthons.extend(__FirstReactionAsPrep(LabelsLast, "|".join(ReactionLast),Chem.MolFromSmiles(newSynthon), func=1, Class=Class) if lastSynthons: for synth in lastSynthons: if synth not in finalSynthons: @@ -510,13 +747,19 @@ def __ProtectiveGroupRemoval(LabelsLIST, ReactionLIST, mol, keepSynthonsWithPG, finalSynthons[synth].update(lastSynthons[synth]) return finalSynthons -def __NormalSynthonsGenerator(LabelsLIST, ReactionLIST, PreviousClasses, CurrentClass, mol, func=1, usedInds = None): + +def __NormalSynthonsGenerator( + LabelsLIST, ReactionLIST, PreviousClasses, CurrentClass, mol, func=1, usedInds=None +): if usedInds == None: usedInds = [] labledSynthons = {} pat = re.compile("\[\w*:\w*\]") molSmiles = Chem.MolToSmiles(mol, canonical=True) - marksPrevious = [molSmiles[m.start():m.start() + 2] + molSmiles[m.end() - 4:m.end()] for m in re.finditer(pat, molSmiles)] + marksPrevious = [ + molSmiles[m.start() : m.start() + 2] + molSmiles[m.end() - 4 : m.end()] + for m in re.finditer(pat, molSmiles) + ] for ind, rule in enumerate(ReactionLIST.split("|")): if ind not in usedInds: try: @@ -531,14 +774,16 @@ def __NormalSynthonsGenerator(LabelsLIST, ReactionLIST, PreviousClasses, Current for productSet in products: for product in productSet: labledSynthon = __getBBLabledSmiles(product, Label) - if labledSynthon==None: + if labledSynthon == None: print(Chem.MolToSmiles(mol, canonical=True)) exit() if marksPrevious: for synth in labledSynthon: - marksNew = [synth[m.start():m.start() + 2] + synth[m.end() - 4:m.end()] for m in - re.finditer(pat, synth)] - if len(marksNew)>len(marksPrevious): + marksNew = [ + synth[m.start() : m.start() + 2] + synth[m.end() - 4 : m.end()] + for m in re.finditer(pat, synth) + ] + if len(marksNew) > len(marksPrevious): if synth not in labledSynthons: labledSynthons[synth] = set() labledSynthons[synth].update(PreviousClasses) @@ -553,13 +798,27 @@ def __NormalSynthonsGenerator(LabelsLIST, ReactionLIST, PreviousClasses, Current if func == 2: newMol = Chem.MolFromSmiles(labledSynthon[0]) usedInds.append(ind) - newSynthons = __NormalSynthonsGenerator(LabelsLIST, ReactionLIST, PreviousClasses, CurrentClass, newMol, func=1, - usedInds=usedInds) + newSynthons = __NormalSynthonsGenerator( + LabelsLIST, + ReactionLIST, + PreviousClasses, + CurrentClass, + newMol, + func=1, + usedInds=usedInds, + ) elif func == 3: newMol = Chem.MolFromSmiles(labledSynthon[0]) usedInds.append(ind) - newSynthons = __NormalSynthonsGenerator(LabelsLIST, ReactionLIST, PreviousClasses, CurrentClass, newMol, func=2, - usedInds=usedInds) + newSynthons = __NormalSynthonsGenerator( + LabelsLIST, + ReactionLIST, + PreviousClasses, + CurrentClass, + newMol, + func=2, + usedInds=usedInds, + ) if newSynthons: for synth in newSynthons: if synth not in labledSynthons: @@ -568,14 +827,21 @@ def __NormalSynthonsGenerator(LabelsLIST, ReactionLIST, PreviousClasses, Current labledSynthons[synth].update(newSynthons[synth]) return labledSynthons + def __FirstReactionAsPrep(LabelsLIST, ReactionLIST, PreviousClasses, CurrentClass, mol, func): rule = ReactionLIST.split("|")[0] cuttingRule = Reactions.ReactionFromSmarts(rule) products = cuttingRule.RunReactants((mol,)) if not products: - if len(LabelsLIST.split("|"))>1: - return __FirstReactionAsPrep("|".join(LabelsLIST.split("|")[1:]), "|".join(ReactionLIST.split("|")[1:]), - PreviousClasses, CurrentClass, mol, func) + if len(LabelsLIST.split("|")) > 1: + return __FirstReactionAsPrep( + "|".join(LabelsLIST.split("|")[1:]), + "|".join(ReactionLIST.split("|")[1:]), + PreviousClasses, + CurrentClass, + mol, + func, + ) else: return {Chem.MolToSmiles(mol, canonical=True): PreviousClasses} else: @@ -591,7 +857,7 @@ def __FirstReactionAsPrep(LabelsLIST, ReactionLIST, PreviousClasses, CurrentClas synthonsAsInpForTheNextStep[synth].update(PreviousClasses) synthonsAsInpForTheNextStep[synth].add(CurrentClass) - if len(ReactionLIST.split("|"))==1: + if len(ReactionLIST.split("|")) == 1: return synthonsAsInpForTheNextStep if not synthonsAsInpForTheNextStep and "Boronics" in CurrentClass: rule = ReactionLIST.split("|")[1] @@ -608,8 +874,14 @@ def __FirstReactionAsPrep(LabelsLIST, ReactionLIST, PreviousClasses, CurrentClas synthonsAsInpForTheNextStep[synth].update(PreviousClasses) synthonsAsInpForTheNextStep[synth].add(CurrentClass) for synth in synthonsAsInpForTheNextStep: - labledSynthons = __NormalSynthonsGenerator("|".join(LabelsLIST.split("|")[1:]), "|".join(ReactionLIST.split("|")[1:]), - PreviousClasses, CurrentClass, Chem.MolFromSmiles(synth), func=func - 1) + labledSynthons = __NormalSynthonsGenerator( + "|".join(LabelsLIST.split("|")[1:]), + "|".join(ReactionLIST.split("|")[1:]), + PreviousClasses, + CurrentClass, + Chem.MolFromSmiles(synth), + func=func - 1, + ) if labledSynthons: for synth in labledSynthons: if synth not in synthonsAsInpForTheNextStep: @@ -619,8 +891,14 @@ def __FirstReactionAsPrep(LabelsLIST, ReactionLIST, PreviousClasses, CurrentClas return synthonsAsInpForTheNextStep lastSynthons = {} for synth in synthonsAsInpForTheNextStep: - labledSynthons = __NormalSynthonsGenerator("|".join(LabelsLIST.split("|")[1:]), "|".join(ReactionLIST.split("|")[1:]), - PreviousClasses, CurrentClass, Chem.MolFromSmiles(synth), func=func-1) + labledSynthons = __NormalSynthonsGenerator( + "|".join(LabelsLIST.split("|")[1:]), + "|".join(ReactionLIST.split("|")[1:]), + PreviousClasses, + CurrentClass, + Chem.MolFromSmiles(synth), + func=func - 1, + ) if labledSynthons: for synth in labledSynthons: if synth not in lastSynthons: @@ -634,14 +912,15 @@ def __FirstReactionAsPrep(LabelsLIST, ReactionLIST, PreviousClasses, CurrentClas synthonsAsInpForTheNextStep[synth].update(lastSynthons[synth]) return synthonsAsInpForTheNextStep -def __getBBLabledSmiles(productMolecule: Chem.rdchem.Mol, Label:str): + +def __getBBLabledSmiles(productMolecule: Chem.rdchem.Mol, Label: str): productSmiles = Chem.MolToSmiles(productMolecule, canonical=True) labeledSmilesList = [] if Label != "No": for sublabel in Label.split(","): if productSmiles.find(sublabel.split("->")[0]) != -1: labeledSmiles = checkLable(productSmiles, sublabel) - if labeledSmiles==None: + if labeledSmiles == None: return None if "*" in labeledSmiles: productSmiles = labeledSmiles @@ -650,38 +929,43 @@ def __getBBLabledSmiles(productMolecule: Chem.rdchem.Mol, Label:str): labeledSmilesList.append(labeledSmiles) if labeledSmilesList: return list(set(labeledSmilesList)) - #print("WARNING! No lable was assigned to the smiles: " + productSmiles) + # print("WARNING! No lable was assigned to the smiles: " + productSmiles) return [productSmiles] + def __getReactionSMARTS(BB_Marks: ET.Element): MarksSetup = {} for child in BB_Marks: for subCh in child: - if subCh.get('SMARTS'): + if subCh.get("SMARTS"): MarksSetup[child.tag + "_" + subCh.tag] = {} - MarksSetup[child.tag + "_" + subCh.tag]["SMARTS"] = subCh.get('SMARTS') - MarksSetup[child.tag + "_" + subCh.tag]["Labels"] = subCh.get('Labels') + MarksSetup[child.tag + "_" + subCh.tag]["SMARTS"] = subCh.get("SMARTS") + MarksSetup[child.tag + "_" + subCh.tag]["Labels"] = subCh.get("Labels") return MarksSetup + def generateScaffoldForBB(smiles, returnObjects=False): scaffold = None mol = None - PGdict = {"NCbz": "[N:1][C;$(C(=O)O[CH2]c1[cH][cH][cH][cH][cH]1):2]>>[N:1]", - "NFmoc": "[N:1][C;$(C(=O)O[CH2][CH]1c2[cH][cH][cH][cH]c2-c3[cH][cH][cH][cH]c13):2]>>[N:1]", - "NBnz": "[N;+0;$(N[CH2]c1[cH][cH][cH][cH][cH]1);!$(N[C,S,P]=[O,S,N]):1][C;$([CH2]c1[cH][cH][cH][cH][cH]1):2]>>[N:1]", - "COOBnz": "[O;$(O(C)C([#6])=O):1][C;$([CH2]c1[cH][cH][cH][cH][cH]1):2]>>[OH:1]", - "Boronics": "[B;$(B(O@C)O@C):1][#6:2]>>[#6:2]", - "Oxiranes": "[C:1]1[O:2][C:3]1>>[C:1]([OH:2])[C;+0:3]"} + PGdict = { + "NCbz": "[N:1][C;$(C(=O)O[CH2]c1[cH][cH][cH][cH][cH]1):2]>>[N:1]", + "NFmoc": "[N:1][C;$(C(=O)O[CH2][CH]1c2[cH][cH][cH][cH]c2-c3[cH][cH][cH][cH]c13):2]>>[N:1]", + "NBnz": "[N;+0;$(N[CH2]c1[cH][cH][cH][cH][cH]1);!$(N[C,S,P]=[O,S,N]):1][C;$([CH2]c1[cH][cH][cH][cH][cH]1):2]>>[N:1]", + "COOBnz": "[O;$(O(C)C([#6])=O):1][C;$([CH2]c1[cH][cH][cH][cH][cH]1):2]>>[OH:1]", + "Boronics": "[B;$(B(O@C)O@C):1][#6:2]>>[#6:2]", + "Oxiranes": "[C:1]1[O:2][C:3]1>>[C:1]([OH:2])[C;+0:3]", + } mol = readMol(smiles) if mol: for pg in PGdict: mol = __removePGforScaffolds(PGdict[pg], mol) scaffold = MurckoScaffoldSmiles(mol=mol) if returnObjects: - return scaffold,mol + return scaffold, mol else: return scaffold + def __removePGforScaffolds(reactionRule, mol): q = Chem.MolFromSmarts(reactionRule.split(">>")[0]) cuttingRule = Reactions.ReactionFromSmarts(reactionRule) @@ -693,6 +977,7 @@ def __removePGforScaffolds(reactionRule, mol): mol.GetRingInfo().NumRings() return mol + """def __checkLable(productSmiles:str, Label:str): goodValenceSmiles = None if Label.split("->")[0][1] == "S": @@ -732,4 +1017,3 @@ def __removePGforScaffolds(reactionRule, mol): print("Problem with structure check: " + productSmiles + " " + out) return None return generateMajorTautFromSynthonSmiles(goodValenceSmiles)""" - diff --git a/src/SyntOn_Classifier.py b/src/SyntOn_Classifier.py index cfafcbb..6d1858e 100644 --- a/src/SyntOn_Classifier.py +++ b/src/SyntOn_Classifier.py @@ -1,10 +1,15 @@ +import json +import os +import sys + from rdkit import Chem from rdkit.Chem.rdMolDescriptors import * -import os, json,sys + srcPath = os.path.split(os.path.realpath(__file__))[0] sys.path.insert(1, srcPath) from UsefulFunctions import * + def main(args): for ind, line in enumerate(open(args.input)): sline = line.strip() @@ -25,29 +30,37 @@ def main(args): def BBClassifier(molSmiles=None, mol=None): - SMARTSLib = os.path.join(os.path.split(os.path.split(os.path.realpath(__file__))[0])[0], "config" ,"SMARTSLibNew.json") + SMARTSLib = os.path.join( + os.path.split(os.path.split(os.path.realpath(__file__))[0])[0], + "config", + "SMARTSLibNew.json", + ) Classes = [] with open(SMARTSLib) as input: SmartsLib = json.load(input) - if molSmiles!=None and mol==None: + if molSmiles != None and mol == None: mol = readMol(molSmiles) if mol != None: mol.UpdatePropertyCache() else: print(molSmiles + " was not processed by rdkit") return None - elif molSmiles==None and mol==None: + elif molSmiles == None and mol == None: print("ERROR! Input Smiles or Mol object should be provided") exit() for bigClass in SmartsLib: for subClass in SmartsLib[bigClass]: - if __classChecker(SmartsLib[bigClass][subClass]["ShouldContainAtLeastOne"], - SmartsLib[bigClass][subClass]["ShouldAlsoContain"], - SmartsLib[bigClass][subClass]["shouldNotContain"], mol): + if __classChecker( + SmartsLib[bigClass][subClass]["ShouldContainAtLeastOne"], + SmartsLib[bigClass][subClass]["ShouldAlsoContain"], + SmartsLib[bigClass][subClass]["shouldNotContain"], + mol, + ): Classes.append(bigClass + "_" + subClass) return Classes + def __classChecker(ShouldContainAtLeastOne, ShouldAlsoContain, shouldNotContain, mol): match = False for query1 in ShouldContainAtLeastOne: @@ -61,27 +74,28 @@ def __classChecker(ShouldContainAtLeastOne, ShouldAlsoContain, shouldNotContain, break if match and shouldNotContain: for query3 in shouldNotContain: - q3 =Chem.MolFromSmarts(query3) - #q3.UpdatePropertyCache() + q3 = Chem.MolFromSmarts(query3) + # q3.UpdatePropertyCache() ttt = mol.HasSubstructMatch(q3) if ttt: match = False return match - -if __name__ == '__main__': +if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description="Classification of building blocks. Separates provided library into several sublibraries according to the reagents classess.", - epilog="Code implementation: Yuliana Zabolotna, Alexandre Varnek\n" - " Laboratoire de Chémoinformatique, Université de Strasbourg.\n\n" - "Knowledge base (SMARTS library): Dmitriy M.Volochnyuk, Sergey V.Ryabukhin, Kostiantyn Gavrylenko, Olexandre Oksiuta\n" - " Institute of Organic Chemistry, National Academy of Sciences of Ukraine\n" - " Kyiv National Taras Shevchenko University\n" - "2021 Strasbourg, Kiev", - prog="SyntOn_Classifier", formatter_class=argparse.RawTextHelpFormatter) + parser = argparse.ArgumentParser( + description="Classification of building blocks. Separates provided library into several sublibraries according to the reagents classess.", + epilog="Code implementation: Yuliana Zabolotna, Alexandre Varnek\n" + " Laboratoire de Chémoinformatique, Université de Strasbourg.\n\n" + "Knowledge base (SMARTS library): Dmitriy M.Volochnyuk, Sergey V.Ryabukhin, Kostiantyn Gavrylenko, Olexandre Oksiuta\n" + " Institute of Organic Chemistry, National Academy of Sciences of Ukraine\n" + " Kyiv National Taras Shevchenko University\n" + "2021 Strasbourg, Kiev", + prog="SyntOn_Classifier", + formatter_class=argparse.RawTextHelpFormatter, + ) parser.add_argument("-i", "--input", type=str, help="Input SMILES file") args = parser.parse_args() main(args) - diff --git a/src/UsefulFunctions.py b/src/UsefulFunctions.py index e1fce3b..bdb9f75 100644 --- a/src/UsefulFunctions.py +++ b/src/UsefulFunctions.py @@ -1,16 +1,24 @@ +import datetime +import os +import random +import re +import resource +import sys +import time import xml.etree.ElementTree as ET +from collections import Counter +from multiprocessing import Process, Queue + from rdkit import Chem, DataStructs -from rdkit.Chem import rdChemReactions as Reactions from rdkit.Chem import AddHs, AllChem -from rdkit.Chem.MolStandardize import rdMolStandardize -from rdkit.Chem.rdmolops import * #is needed -from rdkit.Chem.rdMolDescriptors import CalcNumRings +from rdkit.Chem import rdChemReactions as Reactions +from rdkit.Chem.Crippen import MolLogP from rdkit.Chem.Descriptors import * +from rdkit.Chem.MolStandardize import rdMolStandardize from rdkit.Chem.rdMolDescriptors import * -from rdkit.Chem.Crippen import MolLogP -import datetime, os, time, random, re, resource, sys -from multiprocessing import Process, Queue -from collections import Counter +from rdkit.Chem.rdMolDescriptors import CalcNumRings +from rdkit.Chem.rdmolops import * # is needed + def readMol(smiles): try: @@ -66,18 +74,20 @@ def readMol(smiles): else: return None + def countLines(file): lines = 0 f = open(file, "rb") - buf_size = 1024*1024 + buf_size = 1024 * 1024 read_f = f.raw.read buf = read_f(buf_size) while buf: - lines += buf.count(b'\n') + lines += buf.count(b"\n") buf = read_f(buf_size) f.close() return lines + def splitFileByLines(inpFile, outName, linesPerFile): n = 0 smallFile = None @@ -86,7 +96,7 @@ def splitFileByLines(inpFile, outName, linesPerFile): if ind % linesPerFile == 0: if smallFile: smallFile.close() - n+=1 + n += 1 smallFile = open(outName + "_" + str(n), "w") outNamesList.append(outName + "_" + str(n)) smallFile.write(line) @@ -94,6 +104,7 @@ def splitFileByLines(inpFile, outName, linesPerFile): smallFile.close() return outNamesList + def listDir(path): d_names = [] f_names = [] @@ -104,13 +115,20 @@ def listDir(path): break return d_names, f_names, main_dir + def Ro2Filtration(synthonSmiles): mol = Chem.MolFromSmiles(synthonSmiles) functionality = synthonSmiles.count(":") Bivalent_electrophilic = synthonSmiles.count(":30") Bivalent_nucleophilic = synthonSmiles.count(":40") Bivalent_neutral = synthonSmiles.count(":50") - MolW = ExactMolWt(mol) - functionality - Bivalent_electrophilic - Bivalent_nucleophilic - Bivalent_neutral + MolW = ( + ExactMolWt(mol) + - functionality + - Bivalent_electrophilic + - Bivalent_nucleophilic + - Bivalent_neutral + ) LogP = MolLogP(mol) HDC = CalcNumHBD(mol) HAC = CalcNumHBA(mol) @@ -121,9 +139,20 @@ def Ro2Filtration(synthonSmiles): count += synthonSmiles.count(m) HDC -= count if MolW > 200 or LogP > 2 or HDC > 2 or HAC > 4: - return False, ["MolW=" + str(MolW), "LogP=" + str(LogP), "HDC=" + str(HDC), "HAC=" + str(HAC)] + return False, [ + "MolW=" + str(MolW), + "LogP=" + str(LogP), + "HDC=" + str(HDC), + "HAC=" + str(HAC), + ] else: - return True, ["MolW=" + str(MolW), "LogP=" + str(LogP), "HDC=" + str(HDC), "HAC=" + str(HAC)] + return True, [ + "MolW=" + str(MolW), + "LogP=" + str(LogP), + "HDC=" + str(HDC), + "HAC=" + str(HAC), + ] + def CheckMolStructure(goodValenceSmiles, label): vallences = {"C": 4, "N": 3, "N+": 4, "O": 2, "S:10": 6, "S:20": 2} @@ -140,14 +169,18 @@ def CheckMolStructure(goodValenceSmiles, label): if atom.GetTotalValence() < vallences[symbol]: return False elif symbol == "S": - if atom.GetTotalValence() < vallences[symbol + ":" + str(atom.GetAtomMapNum())]: + if ( + atom.GetTotalValence() + < vallences[symbol + ":" + str(atom.GetAtomMapNum())] + ): return False elif atom.GetTotalValence() < vallences["N+"]: - return False + return False return True else: return False + def generateMajorTautFromSynthonSmiles(initSmiles): enumerator = rdMolStandardize.TautomerEnumerator() initMol = Chem.MolFromSmiles(initSmiles) @@ -171,52 +204,85 @@ def generateMajorTautFromSynthonSmiles(initSmiles): else: return initSmiles -def checkLable(productSmiles:str, Label:str): - goodValenceSmiles = None - if Label.split("->")[0][1] == "S": - hCount = 1 - out = productSmiles.replace(Label.split("->")[0], - "[" + Label.split("->")[1].split(":")[0] + "H" + str(hCount) + ":" + - Label.split("->")[1].split(":")[1] + "]") - goodValenceSmiles = out - else: - for hCount in range(1, 5): - if hCount == 1: - if "+" in Label and "H" not in Label: - out = productSmiles.replace(Label.split("->")[0], - "[" + Label.split("->")[1].split(":")[0] + "+:" + - Label.split("->")[1].split(":")[1] + "]") - - else: - out = productSmiles.replace(Label.split("->")[0], - "[" + Label.split("->")[1].split(":")[0] + ":" + - Label.split("->")[1].split(":")[1] + "]") - - check = CheckMolStructure(out, Label) - if check: - goodValenceSmiles = out - break + +def checkLable(productSmiles: str, Label: str): + goodValenceSmiles = None + if Label.split("->")[0][1] == "S": + hCount = 1 + out = productSmiles.replace( + Label.split("->")[0], + "[" + + Label.split("->")[1].split(":")[0] + + "H" + + str(hCount) + + ":" + + Label.split("->")[1].split(":")[1] + + "]", + ) + goodValenceSmiles = out + else: + for hCount in range(1, 5): + if hCount == 1: if "+" in Label and "H" not in Label: - out = productSmiles.replace(Label.split("->")[0], - "[" + Label.split("->")[1].split(":")[0] + "H" + str(hCount) + "+:" + - Label.split("->")[1].split(":")[1] + "]") - else: - out = productSmiles.replace(Label.split("->")[0], - "[" + Label.split("->")[1].split(":")[0] + "H" + str(hCount) + ":" + - Label.split("->")[1].split(":")[1] + "]") + out = productSmiles.replace( + Label.split("->")[0], + "[" + + Label.split("->")[1].split(":")[0] + + "+:" + + Label.split("->")[1].split(":")[1] + + "]", + ) - newMol = Chem.MolFromSmiles(out) - if not newMol: - break else: + out = productSmiles.replace( + Label.split("->")[0], + "[" + + Label.split("->")[1].split(":")[0] + + ":" + + Label.split("->")[1].split(":")[1] + + "]", + ) + + check = CheckMolStructure(out, Label) + if check: goodValenceSmiles = out - check = CheckMolStructure(goodValenceSmiles, Label) - if check: - break - if not goodValenceSmiles: - print("Problem with structure check: " + productSmiles + " " + out) - else: - return generateMajorTautFromSynthonSmiles(goodValenceSmiles) + break + if "+" in Label and "H" not in Label: + out = productSmiles.replace( + Label.split("->")[0], + "[" + + Label.split("->")[1].split(":")[0] + + "H" + + str(hCount) + + "+:" + + Label.split("->")[1].split(":")[1] + + "]", + ) + else: + out = productSmiles.replace( + Label.split("->")[0], + "[" + + Label.split("->")[1].split(":")[0] + + "H" + + str(hCount) + + ":" + + Label.split("->")[1].split(":")[1] + + "]", + ) + + newMol = Chem.MolFromSmiles(out) + if not newMol: + break + else: + goodValenceSmiles = out + check = CheckMolStructure(goodValenceSmiles, Label) + if check: + break + if not goodValenceSmiles: + print("Problem with structure check: " + productSmiles + " " + out) + else: + return generateMajorTautFromSynthonSmiles(goodValenceSmiles) + def readSyntonLib(synthLibFile, Ro2Filtration=False, FindAnaloguesOfMissingBBs=False): fragBegTime = datetime.datetime.now() @@ -232,19 +298,32 @@ def readSyntonLib(synthLibFile, Ro2Filtration=False, FindAnaloguesOfMissingBBs=F LogP = MolLogP(mol) HDC = CalcNumHBD(mol) HAC = CalcNumHBA(mol) - if MolW>200 or LogP > 2 or HDC > 2 or HAC > 4: + if MolW > 200 or LogP > 2 or HDC > 2 or HAC > 4: continue availableBBs[sline.split()[0]] = {} availableBBs[sline.split()[0]]["BBs"] = sline.split()[1] if FindAnaloguesOfMissingBBs: - availableBBs[sline.split()[0]]["fp_b"] = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048) + availableBBs[sline.split()[0]]["fp_b"] = AllChem.GetMorganFingerprintAsBitVect( + mol, radius=2, nBits=2048 + ) availableBBs[sline.split()[0]]["n_atoms"] = mol.GetNumAtoms() availableBBs[sline.split()[0]]["n_rings"] = CalcNumRings(mol) availableBBs[sline.split()[0]]["marks"] = sorted( - [sline.split()[0][m.start():m.start() + 2] + sline.split()[0][m.end() - 4:m.end()] for m in - re.finditer(pat, sline.split()[0])]) - availableBBs[sline.split()[0]]["marksVallences"] = "+".join(sorted([atom.GetSymbol() + ":" + - str(atom.GetTotalDegree()) for atom in mol.GetAtoms() if atom.GetAtomMapNum() != 0])) + [ + sline.split()[0][m.start() : m.start() + 2] + + sline.split()[0][m.end() - 4 : m.end()] + for m in re.finditer(pat, sline.split()[0]) + ] + ) + availableBBs[sline.split()[0]]["marksVallences"] = "+".join( + sorted( + [ + atom.GetSymbol() + ":" + str(atom.GetTotalDegree()) + for atom in mol.GetAtoms() + if atom.GetAtomMapNum() != 0 + ] + ) + ) print("Lib BB reading time:") print(datetime.datetime.now() - fragBegTime) return availableBBs diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..97b7f53 --- /dev/null +++ b/tox.ini @@ -0,0 +1,24 @@ +[tox] +envlist = + lint + flake8 + +[testenv:lint] +deps = + black + isort +skip_install = true +commands = + black . -l 100 + isort . --profile=black +description = Run linters. + +[testenv:flake8] +skip_install = true +deps = + flake8 + flake8-black + flake8-isort +commands = + flake8 . --select I001,I002,I003,I004,I005,BLK100 +description = Run the flake8 code quality assurance tool