#
# MIT License
#
# Copyright (c) 2022 GT4SD team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import logging
from dataclasses import field
from typing import Any, Callable, ClassVar, Dict, Iterable, Optional, TypeVar
from ....domains.materials import SMILES, MoleculeFormat, validate_molecules
from ....exceptions import InvalidItem
from ....training_pipelines.core import TrainingPipelineArguments
from ....training_pipelines.guacamol_baselines.core import GuacaMolSavingArguments
from ....training_pipelines.moses.core import MosesSavingArguments
from ...core import AlgorithmConfiguration, GeneratorAlgorithm
from ...registry import ApplicationsRegistry
from .implementation import (
AaeIterator,
Generator,
GraphGAIterator,
GraphMCTSIterator,
OrganIterator,
SMILESGAIterator,
SMILESLSTMHCIterator,
SMILESLSTMPPOIterator,
VaeIterator,
)
# Module-level logger. The NullHandler keeps this library module silent unless
# the host application configures logging itself.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
# Generic placeholders used to parameterize GeneratorAlgorithm[S, T] and
# AlgorithmConfiguration[S, T] in the classes below.
T = TypeVar("T", bound=Any)
S = TypeVar("S", bound=Any)
# Signature of a targeted generator: takes a target of type T and returns an
# iterable of generated items.
Targeted = Callable[[T], Iterable[Any]]
class GuacaMolGenerator(GeneratorAlgorithm[S, T]):
    """Generation algorithm backed by the GuacaMol generator implementations."""

    def __init__(
        self,
        configuration: AlgorithmConfiguration[S, T],
        target: Optional[T],
    ):
        """Set up a GuacaMolGenerator that is ready to generate samples.

        The configuration is passed through ``validate_configuration`` before
        being handed to the parent constructor.

        Args:
            configuration: domain and application specification defining
                parameters, types and validations.
            target: a target for which to generate items.

        Example:
            Generating molecules given a scoring function and a score::

                config = SMILESGAGenerator()
                target = {"scoring_function_name": {"target": 0.0}}
                algorithm = GuacaMolGenerator(configuration=config, target=target)
                items = list(algorithm.sample(1))
                print(items)
        """
        validated_configuration = self.validate_configuration(configuration)
        # TODO there might also be a validation/check on the target input
        super().__init__(
            configuration=validated_configuration,  # type:ignore
            target=target,  # type:ignore
        )

    def get_generator(
        self,
        configuration: AlgorithmConfiguration[S, T],
        target: Optional[T],
    ) -> Targeted[T]:
        """Build the callable that generates samples for a target.

        Ensures the application's artifacts are available, stores their
        location on ``self.local_artifacts`` and asks the configuration for
        its conditional generator.

        Args:
            configuration: helps to set up specific application of GuacaMol.
            target: accepted for interface compatibility; not used here.

        Returns:
            callable with target generating samples (the implementation's
            ``generate_batch`` method).
        """
        logger.info("ensure artifacts for the application are present.")
        self.local_artifacts = configuration.ensure_artifacts()
        generator_impl: Generator = configuration.get_conditional_generator(  # type: ignore
            self.local_artifacts
        )
        batch_generator = generator_impl.generate_batch  # type: ignore
        return batch_generator
class GuacaMolAbstractGenerator(AlgorithmConfiguration[str, str]):
    """Base configuration shared by the generators in this module.

    Provides SMILES validation of generated items.
    """

    def validate_item(self, item: str) -> SMILES:
        """Validate a generated item as a SMILES string.

        Args:
            item: a generated item that is possibly not valid.

        Raises:
            InvalidItem: in case the item can not be validated.

        Returns:
            the validated SMILES.
        """
        # Only the first element of the returned pair is needed; it holds one
        # entry per input, which is None when the input could not be parsed.
        parsed_molecules, _ = validate_molecules([item], MoleculeFormat.smiles)
        if parsed_molecules[0] is not None:
            return SMILES(item)
        raise InvalidItem(
            title="InvalidSMILES",
            detail=f'rdkit.Chem.MolFromSmiles returned None for "{item}"',
        )
@ApplicationsRegistry.register_algorithm_application(GuacaMolGenerator)
class SMILESGAGenerator(GuacaMolAbstractGenerator):
    """Configuration to generate optimized molecules using the SMILES genetic algorithm."""

    algorithm_name: ClassVar[str] = GuacaMolGenerator.__name__
    algorithm_type: ClassVar[str] = "conditional_generation"
    domain: ClassVar[str] = "materials"
    algorithm_version: str = "v0"

    batch_size: int = field(
        default=32,
        metadata={"description": "Batch size used for the generative model sampling."},
    )
    population_size: int = field(
        default=100,
        metadata={
            "description": "it is used with n_mutations for the initial generation of smiles within the population"
        },
    )
    n_mutations: int = field(
        default=200,
        metadata={
            "description": "it is used with population size for the initial generation of smiles within the population"
        },
    )
    n_jobs: int = field(
        default=-1,
        metadata={"description": "number of concurrently running jobs"},
    )
    gene_size: int = field(
        default=2,
        metadata={
            "description": "size of the gene which is used in creation of genes"
        },
    )
    random_start: bool = field(
        default=False,
        metadata={
            "description": "set to True to randomly choose list of SMILES for generating optimizied molecules"
        },
    )
    generations: int = field(
        default=2,
        metadata={"description": "number of evolutionary generations"},
    )
    patience: int = field(
        default=4,
        metadata={
            "description": "it is used for early stopping if population scores remains the same after generating molecules"
        },
    )

    def get_target_description(self) -> Dict[str, str]:
        """Describe the target expected by this configuration.

        Returns:
            target description.
        """
        return dict(
            title="Scoring functions with parameters",
            description="Scoring functions will be used to generate a score for SMILES.",
            type="object",
        )

    def get_conditional_generator(self, resources_path: str) -> SMILESGAIterator:
        """Create the generator implementation from this configuration's fields.

        Args:
            resources_path: local path to model files.

        Returns:
            instance with :meth:`generate_batch<gt4sd.algorithms.conditional_generation.guacamol.implementation.smiles_ga.SMILESGAIterator.generate_batch>` method for targeted generation.
        """
        iterator = SMILESGAIterator(
            resource_path=resources_path,
            population_size=self.population_size,
            n_mutations=self.n_mutations,
            n_jobs=self.n_jobs,
            random_start=self.random_start,
            gene_size=self.gene_size,
            generations=self.generations,
            patience=self.patience,
            batch_size=self.batch_size,
        )
        return iterator
@ApplicationsRegistry.register_algorithm_application(GuacaMolGenerator)
class GraphGAGenerator(GuacaMolAbstractGenerator):
    """Configuration to generate optimized molecules using the graph-based genetic algorithm."""

    algorithm_name: ClassVar[str] = GuacaMolGenerator.__name__
    algorithm_type: ClassVar[str] = "conditional_generation"
    domain: ClassVar[str] = "materials"
    algorithm_version: str = "v0"

    batch_size: int = field(
        default=1,
        metadata={"description": "Batch size used for the generative model sampling."},
    )
    population_size: int = field(
        default=100,
        metadata={
            "description": "it is used with n_mutations for the initial generation of smiles within the population"
        },
    )
    mutation_rate: float = field(
        default=0.01,
        metadata={
            "description": "frequency of the new mutations in a single gene or organism over time"
        },
    )
    offspring_size: int = field(
        default=200,
        metadata={"description": "number of molecules to select for new population"},
    )
    n_jobs: int = field(
        default=-1,
        metadata={"description": "number of concurrently running jobs"},
    )
    random_start: bool = field(
        default=False,
        metadata={
            "description": "set to True to randomly choose list of SMILES for generating optimizied molecules"
        },
    )
    generations: int = field(
        default=2,
        metadata={"description": "number of evolutionary generations"},
    )
    patience: int = field(
        default=4,
        metadata={
            "description": "it is used for early stopping if population scores remains the same after generating molecules"
        },
    )

    def get_target_description(self) -> Dict[str, str]:
        """Describe the target expected by this configuration.

        Returns:
            target description.
        """
        return dict(
            title="Scoring functions with parameters",
            description="Scoring functions will be used to generate a score for SMILES.",
            type="object",
        )

    def get_conditional_generator(self, resources_path: str) -> GraphGAIterator:
        """Create the generator implementation from this configuration's fields.

        Args:
            resources_path: local path to model files.

        Returns:
            instance with :meth:`generate_batch<gt4sd.algorithms.conditional_generation.guacamol.implementation.graph_ga.GraphGAIterator.generate_batch>` method for targeted generation.
        """
        iterator = GraphGAIterator(
            resource_path=resources_path,
            batch_size=self.batch_size,
            offspring_size=self.offspring_size,
            population_size=self.population_size,
            mutation_rate=self.mutation_rate,
            n_jobs=self.n_jobs,
            random_start=self.random_start,
            generations=self.generations,
            patience=self.patience,
        )
        return iterator
@ApplicationsRegistry.register_algorithm_application(GuacaMolGenerator)
class GraphMCTSGenerator(GuacaMolAbstractGenerator):
    """Configuration to generate optimized molecules using Graph-based Genetic Algorithm and Generative Model/Monte Carlo Tree Search for the Exploration of Chemical Space."""

    algorithm_name: ClassVar[str] = GuacaMolGenerator.__name__
    algorithm_type: ClassVar[str] = "conditional_generation"
    domain: ClassVar[str] = "materials"
    algorithm_version: str = "v0"

    batch_size: int = field(
        default=1,
        metadata={"description": "Batch size used for the generative model sampling."},
    )
    init_smiles: str = field(
        default="",
        metadata={"description": "initial SMILES used for generation of states."},
    )
    population_size: int = field(
        default=100,
        metadata={
            "description": "it is used with n_mutations for the initial generation of smiles within the population"
        },
    )
    n_jobs: int = field(
        default=-1,
        metadata={"description": "number of concurrently running jobs"},
    )
    generations: int = field(
        default=1000,
        metadata={"description": "number of evolutionary generations"},
    )
    patience: int = field(
        default=4,
        metadata={
            "description": "it is used for early stopping if population scores remains the same after generating molecules"
        },
    )
    # NOTE(review): annotated as float although it is described as a count;
    # confirm against GraphMCTSIterator before changing the type.
    num_sims: float = field(
        default=40,
        metadata={"description": "number of times to traverse the tree"},
    )
    max_children: int = field(
        default=25,
        metadata={"description": "maximum number of childerns a node could have"},
    )
    max_atoms: int = field(
        default=60,
        metadata={
            "description": "maximum number of atoms to explore to terminal the node state"
        },
    )

    def get_target_description(self) -> Dict[str, str]:
        """Describe the target expected by this configuration.

        Returns:
            target description.
        """
        return dict(
            title="Scoring functions with parameters",
            description="Scoring functions will be used to generate a score for SMILES.",
            type="object",
        )

    def get_conditional_generator(self, resources_path: str) -> GraphMCTSIterator:
        """Create the generator implementation from this configuration's fields.

        Args:
            resources_path: local path to model files. It is not forwarded to
                the iterator by this method.

        Returns:
            instance with :meth:`generate_batch<gt4sd.algorithms.conditional_generation.guacamol.implementation.graph_mcts.GraphMCTSIterator.generate_batch>` method for targeted generation.
        """
        iterator = GraphMCTSIterator(
            init_smiles=self.init_smiles,
            batch_size=self.batch_size,
            population_size=self.population_size,
            max_children=self.max_children,
            num_sims=self.num_sims,
            generations=self.generations,
            n_jobs=self.n_jobs,
            max_atoms=self.max_atoms,
            patience=self.patience,
        )
        return iterator
@ApplicationsRegistry.register_algorithm_application(GuacaMolGenerator)
class SMILESLSTMHCGenerator(GuacaMolAbstractGenerator):
    """Configuration to generate optimized molecules using recurrent neural networks with hill climbing algorithm."""

    algorithm_name: ClassVar[str] = GuacaMolGenerator.__name__
    algorithm_type: ClassVar[str] = "conditional_generation"
    domain: ClassVar[str] = "materials"
    algorithm_version: str = "v0"

    batch_size: int = field(
        default=1,
        metadata=dict(description="Batch size used for the generative model sampling."),
    )
    n_jobs: int = field(
        default=-1,
        metadata=dict(description="number of concurrently running jobs"),
    )
    n_epochs: int = field(
        default=20,
        metadata=dict(description="number of epochs to sample"),
    )
    mols_to_sample: int = field(
        default=1024,
        metadata=dict(description="molecules sampled at each step"),
    )
    # The description previously duplicated the one of `max_len`
    # ("maximum length of a SMILES string"), which does not describe this field.
    keep_top: int = field(
        default=512,
        metadata=dict(description="molecules kept at each step"),
    )
    optimize_n_epochs: int = field(
        default=2,
        metadata=dict(description="number of epochs for the optimization"),
    )
    max_len: int = field(
        default=100,
        metadata=dict(description="maximum length of a SMILES string"),
    )
    optimize_batch_size: int = field(
        default=256,
        metadata=dict(description="batch size for the optimization"),
    )
    benchmark_num_samples: int = field(
        default=4096,
        metadata=dict(
            description="number of molecules to generate from final model for the benchmark"
        ),
    )
    random_start: bool = field(
        default=False,
        metadata=dict(
            description="set to True to randomly choose list of SMILES for generating optimizied molecules"
        ),
    )

    def get_target_description(self) -> Dict[str, str]:
        """Get description of the target for generation.

        Returns:
            target description.
        """
        return {
            "title": "Scoring functions with parameters",
            "description": "Scoring functions will be used to generate a score for SMILES.",
            "type": "object",
        }

    def get_conditional_generator(self, resources_path: str) -> SMILESLSTMHCIterator:
        """Instantiate the actual generator implementation.

        Args:
            resources_path: local path to model files.

        Returns:
            instance with :meth:`generate_batch<gt4sd.algorithms.conditional_generation.guacamol.implementation.smiles_lstm_hc.SMILESLSTMHCIterator.generate_batch>` method for targeted generation.
        """
        return SMILESLSTMHCIterator(
            resource_path=resources_path,
            batch_size=self.batch_size,
            n_epochs=self.n_epochs,
            mols_to_sample=self.mols_to_sample,
            keep_top=self.keep_top,
            optimize_n_epochs=self.optimize_n_epochs,
            max_len=self.max_len,
            optimize_batch_size=self.optimize_batch_size,
            benchmark_num_samples=self.benchmark_num_samples,
            random_start=self.random_start,
            n_jobs=self.n_jobs,
        )

    @classmethod
    def get_filepath_mappings_for_training_pipeline_arguments(
        cls, training_pipeline_arguments: TrainingPipelineArguments
    ) -> Dict[str, str]:
        """Get filepath mappings for the given training pipeline arguments.

        Args:
            training_pipeline_arguments: training pipeline arguments.

        Returns:
            a mapping between artifacts' files and training pipeline's output files.
            For arguments that are not GuacaMolSavingArguments the parent
            implementation is used.
        """
        if isinstance(training_pipeline_arguments, GuacaMolSavingArguments):
            return {
                "model_final_0.473.pt": training_pipeline_arguments.model_filepath,
                "model_final_0.473.json": training_pipeline_arguments.model_config_filepath,
                # NOTE(review): mapped to an empty path; presumably no training
                # output corresponds to this artifact - confirm with the caller.
                "guacamol_v1_all.smiles": "",
            }
        else:
            return super().get_filepath_mappings_for_training_pipeline_arguments(
                training_pipeline_arguments
            )
@ApplicationsRegistry.register_algorithm_application(GuacaMolGenerator)
class SMILESLSTMPPOGenerator(GuacaMolAbstractGenerator):
    """Configuration to generate optimized molecules using recurrent neural networks with PPO (proximal policy optimization)."""

    algorithm_name: ClassVar[str] = GuacaMolGenerator.__name__
    algorithm_type: ClassVar[str] = "conditional_generation"
    domain: ClassVar[str] = "materials"
    algorithm_version: str = "v0"

    batch_size: int = field(
        default=1,
        metadata=dict(description="Batch size used for the generative model sampling."),
    )
    num_epochs: int = field(
        default=20,
        metadata=dict(description="number of epochs to sample"),
    )
    episode_size: int = field(
        default=8192,
        metadata=dict(
            description="number of molecules sampled by the policy at the start of a series of ppo updates"
        ),
    )
    optimize_batch_size: int = field(
        default=1024,
        metadata=dict(description="batch size for the optimization"),
    )
    # NOTE(review): both weights are annotated as int; fractional weights may
    # be intended - confirm against SMILESLSTMPPOIterator before changing.
    entropy_weight: int = field(
        default=1,
        metadata=dict(description="used for calculating entropy loss"),
    )
    kl_div_weight: int = field(
        default=10,
        metadata=dict(
            description="used for calculating Kullback-Leibler divergence loss"
        ),
    )
    clip_param: float = field(
        default=0.2,
        metadata=dict(
            description="used for determining how far the new policy is from the old one"
        ),
    )

    def get_target_description(self) -> Dict[str, str]:
        """Get description of the target for generation.

        Returns:
            target description.
        """
        return {
            "title": "Scoring functions with parameters",
            "description": "Scoring functions will be used to generate a score for SMILES.",
            "type": "object",
        }

    def get_conditional_generator(self, resources_path: str) -> SMILESLSTMPPOIterator:
        """Instantiate the actual generator implementation.

        Args:
            resources_path: local path to model files.

        Returns:
            instance with :meth:`generate_batch<gt4sd.algorithms.conditional_generation.guacamol.implementation.smiles_lstm_ppo.SMILESLSTMPPOIterator.generate_batch>` method for targeted generation.
        """
        return SMILESLSTMPPOIterator(
            resource_path=resources_path,
            batch_size=self.batch_size,
            num_epochs=self.num_epochs,
            episode_size=self.episode_size,
            optimize_batch_size=self.optimize_batch_size,
            entropy_weight=self.entropy_weight,
            kl_div_weight=self.kl_div_weight,
            clip_param=self.clip_param,
        )

    @classmethod
    def get_filepath_mappings_for_training_pipeline_arguments(
        cls, training_pipeline_arguments: TrainingPipelineArguments
    ) -> Dict[str, str]:
        """Get filepath mappings for the given training pipeline arguments.

        Args:
            training_pipeline_arguments: training pipeline arguments.

        Returns:
            a mapping between artifacts' files and training pipeline's output files.
            For arguments that are not GuacaMolSavingArguments the parent
            implementation is used.
        """
        if isinstance(training_pipeline_arguments, GuacaMolSavingArguments):
            return {
                "model_final_0.473.pt": training_pipeline_arguments.model_filepath,
                "model_final_0.473.json": training_pipeline_arguments.model_config_filepath,
            }
        else:
            return super().get_filepath_mappings_for_training_pipeline_arguments(
                training_pipeline_arguments
            )
class MosesGenerator(GeneratorAlgorithm[S, T]):
    """Moses generation algorithm."""

    def __init__(
        self,
        configuration: AlgorithmConfiguration[S, T],
        target: Optional[T],
    ):
        """
        Instantiate MosesGenerator ready to generate samples.

        Args:
            configuration: domain and application
                specification defining parameters, types and validations.
            target: a target for which to generate items.

        Example:
            An example for generating molecules with an adversarial autoencoder::

                config = AaeGenerator()
                algorithm = MosesGenerator(configuration=config, target="")
                items = list(algorithm.sample(1))
                print(items)
        """
        configuration = self.validate_configuration(configuration)
        # TODO there might also be a validation/check on the target input
        super().__init__(
            configuration=configuration,  # type:ignore
            target=target,  # type:ignore
        )

    def get_generator(
        self,
        configuration: AlgorithmConfiguration[S, T],
        target: Optional[T],
    ) -> Targeted[T]:
        """Get the function to perform the prediction via Moses' generator.

        Ensures the application's artifacts are available, stores their
        location on ``self.local_artifacts`` and asks the configuration for
        its conditional generator.

        Args:
            configuration: helps to set up specific application of Moses.
            target: accepted for interface compatibility; not used here.

        Returns:
            callable with target generating samples.
        """
        logger.info("ensure artifacts for the application are present.")
        self.local_artifacts = configuration.ensure_artifacts()
        implementation: Generator = configuration.get_conditional_generator(  # type: ignore
            self.local_artifacts
        )
        return implementation.generate_batch  # type: ignore
@ApplicationsRegistry.register_algorithm_application(MosesGenerator)
class AaeGenerator(GuacaMolAbstractGenerator):
    """Configuration for generating molecules with an adversarial autoencoder."""

    algorithm_name: ClassVar[str] = MosesGenerator.__name__
    algorithm_type: ClassVar[str] = "conditional_generation"
    domain: ClassVar[str] = "materials"
    algorithm_version: str = "v0"

    n_samples: int = field(
        default=20,
        metadata={"description": "Number of SMILES to generate"},
    )
    n_batch: int = field(
        default=1024,
        metadata={"description": "Batch size for the optimization"},
    )
    max_len: int = field(
        default=100,
        metadata={"description": "Maximum length of the generated SMILES"},
    )

    def get_conditional_generator(self, resources_path: str) -> AaeIterator:
        """Create the generator implementation from this configuration's fields.

        Args:
            resources_path: local path to model files.

        Returns:
            instance with :meth:`generate_batch<gt4sd.algorithms.conditional_generation.guacamol.implementation.AaeIterator.generate_batch>` method for targeted generation.
        """
        iterator = AaeIterator(
            resource_path=resources_path,
            n_samples=self.n_samples,
            n_batch=self.n_batch,
            max_len=self.max_len,
        )
        return iterator
@ApplicationsRegistry.register_algorithm_application(MosesGenerator)
class VaeGenerator(GuacaMolAbstractGenerator):
    """Configuration for generating molecules with a variational autoencoder."""

    algorithm_name: ClassVar[str] = MosesGenerator.__name__
    algorithm_type: ClassVar[str] = "conditional_generation"
    domain: ClassVar[str] = "materials"
    algorithm_version: str = "v0"

    n_samples: int = field(
        default=20,
        metadata={"description": "Number of SMILES to generate"},
    )
    n_batch: int = field(
        default=1024,
        metadata={"description": "Batch size for the optimization"},
    )
    max_len: int = field(
        default=100,
        metadata={"description": "Maximum length of the generated SMILES"},
    )

    def get_conditional_generator(self, resources_path: str) -> VaeIterator:
        """Create the generator implementation from this configuration's fields.

        Args:
            resources_path: local path to model files.

        Returns:
            instance with :meth:`generate_batch<gt4sd.algorithms.conditional_generation.guacamol.implementation.VaeIterator.generate_batch>` method for targeted generation.
        """
        iterator = VaeIterator(
            resource_path=resources_path,
            n_samples=self.n_samples,
            n_batch=self.n_batch,
            max_len=self.max_len,
        )
        return iterator

    @classmethod
    def get_filepath_mappings_for_training_pipeline_arguments(
        cls, training_pipeline_arguments: TrainingPipelineArguments
    ) -> Dict[str, str]:
        """Get filepath mappings for the given training pipeline arguments.

        Args:
            training_pipeline_arguments: training pipeline arguments.

        Returns:
            a mapping between artifacts' files and training pipeline's output files.
        """
        # Anything other than Moses saving arguments is handled by the parent.
        if not isinstance(training_pipeline_arguments, MosesSavingArguments):
            return super().get_filepath_mappings_for_training_pipeline_arguments(
                training_pipeline_arguments
            )
        return {
            "model.pt": training_pipeline_arguments.model_path,
            "config.pt": training_pipeline_arguments.config_path,
            "vocab.pt": training_pipeline_arguments.vocab_path,
        }
@ApplicationsRegistry.register_algorithm_application(MosesGenerator)
class OrganGenerator(GuacaMolAbstractGenerator):
    """Configuration for generating molecules with an Objective-Reinforced Generative Adversarial Network."""

    algorithm_name: ClassVar[str] = MosesGenerator.__name__
    algorithm_type: ClassVar[str] = "conditional_generation"
    domain: ClassVar[str] = "materials"
    algorithm_version: str = "v0"

    n_samples: int = field(
        default=20,
        metadata={"description": "Number of SMILES to generate"},
    )
    n_batch: int = field(
        default=1024,
        metadata={"description": "Batch size for the optimization"},
    )
    max_len: int = field(
        default=100,
        metadata={"description": "Maximum length of the generated SMILES"},
    )

    def get_conditional_generator(self, resources_path: str) -> OrganIterator:
        """Create the generator implementation from this configuration's fields.

        Args:
            resources_path: local path to model files.

        Returns:
            instance with :meth:`generate_batch<gt4sd.algorithms.conditional_generation.guacamol.implementation.OrganIterator.generate_batch>` method for targeted generation.
        """
        iterator = OrganIterator(
            resource_path=resources_path,
            n_samples=self.n_samples,
            n_batch=self.n_batch,
            max_len=self.max_len,
        )
        return iterator

    @classmethod
    def get_filepath_mappings_for_training_pipeline_arguments(
        cls, training_pipeline_arguments: TrainingPipelineArguments
    ) -> Dict[str, str]:
        """Get filepath mappings for the given training pipeline arguments.

        Args:
            training_pipeline_arguments: training pipeline arguments.

        Returns:
            a mapping between artifacts' files and training pipeline's output files.
        """
        # Anything other than Moses saving arguments is handled by the parent.
        if not isinstance(training_pipeline_arguments, MosesSavingArguments):
            return super().get_filepath_mappings_for_training_pipeline_arguments(
                training_pipeline_arguments
            )
        return {
            "model.pt": training_pipeline_arguments.model_path,
            "config.pt": training_pipeline_arguments.config_path,
            "vocab.pt": training_pipeline_arguments.vocab_path,
        }