Source code for gt4sd.frameworks.granular.tokenizer.tokenizer

#
# MIT License
#
# Copyright (c) 2022 GT4SD team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
"""Tokenizers implementations."""

import collections
import logging
import os
from typing import Dict, Iterable, List, Type

import regex as re
import selfies as sf
from pytoda.smiles.processing import tokenize_selfies

SMI_REGEX_PATTERN = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
BIG_SMI_REGEX_PATTERN = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|\,|\{|\}|\[\]|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def selfies_alphabet() -> List[str]:
    """Legacy SELFIES 0.2.4 alphabet method.

    Adapted from: https://github.com/aspuru-guzik-group/selfies/blob/84122855ae76a928e1cb7d58796b8b47385a4359/selfies/selfies.py#L4.

    Returns:
        SELFIES list of tokens.
    """
    alphabet = [
        "[Branch1_1]",
        "[Branch1_2]",
        "[Branch1_3]",
        "[Ring1]",
        "[Branch2_1]",
        "[Branch2_2]",
        "[Branch2_3]",
        "[Ring2]",
        "[Branch3_1]",
        "[Branch3_2]",
        "[Branch3_3]",
        "[Ring3]",
        "[O]",
        "[=O]",
        "[N]",
        "[=N]",
        "[C]",
        "[=C]",
        "[#C]",
        "[S]",
        "[=S]",
        "[P]",
        "[F]",
        "[C@Hexpl]",
        "[C@@Hexpl]",
        "[C@expl]",
        "[C@@expl]",
        "[H]",
        "[NHexpl]",
    ]
    return alphabet

def load_vocab(vocab_file: str) -> Dict[str, int]:
    """Loads a vocabulary file into a dictionary.

    Args:
        vocab_file: vocabulary file.

    Returns:
        vocabulary mapping tokens to indices.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab

class BasicTokenizer:
    """Basic tokenizer."""

    def __init__(
        self,
        pad_token: str = "<pad>",
        sos_token: str = "<sos>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
    ) -> None:
        """Constructs a BasicTokenizer.

        Args:
            pad_token: padding token. Defaults to '<pad>'.
            sos_token: start of sequence token. Defaults to '<sos>'.
            eos_token: end of sequence token. Defaults to '</s>'.
            unk_token: unknown token. Defaults to '<unk>'.
        """
        self.pad_token = pad_token
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.unk_token = unk_token

    def tokenize(self, text: str) -> List[str]:
        """Tokenize input text.

        Args:
            text: text to tokenize.

        Returns:
            list of tokens.
        """
        return list(text)

    def build_vocab(self, smiles: Iterable[str], vocab_file: str) -> List[str]:
        """Build and save a vocabulary given a SMILES list.

        Args:
            smiles: iterable of SMILES.
            vocab_file: path to a file where the vocabulary is saved.

        Returns:
            a list of all tokens in the vocabulary.
        """
        tokens = {self.pad_token, self.sos_token, self.eos_token, self.unk_token}
        for a_smiles in smiles:
            for token in self.tokenize(a_smiles):
                tokens.add(token)
        tokens_list = sorted(tokens)
        # write one token per line; use "\n" explicitly, since text mode already
        # translates newlines (os.linesep would be doubly translated on Windows).
        with open(vocab_file, "w") as f:
            for item in tokens_list:
                f.write(f"{item}\n")
        return tokens_list
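
# Usage sketch (editor's illustrative example, not part of the original module):
# BasicTokenizer splits text into single characters, and build_vocab collects the
# characters of all provided SMILES plus the special tokens. The file name
# "basic_vocab.txt" is arbitrary.
#
#     >>> tokenizer = BasicTokenizer()
#     >>> tokenizer.tokenize("CCO")
#     ['C', 'C', 'O']
#     >>> tokens_list = tokenizer.build_vocab(["CCO", "CCN"], "basic_vocab.txt")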

class BasicSmilesTokenizer(BasicTokenizer):
    """Basic SMILES tokenizer."""

    def __init__(
        self,
        regex_pattern: str = SMI_REGEX_PATTERN,
        pad_token: str = "<pad>",
        sos_token: str = "<sos>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
    ) -> None:
        """Constructs a BasicSmilesTokenizer.

        Args:
            regex_pattern: regex pattern. Defaults to SMI_REGEX_PATTERN.
            pad_token: padding token. Defaults to '<pad>'.
            sos_token: start of sequence token. Defaults to '<sos>'.
            eos_token: end of sequence token. Defaults to '</s>'.
            unk_token: unknown token. Defaults to '<unk>'.
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(self.regex_pattern)
        super().__init__(
            pad_token=pad_token,
            sos_token=sos_token,
            eos_token=eos_token,
            unk_token=unk_token,
        )

    def tokenize(self, text: str) -> List[str]:
        """Tokenize input text.

        Args:
            text: text to tokenize.

        Returns:
            list of tokens.
        """
        return self.regex.findall(text)
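
# Usage sketch (editor's illustrative example, not part of the original module):
# regex-based SMILES tokenization with the default SMI_REGEX_PATTERN, which keeps
# bracket atoms, two-letter elements, ring closures, and bond symbols intact.
#
#     >>> tokenizer = BasicSmilesTokenizer()
#     >>> tokenizer.tokenize("CC(=O)Oc1ccccc1")
#     ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1']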

class BasicSelfiesTokenizer(BasicTokenizer):
    """Basic SELFIES tokenizer."""

    def __init__(
        self,
        pad_token: str = "<pad>",
        sos_token: str = "<sos>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
    ) -> None:
        """Constructs a BasicSelfiesTokenizer.

        Args:
            pad_token: padding token. Defaults to '<pad>'.
            sos_token: start of sequence token. Defaults to '<sos>'.
            eos_token: end of sequence token. Defaults to '</s>'.
            unk_token: unknown token. Defaults to '<unk>'.
        """
        super().__init__(
            pad_token=pad_token,
            sos_token=sos_token,
            eos_token=eos_token,
            unk_token=unk_token,
        )

    def smiles_to_selfies(self, smiles: Iterable[str]) -> List[str]:
        """Convert a list of SMILES into SELFIES.

        Args:
            smiles: a list of SMILES.

        Returns:
            a list of SELFIES.
        """
        return [sf.encoder(a_smiles) for a_smiles in smiles]

    def tokenize(self, text: str) -> List[str]:
        """Tokenize input text.

        Args:
            text: text to tokenize.

        Returns:
            list of tokens.
        """
        return tokenize_selfies(sf.encoder(text))

    def build_vocab(self, smiles: Iterable[str], vocab_file: str) -> List[str]:
        """Build and save a vocabulary given a SMILES list.

        Args:
            smiles: iterable of SMILES.
            vocab_file: path to a file where the vocabulary is saved.

        Returns:
            a list of all tokens in the vocabulary.
        """
        selfies = self.smiles_to_selfies(smiles)
        tokens = set(
            [self.pad_token, self.sos_token, self.eos_token, self.unk_token, "[.]"]
            + selfies_alphabet()
        )
        for a_selfies in selfies:
            tokens = tokens | set(tokenize_selfies(a_selfies))
        tokens_list = sorted(tokens)
        # write one token per line; use "\n" explicitly, since text mode already
        # translates newlines (os.linesep would be doubly translated on Windows).
        with open(vocab_file, "w") as f:
            for item in tokens_list:
                f.write(f"{item}\n")
        return tokens_list
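
# Usage sketch (editor's illustrative example, not part of the original module):
# SMILES are first encoded to SELFIES via sf.encoder, then split into bracketed
# SELFIES tokens. The exact tokens depend on the installed `selfies` version;
# with selfies>=1.0 one would expect something like:
#
#     >>> tokenizer = BasicSelfiesTokenizer()
#     >>> tokenizer.tokenize("CCO")
#     ['[C]', '[C]', '[O]']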

class Tokenizer:
    """Tokenizer that can build a vocabulary on the fly."""

    def __init__(
        self,
        vocab_file: str,
        basic_tokenizer: BasicTokenizer = BasicTokenizer(),
        smiles: List[str] = [],
        pad_token: str = "<pad>",
        sos_token: str = "<sos>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
    ) -> None:
        """Constructs a Tokenizer.

        Args:
            vocab_file: path to vocabulary file. If the file is not present, the
                provided SMILES list is used to generate one.
            basic_tokenizer: a basic tokenizer. Defaults to a BasicTokenizer
                character tokenizer.
            smiles: list of SMILES. Defaults to an empty list, used only if the
                vocabulary file does not exist.
            pad_token: padding token. Defaults to '<pad>'.
            sos_token: start of sequence token. Defaults to '<sos>'.
            eos_token: end of sequence token. Defaults to '</s>'.
            unk_token: unknown token. Defaults to '<unk>'.
        """
        self.basic_tokenizer = basic_tokenizer
        self.pad_token = pad_token
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.unk_token = unk_token
        # load the vocabulary when the file exists and no SMILES are provided,
        # otherwise build it from the SMILES list and save it to vocab_file.
        if os.path.isfile(vocab_file) and len(smiles) == 0:
            logger.info(f"load vocab from: {vocab_file}")
            self.vocab = load_vocab(vocab_file)
        else:
            logger.info("build tokenizer and vocabulary")
            self.basic_tokenizer.build_vocab(smiles, vocab_file)
            logger.info(f"saved vocabulary: {vocab_file}")
            self.vocab = load_vocab(vocab_file)
        self.vocab_ids = {token: index for token, index in self.vocab.items()}
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()]
        )
        self.pad_token_id = self.vocab.get(pad_token, self.vocab[self.unk_token])
        self.sos_token_id = self.vocab.get(sos_token, self.vocab[self.unk_token])

    @property
    def vocab_size(self) -> int:
        """Size of the vocabulary.

        Returns:
            number of tokens in the vocabulary.
        """
        return len(self.vocab)

    @property
    def vocab_list(self) -> List[str]:
        """Return vocabulary tokens.

        Returns:
            all tokens from the vocabulary.
        """
        return list(self.vocab.keys())

    def tokenize(self, text: str) -> List[str]:
        """Tokenize a given text.

        Args:
            text: text to tokenize.

        Returns:
            list of tokens.
        """
        return self.basic_tokenizer.tokenize(text)

    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Convert tokens to indices.

        Args:
            tokens: list of tokens.

        Returns:
            list of indices.
        """
        return [self.convert_token_to_id(token) for token in tokens]

    def convert_token_to_id(self, token: str) -> int:
        """Convert token to index.

        Args:
            token: a token.

        Returns:
            index corresponding to the input token. Unknown token index if the
            input token is not present in the vocabulary.
        """
        return self.vocab.get(token, self.vocab[self.unk_token])

    def convert_id_to_token(self, index: int) -> str:
        """Convert index to token.

        Args:
            index: an index.

        Returns:
            token corresponding to the input index. Unknown token if the input
            index is not found.
        """
        return self.ids_to_tokens.get(index, self.unk_token)

    def add_padding_tokens(
        self, token_ids: List[int], length: int, right: bool = True
    ) -> List[int]:
        """Add padding token indices to the provided token indices.

        Args:
            token_ids: token indices.
            length: length of the sequence.
            right: whether the padding is performed on the right. Defaults to True,
                if False the padding happens on the left.

        Returns:
            the padded sequence.
        """
        padding = [self.pad_token_id] * (length - len(token_ids))
        if right:
            return token_ids + padding
        return padding + token_ids
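
# Usage sketch (editor's illustrative example, not part of the original module):
# build a vocabulary on the fly from a SMILES list, then encode and pad a
# sequence. The file name "vocab.txt" is arbitrary, and the numeric ids depend
# on the sorted vocabulary, so they are not shown here.
#
#     >>> tokenizer = Tokenizer("vocab.txt", BasicSmilesTokenizer(), smiles=["CCO", "c1ccccc1"])
#     >>> token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("CCO"))
#     >>> padded = tokenizer.add_padding_tokens(token_ids, length=8)
#     >>> tokenizer.convert_id_to_token(padded[0])
#     'C'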

class GenericTokenizer(Tokenizer):
    """Generic tokenizer that can build a vocabulary on the fly."""

    def __init__(
        self,
        vocab_file: str,
        smiles: List[str] = [],
        pad_token: str = "<pad>",
        sos_token: str = "<sos>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
    ) -> None:
        """Constructs a GenericTokenizer.

        Args:
            vocab_file: path to vocabulary file. If the file is not present, the
                provided SMILES list is used to generate one.
            smiles: list of SMILES. Defaults to an empty list, used only if the
                vocabulary file does not exist.
            pad_token: padding token. Defaults to '<pad>'.
            sos_token: start of sequence token. Defaults to '<sos>'.
            eos_token: end of sequence token. Defaults to '</s>'.
            unk_token: unknown token. Defaults to '<unk>'.
        """
        super().__init__(
            vocab_file=vocab_file,
            basic_tokenizer=BasicTokenizer(
                pad_token=pad_token,
                sos_token=sos_token,
                eos_token=eos_token,
                unk_token=unk_token,
            ),
            smiles=smiles,
            pad_token=pad_token,
            sos_token=sos_token,
            eos_token=eos_token,
            unk_token=unk_token,
        )

class SmilesTokenizer(Tokenizer):
    """SMILES tokenizer that can build a vocabulary on the fly."""

    def __init__(
        self,
        vocab_file: str,
        smiles: List[str] = [],
        pad_token: str = "<pad>",
        sos_token: str = "<sos>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
    ) -> None:
        """Constructs a SmilesTokenizer.

        Args:
            vocab_file: path to vocabulary file. If the file is not present, the
                provided SMILES list is used to generate one.
            smiles: list of SMILES. Defaults to an empty list, used only if the
                vocabulary file does not exist.
            pad_token: padding token. Defaults to '<pad>'.
            sos_token: start of sequence token. Defaults to '<sos>'.
            eos_token: end of sequence token. Defaults to '</s>'.
            unk_token: unknown token. Defaults to '<unk>'.
        """
        super().__init__(
            vocab_file=vocab_file,
            basic_tokenizer=BasicSmilesTokenizer(
                pad_token=pad_token,
                sos_token=sos_token,
                eos_token=eos_token,
                unk_token=unk_token,
            ),
            smiles=smiles,
            pad_token=pad_token,
            sos_token=sos_token,
            eos_token=eos_token,
            unk_token=unk_token,
        )

class BigSmilesTokenizer(Tokenizer):
    """Big-SMILES tokenizer that can build a vocabulary on the fly."""

    def __init__(
        self,
        vocab_file: str,
        smiles: List[str] = [],
        pad_token: str = "<pad>",
        sos_token: str = "<sos>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
    ) -> None:
        """Constructs a BigSmilesTokenizer.

        Args:
            vocab_file: path to vocabulary file. If the file is not present, the
                provided Big-SMILES list is used to generate one.
            smiles: list of Big-SMILES. Defaults to an empty list, used only if the
                vocabulary file does not exist.
            pad_token: padding token. Defaults to '<pad>'.
            sos_token: start of sequence token. Defaults to '<sos>'.
            eos_token: end of sequence token. Defaults to '</s>'.
            unk_token: unknown token. Defaults to '<unk>'.
        """
        super().__init__(
            vocab_file=vocab_file,
            basic_tokenizer=BasicSmilesTokenizer(
                regex_pattern=BIG_SMI_REGEX_PATTERN,
                pad_token=pad_token,
                sos_token=sos_token,
                eos_token=eos_token,
                unk_token=unk_token,
            ),
            smiles=smiles,
            pad_token=pad_token,
            sos_token=sos_token,
            eos_token=eos_token,
            unk_token=unk_token,
        )

class SelfiesTokenizer(Tokenizer):
    """SELFIES tokenizer that can build a vocabulary on the fly."""

    def __init__(
        self,
        vocab_file: str,
        smiles: List[str] = [],
        pad_token: str = "<pad>",
        sos_token: str = "<sos>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
    ) -> None:
        """Constructs a SelfiesTokenizer.

        Args:
            vocab_file: path to vocabulary file. If the file is not present, the
                provided SMILES list is used to generate one.
            smiles: list of SMILES. Defaults to an empty list, used only if the
                vocabulary file does not exist.
            pad_token: padding token. Defaults to '<pad>'.
            sos_token: start of sequence token. Defaults to '<sos>'.
            eos_token: end of sequence token. Defaults to '</s>'.
            unk_token: unknown token. Defaults to '<unk>'.
        """
        super().__init__(
            vocab_file=vocab_file,
            basic_tokenizer=BasicSelfiesTokenizer(
                pad_token=pad_token,
                sos_token=sos_token,
                eos_token=eos_token,
                unk_token=unk_token,
            ),
            smiles=smiles,
            pad_token=pad_token,
            sos_token=sos_token,
            eos_token=eos_token,
            unk_token=unk_token,
        )

TOKENIZER_FACTORY: Dict[str, Type[Tokenizer]] = {
    "generic": GenericTokenizer,
    "smiles": SmilesTokenizer,
    "big-smiles": BigSmilesTokenizer,
    "selfies": SelfiesTokenizer,
}
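
# Usage sketch (editor's illustrative example, not part of the original module):
# select a tokenizer implementation by name via the factory. The file name
# "selfies_vocab.txt" is arbitrary; it is created from the SMILES list when it
# does not already exist.
#
#     >>> tokenizer_cls = TOKENIZER_FACTORY["selfies"]
#     >>> tokenizer = tokenizer_cls("selfies_vocab.txt", smiles=["CCO"])
#     >>> tokenizer.vocab_size > 0
#     True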