Source code for cctk.mol2_file

import re
import numpy as np
import networkx as nx

from cctk import File, Ensemble, ConformationalEnsemble, Molecule
from cctk.helper_functions import get_symbol, get_number



[docs]
class MOL2File(File):
    """
    Class representing SYBYL ``.mol2`` files.

    Attributes:
        name (str): name of file
        ensemble (Ensemble): ``Ensemble`` or ``ConformationalEnsemble`` object
    """


[docs]
    def __init__(self, name=None):
        if isinstance(name, str):
            self.name = name



[docs]
    @classmethod
    def read_file(cls, filename, name=None, **kwargs):
        """
        Reads ``.mol2`` file and generates a ``MOL2File`` instance.

        Args:
            filename (str): path to file
            name (str): name of the file

        Returns:
            MOL2File object
        """

        file = MOL2File(name=name)

        (geometries, all_clean_symbols, all_symbols, all_bonds, conformers) = cls._read_mol2(filename, **kwargs)
        assert len(all_bonds) == len(geometries)
        for bonds in all_bonds:
            assert isinstance(bonds, nx.Graph)
            assert len(bonds) == len(geometries[0])

        if conformers:
            # convert atom types to atomic numbers
            atomic_numbers = []
            for atom_type in all_symbols[0]:
                assert isinstance(atom_type,str), f"unexpected atom_type type: {type(atom_type)} / {atom_type}"
                fields = atom_type.split(".")
                symbol = fields[0]
                symbol = re.sub("[^A-Za-z]","",symbol)
                atomic_number = get_number(symbol)
                atomic_numbers.append(atomic_number)
            atomic_numbers = np.asarray(atomic_numbers, dtype=np.int8)

            # create ensemble
            file.ensemble = ConformationalEnsemble()
            for geometry in geometries:
                molecule = Molecule(atomic_numbers, geometry, bonds=all_bonds[0].edges, checks=False)
                file.ensemble.add_molecule(molecule, checks=False)
        else:
            file.ensemble = Ensemble()
            for this_symbols,geometry in zip(all_symbols,geometries):
                atomic_numbers=[]
                for atom_type in this_symbols:
                    assert isinstance(atom_type,str), f"unexpected atom_type type: {type(atom_type)} / {atom_type}"
                    fields = atom_type.split(".")
                    symbol = fields[0]
                    symbol = re.sub("[^A-Za-z]","",symbol)
                    atomic_number = get_number(symbol)
                    atomic_numbers.append(atomic_number)
                atomic_numbers = np.asarray(atomic_numbers, dtype=np.int8)
                molecule = Molecule(atomic_numbers, geometry, bonds=bonds.edges)
                file.ensemble.add_molecule(molecule)

        return file


    @classmethod
    def _read_mol2(
        cls, filename, contains_conformers="check", save_memory_for_conformers=True, print_status_messages=False,
    ):
        """
        Reads .mol2 files into cctk.

        Args:
            filename str): the name of the .mol2 file

            contains_conformers('check' or bool): if set to 'check', multiple geometries
                                                in the same file will be compared to see
                                                if they are conformers.  Alternatively,
                                                force the geometries to be treated as
                                                conformers (True) or not (False).  This
                                                latter option increases performance,
                                                particularly for large files.

            print_status_messages (bool): if True, update the progerss of the parsing operation to stdout.

        Returns:
            all_geometries, all_clean_symbols, all_symbols, all_bonds, contains_conformers

            all_geometries: np.ndarray(geometry number, atom number, xyz) -> position (float)
            all_clean_symbols: np.ndarray(geometry number, atom number) -> atom symbol (:obj:`str`)
            all_symbols: np.ndarray(geometry number, atom number) -> atom symbol (:obj:`str`)
            all_bonds: list(geometry_number) -> bond connectivity (:obj:`nx.Graph`)
            contains_conformers: bool (True if the geometries correspond to conformers.)
        """
        # read file
        if print_status_messages:
            print(f"Reading {filename}...", end="", flush=True)
        lines = super().read_file(filename)
        if print_status_messages:
            print(f"read {len(lines)} lines...", end="", flush=True)

        # initialize arrays
        all_geometries = []
        all_symbols = []
        all_clean_symbols = []
        all_bonds = []
        this_geometry = []
        this_symbols = []
        this_clean_symbols = []
        this_bonds = None

        # parse file
        i = 0
        in_geometry_block = False
        in_bond_block = False
        bond_number = 0
        while i < len(lines):
            # read the current line
            line = lines[i]

            # determine if we are in a geometry block
            if line.startswith("@<TRIPOS>ATOM"):
                # step forward to the first geometry line
                in_geometry_block = True
                in_bond_block = False
                i += 1
                line = lines[i]
                if contains_conformers is True and len(all_symbols) > 0:
                    this_symbols = all_symbols[0]
                    this_clean_symbols = all_clean_symbols[0]
            elif line.startswith("@<TRIPOS>BOND"):
                # update status
                in_geometry_block = False
                in_bond_block = True
                bond_number = 0

                # get next line
                i += 1
                line = lines[i]

                # initialize connectivity graph
                if len(this_geometry) == 0:
                    raise ValueError("got to bond table without a geometry")
                if contains_conformers is True and len(all_bonds) > 0:
                    this_bonds = all_bonds[0]
                else:
                    this_bonds = nx.Graph()
                    this_bonds.add_nodes_from(range(1, len(this_geometry) + 1))

            # parse geometry if appropriate
            if in_geometry_block:
                fields = line.split()
                if len(fields) < 6:
                    print("Error parsing file:")
                    print("Line = '%s'" % line.strip())
                    print(fields)
                    break
                x, y, z = float(fields[2]), float(fields[3]), float(fields[4])
                this_geometry.append([x, y, z])
                if contains_conformers is not True or len(all_symbols)==0:
                    symbol = fields[5]
                    clean_symbol = fields[1]
                    this_symbols.append(symbol)
                    this_clean_symbols.append(clean_symbol)
            elif in_bond_block:
                fields = line.split()
                if len(fields) == 4 and (len(all_bonds)==0 or contains_conformers is not True):
                    # parse bonds, checking that the bonds are increasing
                    try:
                        this_bond_number = int(fields[0])
                        atom1 = int(fields[1])
                        atom2 = int(fields[2])
                        n_atoms = len(this_geometry)
                        if not 1 <= atom1 <= n_atoms or not 1 <= atom2 <= n_atoms:
                            raise ValueError(f"atom number out of range: {line}")
                        if fields[3] == "ar":
                            bond_order = 1
                        else:
                            bond_order = int(fields[3])
                        if bond_order <= 0:
                            raise ValueError(f"zero or negative bond order: {line}")
                        if this_bond_number != bond_number + 1:
                            raise ValueError("non-sequential bond number")
                        bond_number = this_bond_number
                        if this_bonds.has_edge(atom1, atom2):
                            current_bond_order = this_bonds[atom1][atom2]["weight"]
                            if current_bond_order != bond_order:
                                raise ValueError(f"inconsistent bond order definition: {line}")
                        this_bonds.add_edge(atom1, atom2, weight=bond_order)
                        this_bonds.add_edge(atom2, atom1, weight=bond_order)
                    except Exception:
                        # assume we have left the bond block
                        in_geometry_block = False
                        in_bond_block = False
                else:
                    # we have left the bond block
                    in_geometry_block = False
                    in_bond_block = False

            # go to next line
            i += 1

            # store geometry and reinitialize if appropriate
            end_of_file = i == len(lines)
            end_of_blocks = not in_geometry_block and not in_bond_block
            if (end_of_file or end_of_blocks) and len(this_geometry) > 0:
                all_geometries.append(this_geometry)
                all_clean_symbols.append(this_clean_symbols)
                all_symbols.append(this_symbols)
                all_bonds.append(this_bonds)
                this_geometry = []
                this_symbols = []
                this_clean_symbols = []
                this_bonds = None

        # convert to numpy array
        all_geometries = np.array(all_geometries)
        all_symbols = np.array(all_symbols)
        all_clean_symbols = np.array(all_clean_symbols)

        # determine if these are conformers
        if contains_conformers == "check":
            contains_conformers = True
            for symbols, bonds in zip(all_symbols[1:], all_bonds[1:]):
                # must have the same symbols and bonds
                if not (all_symbols[0] == symbols).all() or not nx.is_isomorphic(all_bonds[0], bonds):
                    contains_conformers = False
                    break
        elif isinstance(contains_conformers, bool):
            pass
        else:
            raise ValueError("contains_conformers must be 'check' or boolean")

        # return result
        n_geometries = len(all_geometries)
        if print_status_messages:
            if n_geometries > 1:
                if contains_conformers:
                    n_atoms = len(all_geometries[0])
                    n_bonds = all_bonds[0].number_of_edges()
                    if print_status_messages:
                        print(f"read {n_geometries} conformers ({n_atoms} atoms and {n_bonds} bonds).")
                else:
                    min_n_atoms = len(all_geometries[0])
                    max_n_atoms = len(all_geometries[0])
                    for geometry in all_geometries[1:]:
                        if len(geometry) > max_n_atoms:
                            max_n_atoms = len(geometry)
                        elif len(geometry) < min_n_atoms:
                            min_n_atoms = len(geometry)
                    min_n_bonds = all_bonds[0].number_of_edges()
                    max_n_bonds = all_bonds[0].number_of_edges()
                    for bonds in all_bonds[1:]:
                        if bonds.number_of_edges() > max_n_bonds:
                            max_n_bonds = bonds.number_of_edges()
                        elif bonds.number_of_edges() < min_n_bonds:
                            min_n_bonds = bonds.number_of_edges
                    if print_status_messages:
                        print(f"read {n_geometries} unrelated geometries ({min_n_atoms}-{max_n_atoms} atoms and {min_n_bonds}-{max_n_bonds}) bonds).")
            else:
                n_atoms = len(all_geometries)
                n_bonds = all_bonds[0].number_of_edges()
                if print_status_messages:
                    print(f"read one geometry ({n_atoms} atoms and {n_bonds} bonds).")

        return (all_geometries, all_clean_symbols, all_symbols, all_bonds, contains_conformers)


[docs]
    def get_molecule(self, num=None):
        """
        Returns the last molecule from the ensemble.

        If ``num`` is specified, returns ``self.ensemble.molecules[num]``
        """
        # some methods pass num=None, which overrides setting the default above
        if num is None:
            num = -1

        if not isinstance(num, int):
            raise TypeError("num must be int")

        return self.ensemble.molecules[num]



[docs]
    @classmethod
    def write_molecule_to_file(cls, filename, molecule, title=None, append=False):
        """
        Write a ``.gjf`` file using the given molecule.

        Args:
            filename (str): path to the new file
            molecule (Molecule): which molecule to use -- a``Molecule`` object.
            title (str): title of the file
            append (Bool): whether to write to file normally or append
        """
        assert isinstance(molecule, Molecule), "molecule is not a valid Molecule object!"

        text = f"# {title}\n#\n#\n\n#\n#\n\n"
        text += f"@<TRIPOS>MOLECULE\n{title}\n{molecule.num_atoms()} {molecule.bonds.number_of_edges()}\nSMALL\nNO_CHARGES\n\n\n"
        text += "@<TRIPOS>ATOM\n"
        for idx, z in enumerate(molecule.atomic_numbers, start=1):
            v = molecule.get_vector(idx)
            text += f"{idx} {get_symbol(z)}{idx}    {v[0]: .4f}    {v[1]: .4f}    {v[2]: .4f} {get_symbol(z)} 0\n"
        text += "@<TRIPOS>BOND\n"
        count = 1
        for atom1, atom2, weight in molecule.bonds.edges.data("weight", default=1):
            text += f"{count} {atom1} {atom2} {weight}\n"
            count += 1

        if append:
            super().append_to_file(filename, text)
        else:
            super().write_file(filename, text)



[docs]
    def write_file(self, filename, molecule=-1, **kwargs):
        """
        Write a ``.mol2`` file, using object attributes.

        Args:
            filename (str): path to the new file
            molecule (int): which molecule to use -- passed to ``self.get_molecule()``.
                Default is -1 (e.g. the last molecule), but positive integers will select from self.ensemble.molecules (0-indexed).
                A ``Molecule`` object can also be passed, in which case that molecule will be written to the file.
        """
        if molecule is None or isinstance(molecule, (np.integer, int)):
            molecule = self.ensemble.molecules[molecule]
        self.write_molecule_to_file(filename, molecule, **kwargs)



[docs]
    @classmethod
    def write_ensemble_to_file(cls, filename, ensemble):
        """
        Write each structure in the specified ensemble to a single mol2 file.

        Args:
            filename (str): where to write the file
            ensemble (Ensemble): ``Ensemble`` object to write
        """
        for idx, molecule in enumerate(ensemble.molecules):
            if idx == 0:
                cls.write_molecule_to_file(filename, molecule, append=False)
            else:
                cls.write_molecule_to_file(filename, molecule, append=True)