import re
import numpy as np
import networkx as nx
from cctk import File, Ensemble, ConformationalEnsemble, Molecule
from cctk.helper_functions import get_symbol, get_number
[docs]
class MOL2File(File):
"""
Class representing SYBYL ``.mol2`` files.
Attributes:
name (str): name of file
ensemble (Ensemble): ``Ensemble`` or ``ConformationalEnsemble`` object
"""
[docs]
def __init__(self, name=None):
if isinstance(name, str):
self.name = name
[docs]
@classmethod
def read_file(cls, filename, name=None, **kwargs):
"""
Reads ``.mol2`` file and generates a ``MOL2File`` instance.
Args:
filename (str): path to file
name (str): name of the file
Returns:
MOL2File object
"""
file = MOL2File(name=name)
(geometries, all_clean_symbols, all_symbols, all_bonds, conformers) = cls._read_mol2(filename, **kwargs)
assert len(all_bonds) == len(geometries)
for bonds in all_bonds:
assert isinstance(bonds, nx.Graph)
assert len(bonds) == len(geometries[0])
if conformers:
# convert atom types to atomic numbers
atomic_numbers = []
for atom_type in all_symbols[0]:
assert isinstance(atom_type,str), f"unexpected atom_type type: {type(atom_type)} / {atom_type}"
fields = atom_type.split(".")
symbol = fields[0]
symbol = re.sub("[^A-Za-z]","",symbol)
atomic_number = get_number(symbol)
atomic_numbers.append(atomic_number)
atomic_numbers = np.asarray(atomic_numbers, dtype=np.int8)
# create ensemble
file.ensemble = ConformationalEnsemble()
for geometry in geometries:
molecule = Molecule(atomic_numbers, geometry, bonds=all_bonds[0].edges, checks=False)
file.ensemble.add_molecule(molecule, checks=False)
else:
file.ensemble = Ensemble()
for this_symbols,geometry in zip(all_symbols,geometries):
atomic_numbers=[]
for atom_type in this_symbols:
assert isinstance(atom_type,str), f"unexpected atom_type type: {type(atom_type)} / {atom_type}"
fields = atom_type.split(".")
symbol = fields[0]
symbol = re.sub("[^A-Za-z]","",symbol)
atomic_number = get_number(symbol)
atomic_numbers.append(atomic_number)
atomic_numbers = np.asarray(atomic_numbers, dtype=np.int8)
molecule = Molecule(atomic_numbers, geometry, bonds=bonds.edges)
file.ensemble.add_molecule(molecule)
return file
@classmethod
def _read_mol2(
cls, filename, contains_conformers="check", save_memory_for_conformers=True, print_status_messages=False,
):
"""
Reads .mol2 files into cctk.
Args:
filename str): the name of the .mol2 file
contains_conformers('check' or bool): if set to 'check', multiple geometries
in the same file will be compared to see
if they are conformers. Alternatively,
force the geometries to be treated as
conformers (True) or not (False). This
latter option increases performance,
particularly for large files.
print_status_messages (bool): if True, update the progerss of the parsing operation to stdout.
Returns:
all_geometries, all_clean_symbols, all_symbols, all_bonds, contains_conformers
all_geometries: np.ndarray(geometry number, atom number, xyz) -> position (float)
all_clean_symbols: np.ndarray(geometry number, atom number) -> atom symbol (:obj:`str`)
all_symbols: np.ndarray(geometry number, atom number) -> atom symbol (:obj:`str`)
all_bonds: list(geometry_number) -> bond connectivity (:obj:`nx.Graph`)
contains_conformers: bool (True if the geometries correspond to conformers.)
"""
# read file
if print_status_messages:
print(f"Reading {filename}...", end="", flush=True)
lines = super().read_file(filename)
if print_status_messages:
print(f"read {len(lines)} lines...", end="", flush=True)
# initialize arrays
all_geometries = []
all_symbols = []
all_clean_symbols = []
all_bonds = []
this_geometry = []
this_symbols = []
this_clean_symbols = []
this_bonds = None
# parse file
i = 0
in_geometry_block = False
in_bond_block = False
bond_number = 0
while i < len(lines):
# read the current line
line = lines[i]
# determine if we are in a geometry block
if line.startswith("@<TRIPOS>ATOM"):
# step forward to the first geometry line
in_geometry_block = True
in_bond_block = False
i += 1
line = lines[i]
if contains_conformers is True and len(all_symbols) > 0:
this_symbols = all_symbols[0]
this_clean_symbols = all_clean_symbols[0]
elif line.startswith("@<TRIPOS>BOND"):
# update status
in_geometry_block = False
in_bond_block = True
bond_number = 0
# get next line
i += 1
line = lines[i]
# initialize connectivity graph
if len(this_geometry) == 0:
raise ValueError("got to bond table without a geometry")
if contains_conformers is True and len(all_bonds) > 0:
this_bonds = all_bonds[0]
else:
this_bonds = nx.Graph()
this_bonds.add_nodes_from(range(1, len(this_geometry) + 1))
# parse geometry if appropriate
if in_geometry_block:
fields = line.split()
if len(fields) < 6:
print("Error parsing file:")
print("Line = '%s'" % line.strip())
print(fields)
break
x, y, z = float(fields[2]), float(fields[3]), float(fields[4])
this_geometry.append([x, y, z])
if contains_conformers is not True or len(all_symbols)==0:
symbol = fields[5]
clean_symbol = fields[1]
this_symbols.append(symbol)
this_clean_symbols.append(clean_symbol)
elif in_bond_block:
fields = line.split()
if len(fields) == 4 and (len(all_bonds)==0 or contains_conformers is not True):
# parse bonds, checking that the bonds are increasing
try:
this_bond_number = int(fields[0])
atom1 = int(fields[1])
atom2 = int(fields[2])
n_atoms = len(this_geometry)
if not 1 <= atom1 <= n_atoms or not 1 <= atom2 <= n_atoms:
raise ValueError(f"atom number out of range: {line}")
if fields[3] == "ar":
bond_order = 1
else:
bond_order = int(fields[3])
if bond_order <= 0:
raise ValueError(f"zero or negative bond order: {line}")
if this_bond_number != bond_number + 1:
raise ValueError("non-sequential bond number")
bond_number = this_bond_number
if this_bonds.has_edge(atom1, atom2):
current_bond_order = this_bonds[atom1][atom2]["weight"]
if current_bond_order != bond_order:
raise ValueError(f"inconsistent bond order definition: {line}")
this_bonds.add_edge(atom1, atom2, weight=bond_order)
this_bonds.add_edge(atom2, atom1, weight=bond_order)
except Exception:
# assume we have left the bond block
in_geometry_block = False
in_bond_block = False
else:
# we have left the bond block
in_geometry_block = False
in_bond_block = False
# go to next line
i += 1
# store geometry and reinitialize if appropriate
end_of_file = i == len(lines)
end_of_blocks = not in_geometry_block and not in_bond_block
if (end_of_file or end_of_blocks) and len(this_geometry) > 0:
all_geometries.append(this_geometry)
all_clean_symbols.append(this_clean_symbols)
all_symbols.append(this_symbols)
all_bonds.append(this_bonds)
this_geometry = []
this_symbols = []
this_clean_symbols = []
this_bonds = None
# convert to numpy array
all_geometries = np.array(all_geometries)
all_symbols = np.array(all_symbols)
all_clean_symbols = np.array(all_clean_symbols)
# determine if these are conformers
if contains_conformers == "check":
contains_conformers = True
for symbols, bonds in zip(all_symbols[1:], all_bonds[1:]):
# must have the same symbols and bonds
if not (all_symbols[0] == symbols).all() or not nx.is_isomorphic(all_bonds[0], bonds):
contains_conformers = False
break
elif isinstance(contains_conformers, bool):
pass
else:
raise ValueError("contains_conformers must be 'check' or boolean")
# return result
n_geometries = len(all_geometries)
if print_status_messages:
if n_geometries > 1:
if contains_conformers:
n_atoms = len(all_geometries[0])
n_bonds = all_bonds[0].number_of_edges()
if print_status_messages:
print(f"read {n_geometries} conformers ({n_atoms} atoms and {n_bonds} bonds).")
else:
min_n_atoms = len(all_geometries[0])
max_n_atoms = len(all_geometries[0])
for geometry in all_geometries[1:]:
if len(geometry) > max_n_atoms:
max_n_atoms = len(geometry)
elif len(geometry) < min_n_atoms:
min_n_atoms = len(geometry)
min_n_bonds = all_bonds[0].number_of_edges()
max_n_bonds = all_bonds[0].number_of_edges()
for bonds in all_bonds[1:]:
if bonds.number_of_edges() > max_n_bonds:
max_n_bonds = bonds.number_of_edges()
elif bonds.number_of_edges() < min_n_bonds:
min_n_bonds = bonds.number_of_edges
if print_status_messages:
print(f"read {n_geometries} unrelated geometries ({min_n_atoms}-{max_n_atoms} atoms and {min_n_bonds}-{max_n_bonds}) bonds).")
else:
n_atoms = len(all_geometries)
n_bonds = all_bonds[0].number_of_edges()
if print_status_messages:
print(f"read one geometry ({n_atoms} atoms and {n_bonds} bonds).")
return (all_geometries, all_clean_symbols, all_symbols, all_bonds, contains_conformers)
[docs]
def get_molecule(self, num=None):
"""
Returns the last molecule from the ensemble.
If ``num`` is specified, returns ``self.ensemble.molecules[num]``
"""
# some methods pass num=None, which overrides setting the default above
if num is None:
num = -1
if not isinstance(num, int):
raise TypeError("num must be int")
return self.ensemble.molecules[num]
[docs]
@classmethod
def write_molecule_to_file(cls, filename, molecule, title=None, append=False):
"""
Write a ``.gjf`` file using the given molecule.
Args:
filename (str): path to the new file
molecule (Molecule): which molecule to use -- a``Molecule`` object.
title (str): title of the file
append (Bool): whether to write to file normally or append
"""
assert isinstance(molecule, Molecule), "molecule is not a valid Molecule object!"
text = f"# {title}\n#\n#\n\n#\n#\n\n"
text += f"@<TRIPOS>MOLECULE\n{title}\n{molecule.num_atoms()} {molecule.bonds.number_of_edges()}\nSMALL\nNO_CHARGES\n\n\n"
text += "@<TRIPOS>ATOM\n"
for idx, z in enumerate(molecule.atomic_numbers, start=1):
v = molecule.get_vector(idx)
text += f"{idx} {get_symbol(z)}{idx} {v[0]: .4f} {v[1]: .4f} {v[2]: .4f} {get_symbol(z)} 0\n"
text += "@<TRIPOS>BOND\n"
count = 1
for atom1, atom2, weight in molecule.bonds.edges.data("weight", default=1):
text += f"{count} {atom1} {atom2} {weight}\n"
count += 1
if append:
super().append_to_file(filename, text)
else:
super().write_file(filename, text)
[docs]
def write_file(self, filename, molecule=-1, **kwargs):
"""
Write a ``.mol2`` file, using object attributes.
Args:
filename (str): path to the new file
molecule (int): which molecule to use -- passed to ``self.get_molecule()``.
Default is -1 (e.g. the last molecule), but positive integers will select from self.ensemble.molecules (0-indexed).
A ``Molecule`` object can also be passed, in which case that molecule will be written to the file.
"""
if molecule is None or isinstance(molecule, (np.integer, int)):
molecule = self.ensemble.molecules[molecule]
self.write_molecule_to_file(filename, molecule, **kwargs)
[docs]
@classmethod
def write_ensemble_to_file(cls, filename, ensemble):
"""
Write each structure in the specified ensemble to a single mol2 file.
Args:
filename (str): where to write the file
ensemble (Ensemble): ``Ensemble`` object to write
"""
for idx, molecule in enumerate(ensemble.molecules):
if idx == 0:
cls.write_molecule_to_file(filename, molecule, append=False)
else:
cls.write_molecule_to_file(filename, molecule, append=True)