import re
import numpy as np
import networkx as nx
from cctk import File, Ensemble, ConformationalEnsemble, Molecule
from cctk.helper_functions import get_number
[docs]
class MAEFile(File):
"""
Class representing Maestro ``.mae`` files.
Attributes:
name (str): name of file
ensemble (Ensemble): ``Ensemble`` or ``ConformationalEnsemble`` object
"""
[docs]
def __init__(self, name=None):
if isinstance(name, str):
self.name = name
[docs]
@classmethod
def read_file(cls, filename, name=None, **kwargs):
"""
Reads ``.mae`` file and generates a ``MAEFile`` instance.
Args:
filename (str): path to file
name (str): name of the file
Returns:
MAEFile object
property names (list)
property_values (list)
"""
file = MAEFile(name=name)
(geometries, symbols, bonds, p_names, p_vals, conformers) = cls._read_mae(filename, **kwargs)
atomic_numbers = np.array([get_number(z) for z in symbols], dtype=np.int8)
if conformers is True:
file.ensemble = ConformationalEnsemble()
else:
file.ensemble = Ensemble()
for geom in geometries:
file.ensemble.add_molecule(Molecule(atomic_numbers, geom, bonds=bonds.edges))
return file, p_names, p_vals
@classmethod
def _read_mae(
cls, filename, contains_conformers="check", save_memory_for_conformers=True, print_status_messages=False,
):
"""
Reads uncompressed Macromodel files.
Args:
filename (str): path to file
contains_conformers (str): one of ``check``, ``True``, or ``False``
save_memory_for_conformers (Bool):
print_status_messages (Bool):
Returns:
geometries (np.ndarray): array of 3-tuples of geometries
symbols (np.ndarray): array of atom symbols (str)
bonds (nx.Graph): ``NetworkX`` graph of bond information
property_names:
property_values:
contains_conformers (Bool): whether or not the file contains conformers
"""
# read file
if print_status_messages:
print(f"Reading {filename}...", end="", flush=True)
lines = super().read_file(filename)
if print_status_messages:
print(f"read {len(lines)} lines...", end="", flush=True)
# initialize arrays
geometries = []
symbols = []
bonds = []
property_names = []
property_values = []
this_geometry = None
this_symbols = None
this_bonds = None
this_property_names = None
this_property_values = None
# parse file
i = 0
current_block_type = None
while i < len(lines):
# read the current line
line = lines[i]
i += 1
# determine if we are in a molecule block
end_of_file = i + 1 == len(lines)
if current_block_type is None and (line.startswith("f_m_ct") or end_of_file):
# store the current results if any
if this_geometry is not None and len(this_geometry) > 0:
geometries.append(this_geometry)
symbols.append(this_symbols)
bonds.append(this_bonds)
property_names.append(this_property_names)
property_values.append(this_property_values)
# prepare to read a new molecule
current_block_type = "property_names"
this_geometry = []
this_symbols = []
this_bonds = None
this_property_names = []
this_property_values = []
continue
# read property names
elif current_block_type == "property_names":
line = line.strip()
if line.startswith("i_m_ct_format"):
next_line = lines[i].strip()
if next_line != ":::":
raise ValueError(f"expected ':::' here but line {i+1} is:\n{next_line}\n")
current_block_type = "property_values"
i += 1
elif line.startswith(":::"):
raise ValueError(f"expected to see i_m_ct_format as the last property (line {i+1})")
else:
fields = re.split(" +", line)
if len(fields) != 1:
raise ValueError(f"unexpected number of fields in property name line: {line}")
this_property_names.append(line)
# read property values
elif current_block_type == "property_values":
n_properties = len(this_property_names)
for j in range(n_properties):
this_property_values.append(lines[i + j])
i += n_properties
current_block_type = "looking_for_geometry1"
# look for geometry block
elif current_block_type == "looking_for_geometry1":
if line.startswith(" m_atom"):
current_block_type = "looking_for_geometry2"
elif current_block_type == "looking_for_geometry2":
if line.strip() == ":::":
current_block_type = "geometry_block"
# parse geometry
elif current_block_type == "geometry_block":
line = line.strip()
if line == ":::":
current_block_type = "bond_block"
# initialize bond connectivity graph
this_bonds = nx.Graph()
n_atoms = len(this_symbols)
this_bonds.add_nodes_from(range(1, n_atoms + 1))
i += 7
else:
fields = re.split(" +", line)
x, y, z = float(fields[2]), float(fields[3]), float(fields[4])
this_geometry.append((x, y, z))
symbol = fields[-1]
this_symbols.append(symbol)
# parse bonds
elif current_block_type == "bond_block":
line = line.strip()
if line == ":::":
current_block_type = None
else:
fields = re.split(" +", line)
bond_number, atom1, atom2, bond_order = (
int(fields[0]),
int(fields[1]),
int(fields[2]),
int(fields[3]),
)
n_atoms = len(this_geometry)
if not 1 <= atom1 <= n_atoms or not 1 <= atom2 <= n_atoms:
raise ValueError(f"atom number out of range: {line}")
bond_order = int(fields[3])
if bond_order <= 0:
raise ValueError(f"zero or negative bond order: {line}")
if this_bonds.number_of_edges() != bond_number - 1:
raise ValueError(f"non-sequential bond number (expected {this_bonds.number_of_edges()+1} but got {bond_number})")
if this_bonds.has_edge(atom1, atom2):
current_bond_order = this_bonds[atom1][atom2]["weight"]
if current_bond_order != bond_order:
raise ValueError(f"inconsistent bond order definition: {line}")
this_bonds.add_edge(atom1, atom2, weight=bond_order)
this_bonds.add_edge(atom2, atom1, weight=bond_order)
# convert to numpy array
geometries = np.array(geometries)
symbols = np.array(symbols)
property_names = np.array(property_names)
property_values = np.array(property_values)
# determine if these are conformers
if contains_conformers == "check":
contains_conformers = True
for this_symbols, this_bonds in zip(symbols[1:], bonds[1:]):
# must have the same symbols and bonds
if not (symbols[0] == this_symbols).all() or not nx.is_isomorphic(bonds[0], this_bonds):
contains_conformers = False
break
elif isinstance(contains_conformers, bool):
pass
else:
raise ValueError("contains_conformers must be 'check' or boolean")
# if requested, just store one copy of symbols and bonds
if save_memory_for_conformers and contains_conformers:
symbols = symbols[0]
bonds = bonds[0]
# return result
n_geometries = len(geometries)
if print_status_messages:
if n_geometries > 1:
if contains_conformers:
n_atoms = len(geometries[0])
n_bonds = bonds.number_of_edges()
if print_status_messages:
print(f"read {n_geometries} conformers ({n_atoms} atoms and {n_bonds} bonds).")
else:
min_n_atoms = len(geometries[0])
max_n_atoms = len(geometries[0])
for geometry in geometries[1:]:
if len(geometry) > max_n_atoms:
max_n_atoms = len(geometry)
elif len(geometry) < min_n_atoms:
min_n_atoms = len(geometry)
min_n_bonds = bonds[0].number_of_edges()
max_n_bonds = bonds[0].number_of_edges()
for this_bonds in bonds[1:]:
if this_bonds.number_of_edges() > max_n_bonds:
max_n_bonds = this_bonds.number_of_edges()
elif this_bonds.number_of_edges() < min_n_bonds:
min_n_bonds = bonds.number_of_edges
if print_status_messages:
print(f"read {n_geometries} unrelated geometries ({min_n_atoms}-{max_n_atoms} atoms and {min_n_bonds}-{max_n_bonds}) bonds).")
else:
n_atoms = len(geometries)
n_bonds = bonds.number_of_edges()
if print_status_messages:
print(f"read one geometry ({n_atoms} atoms and {n_bonds} bonds).")
# return result
return (
geometries,
symbols,
bonds,
property_names,
property_values,
contains_conformers,
)
[docs]
def get_molecule(self, num=None):
"""
Returns the last molecule from the ensemble.
If ``num`` is specified, returns ``self.ensemble.molecules[num]``
"""
# some methods pass num=None, which overrides setting the default above
if num is None:
num = -1
if not isinstance(num, int):
raise TypeError("num must be int")
return self.ensemble.molecules[num]