Source code for cctk.mae_file

import re
import numpy as np
import networkx as nx

from cctk import File, Ensemble, ConformationalEnsemble, Molecule
from cctk.helper_functions import get_number


[docs] class MAEFile(File): """ Class representing Maestro ``.mae`` files. Attributes: name (str): name of file ensemble (Ensemble): ``Ensemble`` or ``ConformationalEnsemble`` object """
[docs] def __init__(self, name=None): if isinstance(name, str): self.name = name
[docs] @classmethod def read_file(cls, filename, name=None, **kwargs): """ Reads ``.mae`` file and generates a ``MAEFile`` instance. Args: filename (str): path to file name (str): name of the file Returns: MAEFile object property names (list) property_values (list) """ file = MAEFile(name=name) (geometries, symbols, bonds, p_names, p_vals, conformers) = cls._read_mae(filename, **kwargs) atomic_numbers = np.array([get_number(z) for z in symbols], dtype=np.int8) if conformers is True: file.ensemble = ConformationalEnsemble() else: file.ensemble = Ensemble() for geom in geometries: file.ensemble.add_molecule(Molecule(atomic_numbers, geom, bonds=bonds.edges)) return file, p_names, p_vals
@classmethod def _read_mae( cls, filename, contains_conformers="check", save_memory_for_conformers=True, print_status_messages=False, ): """ Reads uncompressed Macromodel files. Args: filename (str): path to file contains_conformers (str): one of ``check``, ``True``, or ``False`` save_memory_for_conformers (Bool): print_status_messages (Bool): Returns: geometries (np.ndarray): array of 3-tuples of geometries symbols (np.ndarray): array of atom symbols (str) bonds (nx.Graph): ``NetworkX`` graph of bond information property_names: property_values: contains_conformers (Bool): whether or not the file contains conformers """ # read file if print_status_messages: print(f"Reading {filename}...", end="", flush=True) lines = super().read_file(filename) if print_status_messages: print(f"read {len(lines)} lines...", end="", flush=True) # initialize arrays geometries = [] symbols = [] bonds = [] property_names = [] property_values = [] this_geometry = None this_symbols = None this_bonds = None this_property_names = None this_property_values = None # parse file i = 0 current_block_type = None while i < len(lines): # read the current line line = lines[i] i += 1 # determine if we are in a molecule block end_of_file = i + 1 == len(lines) if current_block_type is None and (line.startswith("f_m_ct") or end_of_file): # store the current results if any if this_geometry is not None and len(this_geometry) > 0: geometries.append(this_geometry) symbols.append(this_symbols) bonds.append(this_bonds) property_names.append(this_property_names) property_values.append(this_property_values) # prepare to read a new molecule current_block_type = "property_names" this_geometry = [] this_symbols = [] this_bonds = None this_property_names = [] this_property_values = [] continue # read property names elif current_block_type == "property_names": line = line.strip() if line.startswith("i_m_ct_format"): next_line = lines[i].strip() if next_line != ":::": raise ValueError(f"expected ':::' here but line {i+1} is:\n{next_line}\n") current_block_type = "property_values" i += 1 elif line.startswith(":::"): raise ValueError(f"expected to see i_m_ct_format as the last property (line {i+1})") else: fields = re.split(" +", line) if len(fields) != 1: raise ValueError(f"unexpected number of fields in property name line: {line}") this_property_names.append(line) # read property values elif current_block_type == "property_values": n_properties = len(this_property_names) for j in range(n_properties): this_property_values.append(lines[i + j]) i += n_properties current_block_type = "looking_for_geometry1" # look for geometry block elif current_block_type == "looking_for_geometry1": if line.startswith(" m_atom"): current_block_type = "looking_for_geometry2" elif current_block_type == "looking_for_geometry2": if line.strip() == ":::": current_block_type = "geometry_block" # parse geometry elif current_block_type == "geometry_block": line = line.strip() if line == ":::": current_block_type = "bond_block" # initialize bond connectivity graph this_bonds = nx.Graph() n_atoms = len(this_symbols) this_bonds.add_nodes_from(range(1, n_atoms + 1)) i += 7 else: fields = re.split(" +", line) x, y, z = float(fields[2]), float(fields[3]), float(fields[4]) this_geometry.append((x, y, z)) symbol = fields[-1] this_symbols.append(symbol) # parse bonds elif current_block_type == "bond_block": line = line.strip() if line == ":::": current_block_type = None else: fields = re.split(" +", line) bond_number, atom1, atom2, bond_order = ( int(fields[0]), int(fields[1]), int(fields[2]), int(fields[3]), ) n_atoms = len(this_geometry) if not 1 <= atom1 <= n_atoms or not 1 <= atom2 <= n_atoms: raise ValueError(f"atom number out of range: {line}") bond_order = int(fields[3]) if bond_order <= 0: raise ValueError(f"zero or negative bond order: {line}") if this_bonds.number_of_edges() != bond_number - 1: raise ValueError(f"non-sequential bond number (expected {this_bonds.number_of_edges()+1} but got {bond_number})") if this_bonds.has_edge(atom1, atom2): current_bond_order = this_bonds[atom1][atom2]["weight"] if current_bond_order != bond_order: raise ValueError(f"inconsistent bond order definition: {line}") this_bonds.add_edge(atom1, atom2, weight=bond_order) this_bonds.add_edge(atom2, atom1, weight=bond_order) # convert to numpy array geometries = np.array(geometries) symbols = np.array(symbols) property_names = np.array(property_names) property_values = np.array(property_values) # determine if these are conformers if contains_conformers == "check": contains_conformers = True for this_symbols, this_bonds in zip(symbols[1:], bonds[1:]): # must have the same symbols and bonds if not (symbols[0] == this_symbols).all() or not nx.is_isomorphic(bonds[0], this_bonds): contains_conformers = False break elif isinstance(contains_conformers, bool): pass else: raise ValueError("contains_conformers must be 'check' or boolean") # if requested, just store one copy of symbols and bonds if save_memory_for_conformers and contains_conformers: symbols = symbols[0] bonds = bonds[0] # return result n_geometries = len(geometries) if print_status_messages: if n_geometries > 1: if contains_conformers: n_atoms = len(geometries[0]) n_bonds = bonds.number_of_edges() if print_status_messages: print(f"read {n_geometries} conformers ({n_atoms} atoms and {n_bonds} bonds).") else: min_n_atoms = len(geometries[0]) max_n_atoms = len(geometries[0]) for geometry in geometries[1:]: if len(geometry) > max_n_atoms: max_n_atoms = len(geometry) elif len(geometry) < min_n_atoms: min_n_atoms = len(geometry) min_n_bonds = bonds[0].number_of_edges() max_n_bonds = bonds[0].number_of_edges() for this_bonds in bonds[1:]: if this_bonds.number_of_edges() > max_n_bonds: max_n_bonds = this_bonds.number_of_edges() elif this_bonds.number_of_edges() < min_n_bonds: min_n_bonds = bonds.number_of_edges if print_status_messages: print(f"read {n_geometries} unrelated geometries ({min_n_atoms}-{max_n_atoms} atoms and {min_n_bonds}-{max_n_bonds}) bonds).") else: n_atoms = len(geometries) n_bonds = bonds.number_of_edges() if print_status_messages: print(f"read one geometry ({n_atoms} atoms and {n_bonds} bonds).") # return result return ( geometries, symbols, bonds, property_names, property_values, contains_conformers, )
[docs] def get_molecule(self, num=None): """ Returns the last molecule from the ensemble. If ``num`` is specified, returns ``self.ensemble.molecules[num]`` """ # some methods pass num=None, which overrides setting the default above if num is None: num = -1 if not isinstance(num, int): raise TypeError("num must be int") return self.ensemble.molecules[num]