Source code for cctk.ensemble

import numpy as np
from copy import deepcopy

import cctk
from cctk.helper_functions import align_matrices



[docs]
class Ensemble:
    """
    Class representing a collection of molecules. They do not all need to have the same atoms or bonds.

    Ensembles are composed of molecules and properties. Molecules are ``Molecule`` objects, whereas properties are ``dict`` objects containing calculation-specific information.

    There are various shortcuts for handling ``Ensemble`` objects:

    - ``ensemble[molecule]`` or ``ensemble[0]`` will return new ``Ensemble`` objects with only the specified molecules.
        Lists or slices can also be used: so ``ensemble[0:10:2]`` or ``ensemble[[molecule1, molecule2, molecule3]]`` will also return new ``Ensemble`` objects.
    - Individual properties can be read through tuple indexing: ``ensemble[0,"energy"]`` will return the energy of the first molecule,
        while ``ensemble[:,"energy"]`` will return a list of all the energies.
    - To access ``Molecule`` objects, use ``ensemble.molecule``: ``ensemble.molecule[0]`` will return the first object, whereas ``ensemble.molecule[1:3]`` will return a list.
    - ``ensemble.items()`` will return a list of (molecule, property) pairs.
    - ``ensemble.molecule_list()`` and ``ensemble.properties_list()`` return lists of molecules and properties, respectively.

    Attributes:
        name (str): name, for identification
        _items (dict): keys: ``Molecule`` objects; values: dictionaries containing properties from each molecule, variable. should always be one layer deep.
        molecules (``MoleculeIndexer``): special object that accesses the keys
    """


[docs]
    def __init__(self, name=None):
        """
        Create new instance.

        Args:
            name (str): name of Ensemble
        """
        self.name = name
        self._items = {}
        self.molecules = self._MoleculeIndexer(self)


    def __str__(self):
        name = "None" if self.name is None else self.name
        return f"Ensemble (name={name}, {len(self._items)} molecules)"

    def __getitem__(self, key):
        if isinstance(key, (int, np.integer)):
            mol = self.molecule_list()[key]
            prop = self.properties_list()[key]
            new = type(self)(name=self.name) # will return either Ensemble or subclass thereof
            new.add_molecule(mol, properties=prop)
            return new
        elif isinstance(key, cctk.Molecule):
            idx = self.molecule_list().index(key)
            return self[idx]
        elif isinstance(key, (list, np.ndarray)):
            new_list = [self[k] for k in key]
            return self.join_ensembles(new_list, name=self.name)
        elif isinstance(key, slice):
            start, stop, step = key.indices(len(self))
            return self[list(range(start, stop, step))]
        elif isinstance(key, tuple):
            return self.get_property(key[0], key[1])
        elif key is None:
            return self
        else:
            raise KeyError(f"not a valid datatype for Ensemble key: {type(key)}")

    def __setitem__(self, key, item):
        assert isinstance(key, tuple), "need two indexes to set a value in an ensemble!"
        idx = key[0]
        name = key[1]

        if isinstance(idx, slice):
            start, stop, step = idx.indices(len(self))
            self[list(range(start, stop, step)), name] = item
        elif isinstance(idx, (list, np.ndarray)) and isinstance(item, (list, np.ndarray)):
            assert len(idx) == len(item), f"can't set {len(item)} items into {len(key)} variables (cf. pigeonhole principle)"
            for (k, i) in zip(idx, item):
                self[k, name] = i
        elif isinstance(idx, (list, np.ndarray)):
            for k in idx:
                self[k, name] = item
        elif isinstance(idx, (int, np.integer)):
            mol = self.molecule_list()[idx]
            self[mol, name] = item
        elif isinstance(idx, cctk.Molecule):
            if isinstance(name, (list, np.ndarray)):
                for n in name:
                    self[idx,n] = item
            #### we can't assign multiple items to a list of names since that would preclude assigning a list to a single variable
            else:
                self._items[idx][name] = item
        else:
            raise KeyError(f"not a valid datatype for Ensemble index: {type(idx)}")

    def __len__(self):
        return len(self._items)

    def __iter__(self):
        return iter(self.items())


[docs]
    def keys(self):
        return self._items.keys()



[docs]
    def values(self):
        return self._items.values()



[docs]
    def molecule_list(self):
        """
        Returns a list of the constituent molecules.
        """
        return list(self.keys())



[docs]
    def properties_list(self):
        """
        Returns a list of dictionaries. One dictionary per geometry. Each dictionary contains the property names and property values for each geometry in the ensemble. 
        """
        return list(self.values())



[docs]
    def has_property(self, idx, prop):
        """
        Returns ``True`` if property is defined for index ``idx`` and ``False`` otherwise.
        """
        combined = self.combined_properties()
        if prop in combined:
            return True
        else:
            return False



[docs]
    def combined_properties(self):
        """
        Returns a dictionary containing the most up-to-date version of each property.
        """
        combined = dict()
        for p in self.properties_list():
            combined = {**combined, **p}
        return combined



[docs]
    def get_property(self, idx, prop):
        """
        """
        ensemble = self[idx]
        result = []
        for m, p in ensemble.items():
            if isinstance(prop, list):
                row = []
                for x in prop:
                    if x in p:
                        row.append(p[x])
                    else:
                        row.append(None)
                result.append(row)
            else:
                if prop in p:
                    result.append(p[prop])
                else:
                    result.append(None)
        if len(ensemble) == 1:
            if result[0] is None:
                return None
            return result[0]
        else:
            found_something = False
            for x in result:
                if x is not None:
                    found_something = True
                    break
            if found_something:
                return result
            else:
                return None



[docs]
    def get_properties_dict(self, idx):
        """
            Returns the dictionary of molecule properties for the specified molecule.

            Args:
                idx (int or cctk.Molecule): a molecule belonging to this ensemble, either
                                            0-indexed or given explicitly as a Molecule

            Returns:
                the property dict corresponding to this Molecule
        """
        assert isinstance(idx, (int, np.integer, cctk.Molecule)), "index must be int or Molecule"
        ensemble = self[idx]
        assert len(ensemble) == 1, "idx returned too many ensembles"
        return ensemble.properties_list()[0]



[docs]
    def items(self):
        """
        Returns a list of (molecule, properties) tuple pairs.
        """
        return self._items.items()


    # object to allow convenient indexing of the molecules in the ensemble  
    #
    # allowed use cases
    #
    # retrieving molecules:
    # ensemble.molecules[0]: first molecule
    # ensemble.molecules[-1]: last molecule
    # ensemble.molecules[[0,1]]: first two molecules as a list
    # ensemble.molecules[0:4:2]: first and third molecules as a list
    #
    # setting molecule properties this way is not allowed
    class _MoleculeIndexer():
        def __init__(self, ensemble):
            self.ensemble = ensemble

        def __getitem__(self, key):
            items_list = list(self.ensemble._items.keys())
            n_items = len(items_list)
            if isinstance(key, (int, np.integer)):
                self._check_key(key, n_items)
                return items_list[key]
            if isinstance(key, np.ndarray):
                assert len(np.shape(key)) == 1, f"multidimensional keys not allowed, shape was {np.shape(key)}"
            if isinstance(key, (list, np.ndarray)):
                return_list = []
                for k in key:
                    assert isinstance(k, (int, np.integer)), f"key {k} in {str(key)} is not an integer, type is {str(type(k))}"
                    self._check_key(k, n_items)
                    return_list.append(items_list[k])
                return return_list
            elif isinstance(key, slice):
                start, stop, step = key.indices(n_items)
                return [ items_list[i] for i in range(start, stop, step) ]
            else:
                raise ValueError(f"cannot index with type {str(type(key))}")

        def __setitem__(self, key, item):
            raise LookupError("cannot set molecule properties this way; use ensemble.set_property_dict(molecule, property_dict) instead")

        def _check_key(self, key, n_items):
            assert -n_items <= key < n_items, f"key {key} is out of range...must be between {-n_items} and {n_items-1} inclusive"

        def __iter__(self):
            return iter(self.ensemble.molecule_list())


[docs]
    def properties(self, num=None):
        """
        Returns a list of the constituent properties.
        """
        if num is None:
            return list(self.values())
        else:
            assert isinstance(num, int), "num must be integer"
            return list(self.values())[num]



[docs]
    def sort_by(self, property_name, ascending=True):
        """
        Sorts the ensemble by the specified property.
        Throws an error if the property is missing for any entries.
        Consistent, sort-compatible property values are assumed and not checked.

        Args:
            property_name (str): the name of the property to sort on (must be a string or number)
            ascending (bool): whether the property should increase or decrease in value
        Returns:
            new Ensemble (current ensemble is not modified)
        """
        property_list = self[:,property_name]
        if property_list is None:
            raise ValueError(f"property '{property_name}' not found in ensemble")
        property_list = np.asarray(property_list)
        n_missing_entries = np.count_nonzero(property_list==None) # noqa
        if n_missing_entries > 0:
            error = "---sorting error---\n"
            error += str(property_list)
            raise ValueError(f"{error}\nproperty '{property_name}' has {n_missing_entries} missing entries and cannot be sorted")
        new_indices = np.argsort(property_list)
        if not ascending:
            new_indices = np.flip(new_indices)
        return self[[new_indices]]



[docs]
    def add_molecule(self, molecule, properties=None, copy=False):
        """
        Adds a molecule to the ensemble.

        Args:
            molecule (Molecule): the molecule to be added
            properties (dict): property name (str) to property value
            copy (bool): whether to store an independent copy of the molecule
        """
        if not isinstance(molecule, cctk.Molecule):
            raise TypeError("molecule is not a Molecule - so it can't be added!")

        if copy:
            molecule = deepcopy(molecule)

        if properties is None:
            #### empty dicts all point to the same memory address by default, so need to prevent that behavior by initializing non-empty dict
            properties = {"placeholder": 1}
            del properties["placeholder"]

        assert isinstance(properties, dict), f"properties must be a dict and not type {type(properties)}"

        self._items[molecule] = properties


    def _check_molecule_number(self, number):
        """
        Helper method which performs quick checks on the validity of a given molecule number.
        """
        try:
            number = int(number)
        except Exception:
            raise TypeError(f"atom number {number} must be integer")

        if number >= len(self._items):
            raise ValueError(f"atom number {number} too large!")


[docs]
    @classmethod
    def join_ensembles(cls, ensembles, name=None):
        """
        Creates a new Ensemble object from existing ensembles.

        If every ensemble has energies defined, then the new ensemble will have energies defined too.

        Args:
            name (str): name of Ensemble created
            ensembles (list of Ensembles): Ensemble objects to join
        """
        new_ensemble = Ensemble(name=name)
        for ensemble in ensembles:
            assert isinstance(ensemble, Ensemble), "can't join an object that isn't an Ensemble!"

        for ensemble in ensembles:
            new_ensemble._items.update(ensemble.items())

        return new_ensemble



[docs]
    def lowest_molecules(self, property_name, num=1):
        """
        Retrieves the molecules with the lowest values of the specified property.

        Args:
            property_name (str): the name of the property to sort on
            num (int): how many molecules to return
        Returns:
            lowest ``Molecule`` (if num==1)
            ``list`` of ``Molecule`` (otherwise)
        """
        assert isinstance(num, (int, np.integer)), f"num must be an integer, got {type(num)}"
        assert num > 0, f"num must be > 0, got {num}"
        sorted_ensemble = self.sort_by(property_name)
        if num > 1:
            return sorted_ensemble.molecules[0:num]
        return sorted_ensemble.molecules[0]




[docs]
class ConformationalEnsemble(Ensemble):
    """
    Class that representing a group of conformers. All members must have the same atom types in the same order.
    """

    def __str__(self):
        n_atoms = 0
        if len(self._items) > 0:
            first_molecule = self.molecule_list()[0]
            n_atoms = first_molecule.num_atoms()
        if self.name is not None:
            return f"ConformationalEnsemble (name={self.name}, {len(self._items)} molecules, {n_atoms} atoms)"
        else:
            return f"ConformationalEnsemble ({len(self._items)} molecules, {n_atoms} atoms)"


[docs]
    def add_molecule(self, molecule, properties=None, copy=False, checks=True):
        """
        Checks that the molecule contains the same atom types in the same order as existing molecules, and that the molecule has the same charge/multiplicity.
        """
        if len(self._items) > 0:
            initial_mol = self.molecule_list()[0]
            if molecule.num_atoms() != initial_mol.num_atoms():
                raise ValueError("wrong number of atoms for this ensemble")

            if molecule.charge != initial_mol.charge:
                raise ValueError("wrong charge for this ensemble")

            if molecule.multiplicity != initial_mol.multiplicity:
                raise ValueError("wrong spin multiplicity for this ensemble")

            if checks and not np.array_equal(molecule.atomic_numbers, initial_mol.atomic_numbers):
                raise ValueError("wrong atom types for this ensemble")

            #### only save one copy to save space
            molecule.bonds = initial_mol.bonds
            molecule.atomic_numbers = initial_mol.atomic_numbers

        super().add_molecule(molecule, properties, copy)



[docs]
    @classmethod
    def join_ensembles(cls, ensembles, name=None, copy=False):
        """
        Creates a new ConformationalEnsemble object from existing ensembles.
        Both molecules and properties are copied.

        Args:
            name (str): name of ConformationalEnsemble created
            ensembles (list of ConformationalEnsembles): ConformationalEnsemble objects to join
            copy (bool): whether to make copies of the component molecules
        """
        new_ensemble = ConformationalEnsemble(name=name)
        for ensemble in ensembles:
            assert isinstance(ensemble, ConformationalEnsemble), "can't join an object that isn't a ConformationalEnsemble!"

        for ensemble in ensembles:
            for mol, prop in ensemble.items():
                new_ensemble.add_molecule(mol, prop, copy)

        return new_ensemble



[docs]
    def align(self, to_geometry=0, comparison_atoms="heavy", compute_RMSD=False):
        """
        Aligns every geometry in this ensemble to the specified geometry,
        optionally computing the root-mean-square distance between each
        geometry and the reference geometry.

        Alignments are based on `atom_numbers`.
        The current ensemble will not be altered.  RMSDs will be calculated over the
        comparison atoms only.

        Args:
            to_geometry (int): the reference geometry to align to (0-indexed)
            comparison_atoms (str or list): which atoms to use when computing alignments
                                            "heavy" for all non-hydrogen atoms,
                                            "all" for all atoms, or
                                            a list of 1-indexed atom numbers
            compute_RMSD (Bool): whether to return RMSD before and after rotation

        Returns:
            new aligned ``ConformationalEnsemble`` or
            new aligned ``ConformationalEnsemble``, before_RMSD array, after_RMSD array
        """
        # check inputs
        self._check_molecule_number(to_geometry)
        n_atoms = self.molecules[0].num_atoms()

        if isinstance(comparison_atoms, str):
            if comparison_atoms == "all":
                comparison_atoms = np.arange(1, n_atoms + 1)
            elif comparison_atoms == "heavy":
                comparison_atoms = self.molecules[0].get_heavy_atoms()
        assert isinstance(comparison_atoms, (list, np.ndarray, cctk.OneIndexedArray)), f"unexpected type for comparison_atoms: {str(type(comparison_atoms))}"
        for a in comparison_atoms:
            assert 1 <= a <= n_atoms, f"atom number out of range: got {a}, but must be between 1 and {n_atoms}"

        assert len(comparison_atoms) >= 3, f"need at least 3 atoms for alignment, but only got {len(comparison_atoms)}"

        # duplicate the ensemble
        new_ensemble = deepcopy(self)

        # translate all molecules to the origin
        # with respect to the comparison atoms
        for molecule, _ in new_ensemble:
            full_geometry = molecule.geometry
            partial_geometry = full_geometry[comparison_atoms]
            translation_vector = -partial_geometry.mean(axis=0)
            molecule.translate_molecule(translation_vector)

        full_template_geometry = new_ensemble.molecules[to_geometry].geometry
        partial_template_geometry = full_template_geometry[comparison_atoms]
        before_RMSDs = []
        after_RMSDs = []

        # perform alignment using Kabsch algorithm
        for i, (molecule, _) in enumerate(new_ensemble):
            full_geometry = molecule.geometry
            partial_geometry = full_geometry[comparison_atoms]
            if compute_RMSD:
                before_RMSD = cctk.helper_functions.compute_RMSD(partial_template_geometry, partial_geometry)
                before_RMSDs.append(before_RMSD)
            new_geometry = align_matrices(partial_geometry, full_geometry, partial_template_geometry)
            molecule.geometry = new_geometry
            if compute_RMSD:
                partial_geometry = new_geometry[comparison_atoms]
                after_RMSD = cctk.helper_functions.compute_RMSD(partial_template_geometry, partial_geometry)
                after_RMSDs.append(after_RMSD)
            assert len(molecule.geometry) == n_atoms, f"wrong number of geometry elements! expected {n_atoms}, got {len(molecule.geometry)}"

        if compute_RMSD:
            return new_ensemble, before_RMSDs, after_RMSDs
        return new_ensemble



[docs]
    def eliminate_redundant(self, RMSD_cutoff=0.5, comparison_atoms="heavy", return_RMSD=False):
        """
        Aligns every geometry in this ensemble and then creates a new ensemble that contains only the non-redundant conformers.
        If energies are available, the lowest energy conformer will be kept for every redundancy.
        The current ensemble will not be modified.  The resulting ensemble will be sorted by energy (if available).

        Args:
            RMSD_cutoff (float): remove conformers that are more similar than this threshold
            to_geometry (int): the reference geometry to align to (0-indexed)
            comparison_atoms (str or list): which atoms to use when computing alignments
                                            "heavy" for all non-hydrogen atoms,
                                            "all" for all atoms, or
                                            a list of 1-indexed atom numbers
            return_RMSD (bool): whether or not to return list of RMSD values

        Returns:
            new ``ConformationalEnsemble``, RMSDs to the reference geometry
        """
        # check inputs
        n_atoms = self.molecules[0].num_atoms()
        if isinstance(comparison_atoms, str):
            if comparison_atoms == "all":
                comparison_atoms = np.arange(1, n_atoms + 1)
            elif comparison_atoms == "heavy":
                comparison_atoms = self.molecules[0].get_heavy_atoms()

        assert isinstance(comparison_atoms, (list, np.ndarray, cctk.OneIndexedArray)), f"unexpected type for comparison_atoms: {str(type(comparison_atoms))}"
        for a in comparison_atoms:
            assert 1 <= a <= n_atoms, f"atom number out of range: got {a}, but must be between 1 and {n_atoms}"
        assert len(comparison_atoms) >= 3, f"need at least 3 atoms for alignment, but only got {len(comparison_atoms)}"

        assert isinstance(RMSD_cutoff, (float, int)), f"RMSD cutoff must be a float but got {str(type(RMSD_cutoff))}"
        assert RMSD_cutoff > 0.0001, "must use a big enough RMSD cutoff"

        # align all molecules
        old_ensemble = self.align(to_geometry=0, comparison_atoms=comparison_atoms, compute_RMSD=False)

        # sort molecules by energy if available
        energies_available = True
        for molecule,properties in old_ensemble.items():
            if "energy" not in properties:
                energies_available = False
                break

        n_molecules = len(old_ensemble)
        sorted_indices = list(range(n_molecules))
        if energies_available:
            energies = old_ensemble[:,"energy"]
            sorted_indices = list(np.argsort(energies))

        # boolean indexing noticeably faster
        idxs = np.array(comparison_atoms)
        mask = np.zeros(old_ensemble.molecules[0].geometry.shape[0], dtype=bool)
        mask[idxs - 1] = True

        partial_geoms = [m.geometry[mask] for m in old_ensemble.molecules]
        new_partial_geoms = []

        rmsds = list()

        # add molecules one by one
        new_ensemble = ConformationalEnsemble()
        for i in sorted_indices:
            ok_to_add = True

            candidate_rmsd = 0
            for existing_molecule in new_partial_geoms:
                candidate_rmsd = cctk.helper_functions.compute_RMSD(partial_geoms[i], existing_molecule, checks=False)
                if candidate_rmsd < RMSD_cutoff:
                    ok_to_add = False
                    break

            if ok_to_add:
                candidate_molecule = old_ensemble.molecules[i]
                candidate_molecule_properties = old_ensemble.get_properties_dict(candidate_molecule)

                new_ensemble.add_molecule(candidate_molecule, candidate_molecule_properties)
                new_partial_geoms.append(candidate_molecule.geometry[mask])
                rmsds.append(candidate_rmsd)

        if return_RMSD:
            return new_ensemble, rmsds
        else:
            return new_ensemble



[docs]
    def get_geometric_parameters(self, parameter, atom1, atom2, atom3=None, atom4=None):
        """
        Computes and outputs geometric parameters (bond distances, angles, or dihedral angles) for every member of ``self.molecules.``

        Args:
            parameter (str): one of ``angle``, ``distance``, or ``dihedral``
            atom1 (int): number of the atom in question
            atom2 (int): same, but for the second atom
            atom3 (int): same, but for the third atom (only required for parameter ``angle`` or ``dihedral``)
            atom4 (int): same, but for the fourth atom (only required for parameter ``dihedral``)

        Returns:
            a list of the specified parameter's values for each geometry
        """
        output = [None] * len(self)
        for index, molecule in enumerate(self.molecule_list()):
            if parameter == "distance":
                output[index] = molecule.get_distance(atom1, atom2)
            elif parameter == "angle":
                if atom3 is None:
                    raise ValueError("need atom3 to calculate angle!")
                output[index] = molecule.get_angle(atom1, atom2, atom3)
            elif parameter == "dihedral":
                if (atom3 is None) or (atom4 is None):
                    raise ValueError("need atom3 and atom4 to calculate dihedral!")
                output[index] = molecule.get_dihedral(atom1, atom2, atom3, atom4)
            else:
                raise ValueError(f"Invalid parameter {parameter}!")

        return output



[docs]
    def assign_connectivity(self, index=0):
        """
        Assigns connectivity for all molecules based on molecule of index ``index``. Much faster than assigning connectivity for each individually -- but assumes all bonding is the same.
        """
        assert isinstance(index, int), "Need integer index"
        bonds = self.molecules[index].assign_connectivity().bonds

        for mol in self.molecules:
            mol.bonds = bonds

        return self



[docs]
    def boltzmann_average(self, which, energies=None, temp=298, energy_unit="hartree", return_weights=False):
        """
        Computes the Boltzmann-weighted average of a property over the whole ensemble.

        Args:
            which (str): which property to compute
            energy (np.ndarray): list of energies to use for weighting.
                Will default to ``self[:,"energy"]``, although other strings can be passed as well as shorthand for ``self[:,energy]``.
            temp (float): temperature for Boltzmann-weighting, in K
            energy_unit (str): either ``kcal_mol`` or ``hartree``
            return_weights (bool): whether to return a list of weights too

        Returns:
            weighted property, of the same shape as the individual property
        """
        if energies is None:
            energies = self[:,"energy"]
        elif isinstance(energies, str):
            energies = self[:,energies]
        elif isinstance(energies, (list, np.ndarray, cctk.OneIndexedArray)):
            pass
        else:
            raise ValueError(f"invalid energy value {energies} (type {type(energies)})")

        for i, (m, pd) in enumerate(self.items()):
            assert which in pd, f"molecule #{i} doesn't have property {which} defined!"

        values = np.array(self[:,which], dtype=np.float64)
        energies = np.array(energies, dtype=np.float64)

        assert len(energies) == len(self)
        assert len(values) == len(self)
        assert all([e is not None for e in energies]), "energy not defined for all molecules"
        assert all([v is not None for v in values]), f"property {which} not defined for all molecules"

        # perhaps at some point we will need a real unit system like simtk/OpenMM, but not today!
        if energy_unit == "kcal_mol":
            energies = energies / 627.509
        energies = energies - np.min(energies)

        R = 3.1668105e-6 # eH/K

        weights = np.exp(-1*energies/(R*temp))
        weights = weights / np.sum(weights)

        try:
            weighted_value = np.average(values, weights=weights)
        except Exception as e:
            raise ValueError(f"error computing Boltzmann average: {e}")

        if return_weights:
            return weighted_value, weights
        else:
            return weighted_value