Source code for cctk.ensemble
import numpy as np
from copy import deepcopy
import cctk
from cctk.helper_functions import align_matrices
[docs]
class Ensemble:
"""
Class representing a collection of molecules. They do not all need to have the same atoms or bonds.
Ensembles are composed of molecules and properties. Molecules are ``Molecule`` objects, whereas properties are ``dict`` objects containing calculation-specific information.
There are various shortcuts for handling ``Ensemble`` objects:
- ``ensemble[molecule]`` or ``ensemble[0]`` will return new ``Ensemble`` objects with only the specified molecules.
Lists or slices can also be used: so ``ensemble[0:10:2]`` or ``ensemble[[molecule1, molecule2, molecule3]]`` will also return new ``Ensemble`` objects.
- Individual properties can be read through tuple indexing: ``ensemble[0,"energy"]`` will return the energy of the first molecule,
while ``ensemble[:,"energy"]`` will return a list of all the energies.
- To access ``Molecule`` objects, use ``ensemble.molecule``: ``ensemble.molecule[0]`` will return the first object, whereas ``ensemble.molecule[1:3]`` will return a list.
- ``ensemble.items()`` will return a list of (molecule, property) pairs.
- ``ensemble.molecule_list()`` and ``ensemble.properties_list()`` return lists of molecules and properties, respectively.
Attributes:
name (str): name, for identification
_items (dict): keys: ``Molecule`` objects; values: dictionaries containing properties from each molecule, variable. should always be one layer deep.
molecules (``MoleculeIndexer``): special object that accesses the keys
"""
[docs]
def __init__(self, name=None):
"""
Create new instance.
Args:
name (str): name of Ensemble
"""
self.name = name
self._items = {}
self.molecules = self._MoleculeIndexer(self)
def __str__(self):
name = "None" if self.name is None else self.name
return f"Ensemble (name={name}, {len(self._items)} molecules)"
def __getitem__(self, key):
if isinstance(key, (int, np.integer)):
mol = self.molecule_list()[key]
prop = self.properties_list()[key]
new = type(self)(name=self.name) # will return either Ensemble or subclass thereof
new.add_molecule(mol, properties=prop)
return new
elif isinstance(key, cctk.Molecule):
idx = self.molecule_list().index(key)
return self[idx]
elif isinstance(key, (list, np.ndarray)):
new_list = [self[k] for k in key]
return self.join_ensembles(new_list, name=self.name)
elif isinstance(key, slice):
start, stop, step = key.indices(len(self))
return self[list(range(start, stop, step))]
elif isinstance(key, tuple):
return self.get_property(key[0], key[1])
elif key is None:
return self
else:
raise KeyError(f"not a valid datatype for Ensemble key: {type(key)}")
def __setitem__(self, key, item):
assert isinstance(key, tuple), "need two indexes to set a value in an ensemble!"
idx = key[0]
name = key[1]
if isinstance(idx, slice):
start, stop, step = idx.indices(len(self))
self[list(range(start, stop, step)), name] = item
elif isinstance(idx, (list, np.ndarray)) and isinstance(item, (list, np.ndarray)):
assert len(idx) == len(item), f"can't set {len(item)} items into {len(key)} variables (cf. pigeonhole principle)"
for (k, i) in zip(idx, item):
self[k, name] = i
elif isinstance(idx, (list, np.ndarray)):
for k in idx:
self[k, name] = item
elif isinstance(idx, (int, np.integer)):
mol = self.molecule_list()[idx]
self[mol, name] = item
elif isinstance(idx, cctk.Molecule):
if isinstance(name, (list, np.ndarray)):
for n in name:
self[idx,n] = item
#### we can't assign multiple items to a list of names since that would preclude assigning a list to a single variable
else:
self._items[idx][name] = item
else:
raise KeyError(f"not a valid datatype for Ensemble index: {type(idx)}")
def __len__(self):
return len(self._items)
def __iter__(self):
return iter(self.items())
[docs]
def molecule_list(self):
"""
Returns a list of the constituent molecules.
"""
return list(self.keys())
[docs]
def properties_list(self):
"""
Returns a list of dictionaries. One dictionary per geometry. Each dictionary contains the property names and property values for each geometry in the ensemble.
"""
return list(self.values())
[docs]
def has_property(self, idx, prop):
"""
Returns ``True`` if property is defined for index ``idx`` and ``False`` otherwise.
"""
combined = self.combined_properties()
if prop in combined:
return True
else:
return False
[docs]
def combined_properties(self):
"""
Returns a dictionary containing the most up-to-date version of each property.
"""
combined = dict()
for p in self.properties_list():
combined = {**combined, **p}
return combined
[docs]
def get_property(self, idx, prop):
"""
"""
ensemble = self[idx]
result = []
for m, p in ensemble.items():
if isinstance(prop, list):
row = []
for x in prop:
if x in p:
row.append(p[x])
else:
row.append(None)
result.append(row)
else:
if prop in p:
result.append(p[prop])
else:
result.append(None)
if len(ensemble) == 1:
if result[0] is None:
return None
return result[0]
else:
found_something = False
for x in result:
if x is not None:
found_something = True
break
if found_something:
return result
else:
return None
[docs]
def get_properties_dict(self, idx):
"""
Returns the dictionary of molecule properties for the specified molecule.
Args:
idx (int or cctk.Molecule): a molecule belonging to this ensemble, either
0-indexed or given explicitly as a Molecule
Returns:
the property dict corresponding to this Molecule
"""
assert isinstance(idx, (int, np.integer, cctk.Molecule)), "index must be int or Molecule"
ensemble = self[idx]
assert len(ensemble) == 1, "idx returned too many ensembles"
return ensemble.properties_list()[0]
[docs]
def items(self):
"""
Returns a list of (molecule, properties) tuple pairs.
"""
return self._items.items()
# object to allow convenient indexing of the molecules in the ensemble
#
# allowed use cases
#
# retrieving molecules:
# ensemble.molecules[0]: first molecule
# ensemble.molecules[-1]: last molecule
# ensemble.molecules[[0,1]]: first two molecules as a list
# ensemble.molecules[0:4:2]: first and third molecules as a list
#
# setting molecule properties this way is not allowed
class _MoleculeIndexer():
def __init__(self, ensemble):
self.ensemble = ensemble
def __getitem__(self, key):
items_list = list(self.ensemble._items.keys())
n_items = len(items_list)
if isinstance(key, (int, np.integer)):
self._check_key(key, n_items)
return items_list[key]
if isinstance(key, np.ndarray):
assert len(np.shape(key)) == 1, f"multidimensional keys not allowed, shape was {np.shape(key)}"
if isinstance(key, (list, np.ndarray)):
return_list = []
for k in key:
assert isinstance(k, (int, np.integer)), f"key {k} in {str(key)} is not an integer, type is {str(type(k))}"
self._check_key(k, n_items)
return_list.append(items_list[k])
return return_list
elif isinstance(key, slice):
start, stop, step = key.indices(n_items)
return [ items_list[i] for i in range(start, stop, step) ]
else:
raise ValueError(f"cannot index with type {str(type(key))}")
def __setitem__(self, key, item):
raise LookupError("cannot set molecule properties this way; use ensemble.set_property_dict(molecule, property_dict) instead")
def _check_key(self, key, n_items):
assert -n_items <= key < n_items, f"key {key} is out of range...must be between {-n_items} and {n_items-1} inclusive"
def __iter__(self):
return iter(self.ensemble.molecule_list())
[docs]
def properties(self, num=None):
"""
Returns a list of the constituent properties.
"""
if num is None:
return list(self.values())
else:
assert isinstance(num, int), "num must be integer"
return list(self.values())[num]
[docs]
def sort_by(self, property_name, ascending=True):
"""
Sorts the ensemble by the specified property.
Throws an error if the property is missing for any entries.
Consistent, sort-compatible property values are assumed and not checked.
Args:
property_name (str): the name of the property to sort on (must be a string or number)
ascending (bool): whether the property should increase or decrease in value
Returns:
new Ensemble (current ensemble is not modified)
"""
property_list = self[:,property_name]
if property_list is None:
raise ValueError(f"property '{property_name}' not found in ensemble")
property_list = np.asarray(property_list)
n_missing_entries = np.count_nonzero(property_list==None) # noqa
if n_missing_entries > 0:
error = "---sorting error---\n"
error += str(property_list)
raise ValueError(f"{error}\nproperty '{property_name}' has {n_missing_entries} missing entries and cannot be sorted")
new_indices = np.argsort(property_list)
if not ascending:
new_indices = np.flip(new_indices)
return self[[new_indices]]
[docs]
def add_molecule(self, molecule, properties=None, copy=False):
"""
Adds a molecule to the ensemble.
Args:
molecule (Molecule): the molecule to be added
properties (dict): property name (str) to property value
copy (bool): whether to store an independent copy of the molecule
"""
if not isinstance(molecule, cctk.Molecule):
raise TypeError("molecule is not a Molecule - so it can't be added!")
if copy:
molecule = deepcopy(molecule)
if properties is None:
#### empty dicts all point to the same memory address by default, so need to prevent that behavior by initializing non-empty dict
properties = {"placeholder": 1}
del properties["placeholder"]
assert isinstance(properties, dict), f"properties must be a dict and not type {type(properties)}"
self._items[molecule] = properties
def _check_molecule_number(self, number):
"""
Helper method which performs quick checks on the validity of a given molecule number.
"""
try:
number = int(number)
except Exception:
raise TypeError(f"atom number {number} must be integer")
if number >= len(self._items):
raise ValueError(f"atom number {number} too large!")
[docs]
@classmethod
def join_ensembles(cls, ensembles, name=None):
"""
Creates a new Ensemble object from existing ensembles.
If every ensemble has energies defined, then the new ensemble will have energies defined too.
Args:
name (str): name of Ensemble created
ensembles (list of Ensembles): Ensemble objects to join
"""
new_ensemble = Ensemble(name=name)
for ensemble in ensembles:
assert isinstance(ensemble, Ensemble), "can't join an object that isn't an Ensemble!"
for ensemble in ensembles:
new_ensemble._items.update(ensemble.items())
return new_ensemble
[docs]
def lowest_molecules(self, property_name, num=1):
"""
Retrieves the molecules with the lowest values of the specified property.
Args:
property_name (str): the name of the property to sort on
num (int): how many molecules to return
Returns:
lowest ``Molecule`` (if num==1)
``list`` of ``Molecule`` (otherwise)
"""
assert isinstance(num, (int, np.integer)), f"num must be an integer, got {type(num)}"
assert num > 0, f"num must be > 0, got {num}"
sorted_ensemble = self.sort_by(property_name)
if num > 1:
return sorted_ensemble.molecules[0:num]
return sorted_ensemble.molecules[0]
[docs]
class ConformationalEnsemble(Ensemble):
"""
Class that representing a group of conformers. All members must have the same atom types in the same order.
"""
def __str__(self):
n_atoms = 0
if len(self._items) > 0:
first_molecule = self.molecule_list()[0]
n_atoms = first_molecule.num_atoms()
if self.name is not None:
return f"ConformationalEnsemble (name={self.name}, {len(self._items)} molecules, {n_atoms} atoms)"
else:
return f"ConformationalEnsemble ({len(self._items)} molecules, {n_atoms} atoms)"
[docs]
def add_molecule(self, molecule, properties=None, copy=False, checks=True):
"""
Checks that the molecule contains the same atom types in the same order as existing molecules, and that the molecule has the same charge/multiplicity.
"""
if len(self._items) > 0:
initial_mol = self.molecule_list()[0]
if molecule.num_atoms() != initial_mol.num_atoms():
raise ValueError("wrong number of atoms for this ensemble")
if molecule.charge != initial_mol.charge:
raise ValueError("wrong charge for this ensemble")
if molecule.multiplicity != initial_mol.multiplicity:
raise ValueError("wrong spin multiplicity for this ensemble")
if checks and not np.array_equal(molecule.atomic_numbers, initial_mol.atomic_numbers):
raise ValueError("wrong atom types for this ensemble")
#### only save one copy to save space
molecule.bonds = initial_mol.bonds
molecule.atomic_numbers = initial_mol.atomic_numbers
super().add_molecule(molecule, properties, copy)
[docs]
@classmethod
def join_ensembles(cls, ensembles, name=None, copy=False):
"""
Creates a new ConformationalEnsemble object from existing ensembles.
Both molecules and properties are copied.
Args:
name (str): name of ConformationalEnsemble created
ensembles (list of ConformationalEnsembles): ConformationalEnsemble objects to join
copy (bool): whether to make copies of the component molecules
"""
new_ensemble = ConformationalEnsemble(name=name)
for ensemble in ensembles:
assert isinstance(ensemble, ConformationalEnsemble), "can't join an object that isn't a ConformationalEnsemble!"
for ensemble in ensembles:
for mol, prop in ensemble.items():
new_ensemble.add_molecule(mol, prop, copy)
return new_ensemble
[docs]
def align(self, to_geometry=0, comparison_atoms="heavy", compute_RMSD=False):
"""
Aligns every geometry in this ensemble to the specified geometry,
optionally computing the root-mean-square distance between each
geometry and the reference geometry.
Alignments are based on `atom_numbers`.
The current ensemble will not be altered. RMSDs will be calculated over the
comparison atoms only.
Args:
to_geometry (int): the reference geometry to align to (0-indexed)
comparison_atoms (str or list): which atoms to use when computing alignments
"heavy" for all non-hydrogen atoms,
"all" for all atoms, or
a list of 1-indexed atom numbers
compute_RMSD (Bool): whether to return RMSD before and after rotation
Returns:
new aligned ``ConformationalEnsemble`` or
new aligned ``ConformationalEnsemble``, before_RMSD array, after_RMSD array
"""
# check inputs
self._check_molecule_number(to_geometry)
n_atoms = self.molecules[0].num_atoms()
if isinstance(comparison_atoms, str):
if comparison_atoms == "all":
comparison_atoms = np.arange(1, n_atoms + 1)
elif comparison_atoms == "heavy":
comparison_atoms = self.molecules[0].get_heavy_atoms()
assert isinstance(comparison_atoms, (list, np.ndarray, cctk.OneIndexedArray)), f"unexpected type for comparison_atoms: {str(type(comparison_atoms))}"
for a in comparison_atoms:
assert 1 <= a <= n_atoms, f"atom number out of range: got {a}, but must be between 1 and {n_atoms}"
assert len(comparison_atoms) >= 3, f"need at least 3 atoms for alignment, but only got {len(comparison_atoms)}"
# duplicate the ensemble
new_ensemble = deepcopy(self)
# translate all molecules to the origin
# with respect to the comparison atoms
for molecule, _ in new_ensemble:
full_geometry = molecule.geometry
partial_geometry = full_geometry[comparison_atoms]
translation_vector = -partial_geometry.mean(axis=0)
molecule.translate_molecule(translation_vector)
full_template_geometry = new_ensemble.molecules[to_geometry].geometry
partial_template_geometry = full_template_geometry[comparison_atoms]
before_RMSDs = []
after_RMSDs = []
# perform alignment using Kabsch algorithm
for i, (molecule, _) in enumerate(new_ensemble):
full_geometry = molecule.geometry
partial_geometry = full_geometry[comparison_atoms]
if compute_RMSD:
before_RMSD = cctk.helper_functions.compute_RMSD(partial_template_geometry, partial_geometry)
before_RMSDs.append(before_RMSD)
new_geometry = align_matrices(partial_geometry, full_geometry, partial_template_geometry)
molecule.geometry = new_geometry
if compute_RMSD:
partial_geometry = new_geometry[comparison_atoms]
after_RMSD = cctk.helper_functions.compute_RMSD(partial_template_geometry, partial_geometry)
after_RMSDs.append(after_RMSD)
assert len(molecule.geometry) == n_atoms, f"wrong number of geometry elements! expected {n_atoms}, got {len(molecule.geometry)}"
if compute_RMSD:
return new_ensemble, before_RMSDs, after_RMSDs
return new_ensemble
[docs]
def eliminate_redundant(self, RMSD_cutoff=0.5, comparison_atoms="heavy", return_RMSD=False):
"""
Aligns every geometry in this ensemble and then creates a new ensemble that contains only the non-redundant conformers.
If energies are available, the lowest energy conformer will be kept for every redundancy.
The current ensemble will not be modified. The resulting ensemble will be sorted by energy (if available).
Args:
RMSD_cutoff (float): remove conformers that are more similar than this threshold
to_geometry (int): the reference geometry to align to (0-indexed)
comparison_atoms (str or list): which atoms to use when computing alignments
"heavy" for all non-hydrogen atoms,
"all" for all atoms, or
a list of 1-indexed atom numbers
return_RMSD (bool): whether or not to return list of RMSD values
Returns:
new ``ConformationalEnsemble``, RMSDs to the reference geometry
"""
# check inputs
n_atoms = self.molecules[0].num_atoms()
if isinstance(comparison_atoms, str):
if comparison_atoms == "all":
comparison_atoms = np.arange(1, n_atoms + 1)
elif comparison_atoms == "heavy":
comparison_atoms = self.molecules[0].get_heavy_atoms()
assert isinstance(comparison_atoms, (list, np.ndarray, cctk.OneIndexedArray)), f"unexpected type for comparison_atoms: {str(type(comparison_atoms))}"
for a in comparison_atoms:
assert 1 <= a <= n_atoms, f"atom number out of range: got {a}, but must be between 1 and {n_atoms}"
assert len(comparison_atoms) >= 3, f"need at least 3 atoms for alignment, but only got {len(comparison_atoms)}"
assert isinstance(RMSD_cutoff, (float, int)), f"RMSD cutoff must be a float but got {str(type(RMSD_cutoff))}"
assert RMSD_cutoff > 0.0001, "must use a big enough RMSD cutoff"
# align all molecules
old_ensemble = self.align(to_geometry=0, comparison_atoms=comparison_atoms, compute_RMSD=False)
# sort molecules by energy if available
energies_available = True
for molecule,properties in old_ensemble.items():
if "energy" not in properties:
energies_available = False
break
n_molecules = len(old_ensemble)
sorted_indices = list(range(n_molecules))
if energies_available:
energies = old_ensemble[:,"energy"]
sorted_indices = list(np.argsort(energies))
# boolean indexing noticeably faster
idxs = np.array(comparison_atoms)
mask = np.zeros(old_ensemble.molecules[0].geometry.shape[0], dtype=bool)
mask[idxs - 1] = True
partial_geoms = [m.geometry[mask] for m in old_ensemble.molecules]
new_partial_geoms = []
rmsds = list()
# add molecules one by one
new_ensemble = ConformationalEnsemble()
for i in sorted_indices:
ok_to_add = True
candidate_rmsd = 0
for existing_molecule in new_partial_geoms:
candidate_rmsd = cctk.helper_functions.compute_RMSD(partial_geoms[i], existing_molecule, checks=False)
if candidate_rmsd < RMSD_cutoff:
ok_to_add = False
break
if ok_to_add:
candidate_molecule = old_ensemble.molecules[i]
candidate_molecule_properties = old_ensemble.get_properties_dict(candidate_molecule)
new_ensemble.add_molecule(candidate_molecule, candidate_molecule_properties)
new_partial_geoms.append(candidate_molecule.geometry[mask])
rmsds.append(candidate_rmsd)
if return_RMSD:
return new_ensemble, rmsds
else:
return new_ensemble
[docs]
def get_geometric_parameters(self, parameter, atom1, atom2, atom3=None, atom4=None):
"""
Computes and outputs geometric parameters (bond distances, angles, or dihedral angles) for every member of ``self.molecules.``
Args:
parameter (str): one of ``angle``, ``distance``, or ``dihedral``
atom1 (int): number of the atom in question
atom2 (int): same, but for the second atom
atom3 (int): same, but for the third atom (only required for parameter ``angle`` or ``dihedral``)
atom4 (int): same, but for the fourth atom (only required for parameter ``dihedral``)
Returns:
a list of the specified parameter's values for each geometry
"""
output = [None] * len(self)
for index, molecule in enumerate(self.molecule_list()):
if parameter == "distance":
output[index] = molecule.get_distance(atom1, atom2)
elif parameter == "angle":
if atom3 is None:
raise ValueError("need atom3 to calculate angle!")
output[index] = molecule.get_angle(atom1, atom2, atom3)
elif parameter == "dihedral":
if (atom3 is None) or (atom4 is None):
raise ValueError("need atom3 and atom4 to calculate dihedral!")
output[index] = molecule.get_dihedral(atom1, atom2, atom3, atom4)
else:
raise ValueError(f"Invalid parameter {parameter}!")
return output
[docs]
def assign_connectivity(self, index=0):
"""
Assigns connectivity for all molecules based on molecule of index ``index``. Much faster than assigning connectivity for each individually -- but assumes all bonding is the same.
"""
assert isinstance(index, int), "Need integer index"
bonds = self.molecules[index].assign_connectivity().bonds
for mol in self.molecules:
mol.bonds = bonds
return self
[docs]
def boltzmann_average(self, which, energies=None, temp=298, energy_unit="hartree", return_weights=False):
"""
Computes the Boltzmann-weighted average of a property over the whole ensemble.
Args:
which (str): which property to compute
energy (np.ndarray): list of energies to use for weighting.
Will default to ``self[:,"energy"]``, although other strings can be passed as well as shorthand for ``self[:,energy]``.
temp (float): temperature for Boltzmann-weighting, in K
energy_unit (str): either ``kcal_mol`` or ``hartree``
return_weights (bool): whether to return a list of weights too
Returns:
weighted property, of the same shape as the individual property
"""
if energies is None:
energies = self[:,"energy"]
elif isinstance(energies, str):
energies = self[:,energies]
elif isinstance(energies, (list, np.ndarray, cctk.OneIndexedArray)):
pass
else:
raise ValueError(f"invalid energy value {energies} (type {type(energies)})")
for i, (m, pd) in enumerate(self.items()):
assert which in pd, f"molecule #{i} doesn't have property {which} defined!"
values = np.array(self[:,which], dtype=np.float64)
energies = np.array(energies, dtype=np.float64)
assert len(energies) == len(self)
assert len(values) == len(self)
assert all([e is not None for e in energies]), "energy not defined for all molecules"
assert all([v is not None for v in values]), f"property {which} not defined for all molecules"
# perhaps at some point we will need a real unit system like simtk/OpenMM, but not today!
if energy_unit == "kcal_mol":
energies = energies / 627.509
energies = energies - np.min(energies)
R = 3.1668105e-6 # eH/K
weights = np.exp(-1*energies/(R*temp))
weights = weights / np.sum(weights)
try:
weighted_value = np.average(values, weights=weights)
except Exception as e:
raise ValueError(f"error computing Boltzmann average: {e}")
if return_weights:
return weighted_value, weights
else:
return weighted_value