From 796ae3fdf9f272422baffb176c4822ae2ecb612d Mon Sep 17 00:00:00 2001 From: nmaeder Date: Wed, 8 Oct 2025 16:36:42 +0200 Subject: [PATCH 1/9] extract functions that could be used elsewhere --- serenityff/charge/gnn/utils/rdkit_helper.py | 97 +++++++++++++++++---- 1 file changed, 81 insertions(+), 16 deletions(-) diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py index 3a136bb..f58ca34 100644 --- a/serenityff/charge/gnn/utils/rdkit_helper.py +++ b/serenityff/charge/gnn/utils/rdkit_helper.py @@ -1,13 +1,16 @@ from typing import List, Optional, Sequence -import torch +import numpy as np +import torch as pt from rdkit import Chem from serenityff.charge.gnn.utils import CustomData, MolGraphConvFeaturizer from serenityff.charge.utils import Molecule -def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[Molecule]: +def mols_from_sdf( + sdf_file: str, removeHs: Optional[bool] = False +) -> Sequence[Molecule]: """ Returns a Sequence of rdkit molecules read in from a .sdf file. @@ -21,6 +24,78 @@ def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[M return Chem.SDMolSupplier(sdf_file, removeHs=removeHs) +def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor: + """Get atomic properties from an RDKit molecule object as a tensor. + + The property is expected to be a string of '|' separated numerical + values, one for each atom in the molecule. + + Parameters + ---------- + prop_name + The name of the property to retrieve from the molecule. + mol + The RDKit molecule object. + + Returns + ------- + pt.Tensor + The atomic properties converted to a PyTorch tensor. + + Raises + ------ + ValueError + If `prop_name` is None or if the property is not found in the molecule. + TypeError + If any of the parsed property values are NaN or not convertable to float. + """ + if prop_name is None: + raise ValueError("Property name can not be None when no_y == False.") + if not mol.HasProp(prop_name): + raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713 + tensor = pt.tensor( + [float(x) for x in mol.GetProp(prop_name).split("|")], dtype=pt.float + ) + if pt.isnan(tensor).any(): + raise TypeError(f"Nan found in {prop_name}.") + return tensor + + +def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray: + """Get atomic properties from an RDKit molecule object as an array. + + The property is expected to be a string of '|' separated numerical + values, one for each atom in the molecule. + + Parameters + ---------- + prop_name + The name of the property to retrieve from the molecule. + mol + The RDKit molecule object. + + Returns + ------- + np.ndarray + The atomic properties converted to a NumPy array. + + Raises + ------ + ValueError + If `prop_name` is None or if the property is not found in the molecule. + TypeError + If any of the parsed property values are NaN or not convertable to float. + """ + if prop_name is None: + raise ValueError("Property name can not be None when no_y == False.") + if not mol.HasProp(prop_name): + raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713 + array = np.array([float(x) for x in mol.GetProp(prop_name).split("|")]) + if np.isnan(array).any(): + raise TypeError(f"Nan found in {prop_name}.") + return array + + def get_graph_from_mol( mol: Molecule, index: int, @@ -67,31 +142,21 @@ def get_graph_from_mol( CustomData: pytorch geometric Data with .smiles as an extra attribute. """ - def get_mol_prop_as_torch_tensor(prop_name: Optional[str], mol: Molecule) -> torch.Tensor: - if prop_name is None: - raise ValueError("Property name can not be None when no_y == False.") - if not mol.HasProp(prop_name): - raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713 - tensor = torch.tensor([float(x) for x in mol.GetProp(prop_name).split("|")], dtype=torch.float) - if torch.isnan(tensor).any(): - raise TypeError(f"Nan found in {prop_name}.") - return tensor - grapher = MolGraphConvFeaturizer(use_edges=True) graph = grapher._featurize(mol, allowable_set).to_pyg_graph() if no_y: - graph.y = torch.tensor( + graph.y = pt.tensor( [0 for _ in mol.GetAtoms()], - dtype=torch.float, + dtype=pt.float, ) else: try: - graph.y = get_mol_prop_as_torch_tensor(sdf_property_name, mol) + graph.y = get_mol_prop_as_tensor(sdf_property_name, mol) except TypeError as exc: print(exc) return None - graph.batch = torch.tensor([0 for _ in mol.GetAtoms()], dtype=int) + graph.batch = pt.tensor([0 for _ in mol.GetAtoms()], dtype=int) graph.molecule_charge = Chem.GetFormalCharge(mol) graph.smiles = Chem.MolToSmiles(mol, canonical=True) graph.sdf_idx = index From c45673db2168a08d5d0101af70808e8bbac1d66b Mon Sep 17 00:00:00 2001 From: nmaeder Date: Wed, 8 Oct 2025 16:37:15 +0200 Subject: [PATCH 2/9] add tests for that --- tests/serenityff/charge/gnn/__init__.py | 0 tests/serenityff/charge/gnn/utils/__init__.py | 0 .../charge/gnn/utils/test_rdkit_helper.py | 96 +++++++++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 tests/serenityff/charge/gnn/__init__.py create mode 100644 tests/serenityff/charge/gnn/utils/__init__.py create mode 100644 tests/serenityff/charge/gnn/utils/test_rdkit_helper.py diff --git a/tests/serenityff/charge/gnn/__init__.py b/tests/serenityff/charge/gnn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/serenityff/charge/gnn/utils/__init__.py b/tests/serenityff/charge/gnn/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py new file mode 100644 index 0000000..9ea367f --- /dev/null +++ b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py @@ -0,0 +1,96 @@ +import numpy as np +import pytest +import torch as pt +from rdkit import Chem + +from serenityff.charge.gnn.utils.rdkit_helper import ( + get_mol_prop_as_array, + get_mol_prop_as_tensor, +) + + [email protected] +def sample_mol_with_prop(): + """Fixture for a sample RDKit molecule with a valid property.""" + mol = Chem.MolFromSmiles("CCO") # Ethanol + mol.SetProp("test_prop", "1.0|2.5|-3.0") + return mol + + [email protected] +def sample_mol_with_nan_prop(): + """Fixture for a sample RDKit molecule with a property containing NaN.""" + mol = Chem.MolFromSmiles("CCO") + mol.SetProp("test_prop_nan", "1.0|nan|3.0") + return mol + + [email protected] +def sample_mol_missing_prop(): + """Fixture for a sample RDKit molecule without the desired property.""" + mol = Chem.MolFromSmiles("CCO") + return mol + + +def test_get_mol_prop_as_tensor_success(sample_mol_with_prop): + """Test successful retrieval of property as a tensor.""" + expected = pt.tensor([1.0, 2.5, -3.0], dtype=pt.float) + result = get_mol_prop_as_tensor("test_prop", sample_mol_with_prop) + assert isinstance(result, pt.Tensor) + assert pt.equal(result, expected) + + +def test_get_mol_prop_as_tensor_raises_value_error_on_none_prop( + sample_mol_missing_prop, +): + """Test ValueError is raised when prop_name is None.""" + with pytest.raises(ValueError, match="Property name can not be None"): + get_mol_prop_as_tensor(None, sample_mol_missing_prop) + + +def test_get_mol_prop_as_tensor_raises_value_error_on_missing_prop( + sample_mol_missing_prop, +): + """Test ValueError is raised when the property is not found.""" + with pytest.raises(ValueError, match="Property missing_prop not found"): + get_mol_prop_as_tensor("missing_prop", sample_mol_missing_prop) + + +def test_get_mol_prop_as_tensor_raises_type_error_on_nan( + sample_mol_with_nan_prop, +): + """Test TypeError is raised when NaN is in the property string.""" + with pytest.raises(TypeError, match="Nan found in test_prop_nan"): + get_mol_prop_as_tensor("test_prop_nan", sample_mol_with_nan_prop) + + +def test_get_mol_prop_as_array_success(sample_mol_with_prop): + """Test successful retrieval of property as a numpy array.""" + expected = np.array([1.0, 2.5, -3.0]) + result = get_mol_prop_as_array("test_prop", sample_mol_with_prop) + assert isinstance(result, np.ndarray) + np.testing.assert_array_equal(result, expected) + + +def test_get_mol_prop_as_array_raises_value_error_on_none_prop( + sample_mol_missing_prop, +): + """Test ValueError is raised when prop_name is None.""" + with pytest.raises(ValueError, match="Property name can not be None"): + get_mol_prop_as_array(None, sample_mol_missing_prop) + + +def test_get_mol_prop_as_array_raises_value_error_on_missing_prop( + sample_mol_missing_prop, +): + """Test ValueError is raised when the property is not found.""" + with pytest.raises(ValueError, match="Property missing_prop not found"): + get_mol_prop_as_array("missing_prop", sample_mol_missing_prop) + + +def test_get_mol_prop_as_array_raises_type_error_on_nan( + sample_mol_with_nan_prop, +): + """Test TypeError is raised when NaN is in the property string.""" + with pytest.raises(TypeError, match="Nan found in test_prop_nan"): + get_mol_prop_as_array("test_prop_nan", sample_mol_with_nan_prop) From 1425421385af20803b1135266b4d3b1f79c8b0a1 Mon Sep 17 00:00:00 2001 From: nmaeder Date: Wed, 8 Oct 2025 17:14:16 +0200 Subject: [PATCH 3/9] run pre-commit --- serenityff/charge/gnn/utils/rdkit_helper.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py index f58ca34..efd3b6e 100644 --- a/serenityff/charge/gnn/utils/rdkit_helper.py +++ b/serenityff/charge/gnn/utils/rdkit_helper.py @@ -8,9 +8,7 @@ from serenityff.charge.utils import Molecule -def mols_from_sdf( - sdf_file: str, removeHs: Optional[bool] = False -) -> Sequence[Molecule]: +def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[Molecule]: """ Returns a Sequence of rdkit molecules read in from a .sdf file. @@ -53,9 +51,7 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor raise ValueError("Property name can not be None when no_y == False.") if not mol.HasProp(prop_name): raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713 - tensor = pt.tensor( - [float(x) for x in mol.GetProp(prop_name).split("|")], dtype=pt.float - ) + tensor = pt.tensor([float(x) for x in mol.GetProp(prop_name).split("|")], dtype=pt.float) if pt.isnan(tensor).any(): raise TypeError(f"Nan found in {prop_name}.") return tensor From 249debfd1b274192be02bf44ca4863065199b487 Mon Sep 17 00:00:00 2001 From: Niels Maeder Date: Thu, 16 Oct 2025 12:03:18 +0200 Subject: [PATCH 4/9] use np.fromstring and pt.from_numpy --- serenityff/charge/gnn/utils/rdkit_helper.py | 31 ++++++++------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py index efd3b6e..afa6d1f 100644 --- a/serenityff/charge/gnn/utils/rdkit_helper.py +++ b/serenityff/charge/gnn/utils/rdkit_helper.py @@ -22,8 +22,8 @@ def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[M return Chem.SDMolSupplier(sdf_file, removeHs=removeHs) -def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor: - """Get atomic properties from an RDKit molecule object as a tensor. +def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray: + """Get atomic properties from an RDKit molecule object as an array. The property is expected to be a string of '|' separated numerical values, one for each atom in the molecule. @@ -37,8 +37,8 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor Returns ------- - pt.Tensor - The atomic properties converted to a PyTorch tensor. + np.ndarray + The atomic properties converted to a NumPy array. Raises ------ @@ -51,14 +51,14 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor raise ValueError("Property name can not be None when no_y == False.") if not mol.HasProp(prop_name): raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713 - tensor = pt.tensor([float(x) for x in mol.GetProp(prop_name).split("|")], dtype=pt.float) - if pt.isnan(tensor).any(): + array = np.fromstring(mol.GetProp(prop_name), sep="|", dtype=float) + if np.isnan(array).any(): raise TypeError(f"Nan found in {prop_name}.") - return tensor + return array -def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray: - """Get atomic properties from an RDKit molecule object as an array. +def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor: + """Get atomic properties from an RDKit molecule object as a tensor. The property is expected to be a string of '|' separated numerical values, one for each atom in the molecule. @@ -72,8 +72,8 @@ def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray Returns ------- - np.ndarray - The atomic properties converted to a NumPy array. + pt.Tensor + The atomic properties converted to a PyTorch tensor. Raises ------ @@ -82,14 +82,7 @@ def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray TypeError If any of the parsed property values are NaN or not convertable to float. """ - if prop_name is None: - raise ValueError("Property name can not be None when no_y == False.") - if not mol.HasProp(prop_name): - raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713 - array = np.array([float(x) for x in mol.GetProp(prop_name).split("|")]) - if np.isnan(array).any(): - raise TypeError(f"Nan found in {prop_name}.") - return array + return pt.from_numpy(get_mol_prop_as_array(prop_name=prop_name, mol=mol)) def get_graph_from_mol( From fdb0e1000fdff79408890f68e7d73a65dec6cea1 Mon Sep 17 00:00:00 2001 From: Niels Maeder Date: Thu, 16 Oct 2025 13:40:57 +0200 Subject: [PATCH 5/9] use correct dtype for torch tensor --- serenityff/charge/gnn/utils/rdkit_helper.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py index afa6d1f..d8cc6c7 100644 --- a/serenityff/charge/gnn/utils/rdkit_helper.py +++ b/serenityff/charge/gnn/utils/rdkit_helper.py @@ -8,7 +8,9 @@ from serenityff.charge.utils import Molecule -def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[Molecule]: +def mols_from_sdf( + sdf_file: str, removeHs: Optional[bool] = False +) -> Sequence[Molecule]: """ Returns a Sequence of rdkit molecules read in from a .sdf file. @@ -22,7 +24,9 @@ def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[M return Chem.SDMolSupplier(sdf_file, removeHs=removeHs) -def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray: +def get_mol_prop_as_array( + prop_name: Optional[str], mol: Chem.Mol, dtype: type = float +) -> np.ndarray: """Get atomic properties from an RDKit molecule object as an array. The property is expected to be a string of '|' separated numerical @@ -51,7 +55,7 @@ def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray raise ValueError("Property name can not be None when no_y == False.") if not mol.HasProp(prop_name): raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713 - array = np.fromstring(mol.GetProp(prop_name), sep="|", dtype=float) + array = np.fromstring(mol.GetProp(prop_name), sep="|", dtype=dtype) if np.isnan(array).any(): raise TypeError(f"Nan found in {prop_name}.") return array @@ -82,7 +86,9 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor TypeError If any of the parsed property values are NaN or not convertable to float. """ - return pt.from_numpy(get_mol_prop_as_array(prop_name=prop_name, mol=mol)) + return pt.from_numpy( + get_mol_prop_as_array(prop_name=prop_name, mol=mol, dtype=np.float32) + ) def get_graph_from_mol( From ccb3880854a1ede0d009fa52757a2a5dd993492d Mon Sep 17 00:00:00 2001 From: Niels Maeder Date: Thu, 16 Oct 2025 13:54:58 +0200 Subject: [PATCH 6/9] run pre-commit --- serenityff/charge/gnn/utils/rdkit_helper.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py index d8cc6c7..4a048cd 100644 --- a/serenityff/charge/gnn/utils/rdkit_helper.py +++ b/serenityff/charge/gnn/utils/rdkit_helper.py @@ -8,9 +8,7 @@ from serenityff.charge.utils import Molecule -def mols_from_sdf( - sdf_file: str, removeHs: Optional[bool] = False -) -> Sequence[Molecule]: +def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[Molecule]: """ Returns a Sequence of rdkit molecules read in from a .sdf file. @@ -24,9 +22,7 @@ def mols_from_sdf( return Chem.SDMolSupplier(sdf_file, removeHs=removeHs) -def get_mol_prop_as_array( - prop_name: Optional[str], mol: Chem.Mol, dtype: type = float -) -> np.ndarray: +def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol, dtype: type = float) -> np.ndarray: """Get atomic properties from an RDKit molecule object as an array. The property is expected to be a string of '|' separated numerical @@ -86,9 +82,7 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor TypeError If any of the parsed property values are NaN or not convertable to float. """ - return pt.from_numpy( - get_mol_prop_as_array(prop_name=prop_name, mol=mol, dtype=np.float32) - ) + return pt.from_numpy(get_mol_prop_as_array(prop_name=prop_name, mol=mol, dtype=np.float32)) def get_graph_from_mol( From d95f0992025be16b19a18cbfe4d64bccaa66ead8 Mon Sep 17 00:00:00 2001 From: Niels Maeder Date: Wed, 22 Oct 2025 09:16:05 +0200 Subject: [PATCH 7/9] changes requested by marc-lehner --- serenityff/charge/gnn/utils/rdkit_helper.py | 14 ++++---- .../charge/gnn/utils/test_rdkit_helper.py | 36 +++++++++---------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py index 4a048cd..22c5ba3 100644 --- a/serenityff/charge/gnn/utils/rdkit_helper.py +++ b/serenityff/charge/gnn/utils/rdkit_helper.py @@ -22,7 +22,7 @@ def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[M return Chem.SDMolSupplier(sdf_file, removeHs=removeHs) -def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol, dtype: type = float) -> np.ndarray: +def get_mol_prop_as_np_array(prop_name: Optional[str], mol: Chem.Mol, dtype: type = float) -> np.ndarray: """Get atomic properties from an RDKit molecule object as an array. The property is expected to be a string of '|' separated numerical @@ -45,10 +45,10 @@ def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol, dtype: type = ValueError If `prop_name` is None or if the property is not found in the molecule. TypeError - If any of the parsed property values are NaN or not convertable to float. + If any of the parsed property values are NaN or not convertible to float. """ if prop_name is None: - raise ValueError("Property name can not be None when no_y == False.") + raise ValueError("Property name can not be None.") if not mol.HasProp(prop_name): raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713 array = np.fromstring(mol.GetProp(prop_name), sep="|", dtype=dtype) @@ -57,7 +57,7 @@ def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol, dtype: type = return array -def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor: +def get_mol_prop_as_pt_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor: """Get atomic properties from an RDKit molecule object as a tensor. The property is expected to be a string of '|' separated numerical @@ -80,9 +80,9 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor ValueError If `prop_name` is None or if the property is not found in the molecule. TypeError - If any of the parsed property values are NaN or not convertable to float. + If any of the parsed property values are NaN or not convertible to float. """ - return pt.from_numpy(get_mol_prop_as_array(prop_name=prop_name, mol=mol, dtype=np.float32)) + return pt.from_numpy(get_mol_prop_as_np_array(prop_name=prop_name, mol=mol, dtype=np.float32)) def get_graph_from_mol( @@ -140,7 +140,7 @@ def get_graph_from_mol( ) else: try: - graph.y = get_mol_prop_as_tensor(sdf_property_name, mol) + graph.y = get_mol_prop_as_pt_tensor(sdf_property_name, mol) except TypeError as exc: print(exc) return None diff --git a/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py index 9ea367f..1853533 100644 --- a/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py +++ b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py @@ -4,8 +4,8 @@ from rdkit import Chem from serenityff.charge.gnn.utils.rdkit_helper import ( - get_mol_prop_as_array, - get_mol_prop_as_tensor, + get_mol_prop_as_np_array, + get_mol_prop_as_pt_tensor, ) @@ -32,65 +32,65 @@ def sample_mol_missing_prop(): return mol -def test_get_mol_prop_as_tensor_success(sample_mol_with_prop): +def test_get_mol_prop_as_pt_tensor_success(sample_mol_with_prop): """Test successful retrieval of property as a tensor.""" expected = pt.tensor([1.0, 2.5, -3.0], dtype=pt.float) - result = get_mol_prop_as_tensor("test_prop", sample_mol_with_prop) + result = get_mol_prop_as_tpt_ensor("test_prop", sample_mol_with_prop) assert isinstance(result, pt.Tensor) assert pt.equal(result, expected) -def test_get_mol_prop_as_tensor_raises_value_error_on_none_prop( +def test_get_mol_prop_as_pt_tensor_raises_value_error_on_none_prop( sample_mol_missing_prop, ): """Test ValueError is raised when prop_name is None.""" with pytest.raises(ValueError, match="Property name can not be None"): - get_mol_prop_as_tensor(None, sample_mol_missing_prop) + get_mol_prop_as_pt_tensor(None, sample_mol_missing_prop) -def test_get_mol_prop_as_tensor_raises_value_error_on_missing_prop( +def test_get_mol_prop_as_pt_tensor_raises_value_error_on_missing_prop( sample_mol_missing_prop, ): """Test ValueError is raised when the property is not found.""" with pytest.raises(ValueError, match="Property missing_prop not found"): - get_mol_prop_as_tensor("missing_prop", sample_mol_missing_prop) + get_mol_prop_as_pt_tensor("missing_prop", sample_mol_missing_prop) -def test_get_mol_prop_as_tensor_raises_type_error_on_nan( +def test_get_mol_prop_as_pt_tensor_raises_type_error_on_nan( sample_mol_with_nan_prop, ): """Test TypeError is raised when NaN is in the property string.""" with pytest.raises(TypeError, match="Nan found in test_prop_nan"): - get_mol_prop_as_tensor("test_prop_nan", sample_mol_with_nan_prop) + get_mol_prop_as_pt_tensor("test_prop_nan", sample_mol_with_nan_prop) -def test_get_mol_prop_as_array_success(sample_mol_with_prop): +def test_get_mol_prop_as_np_array_success(sample_mol_with_prop): """Test successful retrieval of property as a numpy array.""" expected = np.array([1.0, 2.5, -3.0]) - result = get_mol_prop_as_array("test_prop", sample_mol_with_prop) + result = get_mol_prop_as_np_array("test_prop", sample_mol_with_prop) assert isinstance(result, np.ndarray) np.testing.assert_array_equal(result, expected) -def test_get_mol_prop_as_array_raises_value_error_on_none_prop( +def test_get_mol_prop_as_np_array_raises_value_error_on_none_prop( sample_mol_missing_prop, ): """Test ValueError is raised when prop_name is None.""" with pytest.raises(ValueError, match="Property name can not be None"): - get_mol_prop_as_array(None, sample_mol_missing_prop) + get_mol_prop_as_np_array(None, sample_mol_missing_prop) -def test_get_mol_prop_as_array_raises_value_error_on_missing_prop( +def test_get_mol_prop_as_np_array_raises_value_error_on_missing_prop( sample_mol_missing_prop, ): """Test ValueError is raised when the property is not found.""" with pytest.raises(ValueError, match="Property missing_prop not found"): - get_mol_prop_as_array("missing_prop", sample_mol_missing_prop) + get_mol_prop_as_np_array("missing_prop", sample_mol_missing_prop) -def test_get_mol_prop_as_array_raises_type_error_on_nan( +def test_get_mol_prop_as_np_array_raises_type_error_on_nan( sample_mol_with_nan_prop, ): """Test TypeError is raised when NaN is in the property string.""" with pytest.raises(TypeError, match="Nan found in test_prop_nan"): - get_mol_prop_as_array("test_prop_nan", sample_mol_with_nan_prop) + get_mol_prop_as_np_array("test_prop_nan", sample_mol_with_nan_prop) From 4b5ce6e766adfbd3c984ba8a4a0a7f9ffe73715d Mon Sep 17 00:00:00 2001 From: Niels Maeder Date: Wed, 22 Oct 2025 09:18:32 +0200 Subject: [PATCH 8/9] typo --- tests/serenityff/charge/gnn/utils/test_rdkit_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py index 1853533..624fbd5 100644 --- a/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py +++ b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py @@ -35,7 +35,7 @@ def sample_mol_missing_prop(): def test_get_mol_prop_as_pt_tensor_success(sample_mol_with_prop): """Test successful retrieval of property as a tensor.""" expected = pt.tensor([1.0, 2.5, -3.0], dtype=pt.float) - result = get_mol_prop_as_tpt_ensor("test_prop", sample_mol_with_prop) + result = get_mol_prop_as_pt_tensor("test_prop", sample_mol_with_prop) assert isinstance(result, pt.Tensor) assert pt.equal(result, expected) From d12cd8efb33b69aa7c988f66c512d45ee7ccd39b Mon Sep 17 00:00:00 2001 From: Niels Maeder Date: Thu, 23 Oct 2025 13:58:53 +0200 Subject: [PATCH 9/9] docstyle to sphinx --- serenityff/charge/gnn/utils/rdkit_helper.py | 111 +++++++------------- 1 file changed, 40 insertions(+), 71 deletions(-) diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py index 22c5ba3..1a56c6b 100644 --- a/serenityff/charge/gnn/utils/rdkit_helper.py +++ b/serenityff/charge/gnn/utils/rdkit_helper.py @@ -9,15 +9,11 @@ def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[Molecule]: - """ - Returns a Sequence of rdkit molecules read in from a .sdf file. - - Args: - sdf_file (str): path to .sdf file. - removeHs (Optional[bool], optional): Wheter to remove Hydrogens. Defaults to False. + """Return a sequence of RDKit molecules read from an .sdf file. - Returns: - Sequence[Molecule]: rdkit mols. + :param sdf_file: Path to the .sdf file. + :param removeHs: Whether to remove hydrogens. Defaults to False. + :return: A sequence of RDKit molecule objects. """ return Chem.SDMolSupplier(sdf_file, removeHs=removeHs) @@ -28,24 +24,11 @@ def get_mol_prop_as_np_array(prop_name: Optional[str], mol: Chem.Mol, dtype: typ The property is expected to be a string of '|' separated numerical values, one for each atom in the molecule. - Parameters - ---------- - prop_name - The name of the property to retrieve from the molecule. - mol - The RDKit molecule object. - - Returns - ------- - np.ndarray - The atomic properties converted to a NumPy array. - - Raises - ------ - ValueError - If `prop_name` is None or if the property is not found in the molecule. - TypeError - If any of the parsed property values are NaN or not convertible to float. + :param prop_name: The name of the property to retrieve from the molecule. + :param mol: The RDKit molecule object. + :return: The atomic properties converted to a NumPy array. + :raises ValueError: If ``prop_name`` is None or if the property is not found in the molecule. + :raises TypeError: If any of the parsed property values are NaN or not convertible to float. """ if prop_name is None: raise ValueError("Property name can not be None.") @@ -63,24 +46,11 @@ def get_mol_prop_as_pt_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Ten The property is expected to be a string of '|' separated numerical values, one for each atom in the molecule. - Parameters - ---------- - prop_name - The name of the property to retrieve from the molecule. - mol - The RDKit molecule object. - - Returns - ------- - pt.Tensor - The atomic properties converted to a PyTorch tensor. - - Raises - ------ - ValueError - If `prop_name` is None or if the property is not found in the molecule. - TypeError - If any of the parsed property values are NaN or not convertible to float. + :param prop_name: The name of the property to retrieve from the molecule. + :param mol: The RDKit molecule object. + :return: The atomic properties converted to a PyTorch tensor. + :raises ValueError: If ``prop_name`` is None or if the property is not found in the molecule. + :raises TypeError: If any of the parsed property values are NaN or not convertible to float. """ return pt.from_numpy(get_mol_prop_as_np_array(prop_name=prop_name, mol=mol, dtype=np.float32)) @@ -103,34 +73,33 @@ def get_graph_from_mol( ], no_y: Optional[bool] = False, ) -> Optional[CustomData]: + """Create a PyTorch Geometric graph from an RDKit molecule. + + Returns ``None`` if the specified property is not found or contains NaN. + + The graph contains the following features: + + **Node features** + - Atom type (as specified in the `allowable_set`) + - Formal charge + - Hybridization + - H acceptor/donor + - Aromaticity + - Degree + + **Edge features** + - Bond type + - Is in ring + - Is conjugated + - Stereo information + + :param mol: The RDKit molecule. + :param sdf_property_name: Name of the property in the SDF file to be used for training. + :param allowable_set: List of atoms to include in the feature vector. Defaults to + ``["C", "N", "O", "F", "P", "S", "Cl", "Br", "I", "H"]``. + :return: A PyTorch Geometric ``Data`` object with an additional ``.smiles`` attribute, + or ``None`` if the property is invalid. """ - Creates an pytorch_geometric Graph from an rdkit molecule. - - Returns None if the property is not found or contains NaN. - The graph contains following features: - > Node Features: - > Atom Type (as specified in allowable set) - > formal_charge - > hybridization - > H acceptor_donor - > aromaticity - > degree - > Edge Features: - > Bond type - > is in ring - > is conjugated - > stereo - Args: - mol (Molecule): rdkit molecule - sdf_property_name (Optional[str]): Name of the property in the sdf file to be used for training. - allowable_set (Optional[List[str]], optional): List of atoms to be \ - included in the feature vector. Defaults to \ - [ "C", "N", "O", "F", "P", "S", "Cl", "Br", "I", "H", ]. - - Returns: - CustomData: pytorch geometric Data with .smiles as an extra attribute. - """ - grapher = MolGraphConvFeaturizer(use_edges=True) graph = grapher._featurize(mol, allowable_set).to_pyg_graph() if no_y: