From 796ae3fdf9f272422baffb176c4822ae2ecb612d Mon Sep 17 00:00:00 2001
From: nmaeder
Date: Wed, 8 Oct 2025 16:36:42 +0200
Subject: [PATCH 1/9] extract functions that could be used elsewhere
---
serenityff/charge/gnn/utils/rdkit_helper.py | 97 +++++++++++++++++----
1 file changed, 81 insertions(+), 16 deletions(-)
diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py
index 3a136bb..f58ca34 100644
--- a/serenityff/charge/gnn/utils/rdkit_helper.py
+++ b/serenityff/charge/gnn/utils/rdkit_helper.py
@@ -1,13 +1,16 @@
from typing import List, Optional, Sequence
-import torch
+import numpy as np
+import torch as pt
from rdkit import Chem
from serenityff.charge.gnn.utils import CustomData, MolGraphConvFeaturizer
from serenityff.charge.utils import Molecule
-def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[Molecule]:
+def mols_from_sdf(
+ sdf_file: str, removeHs: Optional[bool] = False
+) -> Sequence[Molecule]:
"""
Returns a Sequence of rdkit molecules read in from a .sdf file.
@@ -21,6 +24,78 @@ def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[M
return Chem.SDMolSupplier(sdf_file, removeHs=removeHs)
+def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor:
+ """Get atomic properties from an RDKit molecule object as a tensor.
+
+ The property is expected to be a string of '|' separated numerical
+ values, one for each atom in the molecule.
+
+ Parameters
+ ----------
+ prop_name
+ The name of the property to retrieve from the molecule.
+ mol
+ The RDKit molecule object.
+
+ Returns
+ -------
+ pt.Tensor
+ The atomic properties converted to a PyTorch tensor.
+
+ Raises
+ ------
+ ValueError
+ If `prop_name` is None or if the property is not found in the molecule.
+ TypeError
+ If any of the parsed property values are NaN or not convertable to float.
+ """
+ if prop_name is None:
+ raise ValueError("Property name can not be None when no_y == False.")
+ if not mol.HasProp(prop_name):
+ raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713
+ tensor = pt.tensor(
+ [float(x) for x in mol.GetProp(prop_name).split("|")], dtype=pt.float
+ )
+ if pt.isnan(tensor).any():
+ raise TypeError(f"Nan found in {prop_name}.")
+ return tensor
+
+
+def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray:
+ """Get atomic properties from an RDKit molecule object as an array.
+
+ The property is expected to be a string of '|' separated numerical
+ values, one for each atom in the molecule.
+
+ Parameters
+ ----------
+ prop_name
+ The name of the property to retrieve from the molecule.
+ mol
+ The RDKit molecule object.
+
+ Returns
+ -------
+ np.ndarray
+ The atomic properties converted to a NumPy array.
+
+ Raises
+ ------
+ ValueError
+ If `prop_name` is None or if the property is not found in the molecule.
+ TypeError
+ If any of the parsed property values are NaN or not convertable to float.
+ """
+ if prop_name is None:
+ raise ValueError("Property name can not be None when no_y == False.")
+ if not mol.HasProp(prop_name):
+ raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713
+ array = np.array([float(x) for x in mol.GetProp(prop_name).split("|")])
+ if np.isnan(array).any():
+ raise TypeError(f"Nan found in {prop_name}.")
+ return array
+
+
def get_graph_from_mol(
mol: Molecule,
index: int,
@@ -67,31 +142,21 @@ def get_graph_from_mol(
CustomData: pytorch geometric Data with .smiles as an extra attribute.
"""
- def get_mol_prop_as_torch_tensor(prop_name: Optional[str], mol: Molecule) -> torch.Tensor:
- if prop_name is None:
- raise ValueError("Property name can not be None when no_y == False.")
- if not mol.HasProp(prop_name):
- raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713
- tensor = torch.tensor([float(x) for x in mol.GetProp(prop_name).split("|")], dtype=torch.float)
- if torch.isnan(tensor).any():
- raise TypeError(f"Nan found in {prop_name}.")
- return tensor
-
grapher = MolGraphConvFeaturizer(use_edges=True)
graph = grapher._featurize(mol, allowable_set).to_pyg_graph()
if no_y:
- graph.y = torch.tensor(
+ graph.y = pt.tensor(
[0 for _ in mol.GetAtoms()],
- dtype=torch.float,
+ dtype=pt.float,
)
else:
try:
- graph.y = get_mol_prop_as_torch_tensor(sdf_property_name, mol)
+ graph.y = get_mol_prop_as_tensor(sdf_property_name, mol)
except TypeError as exc:
print(exc)
return None
- graph.batch = torch.tensor([0 for _ in mol.GetAtoms()], dtype=int)
+ graph.batch = pt.tensor([0 for _ in mol.GetAtoms()], dtype=int)
graph.molecule_charge = Chem.GetFormalCharge(mol)
graph.smiles = Chem.MolToSmiles(mol, canonical=True)
graph.sdf_idx = index
From c45673db2168a08d5d0101af70808e8bbac1d66b Mon Sep 17 00:00:00 2001
From: nmaeder
Date: Wed, 8 Oct 2025 16:37:15 +0200
Subject: [PATCH 2/9] add tests for that
---
tests/serenityff/charge/gnn/__init__.py | 0
tests/serenityff/charge/gnn/utils/__init__.py | 0
.../charge/gnn/utils/test_rdkit_helper.py | 96 +++++++++++++++++++
3 files changed, 96 insertions(+)
create mode 100644 tests/serenityff/charge/gnn/__init__.py
create mode 100644 tests/serenityff/charge/gnn/utils/__init__.py
create mode 100644 tests/serenityff/charge/gnn/utils/test_rdkit_helper.py
diff --git a/tests/serenityff/charge/gnn/__init__.py b/tests/serenityff/charge/gnn/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/serenityff/charge/gnn/utils/__init__.py b/tests/serenityff/charge/gnn/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py
new file mode 100644
index 0000000..9ea367f
--- /dev/null
+++ b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py
@@ -0,0 +1,96 @@
+import numpy as np
+import pytest
+import torch as pt
+from rdkit import Chem
+
+from serenityff.charge.gnn.utils.rdkit_helper import (
+ get_mol_prop_as_array,
+ get_mol_prop_as_tensor,
+)
+
+
[email protected]
+def sample_mol_with_prop():
+ """Fixture for a sample RDKit molecule with a valid property."""
+ mol = Chem.MolFromSmiles("CCO") # Ethanol
+ mol.SetProp("test_prop", "1.0|2.5|-3.0")
+ return mol
+
+
[email protected]
+def sample_mol_with_nan_prop():
+ """Fixture for a sample RDKit molecule with a property containing NaN."""
+ mol = Chem.MolFromSmiles("CCO")
+ mol.SetProp("test_prop_nan", "1.0|nan|3.0")
+ return mol
+
+
[email protected]
+def sample_mol_missing_prop():
+ """Fixture for a sample RDKit molecule without the desired property."""
+ mol = Chem.MolFromSmiles("CCO")
+ return mol
+
+
+def test_get_mol_prop_as_tensor_success(sample_mol_with_prop):
+ """Test successful retrieval of property as a tensor."""
+ expected = pt.tensor([1.0, 2.5, -3.0], dtype=pt.float)
+ result = get_mol_prop_as_tensor("test_prop", sample_mol_with_prop)
+ assert isinstance(result, pt.Tensor)
+ assert pt.equal(result, expected)
+
+
+def test_get_mol_prop_as_tensor_raises_value_error_on_none_prop(
+ sample_mol_missing_prop,
+):
+ """Test ValueError is raised when prop_name is None."""
+ with pytest.raises(ValueError, match="Property name can not be None"):
+ get_mol_prop_as_tensor(None, sample_mol_missing_prop)
+
+
+def test_get_mol_prop_as_tensor_raises_value_error_on_missing_prop(
+ sample_mol_missing_prop,
+):
+ """Test ValueError is raised when the property is not found."""
+ with pytest.raises(ValueError, match="Property missing_prop not found"):
+ get_mol_prop_as_tensor("missing_prop", sample_mol_missing_prop)
+
+
+def test_get_mol_prop_as_tensor_raises_type_error_on_nan(
+ sample_mol_with_nan_prop,
+):
+ """Test TypeError is raised when NaN is in the property string."""
+ with pytest.raises(TypeError, match="Nan found in test_prop_nan"):
+ get_mol_prop_as_tensor("test_prop_nan", sample_mol_with_nan_prop)
+
+
+def test_get_mol_prop_as_array_success(sample_mol_with_prop):
+ """Test successful retrieval of property as a numpy array."""
+ expected = np.array([1.0, 2.5, -3.0])
+ result = get_mol_prop_as_array("test_prop", sample_mol_with_prop)
+ assert isinstance(result, np.ndarray)
+ np.testing.assert_array_equal(result, expected)
+
+
+def test_get_mol_prop_as_array_raises_value_error_on_none_prop(
+ sample_mol_missing_prop,
+):
+ """Test ValueError is raised when prop_name is None."""
+ with pytest.raises(ValueError, match="Property name can not be None"):
+ get_mol_prop_as_array(None, sample_mol_missing_prop)
+
+
+def test_get_mol_prop_as_array_raises_value_error_on_missing_prop(
+ sample_mol_missing_prop,
+):
+ """Test ValueError is raised when the property is not found."""
+ with pytest.raises(ValueError, match="Property missing_prop not found"):
+ get_mol_prop_as_array("missing_prop", sample_mol_missing_prop)
+
+
+def test_get_mol_prop_as_array_raises_type_error_on_nan(
+ sample_mol_with_nan_prop,
+):
+ """Test TypeError is raised when NaN is in the property string."""
+ with pytest.raises(TypeError, match="Nan found in test_prop_nan"):
+ get_mol_prop_as_array("test_prop_nan", sample_mol_with_nan_prop)
From 1425421385af20803b1135266b4d3b1f79c8b0a1 Mon Sep 17 00:00:00 2001
From: nmaeder
Date: Wed, 8 Oct 2025 17:14:16 +0200
Subject: [PATCH 3/9] run pre-commit
---
serenityff/charge/gnn/utils/rdkit_helper.py | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py
index f58ca34..efd3b6e 100644
--- a/serenityff/charge/gnn/utils/rdkit_helper.py
+++ b/serenityff/charge/gnn/utils/rdkit_helper.py
@@ -8,9 +8,7 @@
from serenityff.charge.utils import Molecule
-def mols_from_sdf(
- sdf_file: str, removeHs: Optional[bool] = False
-) -> Sequence[Molecule]:
+def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[Molecule]:
"""
Returns a Sequence of rdkit molecules read in from a .sdf file.
@@ -53,9 +51,7 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor
raise ValueError("Property name can not be None when no_y == False.")
if not mol.HasProp(prop_name):
raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713
- tensor = pt.tensor(
- [float(x) for x in mol.GetProp(prop_name).split("|")], dtype=pt.float
- )
+ tensor = pt.tensor([float(x) for x in mol.GetProp(prop_name).split("|")], dtype=pt.float)
if pt.isnan(tensor).any():
raise TypeError(f"Nan found in {prop_name}.")
return tensor
From 249debfd1b274192be02bf44ca4863065199b487 Mon Sep 17 00:00:00 2001
From: Niels Maeder
Date: Thu, 16 Oct 2025 12:03:18 +0200
Subject: [PATCH 4/9] use np.fromstring and pt.from_numpy
---
serenityff/charge/gnn/utils/rdkit_helper.py | 31 ++++++++-------------
1 file changed, 12 insertions(+), 19 deletions(-)
diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py
index efd3b6e..afa6d1f 100644
--- a/serenityff/charge/gnn/utils/rdkit_helper.py
+++ b/serenityff/charge/gnn/utils/rdkit_helper.py
@@ -22,8 +22,8 @@ def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[M
return Chem.SDMolSupplier(sdf_file, removeHs=removeHs)
-def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor:
- """Get atomic properties from an RDKit molecule object as a tensor.
+def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray:
+ """Get atomic properties from an RDKit molecule object as an array.
The property is expected to be a string of '|' separated numerical
values, one for each atom in the molecule.
@@ -37,8 +37,8 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor
Returns
-------
- pt.Tensor
- The atomic properties converted to a PyTorch tensor.
+ np.ndarray
+ The atomic properties converted to a NumPy array.
Raises
------
@@ -51,14 +51,14 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor
raise ValueError("Property name can not be None when no_y == False.")
if not mol.HasProp(prop_name):
raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713
- tensor = pt.tensor([float(x) for x in mol.GetProp(prop_name).split("|")], dtype=pt.float)
- if pt.isnan(tensor).any():
+ array = np.fromstring(mol.GetProp(prop_name), sep="|", dtype=float)
+ if np.isnan(array).any():
raise TypeError(f"Nan found in {prop_name}.")
- return tensor
+ return array
-def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray:
- """Get atomic properties from an RDKit molecule object as an array.
+def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor:
+ """Get atomic properties from an RDKit molecule object as a tensor.
The property is expected to be a string of '|' separated numerical
values, one for each atom in the molecule.
@@ -72,8 +72,8 @@ def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray
Returns
-------
- np.ndarray
- The atomic properties converted to a NumPy array.
+ pt.Tensor
+ The atomic properties converted to a PyTorch tensor.
Raises
------
@@ -82,14 +82,7 @@ def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray
TypeError
If any of the parsed property values are NaN or not convertable to float.
"""
- if prop_name is None:
- raise ValueError("Property name can not be None when no_y == False.")
- if not mol.HasProp(prop_name):
- raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713
- array = np.array([float(x) for x in mol.GetProp(prop_name).split("|")])
- if np.isnan(array).any():
- raise TypeError(f"Nan found in {prop_name}.")
- return array
+ return pt.from_numpy(get_mol_prop_as_array(prop_name=prop_name, mol=mol))
def get_graph_from_mol(
From fdb0e1000fdff79408890f68e7d73a65dec6cea1 Mon Sep 17 00:00:00 2001
From: Niels Maeder
Date: Thu, 16 Oct 2025 13:40:57 +0200
Subject: [PATCH 5/9] use correct dtype for torch tensor
---
serenityff/charge/gnn/utils/rdkit_helper.py | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py
index afa6d1f..d8cc6c7 100644
--- a/serenityff/charge/gnn/utils/rdkit_helper.py
+++ b/serenityff/charge/gnn/utils/rdkit_helper.py
@@ -8,7 +8,9 @@
from serenityff.charge.utils import Molecule
-def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[Molecule]:
+def mols_from_sdf(
+ sdf_file: str, removeHs: Optional[bool] = False
+) -> Sequence[Molecule]:
"""
Returns a Sequence of rdkit molecules read in from a .sdf file.
@@ -22,7 +24,9 @@ def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[M
return Chem.SDMolSupplier(sdf_file, removeHs=removeHs)
-def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray:
+def get_mol_prop_as_array(
+ prop_name: Optional[str], mol: Chem.Mol, dtype: type = float
+) -> np.ndarray:
"""Get atomic properties from an RDKit molecule object as an array.
The property is expected to be a string of '|' separated numerical
@@ -51,7 +55,7 @@ def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol) -> np.ndarray
raise ValueError("Property name can not be None when no_y == False.")
if not mol.HasProp(prop_name):
raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713
- array = np.fromstring(mol.GetProp(prop_name), sep="|", dtype=float)
+ array = np.fromstring(mol.GetProp(prop_name), sep="|", dtype=dtype)
if np.isnan(array).any():
raise TypeError(f"Nan found in {prop_name}.")
return array
@@ -82,7 +86,9 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor
TypeError
If any of the parsed property values are NaN or not convertable to float.
"""
- return pt.from_numpy(get_mol_prop_as_array(prop_name=prop_name, mol=mol))
+ return pt.from_numpy(
+ get_mol_prop_as_array(prop_name=prop_name, mol=mol, dtype=np.float32)
+ )
def get_graph_from_mol(
From ccb3880854a1ede0d009fa52757a2a5dd993492d Mon Sep 17 00:00:00 2001
From: Niels Maeder
Date: Thu, 16 Oct 2025 13:54:58 +0200
Subject: [PATCH 6/9] run pre-commit
---
serenityff/charge/gnn/utils/rdkit_helper.py | 12 +++---------
1 file changed, 3 insertions(+), 9 deletions(-)
diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py
index d8cc6c7..4a048cd 100644
--- a/serenityff/charge/gnn/utils/rdkit_helper.py
+++ b/serenityff/charge/gnn/utils/rdkit_helper.py
@@ -8,9 +8,7 @@
from serenityff.charge.utils import Molecule
-def mols_from_sdf(
- sdf_file: str, removeHs: Optional[bool] = False
-) -> Sequence[Molecule]:
+def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[Molecule]:
"""
Returns a Sequence of rdkit molecules read in from a .sdf file.
@@ -24,9 +22,7 @@ def mols_from_sdf(
return Chem.SDMolSupplier(sdf_file, removeHs=removeHs)
-def get_mol_prop_as_array(
- prop_name: Optional[str], mol: Chem.Mol, dtype: type = float
-) -> np.ndarray:
+def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol, dtype: type = float) -> np.ndarray:
"""Get atomic properties from an RDKit molecule object as an array.
The property is expected to be a string of '|' separated numerical
@@ -86,9 +82,7 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor
TypeError
If any of the parsed property values are NaN or not convertable to float.
"""
- return pt.from_numpy(
- get_mol_prop_as_array(prop_name=prop_name, mol=mol, dtype=np.float32)
- )
+ return pt.from_numpy(get_mol_prop_as_array(prop_name=prop_name, mol=mol, dtype=np.float32))
def get_graph_from_mol(
From d95f0992025be16b19a18cbfe4d64bccaa66ead8 Mon Sep 17 00:00:00 2001
From: Niels Maeder
Date: Wed, 22 Oct 2025 09:16:05 +0200
Subject: [PATCH 7/9] changes requested by marc-lehner
---
serenityff/charge/gnn/utils/rdkit_helper.py | 14 ++++----
.../charge/gnn/utils/test_rdkit_helper.py | 36 +++++++++----------
2 files changed, 25 insertions(+), 25 deletions(-)
diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py
index 4a048cd..22c5ba3 100644
--- a/serenityff/charge/gnn/utils/rdkit_helper.py
+++ b/serenityff/charge/gnn/utils/rdkit_helper.py
@@ -22,7 +22,7 @@ def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[M
return Chem.SDMolSupplier(sdf_file, removeHs=removeHs)
-def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol, dtype: type = float) -> np.ndarray:
+def get_mol_prop_as_np_array(prop_name: Optional[str], mol: Chem.Mol, dtype: type = float) -> np.ndarray:
"""Get atomic properties from an RDKit molecule object as an array.
The property is expected to be a string of '|' separated numerical
@@ -45,10 +45,10 @@ def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol, dtype: type =
ValueError
If `prop_name` is None or if the property is not found in the molecule.
TypeError
- If any of the parsed property values are NaN or not convertable to float.
+ If any of the parsed property values are NaN or not convertible to float.
"""
if prop_name is None:
- raise ValueError("Property name can not be None when no_y == False.")
+ raise ValueError("Property name can not be None.")
if not mol.HasProp(prop_name):
raise ValueError(f"Property {prop_name} not found in molecule.") # noqa E713
array = np.fromstring(mol.GetProp(prop_name), sep="|", dtype=dtype)
@@ -57,7 +57,7 @@ def get_mol_prop_as_array(prop_name: Optional[str], mol: Chem.Mol, dtype: type =
return array
-def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor:
+def get_mol_prop_as_pt_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor:
"""Get atomic properties from an RDKit molecule object as a tensor.
The property is expected to be a string of '|' separated numerical
@@ -80,9 +80,9 @@ def get_mol_prop_as_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Tensor
ValueError
If `prop_name` is None or if the property is not found in the molecule.
TypeError
- If any of the parsed property values are NaN or not convertable to float.
+ If any of the parsed property values are NaN or not convertible to float.
"""
- return pt.from_numpy(get_mol_prop_as_array(prop_name=prop_name, mol=mol, dtype=np.float32))
+ return pt.from_numpy(get_mol_prop_as_np_array(prop_name=prop_name, mol=mol, dtype=np.float32))
def get_graph_from_mol(
@@ -140,7 +140,7 @@ def get_graph_from_mol(
)
else:
try:
- graph.y = get_mol_prop_as_tensor(sdf_property_name, mol)
+ graph.y = get_mol_prop_as_pt_tensor(sdf_property_name, mol)
except TypeError as exc:
print(exc)
return None
diff --git a/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py
index 9ea367f..1853533 100644
--- a/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py
+++ b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py
@@ -4,8 +4,8 @@
from rdkit import Chem
from serenityff.charge.gnn.utils.rdkit_helper import (
- get_mol_prop_as_array,
- get_mol_prop_as_tensor,
+ get_mol_prop_as_np_array,
+ get_mol_prop_as_pt_tensor,
)
@@ -32,65 +32,65 @@ def sample_mol_missing_prop():
return mol
-def test_get_mol_prop_as_tensor_success(sample_mol_with_prop):
+def test_get_mol_prop_as_pt_tensor_success(sample_mol_with_prop):
"""Test successful retrieval of property as a tensor."""
expected = pt.tensor([1.0, 2.5, -3.0], dtype=pt.float)
- result = get_mol_prop_as_tensor("test_prop", sample_mol_with_prop)
+ result = get_mol_prop_as_tpt_ensor("test_prop", sample_mol_with_prop)
assert isinstance(result, pt.Tensor)
assert pt.equal(result, expected)
-def test_get_mol_prop_as_tensor_raises_value_error_on_none_prop(
+def test_get_mol_prop_as_pt_tensor_raises_value_error_on_none_prop(
sample_mol_missing_prop,
):
"""Test ValueError is raised when prop_name is None."""
with pytest.raises(ValueError, match="Property name can not be None"):
- get_mol_prop_as_tensor(None, sample_mol_missing_prop)
+ get_mol_prop_as_pt_tensor(None, sample_mol_missing_prop)
-def test_get_mol_prop_as_tensor_raises_value_error_on_missing_prop(
+def test_get_mol_prop_as_pt_tensor_raises_value_error_on_missing_prop(
sample_mol_missing_prop,
):
"""Test ValueError is raised when the property is not found."""
with pytest.raises(ValueError, match="Property missing_prop not found"):
- get_mol_prop_as_tensor("missing_prop", sample_mol_missing_prop)
+ get_mol_prop_as_pt_tensor("missing_prop", sample_mol_missing_prop)
-def test_get_mol_prop_as_tensor_raises_type_error_on_nan(
+def test_get_mol_prop_as_pt_tensor_raises_type_error_on_nan(
sample_mol_with_nan_prop,
):
"""Test TypeError is raised when NaN is in the property string."""
with pytest.raises(TypeError, match="Nan found in test_prop_nan"):
- get_mol_prop_as_tensor("test_prop_nan", sample_mol_with_nan_prop)
+ get_mol_prop_as_pt_tensor("test_prop_nan", sample_mol_with_nan_prop)
-def test_get_mol_prop_as_array_success(sample_mol_with_prop):
+def test_get_mol_prop_as_np_array_success(sample_mol_with_prop):
"""Test successful retrieval of property as a numpy array."""
expected = np.array([1.0, 2.5, -3.0])
- result = get_mol_prop_as_array("test_prop", sample_mol_with_prop)
+ result = get_mol_prop_as_np_array("test_prop", sample_mol_with_prop)
assert isinstance(result, np.ndarray)
np.testing.assert_array_equal(result, expected)
-def test_get_mol_prop_as_array_raises_value_error_on_none_prop(
+def test_get_mol_prop_as_np_array_raises_value_error_on_none_prop(
sample_mol_missing_prop,
):
"""Test ValueError is raised when prop_name is None."""
with pytest.raises(ValueError, match="Property name can not be None"):
- get_mol_prop_as_array(None, sample_mol_missing_prop)
+ get_mol_prop_as_np_array(None, sample_mol_missing_prop)
-def test_get_mol_prop_as_array_raises_value_error_on_missing_prop(
+def test_get_mol_prop_as_np_array_raises_value_error_on_missing_prop(
sample_mol_missing_prop,
):
"""Test ValueError is raised when the property is not found."""
with pytest.raises(ValueError, match="Property missing_prop not found"):
- get_mol_prop_as_array("missing_prop", sample_mol_missing_prop)
+ get_mol_prop_as_np_array("missing_prop", sample_mol_missing_prop)
-def test_get_mol_prop_as_array_raises_type_error_on_nan(
+def test_get_mol_prop_as_np_array_raises_type_error_on_nan(
sample_mol_with_nan_prop,
):
"""Test TypeError is raised when NaN is in the property string."""
with pytest.raises(TypeError, match="Nan found in test_prop_nan"):
- get_mol_prop_as_array("test_prop_nan", sample_mol_with_nan_prop)
+ get_mol_prop_as_np_array("test_prop_nan", sample_mol_with_nan_prop)
From 4b5ce6e766adfbd3c984ba8a4a0a7f9ffe73715d Mon Sep 17 00:00:00 2001
From: Niels Maeder
Date: Wed, 22 Oct 2025 09:18:32 +0200
Subject: [PATCH 8/9] typo
---
tests/serenityff/charge/gnn/utils/test_rdkit_helper.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py
index 1853533..624fbd5 100644
--- a/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py
+++ b/tests/serenityff/charge/gnn/utils/test_rdkit_helper.py
@@ -35,7 +35,7 @@ def sample_mol_missing_prop():
def test_get_mol_prop_as_pt_tensor_success(sample_mol_with_prop):
"""Test successful retrieval of property as a tensor."""
expected = pt.tensor([1.0, 2.5, -3.0], dtype=pt.float)
- result = get_mol_prop_as_tpt_ensor("test_prop", sample_mol_with_prop)
+ result = get_mol_prop_as_pt_tensor("test_prop", sample_mol_with_prop)
assert isinstance(result, pt.Tensor)
assert pt.equal(result, expected)
From d12cd8efb33b69aa7c988f66c512d45ee7ccd39b Mon Sep 17 00:00:00 2001
From: Niels Maeder
Date: Thu, 23 Oct 2025 13:58:53 +0200
Subject: [PATCH 9/9] docstyle to sphinx
---
serenityff/charge/gnn/utils/rdkit_helper.py | 111 +++++++-------------
1 file changed, 40 insertions(+), 71 deletions(-)
diff --git a/serenityff/charge/gnn/utils/rdkit_helper.py b/serenityff/charge/gnn/utils/rdkit_helper.py
index 22c5ba3..1a56c6b 100644
--- a/serenityff/charge/gnn/utils/rdkit_helper.py
+++ b/serenityff/charge/gnn/utils/rdkit_helper.py
@@ -9,15 +9,11 @@
def mols_from_sdf(sdf_file: str, removeHs: Optional[bool] = False) -> Sequence[Molecule]:
- """
- Returns a Sequence of rdkit molecules read in from a .sdf file.
-
- Args:
- sdf_file (str): path to .sdf file.
- removeHs (Optional[bool], optional): Wheter to remove Hydrogens. Defaults to False.
+ """Return a sequence of RDKit molecules read from an .sdf file.
- Returns:
- Sequence[Molecule]: rdkit mols.
+ :param sdf_file: Path to the .sdf file.
+ :param removeHs: Whether to remove hydrogens. Defaults to False.
+ :return: A sequence of RDKit molecule objects.
"""
return Chem.SDMolSupplier(sdf_file, removeHs=removeHs)
@@ -28,24 +24,11 @@ def get_mol_prop_as_np_array(prop_name: Optional[str], mol: Chem.Mol, dtype: typ
The property is expected to be a string of '|' separated numerical
values, one for each atom in the molecule.
- Parameters
- ----------
- prop_name
- The name of the property to retrieve from the molecule.
- mol
- The RDKit molecule object.
-
- Returns
- -------
- np.ndarray
- The atomic properties converted to a NumPy array.
-
- Raises
- ------
- ValueError
- If `prop_name` is None or if the property is not found in the molecule.
- TypeError
- If any of the parsed property values are NaN or not convertible to float.
+ :param prop_name: The name of the property to retrieve from the molecule.
+ :param mol: The RDKit molecule object.
+ :return: The atomic properties converted to a NumPy array.
+ :raises ValueError: If ``prop_name`` is None or if the property is not found in the molecule.
+ :raises TypeError: If any of the parsed property values are NaN or not convertible to float.
"""
if prop_name is None:
raise ValueError("Property name can not be None.")
@@ -63,24 +46,11 @@ def get_mol_prop_as_pt_tensor(prop_name: Optional[str], mol: Chem.Mol) -> pt.Ten
The property is expected to be a string of '|' separated numerical
values, one for each atom in the molecule.
- Parameters
- ----------
- prop_name
- The name of the property to retrieve from the molecule.
- mol
- The RDKit molecule object.
-
- Returns
- -------
- pt.Tensor
- The atomic properties converted to a PyTorch tensor.
-
- Raises
- ------
- ValueError
- If `prop_name` is None or if the property is not found in the molecule.
- TypeError
- If any of the parsed property values are NaN or not convertible to float.
+ :param prop_name: The name of the property to retrieve from the molecule.
+ :param mol: The RDKit molecule object.
+ :return: The atomic properties converted to a PyTorch tensor.
+ :raises ValueError: If ``prop_name`` is None or if the property is not found in the molecule.
+ :raises TypeError: If any of the parsed property values are NaN or not convertible to float.
"""
return pt.from_numpy(get_mol_prop_as_np_array(prop_name=prop_name, mol=mol, dtype=np.float32))
@@ -103,34 +73,33 @@ def get_graph_from_mol(
],
no_y: Optional[bool] = False,
) -> Optional[CustomData]:
+ """Create a PyTorch Geometric graph from an RDKit molecule.
+
+ Returns ``None`` if the specified property is not found or contains NaN.
+
+ The graph contains the following features:
+
+ **Node features**
+ - Atom type (as specified in the `allowable_set`)
+ - Formal charge
+ - Hybridization
+ - H acceptor/donor
+ - Aromaticity
+ - Degree
+
+ **Edge features**
+ - Bond type
+ - Is in ring
+ - Is conjugated
+ - Stereo information
+
+ :param mol: The RDKit molecule.
+ :param sdf_property_name: Name of the property in the SDF file to be used for training.
+ :param allowable_set: List of atoms to include in the feature vector. Defaults to
+ ``["C", "N", "O", "F", "P", "S", "Cl", "Br", "I", "H"]``.
+ :return: A PyTorch Geometric ``Data`` object with an additional ``.smiles`` attribute,
+ or ``None`` if the property is invalid.
"""
- Creates an pytorch_geometric Graph from an rdkit molecule.
-
- Returns None if the property is not found or contains NaN.
- The graph contains following features:
- > Node Features:
- > Atom Type (as specified in allowable set)
- > formal_charge
- > hybridization
- > H acceptor_donor
- > aromaticity
- > degree
- > Edge Features:
- > Bond type
- > is in ring
- > is conjugated
- > stereo
- Args:
- mol (Molecule): rdkit molecule
- sdf_property_name (Optional[str]): Name of the property in the sdf file to be used for training.
- allowable_set (Optional[List[str]], optional): List of atoms to be \
- included in the feature vector. Defaults to \
- [ "C", "N", "O", "F", "P", "S", "Cl", "Br", "I", "H", ].
-
- Returns:
- CustomData: pytorch geometric Data with .smiles as an extra attribute.
- """
-
grapher = MolGraphConvFeaturizer(use_edges=True)
graph = grapher._featurize(mol, allowable_set).to_pyg_graph()
if no_y: