| import numpy as np |
| import torch |
| from rdkit import Chem |
| from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator |
| import selfies as sf |
| from torch_geometric.data import Data |
| from torch_geometric.loader import DataLoader |
|
|
# Module-wide Morgan fingerprint generator shared by the fingerprint helpers
# in this file: radius-2 environments folded into a 2048-bit vector.
mfpgen = GetMorganGenerator(
    # Environment radius and length of the folded fingerprint.
    radius=2,
    fpSize=2048,
    # Count simulation is disabled and no count bounds are supplied.
    countSimulation=False,
    countBounds=None,
    # Flags controlling which atom/bond properties feed the environments.
    includeChirality=False,
    useBondTypes=True,
    includeRingMembership=True,
    onlyNonzeroInvariants=False,
    includeRedundantEnvironments=False,
    # No custom invariant generators are supplied.
    atomInvariantsGenerator=None,
    bondInvariantsGenerator=None,
)
|
|
|
|
def smiles_to_graph(smiles):
    """Convert a SMILES string into a graph via ``mol_to_graph``.

    Returns ``None`` when RDKit cannot parse ``smiles``; otherwise returns
    whatever ``mol_to_graph`` produces for the parsed molecule.
    """
    parsed = Chem.MolFromSmiles(smiles)
    return None if parsed is None else mol_to_graph(parsed)
|
|
|
|
def selfies_to_graph(smiles_string):
    """Build a graph from a SMILES string after a SELFIES round trip.

    The input is a SMILES string (despite the function name).  It is encoded
    to SELFIES, decoded back to SMILES, and the decoded molecule is passed to
    ``mol_to_graph``.  If any step of that round trip fails, or the decoded
    SMILES cannot be parsed, the original ``smiles_string`` is converted
    directly with ``smiles_to_graph`` instead.

    Returns the resulting graph, or ``None`` when the fallback cannot parse
    the input either.
    """
    try:
        roundtrip_smiles = sf.decoder(sf.encoder(smiles_string))
        decoded_mol = Chem.MolFromSmiles(roundtrip_smiles)
        if decoded_mol is not None:
            return mol_to_graph(decoded_mol)
    except Exception:
        # Any failure in the round trip is handled exactly like an
        # unparsable decoded molecule: fall through to the plain SMILES path.
        pass
    return smiles_to_graph(smiles_string)
|
|
|
|
def ecfp_to_graph(smiles_str: str, max_bits: int = 2048, k: int = 2) -> Data | None:
    """Build a graph whose nodes are the active bits of a Morgan fingerprint.

    Each set bit of the fingerprint produced by ``mfpgen`` becomes one node
    carrying a one-hot feature row of width ``max_bits``.  Nodes follow the
    order returned by ``np.nonzero`` (ascending bit index), and every node is
    linked, in both directions, to the next ``k`` nodes in that order.

    Args:
        smiles_str: SMILES string of the molecule.
        max_bits: Width of the one-hot node features.  It must exceed every
            active bit index (i.e. be at least the fingerprint size of
            ``mfpgen``); otherwise indexing raises ``IndexError``.
        k: Number of following nodes each node is connected to.

    Returns:
        A ``Data`` object with ``x`` of shape ``(n, max_bits)`` and
        ``edge_index`` of shape ``(2, E)``, or ``None`` when the SMILES cannot
        be parsed or the fingerprint has no active bits.
    """
    mol = Chem.MolFromSmiles(smiles_str)
    if mol is None:
        return None

    active_bits = np.nonzero(mfpgen.GetFingerprintAsNumPy(mol))[0]
    n = len(active_bits)
    if n == 0:
        return None

    edges = []
    for i in range(n):
        for j in range(i + 1, min(i + 1 + k, n)):
            edges.extend(([i, j], [j, i]))
    # Reshape with explicit sizes so that a graph without edges (a single
    # active bit, or k <= 0) still yields a (2, 0) edge_index rather than the
    # 1-D empty tensor that torch.tensor([]) would otherwise produce.
    edge_index = (
        torch.tensor(edges, dtype=torch.long)
        .reshape(len(edges), 2)
        .t()
        .contiguous()
    )

    # One-hot encode all active bits at once: row i gets a 1 in column
    # active_bits[i].
    x = torch.zeros((n, max_bits), dtype=torch.float)
    x[torch.arange(n), torch.as_tensor(active_bits, dtype=torch.long)] = 1.0
    return Data(x=x, edge_index=edge_index)
|
|
|
|
def mol_to_graph(mol):
    """Convert a molecule object into a ``Data`` graph.

    Node features (one row per atom, in ``mol.GetAtoms()`` order) are the
    atomic number, degree, formal charge and atom index.  Every bond
    contributes two directed edges (begin -> end and end -> begin), each with
    a single feature: the bond type as a double.

    Args:
        mol: Molecule exposing ``GetAtoms()`` and ``GetBonds()``.

    Returns:
        ``Data`` with ``x`` of shape ``(num_atoms, 4)``, ``edge_index`` of
        shape ``(2, 2 * num_bonds)`` and ``edge_attr`` of shape
        ``(2 * num_bonds, 1)``.  These shapes also hold for molecules without
        bonds or atoms.
    """
    # NOTE(review): the atom index is part of the feature vector, which makes
    # features depend on atom ordering -- confirm this is intended.
    atom_feats = [
        [
            atom.GetAtomicNum(),
            atom.GetDegree(),
            atom.GetFormalCharge(),
            atom.GetIdx(),
        ]
        for atom in mol.GetAtoms()
    ]

    edge_pairs = []
    bond_feats = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        btype = bond.GetBondTypeAsDouble()
        # Store both directions so the graph is effectively undirected.
        edge_pairs += [(i, j), (j, i)]
        bond_feats += [[btype], [btype]]

    # torch.tensor([]) is 1-D, so reshape with explicit sizes to keep the
    # 2-D layouts even when a list is empty (e.g. a single-atom molecule
    # has no bonds).
    x = torch.tensor(atom_feats, dtype=torch.float).reshape(len(atom_feats), 4)
    edge_index = (
        torch.tensor(edge_pairs, dtype=torch.long)
        .reshape(len(edge_pairs), 2)
        .t()
        .contiguous()
    )
    edge_attr = torch.tensor(bond_feats, dtype=torch.float).reshape(
        len(bond_feats), 1
    )
    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
|
|
|
|
def smiles_for_gp(smiles: str) -> np.ndarray:
    """Return the Morgan fingerprint of ``smiles`` as a float32 vector.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The fingerprint produced by ``mfpgen`` cast to ``float32``.  When the
        SMILES cannot be parsed, an all-zero ``float32`` vector with the
        generator's configured fingerprint size is returned instead.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # The generator object does not provide ``GetNumBits`` (that method
        # belongs to bit vectors); its configured length is read from the
        # generator options instead.
        return np.zeros(mfpgen.GetOptions().fpSize, dtype=np.float32)
    return mfpgen.GetFingerprintAsNumPy(mol).astype(np.float32)
|
|
|
|
def selfies_for_gp(selfies_str, radius=2, n_bits=2048):
    """Return the fingerprint vector for a SELFIES string.

    The SELFIES string is decoded to SMILES and passed to ``smiles_for_gp``.
    This is a best-effort helper: if decoding or fingerprinting fails, an
    all-zero vector is returned rather than raising.

    Args:
        selfies_str: SELFIES representation of the molecule.
        radius: Unused; kept so existing callers keep working.  The
            fingerprint radius is determined by ``smiles_for_gp``.
        n_bits: Length of the all-zero fallback vector only.

    Returns:
        The result of ``smiles_for_gp`` for the decoded SMILES, or
        ``np.zeros(n_bits, dtype=np.float32)`` on failure.
    """
    try:
        smiles = sf.decoder(selfies_str)
        # Explicit check instead of ``assert``, which is stripped under -O.
        if isinstance(smiles, str):
            return smiles_for_gp(smiles)
    except Exception:
        # Deliberate best effort: fall through to the zero vector.  Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass
    # float32 keeps the fallback dtype consistent with ``smiles_for_gp``.
    return np.zeros(n_bits, dtype=np.float32)
|
|
|
|
def ecfp_for_gp(smiles_str: str) -> np.ndarray:
    """Return the ``mfpgen`` fingerprint of ``smiles_str`` as float32.

    Args:
        smiles_str: SMILES string of the molecule.

    Returns:
        The fingerprint cast to ``float32``, or an all-zero ``float32`` vector
        with the generator's configured fingerprint size when the SMILES
        cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles_str)
    if mol is None:
        # The generator object does not provide ``GetNumBits``; read the
        # configured fingerprint length from its options instead.
        return np.zeros(mfpgen.GetOptions().fpSize, dtype=np.float32)
    return mfpgen.GetFingerprintAsNumPy(mol).astype(np.float32)
|
|
|
|
def graph_native_loader(graph_list, batch_size=32, shuffle=True):
    """Wrap a list of graphs in a ``DataLoader``.

    ``batch_size`` and ``shuffle`` are forwarded unchanged to the loader.
    """
    loader = DataLoader(
        graph_list,
        batch_size=batch_size,
        shuffle=shuffle,
    )
    return loader
|
|