Skip to content

EnrichmentWithPubChem

Enrichment class for enriching PubChem IDs with their SMILES representation and descriptions.

EnrichmentWithPubChem

Bases: Enrichments

Enrichment class using PubChem

Source code in aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
class EnrichmentWithPubChem(Enrichments):
    """
    Enrichment class using PubChem.

    Looks up each input PubChem compound ID (CID) over HTTP and collects its
    SMILES string together with a description.
    """
    def enrich_documents(self, texts: List[str]) -> List[str]:
        """
        Enrich a list of input PubChem IDs with their SMILES strings and descriptions.

        Args:
            texts: The list of PubChem IDs to be enriched.

        Returns:
            A 2-tuple ``(descriptions, smiles)`` of two lists, each with one
            entry per input ID and in the same order as ``texts``. Note that
            this is a tuple, not the single list the ``List[str]`` annotation
            suggests. For a given ID, both entries are ``None`` when the
            response has no "PropertyTable" key, and both are ``''`` when the
            property list is present but empty.
        """

        enriched_pubchem_ids_smiles = []
        enriched_pubchem_ids_descriptions = []

        # Load Hydra configuration to get the base URL for PubChem
        with hydra.initialize(version_base=None, config_path="../../configs"):
            cfg = hydra.compose(config_name='config',
                                overrides=['utils/pubchem_utils=default'])
            cfg = cfg.utils.pubchem_utils
        # Iterate over each PubChem ID in the input list
        for pubchem_cid in texts:
            # Prepare the URL
            pubchem_url = f"{cfg.pubchem_cid2smiles_url}/{pubchem_cid}/property/smiles/JSON"
            # Get the data
            response = requests.get(pubchem_url, timeout=60)
            data = response.json()
            # Defaults used when the property list is present but empty
            smiles = ''
            description = ''
            if "PropertyTable" in data:
                properties = data["PropertyTable"]['Properties']
                if properties:
                    # When several property records are returned, the last one wins.
                    smiles = properties[-1].get("SMILES", '')
                    # The description depends only on the CID, so look it up
                    # once per ID instead of once per property record.
                    description = pubchem_cid_description(pubchem_cid)
            else:
                # If the PubChem ID is not found, set smiles and description to None
                smiles = None
                description = None
            enriched_pubchem_ids_smiles.append(smiles)
            enriched_pubchem_ids_descriptions.append(description)

        # Descriptions come first, SMILES second.
        return enriched_pubchem_ids_descriptions, enriched_pubchem_ids_smiles

    def enrich_documents_with_rag(self, texts, docs):
        """
        Enrich a list of input PubChem IDs with their SMILES strings and descriptions.

        Delegates to ``enrich_documents``; ``docs`` is not used.

        Args:
            texts: The list of PubChem IDs to be enriched.
            docs: Unused by this implementation.

        Returns:
            Whatever ``enrich_documents(texts)`` returns, i.e. the
            ``(descriptions, smiles)`` tuple of lists.
        """
        return self.enrich_documents(texts)

enrich_documents(texts)

Enrich a list of input PubChem IDs with their SMILES representation and descriptions.

Parameters:

Name Type Description Default
texts List[str]

The list of pubchem IDs to be enriched.

required

Returns:

Type Description
List[str]

A tuple of two lists: the descriptions and the SMILES strings of the input PubChem IDs.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def enrich_documents(self, texts: List[str]) -> List[str]:
    """
    Enrich a list of input PubChem IDs with their SMILES strings and descriptions.

    Args:
        texts: The list of PubChem IDs to be enriched.

    Returns:
        A 2-tuple ``(descriptions, smiles)`` of two lists, each with one
        entry per input ID and in the same order as ``texts``. Note that
        this is a tuple, not the single list the ``List[str]`` annotation
        suggests. For a given ID, both entries are ``None`` when the
        response has no "PropertyTable" key, and both are ``''`` when the
        property list is present but empty.
    """

    enriched_pubchem_ids_smiles = []
    enriched_pubchem_ids_descriptions = []

    # Load Hydra configuration to get the base URL for PubChem
    with hydra.initialize(version_base=None, config_path="../../configs"):
        cfg = hydra.compose(config_name='config',
                            overrides=['utils/pubchem_utils=default'])
        cfg = cfg.utils.pubchem_utils
    # Iterate over each PubChem ID in the input list
    for pubchem_cid in texts:
        # Prepare the URL
        pubchem_url = f"{cfg.pubchem_cid2smiles_url}/{pubchem_cid}/property/smiles/JSON"
        # Get the data
        response = requests.get(pubchem_url, timeout=60)
        data = response.json()
        # Defaults used when the property list is present but empty
        smiles = ''
        description = ''
        if "PropertyTable" in data:
            properties = data["PropertyTable"]['Properties']
            if properties:
                # When several property records are returned, the last one wins.
                smiles = properties[-1].get("SMILES", '')
                # The description depends only on the CID, so look it up
                # once per ID instead of once per property record.
                description = pubchem_cid_description(pubchem_cid)
        else:
            # If the PubChem ID is not found, set smiles and description to None
            smiles = None
            description = None
        enriched_pubchem_ids_smiles.append(smiles)
        enriched_pubchem_ids_descriptions.append(description)

    # Descriptions come first, SMILES second.
    return enriched_pubchem_ids_descriptions, enriched_pubchem_ids_smiles

enrich_documents_with_rag(texts, docs)

Enrich a list of input PubChem IDs with their SMILES representation and descriptions.

Parameters:

Name Type Description Default
texts

The list of pubchem IDs to be enriched.

required
docs

Not used by this enrichment.

required

Returns:

Type Description

A tuple of two lists: the descriptions and the SMILES strings of the input PubChem IDs.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py
65
66
67
68
69
70
71
72
73
74
75
76
def enrich_documents_with_rag(self, texts, docs):
    """
    Enrich PubChem IDs by delegating to ``enrich_documents``.

    Args:
        texts: The list of PubChem IDs to be enriched.
        docs: Accepted but not used by this implementation.

    Returns:
        The result of ``self.enrich_documents(texts)``, unchanged.
    """
    # No retrieval step is performed here; `docs` is ignored.
    enriched = self.enrich_documents(texts)
    return enriched