Enrichment class for enriching gene names with their function and sequence using UniProt.
EnrichmentWithUniProt
Bases: Enrichments
Enrichment class using UniProt
Source code in aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
class EnrichmentWithUniProt(Enrichments):
    """
    Enrichment class using UniProt
    """

    def enrich_documents(
        self, texts: list[str]
    ) -> "tuple[list[str | None], list[str | None]]":
        """
        Enrich a list of input UniProt gene names with their function and sequence.

        One HTTP GET request is sent per gene name to the URL configured under
        ``utils.enrichments.uniprot_proteins`` in the Hydra configuration.

        Args:
            texts: The list of gene names to be enriched.

        Returns:
            A tuple ``(descriptions, sequences)`` of two lists, each with one
            entry per input gene name, in input order. A description is the
            concatenation of the ``FUNCTION`` comment texts of the first
            returned entry (an empty string if there are none). Both entries
            are ``None`` when the response is not OK or its body is empty;
            the sequence is ``None`` when the entry carries no sequence.
        """
        logger.log(
            logging.INFO,
            "Load Hydra configuration for Gene enrichment with description and sequence.",
        )
        with hydra.initialize(version_base=None, config_path="../../configs"):
            cfg = hydra.compose(
                config_name="config",
                overrides=["utils/enrichments/uniprot_proteins=default"],
            )
            cfg = cfg.utils.enrichments.uniprot_proteins

        descriptions = []
        sequences = []
        for gene in texts:
            params = {
                "reviewed": cfg.reviewed,
                "isoform": cfg.isoform,
                "exact_gene": gene,
                "organism": cfg.organism,
                # You can get the list of all available organisms here:
                # https://www.uniprot.org/help/taxonomy
            }
            r = requests.get(
                cfg.uniprot_url,
                headers={"Accept": "application/json"},
                params=params,
                timeout=cfg.timeout,
            )
            # if the response is not ok
            if not r.ok:
                descriptions.append(None)
                sequences.append(None)
                continue
            response_body = json.loads(r.text)
            # if the response body is empty
            if not response_body:
                descriptions.append(None)
                sequences.append(None)
                continue

            # Only the first returned entry is used.
            entry = response_body[0]
            # Fields are read defensively: an entry without comments, comment
            # text or sequence must not abort the whole batch with a KeyError.
            description = "".join(
                value.get("value", "")
                for comment in entry.get("comments", [])
                if comment.get("type") == "FUNCTION"
                for value in comment.get("text", [])
            )
            sequence = entry.get("sequence", {}).get("sequence")
            descriptions.append(description)
            sequences.append(sequence)
        return descriptions, sequences

    def enrich_documents_with_rag(self, texts, docs):
        """
        Enrich a list of input UniProt gene names with their function and sequence.

        This method delegates to ``enrich_documents``; ``docs`` is accepted
        but not used by this implementation.

        Args:
            texts: The list of gene names to be enriched.
            docs: Not used by this method.

        Returns:
            The ``(descriptions, sequences)`` tuple returned by
            ``enrich_documents(texts)``.
        """
        return self.enrich_documents(texts)
|
enrich_documents(texts)
Enrich a list of input UniProt gene names with their function and sequence.
Parameters:
Name |
Type |
Description |
Default |
texts
|
list[str]
|
The list of gene names to be enriched.
|
required
|
Returns:
Type |
Description |
list[str]
|
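A pair of lists (function descriptions, sequences) with one entry per input gene name; an entry is None when the UniProt response is not OK or is empty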
|
Source code in aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def enrich_documents(
    self, texts: list[str]
) -> "tuple[list[str | None], list[str | None]]":
    """
    Enrich a list of input UniProt gene names with their function and sequence.

    One HTTP GET request is sent per gene name to the URL configured under
    ``utils.enrichments.uniprot_proteins`` in the Hydra configuration.

    Args:
        texts: The list of gene names to be enriched.

    Returns:
        A tuple ``(descriptions, sequences)`` of two lists, each with one
        entry per input gene name, in input order. A description is the
        concatenation of the ``FUNCTION`` comment texts of the first
        returned entry (an empty string if there are none). Both entries
        are ``None`` when the response is not OK or its body is empty;
        the sequence is ``None`` when the entry carries no sequence.
    """
    logger.log(
        logging.INFO,
        "Load Hydra configuration for Gene enrichment with description and sequence.",
    )
    with hydra.initialize(version_base=None, config_path="../../configs"):
        cfg = hydra.compose(
            config_name="config",
            overrides=["utils/enrichments/uniprot_proteins=default"],
        )
        cfg = cfg.utils.enrichments.uniprot_proteins

    descriptions = []
    sequences = []
    for gene in texts:
        params = {
            "reviewed": cfg.reviewed,
            "isoform": cfg.isoform,
            "exact_gene": gene,
            "organism": cfg.organism,
            # You can get the list of all available organisms here:
            # https://www.uniprot.org/help/taxonomy
        }
        r = requests.get(
            cfg.uniprot_url,
            headers={"Accept": "application/json"},
            params=params,
            timeout=cfg.timeout,
        )
        # if the response is not ok
        if not r.ok:
            descriptions.append(None)
            sequences.append(None)
            continue
        response_body = json.loads(r.text)
        # if the response body is empty
        if not response_body:
            descriptions.append(None)
            sequences.append(None)
            continue

        # Only the first returned entry is used.
        entry = response_body[0]
        # Fields are read defensively: an entry without comments, comment
        # text or sequence must not abort the whole batch with a KeyError.
        description = "".join(
            value.get("value", "")
            for comment in entry.get("comments", [])
            if comment.get("type") == "FUNCTION"
            for value in comment.get("text", [])
        )
        sequence = entry.get("sequence", {}).get("sequence")
        descriptions.append(description)
        sequences.append(sequence)
    return descriptions, sequences
|
enrich_documents_with_rag(texts, docs)
Enrich a list of input UniProt gene names with their function and sequence.
Parameters:
Name |
Type |
Description |
Default |
texts
|
|
The list of gene names to be enriched.
|
required
|
Returns:
Type |
Description |
|
A pair of lists (function descriptions, sequences) with one entry per input gene name, exactly as returned by enrich_documents(texts)
|
Source code in aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py
88
89
90
91
92
93
94
95
96
97
def enrich_documents_with_rag(self, texts, docs):
    """
    Look up the UniProt function and sequence for each given gene name.

    The call is forwarded unchanged to ``enrich_documents``; ``docs`` is
    accepted but not used by this implementation.

    Args:
        texts: The list of gene names to be enriched.
        docs: Not used by this method.

    Returns:
        Whatever ``enrich_documents(texts)`` returns.
    """
    enriched = self.enrich_documents(texts)
    return enriched
|