Multimodal BioBridge-PrimeKG (IBD) Graph Construction¶
In this tutorial, we will perform a simple pre-processing task over the BioBridge-PrimeKG dataset, which provides multimodal data. In particular, we use the embeddings pre-loaded by BioBridge joined with the PrimeKG IBD dataset obtained from the previous tutorial:
docs/notebooks/talk2knowledgegraphs/tutorial_primekg_subgraph.ipynb
First of all, we need to import the necessary libraries as follows:
# Import necessary libraries
# %load_ext cudf.pandas
import os
import numpy as np
import pandas as pd
import networkx as nx
import pickle
import blosc
from tqdm import tqdm
from torch_geometric.utils import from_networkx
import sys
sys.path.append('../../..')
from aiagents4pharma.talk2knowledgegraphs.datasets.starkqa_primekg import StarkQAPrimeKG
from aiagents4pharma.talk2knowledgegraphs.datasets.biobridge_primekg import BioBridgePrimeKG
from aiagents4pharma.talk2knowledgegraphs.utils.embeddings.ollama import EmbeddingWithOllama
# from aiagents4pharma.talk2knowledgegraphs.utils import kg_utils
# Set the logging level for httpx to WARNING to suppress INFO messages
import logging
logging.getLogger("httpx").setLevel(logging.WARNING)
Prepare BioBridge dataset¶
The BioBridgePrimeKG class downloads the data from its related GitHub repository if the data is not available locally.
Otherwise, the data is loaded from the local directories defined by local_dir and primekg_dir.
# Define biobridge primekg data by providing a local directory where the data is stored
biobridge_data = BioBridgePrimeKG(primekg_dir="../../../../data/primekg/",
local_dir="../../../../data/biobridge_primekg/")
# Invoke a method to load the data
biobridge_data.load_data()
# Get the node information of the BioBridge PrimeKG
biobridge_node_info = biobridge_data.get_node_info_dict()
biobridge_node_info.keys()
Loading PrimeKG dataset...
Loading nodes of PrimeKG dataset ...
../../../../data/primekg/primekg_nodes.tsv.gz already exists. Loading the data from the local directory.
Loading edges of PrimeKG dataset ...
../../../../data/primekg/primekg_edges.tsv.gz already exists. Loading the data from the local directory.
Loading data config file of BioBridgePrimeKG...
File data_config.json already exists in ../../../../data/biobridge_primekg/.
Building node embeddings...
Building full triplets...
Building train-test split...
dict_keys(['gene/protein', 'molecular_function', 'cellular_component', 'biological_process', 'drug', 'disease'])
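As a quick illustrative check (the exact columns depend on the BioBridge release), we can peek at one of these per-type dataframes and its modality-specific feature column, e.g., the amino-acid sequence for gene/protein nodes:
# Inspect a few gene/protein records and their sequence feature (sanity check only)
biobridge_node_info['gene/protein'][['node_index', 'sequence']].head()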
We also utilize another source of information, StarkQA PrimeKG, which provides additional details for each node in the graph.
We can use the StarkQAPrimeKG class to load the data.
Subsequently, after loading the data with the load_data method, we can obtain the node information via the get_starkqa_node_info method.
# As an additional source of information, we utilize StarkQA PrimeKG
starkqa_data = StarkQAPrimeKG(local_dir="../../../../data/starkqa_primekg/")
# Invoke a method to load the data
starkqa_data.load_data()
# Get the node information of the StarkQA PrimeKG
starkqa_node_info = starkqa_data.get_starkqa_node_info()
Loading StarkQAPrimeKG dataset...
../../../../data/starkqa_primekg/qa/prime/stark_qa/stark_qa.csv already exists. Loading the data from the local directory.
Loading StarkQAPrimeKG embeddings...
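The resulting dictionary is keyed by the PrimeKG node index, and each entry carries a 'details' field that we will use for enrichment below. As an illustrative check (assuming node index 144, SMAD3, is present, as shown later in this tutorial), we can inspect the detail keys of a single node:
# Peek at the detail keys available for one gene/protein node (SMAD3, node index 144)
starkqa_node_info[144]['details'].keys()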
The following code prepares the node and edge dataframes from the BioBridge dataset.
# Prepare BioBridge-PrimeKG edges
# Build the node index list
node_info_dict = {}
node_index_list = []
for node_type in biobridge_data.preselected_node_types:
    df_node = pd.read_csv(os.path.join(biobridge_data.local_dir, "processed", f"{node_type}.csv"))
    node_info_dict[biobridge_data.node_type_map[node_type]] = df_node
    node_index_list.extend(df_node["node_index"].tolist())

# Filter the PrimeKG dataset to take into account only the selected node types
edges_df = biobridge_data.primekg.get_edges().copy()
edges_df = edges_df[
    edges_df["head_index"].isin(node_index_list) &
    edges_df["tail_index"].isin(node_index_list)
]
edges_df = edges_df.reset_index(drop=True)

# Further filter out nodes that are not in the embedding dictionary
edges_df = edges_df[
    edges_df["head_index"].isin(list(biobridge_data.emb_dict.keys())) &
    edges_df["tail_index"].isin(list(biobridge_data.emb_dict.keys()))
].reset_index(drop=True)

# Prepare BioBridge-PrimeKG nodes
nodes_df = biobridge_data.primekg.get_nodes().copy()
nodes_df = nodes_df[nodes_df["node_index"].isin(np.unique(np.concatenate([edges_df.head_index.unique(),
                                                                          edges_df.tail_index.unique()])))].reset_index(drop=True)
As we would like to use a small subset of the PrimeKG dataset in this tutorial, we load the IBD graph data and further filter the BioBridge-PrimeKG dataset with it.
# Load IBD PyG data to further filter the nodes
local_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/'
with open(os.path.join(local_dir, 'primekg_ibd_pyg_graph.pkl'), 'rb') as f:
    ibd_pyg_graph = pickle.load(f)
# Get node name
ibd_node_name = [node.split('_')[0] for node in ibd_pyg_graph.node_id]
# ibd_node_name
# Filter the nodes using node name existing in the IBD PyG graph
nodes_df = nodes_df[nodes_df["node_name"].isin(ibd_node_name)].reset_index(drop=True)
nodes_df.head(5)
  | node_index | node_name | node_source | node_id | node_type
---|---|---|---|---|---|
0 | 144 | SMAD3 | NCBI | 4088 | gene/protein |
1 | 179 | IL10RB | NCBI | 3588 | gene/protein |
2 | 192 | GNA12 | NCBI | 2768 | gene/protein |
3 | 279 | HNF4A | NCBI | 3172 | gene/protein |
4 | 417 | VCAM1 | NCBI | 7412 | gene/protein |
# Check the number of nodes
print(f"Number of nodes: {len(nodes_df)}")
Number of nodes: 2991
# Check the number of edges
print(f"Number of edges: {len(edges_df)}")
Number of edges: 3904610
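Before moving on, it can be helpful to see how the remaining nodes are distributed over node types (illustrative check only):
# Count the filtered nodes per node type
nodes_df['node_type'].value_counts()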
Modal-Specific Enrichment & Embedding¶
The BioBridge dataset provides multimodal data for diverse node types, including gene/protein, molecular_function, cellular_component, biological_process, drug, and disease. The following code snippet demonstrates how to obtain such information.
# Define feature columns
dict_feature_columns = {
    "gene/protein": "sequence",
    "molecular_function": "description",
    "cellular_component": "description",
    "biological_process": "description",
    "drug": "smiles",
    "disease": "definition",
}
# Obtain the node embeddings of the BioBridge
biobridge_node_embeddings = biobridge_data.get_node_embeddings()
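Assuming the returned object is a plain dictionary keyed by the PrimeKG node index (it is indexed that way later in this tutorial), a quick illustrative check of its size and the length of one embedding vector:
# Number of pre-loaded BioBridge embeddings and the length of one vector
print(len(biobridge_node_embeddings))
print(len(next(iter(biobridge_node_embeddings.values()))))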
Node Enrichment & Embedding¶
As mentioned earlier, we can use the StarkQA-PrimeKG dataset to simplify the textual enrichment of the nodes.
def get_textual_enrichment(data, node_info):
    """
    Enrich the node with additional information from StarkQA-PrimeKG

    Args:
        data (dict): The node data from PrimeKG
        node_info (dict): The node information from StarkQA-PrimeKG

    Returns:
        str: The enriched textual description of the node
    """
    # Basic textual enrichment of the node
    enriched_node = f"{data['node_name']} belongs to {data['node_type']} node. "

    # Only enrich the node if the node type is gene/protein, drug, disease, or pathway,
    # which has additional information in the node_info of StarkQA-PrimeKG
    added_info = ''
    if data['node_type'] == 'gene/protein':
        added_info += f"{data['node_name']} is {node_info['details']['name']}. " if 'name' in node_info['details'] else ''
        added_info += node_info['details']['summary'] if 'summary' in node_info['details'] else ''
    elif data['node_type'] == 'drug':
        added_info = ' '.join([str(node_info['details']['description']).replace('nan', ''),
                               str(node_info['details']['mechanism_of_action']).replace('nan', ''),
                               str(node_info['details']['protein_binding']).replace('nan', ''),
                               str(node_info['details']['pharmacodynamics']).replace('nan', ''),
                               str(node_info['details']['indication']).replace('nan', '')])
    elif data['node_type'] == 'disease':
        added_info = ' '.join([str(node_info['details']['mondo_definition']).replace('nan', ''),
                               str(node_info['details']['mayo_symptoms']).replace('nan', ''),
                               str(node_info['details']['mayo_causes']).replace('nan', '')])
    elif data['node_type'] == 'pathway':
        added_info += f"This pathway is found in {node_info['details']['speciesName']}. " + ' '.join([x['text'] for x in node_info['details']['summation']]) if 'details' in node_info else ''

    # Append the additional information for enrichment
    enriched_node += added_info

    return enriched_node
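Before applying the enrichment to the whole dataframe, we can try it on a single node as a quick sanity check:
# Enrich a single node as a quick check
sample_node = nodes_df.iloc[0]
print(get_textual_enrichment(sample_node, starkqa_node_info[sample_node['node_index']]))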
# Enrich the node with additional textual description from StarkQA-PrimeKG
nodes_df["desc"] = nodes_df.apply(lambda x: get_textual_enrichment(x, starkqa_node_info[x['node_index']]), axis=1)
nodes_df.head(5)
  | node_index | node_name | node_source | node_id | node_type | desc
---|---|---|---|---|---|---|
0 | 144 | SMAD3 | NCBI | 4088 | gene/protein | SMAD3 belongs to gene/protein node. SMAD3 is S... |
1 | 179 | IL10RB | NCBI | 3588 | gene/protein | IL10RB belongs to gene/protein node. IL10RB is... |
2 | 192 | GNA12 | NCBI | 2768 | gene/protein | GNA12 belongs to gene/protein node. GNA12 is G... |
3 | 279 | HNF4A | NCBI | 3172 | gene/protein | HNF4A belongs to gene/protein node. HNF4A is h... |
4 | 417 | VCAM1 | NCBI | 7412 | gene/protein | VCAM1 belongs to gene/protein node. VCAM1 is v... |
Afterwards, we compute embeddings over the description column using an Ollama model (i.e., nomic-embed-text).
# Embed the textual descriptions in the 'desc' column with 'nomic-embed-text' embeddings
# Using nomic-ai/nomic-embed-text-v1.5 model via Ollama
emb_model = EmbeddingWithOllama(model_name='nomic-embed-text')

# Use mini-batch processing to perform the embedding
mini_batch_size = 100
desc_embeddings = []
for i in tqdm(range(0, nodes_df.shape[0], mini_batch_size)):
    outputs = emb_model.embed_documents(nodes_df.desc.values.tolist()[i:i+mini_batch_size])
    desc_embeddings.extend(outputs)
# Add them as features to the dataframe
nodes_df['desc_emb'] = desc_embeddings
nodes_df.head(5)
100%|██████████| 30/30 [00:17<00:00, 1.74it/s]
  | node_index | node_name | node_source | node_id | node_type | desc | desc_emb
---|---|---|---|---|---|---|---|
0 | 144 | SMAD3 | NCBI | 4088 | gene/protein | SMAD3 belongs to gene/protein node. SMAD3 is S... | [0.029749377, 0.053500228, -0.1706713, -0.0258... |
1 | 179 | IL10RB | NCBI | 3588 | gene/protein | IL10RB belongs to gene/protein node. IL10RB is... | [0.028421732, 0.019860065, -0.16853006, -0.038... |
2 | 192 | GNA12 | NCBI | 2768 | gene/protein | GNA12 belongs to gene/protein node. GNA12 is G... | [0.003668847, 0.05138056, -0.13865656, -0.0554... |
3 | 279 | HNF4A | NCBI | 3172 | gene/protein | HNF4A belongs to gene/protein node. HNF4A is h... | [0.017971933, 0.021827668, -0.15494126, -0.000... |
4 | 417 | VCAM1 | NCBI | 7412 | gene/protein | VCAM1 belongs to gene/protein node. VCAM1 is v... | [0.04492683, 0.02438596, -0.15689379, -0.02166... |
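Optionally, we can confirm the dimensionality of the newly computed description embeddings (the actual size depends on the embedding model served by Ollama):
# Check the dimensionality of one description embedding
len(nodes_df['desc_emb'].iloc[0])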
We then obtain the modality-specific node features from the BioBridge data, along with their pre-loaded embeddings.
# Obtain modality-specific information
nodes_df["feat"] = nodes_df.apply(
    lambda x: biobridge_node_info[x["node_type"]][
        biobridge_node_info[x["node_type"]]["node_index"] == x["node_index"]
    ][dict_feature_columns[x["node_type"]]].values[0],
    axis=1)
nodes_df["feat"] = nodes_df.apply(
    lambda x: x["feat"] if not pd.isnull(x["feat"]) else x["node_name"],
    axis=1)
nodes_df["feat_emb"] = nodes_df.apply(
    lambda x: biobridge_node_embeddings[x["node_index"]]
    if x["node_index"] in biobridge_node_embeddings else np.nan,
    axis=1)
nodes_df.dropna(subset=["feat_emb"], inplace=True)
nodes_df.head(5)
  | node_index | node_name | node_source | node_id | node_type | desc | desc_emb | feat | feat_emb
---|---|---|---|---|---|---|---|---|---|
0 | 144 | SMAD3 | NCBI | 4088 | gene/protein | SMAD3 belongs to gene/protein node. SMAD3 is S... | [0.029749377, 0.053500228, -0.1706713, -0.0258... | MSSILPFTPPIVKRLLGWKKGEQNGQEEKWCEKAVKSLVKKLKKTG... | [-0.014456028118729591, -0.03834506496787071, ... |
1 | 179 | IL10RB | NCBI | 3588 | gene/protein | IL10RB belongs to gene/protein node. IL10RB is... | [0.028421732, 0.019860065, -0.16853006, -0.038... | MAWSLGSWLGGCLLVSALGMVPPPENVRMNSVNFKNILQWESPAFA... | [-0.06711604446172714, 0.058091215789318085, 0... |
2 | 192 | GNA12 | NCBI | 2768 | gene/protein | GNA12 belongs to gene/protein node. GNA12 is G... | [0.003668847, 0.05138056, -0.13865656, -0.0554... | MSGVVRTLSRCLLPAEAGGARERRAGSGARDAEREARRRSRDIDAL... | [-0.015191752463579178, -0.13006462156772614, ... |
3 | 279 | HNF4A | NCBI | 3172 | gene/protein | HNF4A belongs to gene/protein node. HNF4A is h... | [0.017971933, 0.021827668, -0.15494126, -0.000... | MRLSKTLVDMDMADYSAALDPAYTTLEFENVQVLTMGNDTSPSEGT... | [0.0008836743654683232, 0.011145174503326416, ... |
4 | 417 | VCAM1 | NCBI | 7412 | gene/protein | VCAM1 belongs to gene/protein node. VCAM1 is v... | [0.04492683, 0.02438596, -0.15689379, -0.02166... | MPGKMVVILGASNILWIMFAASQAFKIETTPESRYLAQIGDSVSLT... | [0.008272849954664707, 0.04085301235318184, 0.... |
# Check if there are any NaN values in the enriched_node column
nodes_df["feat_emb"].isna().any()
False
Note that for nodes with textual features, we replace the original embeddings with new ones retrieved from the Ollama model (to be further used in the talk2knowledgegraphs application).
# Update the pre-loaded textual embeddings from BioBridge with 'nomic-embed-text' embeddings
# Using nomic-ai/nomic-embed-text-v1.5 model via Ollama
emb_model = EmbeddingWithOllama(model_name='nomic-embed-text')

# Since there are many node records, we split them into mini-batches
mini_batch_size = 100
text_based_df = nodes_df[nodes_df.node_type.isin(['disease', 'biological_process', 'cellular_component', 'molecular_function'])]
text_node_indexes = []
text_node_embeddings = []
for i in tqdm(range(0, text_based_df.shape[0], mini_batch_size)):
    outputs = emb_model.embed_documents(text_based_df.feat.values.tolist()[i:i+mini_batch_size])
    text_node_indexes.extend(text_based_df.node_index.values.tolist()[i:i+mini_batch_size])
    text_node_embeddings.extend(outputs)
dic_text_embeddings = dict(zip(text_node_indexes, text_node_embeddings))
# dic_text_embeddings
100%|██████████| 22/22 [00:08<00:00, 2.45it/s]
# Replace the embeddings of the nodes with the updated embeddings for text-based nodes
nodes_df["feat_emb"] = nodes_df.apply(lambda x: dic_text_embeddings[x["node_index"]] if x["node_index"] in dic_text_embeddings else x["feat_emb"], axis=1)
nodes_df.head(5)
  | node_index | node_name | node_source | node_id | node_type | desc | desc_emb | feat | feat_emb
---|---|---|---|---|---|---|---|---|---|
0 | 144 | SMAD3 | NCBI | 4088 | gene/protein | SMAD3 belongs to gene/protein node. SMAD3 is S... | [0.029749377, 0.053500228, -0.1706713, -0.0258... | MSSILPFTPPIVKRLLGWKKGEQNGQEEKWCEKAVKSLVKKLKKTG... | [-0.014456028118729591, -0.03834506496787071, ... |
1 | 179 | IL10RB | NCBI | 3588 | gene/protein | IL10RB belongs to gene/protein node. IL10RB is... | [0.028421732, 0.019860065, -0.16853006, -0.038... | MAWSLGSWLGGCLLVSALGMVPPPENVRMNSVNFKNILQWESPAFA... | [-0.06711604446172714, 0.058091215789318085, 0... |
2 | 192 | GNA12 | NCBI | 2768 | gene/protein | GNA12 belongs to gene/protein node. GNA12 is G... | [0.003668847, 0.05138056, -0.13865656, -0.0554... | MSGVVRTLSRCLLPAEAGGARERRAGSGARDAEREARRRSRDIDAL... | [-0.015191752463579178, -0.13006462156772614, ... |
3 | 279 | HNF4A | NCBI | 3172 | gene/protein | HNF4A belongs to gene/protein node. HNF4A is h... | [0.017971933, 0.021827668, -0.15494126, -0.000... | MRLSKTLVDMDMADYSAALDPAYTTLEFENVQVLTMGNDTSPSEGT... | [0.0008836743654683232, 0.011145174503326416, ... |
4 | 417 | VCAM1 | NCBI | 7412 | gene/protein | VCAM1 belongs to gene/protein node. VCAM1 is v... | [0.04492683, 0.02438596, -0.15689379, -0.02166... | MPGKMVVILGASNILWIMFAASQAFKIETTPESRYLAQIGDSVSLT... | [0.008272849954664707, 0.04085301235318184, 0.... |
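To verify the replacement, we can inspect the embedding length per node type; the text-based node types now carry vectors produced by the Ollama model, while the remaining types keep their pre-loaded BioBridge embeddings (illustrative check only):
# Inspect the feature embedding length per node type
nodes_df.groupby('node_type')['feat_emb'].apply(lambda s: len(s.iloc[0]))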
# # Statistics of nodes
# print("Number of nodes in BioBridge-PrimeKG: %d" % nodes_df.shape[0])
# def store_data_into_blosc(data, path, filename, typesize=8, cname='zstd', clevel=9):
#     """
#     Store data into a blosc file.
#     """
#     # Create the directory if it doesn't exist
#     os.makedirs(path, exist_ok=True)
#     # Serialize the data using pickle
#     serialized_data = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
#     # Compress the serialized data using blosc
#     compressed_data = blosc.compress(serialized_data, typesize=typesize, cname=cname, clevel=clevel)
#     # Save the compressed data to a file
#     with open(os.path.join(path, filename), 'wb') as f:
#         f.write(compressed_data)
#     print(f"Data is successfully stored in {os.path.join(path, filename)}")

# def load_data_from_blosc(path, filename):
#     """
#     Load data from a blosc file.
#     """
#     # Read the compressed data from the file
#     with open(os.path.join(path, filename), 'rb') as f:
#         compressed_data = f.read()
#     # Decompress the data using blosc
#     decompressed_data = blosc.decompress(compressed_data)
#     # Deserialize the data using pickle
#     data = pickle.loads(decompressed_data)
#     # Return the data
#     return data

# # We would like to store both the metadata and the embeddings as blosc files
# # Save the nodes dataframe
# local_dir = '../../../../data/biobridge_primekg/'
# store_data_into_blosc(nodes_df[["node_index", "node_name", "node_source", "node_id", "node_type"]],
#                       local_dir,
#                       'biobridge_nodes.blosc')
# # Save the node embeddings (desc)
# store_data_into_blosc(dict(zip(nodes_df["node_index"], nodes_df["desc_x"])),
#                       local_dir,
#                       'biobridge_nodes_desc_embeddings.blosc')
# # Save the node embeddings
# store_data_into_blosc(dict(zip(nodes_df["node_index"], nodes_df["x"])),
#                       local_dir,
#                       'biobridge_nodes_embeddings.blosc')

# # Uncomment the following lines to load the data from the blosc files
# local_dir = '../../../../data/biobridge_primekg/'
# nodes_ = load_data_from_blosc(local_dir, 'biobridge_nodes.blosc')
# nodes_desc_embeddings_dict_ = load_data_from_blosc(local_dir, 'biobridge_nodes_desc_embeddings.blosc')
# nodes_embeddings_dict_ = load_data_from_blosc(local_dir, 'biobridge_nodes_embeddings.blosc')
# print("Number of nodes in BioBridge-PrimeKG: %d" % len(nodes_))
# print("Number of nodes embeddings in BioBridge-PrimeKG: %d" % len(nodes_embeddings_dict_))
# # Modify the node dataframe
# nodes_df["node"] = nodes_df.apply(lambda x: f"{x.node_name}_({x.node_index})", axis=1)
nodes_df["node_id"] = nodes_df.apply(lambda x: f"{x.node_name}_({x.node_index})", axis=1)
nodes_df.drop(columns=['node_source'], inplace=True)
nodes_df.rename(columns={'node_index': 'primekg_node_index'}, inplace=True)
nodes_df.reset_index(inplace=True)
nodes_df.rename(columns={'index': 'node_index'}, inplace=True)
nodes_df.head(5)
  | node_index | primekg_node_index | node_name | node_id | node_type | desc | desc_emb | feat | feat_emb
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 144 | SMAD3 | SMAD3_(144) | gene/protein | SMAD3 belongs to gene/protein node. SMAD3 is S... | [0.029749377, 0.053500228, -0.1706713, -0.0258... | MSSILPFTPPIVKRLLGWKKGEQNGQEEKWCEKAVKSLVKKLKKTG... | [-0.014456028118729591, -0.03834506496787071, ... |
1 | 1 | 179 | IL10RB | IL10RB_(179) | gene/protein | IL10RB belongs to gene/protein node. IL10RB is... | [0.028421732, 0.019860065, -0.16853006, -0.038... | MAWSLGSWLGGCLLVSALGMVPPPENVRMNSVNFKNILQWESPAFA... | [-0.06711604446172714, 0.058091215789318085, 0... |
2 | 2 | 192 | GNA12 | GNA12_(192) | gene/protein | GNA12 belongs to gene/protein node. GNA12 is G... | [0.003668847, 0.05138056, -0.13865656, -0.0554... | MSGVVRTLSRCLLPAEAGGARERRAGSGARDAEREARRRSRDIDAL... | [-0.015191752463579178, -0.13006462156772614, ... |
3 | 3 | 279 | HNF4A | HNF4A_(279) | gene/protein | HNF4A belongs to gene/protein node. HNF4A is h... | [0.017971933, 0.021827668, -0.15494126, -0.000... | MRLSKTLVDMDMADYSAALDPAYTTLEFENVQVLTMGNDTSPSEGT... | [0.0008836743654683232, 0.011145174503326416, ... |
4 | 4 | 417 | VCAM1 | VCAM1_(417) | gene/protein | VCAM1 belongs to gene/protein node. VCAM1 is v... | [0.04492683, 0.02438596, -0.15689379, -0.02166... | MPGKMVVILGASNILWIMFAASQAFKIETTPESRYLAQIGDSVSLT... | [0.008272849954664707, 0.04085301235318184, 0.... |
# Check the number of nodes
print(f"Number of nodes: {len(nodes_df)}")
Number of nodes: 2991
# Store the node dataframe into two separate sets of files (per node type): enrichment and embedding
local_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/ibd_biobridge_multimodal/nodes/'
os.makedirs(local_dir, exist_ok=True)
for nt in nodes_df.node_type.unique():
    nt_ = nt.replace('/', '_')
    # Enrichment
    os.makedirs(os.path.join(local_dir, 'enrichment'), exist_ok=True)
    nodes_df[nodes_df.node_type == nt][
        ["node_index", "primekg_node_index", "node_id", "node_name", "node_type", "desc", "feat"]
    ].to_parquet(
        os.path.join(local_dir, "enrichment", f"{nt_}.parquet.gzip"),
        compression='gzip',
        index=False
    )
    # Embedding
    os.makedirs(os.path.join(local_dir, 'embedding'), exist_ok=True)
    nodes_df[nodes_df.node_type == nt][
        ["node_index", "node_id", "desc_emb", "feat_emb"]
    ].to_parquet(
        os.path.join(local_dir, "embedding", f"{nt_}.parquet.gzip"),
        compression='gzip',
        index=False
    )
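We can read one of the stored files back to make sure the parquet round-trip preserves the expected columns (illustrative check only; note that the '/' in gene/protein is replaced by '_' in the file name):
# Load one enrichment file back for verification
pd.read_parquet(os.path.join(local_dir, 'enrichment', 'gene_protein.parquet.gzip')).head()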
Edge Enrichment & Embedding¶
We will also perform enrichment and embedding for the edges of the BioBridge-PrimeKG.
This time, we use a simple textual enrichment that concatenates the head node, the relation, and the tail node.
# Keep only edges whose head and tail nodes exist in the filtered BioBridge-PrimeKG nodes
edges_df = edges_df[edges_df['head_index'].isin(nodes_df.primekg_node_index.unique()) &
                    edges_df['tail_index'].isin(nodes_df.primekg_node_index.unique())]
# Adding an additional column to the edges dataframe
edges_df["edge_type"] = edges_df.apply(lambda x: (x.head_type, x.display_relation, x.tail_type), axis=1)
# As of now, we are enriching each edge using textual information
# Perform textual enrichment over the edges by simply concatenating the head and tail nodes with the relation followed by the enriched node information
text_enriched_edges = edges_df.apply(lambda x: f"{x['head_name']} ({x['head_type']}) has a direct relationship of {x['relation']}:{x['display_relation']} with {x['tail_name']} ({x['tail_type']}).", axis=1).tolist()
edges_df['feat'] = text_enriched_edges
edges_df.head(5)
  | head_index | head_name | head_source | head_id | head_type | tail_index | tail_name | tail_source | tail_id | tail_type | display_relation | relation | edge_type | feat
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4104 | 1004 | IL1B | NCBI | 3553 | gene/protein | 772 | RELA | NCBI | 5970 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | IL1B (gene/protein) has a direct relationship ... |
11048 | 4968 | ICAM1 | NCBI | 3383 | gene/protein | 729 | STAT3 | NCBI | 6774 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | ICAM1 (gene/protein) has a direct relationship... |
17692 | 772 | RELA | NCBI | 5970 | gene/protein | 11134 | NR1H4 | NCBI | 9971 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | RELA (gene/protein) has a direct relationship ... |
17800 | 2384 | CRP | NCBI | 1401 | gene/protein | 2057 | FN1 | NCBI | 2335 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | CRP (gene/protein) has a direct relationship o... |
20031 | 3259 | TLR4 | NCBI | 7099 | gene/protein | 4731 | RIPK2 | NCBI | 8767 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | TLR4 (gene/protein) has a direct relationship ... |
Just as we filtered the nodes using the IBD data above, we now filter the edges accordingly by merging with the IBD edges.
# Filter the edges based on the IBD PyG graph
ibd_edges_df = pd.DataFrame({
    'head_index' : ibd_pyg_graph.head_id,
    'tail_index' : ibd_pyg_graph.tail_id,
    'edge_type' : ibd_pyg_graph.edge_type,
})
ibd_edges_df["head_index"] = ibd_edges_df["head_index"].apply(lambda x: int(x.split("_(")[1].replace(")", "")))
ibd_edges_df["tail_index"] = ibd_edges_df["tail_index"].apply(lambda x: int(x.split("_(")[1].replace(")", "")))
ibd_edges_df["display_relation"] = ibd_edges_df["edge_type"].apply(lambda x: x[1])
ibd_edges_df.drop(columns=["edge_type"], inplace=True)
# Merge the edges dataframe with the IBD edges dataframe
edges_df = pd.merge(edges_df, ibd_edges_df, how='inner', on=['head_index', 'tail_index', 'display_relation'], suffixes=('', '_y'))
edges_df.head(5)
  | head_index | head_name | head_source | head_id | head_type | tail_index | tail_name | tail_source | tail_id | tail_type | display_relation | relation | edge_type | feat
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14118 | Rose bengal | DrugBank | DB11182 | drug | 3233 | LTF | NCBI | 4057 | gene/protein | carrier | drug_protein | (drug, carrier, gene/protein) | Rose bengal (drug) has a direct relationship o... |
1 | 14038 | Fluticasone furoate | DrugBank | DB08906 | drug | 4152 | ABCB1 | NCBI | 5243 | gene/protein | carrier | drug_protein | (drug, carrier, gene/protein) | Fluticasone furoate (drug) has a direct relati... |
2 | 14555 | Technetium Tc-99m tetrofosmin | DrugBank | DB09160 | drug | 4152 | ABCB1 | NCBI | 5243 | gene/protein | carrier | drug_protein | (drug, carrier, gene/protein) | Technetium Tc-99m tetrofosmin (drug) has a dir... |
3 | 14040 | Fluticasone | DrugBank | DB13867 | drug | 4152 | ABCB1 | NCBI | 5243 | gene/protein | carrier | drug_protein | (drug, carrier, gene/protein) | Fluticasone (drug) has a direct relationship o... |
4 | 14060 | Levothyroxine | DrugBank | DB00451 | drug | 4152 | ABCB1 | NCBI | 5243 | gene/protein | enzyme | drug_protein | (drug, enzyme, gene/protein) | Levothyroxine (drug) has a direct relationship... |
After that, we perform the same embedding process for the edges using the Ollama model.
# Using nomic-ai/nomic-embed-text-v1.5 model via Ollama
emb_model = EmbeddingWithOllama(model_name='nomic-embed-text')

# Since there are many edge records, we split them into mini-batches
mini_batch_size = 100
edge_embeddings = []
for i in tqdm(range(0, edges_df.shape[0], mini_batch_size)):
    outputs = emb_model.embed_documents(edges_df.feat.values.tolist()[i:i+mini_batch_size])
    edge_embeddings.extend(outputs)
# Add them as features to the dataframe
edges_df['edge_emb'] = edge_embeddings
100%|██████████| 113/113 [00:44<00:00, 2.55it/s]
# Drop and rename several columns of the edges dataframe
edges_df.drop(columns=['head_source', 'head_id', 'head_type', 'tail_source', 'tail_id', 'tail_type', 'relation'], inplace=True)
edges_df.rename(columns={'head_index': 'primekg_head_index', 'tail_index': 'primekg_tail_index'}, inplace=True)
# Check dataframe of edges
edges_df.head(5)
  | primekg_head_index | head_name | primekg_tail_index | tail_name | display_relation | edge_type | feat | edge_emb
---|---|---|---|---|---|---|---|---|
0 | 14118 | Rose bengal | 3233 | LTF | carrier | (drug, carrier, gene/protein) | Rose bengal (drug) has a direct relationship o... | [0.071049586, 0.0060329223, -0.17035195, 0.001... |
1 | 14038 | Fluticasone furoate | 4152 | ABCB1 | carrier | (drug, carrier, gene/protein) | Fluticasone furoate (drug) has a direct relati... | [0.025471492, 0.054160915, -0.17022943, -0.018... |
2 | 14555 | Technetium Tc-99m tetrofosmin | 4152 | ABCB1 | carrier | (drug, carrier, gene/protein) | Technetium Tc-99m tetrofosmin (drug) has a dir... | [-0.008589362, 0.06356438, -0.14342338, -0.003... |
3 | 14040 | Fluticasone | 4152 | ABCB1 | carrier | (drug, carrier, gene/protein) | Fluticasone (drug) has a direct relationship o... | [0.021936357, 0.05227478, -0.16180754, -0.0218... |
4 | 14060 | Levothyroxine | 4152 | ABCB1 | enzyme | (drug, enzyme, gene/protein) | Levothyroxine (drug) has a direct relationship... | [0.023618879, 0.018524365, -0.1605938, 0.00940... |
# Make an additional edge index column as identifier
edges_df.reset_index(inplace=True)
edges_df.rename(columns={'index': 'triplet_index'}, inplace=True)
edges_df.head(5)
  | triplet_index | primekg_head_index | head_name | primekg_tail_index | tail_name | display_relation | edge_type | feat | edge_emb
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 14118 | Rose bengal | 3233 | LTF | carrier | (drug, carrier, gene/protein) | Rose bengal (drug) has a direct relationship o... | [0.071049586, 0.0060329223, -0.17035195, 0.001... |
1 | 1 | 14038 | Fluticasone furoate | 4152 | ABCB1 | carrier | (drug, carrier, gene/protein) | Fluticasone furoate (drug) has a direct relati... | [0.025471492, 0.054160915, -0.17022943, -0.018... |
2 | 2 | 14555 | Technetium Tc-99m tetrofosmin | 4152 | ABCB1 | carrier | (drug, carrier, gene/protein) | Technetium Tc-99m tetrofosmin (drug) has a dir... | [-0.008589362, 0.06356438, -0.14342338, -0.003... |
3 | 3 | 14040 | Fluticasone | 4152 | ABCB1 | carrier | (drug, carrier, gene/protein) | Fluticasone (drug) has a direct relationship o... | [0.021936357, 0.05227478, -0.16180754, -0.0218... |
4 | 4 | 14060 | Levothyroxine | 4152 | ABCB1 | enzyme | (drug, enzyme, gene/protein) | Levothyroxine (drug) has a direct relationship... | [0.023618879, 0.018524365, -0.1605938, 0.00940... |
# Modify the edge dataframe
edges_df["head_id"] = edges_df.apply(lambda x: f"{x.head_name}_({x.primekg_head_index})", axis=1)
edges_df["tail_id"] = edges_df.apply(lambda x: f"{x.tail_name}_({x.primekg_tail_index})", axis=1)
edges_df.drop(columns=['head_name', 'tail_name'], inplace=True)
edges_df.reset_index(drop=True, inplace=True)
edges_df.head(5)
  | triplet_index | primekg_head_index | primekg_tail_index | display_relation | edge_type | feat | edge_emb | head_id | tail_id
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 14118 | 3233 | carrier | (drug, carrier, gene/protein) | Rose bengal (drug) has a direct relationship o... | [0.071049586, 0.0060329223, -0.17035195, 0.001... | Rose bengal_(14118) | LTF_(3233) |
1 | 1 | 14038 | 4152 | carrier | (drug, carrier, gene/protein) | Fluticasone furoate (drug) has a direct relati... | [0.025471492, 0.054160915, -0.17022943, -0.018... | Fluticasone furoate_(14038) | ABCB1_(4152) |
2 | 2 | 14555 | 4152 | carrier | (drug, carrier, gene/protein) | Technetium Tc-99m tetrofosmin (drug) has a dir... | [-0.008589362, 0.06356438, -0.14342338, -0.003... | Technetium Tc-99m tetrofosmin_(14555) | ABCB1_(4152) |
3 | 3 | 14040 | 4152 | carrier | (drug, carrier, gene/protein) | Fluticasone (drug) has a direct relationship o... | [0.021936357, 0.05227478, -0.16180754, -0.0218... | Fluticasone_(14040) | ABCB1_(4152) |
4 | 4 | 14060 | 4152 | enzyme | (drug, enzyme, gene/protein) | Levothyroxine (drug) has a direct relationship... | [0.023618879, 0.018524365, -0.1605938, 0.00940... | Levothyroxine_(14060) | ABCB1_(4152) |
# Add index columns for head and tail nodes
# Map head_id to head_index
edges_df = edges_df.merge(
    nodes_df[["node_index", "node_id"]],
    left_on="head_id",
    right_on="node_id",
    how="left"
).rename(columns={"node_index": "head_index"}).drop(columns=["node_id"])
# Merge to get tail_index
edges_df = edges_df.merge(
    nodes_df[["node_index", "node_id"]],
    left_on="tail_id",
    right_on="node_id",
    how="left"
).rename(columns={"node_index": "tail_index"}).drop(columns=["node_id"])
# Check the final edges dataframe
edges_df.head(5)
  | triplet_index | primekg_head_index | primekg_tail_index | display_relation | edge_type | feat | edge_emb | head_id | tail_id | head_index | tail_index
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 14118 | 3233 | carrier | (drug, carrier, gene/protein) | Rose bengal (drug) has a direct relationship o... | [0.071049586, 0.0060329223, -0.17035195, 0.001... | Rose bengal_(14118) | LTF_(3233) | 123 | 36 |
1 | 1 | 14038 | 4152 | carrier | (drug, carrier, gene/protein) | Fluticasone furoate (drug) has a direct relati... | [0.025471492, 0.054160915, -0.17022943, -0.018... | Fluticasone furoate_(14038) | ABCB1_(4152) | 99 | 47 |
2 | 2 | 14555 | 4152 | carrier | (drug, carrier, gene/protein) | Technetium Tc-99m tetrofosmin (drug) has a dir... | [-0.008589362, 0.06356438, -0.14342338, -0.003... | Technetium Tc-99m tetrofosmin_(14555) | ABCB1_(4152) | 320 | 47 |
3 | 3 | 14040 | 4152 | carrier | (drug, carrier, gene/protein) | Fluticasone (drug) has a direct relationship o... | [0.021936357, 0.05227478, -0.16180754, -0.0218... | Fluticasone_(14040) | ABCB1_(4152) | 100 | 47 |
4 | 4 | 14060 | 4152 | enzyme | (drug, enzyme, gene/protein) | Levothyroxine (drug) has a direct relationship... | [0.023618879, 0.018524365, -0.1605938, 0.00940... | Levothyroxine_(14060) | ABCB1_(4152) | 113 | 47 |
# Check the number of edges
print(f"Number of edges: {len(edges_df)}")
Number of edges: 11272
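As a consistency check, the left merges above should have found a matching node for every head and tail (illustrative check only):
# Ensure every edge endpoint was mapped to a local node index
print(edges_df[['head_index', 'tail_index']].isna().any())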
# Store the edge dataframe into two separate files: enrichment and embedding
local_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/ibd_biobridge_multimodal/edges/'
os.makedirs(local_dir, exist_ok=True)
# Enrichment
os.makedirs(os.path.join(local_dir, 'enrichment'), exist_ok=True)
edges_df['edge_type_str'] = edges_df.apply(lambda x: f"{x.edge_type[0]}|{x.edge_type[1]}|{x.edge_type[2]}", axis=1)
edges_df[
    ["triplet_index", "primekg_head_index", "primekg_tail_index", "head_id", "tail_id", "display_relation", "edge_type", "edge_type_str", "head_index", "tail_index", "feat"]
].to_parquet(
    os.path.join(local_dir, 'enrichment', "edges.parquet.gzip"),
    compression='gzip',
    index=False
)
# Embedding
os.makedirs(os.path.join(local_dir, 'embedding'), exist_ok=True)
edges_df[
    ["triplet_index", "head_index", "tail_index", "edge_type_str", "edge_emb"]
].to_parquet(
    os.path.join(local_dir, 'embedding', "edges.parquet.gzip"),
    compression='gzip',
    index=False
)
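Finally, we can read the stored edge files back to confirm that they contain the expected columns (illustrative check only):
# Load the stored edge enrichment and embedding files back for verification
edges_enrichment_df = pd.read_parquet(os.path.join(local_dir, 'enrichment', 'edges.parquet.gzip'))
edges_embedding_df = pd.read_parquet(os.path.join(local_dir, 'embedding', 'edges.parquet.gzip'))
edges_enrichment_df.head()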