Multimodal BioBridge-PrimeKG Graph Construction¶
In this tutorial, we will perform a simple pre-processing task over the BioBridge-PrimeKG dataset, which employs multimodal data. In particular, we use the pre-loaded embeddings already provided by BioBridge, joined with the PrimeKG IBD dataset obtained from the previous tutorial:
docs/notebooks/talk2knowledgegraphs/tutorial_primekg_subgraph.ipynb
First of all, we need to import necessary libraries as follows:
# Import necessary libraries
# %load_ext cudf.pandas
import os
import numpy as np
import pandas as pd
import networkx as nx
import pickle
import blosc
import uuid
import json
import time
from openai import OpenAI
from tqdm import tqdm
from torch_geometric.utils import from_networkx
import sys
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
sys.path.append('../../..')
from aiagents4pharma.talk2knowledgegraphs.datasets.starkqa_primekg import StarkQAPrimeKG
from aiagents4pharma.talk2knowledgegraphs.datasets.biobridge_primekg import BioBridgePrimeKG
from aiagents4pharma.talk2knowledgegraphs.utils.embeddings.ollama import EmbeddingWithOllama
# from aiagents4pharma.talk2knowledgegraphs.utils import kg_utils
# # Set the logging level for httpx to WARNING to suppress INFO messages
import logging
logging.getLogger("httpx").setLevel(logging.WARNING)
/home/awmulyadi/Repositories/temp/office2/AIAgents4Pharma/venv/lib/python3.12/site-packages/torch_geometric/typing.py:86: UserWarning: An issue occurred while importing 'torch-scatter'. Disabling its usage. Stacktrace: /home/awmulyadi/Repositories/temp/office2/AIAgents4Pharma/venv/lib/python3.12/site-packages/torch_scatter/_version_cuda.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSsb warnings.warn(f"An issue occurred while importing 'torch-scatter'. " /home/awmulyadi/Repositories/temp/office2/AIAgents4Pharma/venv/lib/python3.12/site-packages/torch_geometric/typing.py:124: UserWarning: An issue occurred while importing 'torch-sparse'. Disabling its usage. Stacktrace: /home/awmulyadi/Repositories/temp/office2/AIAgents4Pharma/venv/lib/python3.12/site-packages/torch_sparse/_version_cuda.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSsb warnings.warn(f"An issue occurred while importing 'torch-sparse'. " /home/awmulyadi/Repositories/temp/office2/AIAgents4Pharma/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
# # Set the OpenAI API key
# import os
# os.environ["OPENAI_API_KEY"] = "XXX" # Replace with your actual OpenAI API key
# import openai
# openai.api_key = os.getenv("OPENAI_API_KEY")
Prepare BioBridge dataset¶
The BioBridgePrimeKG
class allows loading the data from the related GitHub repository if the data is not available locally.
Otherwise, the data is loaded from the local directory as defined in the local_dir
and primekg_dir
.
# Point BioBridgePrimeKG at the local directories holding the PrimeKG and BioBridge data
biobridge_data = BioBridgePrimeKG(
    primekg_dir="../../../../data/primekg/",
    local_dir="../../../../data/biobridge_primekg/",
)
# Load the dataset before querying it
biobridge_data.load_data()
# Node information of the BioBridge PrimeKG; its keys are displayed as the cell output
biobridge_node_info = biobridge_data.get_node_info_dict()
biobridge_node_info.keys()
Loading PrimeKG dataset... Loading nodes of PrimeKG dataset ... ../../../../data/primekg/primekg_nodes.tsv.gz already exists. Loading the data from the local directory. Loading edges of PrimeKG dataset ... ../../../../data/primekg/primekg_edges.tsv.gz already exists. Loading the data from the local directory. Loading data config file of BioBridgePrimeKG... File data_config.json already exists in ../../../../data/biobridge_primekg/. Building node embeddings... Building full triplets... Building train-test split...
dict_keys(['gene/protein', 'molecular_function', 'cellular_component', 'biological_process', 'drug', 'disease'])
We also utilize another source of information, StarkQA PrimeKG, which provides us with information about each node in the graph.
We can use StarkQAPrimeKG
class to load the data.
Subsequently, we can use the get_starkqa_node_info
method to obtain the node information of the StarkQA PrimeKG after loading the data using the load_data
method.
# StarkQA PrimeKG is used as an additional source of node information
starkqa_data = StarkQAPrimeKG(
    local_dir="../../../../data/starkqa_primekg/",
)
# Load the dataset before querying it
starkqa_data.load_data()
# Node information of the StarkQA PrimeKG (indexed by node index further below)
starkqa_node_info = starkqa_data.get_starkqa_node_info()
Loading StarkQAPrimeKG dataset... ../../../../data/starkqa_primekg/qa/prime/stark_qa/stark_qa.csv already exists. Loading the data from the local directory. Loading StarkQAPrimeKG embeddings...
The following code prepares the node and edge dataframes from the BioBridge dataset.
# Prepare BioBridge-PrimeKG edges
# Read the processed CSV of every preselected node type and collect its node indexes
node_info_dict = {}
node_index_list = []
for node_type in biobridge_data.preselected_node_types:
    df_node = pd.read_csv(
        os.path.join(biobridge_data.local_dir, "processed", f"{node_type}.csv")
    )
    node_info_dict[biobridge_data.node_type_map[node_type]] = df_node
    node_index_list.extend(df_node["node_index"].tolist())

# An edge is kept only when both endpoints belong to a selected node type AND
# have an entry in the embedding dictionary. Building the allowed set once
# replaces two filtering passes and two materialized key lists.
allowed_node_indexes = set(node_index_list) & set(biobridge_data.emb_dict.keys())
edges_df = biobridge_data.primekg.get_edges().copy()
edges_df = edges_df[
    edges_df["head_index"].isin(allowed_node_indexes)
    & edges_df["tail_index"].isin(allowed_node_indexes)
].reset_index(drop=True)

# Prepare BioBridge-PrimeKG nodes: keep the nodes that appear in at least one kept edge
nodes_df = biobridge_data.primekg.get_nodes().copy()
connected_node_indexes = np.unique(
    np.concatenate([edges_df.head_index.unique(), edges_df.tail_index.unique()])
)
nodes_df = nodes_df[
    nodes_df["node_index"].isin(connected_node_indexes)
].reset_index(drop=True)
# Report how many nodes remain after filtering
print(f"Number of nodes: {nodes_df.shape[0]}")
Number of nodes: 84981
# Report how many edges remain after filtering
print(f"Number of edges: {edges_df.shape[0]}")
Number of edges: 3904610
Modal-Specific Enrichment & Embedding¶
The BioBridge dataset provides multimodal data for diverse node types, including: gene/protein, molecular_function, cellular_component, biological_process, drug, and disease. The following code snippet demonstrates how to obtain such information.
# Map every node type to the column that holds its modality-specific feature.
# The three GO-style node types all use the same "description" column.
dict_feature_columns = {
    "gene/protein": "sequence",
    **dict.fromkeys(
        ["molecular_function", "cellular_component", "biological_process"],
        "description",
    ),
    "drug": "smiles",
    "disease": "definition",
}
# Node embeddings provided by BioBridge (looked up by node index further below)
biobridge_node_embeddings = biobridge_data.get_node_embeddings()
Node Enrichment & Embedding¶
As mentioned earlier, we can use StarkQA PrimeKG dataset to simplify the enrichment process of textual data for the nodes.
def get_textual_enrichment(data, node_info):
    """
    Enrich the node with additional information from StarkQA-PrimeKG.

    Args:
        data (dict): The node data from PrimeKG; 'node_name' and 'node_type' are read.
        node_info (dict): The node information from StarkQA-PrimeKG; its 'details'
            entry is read for gene/protein, drug, disease and pathway nodes.

    Returns:
        str: "<node_name> belongs to <node_type> node. " followed by the
        type-specific details (nothing extra for other node types).

    Raises:
        KeyError: If a detail field required for a drug or disease node is absent.
    """
    def _text_or_empty(value):
        # A value whose string form is exactly 'nan' (e.g. a float NaN) is treated
        # as missing. Only the whole value is dropped, so words that merely contain
        # "nan" (such as "pregnancy") are left intact.
        text = str(value)
        return '' if text == 'nan' else text

    node_name = data['node_name']
    node_type = data['node_type']

    # Basic textual enrichment of the node
    enriched_node = f"{node_name} belongs to {node_type} node. "

    # Only gene/protein, drug, disease and pathway nodes receive additional
    # information from the node_info of StarkQA-PrimeKG
    added_info = ''
    if node_type == 'gene/protein':
        details = node_info['details']
        if 'name' in details:
            added_info += f"{node_name} is {details['name']}. "
        if 'summary' in details:
            added_info += details['summary']
    elif node_type == 'drug':
        details = node_info['details']
        drug_fields = ('description', 'mechanism_of_action', 'protein_binding',
                       'pharmacodynamics', 'indication')
        added_info = ' '.join(_text_or_empty(details[field]) for field in drug_fields)
    elif node_type == 'disease':
        details = node_info['details']
        disease_fields = ('mondo_definition', 'mayo_symptoms', 'mayo_causes')
        added_info = ' '.join(_text_or_empty(details[field]) for field in disease_fields)
    elif node_type == 'pathway':
        # Pathway details are optional; without them only the basic sentence is returned
        if 'details' in node_info:
            details = node_info['details']
            added_info += (
                f"This pathway found in {details['speciesName']}. "
                + ' '.join([x['text'] for x in details['summation']])
            )

    # Append the additional information for enrichment
    enriched_node += added_info
    return enriched_node
# Build the textual description of every node from its StarkQA-PrimeKG entry
def _describe_node(row):
    """Return the enriched description of one node row, looked up by its node index."""
    return get_textual_enrichment(row, starkqa_node_info[row['node_index']])


nodes_df["desc"] = nodes_df.apply(_describe_node, axis=1)
nodes_df.head(5)
node_index | node_name | node_source | node_id | node_type | desc | |
---|---|---|---|---|---|---|
0 | 0 | PHYHIP | NCBI | 9796 | gene/protein | PHYHIP belongs to gene/protein node. PHYHIP is... |
1 | 1 | GPANK1 | NCBI | 7918 | gene/protein | GPANK1 belongs to gene/protein node. GPANK1 is... |
2 | 2 | ZRSR2 | NCBI | 8233 | gene/protein | ZRSR2 belongs to gene/protein node. ZRSR2 is z... |
3 | 3 | NRF1 | NCBI | 4899 | gene/protein | NRF1 belongs to gene/protein node. NRF1 is nuc... |
4 | 4 | PI4KA | NCBI | 5297 | gene/protein | PI4KA belongs to gene/protein node. PI4KA is p... |
Afterwards, we embed this description column using the OpenAI API embedding model.
# # Embeddings using OpenAI API batch processing
# batch_size = 50000
# output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/'
# os.makedirs(output_dir, exist_ok=True)
# # Loop through the nodes in batches to process embeddings
# docs = nodes_df.desc.values.tolist()
# doc_ids = nodes_df.node_index.values.tolist()
# nodes_batch_filenames = []
# for i in range(0, len(docs), batch_size):
# batch_docs = docs[i:i + batch_size]
# batch_doc_ids = doc_ids[i:i + batch_size]
# batch_filename = os.path.join(output_dir, f'nodes_batch_{i // batch_size + 1}.jsonl')
# nodes_batch_filenames.append(batch_filename)
# # Write the batch to a file
# with open(batch_filename, 'w', encoding='utf-8') as f:
# for idx, text in enumerate(batch_docs):
# record = {
# "custom_id": f"text_{batch_doc_ids[idx]}",
# "method": "POST",
# "url": "/v1/embeddings",
# "body": {
# "model": "text-embedding-ada-002",
# "input": text
# }
# }
# f.write(json.dumps(record) + '\n')
# client = OpenAI()
# # Keep track of all batch metadata
# submitted_batches = []
# # Loop through each batch file and submit to OpenAI
# for batch_filename in nodes_batch_filenames:
# # Upload the file
# print(f"Uploading file: {batch_filename}")
# uploaded_file = client.files.create(
# file=open(batch_filename, "rb"),
# purpose="batch"
# )
# print(f"Uploaded file ID: {uploaded_file.id}")
# # Create a batch job
# batch_job = client.batches.create(
# input_file_id=uploaded_file.id,
# endpoint="/v1/embeddings",
# completion_window="24h" # Options: "24h", "1h"
# )
# print(f"Submitted batch ID: {batch_job.id}")
# submitted_batches.append({
# "batch_filename": batch_filename,
# "file_id": uploaded_file.id,
# "batch_id": batch_job.id,
# "status": batch_job.status
# })
# # Optional short pause to avoid rate limits
# time.sleep(2)
# # Save metadata for all submitted batches
# batch_metadata_df = pd.DataFrame(submitted_batches)
# batch_metadata_df.to_csv(os.path.join(output_dir, "nodes_submitted_batches_metadata.csv"), index=False)
# print("Saved batch tracking metadata for nodes.")
# Read back the tracking metadata of the batches submitted for the node descriptions
output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/'
metadata_path = os.path.join(output_dir, "nodes_submitted_batches_metadata.csv")
batch_metadata_df = pd.read_csv(metadata_path)
batch_metadata_df
batch_filename | file_id | batch_id | status | |
---|---|---|---|---|
0 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-TYNmj47t8gkufQ2U7Q9y4M | batch_683422562fd481909708b9a48fbf7868 | completed |
1 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-WzogrHr4CQbZ45TfhnfLWH | batch_683422607a808190b840f8089e296d6c | completed |
# The batch outputs were already downloaded from the OpenAI API, so they are loaded from disk.
# One result file is expected per batch id: "<batch_id>_output.jsonl".
output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/results/'
output_files = []
for batch_id in batch_metadata_df.batch_id.values.tolist():
    output_files.append(os.path.join(output_dir, f'{batch_id}_output.jsonl'))
output_files
# Load the embeddings from the output files
def load_embeddings_from_output_files(output_files):
    """
    Load embeddings from batch output files in JSON Lines format.

    Every non-blank line is parsed as one JSON record, from which the
    'custom_id' and record['response']['body']['data'][0]['embedding'] are taken.

    Args:
        output_files (list): Paths of the JSONL output files to read.

    Returns:
        pd.DataFrame: One row per record, in file and line order, with the
        columns 'custom_id' and 'embedding'.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the expected response structure.
    """
    embeddings = []
    for output_file in output_files:
        with open(output_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing empty line) carry no record; skipping
                # them avoids a JSON decoding error on otherwise valid files.
                if not line.strip():
                    continue
                record = json.loads(line)
                custom_id = record.get('custom_id')
                embedding = record['response']['body']['data'][0]['embedding']
                embeddings.append([custom_id, embedding])
    return pd.DataFrame(embeddings, columns=['custom_id', 'embedding'])
# Load the description embeddings from the downloaded batch outputs
nodes_desc_embeddings_df = load_embeddings_from_output_files(output_files)
# Print both shapes so the row counts can be compared
for frame_label, frame in (
    ("nodes_desc_embeddings_df", nodes_desc_embeddings_df),
    ("nodes_df", nodes_df),
):
    print(f"Shape of {frame_label}: {frame.shape}")
Shape of nodes_desc_embeddings_df: (84981, 2) Shape of nodes_df: (84981, 6)
# Recover the integer node index from the custom id (the part after the first "_"),
# then order the rows by it and drop the now-redundant custom id
nodes_desc_embeddings_df['node_index'] = nodes_desc_embeddings_df['custom_id'].map(
    lambda custom_id: int(custom_id.split('_')[1])
)
nodes_desc_embeddings_df = (
    nodes_desc_embeddings_df
    .sort_values(by='node_index')
    .reset_index(drop=True)
    .drop(columns=['custom_id'])
)
nodes_desc_embeddings_df
embedding | node_index | |
---|---|---|
0 | [-0.038923346, -0.022871112, -0.012125405, -0.... | 0 |
1 | [-0.025375651, 0.012858219, 0.008264126, -0.00... | 1 |
2 | [-0.032085866, 0.0071205534, -0.017097335, -0.... | 2 |
3 | [-0.030888347, -0.024794728, -0.020263912, -0.... | 3 |
4 | [-0.029845022, -0.023542346, -0.01622012, -0.0... | 4 |
... | ... | ... |
84976 | [-0.0049393373, -0.0011428966, -0.01078287, -0... | 127430 |
84977 | [-0.009917359, 0.009485583, -0.00342385, -0.00... | 127431 |
84978 | [-0.011255736, 0.004252167, -0.0042116055, 0.0... | 127432 |
84979 | [-0.008864644, 0.022069765, 0.0050582215, -0.0... | 127433 |
84980 | [-0.01191322, -0.004686002, 0.011682817, -0.01... | 127434 |
84981 rows × 2 columns
# Attach the description embedding to every node (left join on the node index)
# and store it under the column name 'desc_emb'
nodes_df = (
    nodes_df
    .merge(nodes_desc_embeddings_df, on='node_index', how='left')
    .rename(columns={'embedding': 'desc_emb'})
)
nodes_df.head(5)
node_index | node_name | node_source | node_id | node_type | desc | desc_emb | |
---|---|---|---|---|---|---|---|
0 | 0 | PHYHIP | NCBI | 9796 | gene/protein | PHYHIP belongs to gene/protein node. PHYHIP is... | [-0.038923346, -0.022871112, -0.012125405, -0.... |
1 | 1 | GPANK1 | NCBI | 7918 | gene/protein | GPANK1 belongs to gene/protein node. GPANK1 is... | [-0.025375651, 0.012858219, 0.008264126, -0.00... |
2 | 2 | ZRSR2 | NCBI | 8233 | gene/protein | ZRSR2 belongs to gene/protein node. ZRSR2 is z... | [-0.032085866, 0.0071205534, -0.017097335, -0.... |
3 | 3 | NRF1 | NCBI | 4899 | gene/protein | NRF1 belongs to gene/protein node. NRF1 is nuc... | [-0.030888347, -0.024794728, -0.020263912, -0.... |
4 | 4 | PI4KA | NCBI | 5297 | gene/protein | PI4KA belongs to gene/protein node. PI4KA is p... | [-0.029845022, -0.023542346, -0.01622012, -0.0... |
We then enrich each node with its modality-specific feature from the BioBridge data, along with the corresponding embedding.
# Obtain modality-specific information: for each node, look up its row in the
# BioBridge node info of its type and take the feature column of that type
nodes_df["feat"] = nodes_df.apply(
    lambda x: biobridge_node_info[x["node_type"]][
        biobridge_node_info[x["node_type"]]["node_index"] == x["node_index"]
    ][dict_feature_columns[x["node_type"]]].values[0],
    axis=1,
)
# Fall back to the node name when the feature is missing
nodes_df["feat"] = nodes_df.apply(
    lambda x: x["feat"] if not pd.isnull(x["feat"]) else x["node_name"],
    axis=1,
)
# Attach the BioBridge embedding; nodes without one get NaN and are dropped below.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
nodes_df["feat_emb"] = nodes_df.apply(
    lambda x: biobridge_node_embeddings[x["node_index"]]
    if x["node_index"] in biobridge_node_embeddings
    else np.nan,
    axis=1,
)
nodes_df.dropna(subset=["feat_emb"], inplace=True)
nodes_df.head(5)
node_index | node_name | node_source | node_id | node_type | desc | desc_emb | feat | feat_emb | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | PHYHIP | NCBI | 9796 | gene/protein | PHYHIP belongs to gene/protein node. PHYHIP is... | [-0.038923346, -0.022871112, -0.012125405, -0.... | MELLSTPHSIEINNITCDSFRISWAMEDSDLERVTHYFIDLNKKEN... | [0.04029838368296623, -0.018344514071941376, 0... |
1 | 1 | GPANK1 | NCBI | 7918 | gene/protein | GPANK1 belongs to gene/protein node. GPANK1 is... | [-0.025375651, 0.012858219, 0.008264126, -0.00... | MSRPLLITFTPATDPSDLWKDGQQQPQPEKPESTLDGAAARAFYEA... | [-0.049913737922906876, -0.04380067065358162, ... |
2 | 2 | ZRSR2 | NCBI | 8233 | gene/protein | ZRSR2 belongs to gene/protein node. ZRSR2 is z... | [-0.032085866, 0.0071205534, -0.017097335, -0.... | MAAPEKMTFPEKPSHKKYRAALKKEKRKKRRQELARLRDSGLSQKE... | [0.035360466688871384, -0.09613325446844101, 0... |
3 | 3 | NRF1 | NCBI | 4899 | gene/protein | NRF1 belongs to gene/protein node. NRF1 is nuc... | [-0.030888347, -0.024794728, -0.020263912, -0.... | MEEHGVTQTEHMATIEAHAVAQQVQQVHVATYTEHSMLSADEDSPS... | [-0.052261918783187866, -0.022747397422790527,... |
4 | 4 | PI4KA | NCBI | 5297 | gene/protein | PI4KA belongs to gene/protein node. PI4KA is p... | [-0.029845022, -0.023542346, -0.01622012, -0.0... | MAAAPARGGGGGGGGGGGCSGSGSSASRGFYFNTVLSLARSLAVQR... | [0.005174526944756508, -0.049968406558036804, ... |
# Verify that no node is left without a feature embedding (expected: False)
nodes_df["feat_emb"].isnull().any()
False
Note that for nodes with textual features, we will replace the original embeddings with new ones computed by a text embedding model (to be further used in the following talk2knowledgegraphs application).
# # Embeddings using OpenAI API batch processing
# batch_size = 50000
# output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/'
# os.makedirs(output_dir, exist_ok=True)
# # Loop through the nodes in batches to process embeddings
# docs = nodes_df[nodes_df.node_type.isin(['disease', 'biological_process', 'cellular_component', 'molecular_function'])].feat.to_list()
# doc_ids = nodes_df[nodes_df.node_type.isin(['disease', 'biological_process', 'cellular_component', 'molecular_function'])].node_index.to_list()
# nodes_feat_batch_filenames = []
# for i in range(0, len(docs), batch_size):
# batch_docs = docs[i:i + batch_size]
# batch_doc_ids = doc_ids[i:i + batch_size]
# batch_filename = os.path.join(output_dir, f'nodes_feat_batch_{i // batch_size + 1}.jsonl')
# nodes_feat_batch_filenames.append(batch_filename)
# # Write the batch to a file
# with open(batch_filename, 'w', encoding='utf-8') as f:
# for idx, text in enumerate(batch_docs):
# record = {
# "custom_id": f"text_{batch_doc_ids[idx]}",
# "method": "POST",
# "url": "/v1/embeddings",
# "body": {
# "model": "text-embedding-ada-002",
# "input": text
# }
# }
# f.write(json.dumps(record) + '\n')
# client = OpenAI()
# # Keep track of all batch metadata
# submitted_batches = []
# # Loop through each batch file and submit to OpenAI
# for batch_filename in nodes_feat_batch_filenames[1]:
# # Upload the file
# print(f"Uploading file: {batch_filename}")
# uploaded_file = client.files.create(
# file=open(batch_filename, "rb"),
# purpose="batch"
# )
# print(f"Uploaded file ID: {uploaded_file.id}")
# # Create a batch job
# batch_job = client.batches.create(
# input_file_id=uploaded_file.id,
# endpoint="/v1/embeddings",
# completion_window="24h" # Options: "24h", "1h"
# )
# print(f"Submitted batch ID: {batch_job.id}")
# submitted_batches.append({
# "batch_filename": batch_filename,
# "file_id": uploaded_file.id,
# "batch_id": batch_job.id,
# "status": batch_job.status
# })
# # Optional short pause to avoid rate limits
# time.sleep(2)
# # Save metadata for all submitted batches
# batch_metadata_df = pd.DataFrame(submitted_batches)
# batch_metadata_df.to_csv(os.path.join(output_dir, "nodes_feat_submitted_batches_metadata.csv"), index=False)
# print("Saved batch tracking metadata for nodes.")
# # Example: check status of the submitted batches
# print(check_batch_status(submitted_batches[0]['batch_id']))
# print(check_batch_status(submitted_batches[1]['batch_id']))
# Read back the tracking metadata of the batches submitted for the node features
output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/'
metadata_path = os.path.join(output_dir, "nodes_feat_submitted_batches_metadata.csv")
batch_metadata_df = pd.read_csv(metadata_path)
batch_metadata_df
batch_filename | file_id | batch_id | status | |
---|---|---|---|---|
0 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-UG9y4GnWs6mdV1Z91RDnqC | batch_6834498eea988190a74d2ccaac019543 | completed |
1 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-LuHSnwWGZihbBKch6vYzdK | batch_68347d33a6b881908a18d53628784570 | completed |
2 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-GcDbosVHR5k2w6qeBkN1sz | batch_68347e0ff8288190a1073aa7b2c24518 | completed |
# The batch outputs were already downloaded from the OpenAI API, so they are loaded from disk.
# One result file is expected per batch id: "<batch_id>_output.jsonl".
output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/results/'
output_files = []
for batch_id in batch_metadata_df.batch_id.values.tolist():
    output_files.append(os.path.join(output_dir, f'{batch_id}_output.jsonl'))
output_files
# Load the feature embeddings from the downloaded batch outputs
nodes_feat_embeddings_df = load_embeddings_from_output_files(output_files)
# Compare the number of loaded embeddings with the number of text-based nodes.
# The first label names the frame that is actually printed.
text_based_nodes_df = nodes_df[
    nodes_df.node_type.isin(
        ['disease', 'biological_process', 'cellular_component', 'molecular_function']
    )
]
print(f"Shape of nodes_feat_embeddings_df: {nodes_feat_embeddings_df.shape}")
print(f"Shape of nodes_df (text-based): {text_based_nodes_df.shape}")
Shape of nodes_desc_embeddings_df: (59425, 2) Shape of nodes_df (text-based): (59425, 9)
# Recover the integer node index from the custom id (the part after the first "_"),
# then order the rows by it and drop the now-redundant custom id
nodes_feat_embeddings_df['node_index'] = nodes_feat_embeddings_df['custom_id'].map(
    lambda custom_id: int(custom_id.split('_')[1])
)
nodes_feat_embeddings_df = (
    nodes_feat_embeddings_df
    .sort_values(by='node_index')
    .reset_index(drop=True)
    .drop(columns=['custom_id'])
)
nodes_feat_embeddings_df
embedding | node_index | |
---|---|---|
0 | [-0.020009642, 0.0059901476, -0.019181218, -0.... | 0 |
1 | [-0.025067309, 0.026195658, 0.012629821, -0.02... | 1 |
2 | [0.0048196767, 0.004307966, 0.0130102495, -0.0... | 2 |
3 | [-0.031480603, 0.022127304, 0.0030120523, -0.0... | 3 |
4 | [-0.019105745, 0.005438388, 0.0045395615, -0.0... | 4 |
... | ... | ... |
59420 | [-0.0069843708, 0.0024856697, 0.0027306252, -0... | 100958 |
59421 | [-0.022267014, 0.011964366, 0.0037708203, -0.0... | 100959 |
59422 | [-0.03591224, -0.0031208533, -0.00097246305, 0... | 100960 |
59423 | [-0.03000842, 0.02487104, 0.011887363, -0.0152... | 100961 |
59424 | [-0.018154941, -0.02543502, -0.002073788, -0.0... | 100962 |
59425 rows × 2 columns
# Merge the text embeddings with the nodes DataFrame (left join on the node index)
text_node_types = ['disease', 'biological_process', 'cellular_component', 'molecular_function']
nodes_df = nodes_df.merge(nodes_feat_embeddings_df, on='node_index', how='left')
# Replace 'feat_emb' only for text-based nodes that received a new embedding.
# Without the node-type guard, any node whose index matches an embedding id is
# overwritten, including gene/protein and drug nodes whose 'feat_emb' must stay
# the modality-specific BioBridge embedding.
# NOTE(review): this relies on the ids in the embedding files being the indexes
# of the text-based nodes they were computed for -- confirm how the batch input
# files were generated.
replace_mask = nodes_df['node_type'].isin(text_node_types) & nodes_df['embedding'].notna()
nodes_df['feat_emb'] = nodes_df['embedding'].where(replace_mask, nodes_df['feat_emb'])
nodes_df = nodes_df.drop(columns=['embedding'])
nodes_df
node_index | node_name | node_source | node_id | node_type | desc | desc_emb | feat | feat_emb | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | PHYHIP | NCBI | 9796 | gene/protein | PHYHIP belongs to gene/protein node. PHYHIP is... | [-0.038923346, -0.022871112, -0.012125405, -0.... | MELLSTPHSIEINNITCDSFRISWAMEDSDLERVTHYFIDLNKKEN... | [-0.020009642, 0.0059901476, -0.019181218, -0.... |
1 | 1 | GPANK1 | NCBI | 7918 | gene/protein | GPANK1 belongs to gene/protein node. GPANK1 is... | [-0.025375651, 0.012858219, 0.008264126, -0.00... | MSRPLLITFTPATDPSDLWKDGQQQPQPEKPESTLDGAAARAFYEA... | [-0.025067309, 0.026195658, 0.012629821, -0.02... |
2 | 2 | ZRSR2 | NCBI | 8233 | gene/protein | ZRSR2 belongs to gene/protein node. ZRSR2 is z... | [-0.032085866, 0.0071205534, -0.017097335, -0.... | MAAPEKMTFPEKPSHKKYRAALKKEKRKKRRQELARLRDSGLSQKE... | [0.0048196767, 0.004307966, 0.0130102495, -0.0... |
3 | 3 | NRF1 | NCBI | 4899 | gene/protein | NRF1 belongs to gene/protein node. NRF1 is nuc... | [-0.030888347, -0.024794728, -0.020263912, -0.... | MEEHGVTQTEHMATIEAHAVAQQVQQVHVATYTEHSMLSADEDSPS... | [-0.031480603, 0.022127304, 0.0030120523, -0.0... |
4 | 4 | PI4KA | NCBI | 5297 | gene/protein | PI4KA belongs to gene/protein node. PI4KA is p... | [-0.029845022, -0.023542346, -0.01622012, -0.0... | MAAAPARGGGGGGGGGGGCSGSGSSASRGFYFNTVLSLARSLAVQR... | [-0.019105745, 0.005438388, 0.0045395615, -0.0... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
84976 | 127430 | host cell rough endoplasmic reticulum membrane | GO | 44169 | cellular_component | host cell rough endoplasmic reticulum membrane... | [-0.0049393373, -0.0011428966, -0.01078287, -0... | The lipid bilayer surrounding the host cell ro... | [0.16958962380886078, -0.35165151953697205, -0... |
84977 | 127431 | collagen type VII anchoring fibril | GO | 98652 | cellular_component | collagen type VII anchoring fibril belongs to ... | [-0.009917359, 0.009485583, -0.00342385, -0.00... | An antiparallel dimer of two collagen VII trim... | [-0.25522512197494507, -0.48963257670402527, -... |
84978 | 127432 | cofilin-actin rod | GO | 90732 | cellular_component | cofilin-actin rod belongs to cellular_componen... | [-0.011255736, 0.004252167, -0.0042116055, 0.0... | A cellular structure consisting of parallel, h... | [0.017420507967472076, -0.17719775438308716, -... |
84979 | 127433 | condensed chromosome, centromeric region | GO | 779 | cellular_component | condensed chromosome, centromeric region belon... | [-0.008864644, 0.022069765, 0.0050582215, -0.0... | The region of a condensed chromosome that incl... | [-0.12538260221481323, -0.6296162009239197, -0... |
84980 | 127434 | plastid acetyl-CoA carboxylase complex | GO | 32282 | cellular_component | plastid acetyl-CoA carboxylase complex belongs... | [-0.01191322, -0.004686002, 0.011682817, -0.01... | An acetyl-CoA carboxylase complex located in t... | [0.052605681121349335, -0.2763552963733673, -0... |
84981 rows × 9 columns
# # Using nomic-ai/nomic-embed-text-v1.5 model via Ollama
# emb_model = EmbeddingWithOllama(model_name='nomic-embed-text')
# # Since the records of nodes has large amount of data, we will split them into mini-batches
# mini_batch_size = 100
# text_based_df = nodes_df[nodes_df.node_type.isin(['disease', 'biological_process', 'cellular_component', 'molecular_function'])]
# text_node_indexes = []
# text_node_embeddings = []
# for i in tqdm(range(0, text_based_df.shape[0], mini_batch_size)):
# outputs = emb_model.embed_documents(text_based_df.feat.values.tolist()[i:i+mini_batch_size])
# text_node_indexes.extend(text_based_df.node_index.values.tolist()[i:i+mini_batch_size])
# text_node_embeddings.extend(outputs)
# dic_text_embeddings = dict(zip(text_node_indexes, text_node_embeddings))
# # dic_text_embeddings
# Replace the embeddings of the nodes with the updated embeddings for text-based nodes
# nodes_df["feat_emb"] = nodes_df.apply(lambda x: dic_text_embeddings[x["node_index"]] if x["node_index"] in dic_text_embeddings else x["feat_emb"], axis=1)
# nodes_df.head(5)
# # Modify the node dataframe
# nodes_df["node"] = nodes_df.apply(lambda x: f"{x.node_name}_({x.node_index})", axis=1)
# Replace the node id by a readable "<node_name>_(<node_index>)" identifier
nodes_df["node_id"] = nodes_df.apply(
    lambda row: f"{row.node_name}_({row.node_index})", axis=1
)
# Keep the original index as 'primekg_node_index' and expose the current
# positional index of the frame as the new 'node_index' column
nodes_df = (
    nodes_df
    .drop(columns=['node_source'])
    .rename(columns={'node_index': 'primekg_node_index'})
    .reset_index()
    .rename(columns={'index': 'node_index'})
)
nodes_df.head(5)
node_index | primekg_node_index | node_name | node_id | node_type | desc | desc_emb | feat | feat_emb | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | PHYHIP | PHYHIP_(0) | gene/protein | PHYHIP belongs to gene/protein node. PHYHIP is... | [-0.038923346, -0.022871112, -0.012125405, -0.... | MELLSTPHSIEINNITCDSFRISWAMEDSDLERVTHYFIDLNKKEN... | [-0.020009642, 0.0059901476, -0.019181218, -0.... |
1 | 1 | 1 | GPANK1 | GPANK1_(1) | gene/protein | GPANK1 belongs to gene/protein node. GPANK1 is... | [-0.025375651, 0.012858219, 0.008264126, -0.00... | MSRPLLITFTPATDPSDLWKDGQQQPQPEKPESTLDGAAARAFYEA... | [-0.025067309, 0.026195658, 0.012629821, -0.02... |
2 | 2 | 2 | ZRSR2 | ZRSR2_(2) | gene/protein | ZRSR2 belongs to gene/protein node. ZRSR2 is z... | [-0.032085866, 0.0071205534, -0.017097335, -0.... | MAAPEKMTFPEKPSHKKYRAALKKEKRKKRRQELARLRDSGLSQKE... | [0.0048196767, 0.004307966, 0.0130102495, -0.0... |
3 | 3 | 3 | NRF1 | NRF1_(3) | gene/protein | NRF1 belongs to gene/protein node. NRF1 is nuc... | [-0.030888347, -0.024794728, -0.020263912, -0.... | MEEHGVTQTEHMATIEAHAVAQQVQQVHVATYTEHSMLSADEDSPS... | [-0.031480603, 0.022127304, 0.0030120523, -0.0... |
4 | 4 | 4 | PI4KA | PI4KA_(4) | gene/protein | PI4KA belongs to gene/protein node. PI4KA is p... | [-0.029845022, -0.023542346, -0.01622012, -0.0... | MAAAPARGGGGGGGGGGGCSGSGSSASRGFYFNTVLSLARSLAVQR... | [-0.019105745, 0.005438388, 0.0045395615, -0.0... |
# Store the node dataframe per node type in two separate files:
# one with the enrichment columns and one with the embedding columns
local_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/nodes/'
os.makedirs(local_dir, exist_ok=True)
# Sub-folder name -> columns written into it (enrichment first, then embedding)
columns_per_folder = {
    'enrichment': ["node_index", "primekg_node_index", "node_id", "node_name", "node_type", "desc", "feat"],
    'embedding': ["node_id", "desc_emb", "feat_emb"],
}
for nt in nodes_df.node_type.unique():
    # "/" cannot be part of a file name (e.g. "gene/protein")
    nt_ = nt.replace('/', '_')
    nodes_of_type = nodes_df[nodes_df.node_type == nt]
    for folder, columns in columns_per_folder.items():
        target_dir = os.path.join(local_dir, folder)
        os.makedirs(target_dir, exist_ok=True)
        nodes_of_type[columns].to_parquet(
            os.path.join(target_dir, f"{nt_}.parquet.gzip"),
            compression='gzip',
            index=False,
        )
# Load the node enrichment dataframes back from the parquet files
import glob
import cudf  # use cudf for GPU acceleration

# Collect the enrichment file of every node type
file_list = glob.glob(
    os.path.join(
        "../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/",
        "nodes", "enrichment", '*.parquet.gzip',
    )
)
# Concatenate all node types into one frame; sorting by 'node_index' makes the
# row order independent of the order in which glob returned the files
nodes_df = cudf.concat([cudf.read_parquet(f) for f in file_list], ignore_index=True)
nodes_df = nodes_df.sort_values(by='node_index').reset_index(drop=True)
nodes_df.head(5)
INFO:numba.cuda.cudadrv.driver:init
node_index | primekg_node_index | node_id | node_name | node_type | desc | feat | |
---|---|---|---|---|---|---|---|
0 | 0 | 0 | PHYHIP_(0) | PHYHIP | gene/protein | PHYHIP belongs to gene/protein node. PHYHIP is... | MELLSTPHSIEINNITCDSFRISWAMEDSDLERVTHYFIDLNKKEN... |
1 | 1 | 1 | GPANK1_(1) | GPANK1 | gene/protein | GPANK1 belongs to gene/protein node. GPANK1 is... | MSRPLLITFTPATDPSDLWKDGQQQPQPEKPESTLDGAAARAFYEA... |
2 | 2 | 2 | ZRSR2_(2) | ZRSR2 | gene/protein | ZRSR2 belongs to gene/protein node. ZRSR2 is z... | MAAPEKMTFPEKPSHKKYRAALKKEKRKKRRQELARLRDSGLSQKE... |
3 | 3 | 3 | NRF1_(3) | NRF1 | gene/protein | NRF1 belongs to gene/protein node. NRF1 is nuc... | MEEHGVTQTEHMATIEAHAVAQQVQQVHVATYTEHSMLSADEDSPS... |
4 | 4 | 4 | PI4KA_(4) | PI4KA | gene/protein | PI4KA belongs to gene/protein node. PI4KA is p... | MAAAPARGGGGGGGGGGGCSGSGSSASRGFYFNTVLSLARSLAVQR... |
Edge Enrichment & Embedding¶
We will also perform enrichment and embedding for the edges of BioBridge-PrimeKG.
This time, we only use textual enrichment, obtained by simply concatenating the head, the relation, and the tail.
# Keep only the edges whose head and tail nodes both exist in BioBridge-PrimeKG.
# Only the id column is moved from cuDF to pandas, and only once.
known_node_ids = nodes_df['primekg_node_index'].to_pandas().unique()
keep_edge = edges_df['head_index'].isin(known_node_ids) & edges_df['tail_index'].isin(known_node_ids)
# Explicit copy: new columns are added below, which must not target a slice of the original frame
edges_df = edges_df[keep_edge].copy()
# Adding the (head type, relation, tail type) triplet of each edge, as a tuple and as a string.
# Built column-wise with zip, which avoids a slow row-wise apply over the whole edge table.
edge_triplets = list(zip(edges_df['head_type'], edges_df['display_relation'], edges_df['tail_type']))
edges_df["edge_type"] = pd.Series(edge_triplets, index=edges_df.index, dtype=object)
edges_df["edge_type_str"] = [
    f"{head_type}|{display_relation}|{tail_type}"
    for head_type, display_relation, tail_type in edge_triplets
]
edges_df.head(5)
head_index | head_name | head_source | head_id | head_type | tail_index | tail_name | tail_source | tail_id | tail_type | display_relation | relation | edge_type | edge_type_str | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | PHYHIP | NCBI | 9796 | gene/protein | 8889 | KIF15 | NCBI | 56992 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein |
1 | 1 | GPANK1 | NCBI | 7918 | gene/protein | 2798 | PNMA1 | NCBI | 9240 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein |
2 | 2 | ZRSR2 | NCBI | 8233 | gene/protein | 5646 | TTC33 | NCBI | 23548 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein |
3 | 3 | NRF1 | NCBI | 4899 | gene/protein | 11592 | MAN1B1 | NCBI | 11253 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein |
4 | 4 | PI4KA | NCBI | 5297 | gene/protein | 2122 | RGS20 | NCBI | 8601 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein |
# As of now, each edge is enriched using textual information only:
# one sentence that states the head node, the relation, and the tail node.
# The sentences are built column-wise with zip, which avoids a slow row-wise apply.
text_enriched_edges = [
    f"{head_name} ({head_type}) has a direct relationship of {relation}:{display_relation} with {tail_name} ({tail_type})."
    for head_name, head_type, relation, display_relation, tail_name, tail_type in zip(
        edges_df['head_name'],
        edges_df['head_type'],
        edges_df['relation'],
        edges_df['display_relation'],
        edges_df['tail_name'],
        edges_df['tail_type'],
    )
]
edges_df['feat'] = text_enriched_edges
edges_df.head(5)
head_index | head_name | head_source | head_id | head_type | tail_index | tail_name | tail_source | tail_id | tail_type | display_relation | relation | edge_type | edge_type_str | feat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | PHYHIP | NCBI | 9796 | gene/protein | 8889 | KIF15 | NCBI | 56992 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PHYHIP (gene/protein) has a direct relationshi... |
1 | 1 | GPANK1 | NCBI | 7918 | gene/protein | 2798 | PNMA1 | NCBI | 9240 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | GPANK1 (gene/protein) has a direct relationshi... |
2 | 2 | ZRSR2 | NCBI | 8233 | gene/protein | 5646 | TTC33 | NCBI | 23548 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | ZRSR2 (gene/protein) has a direct relationship... |
3 | 3 | NRF1 | NCBI | 4899 | gene/protein | 11592 | MAN1B1 | NCBI | 11253 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | NRF1 (gene/protein) has a direct relationship ... |
4 | 4 | PI4KA | NCBI | 5297 | gene/protein | 2122 | RGS20 | NCBI | 8601 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PI4KA (gene/protein) has a direct relationship... |
# # Embeddings using OpenAI API batch processing
# batch_size = 50000
# output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/'
# os.makedirs(output_dir, exist_ok=True)
# # Loop through the edges in batches to process embeddings
# docs = edges_df.feat.to_list()
# doc_ids = edges_df[["head_index",
# "tail_index",
# "display_relation"]].apply(lambda x: f"{x['head_index']}_{x['tail_index']}_{x['display_relation']}", axis=1).to_list()
# edges_feat_batch_filenames = []
# for i in range(0, len(docs), batch_size):
# batch_docs = docs[i:i + batch_size]
# batch_doc_ids = doc_ids[i:i + batch_size]
# batch_filename = os.path.join(output_dir, f'edges_feat_batch_{i // batch_size + 1}.jsonl')
# edges_feat_batch_filenames.append(batch_filename)
# # Write the batch to a file
# with open(batch_filename, 'w', encoding='utf-8') as f:
# for idx, text in enumerate(batch_docs):
# record = {
# "custom_id": f"text_{batch_doc_ids[idx]}",
# "method": "POST",
# "url": "/v1/embeddings",
# "body": {
# "model": "text-embedding-ada-002",
# "input": text
# }
# }
# f.write(json.dumps(record) + '\n')
# client = OpenAI()
# # Keep track of all batch metadata
# submitted_batches = submitted_batches[:19]
# # Loop through each batch file and submit to OpenAI
# for batch_filename in edges_feat_batch_filenames[20:35]:
# # Upload the file
# print(f"Uploading file: {batch_filename}")
# uploaded_file = client.files.create(
# file=open(batch_filename, "rb"),
# purpose="batch"
# )
# print(f"Uploaded file ID: {uploaded_file.id}")
# # Create a batch job
# batch_job = client.batches.create(
# input_file_id=uploaded_file.id,
# endpoint="/v1/embeddings",
# completion_window="24h" # Options: "24h", "1h"
# )
# print(f"Submitted batch ID: {batch_job.id}")
# submitted_batches.append({
# "batch_filename": batch_filename,
# "file_id": uploaded_file.id,
# "batch_id": batch_job.id,
# "status": batch_job.status
# })
# # Optional short pause to avoid rate limits
# time.sleep(2)
# # Save metadata for all submitted batches
# batch_metadata_df = pd.DataFrame(submitted_batches)
# batch_metadata_df.to_csv(os.path.join(output_dir, "edges_feat_submitted_batches_metadata.csv"), index=False)
# print("Saved batch tracking metadata for nodes.")
# batch_metadata_df
# Read back the bookkeeping table of the batches submitted for the edge features
output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/'
metadata_path = os.path.join(output_dir, "edges_feat_submitted_batches_metadata.csv")
batch_metadata_df = pd.read_csv(metadata_path)
batch_metadata_df
batch_filename | file_id | batch_id | status | |
---|---|---|---|---|
0 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-SGaEU9hKVGKBeWGbjdts15 | batch_6834763f12a08190b8231a130d191d99 | completed |
1 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-FP5Ghm8W8K6eeV2voEk6ED | batch_6834765927e8819081937361c97c9fb1 | completed |
2 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-MWznAz1mmBSxf7ikDR5Qsg | batch_6834767293b88190845d90932b431330 | completed |
3 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-DUQ1LBHXd6rcUMBEFw5tx5 | batch_6834768abab48190aced7409d3a81f17 | completed |
4 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-JwHk17WXvwXRnWPJZTtNfd | batch_683476a5af748190b352964ac372f628 | completed |
... | ... | ... | ... | ... |
77 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-Gn87X3H8tnQGqFqRthWA5x | batch_68359cfc60d88190a781b016f1509e05 | completed |
78 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-PA66vUvDsg5BPuFtoTGxfT | batch_6835a5115918819090981db638124c4f | completed |
79 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-Lgk3xVfkhHUeW3xGxh1dx7 | batch_6835a55814ec8190bc327bb81944648f | completed |
80 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-8nREooXTcbF1YHHzqK54db | batch_6835a5fa8e2c819090eab1216e5daadb | completed |
81 | ../../../aiagents4pharma/talk2knowledgegraphs/... | file-WTqdzqeE9LdibBudPCM2v1 | batch_6836f9e4bc208190ba28ac24a9c8ac2e | completed |
82 rows × 4 columns
# # We can skip this step if we already store the embeddings in parquet files
# output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/results/'
# output_files = [os.path.join(output_dir, f'{b}_output.jsonl') for b in batch_metadata_df.batch_id.values.tolist()]
# output_files
# # os.makedirs(output_dir, exist_ok=True)
# # filename = batch_metadata_df.iloc[0].batch_id
# for b_id in batch_metadata_df.batch_id.values.tolist():
# print(f"Processing batch ID: {b_id}")
# output_file = os.path.join(output_dir, f'{b_id}_output.jsonl')
# embeddings = []
# with open(output_file, 'r', encoding='utf-8') as f:
# for line in f:
# record = json.loads(line)
# custom_id = record.get('custom_id')
# embedding = record['response']['body']['data'][0]['embedding']
# embeddings.append([custom_id, embedding])
# embeddings_df = pd.DataFrame(embeddings, columns=['custom_id', 'embedding']).to_parquet(
# os.path.join(output_dir, f'{b_id}.parquet.gzip'),
# compression='gzip',
# index=False
# )
Processing batch ID: batch_6834763f12a08190b8231a130d191d99 Processing batch ID: batch_6834765927e8819081937361c97c9fb1 Processing batch ID: batch_6834767293b88190845d90932b431330 Processing batch ID: batch_6834768abab48190aced7409d3a81f17 Processing batch ID: batch_683476a5af748190b352964ac372f628 Processing batch ID: batch_683476c2a73c81909ce7695ba9ba3fa6 Processing batch ID: batch_683476df3a2c8190bdaab39f827f4f43 Processing batch ID: batch_6834770077e0819081a5420c64259391 Processing batch ID: batch_6834771e816081909a0c0bb1a2de53a4 Processing batch ID: batch_6834773981388190b7e842b15c6f9c8b Processing batch ID: batch_6834775444dc81908b97edd53afaf084 Processing batch ID: batch_683477708a388190aa83dcd8326ba0f9 Processing batch ID: batch_6834778ed6c08190904aaa2ed0e5ae7a Processing batch ID: batch_683477ad53d081908ea985b6fc8f766e Processing batch ID: batch_683477cabb8881909bec332fe412c8ea Processing batch ID: batch_683477e6e0948190af68059ca065af7a Processing batch ID: batch_683478035fec8190b29b1172d8219c9a Processing batch ID: batch_6834782311b4819086aa426a8aa0bc8e Processing batch ID: batch_6834783da0d4819086f1b6064fa78bcc Processing batch ID: batch_6834a0d901988190ad75bad26bc43417 Processing batch ID: batch_6834a138e4488190b8b2faf9b4550b19 Processing batch ID: batch_6834a18048ac8190b8ab7f8070742f1a Processing batch ID: batch_6834a7ec9b708190a24951059dbcdc6a Processing batch ID: batch_6834a82f07fc819090eb773573063233 Processing batch ID: batch_6834a86d42d88190b8980bf13493d642 Processing batch ID: batch_6834a8a8b2788190a844f1995ba02cfe Processing batch ID: batch_6834a8eab43c8190a33db88de1e91479 Processing batch ID: batch_6834a91e2ad081908c02b146cd389503 Processing batch ID: batch_6834a9544cbc8190b0df565440bac021 Processing batch ID: batch_6834a99291548190870d80952d2395ab Processing batch ID: batch_6834a9cd4c408190a9d62f15d2e61e48 Processing batch ID: batch_6834aa077cd88190bdfaf85c274afa25 Processing batch ID: batch_6834aa46d9b08190ae216a1ee41ee6c6 Processing batch 
ID: batch_6834ab66f40881909ab8e917063399eb Processing batch ID: batch_68356cd4b3a081908bb1e0226083589f Processing batch ID: batch_6834d0800e1c8190ad252c7fc9db1ec8 Processing batch ID: batch_6834d0c1fb2881909e20076f6018608d Processing batch ID: batch_6834d0ec71a081908c3f49b4449232f4 Processing batch ID: batch_68352809b66c8190b4bf7a6a4048e3d7 Processing batch ID: batch_683528504e5c8190911ab798a7002ab2 Processing batch ID: batch_683528d069f48190b87f60b2c45faf22 Processing batch ID: batch_6835291625ec819096815e36f5f65639 Processing batch ID: batch_6835295c124c8190a19c96bcd144b48f Processing batch ID: batch_68352993c8bc819085ab937ac71c3579 Processing batch ID: batch_68352a224024819087c5ad9dfe28e089 Processing batch ID: batch_68352a6c3a308190894b1b03dbec85ce Processing batch ID: batch_68352aa8b30c8190af740dd0b6e3186d Processing batch ID: batch_68352ae1a7948190a1f13079b8e4009e Processing batch ID: batch_68352b5d9f248190b3e2db61ece37667 Processing batch ID: batch_68352b9dc3d88190af151a1a76402a8b Processing batch ID: batch_68352bd874388190ac71211d7926101b Processing batch ID: batch_68352c2091748190a83198f087ac6807 Processing batch ID: batch_68352c5843a88190bd041cd674345329 Processing batch ID: batch_68352c9005dc8190855f095269619ab3 Processing batch ID: batch_68352ccc68748190a7840b7a6f7f26e3 Processing batch ID: batch_68352d143da48190a5d6e24594ced127 Processing batch ID: batch_68352df6e4388190b3730b097dcde1d0 Processing batch ID: batch_68356c41972481909b57df91b416e3ac Processing batch ID: batch_68357c05df448190afa1b1ed6eeecec6 Processing batch ID: batch_68357cb5f38c8190b1fbca0de8f306c1 Processing batch ID: batch_68357cee5fd481909eb05f82d6705de3 Processing batch ID: batch_68357d3ab9808190954a226be8036e96 Processing batch ID: batch_68357de4533c8190951e08b2d8ca904e Processing batch ID: batch_68357e2b8484819089e0eb56fa6e6636 Processing batch ID: batch_68357e979c048190908c931c115e3f49 Processing batch ID: batch_68357f539b288190930a4e409dcaab4c Processing batch ID: 
batch_6835b2fb9784819088680a1367a2a02c Processing batch ID: batch_68357f8a92f8819082b9521d9ba39b33 Processing batch ID: batch_68357fe57f948190a4f02ddb350d4205 Processing batch ID: batch_68358055be608190adb69583ee28924f Processing batch ID: batch_683580a8825c8190967880741ba6140e Processing batch ID: batch_6835b526b3948190a3273876cce0785e Processing batch ID: batch_68358107bbfc819098ce8f18af64efca Processing batch ID: batch_683581464df48190b87f00905ac9beb2 Processing batch ID: batch_6835819fed588190af54910fa44aa4ef Processing batch ID: batch_68358211fce8819087df1faecf9e4e72 Processing batch ID: batch_68358263bdc081909cbfb870793a53df Processing batch ID: batch_68359cfc60d88190a781b016f1509e05 Processing batch ID: batch_6835a5115918819090981db638124c4f Processing batch ID: batch_6835a55814ec8190bc327bb81944648f Processing batch ID: batch_6835a5fa8e2c819090eab1216e5daadb
# Gather the per-batch embedding tables (one parquet file per batch id) into one dataframe
output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/results/'
dfs = []
for b_id in batch_metadata_df['batch_id'].tolist():
    parquet_path = os.path.join(output_dir, f'{b_id}.parquet.gzip')
    with open(parquet_path, 'rb') as f:
        df = pd.read_parquet(f)
    # Report the size of every batch so that short or retried batches are visible
    print(f"Batch ID: {b_id}, Shape: {df.shape}")
    dfs.append(df)
edges_feat_embeddings_df = pd.concat(dfs, ignore_index=True)
Batch ID: batch_6834763f12a08190b8231a130d191d99, Shape: (50000, 2) Batch ID: batch_6834765927e8819081937361c97c9fb1, Shape: (50000, 2) Batch ID: batch_6834767293b88190845d90932b431330, Shape: (50000, 2) Batch ID: batch_6834768abab48190aced7409d3a81f17, Shape: (50000, 2) Batch ID: batch_683476a5af748190b352964ac372f628, Shape: (50000, 2) Batch ID: batch_683476c2a73c81909ce7695ba9ba3fa6, Shape: (50000, 2) Batch ID: batch_683476df3a2c8190bdaab39f827f4f43, Shape: (50000, 2) Batch ID: batch_6834770077e0819081a5420c64259391, Shape: (49999, 2) Batch ID: batch_6834771e816081909a0c0bb1a2de53a4, Shape: (50000, 2) Batch ID: batch_6834773981388190b7e842b15c6f9c8b, Shape: (50000, 2) Batch ID: batch_6834775444dc81908b97edd53afaf084, Shape: (50000, 2) Batch ID: batch_683477708a388190aa83dcd8326ba0f9, Shape: (50000, 2) Batch ID: batch_6834778ed6c08190904aaa2ed0e5ae7a, Shape: (50000, 2) Batch ID: batch_683477ad53d081908ea985b6fc8f766e, Shape: (50000, 2) Batch ID: batch_683477cabb8881909bec332fe412c8ea, Shape: (50000, 2) Batch ID: batch_683477e6e0948190af68059ca065af7a, Shape: (50000, 2) Batch ID: batch_683478035fec8190b29b1172d8219c9a, Shape: (50000, 2) Batch ID: batch_6834782311b4819086aa426a8aa0bc8e, Shape: (50000, 2) Batch ID: batch_6834783da0d4819086f1b6064fa78bcc, Shape: (50000, 2) Batch ID: batch_6834a0d901988190ad75bad26bc43417, Shape: (50000, 2) Batch ID: batch_6834a138e4488190b8b2faf9b4550b19, Shape: (50000, 2) Batch ID: batch_6834a18048ac8190b8ab7f8070742f1a, Shape: (50000, 2) Batch ID: batch_6834a7ec9b708190a24951059dbcdc6a, Shape: (50000, 2) Batch ID: batch_6834a82f07fc819090eb773573063233, Shape: (50000, 2) Batch ID: batch_6834a86d42d88190b8980bf13493d642, Shape: (50000, 2) Batch ID: batch_6834a8a8b2788190a844f1995ba02cfe, Shape: (50000, 2) Batch ID: batch_6834a8eab43c8190a33db88de1e91479, Shape: (50000, 2) Batch ID: batch_6834a91e2ad081908c02b146cd389503, Shape: (50000, 2) Batch ID: batch_6834a9544cbc8190b0df565440bac021, Shape: (50000, 2) Batch ID: 
batch_6834a99291548190870d80952d2395ab, Shape: (50000, 2) Batch ID: batch_6834a9cd4c408190a9d62f15d2e61e48, Shape: (50000, 2) Batch ID: batch_6834aa077cd88190bdfaf85c274afa25, Shape: (50000, 2) Batch ID: batch_6834aa46d9b08190ae216a1ee41ee6c6, Shape: (50000, 2) Batch ID: batch_6834ab66f40881909ab8e917063399eb, Shape: (50000, 2) Batch ID: batch_68356cd4b3a081908bb1e0226083589f, Shape: (50000, 2) Batch ID: batch_6834d0800e1c8190ad252c7fc9db1ec8, Shape: (50000, 2) Batch ID: batch_6834d0c1fb2881909e20076f6018608d, Shape: (50000, 2) Batch ID: batch_6834d0ec71a081908c3f49b4449232f4, Shape: (50000, 2) Batch ID: batch_68352809b66c8190b4bf7a6a4048e3d7, Shape: (50000, 2) Batch ID: batch_683528504e5c8190911ab798a7002ab2, Shape: (50000, 2) Batch ID: batch_683528d069f48190b87f60b2c45faf22, Shape: (50000, 2) Batch ID: batch_6835291625ec819096815e36f5f65639, Shape: (50000, 2) Batch ID: batch_6835295c124c8190a19c96bcd144b48f, Shape: (50000, 2) Batch ID: batch_68352993c8bc819085ab937ac71c3579, Shape: (50000, 2) Batch ID: batch_68352a224024819087c5ad9dfe28e089, Shape: (50000, 2) Batch ID: batch_68352a6c3a308190894b1b03dbec85ce, Shape: (50000, 2) Batch ID: batch_68352aa8b30c8190af740dd0b6e3186d, Shape: (50000, 2) Batch ID: batch_68352ae1a7948190a1f13079b8e4009e, Shape: (50000, 2) Batch ID: batch_68352b5d9f248190b3e2db61ece37667, Shape: (49999, 2) Batch ID: batch_68352b9dc3d88190af151a1a76402a8b, Shape: (50000, 2) Batch ID: batch_68352bd874388190ac71211d7926101b, Shape: (50000, 2) Batch ID: batch_68352c2091748190a83198f087ac6807, Shape: (50000, 2) Batch ID: batch_68352c5843a88190bd041cd674345329, Shape: (50000, 2) Batch ID: batch_68352c9005dc8190855f095269619ab3, Shape: (50000, 2) Batch ID: batch_68352ccc68748190a7840b7a6f7f26e3, Shape: (50000, 2) Batch ID: batch_68352d143da48190a5d6e24594ced127, Shape: (50000, 2) Batch ID: batch_68352df6e4388190b3730b097dcde1d0, Shape: (50000, 2) Batch ID: batch_68356c41972481909b57df91b416e3ac, Shape: (50000, 2) Batch ID: 
batch_68357c05df448190afa1b1ed6eeecec6, Shape: (50000, 2) Batch ID: batch_68357cb5f38c8190b1fbca0de8f306c1, Shape: (50000, 2) Batch ID: batch_68357cee5fd481909eb05f82d6705de3, Shape: (50000, 2) Batch ID: batch_68357d3ab9808190954a226be8036e96, Shape: (50000, 2) Batch ID: batch_68357de4533c8190951e08b2d8ca904e, Shape: (50000, 2) Batch ID: batch_68357e2b8484819089e0eb56fa6e6636, Shape: (50000, 2) Batch ID: batch_68357e979c048190908c931c115e3f49, Shape: (50000, 2) Batch ID: batch_68357f539b288190930a4e409dcaab4c, Shape: (49999, 2) Batch ID: batch_6835b2fb9784819088680a1367a2a02c, Shape: (1, 2) Batch ID: batch_68357f8a92f8819082b9521d9ba39b33, Shape: (50000, 2) Batch ID: batch_68357fe57f948190a4f02ddb350d4205, Shape: (50000, 2) Batch ID: batch_68358055be608190adb69583ee28924f, Shape: (50000, 2) Batch ID: batch_683580a8825c8190967880741ba6140e, Shape: (49999, 2) Batch ID: batch_6835b526b3948190a3273876cce0785e, Shape: (1, 2) Batch ID: batch_68358107bbfc819098ce8f18af64efca, Shape: (50000, 2) Batch ID: batch_683581464df48190b87f00905ac9beb2, Shape: (50000, 2) Batch ID: batch_6835819fed588190af54910fa44aa4ef, Shape: (50000, 2) Batch ID: batch_68358211fce8819087df1faecf9e4e72, Shape: (50000, 2) Batch ID: batch_68358263bdc081909cbfb870793a53df, Shape: (50000, 2) Batch ID: batch_68359cfc60d88190a781b016f1509e05, Shape: (50000, 2) Batch ID: batch_6835a5115918819090981db638124c4f, Shape: (50000, 2) Batch ID: batch_6835a55814ec8190bc327bb81944648f, Shape: (50000, 2) Batch ID: batch_6835a5fa8e2c819090eab1216e5daadb, Shape: (4611, 2) Batch ID: batch_6836f9e4bc208190ba28ac24a9c8ac2e, Shape: (1, 2)
import gc

# Drop the name `dfs` (if it is defined) so the list it referred to can be
# reclaimed, then run a garbage collection pass.
globals().pop('dfs', None)
gc.collect()
10
# custom_id has the form "text_<head_index>_<tail_index>_<display_relation>".
# Split each id once, and at most three times, so that a relation name that
# itself contains '_' is kept whole instead of being truncated.
custom_id_parts = edges_feat_embeddings_df['custom_id'].apply(lambda cid: cid.split('_', 3))
edges_feat_embeddings_df['head_index'] = custom_id_parts.apply(lambda parts: int(parts[1]))
edges_feat_embeddings_df['tail_index'] = custom_id_parts.apply(lambda parts: int(parts[2]))
edges_feat_embeddings_df['display_relation'] = custom_id_parts.apply(lambda parts: parts[3])
# The id is fully decomposed into the three key columns above
edges_feat_embeddings_df.drop(columns=['custom_id'], inplace=True)
edges_feat_embeddings_df.head(5)
embedding | head_index | tail_index | display_relation | |
---|---|---|---|---|
0 | [-0.01934238, 0.0011752498, 0.004431808, -0.03... | 0 | 8889 | ppi |
1 | [-0.01459289, 0.0035886865, 0.013382328, -0.03... | 1 | 2798 | ppi |
2 | [-0.016827235, -0.0052953544, 0.0059216865, -0... | 2 | 5646 | ppi |
3 | [-0.012488328, -0.019190352, 0.0069740876, -0.... | 3 | 11592 | ppi |
4 | [-0.00816904, -0.016246071, 0.011198749, -0.03... | 4 | 2122 | ppi |
# Attach the feature embedding of every edge, matched on the
# (head, tail, relation) key, and store it under the name 'feat_emb'.
edge_key = ['head_index', 'tail_index', 'display_relation']
edges_df = (
    edges_df
    .merge(edges_feat_embeddings_df, on=edge_key, how='left')
    .rename(columns={'embedding': 'feat_emb'})
)
edges_df.head(5)
head_index | head_name | head_source | head_id | head_type | tail_index | tail_name | tail_source | tail_id | tail_type | display_relation | relation | edge_type | edge_type_str | feat | feat_emb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | PHYHIP | NCBI | 9796 | gene/protein | 8889 | KIF15 | NCBI | 56992 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PHYHIP (gene/protein) has a direct relationshi... | [-0.01934238, 0.0011752498, 0.004431808, -0.03... |
1 | 1 | GPANK1 | NCBI | 7918 | gene/protein | 2798 | PNMA1 | NCBI | 9240 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | GPANK1 (gene/protein) has a direct relationshi... | [-0.01459289, 0.0035886865, 0.013382328, -0.03... |
2 | 2 | ZRSR2 | NCBI | 8233 | gene/protein | 5646 | TTC33 | NCBI | 23548 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | ZRSR2 (gene/protein) has a direct relationship... | [-0.016827235, -0.0052953544, 0.0059216865, -0... |
3 | 3 | NRF1 | NCBI | 4899 | gene/protein | 11592 | MAN1B1 | NCBI | 11253 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | NRF1 (gene/protein) has a direct relationship ... | [-0.012488328, -0.019190352, 0.0069740876, -0.... |
4 | 4 | PI4KA | NCBI | 5297 | gene/protein | 2122 | RGS20 | NCBI | 8601 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PI4KA (gene/protein) has a direct relationship... | [-0.00816904, -0.016246071, 0.011198749, -0.03... |
# # We just downloaded the outputs from the OpenAI API, so we can load them directly
# output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/preprocessing/results/'
# output_files = [os.path.join(output_dir, f'{b}_output.jsonl') for b in batch_metadata_df.batch_id.values.tolist()]
# output_files
# # # Load the embeddings from the output files
# # def load_embeddings_from_output_files(output_files):
# # embeddings = []
# # for output_file in output_files:
# # with open(output_file, 'r', encoding='utf-8') as f:
# # for line in f:
# # record = json.loads(line)
# # custom_id = record.get('custom_id')
# # embedding = record['response']['body']['data'][0]['embedding']
# # embeddings.append([custom_id, embedding])
# # return pd.DataFrame(embeddings, columns=['custom_id', 'embedding'])
# # Load the embeddings
# edges_embeddings_df = load_embeddings_from_output_files(output_files)
# # Check the shape of the embeddings DataFrame
# print(f"Shape of nodes_desc_embeddings_df: {edges_embeddings_df.shape}")
# print(f"Shape of nodes_df: {edges_df.shape}")
# Inspect the (rows, columns) shape of the edge dataframe
edges_df.shape
(3623256, 15)
# for i, b_id in enumerate(batch_metadata_df.batch_id.values.tolist()):
# print(b_id, check_batch_status(b_id))
# time.sleep(2)
After that, we perform the same embedding process for the edges using an Ollama model.
# # Perform embedding using NVIDIA embeddings
# emb_model = NVIDIAEmbeddings(
# model="nvidia/llama-3.2-nv-embedqa-1b-v2",
# base_url="http://localhost:8000/v1"
# )
#
# # Since the records of edges has large amount of data, we will split them into mini-batches
# mini_batch_size = 100
# edge_embeddings = []
# for i in tqdm(range(0, edges_df.shape[0], mini_batch_size)):
# outputs = emb_model.embed_documents(edges_df.enriched_edge.values.tolist()[i:i+mini_batch_size])
# edge_embeddings.extend(outputs)
# # Add them as features to the dataframe
# edges_df['edge_attr'] = edge_embeddings
# # Using nomic-ai/nomic-embed-text-v1.5 model via Ollama
# emb_model = EmbeddingWithOllama(model_name='nomic-embed-text')
# # Populate the edge embeddings dictionary
# edge_embeddings_keys = edges_df.edge_type.unique().tolist()
# edge_embeddings = emb_model.embed_documents([str(e) for e in edge_embeddings_keys])
# edge_embeddings_dict = dict(zip(edge_embeddings_keys, edge_embeddings))
# edges_df['edge_emb'] = edges_df.apply(lambda x: edge_embeddings_dict[x.edge_type], axis=1)
# edges_df.head(5)
head_index | head_name | head_source | head_id | head_type | tail_index | tail_name | tail_source | tail_id | tail_type | display_relation | relation | edge_type | edge_type_str | edge_emb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | PHYHIP | NCBI | 9796 | gene/protein | 8889 | KIF15 | NCBI | 56992 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | [0.024646243, 0.04494511, -0.13975705, -0.0281... |
1 | 1 | GPANK1 | NCBI | 7918 | gene/protein | 2798 | PNMA1 | NCBI | 9240 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | [0.024646243, 0.04494511, -0.13975705, -0.0281... |
2 | 2 | ZRSR2 | NCBI | 8233 | gene/protein | 5646 | TTC33 | NCBI | 23548 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | [0.024646243, 0.04494511, -0.13975705, -0.0281... |
3 | 3 | NRF1 | NCBI | 4899 | gene/protein | 11592 | MAN1B1 | NCBI | 11253 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | [0.024646243, 0.04494511, -0.13975705, -0.0281... |
4 | 4 | PI4KA | NCBI | 5297 | gene/protein | 2122 | RGS20 | NCBI | 8601 | gene/protein | ppi | protein_protein | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | [0.024646243, 0.04494511, -0.13975705, -0.0281... |
# Remove the columns that are no longer needed and mark the original
# head/tail indices as PrimeKG indices.
obsolete_columns = ['head_source', 'head_id', 'head_type', 'tail_source', 'tail_id', 'tail_type', 'relation']
primekg_index_names = {'head_index': 'primekg_head_index', 'tail_index': 'primekg_tail_index'}
edges_df.drop(columns=obsolete_columns, inplace=True)
edges_df.rename(columns=primekg_index_names, inplace=True)
# Check dataframe of edges
edges_df.head(5)
primekg_head_index | head_name | primekg_tail_index | tail_name | display_relation | edge_type | edge_type_str | feat | feat_emb | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | PHYHIP | 8889 | KIF15 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PHYHIP (gene/protein) has a direct relationshi... | [-0.01934238, 0.0011752498, 0.004431808, -0.03... |
1 | 1 | GPANK1 | 2798 | PNMA1 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | GPANK1 (gene/protein) has a direct relationshi... | [-0.01459289, 0.0035886865, 0.013382328, -0.03... |
2 | 2 | ZRSR2 | 5646 | TTC33 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | ZRSR2 (gene/protein) has a direct relationship... | [-0.016827235, -0.0052953544, 0.0059216865, -0... |
3 | 3 | NRF1 | 11592 | MAN1B1 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | NRF1 (gene/protein) has a direct relationship ... | [-0.012488328, -0.019190352, 0.0069740876, -0.... |
4 | 4 | PI4KA | 2122 | RGS20 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PI4KA (gene/protein) has a direct relationship... | [-0.00816904, -0.016246071, 0.011198749, -0.03... |
# Turn the current row index into an explicit 'triplet_index' identifier column
edges_df = edges_df.reset_index().rename(columns={'index': 'triplet_index'})
edges_df.head(5)
triplet_index | primekg_head_index | head_name | primekg_tail_index | tail_name | display_relation | edge_type | edge_type_str | feat | feat_emb | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | PHYHIP | 8889 | KIF15 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PHYHIP (gene/protein) has a direct relationshi... | [-0.01934238, 0.0011752498, 0.004431808, -0.03... |
1 | 1 | 1 | GPANK1 | 2798 | PNMA1 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | GPANK1 (gene/protein) has a direct relationshi... | [-0.01459289, 0.0035886865, 0.013382328, -0.03... |
2 | 2 | 2 | ZRSR2 | 5646 | TTC33 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | ZRSR2 (gene/protein) has a direct relationship... | [-0.016827235, -0.0052953544, 0.0059216865, -0... |
3 | 3 | 3 | NRF1 | 11592 | MAN1B1 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | NRF1 (gene/protein) has a direct relationship ... | [-0.012488328, -0.019190352, 0.0069740876, -0.... |
4 | 4 | 4 | PI4KA | 2122 | RGS20 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PI4KA (gene/protein) has a direct relationship... | [-0.00816904, -0.016246071, 0.011198749, -0.03... |
# Replace the head/tail names by node identifiers of the form "<name>_(<primekg index>)".
# The identifiers are built column-wise with zip, which avoids a slow row-wise apply.
edges_df["head_id"] = [
    f"{name}_({primekg_index})"
    for name, primekg_index in zip(edges_df["head_name"], edges_df["primekg_head_index"])
]
edges_df["tail_id"] = [
    f"{name}_({primekg_index})"
    for name, primekg_index in zip(edges_df["tail_name"], edges_df["primekg_tail_index"])
]
# The names are now contained in the identifiers
edges_df.drop(columns=['head_name', 'tail_name'], inplace=True)
edges_df.reset_index(drop=True, inplace=True)
edges_df.head(5)
triplet_index | primekg_head_index | primekg_tail_index | display_relation | edge_type | edge_type_str | feat | feat_emb | head_id | tail_id | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 8889 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PHYHIP (gene/protein) has a direct relationshi... | [-0.01934238, 0.0011752498, 0.004431808, -0.03... | PHYHIP_(0) | KIF15_(8889) |
1 | 1 | 1 | 2798 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | GPANK1 (gene/protein) has a direct relationshi... | [-0.01459289, 0.0035886865, 0.013382328, -0.03... | GPANK1_(1) | PNMA1_(2798) |
2 | 2 | 2 | 5646 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | ZRSR2 (gene/protein) has a direct relationship... | [-0.016827235, -0.0052953544, 0.0059216865, -0... | ZRSR2_(2) | TTC33_(5646) |
3 | 3 | 3 | 11592 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | NRF1 (gene/protein) has a direct relationship ... | [-0.012488328, -0.019190352, 0.0069740876, -0.... | NRF1_(3) | MAN1B1_(11592) |
4 | 4 | 4 | 2122 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PI4KA (gene/protein) has a direct relationship... | [-0.00816904, -0.016246071, 0.011198749, -0.03... | PI4KA_(4) | RGS20_(2122) |
# Add index columns for head and tail nodes
# Convert the (node_index, node_id) lookup to pandas once and reuse it for
# both endpoints instead of repeating the conversion for every merge.
node_lookup = nodes_df[["node_index", "node_id"]].to_pandas()
for endpoint in ("head", "tail"):
    # Map <endpoint>_id to <endpoint>_index. The left join keeps every edge;
    # `validate="many_to_one"` makes pandas raise a MergeError if node_id is
    # not unique in the lookup, instead of silently duplicating edge rows.
    edges_df = (
        edges_df.merge(
            node_lookup,
            left_on=f"{endpoint}_id",
            right_on="node_id",
            how="left",
            validate="many_to_one",
        )
        .rename(columns={"node_index": f"{endpoint}_index"})
        .drop(columns=["node_id"])
    )
# Check the final edges dataframe
edges_df.head(5)
triplet_index | primekg_head_index | primekg_tail_index | display_relation | edge_type | edge_type_str | feat | feat_emb | head_id | tail_id | head_index | tail_index | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 8889 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PHYHIP (gene/protein) has a direct relationshi... | [-0.01934238, 0.0011752498, 0.004431808, -0.03... | PHYHIP_(0) | KIF15_(8889) | 0 | 8816 |
1 | 1 | 1 | 2798 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | GPANK1 (gene/protein) has a direct relationshi... | [-0.01459289, 0.0035886865, 0.013382328, -0.03... | GPANK1_(1) | PNMA1_(2798) | 1 | 2787 |
2 | 2 | 2 | 5646 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | ZRSR2 (gene/protein) has a direct relationship... | [-0.016827235, -0.0052953544, 0.0059216865, -0... | ZRSR2_(2) | TTC33_(5646) | 2 | 5610 |
3 | 3 | 3 | 11592 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | NRF1 (gene/protein) has a direct relationship ... | [-0.012488328, -0.019190352, 0.0069740876, -0.... | NRF1_(3) | MAN1B1_(11592) | 3 | 11467 |
4 | 4 | 4 | 2122 | ppi | (gene/protein, ppi, gene/protein) | gene/protein|ppi|gene/protein | PI4KA (gene/protein) has a direct relationship... | [-0.00816904, -0.016246071, 0.011198749, -0.03... | PI4KA_(4) | RGS20_(2122) | 4 | 2117 |
# Persist the edge dataframe as two separate artifacts: enrichment and embedding
local_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/edges/'
os.makedirs(local_dir, exist_ok=True)
# Enrichment: identifiers, relation metadata and the textual feature, written
# as a single gzip-compressed parquet file (no embedding column).
enrichment_dir = os.path.join(local_dir, 'enrichment')
os.makedirs(enrichment_dir, exist_ok=True)
enrichment_columns = [
    "triplet_index",
    "primekg_head_index",
    "primekg_tail_index",
    "head_id",
    "tail_id",
    "display_relation",
    "edge_type",
    "edge_type_str",
    "head_index",
    "tail_index",
    "feat",
]
enrichment_path = os.path.join(enrichment_dir, "edges.parquet.gzip")
edges_df[enrichment_columns].to_parquet(
    enrichment_path,
    compression='gzip',
    index=False,
)
# Update edges_df: keep only the columns needed for the embedding files and
# expose the feature embedding under the name "edge_emb".
embedding_columns = ["triplet_index", "head_index", "tail_index", "edge_type_str", "feat_emb"]
edges_df = edges_df[embedding_columns].rename(columns={"feat_emb": "edge_emb"})
edges_df.head(5)
triplet_index | head_index | tail_index | edge_type_str | edge_emb | |
---|---|---|---|---|---|
0 | 0 | 0 | 8816 | gene/protein|ppi|gene/protein | [-0.01934238, 0.0011752498, 0.004431808, -0.03... |
1 | 1 | 1 | 2787 | gene/protein|ppi|gene/protein | [-0.01459289, 0.0035886865, 0.013382328, -0.03... |
2 | 2 | 2 | 5610 | gene/protein|ppi|gene/protein | [-0.016827235, -0.0052953544, 0.0059216865, -0... |
3 | 3 | 3 | 11467 | gene/protein|ppi|gene/protein | [-0.012488328, -0.019190352, 0.0069740876, -0.... |
4 | 4 | 4 | 2117 | gene/protein|ppi|gene/protein | [-0.00816904, -0.016246071, 0.011198749, -0.03... |
# Store edge embeddings into a separate file
# edge_embeddings_df = pd.DataFrame(
# edge_embeddings_dict.items(),
# columns=['edge_type', 'edge_emb']
# )
# edge_embeddings_df['edge_type_str'] = edge_embeddings_df.apply(lambda x: f"{x.edge_type[0]}|{x.edge_type[1]}|{x.edge_type[2]}", axis=1)
# edge_embeddings_df = edge_embeddings_df[['edge_type', 'edge_type_str', 'edge_emb']]
# edge_embeddings_df
# Embedding
embedding_dir = os.path.join(local_dir, 'embedding')
os.makedirs(embedding_dir, exist_ok=True)
mini_batch_size = 50000
# Write the edges dataframe as a series of gzip-compressed parquet files,
# one per slice of `mini_batch_size` consecutive rows; each file is named
# after the row offset at which its slice starts.
total_rows = edges_df.shape[0]
for i in range(0, total_rows, mini_batch_size):
    chunk_path = os.path.join(embedding_dir, f"edges_{i}.parquet.gzip")
    chunk = edges_df.iloc[i:i + mini_batch_size]
    chunk.to_parquet(chunk_path, compression='gzip', index=False)
# edges_df[
# ["triplet_index", "head_index", "tail_index", "edge_type_str", "feat_emb"]
# ].rename(columns={"feat_emb": "edge_emb"}).to_parquet(
# os.path.join(local_dir, 'embedding', "edges.parquet.gzip"),
# compression='gzip',
# index=False
# )
# Report how many nodes and edges the final dataframes contain
node_count = len(nodes_df)
edge_count = len(edges_df)
print(f"Number of nodes: {node_count}")
print(f"Number of edges: {edge_count}")
Number of nodes: 84981 Number of edges: 3904610