UniProt ID Mapping
In [1]:
Copied!
# Import necessary libraries
import time
import json
import zlib
import requests
from requests.adapters import HTTPAdapter, Retry
from urllib.parse import urlparse, parse_qs, urlencode
import pandas as pd
import os
import pickle
# Import necessary libraries
import time
import json
import zlib
import requests
from requests.adapters import HTTPAdapter, Retry
from urllib.parse import urlparse, parse_qs, urlencode
import pandas as pd
import os
import pickle
In [2]:
Copied!
# Define variables to perform UniProt ID mapping
# Adopted from https://www.uniprot.org/help/id_mapping
API_URL = "https://rest.uniprot.org"
POLLING_INTERVAL = 5
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))
def submit_id_mapping(from_db, to_db, ids) -> str:
"""
Function to submit a job to perform ID mapping.
Args:
from_db (str): The source database.
to_db (str): The target database.
ids (list): The list of IDs to map.
Returns:
str: The job ID.
"""
request = requests.post(f"{API_URL}/idmapping/run",
data={"from": from_db,
"to": to_db,
"ids": ",".join(ids)},)
try:
request.raise_for_status()
except requests.HTTPError:
print(request.json())
raise
return request.json()["jobId"]
def check_id_mapping_results_ready(job_id):
"""
Function to check if the ID mapping results are ready.
Args:
job_id (str): The job ID.
Returns:
bool: True if the results are ready, False otherwise.
"""
while True:
request = session.get(f"{API_URL}/idmapping/status/{job_id}")
try:
request.raise_for_status()
except requests.HTTPError:
print(request.json())
raise
j = request.json()
if "jobStatus" in j:
if j["jobStatus"] in ("NEW", "RUNNING"):
print(f"Retrying in {POLLING_INTERVAL}s")
time.sleep(POLLING_INTERVAL)
else:
raise Exception(j["jobStatus"])
else:
return bool(j["results"] or j["failedIds"])
def get_id_mapping_results_link(job_id):
"""
Function to get the link to the ID mapping results.
Args:
job_id (str): The job ID.
Returns:
str: The link to the ID mapping results.
"""
url = f"{API_URL}/idmapping/details/{job_id}"
request = requests.Session().get(url)
try:
request.raise_for_status()
except requests.HTTPError:
print(request.json())
raise
return request.json()["redirectURL"]
def decode_results(response, file_format, compressed):
"""
Function to decode the ID mapping results.
Args:
response (requests.Response): The response object.
file_format (str): The file format of the results.
compressed (bool): Whether the results are compressed.
Returns:
str: The ID mapping results
"""
if compressed:
decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
if file_format == "json":
j = json.loads(decompressed.decode("utf-8"))
return j
elif file_format == "tsv":
return [line for line in decompressed.decode("utf-8").split("\n") if line]
elif file_format == "xlsx":
return [decompressed]
elif file_format == "xml":
return [decompressed.decode("utf-8")]
else:
return decompressed.decode("utf-8")
elif file_format == "json":
return response.json()
elif file_format == "tsv":
return [line for line in response.text.split("\n") if line]
elif file_format == "xlsx":
return [response.content]
elif file_format == "xml":
return [response.text]
return response.text
def get_id_mapping_results_stream(url):
"""
Function to get the ID mapping results from a stream.
Args:
url (str): The URL to the ID mapping results.
Returns:
str: The ID mapping results.
"""
if "/stream/" not in url:
url = url.replace("/results/", "/results/stream/")
request = session.get(url)
try:
request.raise_for_status()
except requests.HTTPError:
print(request.json())
raise
parsed = urlparse(url)
query = parse_qs(parsed.query)
file_format = query["format"][0] if "format" in query else "json"
compressed = (
query["compressed"][0].lower() == "true" if "compressed" in query else False
)
return decode_results(request, file_format, compressed)
# Define variables to perform UniProt ID mapping
# Adopted from https://www.uniprot.org/help/id_mapping
API_URL = "https://rest.uniprot.org"
POLLING_INTERVAL = 5
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))
def submit_id_mapping(from_db, to_db, ids) -> str:
"""
Function to submit a job to perform ID mapping.
Args:
from_db (str): The source database.
to_db (str): The target database.
ids (list): The list of IDs to map.
Returns:
str: The job ID.
"""
request = requests.post(f"{API_URL}/idmapping/run",
data={"from": from_db,
"to": to_db,
"ids": ",".join(ids)},)
try:
request.raise_for_status()
except requests.HTTPError:
print(request.json())
raise
return request.json()["jobId"]
def check_id_mapping_results_ready(job_id):
"""
Function to check if the ID mapping results are ready.
Args:
job_id (str): The job ID.
Returns:
bool: True if the results are ready, False otherwise.
"""
while True:
request = session.get(f"{API_URL}/idmapping/status/{job_id}")
try:
request.raise_for_status()
except requests.HTTPError:
print(request.json())
raise
j = request.json()
if "jobStatus" in j:
if j["jobStatus"] in ("NEW", "RUNNING"):
print(f"Retrying in {POLLING_INTERVAL}s")
time.sleep(POLLING_INTERVAL)
else:
raise Exception(j["jobStatus"])
else:
return bool(j["results"] or j["failedIds"])
def get_id_mapping_results_link(job_id):
"""
Function to get the link to the ID mapping results.
Args:
job_id (str): The job ID.
Returns:
str: The link to the ID mapping results.
"""
url = f"{API_URL}/idmapping/details/{job_id}"
request = requests.Session().get(url)
try:
request.raise_for_status()
except requests.HTTPError:
print(request.json())
raise
return request.json()["redirectURL"]
def decode_results(response, file_format, compressed):
"""
Function to decode the ID mapping results.
Args:
response (requests.Response): The response object.
file_format (str): The file format of the results.
compressed (bool): Whether the results are compressed.
Returns:
str: The ID mapping results
"""
if compressed:
decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
if file_format == "json":
j = json.loads(decompressed.decode("utf-8"))
return j
elif file_format == "tsv":
return [line for line in decompressed.decode("utf-8").split("\n") if line]
elif file_format == "xlsx":
return [decompressed]
elif file_format == "xml":
return [decompressed.decode("utf-8")]
else:
return decompressed.decode("utf-8")
elif file_format == "json":
return response.json()
elif file_format == "tsv":
return [line for line in response.text.split("\n") if line]
elif file_format == "xlsx":
return [response.content]
elif file_format == "xml":
return [response.text]
return response.text
def get_id_mapping_results_stream(url):
"""
Function to get the ID mapping results from a stream.
Args:
url (str): The URL to the ID mapping results.
Returns:
str: The ID mapping results.
"""
if "/stream/" not in url:
url = url.replace("/results/", "/results/stream/")
request = session.get(url)
try:
request.raise_for_status()
except requests.HTTPError:
print(request.json())
raise
parsed = urlparse(url)
query = parse_qs(parsed.query)
file_format = query["format"][0] if "format" in query else "json"
compressed = (
query["compressed"][0].lower() == "true" if "compressed" in query else False
)
return decode_results(request, file_format, compressed)
In [3]:
Copied!
# Submit a job to perform ID mapping
inputs = ['6774', '3569']
job_id = submit_id_mapping(
from_db="GeneID", to_db="UniProtKB", ids=inputs
)
# Submit a job to perform ID mapping
inputs = ['6774', '3569']
job_id = submit_id_mapping(
from_db="GeneID", to_db="UniProtKB", ids=inputs
)
In [4]:
Copied!
# Print the job ID
print(job_id)
# Print the job ID
print(job_id)
8556e200d5f3bb6ab102e25e58225fa49fa05e88
In [5]:
Copied!
# Check and get the ID mapping results
if check_id_mapping_results_ready(job_id):
link = get_id_mapping_results_link(job_id)
mapping_results = get_id_mapping_results_stream(link)
# Check and get the ID mapping results
if check_id_mapping_results_ready(job_id):
link = get_id_mapping_results_link(job_id)
mapping_results = get_id_mapping_results_stream(link)
In [6]:
Copied!
# Save the results to a pickle file
local_dir = '../../../../data/primekg_ibd/'
if not os.path.exists(local_dir):
os.makedirs(local_dir)
with open(os.path.join(local_dir, 'primekg_ibd_protein_mapped.pkl'), 'wb') as f:
pickle.dump(mapping_results["results"], f)
# Save the results to a pickle file
local_dir = '../../../../data/primekg_ibd/'
if not os.path.exists(local_dir):
os.makedirs(local_dir)
with open(os.path.join(local_dir, 'primekg_ibd_protein_mapped.pkl'), 'wb') as f:
pickle.dump(mapping_results["results"], f)
In [7]:
Copied!
# Convert mapping results to a dataframe
protein_mapped_df = pd.DataFrame(mapping_results["results"])
protein_mapped_df.head()
# Convert mapping results to a dataframe
protein_mapped_df = pd.DataFrame(mapping_results["results"])
protein_mapped_df.head()
Out[7]:
from | to | |
---|---|---|
0 | 6774 | {'entryType': 'UniProtKB reviewed (Swiss-Prot)... |
1 | 6774 | {'entryType': 'UniProtKB unreviewed (TrEMBL)',... |
2 | 3569 | {'entryType': 'UniProtKB reviewed (Swiss-Prot)... |
3 | 3569 | {'entryType': 'UniProtKB unreviewed (TrEMBL)',... |
4 | 3569 | {'entryType': 'UniProtKB unreviewed (TrEMBL)',... |
In [8]:
Copied!
# Checking duplicated entries based on their entryType
protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1).value_counts(0)
# Checking duplicated entries based on their entryType
protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1).value_counts(0)
Out[8]:
UniProtKB unreviewed (TrEMBL) 5 UniProtKB reviewed (Swiss-Prot) 2 Name: count, dtype: int64
In [9]:
Copied!
# There are two entryType. We choose the reviewed one.
protein_reviewed_df = protein_mapped_df[protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1) == 'UniProtKB reviewed (Swiss-Prot)']
protein_reviewed_df.reset_index(drop=True, inplace=True)
protein_reviewed_df.head()
# There are two entryType. We choose the reviewed one.
protein_reviewed_df = protein_mapped_df[protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1) == 'UniProtKB reviewed (Swiss-Prot)']
protein_reviewed_df.reset_index(drop=True, inplace=True)
protein_reviewed_df.head()
Out[9]:
from | to | |
---|---|---|
0 | 6774 | {'entryType': 'UniProtKB reviewed (Swiss-Prot)... |
1 | 3569 | {'entryType': 'UniProtKB reviewed (Swiss-Prot)... |
In [10]:
Copied!
for key in protein_reviewed_df['to'][0].keys():
protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
protein_reviewed_df.head()
for key in protein_reviewed_df['to'][0].keys():
protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
protein_reviewed_df.head()
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']] C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
Out[10]:
from | to | entryType | primaryAccession | secondaryAccessions | uniProtkbId | entryAudit | annotationScore | organism | proteinExistence | proteinDescription | genes | comments | features | keywords | references | uniProtKBCrossReferences | sequence | extraAttributes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6774 | {'entryType': 'UniProtKB reviewed (Swiss-Prot)... | UniProtKB reviewed (Swiss-Prot) | P40763 | [A8K7B8, K7ENL3, O14916, Q9BW54] | STAT3_HUMAN | {'firstPublicDate': '1995-02-01', 'lastAnnotat... | 5.0 | {'scientificName': 'Homo sapiens', 'commonName... | 1: Evidence at protein level | {'recommendedName': {'fullName': {'evidences':... | [{'geneName': {'evidences': [{'evidenceCode': ... | [{'texts': [{'evidences': [{'evidenceCode': 'E... | [{'type': 'Initiator methionine', 'location': ... | [{'id': 'KW-0002', 'category': 'Technical term... | [{'referenceNumber': 1, 'citation': {'id': '75... | [{'database': 'EMBL', 'id': 'L29277', 'propert... | {'value': 'MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLA... | {'countByCommentType': {'FUNCTION': 1, 'SUBUNI... |
1 | 3569 | {'entryType': 'UniProtKB reviewed (Swiss-Prot)... | UniProtKB reviewed (Swiss-Prot) | P05231 | [Q9UCU2, Q9UCU3, Q9UCU4] | IL6_HUMAN | {'firstPublicDate': '1987-08-13', 'lastAnnotat... | 5.0 | {'scientificName': 'Homo sapiens', 'commonName... | 1: Evidence at protein level | {'recommendedName': {'fullName': {'evidences':... | [{'geneName': {'evidences': [{'evidenceCode': ... | [{'texts': [{'evidences': [{'evidenceCode': 'E... | [{'type': 'Signal', 'location': {'start': {'va... | [{'id': 'KW-0002', 'category': 'Technical term... | [{'referenceNumber': 1, 'citation': {'id': '34... | [{'database': 'EMBL', 'id': 'X04430', 'propert... | {'value': 'MNSFSTSAFGPVAFSLGLLLVLPAAFPAPVPPGED... | {'countByCommentType': {'FUNCTION': 3, 'SUBUNI... |