UniProt ID Mapping

In [1]:

Copied!





# Import necessary libraries
import time
import json
import zlib
import requests
from requests.adapters import HTTPAdapter, Retry
from urllib.parse import urlparse, parse_qs, urlencode
import pandas as pd
import os
import pickle
# Import necessary libraries
import time
import json
import zlib
import requests
from requests.adapters import HTTPAdapter, Retry
from urllib.parse import urlparse, parse_qs, urlencode
import pandas as pd
import os
import pickle

In [2]:

Copied!





# Define variables to perform UniProt ID mapping
# Adopted from https://www.uniprot.org/help/id_mapping
API_URL = "https://rest.uniprot.org"
POLLING_INTERVAL = 5
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def submit_id_mapping(from_db, to_db, ids) -> str:
    """
    Function to submit a job to perform ID mapping.

    Args:
        from_db (str): The source database.
        to_db (str): The target database.
        ids (list): The list of IDs to map.

    Returns:
        str: The job ID.
    """
    request = requests.post(f"{API_URL}/idmapping/run",
                            data={"from": from_db,
                                  "to": to_db,
                                  "ids": ",".join(ids)},)
    try:
        request.raise_for_status()
    except requests.HTTPError:
        print(request.json())
        raise

    return request.json()["jobId"]

def check_id_mapping_results_ready(job_id):
    """
    Function to check if the ID mapping results are ready.

    Args:
        job_id (str): The job ID.

    Returns:
        bool: True if the results are ready, False otherwise.
    """
    while True:
        request = session.get(f"{API_URL}/idmapping/status/{job_id}")

        try:
            request.raise_for_status()
        except requests.HTTPError:
            print(request.json())
            raise

        j = request.json()
        if "jobStatus" in j:
            if j["jobStatus"] in ("NEW", "RUNNING"):
                print(f"Retrying in {POLLING_INTERVAL}s")
                time.sleep(POLLING_INTERVAL)
            else:
                raise Exception(j["jobStatus"])
        else:
            return bool(j["results"] or j["failedIds"])

def get_id_mapping_results_link(job_id):
    """
    Function to get the link to the ID mapping results.

    Args:
        job_id (str): The job ID.

    Returns:
        str: The link to the ID mapping results.
    """
    url = f"{API_URL}/idmapping/details/{job_id}"
    request = requests.Session().get(url)

    try:
        request.raise_for_status()
    except requests.HTTPError:
        print(request.json())
        raise

    return request.json()["redirectURL"]

def decode_results(response, file_format, compressed):
    """
    Function to decode the ID mapping results.

    Args:
        response (requests.Response): The response object.
        file_format (str): The file format of the results.
        compressed (bool): Whether the results are compressed.

    Returns:
        str: The ID mapping results
    """

    if compressed:
        decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
        if file_format == "json":
            j = json.loads(decompressed.decode("utf-8"))
            return j
        elif file_format == "tsv":
            return [line for line in decompressed.decode("utf-8").split("\n") if line]
        elif file_format == "xlsx":
            return [decompressed]
        elif file_format == "xml":
            return [decompressed.decode("utf-8")]
        else:
            return decompressed.decode("utf-8")
    elif file_format == "json":
        return response.json()
    elif file_format == "tsv":
        return [line for line in response.text.split("\n") if line]
    elif file_format == "xlsx":
        return [response.content]
    elif file_format == "xml":
        return [response.text]
    return response.text

def get_id_mapping_results_stream(url):
    """
    Function to get the ID mapping results from a stream.

    Args:
        url (str): The URL to the ID mapping results.

    Returns:
        str: The ID mapping results.
    """
    if "/stream/" not in url:
        url = url.replace("/results/", "/results/stream/")

    request = session.get(url)

    try:
        request.raise_for_status()
    except requests.HTTPError:
        print(request.json())
        raise

    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    file_format = query["format"][0] if "format" in query else "json"
    compressed = (
        query["compressed"][0].lower() == "true" if "compressed" in query else False
    )
    return decode_results(request, file_format, compressed)
# Define variables to perform UniProt ID mapping
# Adopted from https://www.uniprot.org/help/id_mapping
API_URL = "https://rest.uniprot.org"
POLLING_INTERVAL = 5
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def submit_id_mapping(from_db, to_db, ids) -> str:
    """
    Function to submit a job to perform ID mapping.

    Args:
        from_db (str): The source database.
        to_db (str): The target database.
        ids (list): The list of IDs to map.

    Returns:
        str: The job ID.
    """
    request = requests.post(f"{API_URL}/idmapping/run",
                            data={"from": from_db,
                                  "to": to_db,
                                  "ids": ",".join(ids)},)
    try:
        request.raise_for_status()
    except requests.HTTPError:
        print(request.json())
        raise

    return request.json()["jobId"]

def check_id_mapping_results_ready(job_id):
    """
    Function to check if the ID mapping results are ready.

    Args:
        job_id (str): The job ID.

    Returns:
        bool: True if the results are ready, False otherwise.
    """
    while True:
        request = session.get(f"{API_URL}/idmapping/status/{job_id}")

        try:
            request.raise_for_status()
        except requests.HTTPError:
            print(request.json())
            raise

        j = request.json()
        if "jobStatus" in j:
            if j["jobStatus"] in ("NEW", "RUNNING"):
                print(f"Retrying in {POLLING_INTERVAL}s")
                time.sleep(POLLING_INTERVAL)
            else:
                raise Exception(j["jobStatus"])
        else:
            return bool(j["results"] or j["failedIds"])

def get_id_mapping_results_link(job_id):
    """
    Function to get the link to the ID mapping results.

    Args:
        job_id (str): The job ID.

    Returns:
        str: The link to the ID mapping results.
    """
    url = f"{API_URL}/idmapping/details/{job_id}"
    request = requests.Session().get(url)

    try:
        request.raise_for_status()
    except requests.HTTPError:
        print(request.json())
        raise

    return request.json()["redirectURL"]

def decode_results(response, file_format, compressed):
    """
    Function to decode the ID mapping results.

    Args:
        response (requests.Response): The response object.
        file_format (str): The file format of the results.
        compressed (bool): Whether the results are compressed.

    Returns:
        str: The ID mapping results
    """

    if compressed:
        decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
        if file_format == "json":
            j = json.loads(decompressed.decode("utf-8"))
            return j
        elif file_format == "tsv":
            return [line for line in decompressed.decode("utf-8").split("\n") if line]
        elif file_format == "xlsx":
            return [decompressed]
        elif file_format == "xml":
            return [decompressed.decode("utf-8")]
        else:
            return decompressed.decode("utf-8")
    elif file_format == "json":
        return response.json()
    elif file_format == "tsv":
        return [line for line in response.text.split("\n") if line]
    elif file_format == "xlsx":
        return [response.content]
    elif file_format == "xml":
        return [response.text]
    return response.text

def get_id_mapping_results_stream(url):
    """
    Function to get the ID mapping results from a stream.

    Args:
        url (str): The URL to the ID mapping results.

    Returns:
        str: The ID mapping results.
    """
    if "/stream/" not in url:
        url = url.replace("/results/", "/results/stream/")

    request = session.get(url)

    try:
        request.raise_for_status()
    except requests.HTTPError:
        print(request.json())
        raise

    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    file_format = query["format"][0] if "format" in query else "json"
    compressed = (
        query["compressed"][0].lower() == "true" if "compressed" in query else False
    )
    return decode_results(request, file_format, compressed)

In [3]:

Copied!





# Submit a job to perform ID mapping
inputs = ['6774', '3569']
job_id = submit_id_mapping(
    from_db="GeneID", to_db="UniProtKB", ids=inputs
)
# Submit a job to perform ID mapping
inputs = ['6774', '3569']
job_id = submit_id_mapping(
    from_db="GeneID", to_db="UniProtKB", ids=inputs
)

In [4]:

Copied!

# Print the job ID
print(job_id)
# Print the job ID
print(job_id)

8556e200d5f3bb6ab102e25e58225fa49fa05e88

In [5]:

Copied!





# Check and get the ID mapping results
if check_id_mapping_results_ready(job_id):
    link = get_id_mapping_results_link(job_id)
    mapping_results = get_id_mapping_results_stream(link)
# Check and get the ID mapping results
if check_id_mapping_results_ready(job_id):
    link = get_id_mapping_results_link(job_id)
    mapping_results = get_id_mapping_results_stream(link)

In [6]:

Copied!





# Save the results to a pickle file
local_dir = '../../../../data/primekg_ibd/'
if not os.path.exists(local_dir):
    os.makedirs(local_dir)
with open(os.path.join(local_dir, 'primekg_ibd_protein_mapped.pkl'), 'wb') as f:
    pickle.dump(mapping_results["results"], f)
# Save the results to a pickle file
local_dir = '../../../../data/primekg_ibd/'
if not os.path.exists(local_dir):
    os.makedirs(local_dir)
with open(os.path.join(local_dir, 'primekg_ibd_protein_mapped.pkl'), 'wb') as f:
    pickle.dump(mapping_results["results"], f)

In [7]:

Copied!

# Convert mapping results to a dataframe
protein_mapped_df = pd.DataFrame(mapping_results["results"])
protein_mapped_df.head()
# Convert mapping results to a dataframe
protein_mapped_df = pd.DataFrame(mapping_results["results"])
protein_mapped_df.head()

Out[7]:

	from	to
0	6774	{'entryType': 'UniProtKB reviewed (Swiss-Prot)...
1	6774	{'entryType': 'UniProtKB unreviewed (TrEMBL)',...
2	3569	{'entryType': 'UniProtKB reviewed (Swiss-Prot)...
3	3569	{'entryType': 'UniProtKB unreviewed (TrEMBL)',...
4	3569	{'entryType': 'UniProtKB unreviewed (TrEMBL)',...

In [8]:

Copied!

# Checking duplicated entries based on their entryType
protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1).value_counts(0)
# Checking duplicated entries based on their entryType
protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1).value_counts(0)

Out[8]:

UniProtKB unreviewed (TrEMBL)      5
UniProtKB reviewed (Swiss-Prot)    2
Name: count, dtype: int64

In [9]:

Copied!





# There are two entryType. We choose the reviewed one.
protein_reviewed_df = protein_mapped_df[protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1) == 'UniProtKB reviewed (Swiss-Prot)']
protein_reviewed_df.reset_index(drop=True, inplace=True)
protein_reviewed_df.head()
# There are two entryType. We choose the reviewed one.
protein_reviewed_df = protein_mapped_df[protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1) == 'UniProtKB reviewed (Swiss-Prot)']
protein_reviewed_df.reset_index(drop=True, inplace=True)
protein_reviewed_df.head()

Out[9]:

	from	to
0	6774	{'entryType': 'UniProtKB reviewed (Swiss-Prot)...
1	3569	{'entryType': 'UniProtKB reviewed (Swiss-Prot)...

In [10]:

Copied!

for key in protein_reviewed_df['to'][0].keys():
    protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
protein_reviewed_df.head()
for key in protein_reviewed_df['to'][0].keys():
    protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
protein_reviewed_df.head()

C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
C:\Users\mulyadi\AppData\Local\Temp\ipykernel_8372\1443167319.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]

Out[10]:

	from	to	entryType	primaryAccession	secondaryAccessions	uniProtkbId	entryAudit	annotationScore	organism	proteinExistence	proteinDescription	genes	comments	features	keywords	references	uniProtKBCrossReferences	sequence	extraAttributes
0	6774	{'entryType': 'UniProtKB reviewed (Swiss-Prot)...	UniProtKB reviewed (Swiss-Prot)	P40763	[A8K7B8, K7ENL3, O14916, Q9BW54]	STAT3_HUMAN	{'firstPublicDate': '1995-02-01', 'lastAnnotat...	5.0	{'scientificName': 'Homo sapiens', 'commonName...	1: Evidence at protein level	{'recommendedName': {'fullName': {'evidences':...	[{'geneName': {'evidences': [{'evidenceCode': ...	[{'texts': [{'evidences': [{'evidenceCode': 'E...	[{'type': 'Initiator methionine', 'location': ...	[{'id': 'KW-0002', 'category': 'Technical term...	[{'referenceNumber': 1, 'citation': {'id': '75...	[{'database': 'EMBL', 'id': 'L29277', 'propert...	{'value': 'MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLA...	{'countByCommentType': {'FUNCTION': 1, 'SUBUNI...
1	3569	{'entryType': 'UniProtKB reviewed (Swiss-Prot)...	UniProtKB reviewed (Swiss-Prot)	P05231	[Q9UCU2, Q9UCU3, Q9UCU4]	IL6_HUMAN	{'firstPublicDate': '1987-08-13', 'lastAnnotat...	5.0	{'scientificName': 'Homo sapiens', 'commonName...	1: Evidence at protein level	{'recommendedName': {'fullName': {'evidences':...	[{'geneName': {'evidences': [{'evidenceCode': ...	[{'texts': [{'evidences': [{'evidenceCode': 'E...	[{'type': 'Signal', 'location': {'start': {'va...	[{'id': 'KW-0002', 'category': 'Technical term...	[{'referenceNumber': 1, 'citation': {'id': '34...	[{'database': 'EMBL', 'id': 'X04430', 'propert...	{'value': 'MNSFSTSAFGPVAFSLGLLLVLPAAFPAPVPPGED...	{'countByCommentType': {'FUNCTION': 3, 'SUBUNI...