Cell Annotation

In [ ]:

Copied!





# import libraries
import numpy as np
import pandas as pd
import scanpy as sc
import plotly.express as px
# import libraries
import numpy as np
import pandas as pd
import scanpy as sc
import plotly.express as px

In [ ]:

Copied!





# Function to annotate cells
def annotate_cells(data, method="custom_gene_list", gene_list=None, annotation_map=None, external_tool="CIBERSORT", database_results=None, expression_threshold=1):
    """
    Annotates cells based on single-cell gene expression data.

    Parameters:
    - data: AnnData-like object. It must expose `shape`, `var_names`, `obs` and
      support `data[:, genes].X` indexing (a plain DataFrame does not).
    - method: "custom_gene_list" or "external_database".
    - gene_list: Marker genes in priority order (required if method="custom_gene_list").
      Genes missing from `data.var_names` are ignored.
    - annotation_map: Dictionary mapping genes to cell types (required if method="custom_gene_list").
      A marker without an entry is labelled "Unknown".
    - external_tool: Name of the external database/tool (e.g., "CIBERSORT", "ImmuneCellAI").
      Only used in the progress message.
    - database_results: Results from an external database (required if method="external_database").
      Its "cell_type" entry is stored as the annotations.
    - expression_threshold: A marker counts as expressed when its value is strictly
      greater than this threshold (default 1).

    Returns:
    - data: The same object, with a new `obs` column "annotations".

    Raises:
    - ValueError: If required arguments for the chosen method are missing, or the
      method name is not recognised.
    """
    if method == "custom_gene_list":
        if gene_list is None or annotation_map is None:
            raise ValueError("A gene list and annotation map must be provided for the 'custom_gene_list' method.")
        print("Annotating cells using custom gene list...")

        # Keep only markers that exist in the data, preserving priority order.
        # Repeated genes are dropped: only their first occurrence can ever win.
        markers = [gene for gene in dict.fromkeys(gene_list) if gene in data.var_names]

        n_cells = data.shape[0]
        annotations = np.full(n_cells, "Unannotated", dtype=object)

        if markers:
            # One slice for all markers instead of one view per (cell, gene) pair.
            expression = data[:, markers].X
            if hasattr(expression, "toarray"):  # sparse matrix -> dense
                expression = expression.toarray()
            expression = np.asarray(expression).reshape(n_cells, len(markers))

            expressed = expression > expression_threshold
            has_marker = expressed.any(axis=1)
            # argmax on a boolean row yields the first True, i.e. the
            # highest-priority expressed marker for that cell.
            first_marker = expressed.argmax(axis=1)

            # A falsy label (e.g. "") falls back to "Unannotated".
            labels = np.array(
                [annotation_map.get(gene, "Unknown") or "Unannotated" for gene in markers],
                dtype=object,
            )
            annotations[has_marker] = labels[first_marker[has_marker]]

        data.obs["annotations"] = annotations.tolist()

    elif method == "external_database":
        if database_results is None:
            raise ValueError("Database results must be provided for the 'external_database' method.")
        print(f"Annotating cells using {external_tool} results...")
        data.obs["annotations"] = database_results["cell_type"]

    else:
        raise ValueError("Invalid method. Choose either 'custom_gene_list' or 'external_database'.")

    return data
# Function to annotate cells
def annotate_cells(data, method="custom_gene_list", gene_list=None, annotation_map=None, external_tool="CIBERSORT", database_results=None, expression_threshold=1):
    """
    Annotates cells based on single-cell gene expression data.

    Parameters:
    - data: AnnData-like object. It must expose `shape`, `var_names`, `obs` and
      support `data[:, genes].X` indexing (a plain DataFrame does not).
    - method: "custom_gene_list" or "external_database".
    - gene_list: Marker genes in priority order (required if method="custom_gene_list").
      Genes missing from `data.var_names` are ignored.
    - annotation_map: Dictionary mapping genes to cell types (required if method="custom_gene_list").
      A marker without an entry is labelled "Unknown".
    - external_tool: Name of the external database/tool (e.g., "CIBERSORT", "ImmuneCellAI").
      Only used in the progress message.
    - database_results: Results from an external database (required if method="external_database").
      Its "cell_type" entry is stored as the annotations.
    - expression_threshold: A marker counts as expressed when its value is strictly
      greater than this threshold (default 1).

    Returns:
    - data: The same object, with a new `obs` column "annotations".

    Raises:
    - ValueError: If required arguments for the chosen method are missing, or the
      method name is not recognised.
    """
    if method == "custom_gene_list":
        if gene_list is None or annotation_map is None:
            raise ValueError("A gene list and annotation map must be provided for the 'custom_gene_list' method.")
        print("Annotating cells using custom gene list...")

        # Keep only markers that exist in the data, preserving priority order.
        # Repeated genes are dropped: only their first occurrence can ever win.
        markers = [gene for gene in dict.fromkeys(gene_list) if gene in data.var_names]

        n_cells = data.shape[0]
        annotations = np.full(n_cells, "Unannotated", dtype=object)

        if markers:
            # One slice for all markers instead of one view per (cell, gene) pair.
            expression = data[:, markers].X
            if hasattr(expression, "toarray"):  # sparse matrix -> dense
                expression = expression.toarray()
            expression = np.asarray(expression).reshape(n_cells, len(markers))

            expressed = expression > expression_threshold
            has_marker = expressed.any(axis=1)
            # argmax on a boolean row yields the first True, i.e. the
            # highest-priority expressed marker for that cell.
            first_marker = expressed.argmax(axis=1)

            # A falsy label (e.g. "") falls back to "Unannotated".
            labels = np.array(
                [annotation_map.get(gene, "Unknown") or "Unannotated" for gene in markers],
                dtype=object,
            )
            annotations[has_marker] = labels[first_marker[has_marker]]

        data.obs["annotations"] = annotations.tolist()

    elif method == "external_database":
        if database_results is None:
            raise ValueError("Database results must be provided for the 'external_database' method.")
        print(f"Annotating cells using {external_tool} results...")
        data.obs["annotations"] = database_results["cell_type"]

    else:
        raise ValueError("Invalid method. Choose either 'custom_gene_list' or 'external_database'.")

    return data

In [ ]:

Copied!





# UMAP Visualization Function
def visualize_annotations_on_umap(data, annotations_column="annotations", n_pcs=50):
    """
    Visualizes annotations on UMAP.

    If `data.obsm` has no "X_umap" entry, PCA, the neighbour graph and UMAP are
    computed in place on `data` first.

    Parameters:
    - data: AnnData object (must expose `obsm`, `obs` and `shape`).
    - annotations_column: Name of the `data.obs` column containing cell annotations.
    - n_pcs: Number of principal components to use for UMAP computation. Clamped to
      `min(n_cells, n_genes) - 1` so small datasets do not make PCA fail.

    Returns:
    - fig: Plotly figure with annotated UMAP.
    """
    # Ensure UMAP coordinates exist
    if "X_umap" not in data.obsm:
        print("UMAP coordinates not found. Computing UMAP...")

        # PCA cannot return as many components as the smaller data dimension,
        # so cap the request instead of letting the solver raise.
        n_comps = min(n_pcs, min(data.shape) - 1)

        # Reduce dimensions using PCA
        print(f"Reducing dimensionality to {n_comps} principal components...")
        sc.pp.pca(data, n_comps=n_comps)

        # Compute neighbors and UMAP
        sc.pp.neighbors(data, use_rep="X_pca")  # Use PCA-reduced data
        sc.tl.umap(data)

    # Only the first two embedding axes are plotted; slicing keeps this working
    # when the stored embedding has more than two components.
    umap_df = pd.DataFrame(data.obsm["X_umap"][:, :2], columns=["UMAP1", "UMAP2"])
    # `.values` drops the obs index so rows are matched by position.
    umap_df[annotations_column] = data.obs[annotations_column].values

    # Create Plotly UMAP visualization
    fig = px.scatter(
        umap_df,
        x="UMAP1",
        y="UMAP2",
        color=annotations_column,
        title="UMAP with Cell Annotations",
        labels={annotations_column: "Cell Type"},
    )
    fig.update_traces(marker=dict(size=5, opacity=0.8))
    return fig
# UMAP Visualization Function
def visualize_annotations_on_umap(data, annotations_column="annotations", n_pcs=50):
    """
    Visualizes annotations on UMAP.

    If `data.obsm` has no "X_umap" entry, PCA, the neighbour graph and UMAP are
    computed in place on `data` first.

    Parameters:
    - data: AnnData object (must expose `obsm`, `obs` and `shape`).
    - annotations_column: Name of the `data.obs` column containing cell annotations.
    - n_pcs: Number of principal components to use for UMAP computation. Clamped to
      `min(n_cells, n_genes) - 1` so small datasets do not make PCA fail.

    Returns:
    - fig: Plotly figure with annotated UMAP.
    """
    # Ensure UMAP coordinates exist
    if "X_umap" not in data.obsm:
        print("UMAP coordinates not found. Computing UMAP...")

        # PCA cannot return as many components as the smaller data dimension,
        # so cap the request instead of letting the solver raise.
        n_comps = min(n_pcs, min(data.shape) - 1)

        # Reduce dimensions using PCA
        print(f"Reducing dimensionality to {n_comps} principal components...")
        sc.pp.pca(data, n_comps=n_comps)

        # Compute neighbors and UMAP
        sc.pp.neighbors(data, use_rep="X_pca")  # Use PCA-reduced data
        sc.tl.umap(data)

    # Only the first two embedding axes are plotted; slicing keeps this working
    # when the stored embedding has more than two components.
    umap_df = pd.DataFrame(data.obsm["X_umap"][:, :2], columns=["UMAP1", "UMAP2"])
    # `.values` drops the obs index so rows are matched by position.
    umap_df[annotations_column] = data.obs[annotations_column].values

    # Create Plotly UMAP visualization
    fig = px.scatter(
        umap_df,
        x="UMAP1",
        y="UMAP2",
        color=annotations_column,
        title="UMAP with Cell Annotations",
        labels={annotations_column: "Cell Type"},
    )
    fig.update_traces(marker=dict(size=5, opacity=0.8))
    return fig

In [ ]:

Copied!





# Example Usage
if __name__ == "__main__":
    # Marker gene -> cell type lookup (T-cells, B-cells, Monocytes).
    annotation_map = {"CD3D": "T cells", "CD79A": "B cells", "LYZ": "Monocytes"}
    # The marker list is the map's keys, in insertion (priority) order.
    example_gene_list = list(annotation_map)

    # Fetch the PBMC 3k example dataset (AnnData), then normalise each cell's
    # total counts to 1e4 and log1p-transform.
    adata = sc.datasets.pbmc3k()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Label every cell from the marker genes.
    adata = annotate_cells(
        adata,
        method="custom_gene_list",
        gene_list=example_gene_list,
        annotation_map=annotation_map,
    )

    # Plot the labels on a UMAP embedding and display the figure.
    fig = visualize_annotations_on_umap(
        adata,
        annotations_column="annotations",
        n_pcs=50,
    )
    fig.show()
# Example Usage
if __name__ == "__main__":
    # Marker gene -> cell type lookup (T-cells, B-cells, Monocytes).
    annotation_map = {"CD3D": "T cells", "CD79A": "B cells", "LYZ": "Monocytes"}
    # The marker list is the map's keys, in insertion (priority) order.
    example_gene_list = list(annotation_map)

    # Fetch the PBMC 3k example dataset (AnnData), then normalise each cell's
    # total counts to 1e4 and log1p-transform.
    adata = sc.datasets.pbmc3k()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Label every cell from the marker genes.
    adata = annotate_cells(
        adata,
        method="custom_gene_list",
        gene_list=example_gene_list,
        annotation_map=annotation_map,
    )

    # Plot the labels on a UMAP embedding and display the figure.
    fig = visualize_annotations_on_umap(
        adata,
        annotations_column="annotations",
        n_pcs=50,
    )
    fig.show()

Annotating cells using custom gene list...
UMAP coordinates not found. Computing UMAP...
Reducing dimensionality to 50 principal components...

/Users/jaydeepbhat/Documents/Hackathon/2024_AI_Agent/scripts/talk2cells/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm