Cell Discovery
In [ ]:
Copied!
# import libraries
import scanpy as sc
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
# import libraries
import scanpy as sc
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
Copied!
def discover_new_annotations(data, clustering_resolution=0.8, n_pcs=50):
    """
    Discovers new cell annotations using Leiden clustering and computes a UMAP embedding.

    The AnnData object is updated in place by the scanpy calls below (PCA,
    neighbor graph, `leiden` labels, UMAP coordinates) and is also returned.

    Parameters:
    - data: AnnData object containing single-cell gene expression data.
    - clustering_resolution: Resolution for Leiden clustering (higher values create more clusters).
    - n_pcs: Requested number of principal components. Capped at
      `min(n_obs, n_vars) - 1` so that small datasets do not make the PCA step fail.
    Returns:
    - data: Annotated AnnData object with new clusters in `data.obs["discovered_clusters"]`.
    """
    # PCA cannot extract more components than the smaller data dimension allows,
    # so cap the request the same way scanpy caps its own default.
    max_comps = min(data.n_obs, data.n_vars) - 1
    n_comps = n_pcs
    if 1 <= max_comps < n_pcs:
        n_comps = max_comps
        print(f"Requested n_pcs={n_pcs} exceeds the data dimensions; using {n_comps} components.")

    # Preprocess data: reduce dimensionality, then build the neighbor graph on the PCA space
    sc.pp.pca(data, n_comps=n_comps)
    sc.pp.neighbors(data, use_rep="X_pca")

    # Perform clustering.
    # NOTE: scanpy emits a FutureWarning here announcing that the default leiden
    # backend will switch to igraph. The call is left on the current default so
    # existing cluster assignments do not change.
    sc.tl.leiden(data, resolution=clustering_resolution)
    # Keep the labels under a dedicated column name; the original "leiden" column is retained
    data.obs["discovered_clusters"] = data.obs["leiden"]

    # Compute UMAP
    sc.tl.umap(data)

    print("Clusters discovered and annotated.")
    return data
def discover_new_annotations(data, clustering_resolution=0.8, n_pcs=50):
    """
    Discovers new cell annotations using Leiden clustering and computes a UMAP embedding.

    The AnnData object is updated in place by the scanpy calls below (PCA,
    neighbor graph, `leiden` labels, UMAP coordinates) and is also returned.

    Parameters:
    - data: AnnData object containing single-cell gene expression data.
    - clustering_resolution: Resolution for Leiden clustering (higher values create more clusters).
    - n_pcs: Requested number of principal components. Capped at
      `min(n_obs, n_vars) - 1` so that small datasets do not make the PCA step fail.
    Returns:
    - data: Annotated AnnData object with new clusters in `data.obs["discovered_clusters"]`.
    """
    # PCA cannot extract more components than the smaller data dimension allows,
    # so cap the request the same way scanpy caps its own default.
    max_comps = min(data.n_obs, data.n_vars) - 1
    n_comps = n_pcs
    if 1 <= max_comps < n_pcs:
        n_comps = max_comps
        print(f"Requested n_pcs={n_pcs} exceeds the data dimensions; using {n_comps} components.")

    # Preprocess data: reduce dimensionality, then build the neighbor graph on the PCA space
    sc.pp.pca(data, n_comps=n_comps)
    sc.pp.neighbors(data, use_rep="X_pca")

    # Perform clustering.
    # NOTE: scanpy emits a FutureWarning here announcing that the default leiden
    # backend will switch to igraph. The call is left on the current default so
    # existing cluster assignments do not change.
    sc.tl.leiden(data, resolution=clustering_resolution)
    # Keep the labels under a dedicated column name; the original "leiden" column is retained
    data.obs["discovered_clusters"] = data.obs["leiden"]

    # Compute UMAP
    sc.tl.umap(data)

    print("Clusters discovered and annotated.")
    return data
In [ ]:
Copied!
# Visualize clusters using UMAP
def visualize_clusters_on_umap(data, cluster_column="discovered_clusters"):
    """
    Builds a Plotly scatter plot of the UMAP embedding, colored by cluster.

    Parameters:
    - data: AnnData object; `data.obsm["X_umap"]` and `data.obs[cluster_column]` are read.
    - cluster_column: Name of the `data.obs` column used to color the points.
    Returns:
    - fig: Plotly figure. It is not displayed here; the caller decides when to show it.
    """
    # Put the two embedding coordinates into a frame Plotly can consume
    axis_names = ["UMAP1", "UMAP2"]
    embedding = pd.DataFrame(data.obsm["X_umap"], columns=axis_names)
    # `.values` drops the obs index so labels are assigned by position
    embedding[cluster_column] = data.obs[cluster_column].values

    scatter_options = {
        "x": "UMAP1",
        "y": "UMAP2",
        "color": cluster_column,
        "title": "UMAP with Discovered Clusters",
        "labels": {cluster_column: "Cluster"},
    }
    fig = px.scatter(embedding, **scatter_options)

    # Small, slightly transparent markers keep dense regions readable
    fig.update_traces(marker={"size": 5, "opacity": 0.8})
    return fig
# Visualize clusters using UMAP
def visualize_clusters_on_umap(data, cluster_column="discovered_clusters"):
    """
    Builds a Plotly scatter plot of the UMAP embedding, colored by cluster.

    Parameters:
    - data: AnnData object; `data.obsm["X_umap"]` and `data.obs[cluster_column]` are read.
    - cluster_column: Name of the `data.obs` column used to color the points.
    Returns:
    - fig: Plotly figure. It is not displayed here; the caller decides when to show it.
    """
    # Put the two embedding coordinates into a frame Plotly can consume
    axis_names = ["UMAP1", "UMAP2"]
    embedding = pd.DataFrame(data.obsm["X_umap"], columns=axis_names)
    # `.values` drops the obs index so labels are assigned by position
    embedding[cluster_column] = data.obs[cluster_column].values

    scatter_options = {
        "x": "UMAP1",
        "y": "UMAP2",
        "color": cluster_column,
        "title": "UMAP with Discovered Clusters",
        "labels": {cluster_column: "Cluster"},
    }
    fig = px.scatter(embedding, **scatter_options)

    # Small, slightly transparent markers keep dense regions readable
    fig.update_traces(marker={"size": 5, "opacity": 0.8})
    return fig
In [ ]:
Copied!
# Visualize gene expression in clusters using a bubble plot
def plot_bubble_matrix(adata, cluster_column="discovered_clusters", top_n_genes=10):
    """
    Draws a bubble plot of the mean expression of selected genes in each cluster.

    Parameters:
    - adata: AnnData object; `adata.obs[cluster_column]`, `adata.var_names` and `.X` are read.
    - cluster_column: Column name in `adata.obs` containing cluster labels.
    - top_n_genes: Number of genes to plot. The first `top_n_genes` entries of
      `adata.var_names` are taken as-is; no ranking or marker detection is done here.
    Returns:
    - None. The figure is displayed with `plt.show()`.
    """
    # Get the cluster labels (computed once and reused for the row index below)
    cluster_labels = adata.obs[cluster_column]
    clusters = cluster_labels.unique()

    # Genes to display: simply the leading entries of var_names
    marker_genes = adata.var_names[:top_n_genes]

    # Collect the mean expression of each selected gene, one row per cluster
    cluster_expression = []
    for cluster in clusters:
        # Cells of the current cluster, restricted to the selected genes
        cluster_cells = adata[cluster_labels == cluster, marker_genes]
        mean_expression = cluster_cells.X.mean(axis=0)
        # Depending on how X is stored, mean() may give a 1 x n matrix or a
        # 1-D array; flatten both to 1-D so the rows stack into a 2-D array.
        cluster_expression.append(np.asarray(mean_expression).ravel())

    # Convert cluster_expression to a 2D numpy array (clusters x genes)
    cluster_expression = np.array(cluster_expression)

    # Long-format table: one row per (cluster, gene) pair
    bubble_df = pd.DataFrame(
        cluster_expression,
        columns=marker_genes,
        index=clusters,
    )
    bubble_df.index.name = "Cluster"
    bubble_df = bubble_df.reset_index().melt(id_vars="Cluster", var_name="Gene", value_name="Expression")

    # Plotting the bubble plot: both bubble size and color encode expression
    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        data=bubble_df,
        x="Gene",
        y="Cluster",
        size="Expression",
        hue="Expression",
        palette="viridis",
        sizes=(50, 500),
        marker="o",
    )

    # Adjust the legend position to the right side, outside the axes
    plt.legend(title="Expression", bbox_to_anchor=(1.05, 0.5), loc="center left")

    # Title and label adjustments
    plt.title("Gene Expression in Clusters")
    plt.xticks(rotation=90)
    plt.tight_layout()  # Ensure the plot fits within the figure space

    # Show the plot
    plt.show()
# Visualize gene expression in clusters using a bubble plot
def plot_bubble_matrix(adata, cluster_column="discovered_clusters", top_n_genes=10):
    """
    Draws a bubble plot of the mean expression of selected genes in each cluster.

    Parameters:
    - adata: AnnData object; `adata.obs[cluster_column]`, `adata.var_names` and `.X` are read.
    - cluster_column: Column name in `adata.obs` containing cluster labels.
    - top_n_genes: Number of genes to plot. The first `top_n_genes` entries of
      `adata.var_names` are taken as-is; no ranking or marker detection is done here.
    Returns:
    - None. The figure is displayed with `plt.show()`.
    """
    # Get the cluster labels (computed once and reused for the row index below)
    cluster_labels = adata.obs[cluster_column]
    clusters = cluster_labels.unique()

    # Genes to display: simply the leading entries of var_names
    marker_genes = adata.var_names[:top_n_genes]

    # Collect the mean expression of each selected gene, one row per cluster
    cluster_expression = []
    for cluster in clusters:
        # Cells of the current cluster, restricted to the selected genes
        cluster_cells = adata[cluster_labels == cluster, marker_genes]
        mean_expression = cluster_cells.X.mean(axis=0)
        # Depending on how X is stored, mean() may give a 1 x n matrix or a
        # 1-D array; flatten both to 1-D so the rows stack into a 2-D array.
        cluster_expression.append(np.asarray(mean_expression).ravel())

    # Convert cluster_expression to a 2D numpy array (clusters x genes)
    cluster_expression = np.array(cluster_expression)

    # Long-format table: one row per (cluster, gene) pair
    bubble_df = pd.DataFrame(
        cluster_expression,
        columns=marker_genes,
        index=clusters,
    )
    bubble_df.index.name = "Cluster"
    bubble_df = bubble_df.reset_index().melt(id_vars="Cluster", var_name="Gene", value_name="Expression")

    # Plotting the bubble plot: both bubble size and color encode expression
    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        data=bubble_df,
        x="Gene",
        y="Cluster",
        size="Expression",
        hue="Expression",
        palette="viridis",
        sizes=(50, 500),
        marker="o",
    )

    # Adjust the legend position to the right side, outside the axes
    plt.legend(title="Expression", bbox_to_anchor=(1.05, 0.5), loc="center left")

    # Title and label adjustments
    plt.title("Gene Expression in Clusters")
    plt.xticks(rotation=90)
    plt.tight_layout()  # Ensure the plot fits within the figure space

    # Show the plot
    plt.show()
In [ ]:
Copied!
# Example Usage
if __name__ == "__main__":
    # obs column shared by the clustering step and both plots
    cluster_key = "discovered_clusters"

    # Fetch the example PBMC dataset, then normalize and log-transform it
    # before any dimensionality reduction is attempted
    adata = sc.datasets.pbmc3k()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Cluster the cells and compute the UMAP embedding
    adata = discover_new_annotations(adata, clustering_resolution=0.8, n_pcs=50)

    # Interactive UMAP colored by the discovered clusters
    fig = visualize_clusters_on_umap(adata, cluster_column=cluster_key)
    fig.show()

    # Bubble plot of per-cluster mean expression for the first 10 genes
    plot_bubble_matrix(adata, cluster_column=cluster_key, top_n_genes=10)
# Example Usage
if __name__ == "__main__":
    # obs column shared by the clustering step and both plots
    cluster_key = "discovered_clusters"

    # Fetch the example PBMC dataset, then normalize and log-transform it
    # before any dimensionality reduction is attempted
    adata = sc.datasets.pbmc3k()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Cluster the cells and compute the UMAP embedding
    adata = discover_new_annotations(adata, clustering_resolution=0.8, n_pcs=50)

    # Interactive UMAP colored by the discovered clusters
    fig = visualize_clusters_on_umap(adata, cluster_column=cluster_key)
    fig.show()

    # Bubble plot of per-cluster mean expression for the first 10 genes
    plot_bubble_matrix(adata, cluster_column=cluster_key, top_n_genes=10)
/Users/jaydeepbhat/Documents/Hackathon/2024_AI_Agent/scripts/talk2cells/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm /var/folders/vn/z_dtm8w13b92pwg7wrgkkfx00000gn/T/ipykernel_56284/3840506039.py:26: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg. To achieve the future defaults please pass: flavor="igraph" and n_iterations=2. directed must also be False to work with igraph's implementation. sc.tl.leiden(data, resolution=clustering_resolution) # Leiden clustering algorithm
Clusters discovered and annotated.