UMAP Visualization
In [ ]:
Copied!
# import libraries
import pandas as pd
import numpy as np
import umap
import plotly.express as px
from sklearn.preprocessing import StandardScaler
# import libraries
import pandas as pd
import numpy as np
import umap
import plotly.express as px
from sklearn.preprocessing import StandardScaler
In [ ]:
Copied!
# Check whether the data is already standardized
def is_data_normalized(data, tolerance=1e-3):
    """
    Check if the data is normalized (mean ~ 0 and std ~ 1 for each column).

    A column whose standard deviation is effectively zero is also accepted,
    provided its mean is ~ 0. Such a column cannot be rescaled to unit
    variance, so standardized data that contains a constant feature would
    otherwise always be reported as "not normalized".

    Parameters:
        data (array-like): Data to check. Statistics are computed per column
            (along axis 0).
        tolerance (float): Absolute tolerance for the mean and std checks.

    Returns:
        bool: True if data is normalized, False otherwise. Empty input and
        columns containing NaN yield False.
    """
    values = np.asarray(data)

    # Nothing can be measured on empty input; report "not normalized"
    # instead of letting np.mean/np.std warn about an empty slice.
    if values.size == 0:
        return False

    column_means = np.mean(values, axis=0)
    column_stds = np.std(values, axis=0)

    mean_check = np.allclose(column_means, 0, atol=tolerance)

    # A feature passes if it has unit spread or no spread at all.
    has_unit_std = np.isclose(column_stds, 1, atol=tolerance)
    is_constant = column_stds <= 10 * np.finfo(float).eps
    std_check = np.all(has_unit_std | is_constant)

    return bool(mean_check and std_check)
In [ ]:
Copied!
# Visualize single-cell data using UMAP
def visualize_umap(data, labels=None, n_neighbors=15, min_dist=0.1, n_components=2, random_state=42):
    """
    Visualize single-cell gene expression data on a UMAP plot.

    The data is standardized with StandardScaler unless is_data_normalized()
    reports that it already is, embedded with UMAP, and drawn as an
    interactive Plotly scatter plot (2D or 3D).

    Parameters:
        data (pd.DataFrame or np.ndarray): Gene expression data. Rows are cells, columns are features/genes.
        labels (pd.Series or np.ndarray, optional): Labels or annotations for the cells (e.g., cell type, tissue).
            Labels are matched to rows of `data` by position; any pandas index is ignored.
        n_neighbors (int): Number of neighbors for UMAP.
        min_dist (float): Minimum distance between points on the UMAP plot.
        n_components (int): Number of dimensions for the UMAP embedding (default: 2 for 2D visualization).
        random_state (int): Random state for reproducibility.

    Returns:
        fig (plotly.graph_objects.Figure): Interactive UMAP plot.

    Raises:
        ValueError: If n_components is neither 2 nor 3, or if the number of
            labels differs from the number of rows in `data`.
    """
    # Fail fast: reject unsupported dimensionality before the costly UMAP fit.
    if n_components not in (2, 3):
        raise ValueError("Only 2D and 3D visualizations are supported.")

    # Ensure data is in the right format
    data_array = data.to_numpy() if isinstance(data, pd.DataFrame) else data

    label_values = None
    if labels is not None:
        # Strip any pandas index. Assigning a Series to a DataFrame column
        # aligns on index, which would turn labels keyed by cell ID into NaN.
        label_values = np.asarray(labels)
        if label_values.ndim > 0 and len(label_values) != len(data_array):
            raise ValueError(
                f"Number of labels ({len(label_values)}) does not match "
                f"number of rows in data ({len(data_array)})."
            )

    # Check if data is normalized
    if is_data_normalized(data_array):
        print("Data is already normalized. Skipping normalization...")
        data_scaled = data_array
    else:
        print("Data is not normalized. Performing normalization...")
        data_scaled = StandardScaler().fit_transform(data_array)

    # Apply UMAP
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=random_state,
    )
    embedding = umap_model.fit_transform(data_scaled)

    # Create a DataFrame for the embedding
    axis_columns = [f"UMAP_{i + 1}" for i in range(n_components)]
    umap_df = pd.DataFrame(embedding, columns=axis_columns)

    # Add labels if provided
    if label_values is not None:
        umap_df["Labels"] = label_values

    # Arguments shared by the 2D and 3D plots
    shared_plot_args = dict(
        color="Labels" if label_values is not None else None,
        labels={"Labels": "Annotations"},
        hover_data=list(umap_df.columns),
    )

    # Plot using Plotly
    if n_components == 2:
        fig = px.scatter(
            umap_df, x="UMAP_1", y="UMAP_2",
            title="UMAP Visualization of Single-Cell Data",
            **shared_plot_args,
        )
    else:
        fig = px.scatter_3d(
            umap_df, x="UMAP_1", y="UMAP_2", z="UMAP_3",
            title="UMAP Visualization of Single-Cell Data (3D)",
            **shared_plot_args,
        )

    # Customize plot appearance
    fig.update_traces(marker=dict(size=5, opacity=0.8), selector=dict(mode="markers"))
    fig.update_layout(legend_title="Cell Annotations", legend=dict(itemsizing="constant"))
    return fig
In [ ]:
Copied!
# Example usage
if __name__ == "__main__":
    # Simulated gene expression data (e.g., 100 cells with 50 genes each).
    # A seeded generator keeps the demo data identical from run to run.
    rng = np.random.default_rng(42)
    data = rng.random((100, 50))
    labels = rng.choice(["Type A", "Type B", "Type C"], size=100)
    # Create UMAP plot
    fig = visualize_umap(data, labels=labels, n_components=2)
    fig.show()
/Users/jaydeepbhat/Documents/Hackathon/2024_AI_Agent/scripts/talk2cells/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Data is not normalized. Performing normalization...
/Users/jaydeepbhat/Documents/Hackathon/2024_AI_Agent/scripts/talk2cells/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(