Running GDR on MNIST image data#

[1]:
path = '/home/mid166/Analysis/Jupyter/Python/Package/PIASO_github'
import sys
sys.path.append(path)
import piaso
/n/data1/hms/neurobio/fishell/mindai/.conda/envs/scda5/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
[2]:
import scanpy as sc
[3]:
import pandas as pd
import numpy as np
[4]:
sc.set_figure_params(dpi=80,dpi_save=300, color_map='viridis',facecolor='white')
from matplotlib import rcParams
rcParams['figure.figsize'] = 4, 4
save_dir='/n/data1/hms/neurobio/fishell/mindai/Result/single-cell/Methods/Emergene'
sc.settings.figdir = save_dir
prefix='GitHub_testing'

Load the MNIST dataset#

[5]:
from sklearn.datasets import fetch_openml
[6]:
mnist = fetch_openml("mnist_784", version=1)
[7]:
mnist.data
[7]:
pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 pixel10 ... pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783 pixel784
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
69995 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
69996 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
69997 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
69998 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
69999 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

70000 rows × 784 columns

[8]:
mnist.target
[8]:
0        5
1        0
2        4
3        1
4        9
        ..
69995    2
69996    3
69997    4
69998    5
69999    6
Name: class, Length: 70000, dtype: category
Categories (10, object): ['0', '1', '2', '3', ..., '6', '7', '8', '9']
[9]:
adata=sc.AnnData(mnist.data)
/n/data1/hms/neurobio/fishell/mindai/.conda/envs/scda5/lib/python3.10/site-packages/anndata/_core/aligned_df.py:68: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
[10]:
adata.obs.head()
[10]:
0
1
2
3
4
[12]:
from scipy import sparse
[13]:
adata.X=sparse.csr_matrix(adata.X)

INFOG Normalization#

[14]:
adata.layers['raw']=adata.X.copy()
[15]:
%%time
piaso.tl.infog(adata, layer='raw', n_top_genes=100)
The normalized data is saved as `infog` in `adata.layers`.
The highly variable genes are saved as `highly_variable` in `adata.var`.
Finished INFOG normalization.
CPU times: user 609 ms, sys: 246 ms, total: 855 ms
Wall time: 856 ms

Setting use_highly_variable=False is needed for this dataset:

[16]:
%%time
piaso.tl.runGDRParallel(
    adata,
    groupby=None,
    resolution=1,
    n_gene =100,
    mu = 10,
    layer='raw', infog_layer='raw',
    score_layer='raw',
    scoring_method='piaso',

    random_seed=1927,

    use_highly_variable =False,
    n_highly_variable_genes = 200,

    n_svd_dims =10,

    max_workers = 32,
    calculate_score_multiBatch=False,
    verbosity=0
)
The cell embeddings calculated by GDR were saved as `X_gdr` in adata.obsm.
CPU times: user 1min 55s, sys: 984 ms, total: 1min 56s
Wall time: 2min 5s
[17]:
%%time
sc.pp.neighbors(adata,
                use_rep='X_gdr',
               n_neighbors=15,random_state=10,knn=True,
                method="umap")

sc.tl.umap(adata)
computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:06)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm)
    'umap', UMAP parameters (adata.uns) (0:00:32)
CPU times: user 50 s, sys: 161 ms, total: 50.1 s
Wall time: 39.2 s
[18]:
adata.obs['class']=mnist.target.values
[19]:
sc.pl.umap(adata,
           color=['class'],
           # layer='raw',
           palette=piaso.pl.color.d_color1,
           # cmap=pos_cmap,
           ncols=3,
           # size=10,
           frameon=False)
../_images/notebooks_PIASO_GDR_on_MNIST_21_0.png
[20]:
import matplotlib.pyplot as plt
import numpy as np


def visualize_mnist_image(image_data):
    # Reshape from 784 pixels to 28x28 image
    image = image_data.reshape(28, 28)

    # Display the image
    plt.figure(figsize=(5, 5))
    plt.imshow(image, cmap='gray')
    plt.axis('off')
    plt.title('MNIST Digit')
    plt.show()
[21]:
visualize_mnist_image(adata[2].X.toarray())
../_images/notebooks_PIASO_GDR_on_MNIST_23_0.png
[22]:
visualize_mnist_image(adata[2].layers['infog'].toarray())
../_images/notebooks_PIASO_GDR_on_MNIST_24_0.png
[23]:
visualize_mnist_image(adata[2].layers['raw'].toarray())
../_images/notebooks_PIASO_GDR_on_MNIST_25_0.png
[24]:
%%time
sc.tl.leiden(adata,resolution=1,key_added='Leiden')
running Leiden clustering
<timed eval>:1: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.

 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
    finished: found 21 clusters and added
    'Leiden', the cluster labels (adata.obs, categorical) (0:01:09)
CPU times: user 1min 9s, sys: 224 ms, total: 1min 9s
Wall time: 1min 9s
[25]:
sc.pl.umap(adata,
           color=['Leiden'],
           # layer='raw',
           palette=piaso.pl.color.d_color3,
           # cmap=pos_cmap,
           ncols=3,
           # size=10,
           frameon=False)
../_images/notebooks_PIASO_GDR_on_MNIST_27_0.png
[26]:
sc.pl.umap(adata,
           color=['Leiden'],
           # layer='raw',
           palette=piaso.pl.color.d_color3,
           legend_fontsize=7,
           legend_fontoutline=2,
           legend_loc='on data',
           # cmap=pos_cmap,
           ncols=3,
           # size=10,
           frameon=False)
../_images/notebooks_PIASO_GDR_on_MNIST_28_0.png
[ ]:

[27]:
sc.pl.umap(adata,
           color=['class'],
           # layer='raw',
           palette=piaso.pl.color.d_color3,
           legend_fontsize=7,
           legend_fontoutline=2,
           legend_loc='on data',
           # cmap=pos_cmap,
           ncols=3,
           # size=10,
           frameon=False)
../_images/notebooks_PIASO_GDR_on_MNIST_30_0.png
[28]:
def visualize_cluster(X, cluster_labels, cluster_id, num_samples=7):
    # Find all samples belonging to the specified cluster
    indices = np.where(cluster_labels == cluster_id)[0]

    # Select random samples from this cluster (or first few if less than requested)
    sample_indices = indices[:min(num_samples, len(indices))]

    # Create a figure with subplots
    fig, axes = plt.subplots(1, len(sample_indices), figsize=(len(sample_indices)*2, 2))

    # Display each sample
    for i, idx in enumerate(sample_indices):
        ax = axes[i] if len(sample_indices) > 1 else axes
        ax.imshow(X[idx].reshape(28, 28), cmap='gray')
        ax.set_title(f'Cluster {cluster_id}')
        ax.axis('off')

    plt.tight_layout()
    plt.show()
[34]:
visualize_cluster(adata.X.todense(),
                  cluster_labels=adata.obs['Leiden'],
                  cluster_id='1',
                 num_samples=7)
../_images/notebooks_PIASO_GDR_on_MNIST_32_0.png
[35]:
visualize_cluster(adata.X.todense(),
                  cluster_labels=adata.obs['Leiden'],
                  cluster_id='2')
../_images/notebooks_PIASO_GDR_on_MNIST_33_0.png
[36]:
visualize_cluster(adata.X.todense(),
                  cluster_labels=adata.obs['Leiden'],
                  cluster_id='4')
../_images/notebooks_PIASO_GDR_on_MNIST_34_0.png

In a supervised way#

[41]:
%%time
piaso.tl.runGDRParallel(
    adata,
    groupby='class',
    resolution=2.0,
    n_gene =100,
    mu = 10,


    layer='raw', infog_layer='raw',
    score_layer='raw',
    scoring_method='piaso',

    random_seed=1927,


    use_highly_variable =False,
    n_highly_variable_genes = 200,

    n_svd_dims =10,

    max_workers = 32,
    calculate_score_multiBatch=False,
    verbosity=0
)
The cell embeddings calculated by GDR were saved as `X_gdr` in adata.obsm.
CPU times: user 474 ms, sys: 246 ms, total: 720 ms
Wall time: 20.5 s
[42]:
adata.obsm['X_umap_gdr']=adata.obsm['X_umap'].copy()
[43]:
%%time
sc.pp.neighbors(adata,
                use_rep='X_gdr',
               n_neighbors=15,random_state=10,knn=True,
                method="umap")

sc.tl.umap(adata)
computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:06)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm)
    'umap', UMAP parameters (adata.uns) (0:00:31)
CPU times: user 48.7 s, sys: 113 ms, total: 48.8 s
Wall time: 37.9 s
[44]:
sc.pl.umap(adata,
           color=['class'],
           # layer='raw',
           palette=piaso.pl.color.d_color1,
           # cmap=pos_cmap,
           ncols=3,
           # size=10,
           frameon=False)
../_images/notebooks_PIASO_GDR_on_MNIST_39_0.png

Run Clustering#

[45]:
%%time
sc.tl.leiden(adata,resolution=1,key_added='Leiden')
running Leiden clustering
    finished: found 19 clusters and added
    'Leiden', the cluster labels (adata.obs, categorical) (0:00:58)
CPU times: user 58.3 s, sys: 189 ms, total: 58.4 s
Wall time: 58.6 s
[46]:
sc.pl.umap(adata,
           color=['Leiden'],
           # layer='raw',
           palette=piaso.pl.color.d_color3,
           # cmap=pos_cmap,
           ncols=3,
           # size=10,
           frameon=False)
../_images/notebooks_PIASO_GDR_on_MNIST_42_0.png
[47]:
sc.pl.umap(adata,
           color=['Leiden'],
           # layer='raw',
           palette=piaso.pl.color.d_color3,
           legend_fontsize=7,
           legend_fontoutline=2,
           legend_loc='on data',
           # cmap=pos_cmap,
           ncols=3,
           # size=10,
           frameon=False)
../_images/notebooks_PIASO_GDR_on_MNIST_43_0.png
[48]:
sc.pl.umap(adata,
           color=['class'],
           # layer='raw',
           palette=piaso.pl.color.d_color3,
           legend_fontsize=7,
           legend_fontoutline=2,
           legend_loc='on data',
           # cmap=pos_cmap,
           ncols=3,
           # size=10,
           frameon=False)
../_images/notebooks_PIASO_GDR_on_MNIST_44_0.png
[50]:
visualize_cluster(adata.X.todense(),
                  cluster_labels=adata.obs['Leiden'],
                  cluster_id='1',
                 num_samples=7)
../_images/notebooks_PIASO_GDR_on_MNIST_45_0.png
[51]:
visualize_cluster(adata.X.todense(),
                  cluster_labels=adata.obs['Leiden'],
                  cluster_id='2')
../_images/notebooks_PIASO_GDR_on_MNIST_46_0.png
[53]:
visualize_cluster(adata.X.todense(),
                  cluster_labels=adata.obs['Leiden'],
                  cluster_id='12')
../_images/notebooks_PIASO_GDR_on_MNIST_47_0.png
[ ]: