Deconvolution_Tangram_ST_PDAC

This tutorial demonstrates deconvolution on ST PDAC data using SODB and TANGRAM.

[ ]:

# import several Python libraries, including:
# scanpy: a Python package for single-cell RNA sequencing analysis.
# squidpy: a Python package for spatial transcriptomics analysis.
# numpy: a Python package for scientific computing with arrays.
# pandas: a Python package for data manipulation and analysis.
# anndata: a Python package for handling annotated data objects in genomics.
# pathlib: a Python module for working with file system paths.
# matplotlib: a Python plotting library.
# skimage: a Python package for image processing.
import scanpy as sc
import squidpy as sq
import numpy as np
import pandas as pd
import anndata as ad
from anndata import AnnData
import pathlib
import matplotlib.pyplot as plt
import matplotlib as mpl
import skimage

[ ]:

# import tangram for spatial deconvolution
import tangram as tg

[86]:

# Record the software environment so the run can be reproduced:
# scanpy's header lists scanpy and its core dependencies,
# squidpy and tangram are printed separately.
sc.logging.print_header()
print(f"squidpy=={sq.__version__}")
print(f"tangram=={tg.__version__}")

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.22.4 scipy==1.9.3 pandas==1.5.1 scikit-learn==1.1.3 statsmodels==0.13.5 python-igraph==0.10.2 pynndescent==0.5.8
squidpy==1.2.3
tangram==1.0.3

[87]:

# Load the single-cell reference from CSV:
#   pd_sc      - expression table (its rows become genes and its columns become cells further down)
#   pd_sc_meta - per-cell metadata; its 'Cell' and 'Cell_type' columns are used further down
pd_sc = pd.read_csv('data/pdac/sc_data.csv')
pd_sc_meta = pd.read_csv('data/pdac/sc_meta.csv')

[88]:

# Use the 'Unnamed: 0' column as the row index of "pd_sc"
# ('Unnamed: 0' is the name pandas gives to a first column that has no header).
# This index is read as the gene names further down.
pd_sc = pd_sc.set_index('Unnamed: 0')

[89]:

# Index the metadata table "pd_sc_meta" by its 'Cell' column.
# NOTE(review): the cell type labels are later attached to adata_sc by row position,
# not by this index - confirm that the rows are in the same order as the columns of "pd_sc".
pd_sc_meta = pd_sc_meta.set_index('Cell')

[90]:

# Convert the pandas DataFrame "pd_sc" into plain NumPy arrays
# row labels -> gene names, column labels -> cell (observation) names
sc_genes = pd_sc.index.to_numpy(copy=True)
sc_obs = pd_sc.columns.to_numpy(copy=True)
# transpose the values so that rows follow "sc_obs" and columns follow "sc_genes"
sc_X = pd_sc.to_numpy(copy=True).T

[91]:

# Build the single-cell AnnData object: "sc_X" is the data matrix,
# "sc_obs" names its rows (cells) and "sc_genes" names its columns (genes)
adata_sc = ad.AnnData(
    sc_X,
    obs=pd.DataFrame(index=sc_obs),
    var=pd.DataFrame(index=sc_genes),
)
# Attach the cell type labels as adata_sc.obs['CellType'].
# NOTE(review): the labels are copied by row position, not matched on cell name -
# this assumes "pd_sc_meta" lists the cells in the same order as "sc_obs"; confirm.
adata_sc.obs['CellType'] = pd_sc_meta['Cell_type'].to_numpy()

[92]:

# Display a summary of adata_sc (its dimensions and annotation fields)
adata_sc

[92]:

AnnData object with n_obs × n_vars = 1926 × 19104
    obs: 'CellType'

[ ]:

# Import the pysodb library
# pysodb is the Python client for SODB (Spatial Omics DataBase).
# It is used below to load a spatial omics experiment from SODB by dataset name and experiment name.
import pysodb

[93]:

# Create the SODB client object; it is used below to load the experiment
sodb = pysodb.SODB()

[94]:

# Identify the experiment to load: the dataset name and the name of one experiment in it
dataset_name = 'moncada2020integrating'
experiment_name = 'GSM3036911_spatial_transcriptomics'
# Load that experiment into "adata_st".
# load_experiment takes two arguments: the dataset name and the experiment name.
# Available dataset and experiment names are listed at https://gene.ai.tencent.com/SpatialOmics/.
adata_st = sodb.load_experiment(dataset_name,experiment_name)

load experiment[GSM3036911_spatial_transcriptomics] in dataset[moncada2020integrating]

[95]:

# Perform differential gene expression analysis across the 'CellType' groups of "adata_sc".
# use_raw=False runs the test on adata_sc.X.
# The method is named explicitly: scanpy changed its default from 't-test_overestim_var'
# to 't-test' and logs a warning when the argument is left out, so pinning it keeps the
# result independent of the installed scanpy version.
sc.tl.rank_genes_groups(
    adata_sc,
    groupby="CellType",
    method="t-test",
    use_raw=False,
)

WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var'

[96]:

# Build "markers_df" from the ranked gene names: one column per cell type,
# keeping only the top 100 differentially expressed genes of each
markers_df = pd.DataFrame(adata_sc.uns["rank_genes_groups"]["names"]).iloc[0:100, :]
# Pool the markers of all cell types into one de-duplicated array "genes_sc"
genes_sc = np.unique(markers_df.melt().value.values)
# Extract the gene names present in "adata_st"
genes_st = adata_st.var_names.values
# "genes" holds the marker genes that are also present in the spatial data.
# sorted() is used instead of list(): iterating a set of strings can give a different
# order on every run, while the sorted list is reproducible.
genes = sorted(set(genes_sc).intersection(genes_st))
# the number of shared marker genes
len(genes)

[96]:

[97]:

# Prepare "adata_sc" and "adata_st" for Tangram, passing the shared marker genes in "genes".
# As the log output of this cell shows, this saves the training genes and the overlapping genes
# in `uns` of both objects, and adds the `uniform_density` and `rna_count_based_density`
# priors to `obs` of the spatial AnnData.
tg.pp_adatas(adata_sc, adata_st, genes=genes)

INFO:root:1098 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:13775 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.

[98]:

# Map the cells of "adata_sc" onto the spots of "adata_st" with tangram's map_cells_to_space.
# mode="cells" assigns each single cell to a location in the spatial data based on its
# gene expression profile.
# No density prior is passed here; the training log of this cell reports which prior is used.
# Training runs for 1000 epochs on the CPU.
ad_map = tg.map_cells_to_space(
    adata_sc,
    adata_st,
    mode="cells",
    device="cpu",
    num_epochs=1000,
)

INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 1098 genes and rna_count_based density_prior in cells mode...
INFO:root:Printing scores every 100 epochs.

Score: 0.293, KL reg: 0.030
Score: 0.548, KL reg: 0.001
Score: 0.550, KL reg: 0.001
Score: 0.551, KL reg: 0.001
Score: 0.551, KL reg: 0.001
Score: 0.551, KL reg: 0.001
Score: 0.551, KL reg: 0.001
Score: 0.551, KL reg: 0.001
Score: 0.551, KL reg: 0.001
Score: 0.551, KL reg: 0.001

INFO:root:Saving results..

[99]:

# Project the "CellType" annotations of the single-cell dataset onto the spatial transcriptomics dataset,
# based on the previously computed cell-to-space mapping "ad_map".
# The resulting prediction table is saved in adata_st.obsm['tangram_ct_pred'].
tg.project_cell_annotations(ad_map, adata_st, annotation="CellType")

INFO:root:spatial prediction dataframe is saved in `obsm` `tangram_ct_pred` of the spatial AnnData.

[100]:

# Copy every column of adata_st.obsm['tangram_ct_pred'] into "adata_st.obs"
# under the same name, so each one can be used directly as a plotting key
for cell_type, scores in adata_st.obsm['tangram_ct_pred'].items():
    adata_st.obs[cell_type] = scores.to_numpy(copy=True)

[101]:

# Display the prediction table adata_st.obsm['tangram_ct_pred']
# (one row per spot, one column per cell type)
adata_st.obsm['tangram_ct_pred']

[101]:

	Acinar cells	Ductal	Cancer clone A	Cancer clone B	mDCs	Tuft cells	pDCs	Endocrine cells	Endothelial cells	Macrophages	Mast cells	T cells NK cells	Monocytes	RBCs	Fibroblasts
spots
10x10	0.006334	4.706014	0.117727	0.399284	0.497135	0.013638	0.054890	0.005253	0.094044	0.335560	0.052560	0.180580	0.055085	0.026505	0.036953
10x13	0.002069	4.792149	0.280019	0.075865	0.118639	0.065541	0.024328	0.005411	0.016519	0.057996	0.038706	0.095902	0.049501	0.010803	0.005225
10x14	0.013044	4.487615	0.054121	0.310157	0.158647	0.017234	0.088165	0.003886	0.034732	0.128785	0.092184	0.057863	0.000053	0.011522	0.003363
10x15	0.033054	3.723423	0.066192	0.118146	0.136094	0.064683	0.061699	0.004048	0.052694	0.097385	0.036767	0.059832	0.107271	0.021100	0.003672
10x16	0.007866	3.579839	0.033015	0.026313	0.000054	0.088877	0.067944	0.004586	0.007438	0.026395	0.045274	0.282783	0.015714	0.003617	0.008149
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
9x29	0.003962	4.499743	0.000111	0.424417	0.190720	0.011316	0.017832	0.005410	0.057422	0.028645	0.030519	0.149691	0.000129	0.005335	0.000061
9x30	0.003030	2.804563	0.036787	0.482187	0.109475	0.072179	0.017910	0.005255	0.017748	0.058597	0.117022	0.168436	0.080849	0.002608	0.008962
9x31	0.008780	1.433432	0.342950	0.297287	0.107831	0.017091	0.072641	0.004113	0.008162	0.380184	0.000050	0.169367	0.021958	0.003261	0.005781
9x32	0.055780	2.076649	0.023450	0.000402	0.047358	0.033798	0.000417	0.008584	0.029910	0.075386	0.031452	0.083817	0.020545	0.003629	0.002936
9x33	0.009425	1.586668	0.002164	0.133457	0.027596	0.036530	0.015639	0.005100	0.016363	0.029191	0.021844	0.041210	0.029885	0.020056	0.000254

428 rows × 15 columns

[102]:

# Draw a spatial scatter plot of the predicted values for four selected cell types
cell_types_to_plot = ['Acinar cells', 'Cancer clone A', 'Cancer clone B', 'Ductal']
sc.pl.embedding(adata_st, basis='spatial', color=cell_types_to_plot)

../_images/Test_the_new_data_Deconvoluyion_Tangram_ST_PDAC_21_0.png