[ ]:

# 参照：https://github.com/almaan/sepal/blob/master/examples/melanoma.ipynb
# paper：https://academic.oup.com/bioinformatics/article/37/17/2644/6168120

SpatiallyVariableGene_Sepal_MOB

This tutorial demonstrates how to (find) spatially variable gene on MOB data using SODB and Sepal.

A reference paper can be found at https://academic.oup.com/bioinformatics/article/37/17/2644/6168120.

This tutorial refers to the following tutorial at https://github.com/almaan/sepal/blob/master/examples/melanoma.ipynb. At the same time, the way of loadding data is modified by using SODB.

[1]:

# Imports
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sepal
import sepal.datasets as d
import sepal.models as m
import sepal.utils as ut
import sepal.family as family
import sepal.enrich as fea

# 这里会报错说很多包没装，pip install xxx即可
# 报错ImportError: cannot import name 'GProfiler' from 'gprofiler' (/home/yzy/anaconda3/envs/pysodb/lib/python3.8/site-packages/gprofiler/__init__.py)
# 把 ~/anaconda3/envs/pysodb/lib/python3.8/site-packages/sepal-1.0.0-py3.8.egg/sepal/enrich.py 中
# from gprofiler import GProfiler
# 改为
# from gprofiler import gprofiler as GProfiler
# 参考:https://github.com/vals/python-gprofiler/issues/9

[19]:

# Initialization
import pysodb

[20]:

# Initialization
sodb = pysodb.SODB()

[23]:

# Define the name of the dataset_name and experiment_name
dataset_name = 'stahl2016visualization'
experiment_name = 'Rep4_MOB_trans'
# Load a specific experiment
# It takes two arguments: the name of the dataset and the name of the experiment to load.
# Two arguments are available at https://gene.ai.tencent.com/SpatialOmics/.
adata = sodb.load_experiment(dataset_name,experiment_name)

load experiment[Rep4_MOB_trans] in dataset[stahl2016visualization]

[24]:

# saves the AnnData object to an H5AD file format.
adata.write_h5ad('MOB_pysodb.h5ad')

[ ]:

# load in the raw data using a RawData class.
raw_data = d.RawData('MOB_pysodb.h5ad')

[ ]:

# filter genes observed in less than 5 spots and/or less than 10 total observations
raw_data.cnt = ut.filter_genes(raw_data.cnt,
                               min_expr=10,
                               min_occur=5)

[ ]:

# Derivative of CountData to hold ST1k array based data using a ST1K class
data = m.ST1K(raw_data,
              eps = 0.1)

[37]:

data.cnt.shape

[37]:

(264, 10869)

[38]:

# A propagate class is employ to normalize count data and then propagate it in time, to measure the diffusion time.
# set scale = True to perform
# minmax scaling of the diffusion times

times = m.propagate(data,
                    normalize = True,
                    scale =True)

[INFO] : Using 80 workers
[INFO] : Saturated Spots : 199

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10869/10869 [00:20<00:00, 525.17it/s]

[39]:

# inspect the top 20 highest ranked ones after computing diffusion times  for all the profiles

n_top = 20
# get names of profiles
sorted_indices = np.argsort(times.values.flatten())
# argsort sorts from low to high, so reverse the order
sorted_indices = sorted_indices[::-1]
sorted_profiles = times.index.values[sorted_indices]
# extract top profiles
top_profiles = sorted_profiles[0:n_top]
# display top profiles and their times
times.loc[top_profiles,:]

[39]:

	average
Rbfox1	1.000000
Gpsm1	0.832367
Prkca	0.806763
Penk	0.796135
Tyro3	0.786957
Rbfox3	0.763285
Pcp4	0.733333
Cacng3	0.733333
Omp	0.732850
Kcnh3	0.723188
Grin1	0.694686
Nrgn	0.671981
Agap2	0.658937
S100a5	0.654589
Tshz1	0.647343
Cpne4	0.644444
Map2k1	0.643961
Camk4	0.642512
Gria3	0.623188
Sez6	0.602899

[40]:

# inspect these visually by using the "plot_profiles function

pltargs = dict(s = 60,
                cmap = "magma",
                edgecolor = 'none',
                marker = 'o',
                )

# plot the profiles
fig,ax = ut.plot_profiles(cnt = data.cnt.loc[:,top_profiles],
                          crd = data.real_crd,
                          rank_values = times.loc[top_profiles,:].values.flatten(),
                          pltargs = pltargs,
                         )
plt.show()

../_images/Test_the_original_data_SpatiallyVariableGene_Sepal_MOB_14_0.png

[41]:

# As a subsequent step, sort these profiles into pattern families

# Use the top 100 profiles to build the families
n_build = 300

# Use the "get_families" function to group the profiles
family_labels,repr_patterns = family.get_families(data.cnt.loc[:,sorted_profiles].values,
                                            n_base = n_build,
                                            n_sort = n_build,
                                            threshold = 0.80,
                                            )
# familiy label is am array of family indices for each of the profiles.
# Convert familiy label to a data frame to make it easier to work with
families = pd.DataFrame(family_labels,
                        index = sorted_profiles[0:n_build],
                        columns = ['family'],
                        )

[INFO] : Using 4 eigenpatterns
[INFO] : Identified 4 families

/home/yzy/anaconda3/envs/pysodb/lib/python3.8/site-packages/sklearn/cluster/_agglomerative.py:983: FutureWarning: Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4. Use `metric` instead
  warnings.warn(

[42]:

# As displayed, 4 families were found.
# To get an idea of what type what type of spatial patterns that each family consists of
# plot their respective representative motifs

# use the "plot_representative" function for this purpose
fig, ax = family.plot_representative(motifs=repr_patterns,
                                 crd = data.real_crd,
                                 ncols = 3,
                                 pltargs=pltargs)

../_images/Test_the_original_data_SpatiallyVariableGene_Sepal_MOB_16_0.png

[43]:

# the motifs differ to some extent
# inspect all the members of each family

# use the "plot_family" function for this
family_plots= family.plot_families(data.cnt.loc[:,sorted_profiles[0:n_build]].values,
                           genes = sorted_profiles[0:n_build],
                           crd = data.real_crd,
                           labels = family_labels,
                           ncols = 10,
                           pltargs = pltargs,
                           side_size = 300,
                           )

../_images/Test_the_original_data_SpatiallyVariableGene_Sepal_MOB_17_0.png

../_images/Test_the_original_data_SpatiallyVariableGene_Sepal_MOB_17_1.png

../_images/Test_the_original_data_SpatiallyVariableGene_Sepal_MOB_17_2.png

../_images/Test_the_original_data_SpatiallyVariableGene_Sepal_MOB_17_3.png