[ ]:
# 参照:https://github.com/almaan/sepal/blob/master/examples/melanoma.ipynb
# paper:https://academic.oup.com/bioinformatics/article/37/17/2644/6168120
SpatiallyVariableGene_Sepal_MOB
This tutorial demonstrates how to (find) spatially variable gene on MOB data using SODB and Sepal.
A reference paper can be found at https://academic.oup.com/bioinformatics/article/37/17/2644/6168120.
This tutorial refers to the following tutorial at https://github.com/almaan/sepal/blob/master/examples/melanoma.ipynb. At the same time, the way of loadding data is modified by using SODB.
[1]:
# Imports
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sepal
import sepal.datasets as d
import sepal.models as m
import sepal.utils as ut
import sepal.family as family
import sepal.enrich as fea
# 这里会报错说很多包没装,pip install xxx即可
# 报错ImportError: cannot import name 'GProfiler' from 'gprofiler' (/home/yzy/anaconda3/envs/pysodb/lib/python3.8/site-packages/gprofiler/__init__.py)
# 把 ~/anaconda3/envs/pysodb/lib/python3.8/site-packages/sepal-1.0.0-py3.8.egg/sepal/enrich.py 中
# from gprofiler import GProfiler
# 改为
# from gprofiler import gprofiler as GProfiler
# 参考:https://github.com/vals/python-gprofiler/issues/9
[19]:
# Initialization
import pysodb
[20]:
# Initialization
sodb = pysodb.SODB()
[23]:
# Define the name of the dataset_name and experiment_name
dataset_name = 'stahl2016visualization'
experiment_name = 'Rep4_MOB_trans'
# Load a specific experiment
# It takes two arguments: the name of the dataset and the name of the experiment to load.
# Two arguments are available at https://gene.ai.tencent.com/SpatialOmics/.
adata = sodb.load_experiment(dataset_name,experiment_name)
load experiment[Rep4_MOB_trans] in dataset[stahl2016visualization]
[24]:
# saves the AnnData object to an H5AD file format.
adata.write_h5ad('MOB_pysodb.h5ad')
[ ]:
# load in the raw data using a RawData class.
raw_data = d.RawData('MOB_pysodb.h5ad')
[ ]:
# filter genes observed in less than 5 spots and/or less than 10 total observations
raw_data.cnt = ut.filter_genes(raw_data.cnt,
min_expr=10,
min_occur=5)
[ ]:
# Derivative of CountData to hold ST1k array based data using a ST1K class
data = m.ST1K(raw_data,
eps = 0.1)
[37]:
data.cnt.shape
[37]:
(264, 10869)
[38]:
# A propagate class is employ to normalize count data and then propagate it in time, to measure the diffusion time.
# set scale = True to perform
# minmax scaling of the diffusion times
times = m.propagate(data,
normalize = True,
scale =True)
[INFO] : Using 80 workers
[INFO] : Saturated Spots : 199
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10869/10869 [00:20<00:00, 525.17it/s]
[39]:
# inspect the top 20 highest ranked ones after computing diffusion times for all the profiles
n_top = 20
# get names of profiles
sorted_indices = np.argsort(times.values.flatten())
# argsort sorts from low to high, so reverse the order
sorted_indices = sorted_indices[::-1]
sorted_profiles = times.index.values[sorted_indices]
# extract top profiles
top_profiles = sorted_profiles[0:n_top]
# display top profiles and their times
times.loc[top_profiles,:]
[39]:
average | |
---|---|
Rbfox1 | 1.000000 |
Gpsm1 | 0.832367 |
Prkca | 0.806763 |
Penk | 0.796135 |
Tyro3 | 0.786957 |
Rbfox3 | 0.763285 |
Pcp4 | 0.733333 |
Cacng3 | 0.733333 |
Omp | 0.732850 |
Kcnh3 | 0.723188 |
Grin1 | 0.694686 |
Nrgn | 0.671981 |
Agap2 | 0.658937 |
S100a5 | 0.654589 |
Tshz1 | 0.647343 |
Cpne4 | 0.644444 |
Map2k1 | 0.643961 |
Camk4 | 0.642512 |
Gria3 | 0.623188 |
Sez6 | 0.602899 |
[40]:
# inspect these visually by using the "plot_profiles function
pltargs = dict(s = 60,
cmap = "magma",
edgecolor = 'none',
marker = 'o',
)
# plot the profiles
fig,ax = ut.plot_profiles(cnt = data.cnt.loc[:,top_profiles],
crd = data.real_crd,
rank_values = times.loc[top_profiles,:].values.flatten(),
pltargs = pltargs,
)
plt.show()
[41]:
# As a subsequent step, sort these profiles into pattern families
# Use the top 100 profiles to build the families
n_build = 300
# Use the "get_families" function to group the profiles
family_labels,repr_patterns = family.get_families(data.cnt.loc[:,sorted_profiles].values,
n_base = n_build,
n_sort = n_build,
threshold = 0.80,
)
# familiy label is am array of family indices for each of the profiles.
# Convert familiy label to a data frame to make it easier to work with
families = pd.DataFrame(family_labels,
index = sorted_profiles[0:n_build],
columns = ['family'],
)
[INFO] : Using 4 eigenpatterns
[INFO] : Identified 4 families
/home/yzy/anaconda3/envs/pysodb/lib/python3.8/site-packages/sklearn/cluster/_agglomerative.py:983: FutureWarning: Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4. Use `metric` instead
warnings.warn(
[42]:
# As displayed, 4 families were found.
# To get an idea of what type what type of spatial patterns that each family consists of
# plot their respective representative motifs
# use the "plot_representative" function for this purpose
fig, ax = family.plot_representative(motifs=repr_patterns,
crd = data.real_crd,
ncols = 3,
pltargs=pltargs)
[43]:
# the motifs differ to some extent
# inspect all the members of each family
# use the "plot_family" function for this
family_plots= family.plot_families(data.cnt.loc[:,sorted_profiles[0:n_build]].values,
genes = sorted_profiles[0:n_build],
crd = data.real_crd,
labels = family_labels,
ncols = 10,
pltargs = pltargs,
side_size = 300,
)