SpatiallyVariableGene_Sepal_Visium
[1]:
# Imports
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sepal
import sepal.datasets as d
import sepal.models as m
import sepal.utils as ut
import sepal.family as family
import sepal.enrich as fea
# pip install xxx
# ImportError: cannot import name 'GProfiler' from 'gprofiler' (/home/yzy/anaconda3/envs/pysodb/lib/python3.8/site-packages/gprofiler/__init__.py)
# 把 ~/anaconda3/envs/pysodb/lib/python3.8/site-packages/sepal-1.0.0-py3.8.egg/sepal/enrich.py 中
# from gprofiler import GProfiler
# to
# from gprofiler import gprofiler as GProfiler
# refer:https://github.com/vals/python-gprofiler/issues/9
[2]:
import pysodb
[3]:
sodb = pysodb.SODB()
[15]:
dataset_name = '10x'
experiment_name = 'V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix'
adata = sodb.load_experiment(dataset_name,experiment_name)
download experiment[V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix] in dataset[10x]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185M/185M [00:15<00:00, 12.9MB/s]
load experiment[V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix] in dataset[10x]
[16]:
adata.write_h5ad('Visium_pysodb.h5ad')
[17]:
# Set path to melanoma sample
# pth = "/home/yzy/software/sepal/data/real/melanoma.tsv"
# pth = "/home/yzy/software/sepal/data/real/mob.tsv"
# load in the raw data using the RawData class
# pth = "/home/yzy/software/sepal/data/real/mob.tsv"
# raw_data = d.RawData(pth)
raw_data = d.RawData('Visium_pysodb.h5ad')
# filter genes observed in less than 5 spots
# and/or less than 10 total observations
raw_data.cnt = ut.filter_genes(raw_data.cnt,
min_expr=10,
min_occur=5)
# load into our our count data object
# this is a ST1K array, hence
# we use the ST1K class
data = m.VisiumData(raw_data,
eps = 0.1)
[18]:
# We will normalize our count data
# and then propagate it in time, to measure
# the diffusion time.
# We set scale = True to perform
# minmax scaling of the diffusion times
# Running the simulation on 8 cores
# takes about ~2 minutes
times = m.propagate(data,
normalize = True,
scale =True)
[INFO] : Using 80 workers
[INFO] : Saturated Spots : 3095
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16278/16278 [01:21<00:00, 199.59it/s]
[26]:
# Now, once the diffusion times
# have been computed for all the
# profiles, we can inspect the
# top 20 highest ranked ones
n_top = 20
# get names of profiles
sorted_indices = np.argsort(times.values.flatten())
# argsort sorts from low to high, so reverse the order
sorted_indices = sorted_indices[::-1]
sorted_profiles = times.index.values[sorted_indices]
# extract top profiles
top_profiles = sorted_profiles[0:n_top]
tail_profiles = sorted_profiles[-n_top:-1]
# display top profiles and their times
times.loc[top_profiles,:]
[26]:
average | |
---|---|
Calb1 | 1.000000 |
Prkcg | 0.960986 |
Gfap | 0.946612 |
Apod | 0.919918 |
Hpca | 0.919918 |
Sst | 0.915811 |
Car8 | 0.899384 |
Itpka | 0.893224 |
Mgp | 0.891170 |
Itpr1 | 0.887064 |
Dner | 0.874743 |
Hbb-bt | 0.872690 |
Gad1 | 0.862423 |
Igfbp2 | 0.856263 |
Vim | 0.850103 |
Pcp4 | 0.841889 |
Nefm | 0.835729 |
Gria1 | 0.833676 |
Fam107a | 0.825462 |
Hba-a2 | 0.821355 |
[28]:
# Now, let us also inspect these visually
# for this we can use the "plot_profiles"
# function for this purpose
# specifications for our
# plot. Same syntac
# as the scatter function
# in matplotlib
pltargs = dict(s = 20,
cmap = "magma",
edgecolor = 'none',
marker = 'o',
)
# plot the profiles
fig,ax = ut.plot_profiles(cnt = data.cnt.loc[:,top_profiles],
crd = data.real_crd,
rank_values = times.loc[top_profiles,:].values.flatten(),
pltargs = pltargs,
)
plt.show()
[29]:
# Now, let us also inspect these visually
# for this we can use the "plot_profiles"
# function for this purpose
# specifications for our
# plot. Same syntac
# as the scatter function
# in matplotlib
pltargs = dict(s = 20,
cmap = "magma",
edgecolor = 'none',
marker = 'o',
)
# plot the profiles
fig,ax = ut.plot_profiles(cnt = data.cnt.loc[:,tail_profiles],
crd = data.real_crd,
rank_values = times.loc[tail_profiles,:].values.flatten(),
pltargs = pltargs,
)
plt.show()
[32]:
# Let us, as a subsequent step, sort
# these profiles into pattern families
# we will use the top 100 profiles
# to build our families
n_build = 5000
# use the "get_families" function
# to group the profiles
family_labels,repr_patterns = family.get_families(data.cnt.loc[:,sorted_profiles].values,
n_base = n_build,
n_sort = n_build,
threshold = 0.80,
)
# familiy label is am array of family indices
# for each of the profiles. We convert
# this to a data frame to make it easier to work with
families = pd.DataFrame(family_labels,
index = sorted_profiles[0:n_build],
columns = ['family'],
)
[INFO] : Using 2 eigenpatterns
[INFO] : Identified 2 families
/home/yzy/anaconda3/envs/pysodb/lib/python3.8/site-packages/sklearn/cluster/_agglomerative.py:983: FutureWarning: Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4. Use `metric` instead
warnings.warn(
[ ]:
# As displayed, 4 families were found. To get
# an idea of what type what type of spatial
# patterns that each family consists of
# we will plot their respective representative motifs
# use the "plot_representative" function
# for this purpose
fig, ax = family.plot_representative(motifs=repr_patterns,
crd = data.real_crd,
ncols = 3,
pltargs=pltargs)
[43]:
# having done so, we se that the motifs differ to some extent
# and we may inspect all the members of each
# family
# use the "plot_family" function for this
family_plots= family.plot_families(data.cnt.loc[:,sorted_profiles[0:n_build]].values,
genes = sorted_profiles[0:n_build],
crd = data.real_crd,
labels = family_labels,
ncols = 10,
pltargs = pltargs,
side_size = 300,
)
[ ]: