SpatiallyVariableGene_Sepal_Visium

[1]:
# Imports
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sepal
import sepal.datasets as d
import sepal.models as m
import sepal.utils as ut
import sepal.family as family
import sepal.enrich as fea

# pip install xxx
# ImportError: cannot import name 'GProfiler' from 'gprofiler' (/home/yzy/anaconda3/envs/pysodb/lib/python3.8/site-packages/gprofiler/__init__.py)
# 把 ~/anaconda3/envs/pysodb/lib/python3.8/site-packages/sepal-1.0.0-py3.8.egg/sepal/enrich.py 中
# from gprofiler import GProfiler
# to
# from gprofiler import gprofiler as GProfiler
# refer:https://github.com/vals/python-gprofiler/issues/9


[2]:
import pysodb
[3]:
sodb = pysodb.SODB()
[15]:
dataset_name = '10x'
experiment_name = 'V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix'
adata = sodb.load_experiment(dataset_name,experiment_name)
download experiment[V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix] in dataset[10x]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185M/185M [00:15<00:00, 12.9MB/s]
load experiment[V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix] in dataset[10x]
[16]:
adata.write_h5ad('Visium_pysodb.h5ad')
[17]:
# Set path to melanoma sample
# pth = "/home/yzy/software/sepal/data/real/melanoma.tsv"
# pth = "/home/yzy/software/sepal/data/real/mob.tsv"


# load in the raw data using the RawData class
# pth = "/home/yzy/software/sepal/data/real/mob.tsv"
# raw_data = d.RawData(pth)



raw_data = d.RawData('Visium_pysodb.h5ad')


# filter genes observed in less than 5 spots
# and/or less than 10 total observations

raw_data.cnt = ut.filter_genes(raw_data.cnt,
                               min_expr=10,
                               min_occur=5)
# load into our our count data object
# this is a ST1K array, hence
# we use the ST1K class


data = m.VisiumData(raw_data,
              eps = 0.1)
[18]:
# We will normalize our count data
# and then propagate it in time, to measure
# the diffusion time.
# We set scale = True to perform
# minmax scaling of the diffusion times

# Running the simulation on 8 cores
# takes about ~2 minutes
times = m.propagate(data,
                    normalize = True,
                    scale =True)
[INFO] : Using 80 workers
[INFO] : Saturated Spots : 3095
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16278/16278 [01:21<00:00, 199.59it/s]
[26]:
# Now, once the diffusion times
# have been computed for all the
# profiles, we can inspect the
# top 20 highest ranked ones

n_top = 20
# get names of profiles
sorted_indices = np.argsort(times.values.flatten())
# argsort sorts from low to high, so reverse the order
sorted_indices = sorted_indices[::-1]
sorted_profiles = times.index.values[sorted_indices]
# extract top profiles
top_profiles = sorted_profiles[0:n_top]
tail_profiles = sorted_profiles[-n_top:-1]

# display top profiles and their times
times.loc[top_profiles,:]
[26]:
average
Calb1 1.000000
Prkcg 0.960986
Gfap 0.946612
Apod 0.919918
Hpca 0.919918
Sst 0.915811
Car8 0.899384
Itpka 0.893224
Mgp 0.891170
Itpr1 0.887064
Dner 0.874743
Hbb-bt 0.872690
Gad1 0.862423
Igfbp2 0.856263
Vim 0.850103
Pcp4 0.841889
Nefm 0.835729
Gria1 0.833676
Fam107a 0.825462
Hba-a2 0.821355
[28]:
# Now, let us also inspect these visually
# for this we can use the "plot_profiles"
# function for this purpose


# specifications for our
# plot. Same syntac
# as the scatter function
# in matplotlib

pltargs = dict(s = 20,
                cmap = "magma",
                edgecolor = 'none',
                marker = 'o',
                )

# plot the profiles
fig,ax = ut.plot_profiles(cnt = data.cnt.loc[:,top_profiles],
                          crd = data.real_crd,
                          rank_values = times.loc[top_profiles,:].values.flatten(),
                          pltargs = pltargs,
                         )
plt.show()
../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_9_0.png
[29]:
# Now, let us also inspect these visually
# for this we can use the "plot_profiles"
# function for this purpose


# specifications for our
# plot. Same syntac
# as the scatter function
# in matplotlib

pltargs = dict(s = 20,
                cmap = "magma",
                edgecolor = 'none',
                marker = 'o',
                )

# plot the profiles
fig,ax = ut.plot_profiles(cnt = data.cnt.loc[:,tail_profiles],
                          crd = data.real_crd,
                          rank_values = times.loc[tail_profiles,:].values.flatten(),
                          pltargs = pltargs,
                         )
plt.show()
../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_10_0.png
[32]:
# Let us, as a subsequent step, sort
# these profiles into pattern families

# we will use the top 100 profiles
# to build our families
n_build = 5000

# use the "get_families" function
# to group the profiles
family_labels,repr_patterns = family.get_families(data.cnt.loc[:,sorted_profiles].values,
                                            n_base = n_build,
                                            n_sort = n_build,
                                            threshold = 0.80,
                                            )
# familiy label is am array of family indices
# for each of the profiles. We convert
# this to a data frame to make it easier to work with
families = pd.DataFrame(family_labels,
                        index = sorted_profiles[0:n_build],
                        columns = ['family'],
                        )
[INFO] : Using 2 eigenpatterns
[INFO] : Identified 2 families
/home/yzy/anaconda3/envs/pysodb/lib/python3.8/site-packages/sklearn/cluster/_agglomerative.py:983: FutureWarning: Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4. Use `metric` instead
  warnings.warn(
[ ]:
# As displayed, 4 families were found. To get
# an idea of what type what type of spatial
# patterns that each family consists of
# we will plot their respective representative motifs

# use the "plot_representative" function
# for this purpose
fig, ax = family.plot_representative(motifs=repr_patterns,
                                 crd = data.real_crd,
                                 ncols = 3,
                                 pltargs=pltargs)
../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_12_0.png
[43]:
# having done so, we se that the motifs differ to some extent
# and we may inspect all the members of each
# family

# use the "plot_family" function for this
family_plots= family.plot_families(data.cnt.loc[:,sorted_profiles[0:n_build]].values,
                           genes = sorted_profiles[0:n_build],
                           crd = data.real_crd,
                           labels = family_labels,
                           ncols = 10,
                           pltargs = pltargs,
                           side_size = 300,
                           )
../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_13_0.png
../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_13_1.png
../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_13_2.png
../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_13_3.png
[ ]: