SpatiallyVariableGene_Sepal_Visium

[1]:

# Imports
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sepal
import sepal.datasets as d
import sepal.models as m
import sepal.utils as ut
import sepal.family as family
import sepal.enrich as fea

# pip install xxx
# ImportError: cannot import name 'GProfiler' from 'gprofiler' (/home/yzy/anaconda3/envs/pysodb/lib/python3.8/site-packages/gprofiler/__init__.py)
# 把 ~/anaconda3/envs/pysodb/lib/python3.8/site-packages/sepal-1.0.0-py3.8.egg/sepal/enrich.py 中
# from gprofiler import GProfiler
# to
# from gprofiler import gprofiler as GProfiler
# refer:https://github.com/vals/python-gprofiler/issues/9

[2]:

import pysodb

[3]:

sodb = pysodb.SODB()

[15]:

dataset_name = '10x'
experiment_name = 'V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix'
adata = sodb.load_experiment(dataset_name,experiment_name)

download experiment[V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix] in dataset[10x]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185M/185M [00:15<00:00, 12.9MB/s]

load experiment[V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix] in dataset[10x]

[16]:

adata.write_h5ad('Visium_pysodb.h5ad')

[17]:

# Set path to melanoma sample
# pth = "/home/yzy/software/sepal/data/real/melanoma.tsv"
# pth = "/home/yzy/software/sepal/data/real/mob.tsv"


# load in the raw data using the RawData class
# pth = "/home/yzy/software/sepal/data/real/mob.tsv"
# raw_data = d.RawData(pth)



raw_data = d.RawData('Visium_pysodb.h5ad')


# filter genes observed in less than 5 spots
# and/or less than 10 total observations

raw_data.cnt = ut.filter_genes(raw_data.cnt,
                               min_expr=10,
                               min_occur=5)
# load into our our count data object
# this is a ST1K array, hence
# we use the ST1K class


data = m.VisiumData(raw_data,
              eps = 0.1)

[18]:

# We will normalize our count data
# and then propagate it in time, to measure
# the diffusion time.
# We set scale = True to perform
# minmax scaling of the diffusion times

# Running the simulation on 8 cores
# takes about ~2 minutes
times = m.propagate(data,
                    normalize = True,
                    scale =True)

[INFO] : Using 80 workers
[INFO] : Saturated Spots : 3095

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16278/16278 [01:21<00:00, 199.59it/s]

[26]:

# Now, once the diffusion times
# have been computed for all the
# profiles, we can inspect the
# top 20 highest ranked ones

n_top = 20
# get names of profiles
sorted_indices = np.argsort(times.values.flatten())
# argsort sorts from low to high, so reverse the order
sorted_indices = sorted_indices[::-1]
sorted_profiles = times.index.values[sorted_indices]
# extract top profiles
top_profiles = sorted_profiles[0:n_top]
tail_profiles = sorted_profiles[-n_top:-1]

# display top profiles and their times
times.loc[top_profiles,:]

[26]:

	average
Calb1	1.000000
Prkcg	0.960986
Gfap	0.946612
Apod	0.919918
Hpca	0.919918
Sst	0.915811
Car8	0.899384
Itpka	0.893224
Mgp	0.891170
Itpr1	0.887064
Dner	0.874743
Hbb-bt	0.872690
Gad1	0.862423
Igfbp2	0.856263
Vim	0.850103
Pcp4	0.841889
Nefm	0.835729
Gria1	0.833676
Fam107a	0.825462
Hba-a2	0.821355

[28]:

# Now, let us also inspect these visually
# for this we can use the "plot_profiles"
# function for this purpose


# specifications for our
# plot. Same syntac
# as the scatter function
# in matplotlib

pltargs = dict(s = 20,
                cmap = "magma",
                edgecolor = 'none',
                marker = 'o',
                )

# plot the profiles
fig,ax = ut.plot_profiles(cnt = data.cnt.loc[:,top_profiles],
                          crd = data.real_crd,
                          rank_values = times.loc[top_profiles,:].values.flatten(),
                          pltargs = pltargs,
                         )
plt.show()

../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_9_0.png

[29]:

# Now, let us also inspect these visually
# for this we can use the "plot_profiles"
# function for this purpose


# specifications for our
# plot. Same syntac
# as the scatter function
# in matplotlib

pltargs = dict(s = 20,
                cmap = "magma",
                edgecolor = 'none',
                marker = 'o',
                )

# plot the profiles
fig,ax = ut.plot_profiles(cnt = data.cnt.loc[:,tail_profiles],
                          crd = data.real_crd,
                          rank_values = times.loc[tail_profiles,:].values.flatten(),
                          pltargs = pltargs,
                         )
plt.show()

../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_10_0.png

[32]:

# Let us, as a subsequent step, sort
# these profiles into pattern families

# we will use the top 100 profiles
# to build our families
n_build = 5000

# use the "get_families" function
# to group the profiles
family_labels,repr_patterns = family.get_families(data.cnt.loc[:,sorted_profiles].values,
                                            n_base = n_build,
                                            n_sort = n_build,
                                            threshold = 0.80,
                                            )
# familiy label is am array of family indices
# for each of the profiles. We convert
# this to a data frame to make it easier to work with
families = pd.DataFrame(family_labels,
                        index = sorted_profiles[0:n_build],
                        columns = ['family'],
                        )

[INFO] : Using 2 eigenpatterns
[INFO] : Identified 2 families

/home/yzy/anaconda3/envs/pysodb/lib/python3.8/site-packages/sklearn/cluster/_agglomerative.py:983: FutureWarning: Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4. Use `metric` instead
  warnings.warn(

[ ]:

# As displayed, 4 families were found. To get
# an idea of what type what type of spatial
# patterns that each family consists of
# we will plot their respective representative motifs

# use the "plot_representative" function
# for this purpose
fig, ax = family.plot_representative(motifs=repr_patterns,
                                 crd = data.real_crd,
                                 ncols = 3,
                                 pltargs=pltargs)

../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_12_0.png

[43]:

# having done so, we se that the motifs differ to some extent
# and we may inspect all the members of each
# family

# use the "plot_family" function for this
family_plots= family.plot_families(data.cnt.loc[:,sorted_profiles[0:n_build]].values,
                           genes = sorted_profiles[0:n_build],
                           crd = data.real_crd,
                           labels = family_labels,
                           ncols = 10,
                           pltargs = pltargs,
                           side_size = 300,
                           )

../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_13_0.png

../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_13_1.png

../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_13_2.png

../_images/Test_the_new_data_SpatiallyVariableGene_Sepal_Visium_13_3.png

[ ]: