# No encoding supplied: defaulting to UTF-8 ?
# R Options
options(stringsAsFactors=FALSE,
citation_format="pandoc",
dplyr.summarise.inform=FALSE,
knitr.table.format="html",
kableExtra_view_html=TRUE,
future.globals.maxSize=2000000000, mc.cores=1,
future.fork.enable=TRUE, future.plan="multicore",
future.rng.onMisuse="ignore")
# Python3 needed for clustering, umap, other python packages
# Path to binary will be automatically found
# Set manually if it does not work
# reticulate_python3_path = unname(Sys.which("python3"))
# Sys.setenv(RETICULATE_PYTHON=reticulate_python3_path)
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for "leiden" clustering
library(enrichR) # functional enrichment
library(future) # multicore support for Seurat
use_condaenv("r-reticulate")
# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr, stringr, Matrix, sctransform, glmGamPoi (optional for speed but only available for R 4.0)
# Tables: kableExtra, DT
# Plots: ggsci
# IO: openxlsx, readr, R.utils
# Annotation: biomaRt
# DEG: mast, limma (for a more efficient implementation of the Wilcoxon Rank Sum Test according to Seurat)
# Functional enrichment: enrichR
# Other: sessioninfo, cerebroApp, sceasy, ROpenSci/bibtex, knitcitations
# Knitr default options
::opts_chunk$set(echo=TRUE, # output code
knitrcache=FALSE, # do not cache results
message=TRUE, # show messages
warning=TRUE, # show warnings
tidy=FALSE, # do not auto-tidy-up code
fig.width=10, # default fig width in inches
class.source="fold-hide", # by default collapse code blocks
dev=c("png", "pdf"), # create figures in png and pdf; the first device (png) will be used for HTML output
dev.args=list(png=list(type="cairo"), # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
pdf=list(bg="white")), # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
dpi=96, # figure resolution
fig.retina=2 # retina multiplier
)
# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
invisible(knitcitations::citep(citation("knitr")))
The 10x dataset “1k Peripheral blood mononuclear cells (PBMCs)” is used to create two artifical samples with 100 cells and 500 genes each. Dataset: https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0/pbmc_1k_v3
This code chunk contains all parameters that are set specifically for the project.
= list()
param
####################
# Input parameters #
####################
# Project ID
$project_id = "pbmc_small"
param
# Path to input data
$path_data = data.frame(name=c("sample1", "sample2"),
paramtype=c("10x", "10x"),
path=c("/mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/counts/sample1/", "/mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/counts/sample2/"),
stats=c(NA, NA))
# Data type ("RNA", "Spatial")
$assay_raw = "RNA"
param
# Downsample data to at most n cells per sample (mainly for tests)
# NULL to deactivate
$downsample_cells_n = NULL
param
# Downsample data to n cells per sample for visualization
# NULL to deactivate
$fixed_cells_n = NULL
param
# Path to output directory
$path_out = "/mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results/"
param
# Marker genes based on literature, translated to Ensembl IDs
# xlsx file, one list per column, first row as header and Ensembl IDs below
# NULL if no known marker genes should be plotted
$file_known_markers = NULL
param
# Annotation via biomaRt
$mart_dataset = "hsapiens_gene_ensembl"
param$annot_version = 98
param$annot_main = c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession")
param$mart_attributes = c(param$annot_main,
paramc("chromosome_name", "start_position", "end_position",
"percentage_gene_gc_content", "gene_biotype", "strand", "description"))
$biomart_mirror = NULL
param
# Prefix for mitochondrial genes
$mt = "^MT-"
param
#####################
# Filter parameters #
#####################
# Filter for cells
$cell_filter = list(nFeature_RNA=c(200, 8000), percent_mt=c(NA, 15))
param
# Filter for features
$feature_filter = list(min_counts=1, min_cells=5) # feature has to be found by at least one count in one cell
param
# Samples to drop
# Cells from these samples will be dropped after initial QC
# Example: param$samples_to_drop = c("pbmc_smartseq2_NC", "pbmc_smartseq2_RNA"),
# where "pbmc_smartseq2" is the name of the dataset, and "NC" and "RNA" are the names of the subsamples
$samples_to_drop = c()
param
# Drop samples with too few cells
$samples_min_cells = 10
param
############################
# Normalisation parameters #
############################
# Which normalisation should be used for analysis?
# "RNA" or "SCT" for standard single-cell RNA-seq data
# "SCT" for spatial data
$norm = "RNA"
param
# Whether or not to remove cell cycle effects
$cc_remove = FALSE
param
# Should all cell cycle effects be removed, or only the difference between profilerating cells (G2M and S phase)?
# Read https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html, for an explanation
$cc_remove_all = FALSE
param
# Whether or not to re-score cell cycle effects after data
# from different samples have been merged/integrated
$cc_rescore_after_merge = TRUE
param
# Additional (unwanted) variables that will be regressed out for visualisation and clustering
$vars_to_regress = c()
param
# When there are multiple datasets, how to combine them:
# - method:
# - "single": Default when there is only one dataset after filtering, no integration is needed
# - "merge": Merge (in other words, concatenate) data when no integration is needed,
# e.g. when samples were multiplexed on the same chip.
# - "integrate": Anchors are computed for all pairs of datasets. This will give all datasets the same weight
# during dataset integration but can be computationally intensive
#
# Additional options for the "integrate" method:
#
# - dimensions: Number of dimensions to consider for integration
# - reference: Use one or more datasets as reference and compute anchors for all other datasets. Separate multiple datasets by comma
# This is computationally faster but less accurate.
# - use_reciprocal_pca: Compute anchors in PCA space. Even more computationally faster but again less accurate
# Recommended for big datasets.
# - k.filter: How many neighbors to use when filtering anchors (default: min(200, minimum number of cells in a sample))
# - k.weight: Number of neighbors to consider when weighting anchors (default: min(100, minimum number of cells in a sample))
# - k.anchor: How many neighbors to use when picking anchors (default: min(5, minimum number of cells in a sample))
# - k.score: How many neighbors to use when scoring anchors (default: min(30, minimum number of cells in a sample))
#
# Note that if you analyse assay_raw="Spatial", only method="merge" is supported
$integrate_samples = list(method="merge")
param
# TO DISCUSS from Seurat vignette:
# The results show that rpca-based integration is more conservative, and in this case, do not perfectly align a subset of cells (which are naive and memory T cells) across experiments. You can increase the strength of alignment by increasing the k.anchor parameter, which is set to 5 by default. Increasing this parameter to 20 will assist in aligning these populations.
# The number of PCs to use; adjust this parameter based on the Elbowplot
$pc_n = 10
param
# Resolution of clusters; low values will lead to fewer clusters of cells
$cluster_resolution=0.8
param$cluster_resolution_test=c(0.4, 0.6, 1.2)
param
#######################################################
# Marker genes and genes with differential expression #
#######################################################
# Thresholds to define marker genes
$marker_padj = 0.05
param$marker_log2FC = log2(2)
param$marker_pct = 0.25
param
# Additional (unwanted) variables to account for in statistical tests
$latent_vars = c()
param
# Contrasts to find differentially expressed genes (R data.frame or Excel file)
# Required columns:
# condition_column: Categorial column in the cell metadata; specify "orig.ident" for sample and "seurat_clusters" for cluster
# condition_group1: Condition levels in group 1, multiple levels concatenated by the plus character
# Empty string = all levels not in group2 (cannot be used if group2 is empty)
# condition_group2: Condition levels in group 2, multiple levels concatenated by the plus character
# Empty string = all levels not in group1 (cannot be used if group1 is empty)
#
# Optional columns:
# subset_column: Categorial column in the cell metadata to subset before testing (default: NA)
# Specify "orig.ident" for sample and "seurat_clusters" for cluster
# subset_group: Further subset levels (default: NA)
# For the individual analysis of multiple levels separate by semicolons
# For the joint analysis of multiple levels concatenate by the plus character
# For the individual analysis of all levels empty string ""
# assay: Seurat assay to test on; can also be a Seurat dimensionality reduction (default: "RNA")
# slot: In case assay is a Seurat assay object, which slot to use (default: "data")
# padj: Maximum adjusted p-value (default: 0.05)
# log2FC: Minimum absolute log2 fold change (default: 0)
# min_pct: Minimum percentage of cells expressing a gene to test (default: 0.1)
# test: Type of test; "wilcox", "bimod", "roc", "t", "negbinom", "poisson", "LR", "MAST", "DESeq2"; (default: "wilcox")
# downsample_cells_n: Downsample each group to at most n cells to speed up tests (default: NA)
# latent_vars: Additional variables to account for; multiple variables need to be concatenated by semicolons; will overwrite the default by param$latent_vars (default: none).
$deg_contrasts = NULL
param
# Enrichr site
# Available options: "Enrichr", "FlyEnrichr", "WormEnrichr", "YeastEnrichr", "FishEnrichr"
$enrichr_site = "Enrichr"
param
# P-value threshold for functional enrichment tests
$enrichr_padj = 0.05
param
# Enrichr databases of interest
$enrichr_dbs = c("GO_Biological_Process_2021", "KEGG_2021_Human", "WikiPathway_2021_Human")
param
######################
# General parameters #
######################
# Main colour to use for plots
$col = "palevioletred"
param
# Colour palette used for samples
$col_palette_samples = "ggsci::pal_rickandmorty"
param
# Colour palette used for cluster
$col_palette_clusters = "ggsci::pal_igv"
param
# Set dot size for umaps
$pt_size = 1
param
# Path to git repository
$path_to_git = "."
param
# Debugging mode:
# "default_debugging" for default, "terminal_debugger" for debugging without X11, "print_traceback" for non-interactive sessions
$debugging_mode = "default_debugging" param
# Git directory and files to source must be done first, then all helper functions can be sourced
= c("functions_io.R",
git_files_to_source "functions_plotting.R",
"functions_analysis.R",
"functions_degs.R",
"functions_util.R")
= file.path(param$path_to_git, "R", git_files_to_source)
git_files_to_source = purrr::map_lgl(git_files_to_source, file.exists)
file_exists if (any(!file_exists)) stop(paste("The following files could not be found:", paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", param$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))
# Debugging mode:
switch (param$debugging_mode,
default_debugging=on_error_default_debugging(),
terminal_debugger=on_error_start_terminal_debugger(),
print_traceback=on_error_just_print_traceback(),
on_error_default_debugging())
# Set output hooks
::knit_hooks$set(message=format_message, warning=format_warning)
knitr
# Create output directories
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "figures"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "marker_degs"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "annotation"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "data"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "export"), recursive=TRUE, showWarnings=FALSE)
# Path for figures in png and pdf format (trailing "/" is needed)
::opts_chunk$set(fig.path=paste0(file.path(param$path_out, "figures"), "/"))
knitr
# Path for annotation
$file_annot = file.path(param$path_out, "annotation", paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt"))
param
# Path for cell cycle genes file
$file_cc_genes = file.path(param$path_out, "annotation", "cell_cycle_markers.xlsx")
param
# Do checks
= c()
error_messages
# Check parameters and parse entries (e.g. numbers) so that they are valid
= check_parameters_scrnaseq(param)
param = c(error_messages, param[["error_messages"]])
error_messages # Check installed packages
= c(error_messages, check_installed_packages_scrnaseq())
error_messages # Check python
= c(error_messages, check_python())
error_messages # Check pandoc
= c(error_messages, check_pandoc())
error_messages # Check enrichR
= c(error_messages, check_enrichr(param$enrichr_dbs, param$enrichr_site))
error_messages # Check ensembl
= c(error_messages, check_ensembl(biomart="ensembl",
error_messages dataset=param$mart_dataset,
mirror=param$biomart_mirror,
version=param$annot_version,
attributes=param$mart_attributes,
file_annot=param$file_annot,
file_cc_markers=param$file_cc_genes))
# Stop here if there are errors
if (length(error_messages)) stop(paste(c("", paste("*", error_messages)), collapse="\n"))
The workflow can be run for pre-processed 10x data, as well as for pre-processed SmartSeq-2 or other data that are represented by a simple table with transcript counts per gene and cell.
# Are statistics provided?
if (!all(is.na(param$path_data$stats))) {
# Loop through all samples and read mapping stats
= list()
mapping_stats_list for (i in 1:nrow(param$path_data)) {
if (!is.na(param$path_data$stats[i])) {
$path_data$name[i]]] = read.delim(param$path_data$stats[i],
mapping_stats_list[[paramsep=",", header=FALSE, check.names=FALSE) %>%
t() %>% as.data.frame()
}
}
# Join all mapping stats tables
= mapping_stats_list %>% purrr::reduce(dplyr::full_join, by="V1")
mapping_stats rownames(mapping_stats) = mapping_stats[["V1"]]
= mapping_stats %>% dplyr::select(-V1)
mapping_stats colnames(mapping_stats) = names(mapping_stats_list)
# Print table to HTML
::kable(mapping_stats, align="l", caption="Mapping statistics") %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover"))
kableExtra
else {
} message("Mapping statistics cannot be shown. No valid file provided.")
}
× (Message)
Mapping statistics cannot be shown. No valid file provided.
We read gene annotation from Ensembl and write the resulting table to file. We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names.
# Read annotation from csv or from Ensembl and a tab separated txt will be created
if (file.exists(param$file_annot)) {
= read.delim(param$file_annot)
annot_ensembl else {
} = suppressWarnings(GetBiomaRt(biomart="ensembl",
annot_mart dataset=param$mart_dataset,
mirror=param$biomart_mirror,
version=param$annot_version))
= biomaRt::getBM(mart=annot_mart, attributes=param$mart_attributes, useCache=FALSE)
annot_ensembl write.table(annot_ensembl, file=param$file_annot, sep="\t", col.names=TRUE, row.names=FALSE, append=FALSE)
message("Gene annotation file was created at: ", param$file_annot)
# Note: depending on the attributes, there might be more than one row per gene
}
# Double-check if we got all required annotation, in case annotation file was read
= all(param$annot_main %in% colnames(annot_ensembl))
check_annot_main if (!check_annot_main) {
stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}
# Create translation tables
= param$annot_main["ensembl"]
ensembl = param$annot_main["symbol"]
symbol = param$annot_main["entrez"]
entrez
# Ensembl id to gene symbol
= unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_symbol = setNames(ensembl_to_symbol[, symbol], ensembl_to_symbol[, ensembl])
ensembl_to_symbol
# Ensembl id to seurat-compatible unique rowname
= unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_seurat_rowname = make.unique(gsub(pattern="_", replacement="-", x=ensembl_to_seurat_rowname[, symbol], fixed=TRUE))
ensembl_to_seurat_rowname[, symbol] = setNames(ensembl_to_seurat_rowname[, symbol], ensembl_to_seurat_rowname[, ensembl])
ensembl_to_seurat_rowname
# Seurat-compatible unique rowname to ensembl id
= setNames(names(ensembl_to_seurat_rowname), ensembl_to_seurat_rowname)
seurat_rowname_to_ensembl
# Ensembl to Entrez
= unique(annot_ensembl[, c(ensembl, entrez)])
ensembl_to_entrez = ifelse(nchar(ensembl_to_entrez[, entrez]) == 0, NA, ensembl_to_entrez[, entrez])
ensembl_to_entrez[, entrez] = split(ensembl_to_entrez[, entrez], ensembl_to_entrez[, ensembl])
ensembl_to_entrez
# Seurat-compatible unique rowname to Entrez
= match(seurat_rowname_to_ensembl, names(ensembl_to_entrez))
seurat_rowname_to_ensembl_match names(seurat_rowname_to_ensembl_match) = names(seurat_rowname_to_ensembl)
= purrr::map(seurat_rowname_to_ensembl_match, function(i) {unname(ensembl_to_entrez[[i]])})
seurat_rowname_to_entrez
# Entrez IDs is duplicating Ensembl IDs in annot_ensembl
# Therefore, we remove Entrez IDs from the annotation table, after generating all required translation tables
# Set rownames of annotation table to Ensembl identifiers
= annot_ensembl[, -match(entrez, colnames(annot_ensembl))] %>% unique() %>% as.data.frame()
annot_ensembl rownames(annot_ensembl) = annot_ensembl[, ensembl]
# Use biomart to translate human cell cycle genes to the species of interest and save them in a file
if (file.exists(param$file_cc_genes)) {
# Load from file
= openxlsx::read.xlsx(param$file_cc_genes, sheet=1)
genes_s = openxlsx::read.xlsx(param$file_cc_genes, sheet=2)
genes_g2m
else {
} # Obtain from Ensembl
# Note: both mart objects must point to the same mirror for biomarT::getLDS to work
= suppressWarnings(GetBiomaRt(biomart="ensembl",
mart_human dataset="hsapiens_gene_ensembl",
mirror=param$biomart_mirror,
version=param$annot_version))
= suppressWarnings(GetBiomaRt(biomart="ensembl",
mart_myspecies dataset=param$mart_dataset,
mirror=GetBiomaRtMirror(mart_human),
version=param$annot_version))
# S phase marker
= biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"),
genes_s filters="external_gene_name",
values=Seurat::cc.genes.updated.2019$s.genes,
mart=mart_human,
attributesL=c("ensembl_gene_id", "external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
colnames(genes_s) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
# G2/M marker
= biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"),
genes_g2m filters="external_gene_name",
values=Seurat::cc.genes.updated.2019$g2m.genes,
mart=mart_human,
attributesL=c("ensembl_gene_id", "external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
colnames(genes_g2m) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
# Write to file
::write.xlsx(list(S_phase=genes_s,G2M_phase=genes_g2m),file=param$file_cc_genes)
openxlsx
}
# Convert Ensembl ID to Seurat-compatible unique rowname
= data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_s = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id])) genes_g2m
Seurat is an R-package that is used for the analysis of single-cell RNA-seq data. We read pre-processed data as described above, and convert it to a count matrix in R. In the case of 10x data, the count matrix contains the number of unique RNA molecules (UMIs) per gene and cell. In the case of SmartSeq-2 data, the count matrix contains the number of reads per gene and cell.
In addition to the count matrix, the workflow stores additional information in the Seurat object, including but not limited to the normalized data, dimensionality reduction and cluster results.# List of Seurat objects
= list()
sc
= param$path_data
datasets for (i in seq(nrow(datasets))) {
= datasets[i, "name"]
name = datasets[i, "type"]
type = datasets[i, "path"]
path = datasets[i, "suffix"]
suffix
# Read 10X or smartseq2
if (type == "10x") {
# Read 10X sparse matrix into a Seurat object
= c(sc, ReadSparseMatrix(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, cellnames_suffix=suffix))
sc
else if (type == "smartseq2") {
} # Read counts table into a Seurat object
= c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE, cellnames_suffix=suffix))
sc
}
}
# Make sure that sample names are unique. If not, just prefix with the dataset name. Also set orig.ident to this name.
= names(sc)
sample_names = which(sample_names %in% sample_names[duplicated(sample_names)])
duplicated_sample_names_idx for (i in duplicated_sample_names_idx) {
= paste(head(sc[[i]][["orig.dataset", drop=TRUE]], 1), sample_names[i], sep=".")
sample_names[i] "orig.ident"]] = sample_names[i]
sc[[i]][[
}
# Set up colors for samples and add them to the sc objects
= purrr::flatten_chr(purrr::map(sc, function(s) {
sample_names = unique(as.character(s[[]][["orig.ident"]]))
nms return(nms)
}))$col_samples = GenerateColours(num_colours=length(sample_names), names=sample_names, palette=param$col_palette_samples, alphas=1)
param= purrr::map(sc, ScAddLists, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")
sc
# Downsample cells if requested
if (!is.null(param$downsample_cells_n)) {
= purrr::map(sc, function(s) {
sc = ScSampleCells(sc=s, n=param$downsample_cells_n, seed=1)
cells return(subset(s, cells=cells))
})
}
sc
## $sample1
## An object of class Seurat
## 6000 features across 250 samples within 1 assay
## Active assay: RNA (6000 features, 0 variable features)
##
## $sample2
## An object of class Seurat
## 6000 features across 250 samples within 1 assay
## Active assay: RNA (6000 features, 0 variable features)
The following first table shows available metadata (columns) of the first 5 cells (rows). These metadata provide additional information about the cells in the dataset, such as the sample a cell belongs to (“orig.ident”), the number of mapped reads (“nCounts_RNA”), or the above mentioned number of unique genes detected (“nFeature_RNA”). The second table shows available metadata (columns) of the first 5 genes (rows).
# Combine cell metadata of the Seurat objects into one big metadata
= suppressWarnings(purrr::map_dfr(sc, function(s) return(s[[]])) %>% as.data.frame())
sc_cell_metadata
# Print cell metadata
::kable(head(sc_cell_metadata), align="l", caption="Cell metadata, top 5 rows") %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%") kableExtra
orig.ident | nCount_RNA | nFeature_RNA | orig.dataset | |
---|---|---|---|---|
AACGAAACAGCGTTTA-1 | sample1 | 504 | 252 | sample1 |
GCTTCACCAACACGAG-1 | sample1 | 3098 | 769 | sample1 |
TAGTGCACAGAGGAAA-1 | sample1 | 2257 | 675 | sample1 |
AGAGCAGTCGCGCTGA-1 | sample1 | 963 | 339 | sample1 |
TGTTCATGTCCTGGGT-1 | sample1 | 986 | 363 | sample1 |
CTTGAGATCCCGAGGT-1 | sample1 | 103 | 20 | sample1 |
# Print gene metadata
::kable(head(sc[[1]][[param$assay_raw]][[]], 5), align="l", caption="Feature metadata, top 5 rows (only first dataset shown)") %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%") kableExtra
feature_id | feature_name | feature_type | |
---|---|---|---|
AL627309.1 | ENSG00000238009 | AL627309.1 | Gene Expression |
AL627309.4 | ENSG00000241599 | AL627309.4 | Gene Expression |
AL732372.1 | ENSG00000236601 | AL732372.1 | Gene Expression |
FAM41C | ENSG00000230368 | FAM41C | Gene Expression |
AL390719.2 | ENSG00000272141 | AL390719.2 | Gene Expression |
# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
$cell_filter = purrr::map(list_names(sc), function(s) {
paramif (s %in% names(param$cell_filter)) {
return(param$cell_filter[[s]])
else {
} return(param$cell_filter)
}
})
$feature_filter = purrr::map(list_names(sc), function(s) {
paramif (s %in% names(param$feature_filter)) {
return(param$feature_filter[[s]])
else {
} return(param$feature_filter)
} })
# Calculate percentage of counts in mitochondrial genes for each Seurat object
= purrr::map(sc, function(s) {
sc = grep(pattern=param$mt, rownames(s), value=TRUE)
mt_features return(Seurat::PercentageFeatureSet(s, features=mt_features, col.name="percent_mt", assay=param$assay_raw))
})
# Calculate percentage of counts in ribosomal genes for each Seurat object
= purrr::map(sc, function(s) {
sc = grep(pattern="^RP[SL]", rownames(s), value=TRUE, ignore.case=TRUE)
ribo_features return(Seurat::PercentageFeatureSet(s, features=ribo_features, col.name="percent_ribo", assay=param$assay_raw))
})
# Calculate percentage of counts in ERCC for each Seurat object (if assay is available)
= purrr::map(sc, function(s) {
sc if ("ERCC" %in% Seurat::Assays(s)) s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
return(s)
})
# Combine (again) cell metadata of the Seurat objects into one big metadata, this time including mt and ercc
= suppressWarnings(purrr::map_dfr(sc, function(s){ return(s[[]]) }) %>% as.data.frame()) sc_cell_metadata
# Only RNA assay at the moment
# counts_median uses sapply on the counts matrix, which converts the sparse matrix into a normal matrix
# This might have to be adapted in future (Sparse Matrix Apply Function)
= purrr::map(list_names(sc), function(n) {
sc # Calculate percentage of counts per gene in a cell
= Seurat::GetAssayData(sc[[n]], slot="counts", assay=param$assay_raw)
counts_rna = sc[[n]][[paste0("nCount_", param$assay_raw), drop=TRUE]]
total_counts = Matrix::t(Matrix::t(counts_rna)/total_counts)*100
counts_rna_perc
# Calculate feature filters
= Matrix::rowSums(counts_rna >= 1)
num_cells_expr = Matrix::rowSums(counts_rna >= param$feature_filter[[n]][["min_counts"]])
num_cells_expr_threshold
# Calculate median of counts_rna_perc per gene
= apply(counts_rna_perc, 1, median)
counts_median
# Add all QC measures as metadata
$assay_raw]] = Seurat::AddMetaData(sc[[n]][[param$assay_raw]], data.frame(num_cells_expr, num_cells_expr_threshold, counts_median))
sc[[n]][[paramreturn(sc[[n]])
})
# Plot QC metrics for cells
= c(paste0(c("nFeature_", "nCount_"), param$assay_raw), "percent_mt")
cell_qc_features if ("percent_ercc" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features, "percent_ercc")
= values_to_names(cell_qc_features)
cell_qc_features
= list()
p_list for (i in names(cell_qc_features)) {
= ggplot(sc_cell_metadata[, c("orig.ident", i)], aes_string(x="orig.ident", y=i, fill="orig.ident", group="orig.ident")) +
p_list[[i]]geom_violin(scale="width")
# Adds points for samples with less than three cells since geom_violin does not work here
= p_list[[i]] +
p_list[[i]] geom_point(data=sc_cell_metadata[, c("orig.ident", i)] %>% dplyr::filter(orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes_string(x="orig.ident", y=i, fill="orig.ident"), shape=21, size=2)
# Now add styles
= p_list[[i]] +
p_list[[i]] AddStyle(title=i, legend_position="none", fill=param$col_samples, xlab="") +
theme(axis.text.x=element_text(angle=45, hjust=1))
# Creates a table with min/max values for filter i for each dataset
= purrr::map_dfr(names(param$cell_filter), function(n) {
cell_filter_for_plot # If filter i in cell filter of the dataset, then create dataframe with columns orig.ident, threshold and value
if (i %in% names(param$cell_filter[[n]])){
data.frame(orig.ident=n, threshold=c("min", "max"), value=param$cell_filter[[n]][[i]], stringsAsFactors=FALSE)
}
})
# Add filters as segments to plot
if (nrow(cell_filter_for_plot) > 0) {
# Remove entries that are NA
= cell_filter_for_plot %>% dplyr::filter(!is.na(value))
cell_filter_for_plot = p_list[[i]] + geom_segment(data=cell_filter_for_plot,
p_list[[i]] aes(x=as.integer(as.factor(orig.ident))-0.5,
xend=as.integer(as.factor(orig.ident))+0.5,
y=value, yend=value),
lty=2, col="firebrick")
}
}= patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values")
p p
# Correlate QC metrics for cells
= list()
p_list 1]] = ggplot(sc_cell_metadata, aes_string(x=cell_qc_features[2], y=cell_qc_features[1], colour="orig.ident")) +
p_list[[geom_point() +
AddStyle(col=param$col_samples)
2]] = ggplot(sc_cell_metadata, aes_string(x=cell_qc_features[3], y=cell_qc_features[1], colour="orig.ident")) +
p_list[[geom_point() +
AddStyle(col=param$col_samples)
= patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Features plotted against each other")
p
if (length(sc)==1) {
= p & theme(legend.position="bottom")
p else {
} = p + patchwork::plot_layout(guides="collect") & theme(legend.position="bottom")
p
} p
We next investigate whether there are individual genes that are represented by an unusually high number of counts. For each cell, we first calculate the percentage of counts per gene. Subsequently, for each gene, we calculate the median value of these percentages in all cells. Genes with the highest median percentage of counts are plotted below.
# Plot only samples that we intend to keep
= names(sc)[!(names(sc) %in% param$samples_to_drop)]
sc_names = lapply(sc_names, function(i) {
genes_highestExpr = sc[[i]][[param$assay_raw]][["counts_median"]] %>% dplyr::arrange(dplyr::desc(counts_median)) %>% head(n=10)
top_ten_exp return(rownames(top_ten_exp))
%>%
}) unlist() %>%
unique()
= purrr::map_dfc(sc[sc_names], .f=function(s) s[[param$assay_raw]][["counts_median"]][genes_highestExpr, ])
genes_highestExpr_counts $gene = genes_highestExpr
genes_highestExpr_counts= genes_highestExpr_counts %>% tidyr::pivot_longer(cols=all_of(sc_names))
genes_highestExpr_counts $name = factor(genes_highestExpr_counts$name, levels=sc_names)
genes_highestExpr_counts
= GenerateColours(num_colours=length(genes_highestExpr), names=genes_highestExpr, palette="ggsci::pal_simpsons")
col = ggplot(genes_highestExpr_counts, aes(x=name, y=value, col=gene, group=gene)) +
p geom_point() +
AddStyle(title="Top 10 highest expressed genes per sample, added into one list",
xlab="Sample", ylab="Median % of raw counts\n per gene in a cell",
legend_position="bottom",
col=col)
if (length(unique(genes_highestExpr_counts$name))>1) p = p + geom_line()
p
Cells and genes are filtered based on the following thresholds:
= param$cell_filter %>% unlist(recursive=FALSE)
cell_filter_lst = purrr::map_lgl(cell_filter_lst, function(f) return(is.numeric(f) & length(f)==2))
is_numeric_filter
# numeric cell filters
if (length(cell_filter_lst[is_numeric_filter]) > 0) {
::invoke(rbind, cell_filter_lst[is_numeric_filter]) %>%
purrr::kable(align="l", caption="Numeric filters applied to cells", col.names=c("Min", "Max")) %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover"))
kableExtra }
Min | Max | |
---|---|---|
sample1.nFeature_RNA | 200 | 8000 |
sample1.percent_mt | NA | 15 |
sample2.nFeature_RNA | 200 | 8000 |
sample2.percent_mt | NA | 15 |
# categorial cell filters
if (length(cell_filter_lst[!is_numeric_filter]) > 0) {
::invoke(rbind, cell_filter_lst[!is_numeric_filter] %>% purrr::map(paste, collapse=",")) %>%
purrr::kable(align="l", caption="Categorial filters applied to cells", col.names=c("Values")) %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover"))
kableExtra
}
# gene filters
= param$feature_filter %>% unlist(recursive=FALSE)
feature_filter_lst if (length(feature_filter_lst) > 0) {
::invoke(rbind, feature_filter_lst) %>%
purrr::kable(align="l", caption="Filters applied to genes", col.names=c("Value")) %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover"))
kableExtra }
Value | |
---|---|
sample1.min_counts | 1 |
sample1.min_cells | 5 |
sample2.min_counts | 1 |
sample2.min_cells | 5 |
= purrr::map(list_names(sc), function(n) {
orig_feature_number length(rownames(sc[[n]]))})
= purrr::map(list_names(sc), function(n) {
orig_cell_number length(Cells(sc[[n]]))})
The number of excluded cells and features is as follows:
# Iterate over datasets and filters
# Record a cell if it does not pass the filter
# Also record a cell if it belongs to a sample that should be dropped
= purrr::map(list_names(sc), function(n) {
sc_cells_to_exclude = purrr::map(list_names(param$cell_filter[[n]]), function(f) {
filter_result = param$cell_filter[[n]][[f]]
filter if (is.numeric(filter)) {
if (is.na(filter[1])) filter[1] = -Inf # Minimum
if (is.na(filter[2])) filter[2] = Inf # Maximum
= sc[[n]][[f, drop=TRUE]] < filter[1] | sc[[n]][[f, drop=TRUE]] > filter[2]
idx_exclude return(names(which(idx_exclude)))
else if (is.character(filter)) {
} = !sc[[n]][[f, drop=TRUE]] %in% filter
idx_exclude return(Cells(sc[[n]])[idx_exclude])
}
})
# Samples to drop
if (n %in% param$samples_to_drop) {
"samples_to_drop"]] = colnames(sc[[n]])
filter_result[[else {
} "samples_to_drop"]] = as.character(c())
filter_result[[
}
# Minimum number of cells for a sample to keep
if (ncol(sc[[n]]) < param$samples_min_cells) {
"samples_min_cells"]] = colnames(sc[[n]])
filter_result[[else {
} "samples_min_cells"]] = as.character(c())
filter_result[[
}
return(filter_result)
})
# Summarise
= purrr::map_dfr(sc_cells_to_exclude, function(s) {
sc_cells_to_exclude_summary return(as.data.frame(purrr::map(s, length)))
})rownames(sc_cells_to_exclude_summary) = names(sc_cells_to_exclude)
= sc_cells_to_exclude_summary$nFeature_RNA[] %>% as.data.frame()
sc_cells_excluded_nFeature_RNA colnames(sc_cells_excluded_nFeature_RNA) = "nFeature_RNA"
rownames(sc_cells_excluded_nFeature_RNA) = names(list_names(sc))
= sc_cells_to_exclude_summary$percent_mt[] %>% as.data.frame()
sc_cells_excluded_percent_mt colnames(sc_cells_excluded_percent_mt) = "percent_mt"
rownames(sc_cells_excluded_percent_mt) = names(list_names(sc))
= orig_cell_number %>% as.data.frame()
sc_cells_orig rownames(sc_cells_orig) = "Original"
colnames(sc_cells_orig) = names(list_names(sc))
= as.data.frame(t(sc_cells_orig))
sc_cells_orig
= cbind.data.frame(sc_cells_excluded_nFeature_RNA, sc_cells_excluded_percent_mt, sc_cells_orig)
sc_cells_filter_summary
# Now filter, drop the respective colours and adjust integration method
= purrr::map(list_names(sc), function(n) {
sc = Cells(sc[[n]])
cells_to_keep = cells_to_keep[!cells_to_keep %in% purrr::flatten_chr(sc_cells_to_exclude[[n]])]
cells_to_keep if (length(cells_to_keep)==0) return(NULL)
else return(subset(sc[[n]], cells=cells_to_keep))
%>% purrr::discard(is.null)
})
= purrr::map(list_names(sc), function(n) {
filtered_cell_number length(Cells(sc[[n]]))})
= filtered_cell_number %>% as.data.frame()
sc_cells_filtered rownames(sc_cells_filtered) = "Filtered"
colnames(sc_cells_filtered) = names(list_names(sc))
= as.data.frame(t(sc_cells_filtered))
sc_cells_filtered
= cbind.data.frame(sc_cells_filter_summary, sc_cells_filtered)
sc_cells_filter_summary
= as.data.frame(sc_cells_filter_summary$Filtered[]/sc_cells_filter_summary$Original[]*100)
sc_cells_percent rownames(sc_cells_percent) = names(list_names(sc))
colnames(sc_cells_percent) = "Percent_remaining"
= cbind.data.frame(sc_cells_filter_summary, sc_cells_percent)
sc_cells_filter_summary
::kable(sc_cells_filter_summary, align="l", caption="Number of excluded cells") %>%
knitr::kable_styling(bootstrap_options = c("striped", "hover")) kableExtra
nFeature_RNA | percent_mt | Original | Filtered | Percent_remaining | |
---|---|---|---|---|---|
sample1 | 31 | 41 | 250 | 205 | 82.0 |
sample2 | 25 | 28 | 250 | 211 | 84.4 |
if (length(sc)==1) param$integrate_samples[["method"]] = "single"
# Only RNA assay at the moment
# Iterate over samples and record a feature if it does not pass the filter
= purrr::map(list_names(sc), function(n) {
sc_features_to_exclude
# Make sure the sample contains more cells than the minimum threshold
if (length(Cells(sc[[n]])) < param$feature_filter[[n]][["min_cells"]]) return(list())
# Return gene names that do not pass the minimum threshold
else return(names(which(sc[[n]][[param$assay_raw]][["num_cells_expr_threshold", drop=TRUE]] < param$feature_filter[[n]][["min_cells"]])))
})
# Which genes are to be filtered for all samples?
# Note: Make sure that no other sample is called "AllSamples"
$AllSamples = Reduce(f=intersect, x=sc_features_to_exclude)
sc_features_to_exclude
# Summarise
= purrr::map(sc_features_to_exclude, length) %>%
sc_features_to_exclude_summary t() %>% as.data.frame()
rownames(sc_features_to_exclude_summary) = c("Genes")
= as.data.frame(t(sc_features_to_exclude_summary))
sc_features_to_exclude_summary
= purrr::map(list_names(sc), function(n) {
sc_features_excluded $Genes[n]
sc_features_to_exclude_summary
}%>% as.data.frame()
) rownames(sc_features_excluded) = "Excluded"
colnames(sc_features_excluded) = names(list_names(sc))
= orig_feature_number %>% as.data.frame()
sc_features_orig rownames(sc_features_orig) = "Original"
colnames(sc_features_orig) = names(list_names(sc))
= rbind.data.frame(sc_features_orig, sc_features_excluded)
sc_feature_fiter_summary = as.data.frame(t(sc_feature_fiter_summary))
sc_feature_fiter_summary
= as.data.frame(100-(sc_feature_fiter_summary$Excluded[]/sc_feature_fiter_summary$Original[]*100))
sc_features_percent rownames(sc_features_percent) = names(list_names(sc))
colnames(sc_features_percent) = "Percent_remaining"
= cbind.data.frame(sc_feature_fiter_summary, sc_features_percent)
sc_feature_fiter_summary
::kable(sc_feature_fiter_summary, align="l", caption="Number of excluded genes") %>%
knitr::kable_styling(bootstrap_options = c("striped", "hover")) kableExtra
Original | Excluded | Percent_remaining | |
---|---|---|---|
sample1 | 6000 | 3995 | 33.41667 |
sample2 | 6000 | 4014 | 33.10000 |
# Now filter those genes that are to be filtered for all samples
= purrr::map(list_names(sc), function(n) {
sc = Seurat::Assays(sc[[n]])
assay_names = purrr::map(values_to_names(assay_names), function(a) {
features_to_keep = rownames(sc[[n]][[a]])
features = features[!features %in% sc_features_to_exclude$AllSamples]
keep return(keep)
})return(subset(sc[[n]], features=purrr::flatten_chr(features_to_keep)))
})
After filtering, the size of the Seurat object is:
# Downsample cells if requested
if (!is.null(param$fixed_cells_n)) {
= purrr::map(sc, function(s) {
sc = colnames(s)
cells return(subset(s, cells=sample(cells, min(param$fixed_cells_n, length(cells)))))
})
message("The dataset has been downsampled to ", param$fixed_cells_n, " cells for better visualization.")
}
sc
## $sample1
## An object of class Seurat
## 2092 features across 205 samples within 1 assay
## Active assay: RNA (2092 features, 0 variable features)
##
## $sample2
## An object of class Seurat
## 2092 features across 211 samples within 1 assay
## Active assay: RNA (2092 features, 0 variable features)
The number of raw sequencing reads per cell is influenced by technical differences in the capture, reverse transcription and sequencing of RNA molecules, particularly due to the difficulty of achieving consistent library preparation with minimal starting material. Thus, comparing gene expression between cells may reveal differences that are solely due to sampling effects. After low-quality cells were removed in the previous step, the primary goal of normalization is to remove technical sampling effects while preserving the true biological signal.
Count depth scaling is the simplest and most commonly used normalization strategy. The underlying idea is that each cell initially contained an equal number of mRNA molecules, and differences arise due to sampling effects. For each cell, the number of reads per gene is divided by a cell-specific “size factor,” which is proportional to the total count depth of the cell. The resulting normalized data add up to 1 per cell, and are then typically multiplied by a factor of 10 (10,000 in this workflow).
Finally, normalized data are log-transformed for three important reasons. First, distances between log-transformed expression data represent log fold changes. Log-transformation emphasizes contributions from genes with strong relative differences, for example a gene that is expressed at an average count of 50 in cell type A and 10 in cell type B rather than a gene that is expressed at an average count of 1100 in A and 1000 in B. Second, log-transformation mitigates the relation between mean and variance in the data. Lastly, log-transformation reduces that skewness of the data as many downstream analyses require the data to be normally distributed.Normalisation method used for this report: RNA; with additional sources of variance regressed out: .
While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data include variable genes only, potentially without cell cycle effects, and are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.
# Normalise data the original way
# This is required to score cell cycle (https://github.com/satijalab/seurat/issues/1679)
= purrr::map(sc, Seurat::NormalizeData, normalization.method="LogNormalize", scale.factor=10000, verbose=FALSE) sc
# Determine cell cycle effect per sample
= purrr::map(list_names(sc), function(n) {
sc = CCScoring(sc=sc[[n]], genes_s=genes_s[,2], genes_g2m=genes_g2m[,2], name=n)
sc[[n]] if (any(is.na(sc[[n]][["S.Score"]])) | any(is.na(sc[[n]][["G2M.Score"]]))) {
$cc_remove=FALSE
param$cc_remove_all=FALSE
param$cc_rescore_after_merge=FALSE
param
}return(sc[[n]])
})
× (Warning)
Warning in CCScoring(sc = sc[[n]], genes_s = genes_s[, 2], genes_g2m =
genes_g2m[, : There are not enough G2/M and S phase markers in the dataset
sample1 to reliably determine cell cycle scores and phases. Scores and phases
will be set to NA and removal of cell cycle effects is skipped.
× (Warning)
Warning in CCScoring(sc = sc[[n]], genes_s = genes_s[, 2], genes_g2m =
genes_g2m[, : There are not enough G2/M and S phase markers in the dataset
sample2 to reliably determine cell cycle scores and phases. Scores and phases
will be set to NA and removal of cell cycle effects is skipped.
# If cell cycle effects should be removed, we first score cells
# The effect is then removed in the following chunk
if (param$cc_remove) {
# Add to vars that need to regressed out during normalisation
if (param$cc_remove_all) {
# Remove all signal associated to cell cycle
$vars_to_regress = unique(c(param$vars_to_regress, "S.Score", "G2M.Score"))
param$latent_vars = unique(c(param$latent_vars, "S.Score", "G2M.Score"))
paramelse {
} # Don't remove the difference between cycling and non-cycling cells
$vars_to_regress = unique(c(param$vars_to_regress, "CC.Difference"))
param$latent_vars = unique(c(param$latent_vars, "CC.Difference"))
param
} }
if (param$norm == "RNA") {
# Find variable features from normalised data (unaffected by scaling)
= purrr::map(sc, Seurat::FindVariableFeatures, selection.method="vst", nfeatures=3000, verbose=FALSE)
sc
# Scale
# Note: For a single dataset where no integration/merging is needed, all features can already be scaled here.
# Otherwise, scaling of all features will be done after integration/merging.
if (param$integrate_samples[["method"]]=="single") {
1]] = Seurat::ScaleData(sc[[1]],
sc[[features=rownames(sc[[1]][["RNA"]]),
vars.to.regress=param$vars_to_regress,
verbose=FALSE)
}else if (param$norm == "SCT") {
} # Run SCTransform
#
# This is a new normalisation method that replaces previous Seurat functions "NormalizeData", "FindVariableFeatures", and "ScaleData".
# vignette: https://satijalab.org/seurat/v3.0/sctransform_vignette.html
# paper: https://www.biorxiv.org/content/10.1101/576827v2
# Normalised data end up here: sc@assays$SCT@data
# Note: For a single dataset where no integration is needed, all features can already be scaled here.
# Otherwise, it is enough to scale only the variable features.
# Note: It is not guaranteed that all genes are successfully normalised with SCTransform.
# Consequently, some genes might be missing from the SCT assay.
# See: https://github.com/ChristophH/sctransform/issues/27
# Note: The performance of SCTransform can be improved by using "glmGamPoi" instead of "poisson" as method for initial parameter estimation.
= purrr::map(list_names(sc), function(n) {
sc SCTransform(sc[[n]],
assay=param$assay_raw,
vars.to.regress=param$vars_to_regress,
min_cells=param$feature_filter[[n]][["min_cells"]],
verbose=FALSE,
return.only.var.genes=!(param$integrate_samples[["method"]]=="single"),
method=ifelse(packages_installed("glmGamPoi"), "glmGamPoi", "poisson"))
}) }
= 5 * ceiling(length(names(sc))/2) fig_height_vf
= purrr::map(list_names(sc), function(n) {
p_list = head(Seurat::VariableFeatures(sc[[n]], assay=ifelse(param$norm=="SCT", param$norm, param$assay_raw)), 10)
top10 = Seurat::VariableFeaturePlot(sc[[n]],
p assay=ifelse(param$norm=="SCT", param$norm, param$assay_raw),
selection.method=ifelse(param$norm=="RNA", "vst", "sct"),
col=c("grey", param$col)) +
AddStyle(title=n) +
theme(legend.position=c(0.2, 0.8), legend.background=element_rect(fill=alpha("white", 0.0)))
= LabelPoints(plot=p, points=top10, repel=TRUE, xnudge=0, ynudge=0)
p return(p)
})
= patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Variable genes")
p p
if (param$integrate_samples[["method"]]!="single") {
# When merging, feature meta-data is removed by Seurat entirely; save separately for each assay except for SCT and add again afterwards
# Note: not sure whether this is still needed - discuss
= setdiff(unique(purrr::flatten_chr(purrr::map(list_names(sc), function(n) { Seurat::Assays(sc[[n]]) } ))), "SCT")
assay_names
# Loop through all assays and accumulate meta data
= purrr::map(values_to_names(assay_names), function(a) {
sc_feature_metadata # "feature_id", "feature_name", "feature_type" are accumulated for all assays and stored just once
# This step is skipped for assays that do not contain all three types of feature information
= purrr::map_lgl(list_names(sc), function(n) {
contains_neccessary_columns all(c("feature_id", "feature_name", "feature_type") %in% colnames(sc[[n]][[a]][[]]))
})
if (all(contains_neccessary_columns)) {
= purrr::map(sc, function(s) return(s[[a]][[c("feature_id", "feature_name", "feature_type")]]) )
feature_id_name_type = purrr::reduce(feature_id_name_type, function(df_x, df_y) {
feature_id_name_type = which(!rownames(df_y) %in% rownames(df_x))
new_rows if (length(new_rows) > 0) return(rbind(df_x, df_y[new_rows, ]))
else return(df_x)
})$row_names = rownames(feature_id_name_type)
feature_id_name_typeelse {
} = NULL
feature_id_name_type
}
# For all other meta-data, we prefix column names with the dataset
= purrr::map(list_names(sc), function(n) {
other_feature_data = sc[[n]][[a]][[]]
df if (contains_neccessary_columns[[n]]) df = df %>% dplyr::select(-dplyr::one_of(c("feature_id", "feature_name", "feature_type"), c()))
if (ncol(df) > 0) colnames(df) = paste(n, colnames(df), sep=".")
$row_names = rownames(df)
dfreturn(df)
})
# Now join everything by row_names by full outer join
if (!is.null(feature_id_name_type)) {
= purrr::reduce(c(list(feature_id_name_type=feature_id_name_type), other_feature_data), dplyr::full_join, by="row_names")
feature_data else {
} = purrr::reduce(other_feature_data, dplyr::full_join, by="row_names")
feature_data
}rownames(feature_data) = feature_data$row_names
$row_names = NULL
feature_data
return(feature_data)
})
# When merging, cell meta-data are merged but factors are not kept
= suppressWarnings(purrr::map_dfr(sc, function(s){ s[[]] }) %>% as.data.frame())
sc_cell_metadata = purrr::map(which(sapply(sc_cell_metadata, is.factor)), function(n) {
sc_cell_metadata_factor_levels return(levels(sc_cell_metadata[, n, drop=TRUE]))
}) }
# Data for different samples can be merged if no integration is needed,
# for example, when samples were multiplexed on the same chip
if (param$integrate_samples[["method"]]=="merge") {
= merge(x=sc[[1]], y=sc[2:length(sc)], project=param$project_id)
sc
# Re-score cell cycle effects after merge
if (param$cc_rescore_after_merge) {
= CCScoring(sc=sc, genes_s=genes_s[,2], genes_g2m=genes_g2m[,2])
sc if (any(is.na(sc[["S.Score"]])) | any(is.na(sc[["G2M.Score"]]))) {
$cc_remove=FALSE
param$cc_remove_all=FALSE
param$cc_rescore_after_merge=FALSE
param$vars_to_regress = setdiff(param$vars_to_regress, c("S.Score", "G2M.Score", "CC.Difference"))
param$latent_vars = setdiff(param$latent_vars, c("S.Score", "G2M.Score", "CC.Difference"))
param
}
}
# (Re-)Run normalisation, variable features and scaling
if (param$norm == "RNA") {
# Find variable features in RNA assay
= Seurat::FindVariableFeatures(sc, selection.method="vst", nfeatures=3000, verbose=FALSE)
sc
# Scale RNA assay
# Note: Removing cell cycle effects in scaled data can be very slow
= Seurat::ScaleData(sc, features=rownames(sc[[param$assay_raw]]), vars.to.regress=param$vars_to_regress, verbose=FALSE, assay=param$assay_raw)
sc
else if (param$norm == "SCT") {
} # Rerun SCTransform
= max(purrr::map_int(param$feature_filter, function(f) as.integer(f[["min_cells"]])))
min_cells_overall = suppressWarnings(SCTransform(sc,
sc assay=param$assay_raw,
vars.to.regress=param$vars_to_regress,
min_cells=min_cells_overall,
verbose=FALSE,
return.only.var.genes=FALSE,
method=ifelse(packages_installed("glmGamPoi"), "glmGamPoi", "poisson")))
}
# Add feature metadata
for (a in Seurat::Assays(sc)) {
if (a %in% names(sc_feature_metadata)) {
= Seurat::AddMetaData(sc[[a]], sc_feature_metadata[[a]][rownames(sc[[a]]),, drop=FALSE])
sc[[a]]
}
}
# Fix cell metadata factors
for (f in names(sc_cell_metadata_factor_levels)) {
= factor(sc[[f, drop=TRUE]], levels=sc_cell_metadata_factor_levels[[f]])
sc[[f]]
}
# Add sample/dataset colours again to misc slot
= ScAddLists(sc, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")
sc
message("Data values for all samples have been merged. This means that data values have been concatenated, not integrated.")
print(sc)
}
× (Warning)
Warning in CCScoring(sc = sc, genes_s = genes_s[, 2], genes_g2m = genes_g2m[, :
There are not enough G2/M and S phase markers in the dataset to reliably
determine cell cycle scores and phases. Scores and phases will be set to NA and
removal of cell cycle effects is skipped.
× (Message)
Data values for all samples have been merged. This means that data values have been concatenated, not integrated.
## An object of class Seurat
## 2092 features across 416 samples within 1 assay
## Active assay: RNA (2092 features, 2092 variable features)
= 100
n_cells_rle_plot
# Sample at most 100 cells per dataset and save their identity
= sc[["orig.ident"]] %>% tibble::rownames_to_column() %>%
cells_subset ::group_by(orig.ident) %>%
dplyr::sample_n(size=min(n_cells_rle_plot, length(orig.ident))) %>%
dplyr::select(rowname, orig.ident) dplyr
To better understand the efficiency of the applied normalisation procedures, we plot the relative log expression of genes in at most 100 randomly selected cells per sample before and after normalisation. This type of plot reveals unwanted variation in your data. The concept is taken from Gandolfo and Speed (2018). In brief, we remove variation between genes, leaving only variation between samples. If expression levels of most genes are similar in all cell types, sample heterogeneity is a sign of unwanted variation.
For each gene, we calculate its median expression across all cells, and then calculate the deviation from this median for each cell. For each cell, we plot the median expression (black), the interquartile range (lightgrey), whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers (#FAFD7CFF, #82491EFF)
# Plot raw data
= PlotRLE(as.matrix(log2(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=param$assay_raw, slot="counts") + 1)),
p id=cells_subset$orig.ident,
col=param$col_samples) +
labs(title="log2(raw counts + 1)")
p
Dependent on the context, this tab refers to different data:
# Plot normalised data
= PlotRLE(as.matrix(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=ifelse(param$norm=="SCT", param$norm, param$assay_raw), slot="data")),
p id=cells_subset$orig.ident,
col=param$col_samples) +
labs(title="Normalised data")
p
if (! param$integrate_samples[["method"]] %in% c("single", "merge")) {
# Plot integrated data
= PlotRLE(as.matrix(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=paste0(param$norm, "integrated"), slot="data")),
p id=cells_subset$orig.ident,
col=param$col_samples) +
labs(title="Integrated data")
pelse {
} message("No integrated data available.")
}
× (Message)
No integrated data available.
A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes, since expression profiles of different genes are correlated if they are involved in the same biological process. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarize a dataset, and to visualize a dataset.
We use Principal Component Analysis (PCA) to summarize a dataset, overcoming noise and reducing the data to its essential components. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualize the dataset, placing similar cells together in 2D space, see below.Principal Component Analysis is a way to summarize a dataset and to reduce noise by averaging similar gene expression profiles. The information for correlated genes is compressed into single dimensions called principal components (PCs) and the analysis identifies those dimensions that capture the largest amount of variation. This process gives a more precise representation of the patterns inherent to complex and large datasets.
In a PCA, the first PC captures the greatest variance across the whole dataset. The next PC captures the greatest remaining amount of variance, and so on. This way, the top PCs are likely to represent the biological signal where multiple genes are affected by the same biological processes in a coordinated way. In contrast, random technical or biological noise that affects each gene independently are contained in later PCs. Downstream analyses can be restricted to the top PCs to focus on the most relevant biological signal and to reduce noise and unnecessary computational overhead.To decide how many PCs to include in downstream analyses, we visualise the cells and genes that define the PCA.
# Run PCA for default normalisation
= Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), verbose=FALSE, npcs=min(50, ncol(sc))) sc
= Seurat::VizDimLoadings(sc, dims=1:2, reduction="pca", col=param$col, combine=FALSE, balanced=TRUE)
p_list for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
= patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Top gene loadings of the first two PCs")
p p
= Seurat::DimPlot(sc, reduction="pca", cols=param$col_samples) +
p AddStyle(title="Cells arranged by the first two PCs", legend_position="bottom")
p
= Seurat::DimHeatmap(sc, dims=1:min(20, ncol(sc)), cells=min(500, ncol(sc)), balanced=TRUE, fast=FALSE, combine=FALSE)
p_list = purrr::map(seq(p_list), function(i) {
p_list = p_list[[i]] +
p_list[[i]] ggtitle(paste("PC", i)) +
theme(legend.position="none", axis.text.y=element_text(size=8))
return(p_list[[i]])
})= patchwork::wrap_plots(p_list, ncol=3) + patchwork::plot_annotation("Top gene loadings of the first PCs")
p p
# More approximate technique used to reduce computation time
= Seurat::ElbowPlot(sc, ndims=min(20, ncol(sc))) +
p geom_vline(xintercept=param$pc_n + .5, col="firebrick", lty=2) +
AddStyle(title="Elbow plot")
p
# Cannot have more PCs than number of cells
$pc_n = min(param$pc_n, ncol(sc)) param
For the current dataset, 10 PCs were chosen.
At this point, we would like to define subpopulations of cells with similar gene expression profiles using unsupervised clustering. Clusters ultimately serve as approximations for biological objects like cell types or cell states.
During the first step of clustering, a K-nearest neighbor (KNN) graph is constructed. In simplified terms this means that cells are connected to their K nearest neighbors based on cell-to-cell expression similarity using the PCs chosen in the previous step. The higher the similarity is, the higher the edge weight becomes. During the second step, the graph is partitioned into highly interconnected communities, whereby each community represents a cluster of cells with similar expression profiles. The separation of the graph into clusters is dependent on the chosen resolution. For scRNA-seq datasets of around 3000 cells, it is recommended to use a resolution value between 0.4 and 1.2. This value can be set even higher for larger datasets. Note that the choice of PCs and cluster resolution is an arbitrary one. Therefore, it is highly recommended to evaluate clusters and re-run the workflow with adapted parameters if needed.We use the non-linear dimensional reduction technique UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see Unknown (2022b).
# Construct phylogenetic tree relating the "average" cell from each sample
if (length(levels(sc$orig.ident)) > 1) {
= suppressWarnings(Seurat::BuildClusterTree(sc, features=rownames(sc), verbose=FALSE))
sc ::Misc(sc, "trees") = list(orig.ident = Seurat::Tool(sc, "Seurat::BuildClusterTree"))
Seurat
}
# The number of clusters can be optimized by tuning "resolution" -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
= Seurat::FindNeighbors(sc, dims=1:param$pc_n, verbose=FALSE) sc
# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
for (i in param$cluster_resolution_test) {
= Seurat::FindClusters(sc, resolution=i, algorithm=4, verbose=FALSE, method="igraph")
sc
# Construct phylogenetic tree relating the "average" cell from each cluster
if (length(levels(sc$seurat_clusters)) > 1) {
= suppressWarnings(Seurat::BuildClusterTree(sc, dims=1:param$pc_n, verbose=FALSE))
sc suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), list(seurat_clusters = Seurat::Tool(sc, "Seurat::BuildClusterTree")))})
}
# Set up colors for clusters and add the sc object
$col_clusters = GenerateColours(num_colours=length(levels(sc$seurat_clusters)), names=levels(sc$seurat_clusters),
parampalette=param$col_palette_clusters, alphas=1)
= ScAddLists(sc, lists=list(seurat_clusters=param$col_clusters), lists_slot="colour_lists")
sc
# Default UMAP
= suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot"))
sc
# Make a UMAP that is coloured by cluster
= table(sc@active.ident)
cluster_cells = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
cluster_labels = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size) +
p scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title=paste0("Cluster resolution: ",i), legend_position="bottom", legend_title="Clusters") +
FontSize(x.text = 10, y.text = 10, x.title = 13, y.title = 13, main = 15)
= LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white")
p
if (i == param$cluster_resolution_test[1]) {
= p
p_list else {
} = p_list + p
p_list
}
}
# p = p_list + patchwork::plot_annotation(title="UMAP, cells coloured by cluster identity;")
= patchwork::wrap_plots(p_list) + patchwork::plot_annotation(title="UMAP, cells coloured by cluster identity;", theme = theme(plot.title = element_text(size = 15)) )
p p
# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
= Seurat::FindClusters(sc, resolution=param$cluster_resolution, algorithm=4, verbose=FALSE, method="igraph")
sc
# Construct phylogenetic tree relating the "average" cell from each cluster
if (length(levels(sc$seurat_clusters)) > 1) {
= suppressWarnings(Seurat::BuildClusterTree(sc, dims=1:param$pc_n, verbose=FALSE))
sc suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), list(seurat_clusters = Seurat::Tool(sc, "Seurat::BuildClusterTree")))})
}
# Set up colors for clusters and add the sc object
$col_clusters = GenerateColours(num_colours=length(levels(sc$seurat_clusters)), names=levels(sc$seurat_clusters),
parampalette=param$col_palette_clusters, alphas=1)
= ScAddLists(sc, lists=list(seurat_clusters=param$col_clusters), lists_slot="colour_lists") sc
# Default UMAP
= suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot")) sc
# Make a UMAP that is coloured by cluster
= table(sc@active.ident)
cluster_cells = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
cluster_labels = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size) +
p scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title="UMAP, cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters")
= LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white")
p p
# Plot all samples separately
= Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size, split.by = "orig.ident", ncol = 2) +
p scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title="UMAP, cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters")
= LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white")
p p
# Make a UMAP that is coloured by sample of origin
= Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident", pt.size=param$pt_size, cols=param$col_samples) +
p AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom")
$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p= LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white")
p p
# Plot all samples separately
= Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident", pt.size=param$pt_size, cols=param$col_samples, split.by = "orig.ident", ncol = 2) +
p AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom")
$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p= LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white")
p p
# Count cells per cluster per sample
= sc[[]] %>% dplyr::pull(orig.ident) %>% levels()
cell_samples = sc[[]] %>% dplyr::pull(seurat_clusters) %>% levels()
cell_clusters
= dplyr::count(sc[[c("orig.ident", "seurat_clusters")]], orig.ident, seurat_clusters) %>% tidyr::pivot_wider(names_from="seurat_clusters", names_prefix="Cl_", values_from=n, values_fill=0) %>% as.data.frame()
tbl rownames(tbl) = paste0(tbl[,"orig.ident"],"_n")
"orig.ident"] = NULL
tbl[,
# Add percentages
= round(t(tbl) / colSums(tbl) * 100, 2) %>% t()
tbl_perc rownames(tbl_perc) = gsub(rownames(tbl_perc), pattern="_n$", replacement="_perc", perl=TRUE)
= rbind(tbl, tbl_perc)
tbl
# Add enrichment
if (length(cell_samples) > 1 & length(cell_clusters) > 1) tbl = rbind(tbl, CellsFisher(sc))
# Sort
= tbl[order(rownames(tbl)),, drop=FALSE]
tbl
# Plot percentages
= tbl[paste0(cell_samples, "_perc"), , drop=FALSE] %>%
tbl_bar ::rownames_to_column(var="Sample") %>%
tibble::pivot_longer(tidyr::starts_with("Cl"), names_to="Cluster", values_to="Percentage")
tidyr$Cluster = tbl_bar$Cluster %>% gsub(pattern="^Cl_", replacement="", perl=TRUE) %>% factor(levels=sc$seurat_clusters %>% levels())
tbl_bar$Sample = tbl_bar$Sample %>% gsub(pattern="_perc$", replacement="", perl=TRUE) %>% as.factor()
tbl_bar$Percentage = as.numeric(tbl_bar$Percentage)
tbl_bar= ggplot(tbl_bar, aes(x=Cluster, y=Percentage, fill=Sample)) +
p geom_bar(stat="identity" ) +
AddStyle(title="Percentage cells of samples in clusters",
fill=param$col_samples,
legend_title="Sample",
legend_position="bottom")
p
The following table shows the number of cells per sample and cluster:
In case the dataset contains 2 or more samples, we also calculate whether or not the number of cells of a sample in a cluster is significantly higher or lower than expected:
# Print table
::kable(tbl, align="l", caption="Number of cells per sample and cluster") %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%") kableExtra
Cl_1 | Cl_2 | Cl_3 | Cl_4 | |
---|---|---|---|---|
sample1_n | 78 | 62 | 35 | 30 |
sample1_perc | 54.55 | 49.21 | 46.05 | 42.25 |
sample1.oddsRatio | 1.38 | 1 | 0.85 | 0.71 |
sample1.p | 7.3e-02 | 5.5e-01 | 7.7e-01 | 9.2e-01 |
sample2_n | 65 | 64 | 41 | 41 |
sample2_perc | 45.45 | 50.79 | 53.95 | 57.75 |
sample2.oddsRatio | 0.73 | 1 | 1.17 | 1.41 |
sample2.p | 9.5e-01 | 5.3e-01 | 3.1e-01 | 1.2e-01 |
# Reset default assay, so we won't plot integrated data
# Note: We need integrated data for UMAP, clusters
DefaultAssay(sc) = ifelse(param$norm=="SCT", param$norm, param$assay_raw)
How much do gene expression profiles in the dataset reflect the cell cycle phases the single cells were in? After initial normalisation, we determined the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. (2016), and human gene symbols are translated to gene symbols of the species of interest using biomaRt. This section of the report visualises the above calculated cell cycle scores.
# Set up colours for cell cycle effect and add to sc object
= GenerateColours(num_colours=length(levels(sc$Phase)), names=levels(sc$Phase), palette="ggsci::pal_npg", alphas=1)
col = ScAddLists(sc, lists=list(Phase=col), lists_slot="colour_lists")
sc
# Get a feeling for how many cells are affected
= ggplot(sc[[]], aes(x=S.Score, y=G2M.Score, colour=Phase)) +
p1 geom_point() +
scale_x_continuous("G1/S score") +
scale_y_continuous("G2/M score") +
AddStyle(col=Misc(sc, "colour_lists")[["Phase"]])
= ggplot(sc@meta.data %>%
p2 ::group_by(seurat_clusters, Phase) %>%
dplyr::summarise(num_cells=length(Phase)),
dplyraes(x=seurat_clusters, y=num_cells, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_x_discrete("Seurat clusters") +
scale_y_continuous("Fraction of cells") +
AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]])
= ggplot(sc[[]] %>%
p3 ::group_by(orig.ident, Phase) %>%
dplyr::summarise(num_cells=length(Phase)),
dplyraes(x=orig.ident, y=num_cells, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_y_continuous("Fraction of cells") +
AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) + xlab("")
= p1 + p2 + p3 & theme(legend.position="bottom")
p = p + patchwork::plot_annotation(title="Cell cycle phases") + plot_layout(guides="collect")
p p
× (Warning) Removed 416 rows containing missing values (geom_point).
if (any(!is.na(sc$Phase))) {
# UMAP with phases superimposed
# Note: This is a hack to colour by phase but label by Cluster
= Seurat::DimPlot(sc, group.by="Phase", pt.size=1, cols=Misc(sc, "colour_lists")[["Phase"]]) +
p AddStyle(title="UMAP, cells coloured by cell cycle phases", legend_title="Phase")
$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p= LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white")
p
p }
if (any(!is.na(sc$Phase))) {
= Seurat::FeaturePlot(sc, features="S.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
p AddStyle(title="UMAP, cells coloured by S phase")
= LabelClusters(p, id="ident", box=TRUE, fill="white")
p
p }
if (any(!is.na(sc$Phase))) {
= Seurat::FeaturePlot(sc, features="G2M.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
p AddStyle(title="UMAP, cells coloured by G2M phase")
= LabelClusters(p, id="ident", box=TRUE, fill="white")
p
p }
if (any(!is.na(sc$Phase))) {
= Seurat::FeaturePlot(sc, features="CC.Difference", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
p AddStyle(title="UMAP, cells coloured by CC.Difference")
= LabelClusters(p, id="ident", box=TRUE, fill="white")
p
p }
Do cells in individual clusters have particularly high counts, detected genes or mitochondrial content?
= paste0("nCount_", param$assay_raw)
qc_feature = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature) +
p1 AddStyle(title="Feature plot") +
scale_colour_gradient(low="lightgrey", high=param$col, trans="log10"))
= LabelClusters(p1, id="ident", box=FALSE)
p1
= ggplot(sc[[]], aes_string(x="seurat_clusters", y=qc_feature, fill="seurat_clusters", group="seurat_clusters")) +
p2 geom_violin(scale="width") +
AddStyle(title="Violin plot (log10 scale)", fill=param$col_clusters,
xlab="Cluster", legend_position="none") +
scale_y_log10()
= p1 | p2
p = p + patchwork::plot_annotation(title=paste0("Summed raw counts (", qc_feature, ", log10 scale)"))
p p
= paste0("nFeature_", param$assay_raw)
qc_feature = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature) +
p1 AddStyle(title="Feature plot") +
scale_colour_gradient(low="lightgrey", high=param$col, trans="log10"))
= LabelClusters(p1, id="ident", box=FALSE)
p1
= ggplot(sc[[]], aes_string(x="seurat_clusters", y=qc_feature, fill="seurat_clusters", group="seurat_clusters")) +
p2 geom_violin(scale="width") +
AddStyle(title="Violin plot", fill=param$col_clusters,
xlab="Cluster", legend_position="none") +
scale_y_log10()
= p1 | p2
p = p + patchwork::plot_annotation(title=paste0("Number of features with raw count > 0 (", qc_feature, ", log10 scale)"))
p p
= Seurat::FeaturePlot(sc, features="percent_mt", cols=c("lightgrey", param$col)) +
p1 AddStyle(title="Feature plot")
= LabelClusters(p1, id="ident", box=FALSE)
p1
= ggplot(sc[[]], aes(x=seurat_clusters, y=percent_mt, fill=seurat_clusters, group=seurat_clusters)) +
p2 geom_violin(scale="width") +
AddStyle(title="Violin plot", fill=param$col_clusters,
xlab="Cluster", legend_position="none")
= p1 | p2
p = p + patchwork::plot_annotation(title="Percent of mitochondrial features (percent_mt)")
p p
= Seurat::FeaturePlot(sc, features="percent_ribo", cols=c("lightgrey", param$col)) +
p1 AddStyle(title="Feature plot")
= LabelClusters(p1, id="ident", box=FALSE)
p1
= ggplot(sc[[]], aes(x=seurat_clusters, y=percent_ribo, fill=seurat_clusters, group=seurat_clusters)) +
p2 geom_violin(scale="width") +
AddStyle(title="Violin plot", fill=param$col_clusters,
xlab="Cluster", legend_position="none")
= p1 | p2
p = p + patchwork::plot_annotation(title="Percent of ribosomal features (percent_ribo)")
p p
Do cells in individual clusters express provided known marker genes?
=c()
known_markers_list
# Overwrite empty list of known markers
if (!is.null(param$file_known_markers)) {
# Read known marker genes and map to rownames
= openxlsx::read.xlsx(param$file_known_markers)
known_markers = lapply(colnames(known_markers), function(x) {
known_markers_list = ensembl_to_seurat_rowname[known_markers[,x]] %>%
y na.exclude() %>% unique() %>% sort()
= !y %in% rownames(sc)
m if (any(m)){
Warning(paste0("The following genes of marker list '", x, "' cannot be found in the data: ", first_n_elements_to_string(y[m], n=10)))
}return(y[!m])
})
# Remove empty lists
names(known_markers_list) = colnames(known_markers)
= purrr::map_int(known_markers_list, .f=length) == 0
is_empty = known_markers_list[!is_empty]
known_markers_list
# Add lists to sc object
= ScAddLists(sc, lists=setNames(known_markers_list, paste0("known_marker_", names(known_markers_list))), lists_slot="gene_lists")
sc
}
# Set plot options
if(length(known_markers_list) > 0) {
= length(known_markers_list)
known_markers_n = unlist(known_markers_list) %>% unique() %>% sort()
known_markers_vect = sapply(seq(known_markers_list), function(x) length(known_markers_list[[x]]) <= 50)
idx_dotplot = sapply(seq(known_markers_list), function(x) length(known_markers_list[[x]]) >= 10)
idx_avgplot else {
} =0
known_markers_n= idx_avgplot = FALSE
idx_dotplot = c()
known_markers_vect }
# Dotplots and average feature plots
# The height of 1 row (= 1 plot) is fixed to 5
= max(5, 5 * sum(idx_dotplot))
fig_height_knownMarkers_dotplot = max(5, 5 * sum(idx_avgplot))
fig_height_knownMarkers_avgplot
# Individual feature plots
# Each row contains 2 plots
# We fix the height of each plot to the same height as is used later for DEGs
= max(2, 0.3 * length(levels(sc$seurat_clusters)))
height_per_row = ceiling(length(known_markers_vect)/2)
nr_rows = max(5, height_per_row * nr_rows) fig_height_knownMarkers_vect
You provided 0 list(s) of known marker genes. In the following tabs, you find:
A dot plot visualises how gene expression changes across different clusters. The size of a dot encodes the percentage of cells in a cluster that expresses the gene, while the color encodes the scaled average expression across all cells within the cluster. Per gene (column), we group cells based on cluster identity (rows), calculate average expression per cluster, subtract the mean of average expression values and divide by the standard deviation. The resulting scores describe how high or low a gene is expressed in a cluster compared to all other clusters.
if ((known_markers_n > 0) & any(idx_dotplot)) {
= known_markers_list[idx_dotplot]
known_markers_dotplot = list()
p_list for (i in seq(known_markers_dotplot)) {
= known_markers_dotplot[[i]]
g = g[length(g):1]
g = suppressMessages(
p_list[[i]] ::DotPlot(sc, features=g) +
Seuratscale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
AddStyle(title=paste("Known marker genes:", names(known_markers_dotplot)[i]), ylab="Cluster") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) +
lims(size=c(0,100))
)
}= patchwork::wrap_plots(p_list, ncol=1)
p
pelse if ((known_markers_n > 0) & !any(idx_dotplot)) {
} message("This tab is used for dot plots for up to 50 genes. All provided lists are longer than this, and hence dot plots are skipped.")
else {
} message("No known marker genes were provided and hence dot plots are skipped.")
}
× (Message)
No known marker genes were provided and hence dot plots are skipped.
An average feature plot visualises the average gene expression of each gene list on a single-cell level, subtracted by the aggregated expression of control feature sets. The color of the plot encodes the calculated scores, whereat positive scores suggest that genes are expressed more highly than expected.
if ((known_markers_n > 0) & any(idx_avgplot)) {
= known_markers_list[idx_avgplot]
known_markers_avgplot = Seurat::AddModuleScore(sc, features=known_markers_avgplot, ctrl=10, name="known_markers")
sc = grep("^known_markers[0-9]+$", colnames(sc@meta.data), perl=TRUE)
idx_replace_names colnames(sc@meta.data)[idx_replace_names] = names(known_markers_avgplot)
= Seurat::FeaturePlot(sc, features=names(known_markers_avgplot), cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
p_list for (i in seq(known_markers_avgplot)) {
= p_list[[i]] + AddStyle(title=paste("Known marker genes:", names(known_markers_avgplot)[i]))
p_list[[i]]
}= patchwork::wrap_plots(p_list, ncol=1)
p print(p)
else if ((known_markers_n > 0) & !any(idx_avgplot)) {
} message("This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.")
else {
} message("No known marker genes were provided and hence average feature plots are skipped.")
}
× (Message)
No known marker genes were provided and hence average feature plots are skipped.
An individual feature plot colours single cells on the UMAP according to their normalised gene expression.
if ((known_markers_n > 0) & length(known_markers_vect) <= 100) {
= Seurat::FeaturePlot(sc, features=known_markers_vect, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
p_list for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
= patchwork::wrap_plots(p_list, ncol=2)
p print(p)
else if (length(known_markers_vect) > 100) {
} message("This tab is used to plot up to 100 known marker genes. Your provided list is longer than this, and hence individual feature plots are skipped.")
else {
} message("No known marker genes were provided and hence individual feature plots are skipped.")
}
× (Message)
No known marker genes were provided and hence individual feature plots are skipped.
As described above, cell clusters approximate cell types and states. But how do we know which cell types these are? To characterize cell clusters, we identify marker genes. Good marker genes are genes that are particularly expressed in one cluster, and existing knowledge of these marker genes can be used to extrapolate biological functions for the cluster. A good clustering of cells typically results in good marker genes. Hence, if you cannot find good marker genes you may need to go back to the start of the workflow and adapt your parameters. Note that we also determine genes that are particularly down-regulated in one cluster, even if these are not marker genes in the classical sense.
Good marker genes are highly and possibly even only expressed in one cluster as compared to all other clusters. However, sometimes marker genes are also expressed in other clusters, or are declared as marker genes in these clusters, for example cell lineage markers that are shared by different cell subtypes. To evaluate marker genes, it is essential to visualize their expression patterns.
In addition to detecting marker genes, it might be informative to detect genes that are differentially expressed between one specific cluster and one or several other clusters. This approach allows a more specific distinction of individual clusters and investigation of more subtle differences, see the section “Differentially expressed genes” below.# Find DEGs for every cluster compared to all remaining cells, report positive (=markers) and negative ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers
# Review recommends using "MAST"; Mathias uses "LR"
# ALWAYS USE: assay="RNA"/"Spatial" or assay="SCT"
# DONT USE: assay=integrated datasets; this data is normalised and contains only 2k genes
# Note: By default, the function uses slot="data". Mast requires log data, so this is the correct way to do it.
# https://www.bioconductor.org/packages/release/bioc/vignettes/MAST/inst/doc/MAST-interoperability.html
= suppressMessages(Seurat::FindAllMarkers(sc, assay=param$assay_raw, test.use="MAST",
markers only.pos=FALSE, min.pct=param$marker_pct, logfc.threshold=param$marker_log2FC,
latent.vars=param$latent_vars, verbose=FALSE, silent=TRUE))
# If no markers were found, initialise the degs table so that further downstream (export) chunks run
if (ncol(markers)==0) markers = DegsEmptyMarkerResultsTable(levels(sc$seurat_clusters))
# For Seurat versions until 3.2, log fold change is based on the natural log. Convert to log base 2.
if ("avg_logFC" %in% colnames(markers) & !"avg_log2FC" %in% colnames(markers)) {
= grep("avg_log\\S*FC", colnames(markers))
lfc_idx = marker_deg_results[,lfc_idx] / log(2)
markers[,lfc_idx] = colnames(markers)
col_nms 2] = "avg_log2FC"
col_nms[colnames(markers) = col_nms
}
# Sort markers
= markers %>% DegsSort(group=c("cluster"))
markers
# Filter markers
= DegsFilter(markers, cut_log2FC=param$marker_log2FC, cut_padj=param$marker_padj)
markers_filt = nrow(markers_filt$all)>0
markers_found
# Add average data to table
= cbind(markers_filt$all, DegsAvgDataPerIdentity(sc, genes=markers_filt$all$gene, assay=param$assay_raw))
markers_out
# Split by cluster and write to file
= data.frame(Column=c("cluster",
additional_readme "p_val_adj_score",
"avg_<assay>_<slot>_id<cluster>"),
Description=c("Cluster",
"Score calculated as follows: -log10(p_val_adj)*sign(avg_log2FC)",
"Average expression value for cluster; <assay>: RNA or SCT; <slot>: raw counts or normalised data"))
invisible(DegsWriteToFile(split(markers_out, markers_out$cluster),
annot_ensembl=annot_ensembl,
gene_to_ensembl=seurat_rowname_to_ensembl,
additional_readme=additional_readme,
file=file.path(param$path_out, "marker_degs", "markers_cluster_vs_rest.xlsx")))
# Plot number of differentially expressed genes
= DegsPlotNumbers(markers_filt$all,
p group="cluster",
title=paste0("Number of DEGs, comparing each cluster to the rest\n(FC=", 2^param$marker_log2FC, ", adj. p-value=", param$marker_padj, ")"))
# Add marker table to seurat object
::Misc(sc, "markers") = list(condition_column="seurat_clusters", test="MAST", padj=param$marker_padj,
Seuratlog2FC=param$marker_log2FC, min_pct=param$marker_pct, assay=param$assay_raw, slot="data",
latent_vars=param$latent_vars,
results=markers_filt$all)
# Add marker lists to seurat object
= split(markers_filt$up$gene, markers_filt$up$cluster)
marker_genesets_up names(marker_genesets_up) = paste0("markers_up_cluster", names(marker_genesets_up))
= split(markers_filt$down$gene, markers_filt$down$cluster)
marker_genesets_down names(marker_genesets_down) = paste0("markers_down_cluster", names(marker_genesets_down))
= ScAddLists(sc, lists=c(marker_genesets_up, marker_genesets_down), lists_slot="gene_lists")
sc
if (markers_found) {
pelse {
} warning("No differentially expressed genes (cluster vs rest) found. The following related code is not executed, no related plots and tables are generated.")
}
We use the term “marker genes” to specifically describe genes that are up-regulated in cells of one cluster compared to the rest.
if (markers_found) {
= DegsUpDisplayTop(markers_filt$up, n=5)
markers_top
# Add labels
$labels = paste0(markers_top$cluster, ": ", markers_top$gene)
markers_top
# Show table
::kable(markers_top %>% dplyr::select(-labels), align="l", caption="Up to top 5 marker genes per cell cluster") %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="700px")
kableExtra }
cluster | gene | avg_log2FC | p_val | p_val_adj | pct.1 | pct.2 |
---|---|---|---|---|---|---|
1 | CTSS | 4.604 | 3.4e-138 | 7.1e-135 | 0.993 | 0.524 |
1 | CPVL | 5.251 | 3.5e-101 | 7.3e-98 | 0.958 | 0.007 |
1 | TYROBP | 3.526 | 5.6e-80 | 1.2e-76 | 0.972 | 0.139 |
1 | SRGN | 2.659 | 7.6e-74 | 1.6e-70 | 1.000 | 0.531 |
1 | LMO2 | 3.429 | 2.1e-73 | 4.3e-70 | 0.909 | 0.059 |
2 | CD3E | 1.778 | 2.1e-48 | 4.4e-45 | 0.976 | 0.252 |
2 | TRABD2A | 2.929 | 1.4e-37 | 2.9e-34 | 0.627 | 0.066 |
2 | MAL | 2.984 | 3.4e-37 | 7.1e-34 | 0.571 | 0.024 |
2 | LCK | 1.400 | 5.5e-31 | 1.2e-27 | 0.881 | 0.279 |
2 | RCAN3 | 2.255 | 1.8e-27 | 3.8e-24 | 0.667 | 0.197 |
3 | IGHM.1 | 7.438 | 2.5e-69 | 5.2e-66 | 0.882 | 0.021 |
3 | CD22 | 4.501 | 2.4e-59 | 5.1e-56 | 0.829 | 0.015 |
3 | TNFRSF13C | 3.784 | 2.4e-41 | 4.9e-38 | 0.684 | 0.021 |
3 | EAF2 | 2.855 | 9.9e-31 | 2.1e-27 | 0.658 | 0.165 |
3 | HLA-DPA1.2 | 1.711 | 5.8e-30 | 1.2e-26 | 1.000 | 0.509 |
4 | HCST | 2.004 | 3.3e-37 | 6.9e-34 | 0.986 | 0.693 |
4 | GNLY | 7.570 | 6.5e-35 | 1.4e-31 | 0.577 | 0.023 |
4 | SAMD3 | 2.781 | 2.7e-25 | 5.7e-22 | 0.662 | 0.110 |
4 | ABHD17A | 1.748 | 2.2e-22 | 4.6e-19 | 0.873 | 0.528 |
4 | EOMES | 2.773 | 1.4e-21 | 2.9e-18 | 0.394 | 0.006 |
The following plots visualise the top marker genes for each cluster, respectively. Clear marker genes indicate good clusters that represent cell types.
# Note: We need to run this chunk as it specifies a variable that is used in chunk definitions below
if (markers_found) {
# Feature plots and violin plots: each row contains 5 plots
# Row height is not dependent on the number of clusters
# The plot has 5 columns and 1 row per cluster, hence the layout works nicely if we find
# at least 5 markers per cluster
= ceiling(nrow(markers_top)/5)
nr_rows_5cols = nr_rows_5cols * 3
fig_height_5cols
# Dotplots: each row contains 2 plots
# Row height is dependent on the number of clusters
= ceiling(length(levels(sc$seurat_clusters))/2)
nr_rows_dp_2cols = nr_rows_dp_2cols * max(4, 0.5 * length(levels(sc$seurat_clusters)))
fig_height_dp_2cols
else {
} = fig_height_dp_2cols = 7
fig_height_5cols }
if (markers_found) {
# Plot each marker one by one, and then combine them all at the end
= list()
p_list for (i in 1:nrow(markers_top)) {
= Seurat::FeaturePlot(sc, features=markers_top$gene[i],
p_list[[i]] cols=c("lightgrey", param$col_clusters[markers_top$cluster[i]]),
combine=TRUE, label=TRUE) +
AddStyle(title=markers_top$labels[i],
xlab="", ylab="",
legend_position="bottom")
}
# Combine all plots
= patchwork::wrap_plots(p_list, ncol=5) +
p ::plot_annotation(title="UMAP, cells coloured by normalised gene expression data, top marker genes per cluster")
patchwork
p }
if (markers_found) {
# Plot violin plots per marker gene, and combine it all at the end
# This layout works out nicely if there are 5 marker genes per cluster
= list()
p_list for(i in 1:nrow(markers_top)) {
= Seurat::VlnPlot(sc, features=markers_top$gene[i], assay=param$assay_raw, pt.size=0, cols=param$col_clusters) +
p_list[[i]] AddStyle(title=markers_top$labels[i], xlab="")
}= patchwork::wrap_plots(p_list, ncol=5) +
p ::plot_annotation(title="Violin plot of for normalised gene expression data, top marker genes per cluster") & theme(legend.position="none")
patchwork
p }
if (markers_found) {
# Visualises how feature expression changes across different clusters
# Plot dotplots per cluster, and combine it all at the end
= lapply(markers_top$cluster %>% sort() %>% unique(), function(cl) {
p_list = markers_top %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
genes = suppressMessages(Seurat::DotPlot(sc, features=genes) +
p scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
AddStyle(title=paste0("Top marker genes for cluster ", cl, " (scaled)"), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1)))
return(p)
})
= patchwork::wrap_plots(p_list, ncol=2)
p
p }
if (markers_found) {
# Visualises how feature expression changes across different clusters
# Plot dotplots per cluster, and combine it all at the end
= lapply(markers_top$cluster %>% sort() %>% unique(), function(cl) {
p_list = markers_top %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
genes = genes[length(genes):1]
genes = suppressMessages(DotPlotUpdated(sc, features=genes, scale=FALSE, cols=c("lightgrey", param$col)) +
p AddStyle(title=paste0("Top marker genes for cluster ", cl, " (not scaled)"), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1)))
return(p)
})
= patchwork::wrap_plots(p_list, ncol=2)
p
p }
If the dataset contains multiple samples, we can visualise the expression of a gene that is up-regulated in a cluster separately for each sample. For each cluster, we extract up-regulated genes, and visualise expression of these genes in all cells in that cluster, split by their sample of origin.
= max(5,
fig_height_degs_per_cl max(2, 0.3 * (sc$orig.ident %>% unique() %>% length())) * length(levels(sc$seurat_clusters)) * 2)
First, we plot scaled expression as explained above (see section Known marker genes). This plot allows us to judge whether the expression of a gene is increased in one sample as compared to the other samples.
if (markers_found) {
= list()
p_list = DegsUpDisplayTop(degs=markers_filt$up, n=50)
markers_filt_up_top for (cl in levels(sc$seurat_clusters)) {
= markers_filt_up_top %>%
markers_filt_up_cl_top ::filter(cluster==cl) %>%
dplyr::pull(gene)
dplyr
if (length(markers_filt_up_cl_top) > 0) {
= suppressMessages(Seurat::DotPlot(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident") +
p_list[[cl]] scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
AddStyle(title=paste0("Up to 50 markers (up-regulated genes) for cluster ", cl), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1)))
}
}= patchwork::wrap_plots(p_list, ncol=1) + patchwork::plot_annotation("Dotplot per cluster")
p
p }
Second, we plot normalised expression with no further scaling. This plot helps to get an impression of the total expression of a gene.
if (markers_found) {
= 50
n_genes_max_dotplot = list()
p_list for (cl in levels(sc$seurat_clusters)) {
= markers_filt$up %>%
markers_filt_up_cl_top ::filter(cluster==cl) %>%
dplyr::top_n(n=n_genes_max_dotplot, wt=p_val_adj_score) %>%
dplyr::pull(gene)
dplyrif (length(markers_filt_up_cl_top) > 0) {
= DotPlotUpdated(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident", scale=FALSE, cols=c("lightgrey", param$col)) +
p_list[[cl]] AddStyle(title=paste0("Up to ", n_genes_max_dotplot, " markers (up-regulated genes) for cluster ", cl, " (not scaled)"), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1))
}
}
= patchwork::wrap_plots(p_list, ncol=1) + patchwork::plot_annotation("Dotplot per cluster (not scaled)")
p
p }
if (markers_found) {
# This will sample 500 cells; the number of cells per seurat_cluster will be proportional
= ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
cells_subset
# Heatmap of all differentially expressed genes
= Seurat::DoHeatmap(sc, features=markers_filt$all$gene, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) +
p NoLegend() +
theme(axis.text.y=element_blank()) +
ggtitle("Heatmap of scaled gene expression data, all genes differentially expressed between a cluster and the rest")
p }
if (markers_found) {
# This will sample 500 cells; the number of cells per seurat_cluster will be proportional
= ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
cells_subset # With fig.height = 20, 300 features can be shown; distribute among clusters
= 300/length(levels(markers_filt$up$cluster))
features_per_group = markers_filt$up %>%
features_subset ::group_by(cluster) %>%
dplyr::top_n(n=features_per_group, wt=avg_log2FC) %>%
dplyr::arrange(cluster, -avg_log2FC) %>%
dplyr::pull(gene) %>%
dplyrunique()
# Heatmap of top up-regulated genes
= Seurat::DoHeatmap(sc, features=features_subset, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) +
p NoLegend() +
theme(axis.text.y=element_text(size=8)) +
ggtitle("Heatmap of scaled gene expression data, top genes up-regulated in a cluster compared to the rest")
p }
if (markers_found) {
# This will sample 500 cells; the number of cells per seurat_cluster will be proportional
= ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
cells_subset # With fig.height = 20, 300 features can be shown; distribute among clusters
= 300/length(levels(markers_filt$down$cluster))
features_per_group = markers_filt$down %>%
features_subset ::group_by(cluster) %>%
dplyr::top_n(n=features_per_group, wt=-avg_log2FC) %>%
dplyr::arrange(cluster, avg_log2FC) %>%
dplyr::pull(gene) %>%
dplyrunique()
# Heatmap of top down-regulated genes
= Seurat::DoHeatmap(sc, features=features_subset, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) +
p NoLegend() +
theme(axis.text.y=element_text(size=8)) +
ggtitle("Heatmap of scaled gene expression data, top genes up-regulated in a cluster compared to the rest")
p }
To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.
We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website (Unknown 2022c). You can choose to test functional enrichment from a wide range of databases:
# Set Enrichr database
::setEnrichrSite(param$enrichr_site) enrichR
× (Message)
Connection changed to https://maayanlab.cloud/Enrichr/
× (Message)
Connection is Live!
# List databases
= enrichR::listEnrichrDbs()
dbs_all ::kable(dbs_all, align="l", caption="Enrichr databases") %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="300px") kableExtra
geneCoverage | genesPerTerm | libraryName | link | numTerms | appyter | categoryId |
---|---|---|---|---|---|---|
13362 | 275 | Genome_Browser_PWMs | http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ | 615 | ea115789fcbf12797fd692cec6df0ab4dbc79c6a | 1 |
27884 | 1284 | TRANSFAC_and_JASPAR_PWMs | http://jaspar.genereg.net/html/DOWNLOAD/ | 326 | 7d42eb43a64a4e3b20d721fc7148f685b53b6b30 | 1 |
6002 | 77 | Transcription_Factor_PPIs | 290 | 849f222220618e2599d925b6b51868cf1dab3763 | 1 | |
47172 | 1370 | ChEA_2013 | http://amp.pharm.mssm.edu/lib/cheadownload.jsp | 353 | 7ebe772afb55b63b41b79dd8d06ea0fdd9fa2630 | 7 |
47107 | 509 | Drug_Perturbations_from_GEO_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 701 | ad270a6876534b7cb063e004289dcd4d3164f342 | 7 |
21493 | 3713 | ENCODE_TF_ChIP-seq_2014 | http://genome.ucsc.edu/ENCODE/downloads.html | 498 | 497787ebc418d308045efb63b8586f10c526af51 | 7 |
1295 | 18 | BioCarta_2013 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 249 | 4a293326037a5229aedb1ad7b2867283573d8bcd | 7 |
3185 | 73 | Reactome_2013 | http://www.reactome.org/download/index.html | 78 | b343994a1b68483b0122b08650201c9b313d5c66 | 7 |
2854 | 34 | WikiPathways_2013 | http://www.wikipathways.org/index.php/Download_Pathways | 199 | 5c307674c8b97e098f8399c92f451c0ff21cbf68 | 7 |
15057 | 300 | Disease_Signatures_from_GEO_up_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 | 248c4ed8ea28352795190214713c86a39fd7afab | 7 |
4128 | 48 | KEGG_2013 | http://www.kegg.jp/kegg/download/ | 200 | eb26f55d3904cb0ea471998b6a932a9bf65d8e50 | 7 |
34061 | 641 | TF-LOF_Expression_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 269 | 1 | |
7504 | 155 | TargetScan_microRNA | http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 | 222 | f4029bf6a62c91ab29401348e51df23b8c44c90f | 7 |
16399 | 247 | PPI_Hub_Proteins | http://amp.pharm.mssm.edu/X2K | 385 | 69c0cfe07d86f230a7ef01b365abcc7f6e52f138 | 2 |
12753 | 57 | GO_Molecular_Function_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 1136 | f531ac2b6acdf7587a54b79b465a5f4aab8f00f9 | 7 |
23726 | 127 | GeneSigDB | https://pubmed.ncbi.nlm.nih.gov/22110038/ | 2139 | 6d655e0aa3408a7accb3311fbda9b108681a8486 | 4 |
32740 | 85 | Chromosome_Location | http://software.broadinstitute.org/gsea/msigdb/index.jsp | 386 | 8dab0f96078977223646ff63eb6187e0813f1433 | 7 |
13373 | 258 | Human_Gene_Atlas | http://biogps.org/downloads/ | 84 | 0741451470203d7c40a06274442f25f74b345c9c | 5 |
19270 | 388 | Mouse_Gene_Atlas | http://biogps.org/downloads/ | 96 | 31191bfadded5f96983f93b2a113cf2110ff5ddb | 5 |
13236 | 82 | GO_Cellular_Component_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 641 | e1d004d5797cbd2363ef54b1c3b361adb68795c6 | 7 |
14264 | 58 | GO_Biological_Process_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 5192 | bf120b6e11242b1a64c80910d8e89f87e618e235 | 7 |
3096 | 31 | Human_Phenotype_Ontology | http://www.human-phenotype-ontology.org/ | 1779 | 17a138b0b70aa0e143fe63c14f82afb70bc3ed0a | 3 |
22288 | 4368 | Epigenomics_Roadmap_HM_ChIP-seq | http://www.roadmapepigenomics.org/ | 383 | e1bc8a398e9b21f9675fb11bef18087eda21b1bf | 1 |
4533 | 37 | KEA_2013 | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 474 | 462045609440fa1e628a75716b81a1baa5bd9145 | 7 |
10231 | 158 | NURSA_Human_Endogenous_Complexome | https://www.nursa.org/nursa/index.jsf | 1796 | 7d3566b12ebc23dd23d9ca9bb97650f826377b16 | 2 |
2741 | 5 | CORUM | http://mips.helmholtz-muenchen.de/genre/proj/corum/ | 1658 | d047f6ead7831b00566d5da7a3b027ed9196e104 | 2 |
5655 | 342 | SILAC_Phosphoproteomics | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 84 | 54dcd9438b33301deb219866e162b0f9da7e63a0 | 2 |
10406 | 715 | MGI_Mammalian_Phenotype_Level_3 | http://www.informatics.jax.org/ | 71 | c3bfc90796cfca8f60cba830642a728e23a53565 | 7 |
10493 | 200 | MGI_Mammalian_Phenotype_Level_4 | http://www.informatics.jax.org/ | 476 | 0b09a9a1aa0af4fc7ea22d34a9ae644d45864bd6 | 7 |
11251 | 100 | Old_CMAP_up | http://www.broadinstitute.org/cmap/ | 6100 | 9041f90cccbc18479138330228b336265e09021c | 4 |
8695 | 100 | Old_CMAP_down | http://www.broadinstitute.org/cmap/ | 6100 | ebc0d905b3b3142f936d400c5f2a4ff926c81c37 | 4 |
1759 | 25 | OMIM_Disease | http://www.omim.org/downloads | 90 | cb2b92578a91e023d0498a334923ee84add34eca | 4 |
2178 | 89 | OMIM_Expanded | http://www.omim.org/downloads | 187 | 27eca242904d8e12a38cf8881395bc50d57a03e1 | 4 |
851 | 15 | VirusMINT | http://mint.bio.uniroma2.it/download.html | 85 | 5abad1fc36216222b0420cadcd9be805a0dda63e | 4 |
10061 | 106 | MSigDB_Computational | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 858 | e4cdcc7e259788fdf9b25586cce3403255637064 | 4 |
11250 | 166 | MSigDB_Oncogenic_Signatures | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 189 | c76f5319c33c4833c71db86a30d7e33cd63ff8cf | 4 |
15406 | 300 | Disease_Signatures_from_GEO_down_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 | aabdf7017ae55ae75a004270924bcd336653b986 | 7 |
17711 | 300 | Virus_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 323 | 45268b7fc680d05dd9a29743c2f2b2840a7620bf | 4 |
17576 | 300 | Virus_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 323 | 5f531580ccd168ee4acc18b02c6bdf8200e19d08 | 4 |
15797 | 176 | Cancer_Cell_Line_Encyclopedia | https://portals.broadinstitute.org/ccle/home | 967 | eb38dbc3fb20adafa9d6f9f0b0e36f378e75284f | 5 |
12232 | 343 | NCI-60_Cancer_Cell_Lines | http://biogps.org/downloads/ | 93 | 75c81676d8d6d99d262c9660edc024b78cfb07c9 | 5 |
13572 | 301 | Tissue_Protein_Expression_from_ProteomicsDB | https://www.proteomicsdb.org/ | 207 | 7 | |
6454 | 301 | Tissue_Protein_Expression_from_Human_Proteome_Map | http://www.humanproteomemap.org/index.php | 30 | 49351dc989f9e6ca97c55f8aca7778aa3bfb84b9 | 5 |
3723 | 47 | HMDB_Metabolites | http://www.hmdb.ca/downloads | 3906 | 1905132115d22e4119bce543bdacaab074edb363 | 6 |
7588 | 35 | Pfam_InterPro_Domains | ftp://ftp.ebi.ac.uk/pub/databases/interpro/ | 311 | e2b4912cfb799b70d87977808c54501544e4cdc9 | 6 |
7682 | 78 | GO_Biological_Process_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 941 | 5216d1ade194ffa5a6c00f105e2b1899f64f45fe | 7 |
7324 | 172 | GO_Cellular_Component_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 205 | fd1332a42395e0bc1dba82868b39be7983a48cc5 | 7 |
8469 | 122 | GO_Molecular_Function_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 402 | 7e3e99e5aae02437f80b0697b197113ce3209ab0 | 7 |
13121 | 305 | Allen_Brain_Atlas_up | http://www.brain-map.org/ | 2192 | 3804715a63a308570e47aa1a7877f01147ca6202 | 5 |
26382 | 1811 | ENCODE_TF_ChIP-seq_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 816 | 56b6adb4dc8a2f540357ef992d6cd93dfa2907e5 | 1 |
29065 | 2123 | ENCODE_Histone_Modifications_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 412 | 55b56cd8cf2ff04b26a09b9f92904008b82f3a6f | 1 |
280 | 9 | Phosphatase_Substrates_from_DEPOD | http://www.koehn.embl.de/depod/ | 59 | d40701e21092b999f4720d1d2b644dd0257b6259 | 2 |
13877 | 304 | Allen_Brain_Atlas_down | http://www.brain-map.org/ | 2192 | ea67371adec290599ddf484ced2658cfae259304 | 5 |
15852 | 912 | ENCODE_Histone_Modifications_2013 | http://genome.ucsc.edu/ENCODE/downloads.html | 109 | c209ae527bc8e98e4ccd27a668d36cd2c80b35b4 | 7 |
4320 | 129 | Achilles_fitness_increase | http://www.broadinstitute.org/achilles | 216 | 98366496a75f163164106e72439fb2bf2f77de4e | 4 |
4271 | 128 | Achilles_fitness_decrease | http://www.broadinstitute.org/achilles | 216 | 83a710c1ff67fd6b8af0d80fa6148c40dbd9bc64 | 4 |
10496 | 201 | MGI_Mammalian_Phenotype_2013 | http://www.informatics.jax.org/ | 476 | a4c6e217a81a4a58ff5a1c9fc102b70beab298e9 | 7 |
1678 | 21 | BioCarta_2015 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 239 | 70e4eb538daa7688691acfe5d9c3c19022be832b | 7 |
756 | 12 | HumanCyc_2015 | http://humancyc.org/ | 125 | 711f0c02b23f5e02a01207174943cfeee9d3ea9c | 7 |
3800 | 48 | KEGG_2015 | http://www.kegg.jp/kegg/download/ | 179 | e80d25c56de53c704791ddfdc6ab5eec28ae7243 | 7 |
2541 | 39 | NCI-Nature_2015 | http://pid.nci.nih.gov/ | 209 | 47edfc012bcbb368a10b717d8dca103f7814b5a4 | 7 |
1918 | 39 | Panther_2015 | http://www.pantherdb.org/ | 104 | ab824aeeff0712bab61f372e43aebb870d1677a9 | 7 |
5863 | 51 | WikiPathways_2015 | http://www.wikipathways.org/index.php/Download_Pathways | 404 | 1f7eea2f339f37856522c1f1c70ec74c7b25325f | 7 |
6768 | 47 | Reactome_2015 | http://www.reactome.org/download/index.html | 1389 | 36e541bee015eddb8d53827579549e30fe7a3286 | 7 |
25651 | 807 | ESCAPE | http://www.maayanlab.net/ESCAPE/ | 315 | a7acc741440264717ff77751a7e5fed723307835 | 5 |
19129 | 1594 | HomoloGene | http://www.ncbi.nlm.nih.gov/homologene | 12 | 663b665b75a804ef98add689f838b68e612f0d2a | 6 |
23939 | 293 | Disease_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 839 | 0f412e0802d76efa0374504c2c9f5e0624ff7f09 | 8 |
23561 | 307 | Disease_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 839 | 9ddc3902fb01fb9eaf1a2a7c2ff3acacbb48d37e | 8 |
23877 | 302 | Drug_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 906 | 068623a05ecef3e4a5e0b4f8db64bb8faa3c897f | 8 |
15886 | 9 | Genes_Associated_with_NIH_Grants | https://grants.nih.gov/grants/oer.htm | 32876 | 76fc5ec6735130e287e62bae6770a3c5ee068645 | 6 |
24350 | 299 | Drug_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 906 | c9c2155b5ac81ac496854fa61ba566dcae06cc80 | 8 |
3102 | 25 | KEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 428 | 18a081774e6e0aaf60b1a4be7fd20afcf9e08399 | 2 |
31132 | 298 | Gene_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 2460 | 53dedc29ce3100930d68e506f941ef59de05dc6b | 8 |
30832 | 302 | Gene_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 2460 | 499882af09c62dd6da545c15cb51c1dc5e234f78 | 8 |
48230 | 1429 | ChEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 395 | 712eb7b6edab04658df153605ec6079fa89fb5c7 | 7 |
5613 | 36 | dbGaP | http://www.ncbi.nlm.nih.gov/gap | 345 | 010f1267055b1a1cb036e560ea525911c007a666 | 4 |
9559 | 73 | LINCS_L1000_Chem_Pert_up | https://clue.io/ | 33132 | 5e678b3debe8d8ea95187d0cd35c914017af5eb3 | 4 |
9448 | 63 | LINCS_L1000_Chem_Pert_down | https://clue.io/ | 33132 | fedbf5e221f45ee60ebd944f92569b5eda7f2330 | 4 |
16725 | 1443 | GTEx_Tissue_Expression_Down | http://www.gtexportal.org/ | 2918 | 74b818bd299a9c42c1750ffe43616aa9f7929f02 | 5 |
19249 | 1443 | GTEx_Tissue_Expression_Up | http://www.gtexportal.org/ | 2918 | 103738763d89cae894bec9f145ac28167a90e611 | 5 |
15090 | 282 | Ligand_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 261 | 1eb3c0426140340527155fd0ef67029db2a72191 | 8 |
16129 | 292 | Aging_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 286 | cd95fe1b505ba6f28cd722cfba50fdea979d3b4c | 8 |
15309 | 308 | Aging_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 286 | 74c4f0a0447777005b2a5c00c9882a56dfc62d7c | 8 |
15103 | 318 | Ligand_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 261 | 31baa39da2931ddd5f7aedf2d0bbba77d2ba7b46 | 8 |
15022 | 290 | MCF7_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 401 | 555f68aef0a29a67b614a0d7e20b6303df9069c6 | 8 |
15676 | 310 | MCF7_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 401 | 1bc2ba607f1ff0dda44e2a15f32a2c04767da18c | 8 |
15854 | 279 | Microbe_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 312 | 9e613dba78ef7e60676b13493a9dc49ccd3c8b3f | 8 |
15015 | 321 | Microbe_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 312 | d0c3e2a68e8c611c669098df2c87b530cec3e132 | 8 |
3788 | 159 | LINCS_L1000_Ligand_Perturbations_down | https://clue.io/ | 96 | 957846cb05ef31fc8514120516b73cc65af7980e | 4 |
3357 | 153 | LINCS_L1000_Ligand_Perturbations_up | https://clue.io/ | 96 | 3bd494146c98d8189898a947f5ef5710f1b7c4b2 | 4 |
12668 | 300 | L1000_Kinase_and_GPCR_Perturbations_down | https://clue.io/ | 3644 | 1ccc5bce553e0c2279f8e3f4ddcfbabcf566623b | 2 |
12638 | 300 | L1000_Kinase_and_GPCR_Perturbations_up | https://clue.io/ | 3644 | b54a0d4ba525eac4055c7314ca9d9312adcb220c | 2 |
8973 | 64 | Reactome_2016 | http://www.reactome.org/download/index.html | 1530 | 1f54638e8f45075fb79489f0e0ef906594cb0678 | 2 |
7010 | 87 | KEGG_2016 | http://www.kegg.jp/kegg/download/ | 293 | 43f56da7540195ba3c94eb6e34c522a699b36da9 | 7 |
5966 | 51 | WikiPathways_2016 | http://www.wikipathways.org/index.php/Download_Pathways | 437 | 340be98b444cad50bb974df69018fd598e23e5e1 | 7 |
15562 | 887 | ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X | 104 | 5426f7747965c23ef32cff46fabf906e2cd76bfa | 1 | |
17850 | 300 | Kinase_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 285 | bb9682d78b8fc43be842455e076166fcd02cefc3 | 2 |
17660 | 300 | Kinase_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 285 | 78618915009cac3a0663d6f99d359e39a31b6660 | 2 |
1348 | 19 | BioCarta_2016 | http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 237 | 13d9ab18921d5314a5b2b366f6142b78ab0ff6aa | 2 |
934 | 13 | HumanCyc_2016 | http://humancyc.org/ | 152 | d6a502ef9b4c789ed5e73ca5a8de372796e5c72a | 2 |
2541 | 39 | NCI-Nature_2016 | http://pid.nci.nih.gov/ | 209 | 3c1e1f7d1a651d9aaa198e73704030716fc09431 | 2 |
2041 | 42 | Panther_2016 | http://www.pantherdb.org/pathway/ | 112 | ca5f6abf7f75d9baae03396e84d07300bf1fd051 | 2 |
5209 | 300 | DrugMatrix | https://ntp.niehs.nih.gov/drugmatrix/ | 7876 | 255c3db820d612f34310f22a6985dad50e9fe1fe | 4 |
49238 | 1550 | ChEA_2016 | http://amp.pharm.mssm.edu/Enrichr | 645 | af271913344aa08e6a755af1d433ef15768d749a | 1 |
2243 | 19 | huMAP | http://proteincomplexes.org/ | 995 | 249247d2f686d3eb4b9e4eb976c51159fac80a89 | 2 |
19586 | 545 | Jensen_TISSUES | http://tissues.jensenlab.org/ | 1842 | e8879ab9534794721614d78fe2883e9e564d7759 | 3 |
22440 | 505 | RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 1302 | f0752e4d7f5198f86446678966b260c530d19d78 | 8 |
8184 | 24 | MGI_Mammalian_Phenotype_2017 | http://www.informatics.jax.org/ | 5231 | 0705e59bff98deda6e9cbe00cfcdd871c85e7d04 | 7 |
18329 | 161 | Jensen_COMPARTMENTS | http://compartments.jensenlab.org/ | 2283 | 56ec68c32d4e83edc2ee83bea0e9f6a3829b2279 | 3 |
15755 | 28 | Jensen_DISEASES | http://diseases.jensenlab.org/ | 1811 | 3045dff8181367c1421627bb8e4c5a32c6d67f98 | 3 |
10271 | 22 | BioPlex_2017 | http://bioplex.hms.harvard.edu/ | 3915 | b8620b1a9d0d271d1a2747d8cfc63589dba39991 | 2 |
10427 | 38 | GO_Cellular_Component_2017 | http://www.geneontology.org/ | 636 | 8fed21d22dfcc3015c05b31d942fdfc851cc8e04 | 7 |
10601 | 25 | GO_Molecular_Function_2017 | http://www.geneontology.org/ | 972 | b4018906e0a8b4e81a1b1afc51e0a2e7655403eb | 7 |
13822 | 21 | GO_Biological_Process_2017 | http://www.geneontology.org/ | 3166 | d9da4dba4a3eb84d4a28a3835c06dfbbe5811f92 | 7 |
8002 | 143 | GO_Cellular_Component_2017b | http://www.geneontology.org/ | 816 | ecf39c41fa5bc7deb625a2b5761a708676e9db7c | 7 |
10089 | 45 | GO_Molecular_Function_2017b | http://www.geneontology.org/ | 3271 | 8d8340361dd36a458f1f0a401f1a3141de1f3200 | 7 |
13247 | 49 | GO_Biological_Process_2017b | http://www.geneontology.org/ | 10125 | 6404c38bffc2b3732de4e3fbe417b5043009fe34 | 7 |
21809 | 2316 | ARCHS4_Tissues | http://amp.pharm.mssm.edu/archs4 | 108 | 4126374338235650ab158ba2c61cd2e2383b70df | 5 |
23601 | 2395 | ARCHS4_Cell-lines | http://amp.pharm.mssm.edu/archs4 | 125 | 5496ef9c9ae9429184d0b9485c23ba468ee522a8 | 5 |
20883 | 299 | ARCHS4_IDG_Coexp | http://amp.pharm.mssm.edu/archs4 | 352 | ce60be284fdd5a9fc6240a355421a9e12b1ee84a | 4 |
19612 | 299 | ARCHS4_Kinases_Coexp | http://amp.pharm.mssm.edu/archs4 | 498 | 6721c5ed97b7772e4a19fdc3f797110df0164b75 | 2 |
25983 | 299 | ARCHS4_TFs_Coexp | http://amp.pharm.mssm.edu/archs4 | 1724 | 8a468c3ae29fa68724f744cbef018f4f3b61c5ab | 1 |
19500 | 137 | SysMyo_Muscle_Gene_Sets | http://sys-myo.rhcloud.com/ | 1135 | 8 | |
14893 | 128 | miRTarBase_2017 | http://mirtarbase.mbc.nctu.edu.tw/ | 3240 | 6b7c7fe2a97b19aecbfba12d8644af6875ad99c4 | 1 |
17598 | 1208 | TargetScan_microRNA_2017 | http://www.targetscan.org/ | 683 | 79d13fb03d2fa6403f9be45c90eeda0f6822e269 | 1 |
5902 | 109 | Enrichr_Libraries_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 121 | e9b7d8ee237d0a690bd79d970a23a9fa849901ed | 6 |
12486 | 299 | Enrichr_Submissions_TF-Gene_Coocurrence | http://amp.pharm.mssm.edu/Enrichr | 1722 | be2ca8ef5a8c8e17d7e7bd290e7cbfe0951396c0 | 1 |
1073 | 100 | Data_Acquisition_Method_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 12 | 17ce5192b9eba7d109b6d228772ea8ab222e01ef | 6 |
19513 | 117 | DSigDB | http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ | 4026 | 287476538ab98337dbe727b3985a436feb6d192a | 4 |
14433 | 36 | GO_Biological_Process_2018 | http://www.geneontology.org/ | 5103 | b5b77681c46ac58cd050e60bcd4ad5041a9ab0a9 | 7 |
8655 | 61 | GO_Cellular_Component_2018 | http://www.geneontology.org/ | 446 | e9ebe46188efacbe1056d82987ff1c70218fa7ae | 7 |
11459 | 39 | GO_Molecular_Function_2018 | http://www.geneontology.org/ | 1151 | 79ff80ae9a69dd00796e52569e41422466fa0bee | 7 |
19741 | 270 | TF_Perturbations_Followed_by_Expression | http://www.ncbi.nlm.nih.gov/geo/ | 1958 | 34d08a4878c19584aaf180377f2ea96faa6a6eb1 | 1 |
27360 | 802 | Chromosome_Location_hg19 | http://hgdownload.cse.ucsc.edu/downloads.html | 36 | fdab39c467ba6b0fb0288df1176d7dfddd7196d5 | 6 |
13072 | 26 | NIH_Funded_PIs_2017_Human_GeneRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 5687 | 859b100fac3ca774ad84450b1fbb65a78fcc6b12 | 6 |
13464 | 45 | NIH_Funded_PIs_2017_Human_AutoRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 | fc5bf033b932cf173633e783fc8c6228114211f8 | 6 |
13787 | 200 | Rare_Diseases_AutoRIF_ARCHS4_Predictions | https://amp.pharm.mssm.edu/geneshot/ | 3725 | 375ff8cdd64275a916fa24707a67968a910329bb | 4 |
13929 | 200 | Rare_Diseases_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 | 0f7fb7f347534779ecc6c87498e96b5460a8d652 | 4 |
16964 | 200 | NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 | f77de51aaf0979dd6f56381cf67ba399b4640d28 | 6 |
17258 | 200 | NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 5684 | 25fa899b715cd6a9137f6656499f89cd25144029 | 6 |
10352 | 58 | Rare_Diseases_GeneRIF_Gene_Lists | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 | 0fb9ac92dbe52024661c088f71a1134f00567a8b | 4 |
10471 | 76 | Rare_Diseases_AutoRIF_Gene_Lists | https://amp.pharm.mssm.edu/geneshot/ | 3725 | ee3adbac2da389959410260b280e7df1fd3730df | 4 |
12419 | 491 | SubCell_BarCode | http://www.subcellbarcode.org/ | 104 | b50bb9480d8a77103fb75b331fd9dd927246939a | 2 |
19378 | 37 | GWAS_Catalog_2019 | https://www.ebi.ac.uk/gwas | 1737 | fef3864bcb5dd9e60cee27357eff30226116c49b | 4 |
6201 | 45 | WikiPathways_2019_Human | https://www.wikipathways.org/ | 472 | b0c9e9ebb9014f14561e896008087725a2db24b7 | 7 |
4558 | 54 | WikiPathways_2019_Mouse | https://www.wikipathways.org/ | 176 | e7750958da20f585c8b6d5bc4451a5a4305514ba | 7 |
3264 | 22 | TRRUST_Transcription_Factors_2019 | https://www.grnpedia.org/trrust/ | 571 | 5f8cf93e193d2bcefa5a37ccdf0eefac576861b0 | 1 |
7802 | 92 | KEGG_2019_Human | https://www.kegg.jp/ | 308 | 3477bc578c4ea5d851dcb934fe2a41e9fd789bb4 | 7 |
8551 | 98 | KEGG_2019_Mouse | https://www.kegg.jp/ | 303 | 187eb44b2d6fa154ebf628eba1f18537f64e797c | 7 |
12444 | 23 | InterPro_Domains_2019 | https://www.ebi.ac.uk/interpro/ | 1071 | 18dd5ec520fdf589a93d6a7911289c205e1ddf22 | 6 |
9000 | 20 | Pfam_Domains_2019 | https://pfam.xfam.org/ | 608 | a6325ed264f9ac9e6518796076c46a1d885cca7a | 6 |
7744 | 363 | DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 | https://depmap.org/ | 558 | 0b08b32b20854ac8a738458728a9ea50c2e04800 | 4 |
6204 | 387 | DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 | https://depmap.org/ | 325 | b7c4ead26d0eb64f1697c030d31682b581c8bb56 | 4 |
13420 | 32 | MGI_Mammalian_Phenotype_Level_4_2019 | http://www.informatics.jax.org/ | 5261 | f1bed632e89ebc054da44236c4815cdce03ef5ee | 7 |
14148 | 122 | UK_Biobank_GWAS_v1 | https://www.ukbiobank.ac.uk/tag/gwas/ | 857 | 958fb52e6215626673a5acf6e9289a1b84d11b4a | 4 |
9813 | 49 | BioPlanet_2019 | https://tripod.nih.gov/bioplanet/ | 1510 | e110851dfc763d30946f2abedcc2cd571ac357a0 | 2 |
1397 | 13 | ClinVar_2019 | https://www.ncbi.nlm.nih.gov/clinvar/ | 182 | 0a95303f8059bec08836ecfe02ce3da951150547 | 4 |
9116 | 22 | PheWeb_2019 | http://pheweb.sph.umich.edu/ | 1161 | 6a7c7321b6b72c5285b722f7902d26a2611117cb | 4 |
17464 | 63 | DisGeNET | https://www.disgenet.org | 9828 | 3c261626478ce9e6bf2c7f0a8014c5e901d43dc0 | 4 |
394 | 73 | HMS_LINCS_KinomeScan | http://lincs.hms.harvard.edu/kinomescan/ | 148 | 47ba06cdc92469ac79400fc57acd84ba343ba616 | 2 |
11851 | 586 | CCLE_Proteomics_2020 | https://portals.broadinstitute.org/ccle | 378 | 7094b097ae2301a1d6a5bd856a193b084cca993d | 5 |
8189 | 421 | ProteomicsDB_2020 | https://www.proteomicsdb.org/ | 913 | 8c87c8346167bac2ba68195a32458aba9b1acfd1 | 5 |
18704 | 100 | lncHUB_lncRNA_Co-Expression | https://amp.pharm.mssm.edu/lnchub/ | 3729 | 45b597d7efa5693b7e4172b09c0ed2dda3305582 | 1 |
5605 | 39 | Virus-Host_PPI_P-HIPSTer_2020 | http://phipster.org/ | 6715 | a592eed13e8e9496aedbab63003b965574e46a65 | 2 |
5718 | 31 | Elsevier_Pathway_Collection | http://www.transgene.ru/disease-pathways/ | 1721 | 9196c760e3bcae9c9de1e3f87ad81f96bde24325 | 2 |
14156 | 40 | Table_Mining_of_CRISPR_Studies | 802 | ad580f3864fa8ff69eaca11f6d2e7f9b86378d08 | 6 | |
16979 | 295 | COVID-19_Related_Gene_Sets | https://amp.pharm.mssm.edu/covid19 | 205 | 72b0346849570f66a77a6856722601e711596cb4 | 7 |
4383 | 146 | MSigDB_Hallmark_2020 | https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp | 50 | 6952efda94663d4bd8db09bf6eeb4e67d21ef58c | 2 |
54974 | 483 | Enrichr_Users_Contributed_Lists_2020 | https://maayanlab.cloud/Enrichr | 1482 | 8dc362703b38b30ac3b68b6401a9b20a58e7d3ef | 6 |
12118 | 448 | TG_GATES_2020 | https://toxico.nibiohn.go.jp/english/ | 1190 | 9e32560437b11b4628b00ccf3d584360f7f7daee | 4 |
12361 | 124 | Allen_Brain_Atlas_10x_scRNA_2021 | https://portal.brain-map.org/ | 766 | 46f8235cb585829331799a71aec3f7c082170219 | 5 |
9763 | 139 | Descartes_Cell_Types_and_Tissue_2021 | https://descartes.brotmanbaty.org/bbi/human-gene-expression-during-development/ | 172 | 5 | |
8078 | 102 | KEGG_2021_Human | https://www.kegg.jp/ | 320 | 2 | |
7173 | 43 | WikiPathway_2021_Human | https://www.wikipathways.org/ | 622 | 2 | |
5833 | 100 | HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression | https://hubmapconsortium.github.io/ccf-asct-reporter/ | 344 | 5 | |
14937 | 33 | GO_Biological_Process_2021 | http://www.geneontology.org/ | 6036 | 3 | |
11497 | 80 | GO_Cellular_Component_2021 | http://www.geneontology.org/ | 511 | 3 | |
11936 | 34 | GO_Molecular_Function_2021 | http://www.geneontology.org/ | 1274 | 3 | |
9767 | 33 | MGI_Mammalian_Phenotype_Level_4_2021 | http://www.informatics.jax.org/ | 4601 | 3 | |
14167 | 80 | CellMarker_Augmented_2021 | http://biocc.hrbmu.edu.cn/CellMarker/ | 1097 | 5 | |
17851 | 102 | Orphanet_Augmented_2021 | http://www.orphadata.org/ | 3774 | 4 | |
16853 | 360 | COVID-19_Related_Gene_Sets_2021 | https://maayanlab.cloud/covid19/ | 478 | 4 | |
6654 | 136 | PanglaoDB_Augmented_2021 | https://panglaodb.se/ | 178 | 5 | |
1683 | 10 | Azimuth_Cell_Types_2021 | https://azimuth.hubmapconsortium.org/ | 341 | 5 | |
20414 | 112 | PhenGenI_Association_2021 | https://www.ncbi.nlm.nih.gov/gap/phegeni | 950 | 4 | |
26076 | 250 | RNAseq_Automatic_GEO_Signatures_Human_Down | https://maayanlab.cloud/archs4/ | 4269 | 8 | |
26338 | 250 | RNAseq_Automatic_GEO_Signatures_Human_Up | https://maayanlab.cloud/archs4/ | 4269 | 8 | |
25381 | 250 | RNAseq_Automatic_GEO_Signatures_Mouse_Down | https://maayanlab.cloud/archs4/ | 4216 | 8 | |
25409 | 250 | RNAseq_Automatic_GEO_Signatures_Mouse_Up | https://maayanlab.cloud/archs4/ | 4216 | 8 | |
11980 | 250 | GTEx_Aging_Signatures_2021 | https://gtexportal.org/ | 270 | 4 | |
31158 | 805 | HDSigDB_Human_2021 | https://www.hdinhd.org/ | 2564 | 4 | |
30006 | 815 | HDSigDB_Mouse_2021 | https://www.hdinhd.org/ | 2579 | 4 |
=list() markers_enriched
if (markers_found) {
# Upregulated markers
# Convert Seurat names of upregulated marker per cluster to Entrez; use named lists for translation
# Is that still neccessary?
= sapply(levels(sc$seurat_clusters), function(x) {
marker_genesets_up = markers_filt$up %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
tmp = sapply(tmp, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
tmp return(tmp[!is.na(tmp)])
USE.NAMES=TRUE, simplify=TRUE)
},
# Tests done by Enrichr
= purrr::map(marker_genesets_up, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
marker_genesets_up_enriched = purrr::map(list_names(marker_genesets_up_enriched), function(n) {
marker_genesets_up_enriched return(purrr::map(marker_genesets_up_enriched[[n]], function(d){
return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("up", nrow(d))))
}))
})
# Write to files
invisible(purrr::map(names(marker_genesets_up_enriched), function(n) {
EnrichrWriteResults(enrichr_results=marker_genesets_up_enriched[[n]],
file=file.path(param$path_out, "marker_degs", paste0("functions_marker_up_cluster_", n, "_vs_rest.xlsx")))
}))
# Downregulated markers
# Convert Seurat names of downregulated marker per cluster to Entrez; use named lists for translation
# Is that still neccessary?
= sapply(levels(sc$seurat_clusters), function(x) {
marker_genesets_down = markers_filt$down %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
tmp return(tmp[!is.na(tmp)])
USE.NAMES=TRUE, simplify=TRUE)
},
# Tests done by Enrichr
= purrr::map(marker_genesets_down, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
marker_genesets_down_enriched = purrr::map(list_names(marker_genesets_down_enriched), function(n) {
marker_genesets_down_enriched return(purrr::map(marker_genesets_down_enriched[[n]], function(d){
return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("down", nrow(d))))
}))
})
# Write to files
invisible(purrr::map(names(marker_genesets_down_enriched), function(n) {
EnrichrWriteResults(enrichr_results=marker_genesets_down_enriched[[n]],
file=file.path(param$path_out, "marker_degs", paste0("functions_marker_down_cluster_", n, "_vs_rest.xlsx")))
}))
# Combine, flatten into data.frame and add to sc misc slot
= c(marker_genesets_up_enriched, marker_genesets_down_enriched)
marker_genesets_enriched = unname(marker_genesets_enriched)
marker_genesets_enriched = purrr::map(marker_genesets_enriched, FlattenEnrichr) %>% dplyr::bind_rows()
marker_genesets_enriched $Cluster = factor(marker_genesets_enriched$Cluster, levels=levels(sc$seurat_clusters))
marker_genesets_enriched$Direction = factor(marker_genesets_enriched$Direction, levels=c("up", "down"))
marker_genesets_enriched
= Misc(sc, "markers")
misc_content "enrichr"]] = marker_genesets_enriched
misc_content[[suppressWarnings({Misc(sc, "markers") = misc_content})
}
The following table contains the top enriched terms per cluster.
# Top enriched terms (TODO: better plots, functions)
if (markers_found) {
cat("#### {.tabset} \n \n")
# Get top ten up and down over all databases per cluster
= marker_genesets_enriched %>% dplyr::group_by(Cluster, Direction) %>%
marker_genesets_top_enriched ::top_n(n=10, wt=Combined.Score)
dplyr
# Print as tabs
for(n in levels(marker_genesets_top_enriched$Cluster)){
cat("##### ", n, " \n")
print(knitr::kable(marker_genesets_top_enriched %>% dplyr::ungroup() %>% dplyr::filter(Cluster==n) %>% dplyr::select(Database, Term, Direction, Adjusted.P.value, Odds.Ratio, Combined.Score),
align="l", caption="Top ten enriched terms per geneset", format="html") %>%
::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="700px"))
kableExtra
cat(" \n \n")
}cat(" \n \n")
}
Database | Term | Direction | Adjusted.P.value | Odds.Ratio | Combined.Score |
---|---|---|---|---|---|
GO_Biological_Process_2021 | purinergic nucleotide receptor signaling pathway (GO:0035590) | up | 0.0003995 | 30.74938 | 404.0514 |
GO_Biological_Process_2021 | positive regulation of tumor necrosis factor-mediated signaling pathway (GO:1903265) | up | 0.0065819 | 48.73892 | 456.0176 |
GO_Biological_Process_2021 | regulation of microglial cell mediated cytotoxicity (GO:1904149) | up | 0.0283057 | 64.67647 | 444.5816 |
GO_Biological_Process_2021 | regulation of neutrophil degranulation (GO:0043313) | up | 0.0283057 | 64.67647 | 444.5816 |
GO_Biological_Process_2021 | regulation of T-helper 1 cell cytokine production (GO:2000554) | up | 0.0283057 | 64.67647 | 444.5816 |
GO_Biological_Process_2021 | positive regulation of mast cell activation involved in immune response (GO:0033008) | up | 0.0283057 | 64.67647 | 444.5816 |
GO_Biological_Process_2021 | positive regulation of T-helper 1 cell cytokine production (GO:2000556) | up | 0.0283057 | 64.67647 | 444.5816 |
GO_Biological_Process_2021 | regulation of adiponectin secretion (GO:0070163) | up | 0.0351167 | 48.50490 | 314.0824 |
GO_Biological_Process_2021 | cellular heat acclimation (GO:0070370) | up | 0.0351167 | 48.50490 | 314.0824 |
GO_Biological_Process_2021 | heat acclimation (GO:0010286) | up | 0.0351167 | 48.50490 | 314.0824 |
GO_Biological_Process_2021 | negative regulation of sequestering of triglyceride (GO:0010891) | up | 0.0351167 | 48.50490 | 314.0824 |
GO_Biological_Process_2021 | positive regulation of T-helper 2 cell differentiation (GO:0045630) | up | 0.0351167 | 48.50490 | 314.0824 |
WikiPathway_2021_Human | Cori Cycle WP1946 | up | 0.0023029 | 30.13100 | 321.2547 |
GO_Biological_Process_2021 | T cell activation (GO:0042110) | down | 0.0001544 | 27.16416 | 413.9046 |
GO_Biological_Process_2021 | positive regulation of cellular component movement (GO:0051272) | down | 0.0003170 | 65.36505 | 903.6725 |
GO_Biological_Process_2021 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | down | 0.0011375 | 22.46380 | 271.0866 |
GO_Biological_Process_2021 | membrane raft distribution (GO:0031580) | down | 0.0056436 | 241.69697 | 2281.9781 |
GO_Biological_Process_2021 | regulation of lymphocyte activation (GO:0051249) | down | 0.0327276 | 55.74825 | 396.2818 |
GO_Biological_Process_2021 | regulation of lymphocyte differentiation (GO:0045619) | down | 0.0450371 | 40.25253 | 262.6275 |
WikiPathway_2021_Human | Cytoplasmic Ribosomal Proteins WP477 | down | 0.0004239 | 22.73237 | 275.5777 |
WikiPathway_2021_Human | Modulators of TCR signaling and T cell activation WP5072 | down | 0.0007318 | 26.33035 | 276.5917 |
WikiPathway_2021_Human | Pathogenesis of SARS-CoV-2 Mediated by nsp9-nsp10 Complex WP4884 | down | 0.0007318 | 61.49691 | 644.3744 |
WikiPathway_2021_Human | T-Cell Receptor and Co-stimulatory Signaling WP2583 | down | 0.0014836 | 42.55769 | 403.6040 |
Database | Term | Direction | Adjusted.P.value | Odds.Ratio | Combined.Score |
---|---|---|---|---|---|
GO_Biological_Process_2021 | T cell activation (GO:0042110) | up | 0.0000449 | 54.42529 | 870.8478 |
GO_Biological_Process_2021 | positive regulation of antigen receptor-mediated signaling pathway (GO:0050857) | up | 0.0005095 | 144.60870 | 1862.2655 |
GO_Biological_Process_2021 | positive regulation of cellular component movement (GO:0051272) | up | 0.0007431 | 108.42391 | 1311.3888 |
GO_Biological_Process_2021 | membrane raft distribution (GO:0031580) | up | 0.0016170 | 554.75000 | 6118.7964 |
GO_Biological_Process_2021 | positive regulation of T cell receptor signaling pathway (GO:0050862) | up | 0.0077590 | 138.62500 | 1223.8830 |
GO_Biological_Process_2021 | negative regulation of sequestering of calcium ion (GO:0051283) | up | 0.0185900 | 72.28623 | 552.5990 |
GO_Biological_Process_2021 | release of sequestered calcium ion into cytosol (GO:0051209) | up | 0.0185900 | 66.49667 | 498.0061 |
WikiPathway_2021_Human | Modulators of TCR signaling and T cell activation WP5072 | up | 0.0000579 | 63.53110 | 870.9017 |
WikiPathway_2021_Human | Pathogenesis of SARS-CoV-2 Mediated by nsp9-nsp10 Complex WP4884 | up | 0.0000664 | 144.60870 | 1862.2655 |
WikiPathway_2021_Human | Cancer immunotherapy by PD-1 blockade WP4585 | up | 0.0070076 | 79.17857 | 618.6531 |
GO_Biological_Process_2021 | regulation of microglial cell mediated cytotoxicity (GO:1904149) | down | 0.0194093 | 98.07407 | 753.7245 |
GO_Biological_Process_2021 | regulation of neutrophil degranulation (GO:0043313) | down | 0.0194093 | 98.07407 | 753.7245 |
GO_Biological_Process_2021 | positive regulation of mast cell activation involved in immune response (GO:0033008) | down | 0.0194093 | 98.07407 | 753.7245 |
GO_Biological_Process_2021 | regulation of adiponectin secretion (GO:0070163) | down | 0.0209666 | 73.55185 | 535.7732 |
GO_Biological_Process_2021 | cellular heat acclimation (GO:0070370) | down | 0.0209666 | 73.55185 | 535.7732 |
GO_Biological_Process_2021 | regulation of extracellular exosome assembly (GO:1903551) | down | 0.0209666 | 73.55185 | 535.7732 |
GO_Biological_Process_2021 | heat acclimation (GO:0010286) | down | 0.0209666 | 73.55185 | 535.7732 |
GO_Biological_Process_2021 | positive regulation of aspartic-type endopeptidase activity involved in amyloid precursor protein catabolic process (GO:1902961) | down | 0.0259134 | 58.83852 | 409.0642 |
GO_Biological_Process_2021 | regulation of vesicle size (GO:0097494) | down | 0.0259134 | 58.83852 | 409.0642 |
WikiPathway_2021_Human | Activation of NLRP3 Inflammasome by SARS-CoV-2 WP4876 | down | 0.0131241 | 58.83852 | 409.0642 |
WikiPathway_2021_Human | Nanomaterial-induced inflammasome activation WP3890 | down | 0.0131241 | 58.83852 | 409.0642 |
WikiPathway_2021_Human | Pentose Phosphate Metabolism WP134 | down | 0.0131241 | 58.83852 | 409.0642 |
Database | Term | Direction | Adjusted.P.value | Odds.Ratio | Combined.Score |
---|---|---|---|---|---|
GO_Biological_Process_2021 | regulation of B cell proliferation (GO:0030888) | up | 0.0106220 | 51.49096 | 516.2210 |
GO_Biological_Process_2021 | positive regulation of T cell activation (GO:0050870) | up | 0.0229421 | 30.70679 | 262.9199 |
GO_Biological_Process_2021 | regulation of lymphocyte proliferation (GO:0050670) | up | 0.0292862 | 83.83613 | 663.3672 |
GO_Biological_Process_2021 | regulation of B cell receptor signaling pathway (GO:0050855) | up | 0.0323766 | 67.85374 | 510.5767 |
GO_Biological_Process_2021 | regulation of B cell activation (GO:0050864) | up | 0.0385185 | 54.79121 | 390.5418 |
KEGG_2021_Human | Intestinal immune network for IgA production | up | 0.0028182 | 49.19753 | 486.9078 |
KEGG_2021_Human | Asthma | up | 0.0084172 | 49.11576 | 340.0515 |
KEGG_2021_Human | Allograft rejection | up | 0.0084172 | 39.55159 | 257.7446 |
KEGG_2021_Human | Primary immunodeficiency | up | 0.0084172 | 39.55159 | 257.7446 |
WikiPathway_2021_Human | Hematopoietic Stem Cell Differentiation WP2849 | up | 0.0037140 | 42.55983 | 403.7845 |
GO_Biological_Process_2021 | cellular response to interleukin-6 (GO:0071354) | down | 0.0153614 | 32.71068 | 285.6332 |
GO_Biological_Process_2021 | cellular response to granulocyte macrophage colony-stimulating factor stimulus (GO:0097011) | down | 0.0164897 | 134.59459 | 1138.7461 |
GO_Biological_Process_2021 | response to granulocyte macrophage colony-stimulating factor (GO:0097012) | down | 0.0164897 | 134.59459 | 1138.7461 |
GO_Biological_Process_2021 | positive regulation of tumor necrosis factor-mediated signaling pathway (GO:1903265) | down | 0.0289462 | 76.89961 | 583.8594 |
GO_Biological_Process_2021 | pentose-phosphate shunt (GO:0006098) | down | 0.0487938 | 48.92629 | 334.1250 |
WikiPathway_2021_Human | Pathogenesis of SARS-CoV-2 Mediated by nsp9-nsp10 Complex WP4884 | down | 0.0001689 | 65.05556 | 892.7569 |
WikiPathway_2021_Human | Metabolic reprogramming in colon cancer WP4290 | down | 0.0012161 | 29.07310 | 315.5824 |
WikiPathway_2021_Human | Cori Cycle WP1946 | down | 0.0013285 | 58.44423 | 600.4754 |
WikiPathway_2021_Human | STING pathway in Kawasaki-like disease and COVID-19 WP4961 | down | 0.0017493 | 43.05335 | 407.7392 |
WikiPathway_2021_Human | Pentose Phosphate Metabolism WP134 | down | 0.0050578 | 107.67027 | 874.9885 |
Database | Term | Direction | Adjusted.P.value | Odds.Ratio | Combined.Score |
---|---|---|---|---|---|
GO_Biological_Process_2021 | regulation of platelet-derived growth factor receptor signaling pathway (GO:0010640) | up | 0.0495681 | 95.44976 | 774.2638 |
GO_Biological_Process_2021 | positive regulation of T cell receptor signaling pathway (GO:0050862) | up | 0.0495681 | 87.49123 | 696.3301 |
GO_Biological_Process_2021 | regulation of receptor recycling (GO:0001919) | up | 0.0495681 | 69.98246 | 529.1272 |
GO_Biological_Process_2021 | positive regulation of antigen receptor-mediated signaling pathway (GO:0050857) | up | 0.0495681 | 55.23823 | 393.9292 |
KEGG_2021_Human | Th1 and Th2 cell differentiation | up | 0.0021142 | 25.09091 | 258.3472 |
KEGG_2021_Human | Primary immunodeficiency | up | 0.0021142 | 46.15830 | 449.0569 |
WikiPathway_2021_Human | Modulators of TCR signaling and T cell activation WP5072 | up | 0.0139321 | 27.82199 | 231.1936 |
WikiPathway_2021_Human | Pathogenesis of SARS-CoV-2 Mediated by nsp9-nsp10 Complex WP4884 | up | 0.0139321 | 55.23823 | 393.9292 |
WikiPathway_2021_Human | Cancer immunotherapy by PD-1 blockade WP4585 | up | 0.0139321 | 49.97243 | 347.1937 |
WikiPathway_2021_Human | Overview of leukocyte-intrinsic Hippo pathway functions WP4542 | up | 0.0258180 | 31.78150 | 194.1116 |
GO_Biological_Process_2021 | regulation of neutrophil degranulation (GO:0043313) | down | 0.0041162 | 282.95035 | 2757.6533 |
GO_Biological_Process_2021 | regulation of leukocyte degranulation (GO:0043300) | down | 0.0076953 | 121.24012 | 1027.0724 |
GO_Biological_Process_2021 | B cell homeostasis (GO:0001782) | down | 0.0082704 | 106.07979 | 875.1383 |
GO_Biological_Process_2021 | regulation of podosome assembly (GO:0071801) | down | 0.0082704 | 106.07979 | 875.1383 |
GO_Biological_Process_2021 | regulation of myeloid leukocyte mediated immunity (GO:0002886) | down | 0.0086372 | 94.28842 | 759.0883 |
GO_Biological_Process_2021 | regulation of B cell apoptotic process (GO:0002902) | down | 0.0086372 | 84.85532 | 667.8070 |
GO_Biological_Process_2021 | cellular response to thyroid hormone stimulus (GO:0097067) | down | 0.0086372 | 84.85532 | 667.8070 |
GO_Biological_Process_2021 | response to thyroid hormone (GO:0097066) | down | 0.0095121 | 77.13733 | 594.3014 |
GO_Biological_Process_2021 | regulation of homotypic cell-cell adhesion (GO:0034110) | down | 0.0095121 | 77.13733 | 594.3014 |
GO_Biological_Process_2021 | positive regulation of T-helper cell differentiation (GO:0045624) | down | 0.0107227 | 70.70567 | 533.9602 |
If requested, we identify genes that are differentially expressed between two groups of cells. Groups can be defined by columns in the cell metadata. Different types of tests can be used and input data for testing can be the different assays as well as the computed dimensionality reductions. Resulting p-values are adjusted using the Bonferroni method. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.
# We first compute the DEGs for all requested contrasts
# Prepare a list with contrasts (input can be R data.frame table or Excel file)
= DegsSetupContrastsList(sc, param$deg_contrasts, param$latent_vars)
degs_contrasts_list
# Add the actual data to the list
= purrr::map(degs_contrasts_list, function(contrast){
degs_contrasts_list # If there were already errors, just return
if (length(contrast[["error_messages"]]) > 0) return(c(contrast, list(object=NULL, cells_group1_idx_subset=as.integer(), cells_group2_idx_subset=as.integer())))
# Get cells indices
= contrast[["cells_group1_idx"]]
cells_group1_idx = contrast[["cells_group2_idx"]]
cells_group2_idx
# Create object
if (contrast[["use_reduction"]]) {
# Use dimensionality reduction
"object"]] = Seurat::Reductions(sc, slot=contrast[["assay"]])
contrast[[else {
} # Use assay
"object"]] = Seurat::GetAssay(sc[,unique(c(cells_group1_idx, cells_group2_idx))], assay=contrast[["assay"]])
contrast[[
# This saves a lot of memory for parallelisation
if (contrast[["slot"]]!="scale.data") contrast[["object"]]@scale.data = new(Class="matrix")
}
# Variable latent vars must be passed as data.frame
if (!is.null(contrast[["latent_vars"]]) && length(contrast[["latent_vars"]]) > 0) {
"latent_vars"]] = sc[[unique(c(cells_group1_idx, cells_group2_idx)), contrast[["latent_vars"]], drop=FALSE]]
contrast[[
}
# Now update cell indices so that they match to subset
"cells_group1_idx_subset"]] = match(colnames(sc)[cells_group1_idx], colnames(contrast[["object"]]))
contrast[["cells_group2_idx_subset"]] = match(colnames(sc)[cells_group2_idx], colnames(contrast[["object"]]))
contrast[[
return(contrast)
})
# Compute the tests
# TODO: this chunk may be done in parallel in future
= purrr::map(degs_contrasts_list, function(contrast) {
degs_contrasts_results if (length(contrast$error_messages)==0) {
# No errors, do contrast
= suppressWarnings(
test_results DegsTestCellSets(object=contrast[["object"]],
slot=contrast[["slot"]],
cells_1=colnames(contrast[["object"]])[contrast[["cells_group1_idx_subset"]]],
cells_2=colnames(contrast[["object"]])[contrast[["cells_group2_idx_subset"]]],
is_reduction=contrast[["use_reduction"]],
logfc.threshold=contrast[["log2FC"]],
test.use=contrast[["test"]],
min.pct=contrast[["min_pct"]],
latent.vars=contrast[["latent_vars"]])
)else {
} # Errors, return empty data.frame
= DegsEmptyResultsTable()
test_results
}
# Sort and filter table
= test_results %>% DegsSort() %>% DegsFilter(contrast[["log2FC"]], contrast[["padj"]], split_by_dir=FALSE)
test_results
# Add mean gene expression data (counts or data, dep on slot)
.1 = DegsAvgData(contrast[["object"]], cells=contrast[["cells_group1_idx_subset"]], genes=test_results$gene, slot=contrast[["slot"]])[,1]
avg.2 = DegsAvgData(contrast[["object"]], cells=contrast[["cells_group2_idx_subset"]], genes=test_results$gene, slot=contrast[["slot"]])[,1]
avg= cbind(test_results, avg.1, avg.2)
test_results
# Add test results and drop unneccessary data
= c(contrast, list(results=test_results))
contrast "object"]] = NULL
contrast[["cells_group1_idx_subset"]] = NULL
contrast[["cells_group2_idx_subset"]] = NULL
contrast[[
return(contrast)
})
# Also remove objects from deg_contrasts_list (save memory)
= purrr::map(degs_contrasts_list, function(contrast){ contrast[["object"]] = NULL; return(contrast)}) degs_contrasts_list
# Use the existing variable and add Enrichr results
# Not in parallel due to server load
= purrr::map(degs_contrasts_results, function(contrast) {
degs_contrasts_results # Get results table
= contrast$results
results_table
# Drop existing results
if ("enrichr" %in% names(contrast)) contrast[["enrichr"]] = NULL
# Split into up- and downregulated DEGs, then translate to Entrez gene, runEnrichr
= dplyr::filter(results_table, avg_log2FC > 0) %>% dplyr::pull(gene) %>% unique()
degs_up = sapply(degs_up, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
degs_up = degs_up[!is.na(degs_up)]
degs_up = EnrichrTest(genes=degs_up, databases=param$enrichr_dbs, padj=param$enrichr_padj)
enrichr_results_up
= dplyr::filter(results_table, avg_log2FC < 0) %>% dplyr::pull(gene) %>% unique()
degs_down = sapply(degs_down, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
degs_down = degs_down[!is.na(degs_down)]
degs_down = EnrichrTest(genes=degs_down, databases=param$enrichr_dbs, padj=param$enrichr_padj)
enrichr_results_down
# Flatten both enrichr results into tables
= purrr::map_dfr(names(enrichr_results_up), function(n) {
enrichr_results_up return(cbind(enrichr_results_up[[n]],
list(Database=factor(rep(n, nrow(enrichr_results_up[[n]])), levels=names(enrichr_results_up)),
Direction=factor(rep("up", nrow(enrichr_results_up[[n]])), levels=c("up", "down"))
)
))
})
= purrr::map_dfr(names(enrichr_results_down), function(n) {
enrichr_results_down return(cbind(enrichr_results_down[[n]],
list(Database=factor(rep(n, nrow(enrichr_results_down[[n]])), levels=names(enrichr_results_down)),
Direction=factor(rep("up", nrow(enrichr_results_down[[n]])), levels=c("up", "down"))
)
))
})
# Rbind and add factor levels
= rbind(enrichr_results_up, enrichr_results_down)
enrichr_results return(c(contrast, list(enrichr=enrichr_results)))
})
# Now regroup list so that subsets are together again
= purrr::map_int(degs_contrasts_results, function(contrast){ return(contrast[["contrast_row"]]) })
original_contrast_rows = split(degs_contrasts_results, original_contrast_rows)
degs
# Write degs to files
invisible(purrr::map_chr(degs, function(degs_subsets) {
# The output file
= file.path(param$path_out, "marker_degs", paste0("degs_contrast_row_", degs_subsets[[1]][["contrast_row"]], "_results.xlsx"))
file
# Write degs
= purrr::map(degs_subsets, function(contrast) {return(contrast[["results"]])})
degs_subsets_results names(degs_subsets_results) = purrr::map_chr(degs_subsets, function(contrast) {return(ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All"))})
= DegsWriteToFile(degs_subsets_results,
file annot_ensembl=annot_ensembl,
gene_to_ensembl=seurat_rowname_to_ensembl,
file=file,
additional_readme=NULL)
return(file)
}))
invisible(purrr::map_chr(degs, function(degs_subsets) {
# The output file
= file.path(param$path_out, "marker_degs", paste0("degs_contrast_row_", degs_subsets[[1]][["contrast_row"]], "_functions.xlsx"))
file
# Write Enrichr results
= purrr::map(degs_subsets, function(contrast) {return(contrast[["enrichr"]])})
degs_subsets_enrichr names(degs_subsets_enrichr) = purrr::map_chr(degs_subsets, function(contrast) {return(ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All"))})
= EnrichrWriteResults(degs_subsets_enrichr, file=file)
file
return(file)
}))
# Add to sc object
Misc(sc, "degs") = degs
for (i in seq(degs)) {
for (j in seq(degs[[i]])) {
# Average expression of all genes
= subset(sc, cells=degs[[i]][[j]]$cells_group2_idx) %>% GetAssayData(slot="data")
x = log2(Matrix::rowMeans(expm1(x)) + 1)
x = subset(sc, cells=degs[[i]][[j]]$cells_group1_idx) %>% GetAssayData(slot="data")
y = log2(Matrix::rowMeans(expm1(y)) + 1)
y = data.frame(x, y, col="none", gene=rownames(sc))
sc_avg_log2FC = c(min(c(x, y)), max(x, y))
lims
## Color DEGs
= degs[[i]][[j]]$results %>% dplyr::filter(avg_log2FC > 0) %>% dplyr::pull(gene)
up if (length(up) > 0) sc_avg_log2FC[up, "col"] = "up"
= degs[[i]][[j]]$results %>% dplyr::filter(avg_log2FC < 0) %>% dplyr::pull(gene)
down if (length(down) > 0) sc_avg_log2FC[down, "col"] = "down"
## Plots
$plot_scatter = ggplot(sc_avg_log2FC %>% dplyr::arrange(col, gene), aes(x=x, y=y, col=col)) +
degs[[i]][[j]]geom_abline(slope=1, intercept=0, col="lightgrey") +
geom_abline(slope=1, intercept=c(-degs[[i]][[j]]$log2FC, degs[[i]][[j]]$log2FC), col="lightgrey", lty=2) +
geom_point() +
xlim(lims) + ylim(lims) +
AddStyle(ylab=degs[[i]][[j]]$condition_group1, xlab=degs[[i]][[j]]$condition_group2,
col=c(none="grey", up="darkgoldenrod1", down="steelblue"),
legend_position="bottom", legend_title="Filtered genes")
# Feature plot of top 4 genes, sorted by the p-value
= degs[[i]][[j]]$results %>% dplyr::top_n(n=-4, wt=p_val) %>% dplyr::top_n(n=-4, wt=avg_log2FC) %>% dplyr::pull(gene)
degs_top if (length(degs_top) > 0) {
= Seurat::FeaturePlot(sc, features=degs_top, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
p_list for (p in seq(p_list)) p_list[[p]] = p_list[[p]] + AddStyle(legend_position="bottom", xlab="", ylab="")
$plot_feature = patchwork::wrap_plots(p_list, ncol=ifelse(length(degs_top) > 1, 2, 1))
degs[[i]][[j]]else {
} $plot_feature = NULL
degs[[i]][[j]]
}
} }
= "
knitr_header_string
## {{condition_column}}: {{condition_group1}} vs {{condition_group2}}
General configuration:
* assay: {{assay}}
* slot: {{slot}}
* test: {{test}}
* maximum adjusted p-value: {{padj}}
* minimum absolute log2 foldchange: {{log2FC}}
* minimum percentage of cells: {{min_pct}}
* latent vars: {{latent_vars}}
Subset on column: \'{{subset_column}}\'"
if (length(degs)==0) message("No DEG contrasts specified.")
× (Message)
No DEG contrasts specified.
for (i in seq(degs)) {
# Get subsets
= degs[[i]]
degs_subsets = degs_subsets[[1]]
first_contrast
# Create header
cat(
::knit_expand(text=knitr_header_string,
knitrcondition_column=first_contrast[["condition_column"]],
condition_group1=first_contrast[["condition_group1"]],
condition_group2=first_contrast[["condition_group2"]],
assay=first_contrast[["assay"]],
slot=first_contrast[["slot"]],
test=first_contrast[["test"]],
padj=first_contrast[["padj"]],
log2FC=first_contrast[["log2FC"]],
min_pct=first_contrast[["min_pct"]],
latent_vars=ifelse(!is.null(first_contrast[["latent_vars"]]), paste(colnames(first_contrast[["latent_vars"]]), collapse=","), "-"),
subset_column=ifelse(is.na(first_contrast[["subset_column"]]), "-", first_contrast[["subset_column"]]))
"\n")
,
# Get error messages
= unique(purrr::flatten_chr(purrr::map(degs_subsets, function(contrast){return(contrast[["error_messages"]])})))
error_messages
# Create combined results table
= purrr::map_dfr(degs_subsets, function(contrast) {
degs_subsets_results = ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All")
subset_group_value return(contrast[["results"]] %>%
::summarise(subset_group=subset_group_value,
dplyrCells1=length(contrast[["cells_group1_idx"]]),
Cells2=length(contrast[["cells_group2_idx"]]),
DEGs=length(gene),
DEGs_up=sum(avg_log2FC > 0),
DEGs_down=sum(avg_log2FC < 0)))
%>% tibble::as_tibble()
})
# Print warnings/errors
if (length(error_messages) > 0) {
warning(error_messages)
}
# Print summary table
print(
::kable(degs_subsets_results,
knitralign="l", caption="DEG summary",
col.names=c("Subset", "Cells in group 1", "Cells in group 2", "# DEGs", "# DEGs upregulated", "# DEGs downregulated"),
format="html") %>%
::kable_styling(bootstrap_options=c("striped", "hover"))
kableExtra
)
# Print plots per contrast
for (contrast in seq(degs_subsets)) {
= degs_subsets[[contrast]]$plot_scatter | degs_subsets[[contrast]]$plot_feature
p = "Scatterplot and feature plots"
title if (!is.na(degs_subsets[[contrast]]$subset_column)) {
= paste0(title, " (subset ", degs_subsets[[contrast]]$subset_column,
title ": ", degs_subsets[[contrast]]$subset_group, ")")
}
= p + patchwork::plot_annotation(title=title)
p print(p)
}
cat("\n \n")
}
# Add colour lists for orig.dataset
= GenerateColours(num_colours=length(levels(sc$orig.dataset)), names=levels(sc$orig.dataset), palette=param$col_palette_samples, alphas=1)
col = ScAddLists(sc, lists=list(orig.dataset=col), lists_slot="colour_lists")
sc
# Add experiment details
::Misc(sc, "experiment") = list(project_id=param$project_id, date=Sys.Date(), species=gsub("_gene_ensembl", "", param$mart_dataset))
Seurat
# Add parameter
::Misc(sc, "parameters") = param
Seurat
# Add technical output
= suppressMessages(sessioninfo::session_info())
session_info ::Misc(sc, "technical") = list(scrnaseq_git=GitRepositoryVersion(param$path_to_git),
SeuratR=session_info$platform$version,
packages=as.data.frame(session_info$packages)[, c("package", "ondiskversion", "loadedversion", "date")])
If all provided datasets are of type “10x,” we export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser (Unknown 2022d).
if (all(param$path_data$type == "10x")) {
# Export reductions (umap, pca, others)
for(r in Seurat::Reductions(sc)) {
write.table(Seurat::Embeddings(sc, reduction=r)[,1:2] %>% as.data.frame() %>% tibble::rownames_to_column(var="Barcode"),
file=file.path(param$path_out, "export", paste0("Loupe_projection_", r, ".csv")), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}
# Export categorical metadata
= sc@meta.data
loupe_meta = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
idx_keep write.table(x=loupe_meta[, idx_keep] %>% tibble::rownames_to_column(var="barcode"),
file=file.path(param$path_out, "export", "Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export gene sets (CC genes, known markers, per-cluster markers up- and downregulated, ...)
= Misc(sc, "gene_lists")
gene_lists
# Remove empty gene sets
= gene_lists[purrr::map_int(gene_lists, length) > 0]
gene_lists = purrr::map_dfr(names(gene_lists), function(n) {return(data.frame(List=n, Name=gene_lists[[n]]))})
loupe_genesets $Ensembl = seurat_rowname_to_ensembl[loupe_genesets$Name]
loupe_genesetswrite.table(loupe_genesets, file=file.path(param$path_out, "export", "Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}
Result files are:
Projections can be imported in Loupe via “Import Projection,” cell meta data via “Import Categories” and gene sets via “Import Lists”
We export the assay data, cell metadata, clustering and visualisation in a format that can be read by the cellxgene browser (Unknown 2022e).
# Convert Seurat single cell object to python anndata object which will be accessible via reticulate here
= sceasy::convertFormat(sc, from="seurat", to="anndata", outFile=NULL, assay=DefaultAssay(sc))
adata
# Set up correct colours (see https://chanzuckerberg.github.io/cellxgene/posts/prepare) so that colours match
$uns = dict(
adataseurat_clusters_colors=np_array(unname(Misc(sc, "colour_lists")$seurat_clusters), dtype="<U7"),
orig.ident_colors=np_array(unname(Misc(sc, "colour_lists")$orig.ident), dtype="<U7"),
orig.dataset_colors=np_array(unname(Misc(sc, "colour_lists")$orig.dataset), dtype="<U7"),
Phase_colors=np_array(unname(Misc(sc, "colour_lists")$Phase), dtype="<U7")
)
# Write to h5ad file
$write(file.path(param$path_out, "export", "sc.h5ad"), compression="gzip") adata
Result files are:
Copy/upload to data directory of your cellxgene browser
We export the assay data, clustering, visualisation, marker genes, enriched pathways and degs in a format that can be read by the Cerebro Browser (Unknown 2022f).
# Export to cerebro
= ExportToCerebro(sc=sc,
res path=file.path(param$path_out, "export", "sc.crb"),
assay=DefaultAssay(sc),
assay_raw=param$assay_raw,
delayed_array=FALSE)
Result files are:
Load into Cerebro browser.
# Seurat object
saveRDS(sc, file=file.path(param$path_out, "data", "sc.rds"))
# Counts (RNA)
invisible(ExportSeuratAssayData(sc,
dir=file.path(param$path_out, "data", "counts"),
assays=param$assay_raw,
slot="counts",
include_cell_metadata_cols=colnames(sc[[]]),
metadata_prefix=paste0(param$project_id, ".")))
# Metadata
::write.xlsx(x=sc[[]] %>% tibble::rownames_to_column(var="Barcode"), file=file.path(param$path_out, "data", "cell_metadata.xlsx"), overwrite = TRUE, row.names=FALSE, col.names=TRUE) openxlsx
× (Warning) Please use ‘rowNames’ instead of ‘row.names’
× (Warning) Please use ‘colNames’ instead of ‘col.names’
# Annotation as excel file
::write.xlsx(x=data.frame(seurat_id=rownames(sc), ensembl_gene_id=seurat_rowname_to_ensembl[rownames(sc)]) %>%
openxlsx::inner_join(annot_ensembl, by="ensembl_gene_id"),
dplyrfile=file.path(param$path_out, "annotation", "gene_annotation.xlsx"), overwrite = TRUE, row.names=FALSE, col.names=TRUE)
× (Warning) Please use ‘rowNames’ instead of ‘row.names’
Please use ‘colNames’ instead of ‘col.names’
# Data and annotation for a subset of 500 cells for Morpheus
= ScSampleCells(sc, n=500, seed=1)
cells_subset ::write.xlsx(Seurat::GetAssayData(sc[, cells_subset], slot="data") %>% as.data.frame() %>% tibble::rownames_to_column(var="gene"), file=file.path(param$path_out, "data", paste0("subset_", 500, "_cells_normalised_data.xlsx")), overwrite = TRUE, row.names=FALSE, col.names=TRUE) openxlsx
× (Warning) Please use ‘rowNames’ instead of ‘row.names’
Please use ‘colNames’ instead of ‘col.names’
::write.xlsx(Seurat::GetAssayData(sc[, cells_subset], slot="scale.data") %>% as.data.frame() %>% tibble::rownames_to_column(var="gene"), file=file.path(param$path_out, "data", paste0("subset_", 500, "_cells_scaled_data.xlsx")), overwrite = TRUE, row.names=FALSE, col.names=TRUE) openxlsx
× (Warning) Please use ‘rowNames’ instead of ‘row.names’
Please use ‘colNames’ instead of ‘col.names’
::write.xlsx(sc[[]][cells_subset, ] %>% as.data.frame() %>% tibble::rownames_to_column(var="cell"), file=file.path(param$path_out, "data",paste0("subset_", 500, "_cells_column_annotations.xlsx")), overwrite = TRUE, row.names=FALSE, col.names=TRUE) openxlsx
× (Warning) Please use ‘rowNames’ instead of ‘row.names’
Please use ‘colNames’ instead of ‘col.names’
Result files are:
The following parameters were used to run the workflow.
= ScrnaseqParamsInfo(params=param)
out
::kable(out, align="l") %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left") kableExtra
Name | Value |
---|---|
project_id | pbmc_small |
path_data | name:sample1, sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/counts/sample1/, /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/counts/sample2/; stats:NA, NA; suffix:-1, -2 |
assay_raw | RNA |
path_out | /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results/ |
mart_dataset | hsapiens_gene_ensembl |
annot_version | 98 |
annot_main | ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession |
mart_attributes | ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description |
mt | ^MT- |
cell_filter | sample1:nFeature_RNA=c(200, 8000), percent_mt=c(NA, 15); sample2:nFeature_RNA=c(200, 8000), percent_mt=c(NA, 15) |
feature_filter | sample1:min_counts=1, min_cells=5; sample2:min_counts=1, min_cells=5 |
samples_min_cells | 10 |
norm | RNA |
cc_remove | FALSE |
cc_remove_all | FALSE |
cc_rescore_after_merge | FALSE |
integrate_samples | method:merge |
pc_n | 10 |
cluster_resolution | 0.8 |
cluster_resolution_test | 0.4, 0.6, 1.2 |
marker_padj | 0.05 |
marker_log2FC | 1 |
marker_pct | 0.25 |
enrichr_site | Enrichr |
enrichr_padj | 0.05 |
enrichr_dbs | GO_Biological_Process_2021, KEGG_2021_Human, WikiPathway_2021_Human |
col | palevioletred |
col_palette_samples | ggsci::pal_rickandmorty |
col_palette_clusters | ggsci::pal_igv |
pt_size | 1 |
path_to_git | . |
debugging_mode | default_debugging |
file_annot | /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results//annotation/hsapiens_gene_ensembl.v98.annot.txt |
file_cc_genes | /mnt/ngsnfs/single_cell_dev/users/kosankem/Scripts/testset/results//annotation/cell_cycle_markers.xlsx |
downsample_cells_n | |
biomart_mirror | www |
samples_to_drop | |
col_samples | sample1=#FAFD7CFF, sample2=#82491EFF |
col_clusters | 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF |
This report was created with generated using the scrnaseq GitHub repository. Software versions were collected at run time.
= ScrnaseqSessionInfo(param$path_to_git)
out
::kable(out, align="l") %>%
knitr::kable_styling(bootstrap_options=c("striped", "hover")) kableExtra
Name | Version |
---|---|
ktrns/scrnaseq | Unknown |
R | R version 4.1.2 (2021-11-01) |
Platform | x86_64-pc-linux-gnu (64-bit) |
Operating system | Ubuntu 20.04.3 LTS |
Packages | abind1.4-5, annotate1.70.0, AnnotationDbi1.54.1, ape5.5, assertthat0.2.1, babelgene21.4, beachmat2.8.1, bibtex0.4.2.3, Biobase2.52.0, BiocFileCache2.0.0, BiocGenerics0.38.0, BiocParallel1.26.2, BiocSingular1.8.1, biomaRt2.48.3, Biostrings2.60.2, bit4.0.4, bit644.0.5, bitops1.0-7, blob1.2.2, bslib0.3.0, cachem1.0.6, cerebroApp1.3.1, cli3.0.1, cluster2.1.2, codetools0.2-18, colorspace2.0-2, colourpicker1.1.0, cowplot1.1.1, crayon1.4.1, curl4.3.2, data.table1.14.2, DBI1.1.1, dbplyr2.1.1, DelayedArray0.18.0, DelayedMatrixStats1.14.3, deldir0.2-10, digest0.6.28, dplyr1.0.7, DT0.19, ellipsis0.3.2, enrichR3.0, evaluate0.14, fansi0.5.0, farver2.1.0, fastmap1.1.0, filelock1.0.2, fitdistrplus1.1-6, fs1.5.0, future1.22.1, future.apply1.8.1, generics0.1.0, GenomeInfoDb1.28.4, GenomeInfoDbData1.2.6, GenomicRanges1.44.0, ggplot23.3.5, ggrepel0.9.1, ggridges0.5.3, ggsci2.9, globals0.14.0, glue1.4.2, goftest1.2-2, graph1.70.0, gridExtra2.3, GSEABase1.54.0, GSVA1.40.1, gtable0.3.0, HDF5Array1.20.0, highr0.9, hms1.1.1, htmltools0.5.2, htmlwidgets1.5.4, httpuv1.6.3, httr1.4.2, ica1.0-2, igraph1.2.6, IRanges2.26.0, irlba2.3.3, jquerylib0.1.4, jsonlite1.7.2, kableExtra1.3.4, KEGGREST1.32.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.36, labeling0.4.2, later1.3.0, lattice0.20-45, lazyeval0.2.2, leiden0.3.9, lifecycle1.0.1, listenv0.8.0, lmtest0.9-38, lubridate1.7.10, magrittr2.0.1, MASS7.3-54, MAST1.18.0, Matrix1.4-0, MatrixGenerics1.4.3, matrixStats0.61.0, memoise2.0.0, mgcv1.8-38, mime0.12, miniUI0.1.1.1, msigdbr7.4.1, munsell0.5.0, nlme3.1-152, openxlsx4.2.4, parallelly1.28.1, patchwork1.1.1, pbapply1.5-0, pillar1.6.3, pkgconfig2.0.3, plotly4.9.4.1, plyr1.8.6, png0.1-7, polyclip1.10-0, prettyunits1.1.1, progress1.2.2, promises1.2.0.1, purrr0.3.4, qvalue2.24.0, R.methodsS31.8.1, R.oo1.24.0, R.utils2.11.0, R62.5.1, RANN2.6.1, rappdirs0.3.3, RColorBrewer1.1-2, Rcpp1.0.7, RcppAnnoy0.0.19, RCurl1.98-1.5, readr2.0.2, RefManageR1.3.0, reshape21.4.4, reticulate1.22, rhdf52.36.0, rhdf5filters1.4.0, Rhdf5lib1.14.2, rjson0.2.20, rlang0.4.11, rmarkdown2.11, ROCR1.0-11, rpart4.1-15, RSpectra0.16-0, RSQLite2.2.8, rstudioapi0.13, rsvd1.0.5, Rtsne0.15, rvest1.0.1, S4Vectors0.30.1, sass0.4.0, ScaledMatrix1.0.0, scales1.1.1, scattermore0.7, sceasy0.0.6, sctransform0.3.2, sessioninfo1.1.1, Seurat4.0.4, SeuratObject4.0.2, shiny1.7.0, shinycssloaders1.0.0, shinydashboard0.7.1, shinyFiles0.9.0, shinyjs2.0.0, shinyWidgets0.6.2, SingleCellExperiment1.14.1, sparseMatrixStats1.4.2, spatstat.core2.3-0, spatstat.data2.1-0, spatstat.geom2.2-2, spatstat.sparse2.0-0, spatstat.utils2.2-0, stringi1.7.4, stringr1.4.0, SummarizedExperiment1.22.0, survival3.2-13, svglite2.0.0, systemfonts1.0.2, tensor1.5, tibble3.1.4, tidyr1.1.4, tidyselect1.1.1, tzdb0.1.2, utf81.2.2, uwot0.1.10, vctrs0.3.8, viridis0.6.1, viridisLite0.4.0, webshot0.5.2, withr2.4.2, xfun0.26, XML3.99-0.8, xml21.3.2, xtable1.8-4, XVector0.32.0, yaml2.2.1, zip2.2.0, zlibbioc1.38.0, zoo1.8-9 |
# Writes knitcitations references to references.bib file.
::write.bibtex(file="references.bib") knitcitations