軟件介紹
SENIC是一種同時重建基因調控網絡並從單細胞RNA-seq數據中鑒定stable cell states的工具。基於共表達和DNA模基序 (motif)分析推斷基因調控網絡 ,然后在每個細胞中分析網絡活性以鑒定細胞狀態
https://www.nature.com/articles/nmeth.4463
參考幫助文檔:https://rawcdn.githack.com/aertslab/SCENIC/0a4c96ed8d930edd8868f07428090f9dae264705/inst/doc/SCENIC_Running.html#optional_steps:
輸入:SCENIC需要輸入的是單細胞RNA-seq表達矩陣—— 每列對應於樣品(細胞),每行對應一個基因。基因ID應該是gene-symbol並存儲為rownames (尤其是基因名字部分是為了與RcisTarget數據庫兼容);表達數據是Gene的reads count。根據作者的測試,提供原始的或Normalized UMI count,無論是否log轉換,或使用TPM值,結果相差不大。
軟件的安裝
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install(c("AUCell", "RcisTarget"))
BiocManager::install(c("GENIE3"))
BiocManager::install(c("zoo", "mixtools", "rbokeh"))
BiocManager::install(c("DT", "NMF", "pheatmap", "R2HTML", "Rtsne"))
BiocManager::install(c("doMC", "doRNG"))
BiocManager::install(c("SingleCellExperiment"))
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
devtools::install_github("aertslab/SCopeLoomR", build_vignettes = TRUE)
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
devtools::install_github("aertslab/SCENIC")
packageVersion("SCENIC")
下載評分數據庫
需要下載RcisTarget的物種特定數據庫(https://resources.aertslab.org/cistarget/)
For Human,Mouse,Fly
mm_url="https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-500bp-upstream-7species.mc9nr.feather"
mm_url2="https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-tss-centered-10kb-7species.mc9nr.feather"
hg_url="https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-500bp-upstream-7species.mc9nr.feather"
hg_url2="https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-tss-centered-10kb-7species.mc9nr.feather"
fly_url="https://resources.aertslab.org/cistarget/databases/drosophila_melanogaster/dm6/flybase_r6.02/mc8nr/gene_based/dm6-5kb-upstream-full-tx-11species.mc8nr.feather"
wget -c $mm_url
wget -c $mm_url2
wget -c $hg_url
wget -c $hg_url2
wget -c $fly_url
不同數據格式的讀入
對於loom文件
download.file("http://loom.linnarssonlab.org/clone/Previously%20Published/Cortex.loom", "Cortex.loom")
loomPath <- "Cortex.loom"
10x的輸出文件
singleCellMatrix <- Seurat::Read10X(data.dir="data/pbmc3k/filtered_gene_bc_matrices/hg19/")
cellInfo <- data.frame(seuratCluster=Idents(seuratObject))
R objects (e.g. Seurat, SingleCellExperiment)
sce <- load_as_sce(dataPath) # any SingleCellExperiment object
exprMat <- counts(sce)
cellInfo <- colData(sce)
簡單的SENIC工作流程
setwd("/media/sdb/project/20200223/SCENIC_MouseBrain")
loomPath <- system.file(package="SCENIC", "examples/mouseBrain_toy.loom")
library(SCopeLoomR)
loom <- open_loom(loomPath)
exprMat <- get_dgem(loom)
cellInfo <- get_cellAnnotation(loom)
close_loom(loom)
#查看矩陣大小
#dim(exprMat)
library(SCENIC)
#scenicOptions <- initializeScenic(org="mgi", dbDir="cisTarget_databases", nCores=10)
scenicOptions <- initializeScenic(org="mgi", dbDir="/media/sdb/project/20200223/data", nCores=10)
saveRDS(scenicOptions, file="int/scenicOptions.Rds")
### Co-expression network
genesKept <- geneFiltering(exprMat, scenicOptions)
exprMat_filtered <- exprMat[genesKept, ]
runCorrelation(exprMat_filtered, scenicOptions)
exprMat_filtered_log <- log2(exprMat_filtered+1)
runGenie3(exprMat_filtered_log, scenicOptions)
### Build and score the GRN
exprMat_log <- log2(exprMat+1)
scenicOptions@settings$dbs <- scenicOptions@settings$dbs["10kb"] # Toy run settings
runSCENIC_1_coexNetwork2modules(scenicOptions)
runSCENIC_2_createRegulons(scenicOptions, coexMethod=c("top5perTarget")) # Toy run settings
runSCENIC_3_scoreCells(scenicOptions, exprMat_log)
# Export: 運行這個時可能報錯
#export2scope(scenicOptions, exprMat)
# Binarize activity?
# aucellApp <- plotTsne_AUCellApp(scenicOptions, exprMat_log)
# savedSelections <- shiny::runApp(aucellApp)
# newThresholds <- savedSelections$thresholds
# scenicOptions@fileNames$int["aucell_thresholds",1] <- "int/newThresholds.Rds"
# saveRDS(newThresholds, file=getIntName(scenicOptions, "aucell_thresholds"))
# saveRDS(scenicOptions, file="int/scenicOptions.Rds")
runSCENIC_4_aucell_binarize(scenicOptions)
### Exploring output
# Check files in folder 'output'
# .loom file @ http://scope.aertslab.org
# output/Step2_MotifEnrichment_preview.html in detail/subset:
motifEnrichment_selfMotifs_wGenes <- loadInt(scenicOptions, "motifEnrichment_selfMotifs_wGenes")
tableSubset <- motifEnrichment_selfMotifs_wGenes[highlightedTFs=="Sox8"]
viewMotifs(tableSubset)
# output/Step2_regulonTargetsInfo.tsv in detail:
regulonTargetsInfo <- loadInt(scenicOptions, "regulonTargetsInfo")
tableSubset <- regulonTargetsInfo[TF=="Stat6" & highConfAnnot==TRUE]
viewMotifs(tableSubset)
運行SENIC
建立基因調控網絡(Gene Regulation Network,GRN):
- 基於共表達識別每個轉錄因子TF的潛在靶標。過濾表達矩陣並運行GENIE3或者GRNBoost,它們是利用表達矩陣推斷基因調控網絡的一種算法,能得到轉錄因子和潛在靶標的相關性網絡;將目標從GENIE3或者GRNBoost格式轉為共表達模塊。
- 根據DNA模序分析(
motif)選擇潛在的直接結合靶標(調節因子)(利用RcisTarget包:TF基序分析)
確定細胞狀態及其調節因子:
3. 分析每個細胞中的網絡活性(AUCell) 在細胞中評分調節子(計算AUC)
SCENIC完整流程
導入數據
loomPath <- system.file(package="SCENIC", "examples/mouseBrain_toy.loom")
library(SCopeLoomR)
loom <- open_loom(loomPath) #mode='r' 如果報錯可以加上
exprMat <- get_dgem(loom)
cellInfo <- get_cellAnnotation(loom)
close_loom(loom)
Initialize settings 初始設置,導入評分數據庫
library(SCENIC)
#先下載數據庫,org用來選擇物種,這里選擇的是小鼠
scenicOptions <- initializeScenic(org="mgi", dbDir="cisTarget_databases", nCores=10)
# scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
saveRDS(scenicOptions, file="int/scenicOptions.Rds")
共表達網絡
根據已有的表達矩陣推斷潛在的轉錄因子靶標,使用GENIE3或GRNBoost。首先需要進行基因的過濾。
genesKept <- geneFiltering(exprMat, scenicOptions=scenicOptions,
minCountsPerGene=3*.01*ncol(exprMat),
minSamples=ncol(exprMat)*.01)
過濾表達矩陣,保留只有過濾后的基因
exprMat_filtered <- exprMat[genesKept, ]
計算相關性,這一步時間會比較長
runCorrelation(exprMat_filtered, scenicOptions)
exprMat_filtered_log <- log2(exprMat_filtered+1)
runGenie3(exprMat_filtered_log, scenicOptions)
Build and score the GRN 構建並給基因調控網絡(GRN)打分
exprMat_log <- log2(exprMat+1)
scenicOptions@settings$dbs <- scenicOptions@settings$dbs["10kb"] # Toy run settings
runSCENIC_1_coexNetwork2modules(scenicOptions)
runSCENIC_2_createRegulons(scenicOptions, coexMethod=c("top5perTarget")) # Toy run settings
runSCENIC_3_scoreCells(scenicOptions, exprMat_log)
輸入表達矩陣
在本教程中,我們提供了一個示例,樣本是小鼠大腦的200個細胞和862個基因:
loomPath <- system.file(package="SCENIC", "examples/mouseBrain_toy.loom")
打開loom文件並加載表達矩陣;
library(SCopeLoomR)
loom <- open_loom(loomPath, mode="r")
exprMat <- get_dgem(loom)
cellInfo <- get_cellAnnotation(loom)
close_loom(loom)
#dim(exprMat)

細胞信息/表型
# cellInfo$nGene <- colSums(exprMat>0)
head(cellInfo)

cellInfo <- data.frame(cellInfo)
cellTypeColumn <- "Class"
colnames(cellInfo)[which(colnames(cellInfo)==cellTypeColumn)] <- "CellType"
cbind(table(cellInfo$CellType))

saveRDS(cellInfo, file="int/cellInfo.Rds")
# Color to assign to the variables (same format as for NMF::aheatmap)
colVars <- list(CellType=c("microglia"="forestgreen",
"endothelial-mural"="darkorange",
"astrocytes_ependymal"="magenta4",
"oligodendrocytes"="hotpink",
"interneurons"="red3",
"pyramidal CA1"="skyblue",
"pyramida SS"="darkblue"
))
colVars$CellType <- colVars$CellType[intersect(names(colVars$CellType), cellInfo$CellType)]
saveRDS(colVars, file="int/colVars.Rds")
plot.new()
legend(0,1, fill=colVars$CellType, legend=names(colVars$CellType))

初始化SCENIC設置
為了在SCENIC的多個步驟中保持設置一致,SCENIC包中的大多數函數使用一個公共對象,該對象存儲當前運行的選項並代替大多數函數的“參數”。比如下面的org,dbDir等,可以在開始就將物種rog(mgi—— mouse, hgnc —— human, dmel —— fly)和RcisTarge數據庫位置分別讀給對象org,dbDir,之后統一用函數initializeScenic得到對象scenicOptions。具體參數設置可以用?initializeScenichelp一下。
library(SCENIC)
org="mgi" # or hgnc, or
dmeldbDir="cisTarget_databases" # RcisTarget databases location
myDatasetTitle="SCENIC example on Mouse brain" # choose a name for your analysis
data(defaultDbNames)
dbs <- defaultDbNames[[org]]
scenicOptions <- initializeScenic(org=org, dbDir=dbDir, dbs=dbs, datasetTitle=myDatasetTitle, nCores=10)

# 如果有需要就修改這個地方
scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds
"scenicOptions@inputDatasetInfo$colVars <- "int/colVars.Rds"
# Databases:
# scenicOptions@settings$dbs <- c("mm9-5kb-mc8nr"="mm9-tss-centered-5kb-10species.mc8nr.feather")
# scenicOptions@settings$db_mcVersion <- "v8"
# Save to use at a later time...
saveRDS(scenicOptions, file="int/scenicOptions.Rds")
共表達網絡
SCENIC工作流程的第一步是根據表達數據推斷潛在的轉錄因子靶標。為此,我們使用GENIE3或GRNBoost,輸入文件是表達矩陣(過濾后的)和轉錄因子列表。GENIE3/GRBBoost的輸出結果和相關矩陣將用於創建共表達模塊(runSCENIC_1_coexNetwork2modules())。
基因過濾/選擇
- 按每個基因的reads總數進行過濾。該filter旨在去除最可能是噪音的基因。默認情況下,它(
minCountsPerGene)保留所有樣品中至少帶有6個UMI reads的基因(例如,如果在1%的細胞中以3的值表達,則基因將具有的總數)。 - 通過基因的細胞數來實現過濾(例如 UMI > 0 ,或log 2(TPM)> 1 )。默認情況下(
minSamples),保留下來的基因能在至少1%的細胞中檢測得到。 - 最后,只保留
RcisTarget數據庫中可用的基因。
# (Adjust minimum values according to your dataset)
genesKept <- geneFiltering(exprMat, scenicOptions=scenicOptions,
minCountsPerGene=3*.01*ncol(exprMat),
minSamples=ncol(exprMat)*.01)

在進行網絡推斷之前,檢查是否有任何已知的相關基因被過濾掉(如果缺少任何相關基因,請仔細檢查filter設置是否合適):
interestingGenes <- c("Sox9", "Sox10", "Dlx5")
interestingGenes[which(!interestingGenes %in% genesKept)]
相關性
GENIE33或者GRNBoost可以檢測正負關聯。為了區分潛在的激活和抑制,我們將目標分為正相關和負相關目標(比如TF與潛在目標之間的Spearman相關性)。
runCorrelation(exprMat_filtered, scenicOptions)
運行GENIE3得到潛在轉錄因子TF
## If launched in a new session, you will need to reload...
# setwd("...")
# loomPath <- "..."
# loom <- open_loom(loomPath, mode="r")
# exprMat <- get_dgem(loom)
# close_loom(loom)
# genesKept <- loadInt(scenicOptions, "genesKept")
# exprMat_filtered <- exprMat[genesKept,]
# library(SCENIC)
# scenicOptions <- readRDS("int/scenicOptions.Rds")
# Optional: add log (if it is not logged/normalized already)
exprMat_filtered <- log2(exprMat_filtered+1)
# Run GENIE3
runGenie3(exprMat_filtered, scenicOptions)
構建並評分GRN(runSCENIC_ …)
必要時重新加載表達式矩陣:
loom <- open_loom(loomPath, mode="r")
exprMat <- get_dgem(loom)
close_loom(loom)
# Optional: log expression (for TF expression plot, it does not affect any other calculation)
logMat <- log2(exprMat+1)
dim(exprMat)
使用wrapper函數運行其余步驟:
library(SCENIC)
scenicOptions <- readRDS("int/scenicOptions.Rds")
scenicOptions@settings$verbose <- TRUE
scenicOptions@settings$nCores <- 10
scenicOptions@settings$seed <- 123
# For a very quick run:
# coexMethod=c("top5perTarget")
scenicOptions@settings$dbs <- scenicOptions@settings$dbs["10kb"] # For toy run
# save...
runSCENIC_1_coexNetwork2modules(scenicOptions)
runSCENIC_2_createRegulons(scenicOptions, coexMethod=c("top5perTarget")) #** Only for toy run!! 只用於測試數據
runSCENIC_3_scoreCells(scenicOptions, logMat)
可選步驟
將network activity轉換成ON/OFF(二進制)格式
nPcs <- c(5) # For toy dataset
# nPcs <- c(5,15,50)
scenicOptions@settings$seed <- 123 # same seed for all of them
#使用不同的參數運行t-SNE
fileNames <- tsneAUC(scenicOptions, aucType="AUC", nPcs=nPcs, perpl=c(5,15,50))
fileNames <- tsneAUC(scenicOptions, aucType="AUC", nPcs=nPcs, perpl=c(5,15,50), onlyHighConf=TRUE, filePrefix="int/tSNE_oHC")
# 畫圖 (individual files in int/):
fileNames <- paste0("int/",grep(".Rds", grep("tSNE_", list.files("int"), value=T), value=T))
par(mfrow=c(length(nPcs), 3))
fileNames <- paste0("int/",grep(".Rds", grep("tSNE_AUC", list.files("int"), value=T, perl = T), value=T))
plotTsne_compareSettings(fileNames, scenicOptions, showLegend=FALSE, varName="CellType", cex=.5)

# Using only "high-confidence" regulons (normally similar)
par(mfrow=c(3,3))
fileNames <- paste0("int/",grep(".Rds", grep("tSNE_oHC_AUC", list.files("int"), value=T, perl = T), value=T))
plotTsne_compareSettings(fileNames, scenicOptions, showLegend=FALSE, varName="CellType", cex=.5)
輸出到 loom/SCope
SCENIC生成的結果既能在http://scope.aertslab.org查看,還能用函數export2scope()(需要SCopeLoomR包)保存成.loom文件。
# DGEM (Digital gene expression matrix)
# (non-normalized counts)
# exprMat <- get_dgem(open_loom(loomPath))
# dgem <- exprMat
# head(colnames(dgem)) #should contain the Cell ID/name
# Export:
scenicOptions@fileNames$output["loomFile",] <- "output/mouseBrain_SCENIC.loom"
export2scope(scenicOptions, exprMat)
加載.loom文件中的結果
SCopeLoomR中也有函數可以導入.loom文件中的內容,比如調節因子,AUC和封裝內容(比如regulon activity的t-SNE和UMAP結果)。
library(SCopeLoomR)
scenicLoomPath <- getOutName(scenicOptions, "loomFile")
loom <- open_loom(scenicLoomPath)
# Read information from loom file:
regulons_incidMat <- get_regulons(loom)
regulons <- regulonsToGeneLists(regulons_incidMat)
regulonsAUC <- get_regulonsAuc(loom)
regulonsAucThresholds <- get_regulonThresholds(loom)
embeddings <- get_embeddings(loom)
解讀結果
1. 細胞狀態
AUCell提供跨細胞的調節子的活性,AUCell使用“Area under Curve 曲線下面積”(AUC)來計算輸入基因集的關鍵子集是否在每個細胞的表達基因中富集。通過該調節子活性(連續或二進制AUC矩陣)來聚類細胞,我們可以看出是否存在傾向於具有相同調節子活性的細胞群,並揭示在多個細胞中反復發生的網絡狀態。這些狀態等同於網絡的吸引子狀態。將這些聚類與不同的可視化方法相結合,我們可以探索細胞狀態與特定調節子的關聯。
將AUC和TF表達投射到t-SNE上
logMat <- exprMat # Better if it is logged/normalized
aucellApp <- plotTsne_AUCellApp(scenicOptions, logMat) # default t-SNE
savedSelections <- shiny::runApp(aucellApp)
print(tsneFileName(scenicOptions))
tSNE_scenic <- readRDS(tsneFileName(scenicOptions))
aucell_regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
# Show TF expression:
par(mfrow=c(2,3))
AUCell::AUCell_plotTSNE(tSNE_scenic$Y, exprMat, aucell_regulonAUC[onlyNonDuplicatedExtended(rownames(aucell_regulonAUC))[c("Dlx5", "Sox10", "Sox9","Irf1", "Stat6")],], plots="Expression")

# 保存AUC圖片:
Cairo::CairoPDF("output/Step4_BinaryRegulonActivity_tSNE_colByAUC.pdf", width=20, height=15)
par(mfrow=c(4,6))
AUCell::AUCell_plotTSNE(tSNE_scenic$Y, cellsAUC=aucell_regulonAUC, plots="AUC")
dev.off()
library(KernSmooth)
library(RColorBrewer)
dens2d <- bkde2D(tSNE_scenic$Y, 1)$fhat
image(dens2d, col=brewer.pal(9, "YlOrBr"), axes=FALSE)
contour(dens2d, add=TRUE, nlevels=5, drawlabels=FALSE)

#par(bg = "black")
par(mfrow=c(1,2))
regulonNames <- c( "Dlx5","Sox10")
cellCol <- plotTsne_rgb(scenicOptions, regulonNames, aucType="AUC", aucMaxContrast=0.6)
text(0, 10, attr(cellCol,"red"), col="red", cex=.7, pos=4)
text(-20,-10, attr(cellCol,"green"), col="green3", cex=.7, pos=4)
regulonNames <- list(red=c("Sox10", "Sox8"),
green=c("Irf1"),
blue=c( "Tef"))
cellCol <- plotTsne_rgb(scenicOptions, regulonNames, aucType="Binary")
text(5, 15, attr(cellCol,"red"), col="red", cex=.7, pos=4)
text(5, 15-4, attr(cellCol,"green"), col="green3", cex=.7, pos=4)
text(5, 15-8, attr(cellCol,"blue"), col="blue", cex=.7, pos=4)

GRN:Regulon靶標和模序
regulons <- loadInt(scenicOptions, "regulons")
regulons[c("Dlx5", "Irf1")]

regulons <- loadInt(scenicOptions, "aucell_regulons")
head(cbind(onlyNonDuplicatedExtended(names(regulons))))

regulonTargetsInfo <- loadInt(scenicOptions, "regulonTargetsInfo")
tableSubset <- regulonTargetsInfo[TF=="Stat6" & highConfAnnot==TRUE]
viewMotifs(tableSubset)
2. 細胞群的調控因子
regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
regulonAUC <- regulonAUC[onlyNonDuplicatedExtended(rownames(regulonAUC)),]
regulonActivity_byCellType <- sapply(split(rownames(cellInfo), cellInfo$CellType),function(cells) rowMeans(getAUC(regulonAUC)[,cells]))
regulonActivity_byCellType_Scaled <- t(scale(t(regulonActivity_byCellType), center = T, scale=T))
pheatmap::pheatmap(regulonActivity_byCellType_Scaled, color=colorRampPalette(c("blue","white","red"))(100), breaks=seq(-3, 3, length.out = 100),treeheight_row=10, treeheight_col=10, border_color=NA)

# filename="regulonActivity_byCellType.pdf", width=10, height=20)
topRegulators <- reshape2::melt(regulonActivity_byCellType_Scaled)
colnames(topRegulators) <- c("Regulon", "CellType", "RelativeActivity")
topRegulators <- topRegulators[which(topRegulators$RelativeActivity>0),]
viewTable(topRegulators)
minPerc <- .7
binaryRegulonActivity <- loadInt(scenicOptions, "aucell_binary_nonDupl")
cellInfo_binarizedCells <- cellInfo[which(rownames(cellInfo)%in% colnames(binaryRegulonActivity)),, drop=FALSE]
regulonActivity_byCellType_Binarized <- sapply(split(rownames(cellInfo_binarizedCells), cellInfo_binarizedCells$CellType),function(cells) rowMeans(binaryRegulonActivity[,cells, drop=FALSE]))
binaryActPerc_subset <- regulonActivity_byCellType_Binarized[which(rowSums(regulonActivity_byCellType_Binarized>minPerc)>0),]
pheatmap::pheatmap(binaryActPerc_subset,color = colorRampPalette(c("white","pink","red"))(100), breaks=seq(0, 1, length.out = 100),treeheight_row=10, treeheight_col=10, border_color=NA)

