上次说了除了FPKM/RPKM标准化,我们可以直接拿原始Counts去进行差异分析,基于Deseq2、edgeR、limma三个包简单介绍一下分析流程。其中edgeR包在无生物学重复的研究中也用的较多~~
数据准备:counts矩阵及分组文件
Deseq2
rm(list = ls()) #Step1 Deseq2分析 library(DESeq2) ##数据预处理 database <- read.table(file = "../data/CvsNc.txt", sep = "\t", header = T, row.names = 1) database <- round(as.matrix(database)) ##设置分组信息并构建dds对象 condition <- factor(c(rep("C",3),rep("Nc",3)), levels = c("C", "Nc")) coldata <- data.frame(row.names = colnames(database), condition) dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition) #注: 这一步到下一步之间可以过滤掉一些low count数据,节省内存,提高运行速度 nrow(dds) dds <- dds[ rowSums(counts(dds)) > 1, ] ##使用DESeq函数估计离散度,然后差异分析获得res对象 dds <- DESeq(dds, parallel = FALSE) #parallel = TRUE 将启用多线程模式 suppressMessages(dds) res <- results(dds,pAdjustMethod = 'fdr',alpha = 0.05)#默认p阈值0.1,可以通过alpha设定 #最后设定阈值,筛选差异基因,导出数据(全部数据。包括标准化后的count数) res <- res[order(res$padj),] resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE) write.csv(resdata,file = "../result1/CvsNC.csv",row.names = F) #此方法得到的差异基因 diff_gene <- subset(res, padj < 0.05 & (log2FoldChange > 1 | log2FoldChange < -1)) diff_gene_Deseq2 <- row.names(diff_gene)
edgeR
对于该包,官方文档上针对有无生物学重复都有相应的流程
有生物学重复时候
To view documentation for the version of this package installed in your system, start R and enter: browseVignettes("edgeR")
library(edgeR) ##跟DESeq2一样,导入数据,预处理(用了cpm函数) exprSet<- read.table(file = "../data/CvsNc.txt", sep = "\t", header = TRUE, row.names = 1, stringsAsFactors = FALSE) group_list <- factor(c(rep("C",3),rep("Nc",3))) exprSet <- exprSet[rowSums(cpm(exprSet) > 1) >= 2,] ##设置分组信息,并做TMM标准化 exprSet <- DGEList(counts = exprSet, group = group_list) exprSet <- calcNormFactors(exprSet) ##使用qCML(quantile-adjusted conditional maximum likelihood)估计离散度(只针对单因素实验设计) exprSet <- estimateCommonDisp(exprSet) exprSet <- estimateTagwiseDisp(exprSet) ##寻找差异gene(这里的exactTest函数还是基于qCML并且只针对单因素实验设计),然后按照阈值进行筛选即可 et <- exactTest(exprSet) tTag <- topTags(et, n=nrow(exprSet)) resdata2 <- merge(as.data.frame(tTag$table),as.data.frame(exprSet$counts),by="row.names",sort=FALSE)%>%tibble::column_to_rownames('Row.names') write.csv(resdata2,file = "../result1/CvsNc_edgeR.csv") diff_gene_edgeR <- subset(resdata2, FDR < 0.05 & (logFC > 1 | logFC < -1)) diff_gene_edgeR <- row.names(diff_gene_edgeR)
无生物学重复时候
要指定其中的bcv参数, 就是选择一个认为合理的离散度的值。通常如果是实验控制的好的人类数据,那么选择BCV=0.4,比较好的模式生物选择BCV=0.1,技术重复的话选择BCV=0.1
targets <- as.matrix(read.delim('gene.txt', sep = '\t', row.names = 1)) dgelist <- DGEList(counts = targets, group = group) #参数很简单 你的矩阵和分组文件 # bcv是官方文档的推荐数值(对应人的,对应其他物种的值不清楚),可以自己调整 bcv = 0.1 et <- exactTest(y, dispersion=bcv^2) write.csv(topTags(dge_et, n = nrow(dgelist$counts)), 'exactTest.csv', quote = FALSE) #输出主要结果
limma
##数据预处理 library(limma) library(edgeR) #limma包里对RNA-Seq差异分析的limma-trend方法,该方法主要适用于样本间测序深度基本保持一致的情况,也就是说两个样本的文库(reads数目)大小相差的不悬殊,当文库大小在样本间变化幅度相当大的话,可以使用limma的voom方法,可按照下面的代码流程: counts <- read.table(file = "../data/CvsNc.txt", sep = "\t", header = T, row.names = 1) dge <- DGEList(counts = counts) dge <- calcNormFactors(dge) #标准化下 #设置分组信息 group_list <- factor(c(rep("C",3), rep("Nc",3))) design <- model.matrix(~group_list) colnames(design) <- levels(group_list) rownames(design) <- colnames(counts) #接下来进行voom转化 v <- voom(dge, design, plot=TRUE) #其实可以不进行TMM标准化,直接用count数据进行voom转化,如: v <- voom(counts, design, plot=TRUE) #差异分析 fit <- lmFit(v, design) fit <- eBayes(fit) output <- topTable(fit, coef=2,n=Inf) resdata3 <- merge(output,counts,by="row.names",sort=FALSE)%>%tibble::column_to_rownames('Row.names') write.csv(resdata3,file = "../result1/CvsNc_limma.csv") diff_gene_limma <- subset(resdata3, adj.P.Val < 0.05 & (logFC > 1 | logFC < -1)) diff_gene_limma <- row.names(diff_gene_limma)
至此我们已经用了三种方法针对同一基因矩阵进行了差异分析,我们平常做时候可以都跑一遍流程,然后再去取个交集,这里我简单画个Venn图展示下相交情况.
交集
#展示3种方法的相交情况 all_gene <- intersect(intersect(diff_gene_Deseq2,diff_gene_edgeR),diff_gene_limma) library(VennDiagram) 作图 venn_list <- list(diff_gene_Deseq2,diff_gene_edgeR,diff_gene_limma) names(venn_list) <- c('Deseq2','edgeR','limma') venn.diagram(venn_list, filename = '../figures//venn.png', fill = c('red', 'blue','gold'), alpha = 0.50, col = 'black', cex = 1, fontfamily = 'serif', cat.col = c('black', 'black','black'), cat.cex = 1, cat.fontfamily = 'serif', margin = 0.2)
我们再以Deseq2的结果简单画个火山图展示一下吧~由于火山图点太多,我们只展示上调和下调按照p.adjust值排名的前10个吧
#确定是上调还是下调,用于给图中点上色 library(ggrepel) colnames(resdata)[1] <- 'Gene' resdata$threshold = factor(ifelse(resdata$padj < 0.05 & abs(resdata$log2FoldChange) >= 1, ifelse(resdata$log2FoldChange>= 1 ,'Up','Down'),'NoSignifi'),levels=c('Up','Down','NoSignifi')) #上调 和下调的前10个标记出来 up <- subset(resdata, threshold == 'Up') up <- up[order(up$padj), ][1:10, ] down <- subset(resdata, threshold == 'Down') down <- down[order(down$padj), ][1:10, ] plot <- ggplot(resdata,aes(x=log2FoldChange,y=-log10(padj),color=threshold))+ geom_point()+ scale_color_manual(values=c("#DC143C","#00008B","#808080"))+#确定点的颜色 geom_text_repel(data = rbind(up, down), aes(x = log2FoldChange, y = -log10(padj), label = Gene), size = 3,box.padding = unit(0.5, 'lines'), segment.color = 'black', show.legend = FALSE)+ theme_bw()+#修改图片背景 theme( legend.title = element_blank()#不显示图例标题 )+ ylab('-log10 (p-adj)')+#修改y轴名称 xlab('log2 (FoldChange)')+#修改x轴名称 geom_vline(xintercept=c(-1,1),lty=3,col="black",lwd=0.5) +#添加横线|FoldChange|>2 geom_hline(yintercept = -log10(0.05),lty=3,col="black",lwd=0.5)#添加竖线padj<0.05 plot ggsave(paste('../figures/','TgvsNc.tiff'),plot, width = 16, height = 10) diff_gene <- subset(resdata, padj < 0.05 & (log2FoldChange > 1 | log2FoldChange < -1)) write.csv(diff_gene, paste('../results/','CvsNc_DESeq2.csv'), row.names = FALSE, quote = FALSE)
OK, 拿到差异基因之后,我们就可以去拿着这些geneID去做富集分析了,或者GSEA基因集富集,这些就留到下次再说吧~~