1.前言:
RNA测序技术允许研究人员在转录组水平上精细地检测基因表达,包括等位基因特异性表达的变异。通过比较来自同一基因的不同等位基因的表达量,可以揭示细胞内遗传和表观遗传调控机制的差异。本代码通过对RNA测序数据中的读数计数进行详细分析,旨在检测和量化等位基因不平衡。通过优化统计模型来估计等位基因表达的差异,研究人员可以识别出在特定生物学条件下受到调控的基因区域。
其实和DNA的处理是差不多的,只是测序到的数值水平不同,以及基因的表达有所差异
2.demo
2.1 加载包和读取数据
suppressMessages(library(data.table)) suppressMessages(library(plyr)) suppressMessages(library(dplyr)) suppressMessages(library(rmutil)) filename.count <- "rna_allele_count.csv" filename.output <- "rna_result.csv" # counts cnt <- read.csv(filename.count)
2.2 数据预处理
从CSV文件中加载RNA测序计数数据,去除映射错误大于2的位点,以提高分析的准确性。
# filter sites with more than 2 read mapping to error allele cnt<- subset(cnt, errors <= 2) # optimize dispersion and error mtmp <- function(par, x, n, ge){ # likelihood, conditional on genotype error term1<- 0.5 * dbetabinom(x, n, m= par[1], s= par[2]) term2<- 0.5 * dbetabinom(n - x, n, m= par[1], s= par[2]) err.like <- (ge)*(term1 + term2) # likelihood, conditional on no genotype error het.like = (1-ge) * dbetabinom(x, n, m= 0.5, s= par[2]) # log likelihood ll<- -sum(log(err.like + het.like)) # return return(ll) } # optimize m0<- optim(par= c(1e-05,1), mtmp, x= cnt$ref.matches, n= cnt$N, ge= cnt$genotype.error, method="L-BFGS-B", lower = c(1e-10, 1e-05), upper = c(0.999, 100)) # get parameter err = m0$par[1] d = m0$par[2]
2.3 统计建模和优化,提取基因
应用二项贝塔分布对每个位点的读数计数进行建模,考虑基因型误差。使用优化函数(如L-BFGS-B方法)来估计模型参数,包括等位基因不平衡比例和分散参数。
# likelihood function ll.fun <- function(par, x, n, ge, err, d) { # likelihood for first site allelic.imbalance <- par # likelihood, conditional on genotype error t1 = 0.5 * dbetabinom(x[1], n[1], m=err, s=d) t2 = 0.5 * dbetabinom(n[1]-x[1], n[1], m=err, s=d) er1 = ge[1] * (t1 + t2) # likelihood, conditional on no genotype error d1 = (1 - ge[1]) * dbetabinom(x[1], n[1], m = 0.5 + allelic.imbalance, s = d) # combined likelihood p1 = er1 + d1 # for subsequent sites len = length(x) if(len > 1) { # likelihood given genotyping error ts1 <- 0.5 * dbetabinom(x[2:len], n[2:len], m=err , s=d) ts2 <- 0.5 * dbetabinom(n[2:len]- x[2:len], n[2:len], m=err, s=d) er2 <- (ge[2:len])*(ts1 + ts2) # consider all possible phasings with respect to the first SNP # precompute likelihoods of each subsequent SNP given 'in-phase' with first SNP snp.phase1.like <- (1-ge[2:len])*dbetabinom(x[2:len], n[2:len], m=0.5 + allelic.imbalance, s = d) + er2 # precompute likelihoods of each subsequent SNP given 'out-of-phase' with first SNP snp.phase0.like <- (1-ge[2:len])*dbetabinom(x[2:len], n[2:len], m=0.5 - allelic.imbalance, s = d) + er2 # create phase array phase1.like.array <- rep(NA, len) phase0.like.array <- rep(NA, len) # add likelihood for first site phase1.like.array[1] <- p1 phase0.like.array[1] <- p1 for(i in 2:len) { # prior SNP was either in-phase or out-of-phase with first SNP, consider # both mutually exclusive possibilities when computing combined likelihood of # all possible combinations of phases prev <- (0.5 * phase1.like.array[i-1]) + (0.5 * phase0.like.array[i-1]) phase1.like.array[i] <- prev * snp.phase1.like[i-1] phase0.like.array[i] <- prev * snp.phase0.like[i-1] } # total likelihood is sum of last two elements l = -log(0.5*phase1.like.array[len] + 0.5*phase0.like.array[len]) } else { l = -sum(log(p1)) } return(l) } # get genes genes= cnt$gene_id %>% unique %>% as.character
2.4 等位基因不平衡分析
对于每个基因,整合其所有位点的数据,通过最小化似然函数来估计等位基因不平衡的程度。这包括计算每个位点的等位基因表达的似然度,以及它们相对于第一个位点的相位(phase)似然度。
# function to run allele imbalance measurements fun = function(i){ # subset df= cnt[cnt$gene_id %in% genes[i],] # order df= df[order(df$start),] # number of het sites n.snp= nrow(df) # optimize m1 <- optimize(ll.fun, c(-0.5, 0.5), x= df$ref.matches, n= df$N, ge= df$genotype.error, err= err, d= d) # estimates of allelic.imbalance alt.ll <- m1$objective estimate <- m1$minimum # NULL hypothesis null.ll= ll.fun(par = 0, x= df$ref.matches, n = df$N, ge = df$genotype.error, err = err, d = d) # Likelihood ratio test lrt.stat <- 2 * (null.ll - alt.ll) pval <- pchisq(lrt.stat, df=1, lower.tail=F) result= data.frame(gene_id= genes[i], pval =pval, a= estimate) # return return(result) }
2.5 整合结果数据
对每个基因的分析结果进行汇总,应用FDR(假发现率)校正来调整P值,最后将整合的结果输出到CSV文件中。
my.list= llply(1:length(genes), fun, .progress = progress_text(char= "+")) # combine list res= do.call(rbind, my.list) # correct p-values res$fdr= p.adjust(res$pval, "fdr") # order res= res[order(res$pval),] # output results write.csv(res, filename.output, row.names = F)
下集预告:拷贝数变异的分析~