import pandas as pd ; import numpy as np ; from scipy import sparse
import scanpy as sc
import anndata as ad
import os
1、加载datafram格式的rna_counts数据集构建AnnData
X = pd.read_csv(r'./1.rnacounts.tsv', sep = '\t').transpose()
X = sparse.csr_matrix(X)
ad_1 = ad.AnnData(X = X.values, obs = pd.DataFrame(index = X.index), var = pd.DataFrame(index = X.columns))
ad_1.var_names_make_unique()
ad_1.obs_names_make_unique()
X = pd.read_csv(r'./2.rnacounts.tsv', sep = '\t').transpose()
X = sparse.csr_matrix(X)
ad_2 = ad.AnnData(X = X.values, obs = pd.DataFrame(index = X.index), var = pd.DataFrame(index = X.columns))
ad_2.var_names_make_unique()
ad_2.obs_names_make_unique()
X = pd.read_csv(r'./3.rnacounts.tsv', sep = '\t').transpose()
X = sparse.csr_matrix(X)
ad_3 = ad.AnnData(X = X.values, obs = pd.DataFrame(index = X.index), var = pd.DataFrame(index = X.columns))
ad_3.var_names_make_unique()
ad_3.obs_names_make_unique()
2、添加数据集的分组信息
ad_1.obs["batch"] = "batch_1"
ad_2.obs["batch"] = "batch_2"
ad_3.obs["batch"] = "batch_3"
print("ad_1:::",ad_1.shape,"\nad_2:::",ad_2.shape,"\nad_3:::",ad_3.shape)
3、合并多个AnnData对象为一个数据集
adList= [ad_1,ad_2,ad_3]
#ad_all =ad.concat(adList,join='outer') #合并同R语言Seurat的merge(scRNAList[[1]],scRNAList[2:length(scRNAList)])
ad_all = sc.AnnData.concatenate(*scRNAList,join='outer')
ad_all.var_names_make_unique()
ad_all.obs_names_make_unique()
sc.pp.calculate_qc_metrics(ad_all, percent_top=None,log1p=False,inplace=True) #counts统计
4、根据batch分组进行等比例抽取数据
N = 1000 #每组抽1000细胞
ad_tmp = ad_all[ad_all.obs.groupby("batch").sample(n = N, random_state=123,replace=False).index].copy
frac = 0.5 #每组抽50%的细胞
ad_tmp = ad_all[ad_all.obs.groupby("batch").sample(frac = frac, random_state=123,replace=False).index].copy
5、根据batch列的信息进行分层不等比采样
adList=[]
groups = ad_all.obs.groupby("batch").size()
for batch in group.index:
i = groups [batch]
frc = i / ad_all.obs.groupby("batch").size().sum()
N = int(round(frc*1000,0)) #总采取1000细胞,分层不等比抽取
_index = ad_all.obs[ ad_all.obs["batch"] == batch ].sample(n = N, random_state=123,replace=False).index
ad_tmp = ad_all[_index].copy()
adList.append(ad_tmp)
#ad_sub =ad.concat(adList,join='outer')
ad_sub = sc.AnnData.concatenate(*adList,join='outer')