python脚本系列-两列数据互相转化
数据分析中常常会有这样的转换需求:把"每行一个键值对"的两列数据(左边的格式),按第一列合并成"每个键一行、多个值用逗号连接"的形式(右边的格式)。
脚本如下:
import sys


def collect_unique_values(lines):
    """Group second-column values by their first-column key.

    Args:
        lines: Iterable of tab-separated text lines (``key<TAB>value``).

    Returns:
        A dict mapping each key to the list of its distinct values, in
        first-seen order. Lines with fewer than two columns are skipped.
    """
    grouped = {}
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) < 2:
            continue
        accession, go = fields[0], fields[1]
        # A dict used as an ordered set: duplicates collapse like they would
        # in a set, but the output order stays deterministic between runs.
        # Use setdefault(accession, []).append(go) instead to keep duplicates.
        grouped.setdefault(accession, {})[go] = None
    return {acc: list(gos) for acc, gos in grouped.items()}


def main(argv):
    """Read ``argv[1]``, write ``key<TAB>count<TAB>v1,v2,...`` to ``argv[2]``."""
    with open(argv[1], 'r') as infile:
        result = collect_unique_values(infile)
    with open(argv[2], 'w') as res:
        for acc, goi in result.items():
            # The second output column is the number of distinct values.
            res.write("%s\t%d\t%s\n" % (acc, len(goi), ",".join(goi)))


if __name__ == "__main__":
    main(sys.argv)
另一种
import sys


def join_values_by_key(lines):
    """Merge two-column rows into one comma-joined value string per key.

    Args:
        lines: Iterable of text lines holding ``key value`` pairs separated
            by whitespace (spaces or tabs).

    Returns:
        A dict mapping each key to its values joined with ``,`` in input
        order (duplicates are kept). Lines with fewer than two columns,
        including blank lines, are skipped instead of raising IndexError.
    """
    merged = {}
    for line in lines:
        # split() with no argument accepts both space- and tab-separated
        # input and ignores the trailing newline.
        fields = line.split()
        if len(fields) < 2:
            continue
        merged.setdefault(fields[0], []).append(fields[1])
    return {key: ",".join(values) for key, values in merged.items()}


def main(argv):
    """Read ``argv[1]`` and write ``key<TAB>v1,v2,...`` rows to ``argv[2]``."""
    with open(argv[1], 'r') as old:
        num = join_values_by_key(old)
    with open(argv[2], 'w') as new:
        for key, value in num.items():
            print(f'{key}\t{value}', file=new)


if __name__ == "__main__":
    main(sys.argv)
OK,换个思路:此时我们想把数据从右边的格式(每个键一行、多个值用逗号连接)转换回左边的格式(每行一个键值对),该怎么做呢?
第一种
import sys


def Trans_file(f1, f2):
    """Expand ``key v1,v2,...`` rows into one ``key<TAB>value`` row per value.

    Args:
        f1: Iterable of input lines (e.g. an open text file). The key and the
            comma-joined values may be separated by spaces or tabs.
        f2: Writable object; each expanded row is passed to ``f2.write``.

    Lines with fewer than two columns (including blank lines) are skipped
    instead of raising IndexError.
    """
    # Iterate lazily rather than loading the whole file with readlines().
    for line in f1:
        # split() with no argument handles tab-separated input as well as
        # space-separated input, and drops the trailing newline.
        fields = line.split()
        if len(fields) < 2:
            continue
        key = fields[0]
        for value in fields[1].split(','):
            row = key + '\t' + value + '\n'
            # Echo each row to stdout, as the original script did.
            print(row)
            f2.write(row)


if __name__ == "__main__":
    with open(sys.argv[1], 'r') as f1, open(sys.argv[2], 'w') as f2:
        Trans_file(f1, f2)
第二种
import sys


def expand_rows(lines):
    """Split comma-joined values into individual ``(key, value)`` pairs.

    Args:
        lines: Iterable of text lines of the form ``key v1,v2,...`` where the
            two columns are separated by whitespace (spaces or tabs).

    Returns:
        A list of ``(key, value)`` tuples in input order. A key that appears
        on several lines contributes the values of every such line (storing
        one entry per key would silently keep only the last line). Lines
        with fewer than two columns are skipped.
    """
    pairs = []
    for line in lines:
        fields = line.split()
        if len(fields) < 2:
            continue
        gene = fields[0]
        pairs.extend((gene, item) for item in fields[1].split(','))
    return pairs


def main(argv):
    """Read ``argv[1]`` and write one ``key<TAB>value`` row per value to ``argv[2]``."""
    with open(argv[1], 'r') as p:
        pairs = expand_rows(p)
    with open(argv[2], 'w') as final:
        for key, i in pairs:
            print(f'{key}\t{i}', file=final)


if __name__ == "__main__":
    main(sys.argv)
嗯,大功告成~~
另外,对于第一种情况,有时我们想对重复键对应的数值求和。用 python 实现时,只需稍微改动一下语句即可(虽然这种处理在 R 中有一堆函数可以完成):
import sys


def sum_values_by_key(lines):
    """Sum the numeric second column for every distinct first-column key.

    Args:
        lines: Iterable of text lines holding ``key number`` pairs separated
            by whitespace (spaces or tabs).

    Returns:
        A dict mapping each key to the float sum of its values, with keys in
        first-seen order. Lines with fewer than two columns are skipped.

    Raises:
        ValueError: If a second-column value cannot be parsed as a float.
    """
    totals = {}
    for line in lines:
        fields = line.split()
        if len(fields) < 2:
            continue
        key = fields[0]
        totals[key] = totals.get(key, 0.0) + float(fields[1])
    return totals


def main(argv):
    """Read ``argv[1]`` and write ``key<TAB>sum`` rows to ``argv[2]``."""
    with open(argv[1], 'r') as old:
        num = sum_values_by_key(old)
    with open(argv[2], 'w') as new:
        for key, value in num.items():
            print(f'{key}\t{value}', file=new)


if __name__ == "__main__":
    main(sys.argv)