-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdeal_vcf.py
87 lines (78 loc) · 3.56 KB
/
deal_vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from collections import defaultdict
import re
import pandas as pd
import click
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
@click.version_option(version='1.0.0')
@click.command('decode_vcf',help='Decode vcf file and order sample or change to genotype. Example: python vcf.py --vcf test.vcf --change_line T --select_line A141,L506')
@click.option("--vcf", type=click.Path(exists=True), help="vcf file")
@click.option("--change_line", type=click.Choice(['false','f','F','true','t','T']),default='false',help=" change the order of sample")
@click.option("--select_line", type=str, default=False,help=" Sample name needed to be changed.(A141,L506) the first sample is the sample needed to be changed, the second sample will locate its next.")
@click.option("--vcf2genotype", type=click.Choice(['false','f','F','true','t','T']),default='false',help=" change the order of sample")
@click.option("-o","--outfile",default="out.txt",type=str,help="Output file (default 'out.txt')")
def decode_vcf(vcf,change_line,select_line,vcf2genotype,outfile):
try:
with open(vcf,'r') as IN,open('header.txt','w') as OUT:
header = []
raw_sample = ''
bady = []
bady_dict = defaultdict(list)
for line in IN: #extract header
if line.startswith('##'):
header.append(line)
elif line.startswith('#CHROM'):
raw_sample += line
else:
bady.append(line)
header = ''.join(header)
all_sample = re.findall(r'FORMAT\t(.*)',raw_sample)[0].split() #extract sample name
sample = raw_sample.split()
for i in range(len(sample)): #extract sample variation
for allsnp in bady:
snp = allsnp.split()
bady_dict[sample[i]].append(snp[i])
data = pd.DataFrame(dict(bady_dict))
######################################################################## fuction 1
if change_line in ['true','t','T']: #change the order of sample
col_need_change = select_line.split(',')[0]
where_need_change =select_line.split(',')[1]
data_need_change = data.loc[:,col_need_change]
data=data.drop(col_need_change, axis=1)
header_data = list(data.columns.values)
loc = header_data.index(where_need_change)
data.insert(loc, col_need_change,data_need_change)
print("Received {} samples and {} snps...".format(data.shape[1]-9,data.shape[0]))
OUT.write(header)
data.to_csv(outfile,sep='\t',index=False)
else:
pass
######################################################################### fuction 2
if vcf2genotype in ['true','t','T']:
pattern = {'GT':'K','AC':'M','AG':'R','CG':'S','CT':'Y','AT':'W','TG':'K','CA':'M','GA':'R','GC':'S','TC':'Y','TA':'W'}
ann_data = data.iloc[:,[0,1,3]]
df_change = pd.DataFrame() #crate all sample collector
for i in range(len(all_sample)): #extract sample variation
tmp = [] #crate per sample collector
for allsnp in bady: #3:ref 4:alt 9:ann number
snp = allsnp.split()
if snp[i+9].split(':')[0] == '0/0':
tmp.append(snp[3])
elif snp[i+9].split(':')[0] == '0/1':
locate = snp[3]+snp[4]
tmp.append(pattern[locate])
elif snp[i+9].split(':')[0] == '1/1':
tmp.append(snp[4])
elif snp[i+9].split(':')[0] == './.':
tmp.append('-')
else:
pass
df_change[all_sample[i]]=tmp
vcf2genotype = ann_data.join(df_change)
vcf2genotype.to_csv(outfile,sep='\t',index=False)
print('Finish vcf2genotype!')
else:
pass
except TypeError:
print("Please input your option, or use --help")
if __name__ == "__main__":
decode_vcf()