This repository was archived by the owner on May 2, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_report_data.py
175 lines (146 loc) · 6.78 KB
/
extract_report_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import glob
import pandas as pd
def create_report_dictionary(report_list, seq_list, id_column):
"""
:param report_list: List of paths to report files
:param seq_list: List of OLC Seq IDs
:param id_column: Column used to specify primary key
:return: Dictionary containing Seq IDs as keys and dataframes as values
"""
# Create empty dict to store reports of interest
report_dict = {}
# Iterate over every metadata file
for report in report_list:
# Check if the sample we want is in file
df = pd.read_csv(report)
samples = df[id_column]
# Check all of our sequences of interest to see if they are in the combinedMetadata file
for seq in seq_list:
if seq in samples.values:
# Associate dataframe with sampleID
report_dict[seq] = df
return report_dict
# TODO: Fix this for production to retrieve full list of combinedMetadata.csv reports
def get_combined_metadata(seq_list):
"""
:param seq_list: List of OLC Seq IDs
:return: Dictionary containing Seq IDs as keys and combinedMetadata dataframes as values
"""
# Grab every single combinedMetadata file we have
# all_reports = glob.glob('/mnt/nas/WGSspades/*/reports/combinedMetadata.csv')
metadata_reports = glob.glob('/home/dussaultf/Documents/COWBAT_TEST_V2/reports/combinedMetadata.csv')
metadata_report_dict = create_report_dictionary(report_list=metadata_reports, seq_list=seq_list, id_column='SeqID')
return metadata_report_dict
# TODO: Fix this for production to retrieve full list of GDCS reports
def get_gdcs(seq_list):
"""
:param seq_list: List of OLC Seq IDs
:return: Dictionary containing Seq IDs as keys and GDCS dataframes as values
"""
# Grab every single combinedMetadata file we have
# all_reports = glob.glob('/mnt/nas/WGSspades/*/reports/combinedMetadata.csv')
gdcs_reports = glob.glob('/home/dussaultf/Documents/COWBAT_TEST_V2/reports/GDCS.csv')
gdcs_report_dict = create_report_dictionary(report_list=gdcs_reports, seq_list=seq_list, id_column='Strain')
return gdcs_report_dict
def validate_genus(seq_list, genus):
"""
Validates whether or not the expected genus matches the observed genus parsed from combinedMetadata.
:param seq_list: List of OLC Seq IDs
:param genus: String of expected genus (Salmonella, Listeria, Escherichia)
:return: Dictionary containing Seq IDs as keys and a 'valid status' as the value
"""
metadata_reports = get_combined_metadata(seq_list)
valid_status = {}
for seqid in seq_list:
print('Validating {} genus'.format(seqid))
df = metadata_reports[seqid]
observed_genus = df.loc[df['SeqID'] == seqid]['Genus'].values[0]
if observed_genus == genus:
valid_status[seqid] = True # Valid genus
else:
valid_status[seqid] = False # Invalid genus
return valid_status
def validate_ecoli(seq_list, metadata_reports):
"""
Checks if the uidA marker and vt markers are present in the combinedMetadata sheets and stores True/False for
each SeqID. Values are stored as tuples: (uida_present, verotoxigenic)
:param seq_list: List of OLC Seq IDs
:param metadata_reports: Dictionary retrieved from get_combined_metadata()
:return: Dictionary containing Seq IDs as keys and (uidA, vt) presence or absence for values.
Present = True, Absent = False
"""
ecoli_seq_status = {}
for seqid in seq_list:
print('Validating {} uidA and vt marker detection'.format(seqid))
df = metadata_reports[seqid]
observed_genus = df.loc[df['SeqID'] == seqid]['Genus'].values[0]
uida_present = False
verotoxigenic = False
if observed_genus == 'Escherichia':
if 'uidA' in df.loc[df['SeqID'] == seqid]['GeneSeekr_Profile'].values[0]:
uida_present = True
if 'vt' in df.loc[df['SeqID'] == seqid]['Vtyper_Profile'].values[0]:
verotoxigenic = True
ecoli_seq_status[seqid] = (uida_present, verotoxigenic)
return ecoli_seq_status
def validate_mash(seq_list, metadata_reports, expected_species):
"""
Takes a species name as a string (i.e. 'Salmonella enterica') and creates a dictionary with keys for each Seq ID
and boolean values if the value pulled from MASH_ReferenceGenome matches the string or not
:param seq_list: List of OLC Seq IDs
:param metadata_reports: Dictionary retrieved from get_combined_metadata()
:param expected_species: String containing expected species
:return: Dictionary with Seq IDs as keys and True/False as values
"""
seq_status = {}
for seqid in seq_list:
print('Validating MASH reference genome for {} '.format(seqid))
df = metadata_reports[seqid]
observed_species = df.loc[df['SeqID'] == seqid]['MASH_ReferenceGenome'].values[0]
if observed_species == expected_species:
seq_status[seqid] = True
else:
seq_status[seqid] = False
return seq_status
def generate_validated_list(seq_list, genus):
"""
:param seq_list: List of OLC Seq IDs
:param genus: String of expected genus (Salmonella, Listeria, Escherichia)
:return: List containing each valid Seq ID
"""
# VALIDATION
validated_list = []
validated_dict = validate_genus(seq_list=seq_list, genus=genus)
for seqid, valid_status in validated_dict.items():
if validated_dict[seqid]:
validated_list.append(seqid)
else:
print('WARNING: '
'Seq ID {} does not match the expected genus of {} and was ignored.'.format(seqid, genus.upper()))
return validated_list
def parse_geneseekr_profile(value):
"""
Takes in a value from the GeneSeekr_Profile of combinedMetadata.csv and parses it to determine which markers are
present. i.e. if the cell contains "invA;stn", a list containing invA, stn will be returned
:param value: String delimited by ';' character containing markers
:return: List of markers parsed from value
"""
detected_markers = []
marker_list = ['invA', 'stn', 'IGS', 'hlyA', 'inlJ', 'VT1', 'VT2', 'VT2f', 'uidA', 'eae']
markers = value.split(';')
for marker in markers:
if marker in marker_list:
detected_markers.append(marker)
return detected_markers
def generate_gdcs_dict(gdcs_reports):
"""
:param gdcs_reports: Dictionary derived from get_gdcs() function
:return: Dictionary containing parsed GDCS values
"""
gdcs_dict = {}
for sample_id, df in gdcs_reports.items():
# Grab values
matches = df.loc[df['Strain'] == sample_id]['Matches'].values[0]
passfail = df.loc[df['Strain'] == sample_id]['Pass/Fail'].values[0]
gdcs_dict[sample_id] = (matches, passfail)
return gdcs_dict