-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTCGA_DS_script.py
More file actions
127 lines (88 loc) · 4.19 KB
/
TCGA_DS_script.py
File metadata and controls
127 lines (88 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import csv
import pandas as pd
import numpy as np
import shutil
import os
import openpyxl
import matplotlib as mpl
import matplotlib.pylab as plt
df = pd.read_csv("TCGA_VAF_all_samples_ref_for_Titan_baitset_annotated.csv")
#change to new file with more options
uq_baitsets = ['Agilent Custom', 'Nimblegen hg18', 'Nimblegen HGSC', 'Nimblegen.SQEZ2', 'Nimblegen.SQEZ3', 'Sureselect.38']
case_id_superlist = []
extension_superlist = []
for a in uq_baitsets:
df_temp = df[df['Baitset']==a]
case_id_superlist.append(df_temp['Sub_ID'].tolist())
extension_superlist.append(df_temp['Extension'].tolist())
p=0
while p<len(case_id_superlist):
case_id = case_id_superlist[p]
extension_list = extension_superlist[p]
filename = '/tcga-artemis/tcga_WXS/tcga_WXS_BAM_HG38_FILES_gdc_sample_sheet.tsv' #This is important
i=0
df = pd.read_csv(filename, sep='\t')
col_list = list(df.columns) #List column names
sample_type_tum = ['Primary Tumor', 'Metastatic','Primary Blood Derived Cancer - Peripheral Blood']#
sample_type_normal = ['Blood Derived Normal', 'Solid Tissue Normal'] #Two types of normals we can have
ref_list = df['Case ID']
hg38_dir = '/tcga-artemis/tcga_WXS/tcga_WXS_BAM_HG38_FILES/' #Set relative path
sample_list_for_titan = []
i=0
tum_path = []
norm_path = []
total_index_paths = []
valid_case_ids = []
tum_filename = []
norm_filename = []
for i in range(len(case_id)):
#Get the index that matches the case_id
#First check if the reference exists - if not add ['0'] so we can filter out in post-processing
if len(np.where(df['Case ID']==case_id[i])[0])<=1:
print (i)
continue
try:
if extension_list[i] == 1:
index_tum = np.where((df['Case ID']==case_id[i]) & ( (df['Sample Type']==sample_type_tum[0]) ) )[0] #Get the index for the tumour
elif extension_list[i] == 3:
index_tum = np.where((df['Case ID']==case_id[i]) & ( (df['Sample Type']==sample_type_tum[2]) ) )[0] #Get the index for the tumour
elif extension_list[i] == 6:
index_tum = np.where((df['Case ID']==case_id[i]) & ( (df['Sample Type']==sample_type_tum[1]) ) )[0] #Get the index for the tumour
index_norm = np.where((df['Case ID']==case_id[i]) & ( (df['Sample Type']==sample_type_normal[0]) | (df['Sample Type']==sample_type_normal[1]) ) )[0] #Get the index for the normal
#We now want both the file_id and filename
file_id_tum = np.array(df['File ID'][index_tum])[0]
file_name_tum = np.array(df['File Name'][index_tum])[0] #Our BAM file
#We now want both the file_id and filename
file_id_norm = np.array(df['File ID'][index_norm])[0]
file_name_norm = np.array(df['File Name'][index_norm])[0] #Our BAM file
# Now get the normal and tumour file paths
tum_path += [hg38_dir + file_id_tum +'/' + file_name_tum]
norm_path += [hg38_dir + file_id_norm +'/' + file_name_norm]
except:
print(i)
continue
tum_filename.append(file_name_tum)
norm_filename.append(file_name_norm)
valid_case_ids.append(case_id)
#generate config file
base_lst = valid_case_ids
lines_list = []
a=0
while a<len(base_lst):
lines_list.append(" " + str(base_lst[a]) + "_Tumor: " + str(tum_filename))
lines_list.append(" " + str(base_lst[a]) + "_Normal: " + str(norm_filename))
a=a+1
pairings_list = []
a=0
while a<len(base_lst):
pairings_list.append(" " + str(base_lst[a]) + "_Tumor: " + str(base_lst[a]) + "_Normal")
a=a+1
textfile = open("/czlab/inasim/samples_baitset_" + str(p) + ".yaml", "w")
textfile.write("samples:" + "\n")
for element in lines_list:
textfile.write(element + "\n")
textfile.write("pairings:" + "\n")
for element in pairings_list:
textfile.write(element + "\n")
textfile.close()
p=p+1