# Sequencing/optimal_solution.py at main · gotencoder/Sequencing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import numpy as np
import pandas as pd
import os
import glob


#Define a function to obtain, for every sample in a given directory, the solution whose ploidy is closest to ABSOLUTE
def get_optimum_ploidy(direc, absolute_samples=None, absolute_ploidies=None):
	"""Pick, for every sample, the solution whose ploidy is closest to ABSOLUTE.

	Every ``*params.txt`` file in ``direc`` is treated as one candidate
	solution. The first 12 characters of the file name are the sample ID, and
	the ploidy is read from the second row, second column of the
	tab-separated file. Files whose sample ID does not occur in
	``absolute_samples`` are ignored.

	Parameters
	----------
	direc : str
		Directory holding the ``*params.txt`` files (a trailing slash is fine).
	absolute_samples : array-like of str, optional
		Sample IDs of the ABSOLUTE calls. Defaults to the module-level
		``absolute_sample_list``.
	absolute_ploidies : array-like of float, optional
		ABSOLUTE ploidies, aligned with ``absolute_samples``. Defaults to the
		module-level ``absolute_ploidy_list``.

	Returns
	-------
	wanted_ploidy : list of numpy.ndarray
		One entry per matched sample, in sorted sample-ID order: the absolute
		ploidy difference(s) of the chosen solution, one value per ABSOLUTE
		row that matches the sample.
	wanted_seg : list of str
		Path of the ``seg.txt`` file belonging to each chosen solution
		(the params path with ``params.txt`` replaced by ``seg.txt``).
	"""
	#Fall back to the module-level ABSOLUTE tables when none are supplied
	if absolute_samples is None:
		absolute_samples = absolute_sample_list
	if absolute_ploidies is None:
		absolute_ploidies = absolute_ploidy_list
	absolute_samples = np.asarray(absolute_samples)
	absolute_ploidies = np.asarray(absolute_ploidies)

	suffix = 'params.txt'
	#sample ID -> (score, ploidy differences, seg file) of the best solution so far
	best = {}
	#sorted() makes tie-breaking independent of the directory listing order
	for file_name in sorted(glob.glob(os.path.join(direc, '*' + suffix))):
		#basename() instead of slicing by len(direc): also correct when direc
		#ends with a path separator
		sample_id = os.path.basename(file_name)[:12]
		absolute_sample_index = np.where(absolute_samples == sample_id)[0]
		if len(absolute_sample_index) == 0:
			continue

		df = pd.read_csv(file_name, delimiter="\t", header=None)
		#np.float was removed in NumPy 1.24; the builtin float is equivalent
		ploidy = float(df.iloc[1, 1])
		dif = np.absolute(ploidy - absolute_ploidies[absolute_sample_index])
		#A sample may match several ABSOLUTE rows; score by the closest one
		score = dif.min()
		if sample_id not in best or score < best[sample_id][0]:
			best[sample_id] = (score, dif, file_name[:-len(suffix)] + 'seg.txt')

	wanted_ploidy = []
	wanted_seg = []
	#Sorted sample order, so results of different directories line up by index
	for sample_id in sorted(best):
		_, dif, seg = best[sample_id]
		wanted_ploidy.append(dif)
		wanted_seg.append(seg)

	return wanted_ploidy,wanted_seg


#Read the ABSOLUTE calls that the Titan solutions are compared against
absolute_file = 'ABSOLUTE_file_subset_to_XISTpos_males_TPM_3.csv'
df_abs = pd.read_csv(absolute_file)
#Strip the last three characters of every 'array' entry; the trimmed names are
#what the sample IDs taken from the Titan file names are matched against
absolute_sample_list = np.array([array_name[:-3] for array_name in df_abs['array']])
absolute_ploidy_list = np.array(df_abs['ploidy'])
absolute_purity_list = df_abs['purity']


#Goal of this script: choose the optimal Titan solution with respect to ABSOLUTE.
#The choice is made by comparing ploidy values.
#The solutions inside each ploidy directory are considered first.

ploidy2_dir, ploidy3_dir = (
	'/czlab/inasim/TitanCNA/scripts/snakemake/results/titan/hmm/titanCNA_ploidy2',
	'/czlab/inasim/TitanCNA/scripts/snakemake/results/titan/hmm/titanCNA_ploidy3',
)

#NOTE: the triple-quoted string below is neither a docstring nor live code.
#It is a bare string literal, so nothing inside it is executed. It appears to be
#an earlier draft of the per-directory selection loop that was disabled by
#wrapping it in quotes.
'''
#Now we want the parameter files in these dirs
optimum_ploidy = []
seg_file = [] #Get the seg file that corresponds to the optimum ploidy solution
filenames = glob.glob(ploidy2_dir+'/*params.txt')
for file_name in filenames:
	ploidy_dif = []
	solution_list = []
	if (file_name[-10:]=='params.txt'):
		sample_id = file_name[:12]
		print (file_name)
		#Now load in each filename and get the ploidy
		df = pd.read_csv(file_name, delimiter = "\t",header=None)
		ploidy = np.float(df.loc[1][1])
		absolute_sample_index = np.where(absolute_sample_list==sample_id)[0]
		absolute_ploidy = absolute_ploidy_list[absolute_sample_index]
		ploidy_dif += [np.absolute(ploidy-absolute_ploidy)]
		solution_list += [file_name[-10:]]

	#Now we want to find the index of the of the minumum of ploidy diff
	min_ind = ploidy_dif.index(min(ploidy_dif))
	optimum_ploidy = ploidy_dif[min_ind]
	sol = solution_list[min_ind]
	#Now get the segmental CN files so we can obtain the CN
	seg_file =  sol + 'seg.txt'
'''

#Inline pass over the ploidy-2 directory: record every solution whose sample is
#present in the ABSOLUTE table, then keep, per sample, the solution whose ploidy
#is closest to the ABSOLUTE ploidy.
direc = ploidy2_dir
filenames = glob.glob(direc+'/*params.txt')
optimum_ploidy = []
seg_file = [] #Seg file name (without directory) of every retained solution
sample_id_list = []
ploidy_dif = []
solution_list = []
dir_len = len(direc)
real_index = [] #Position in filenames of every retained solution
for i, file_name in enumerate(filenames):
	#The 12 characters following the directory prefix are the sample ID
	sample_id = file_name[dir_len+1:dir_len+13]
	#The ploidy sits in the second row, second column of the params file
	df = pd.read_csv(file_name, delimiter = "\t",header=None)
	#np.float was removed in NumPy 1.24; the builtin float is equivalent
	ploidy = float(df.loc[1][1])
	absolute_sample_index = np.where(absolute_sample_list==sample_id)[0]
	if (len(absolute_sample_index)==0):
		#Sample unknown to ABSOLUTE: nothing to compare against
		continue

	absolute_ploidy = absolute_ploidy_list[absolute_sample_index]
	ploidy_dif += [np.absolute(ploidy-absolute_ploidy)]
	#Strip the trailing 'params.txt' (10 characters) to get the solution prefix
	solution_list += [file_name[:-10]]
	seg_file +=  [file_name[dir_len+1:-10] + 'seg.txt']
	real_index += [i]
	sample_id_list += [sample_id]

sample_id_list = np.array(sample_id_list)
ploidy_dif = np.array(ploidy_dif)
unique_sample_list = np.unique(sample_id_list)
seg_file = np.array(seg_file)
wanted_index = []
wanted_seg = []
wanted_ploidy = []
for i, unique_sample in enumerate(unique_sample_list):
	unique_ind = np.where(sample_id_list==unique_sample)[0]
	#One score per candidate solution of this sample: the smallest difference in
	#its row. Indexing [0] here instead would keep only the first candidate, so
	#the minimum over the candidates would never actually be searched.
	ploid_val = ploidy_dif[unique_ind].reshape(len(unique_ind), -1).min(axis=1)
	min_ind = np.where(ploid_val==np.min(ploid_val))[0]
	#First candidate that reaches the minimum, as an index into the full lists
	tot_index = unique_ind[min_ind][0]
	wanted_index += [tot_index]
	wanted_ploidy += [ploidy_dif[tot_index]]
	wanted_seg += [seg_file[tot_index]]


#Select the best solution inside each ploidy directory, then compare the two per sample
ploidy2_sol, ploidy2_seg = get_optimum_ploidy(ploidy2_dir)
ploidy3_sol, ploidy3_seg = get_optimum_ploidy(ploidy3_dir)


#Sanity check: both directories should yield the same number of samples
if len(ploidy3_sol)==len(ploidy2_sol):
	print ('True')

n_sample = len(ploidy2_sol)
#For every sample keep the seg file of whichever directory has the smaller ploidy
#difference; the ploidy-2 solution is kept unless ploidy 3 is strictly closer
final_seg = [
	ploidy3_seg[k] if np.min(ploidy3_sol[k]) < np.min(ploidy2_sol[k]) else ploidy2_seg[k]
	for k in range(n_sample)
]

#final_seg now holds the list of seg files to analyse