-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdatabase.py
More file actions
131 lines (119 loc) · 5.44 KB
/
database.py
File metadata and controls
131 lines (119 loc) · 5.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pandas as pd
import os
import pysnooper
import Sequence_Cider as Sc
# Load the separate data files into one complete database
def load_data(df):
    """Build the full database by stacking every per-protein data table.

    Args:
        df: Entry dataframe that is handed unchanged to ``add_data``,
            ``add_data_HB`` and ``add_data_sequence_feature``.

    Returns:
        The result of ``pd.concat`` over the individual tables, in the order
        ee, rg, HB, helix, beta, interaction, feature.
    """
    # (datatype, file name, loader) in the order the files are read.
    # The HB file has a slightly different format and therefore its own loader.
    # Alternative contact files that were tried for 'interaction':
    #   'BBcontact_list_cutoff_far_standard_value.csv', 'BB_contact_lines.csv'
    file_sources = (
        ('rg', 'BBrawRG_5en.csv', add_data),            # radius of gyration
        ('ee', 'BBEERG_5en.csv', add_data),             # end-to-end distance
        ('HB', 'BB_HB_5en.csv', add_data_HB),           # average hydrogen bond number
        ('helix', 'BB_Heli_5en.csv', add_data),         # helicity
        ('beta', 'BB_Beta_5en.csv', add_data),          # beta-sheet propensity
        ('interaction', 'BBcontact_list.csv', add_data),  # interaction strength
    )
    tables = {}
    for kind, csv_name, loader in file_sources:
        tables[kind] = loader(csv_name, df, kind)
    # Sequence features are computed from the entry table itself.
    tables['feature'] = add_data_sequence_feature(df)

    # Merge everything into a single dataframe (not written to disk here).
    merge_order = ('ee', 'rg', 'HB', 'helix', 'beta', 'interaction', 'feature')
    return pd.concat([tables[kind] for kind in merge_order])
# Easy version (Designed for new analysis output)
def load_data_easy(df):
    """Build the database from the ``*_easy.csv`` analysis output.

    Args:
        df: Entry dataframe that is handed unchanged to ``add_data``,
            ``add_data_HB`` and ``add_data_sequence_feature``.

    Returns:
        The result of ``pd.concat`` over the individual tables, in the order
        ee, rg, HB, helix, feature. No interaction table is loaded here.
    """
    # (datatype, file name, loader) in the order the files are read.
    # The HB file has a slightly different format and therefore its own loader.
    file_sources = (
        ('rg', 'BBrawRG_easy.csv', add_data),      # radius of gyration
        ('ee', 'BBEERG_easy.csv', add_data),       # end-to-end distance
        ('HB', 'BB_HB_easy.csv', add_data_HB),     # average hydrogen bond number
        ('helix', 'BB_Heli_easy.csv', add_data),   # helicity
    )
    tables = {}
    for kind, csv_name, loader in file_sources:
        tables[kind] = loader(csv_name, df, kind)
    # Sequence features are computed from the entry table itself.
    tables['feature'] = add_data_sequence_feature(df)

    # Merge everything into a single dataframe (not written to disk here).
    merge_order = ('ee', 'rg', 'HB', 'helix', 'feature')
    return pd.concat([tables[kind] for kind in merge_order])
# Add data function
def add_data(filename, entrydf, data_type):
    """Collect one kind of data file for every protein into a single dataframe.

    For each row of ``entrydf`` the file ``filename`` is read from that row's
    ``Directory`` and tagged with the row's ``Protein`` name and ``data_type``.

    Args:
        filename: Name of the CSV file to read inside every protein directory.
        entrydf: Dataframe with a ``Protein`` and a ``Directory`` column.
        data_type: Value written to the ``datatype`` column of every loaded row.

    Returns:
        A dataframe with a fresh 0..n-1 index that stacks the rows of all
        files, with ``Protein`` and ``datatype`` as the first two columns.
        An empty dataframe is returned when ``entrydf`` has no rows.
    """
    # Nothing to read; returning early also keeps an entry table that has no
    # columns at all from failing on the column lookups below.
    if len(entrydf) == 0:
        return pd.DataFrame()

    frames = []
    # Iterate the columns by position. Indexing with .loc[i, ...] for
    # i in range(len(entrydf)) only works while the index is exactly 0..n-1
    # and breaks (KeyError / wrong rows) after filtering or re-indexing.
    for protein_name, protein_directory in zip(entrydf['Protein'], entrydf['Directory']):
        # Read data from the directory
        new_data = read_data(protein_directory, filename)
        # Tag the raw data with its datatype and owning protein
        new_data.insert(0, 'datatype', data_type)
        new_data.insert(0, 'Protein', protein_name)
        frames.append(new_data)

    # One concat at the end instead of growing a dataframe inside the loop:
    # avoids re-copying all previous rows on every iteration and avoids
    # concatenating with an empty seed frame.
    return pd.concat(frames, ignore_index=True)
# Because the HB file has a different format, its columns need to be renamed when it is attached to the database.
# This function has a structure similar to the previous one and will be removed in the next update.
def add_data_HB(filename, entrydf, data_type):
    """Collect the hydrogen-bond file of every protein into a single dataframe.

    Works like ``add_data`` but additionally renames the columns ``Hbond`` and
    ``st`` of the combined table to ``Rs`` and ``Sd``.

    Args:
        filename: Name of the CSV file to read inside every protein directory.
        entrydf: Dataframe with a ``Protein`` and a ``Directory`` column.
        data_type: Value written to the ``datatype`` column of every loaded row.

    Returns:
        A dataframe with a fresh 0..n-1 index that stacks the rows of all
        files, with ``Protein`` and ``datatype`` as the first two columns and
        the HB columns renamed.

    Raises:
        KeyError: If the combined table has no ``Hbond`` or ``st`` column
            (``rename`` is called with ``errors="raise"``); this includes an
            ``entrydf`` without rows.
    """
    frames = []
    if len(entrydf) > 0:
        # Iterate the columns by position. Indexing with .loc[i, ...] for
        # i in range(len(entrydf)) only works while the index is exactly
        # 0..n-1 and breaks after filtering or re-indexing.
        for protein_name, protein_directory in zip(entrydf['Protein'], entrydf['Directory']):
            new_data = read_data(protein_directory, filename)
            new_data.insert(0, 'datatype', data_type)
            new_data.insert(0, 'Protein', protein_name)
            frames.append(new_data)

    # One concat at the end instead of growing a dataframe inside the loop.
    full_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    # Bring the HB column names in line with the other data files.
    return full_df.rename(columns={'Hbond': 'Rs', 'st': 'Sd'}, errors="raise")
# This function will use cider and other computational packages to calculate the sequence features of the protein
# To be finished
def add_data_sequence_feature(entrydf, data_type='feature'):
    """Build a table of sequence features with one row per protein.

    Args:
        entrydf: Dataframe with a ``Protein`` and a ``Sequence`` column.
        data_type: Value written to the ``datatype`` column (default
            ``'feature'``).

    Returns:
        A dataframe with a fresh 0..n-1 index whose columns are ``Protein``,
        ``datatype``, ``Sequence``, ``length`` followed by whatever
        ``Sc.Cider_calculation`` returns for the sequence. An empty dataframe
        is returned when ``entrydf`` has no rows.
    """
    # Nothing to compute; returning early also keeps an entry table that has
    # no columns at all from failing on the column lookups below.
    if len(entrydf) == 0:
        return pd.DataFrame()

    frames = []
    # Iterate the columns by position. Indexing with .loc[i, ...] for
    # i in range(len(entrydf)) only works while the index is exactly 0..n-1.
    # The 'Directory' column was looked up before but never used, so it is
    # no longer read here.
    for protein_name, sequence in zip(entrydf['Protein'], entrydf['Sequence']):
        length = len(sequence)
        # Console output kept from the original implementation.
        print(sequence, length)
        # NOTE(review): assumes Cider_calculation returns a dict of scalar
        # features (it is turned into a one-row dataframe) - confirm in
        # Sequence_Cider.
        dict_feature = Sc.Cider_calculation(sequence)
        pd_feature = pd.DataFrame(dict_feature, index=[0])
        # TODO: Feng_calculation() was planned here but is not implemented yet.
        # Fixed columns first, calculated features appended to their right.
        new_data = pd.DataFrame(
            {'Protein': protein_name, 'datatype': data_type, 'Sequence': sequence, 'length': length}, index=[0])
        frames.append(pd.concat([new_data, pd_feature], axis=1))

    # One concat at the end instead of growing a dataframe inside the loop.
    return pd.concat(frames, ignore_index=True)
# Function for retrieving the data from certain directory
def read_data(directory, filename):
    """Read the CSV file ``filename`` located in ``directory``.

    Args:
        directory: Folder that holds the file.
        filename: Name of the CSV file inside ``directory``.

    Returns:
        The dataframe produced by ``pd.read_csv`` for the joined path.
    """
    return pd.read_csv(os.path.join(directory, filename))