simulationdatabase/database.py at PUBLISH · smallfishabc/simulationdatabase · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pandas as pd
import os
import pysnooper
import Sequence_Cider as Sc


def load_data(df):
    """Assemble every per-protein analysis table into one long dataframe.

    ``df`` is handed unchanged to the loader helpers, which read its
    ``Protein`` and ``Directory`` columns (plus ``Sequence`` for the
    sequence-feature table).

    Returns:
        The row-wise concatenation of the individual tables in this order:
        end-to-end distance, radius of gyration, hydrogen bonds, helicity,
        beta-sheet propensity, interaction strength, sequence features.
        The row labels of the individual tables are kept, so they repeat in
        the result.  Nothing is written to disk here.
    """
    # The files are read in the order listed below; the order of the rows in
    # the returned dataframe is fixed separately by ``output_order``.
    tables = {
        'rg': add_data('BBrawRG_5en.csv', df, 'rg'),
        'ee': add_data('BBEERG_5en.csv', df, 'ee'),
        # The hydrogen-bond file names its columns differently, hence the
        # dedicated loader.
        'HB': add_data_HB('BB_HB_5en.csv', df, 'HB'),
        'helix': add_data('BB_Heli_5en.csv', df, 'helix'),
        'beta': add_data('BB_Beta_5en.csv', df, 'beta'),
        # NOTE: 'BBcontact_list_cutoff_far_standard_value.csv' and
        # 'BB_contact_lines.csv' were earlier candidates for this input.
        'interaction': add_data('BBcontact_list.csv', df, 'interaction'),
        'feature': add_data_sequence_feature(df),
    }
    output_order = ('ee', 'rg', 'HB', 'helix', 'beta', 'interaction', 'feature')
    return pd.concat([tables[key] for key in output_order])


def load_data_easy(df):
    """Assemble the tables produced by the newer ("easy") analysis output.

    Works like ``load_data`` but reads the ``*_easy.csv`` files and loads
    neither beta-sheet propensity nor interaction strength.

    Returns:
        The row-wise concatenation of the individual tables in this order:
        end-to-end distance, radius of gyration, hydrogen bonds, helicity,
        sequence features.  The row labels of the individual tables are
        kept, so they repeat in the result.  Nothing is written to disk here.
    """
    # The files are read in the order listed below; the order of the rows in
    # the returned dataframe is fixed separately by ``output_order``.
    tables = {
        'rg': add_data('BBrawRG_easy.csv', df, 'rg'),
        'ee': add_data('BBEERG_easy.csv', df, 'ee'),
        # The hydrogen-bond file names its columns differently, hence the
        # dedicated loader.
        'HB': add_data_HB('BB_HB_easy.csv', df, 'HB'),
        'helix': add_data('BB_Heli_easy.csv', df, 'helix'),
        'feature': add_data_sequence_feature(df),
    }
    output_order = ('ee', 'rg', 'HB', 'helix', 'feature')
    return pd.concat([tables[key] for key in output_order])


def add_data(filename, entrydf, data_type):
    """Collect one analysis file from every protein directory into one table.

    Args:
        filename: Name of the CSV file expected inside each protein directory.
        entrydf: Dataframe with one row per protein; its ``Protein`` and
            ``Directory`` columns are read.  Rows are visited by position,
            so the dataframe may carry any index (e.g. after filtering).
        data_type: Label written to the ``datatype`` column of every row.

    Returns:
        A dataframe holding the rows of all files, renumbered from 0, with
        ``Protein`` and ``datatype`` as the first two columns.  An empty
        dataframe when ``entrydf`` has no rows.

    Errors raised while reading a file are not caught here.
    """
    # ``pd.concat`` rejects an empty list, so keep the "no proteins" result
    # explicit.
    if len(entrydf) == 0:
        return pd.DataFrame()

    frames = []
    # Iterate over the column values rather than ``.loc[i, ...]`` so that a
    # non-default index cannot select the wrong row or raise KeyError.
    for protein_name, protein_directory in zip(entrydf['Protein'], entrydf['Directory']):
        new_data = read_data(protein_directory, filename)
        # Inserting at position 0 twice leaves 'Protein' first and
        # 'datatype' second.
        new_data.insert(0, 'datatype', data_type)
        new_data.insert(0, 'Protein', protein_name)
        frames.append(new_data)

    # A single concatenation avoids re-copying the accumulated rows on every
    # iteration.
    return pd.concat(frames, ignore_index=True)


def add_data_HB(filename, entrydf, data_type):
    """Load the hydrogen-bond file of every protein and normalise its columns.

    The hydrogen-bond file uses different column names from the other data
    files, so after loading, ``Hbond`` is renamed to ``Rs`` and ``st`` to
    ``Sd`` (presumably the names used by the other tables -- confirm against
    the data files).

    Args:
        filename: Name of the hydrogen-bond CSV file inside each protein
            directory.
        entrydf: Dataframe with one row per protein, as accepted by
            ``add_data``.
        data_type: Label written to the ``datatype`` column of every row.

    Returns:
        The combined dataframe with the renamed columns.

    Raises:
        KeyError: If ``Hbond`` or ``st`` is missing from the loaded data
            (the rename is done with ``errors="raise"``).
    """
    # Loading is identical to the generic case; only the column names differ.
    full_df = add_data(filename, entrydf, data_type)
    return full_df.rename(columns={'Hbond': 'Rs', 'st': 'Sd'}, errors="raise")


def add_data_sequence_feature(entrydf, data_type='feature'):
    """Build a table of sequence-derived features, one row per protein.

    Args:
        entrydf: Dataframe with one row per protein; its ``Protein`` and
            ``Sequence`` columns are read.  Rows are visited by position,
            so the dataframe may carry any index (e.g. after filtering).
        data_type: Label written to the ``datatype`` column of every row.

    Returns:
        A dataframe, renumbered from 0, whose columns are ``Protein``,
        ``datatype``, ``Sequence`` and ``length`` followed by whatever
        ``Sc.Cider_calculation`` returns for the sequence.  An empty
        dataframe when ``entrydf`` has no rows.

    TODO: the feature set is unfinished; a ``Feng_calculation()`` step was
    planned in the original code.
    """
    # ``pd.concat`` rejects an empty list, so keep the "no proteins" result
    # explicit.
    if len(entrydf) == 0:
        return pd.DataFrame()

    rows = []
    # Iterate over the column values rather than ``.loc[i, ...]`` so that a
    # non-default index cannot select the wrong row or raise KeyError.
    for protein_name, sequence in zip(entrydf['Protein'], entrydf['Sequence']):
        length = len(sequence)
        # Progress trace kept from the original implementation.
        print(sequence, length)
        # Presumably a mapping of feature name to scalar value -- confirm
        # against Sequence_Cider.
        dict_feature = Sc.Cider_calculation(sequence)
        pd_feature = pd.DataFrame(dict_feature, index=[0])
        new_data = pd.DataFrame(
            {'Protein': protein_name, 'datatype': data_type, 'Sequence': sequence, 'length': length}, index=[0])
        # Both frames carry the single row label 0, so the column-wise concat
        # lines them up into one row.
        rows.append(pd.concat([new_data, pd_feature], axis=1))

    # A single concatenation avoids re-copying the accumulated rows on every
    # iteration.
    return pd.concat(rows, ignore_index=True)


def read_data(directory, filename):
    """Read the CSV file ``filename`` located in ``directory``.

    Returns the parsed contents as a pandas dataframe.
    """
    return pd.read_csv(os.path.join(directory, filename))