-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmachineLearningRunner.py
More file actions
176 lines (128 loc) · 6.76 KB
/
machineLearningRunner.py
File metadata and controls
176 lines (128 loc) · 6.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import functools
from sklearn.model_selection import train_test_split
from base import dateUtil
from base import fileUtil
from base import stockMongo
from base.parallel import runToAllDone
import numpy as np
import pandas as pd
def removeUnusedColumnsAndRows(df):
    """Return a cleaned copy of a quote DataFrame.

    The columns listed in ``unusedColumns`` are removed (columns that are not
    present are ignored), the literal string 'null' is turned into NaN, and
    every row with a missing Open, High, Low, Close or Volume is dropped.

    The input frame is never modified. The previous implementation dropped
    the unused columns in place on the caller's object while returning a
    different frame, which left the caller with a half-cleaned ``df``.

    :param df: DataFrame containing the Open, High, Low, Close and Volume
        columns (``dropna`` raises KeyError if one of them is missing).
    :return: a new DataFrame holding only the usable columns and rows.
    """
    unusedColumns = ['Adj Low', 'Adj High', 'Adj Open', 'Adj Volume', 'Ex-Dividend', 'Split Ratio', 'index', 'level_0']
    # drop() without inplace=True returns a new frame, so the caller's df stays intact
    cleaned = df.drop(unusedColumns, axis=1, errors='ignore')
    # replace string 'null' value with np.nan so that dropna() can detect it
    cleaned = cleaned.replace('null', np.nan)
    return cleaned.dropna(how='any', subset=['Open', 'High', 'Low', 'Close', 'Volume'])
def getQuoteData(machineLearningMode, startDate, endDate, symbol, history_days=0):
    """Load the quotes of one symbol into a cleaned DataFrame.

    Quotes come from ``stockMongo.findLatestQuotesPeriod``: first up to
    ``machineLearningMode.QUOTES_NUMBER`` quotes for the startDate/endDate
    period, then, when ``history_days`` is a positive number, a second query
    for ``history_days`` quotes that uses ``startDate`` as its end date with
    ``includeEndDate=False``. The second result is appended after the first.

    :param machineLearningMode: mode object providing ``QUOTES_NUMBER``.
    :param startDate: start of the requested period (passed through as is).
    :param endDate: end of the requested period (passed through as is).
    :param symbol: symbol whose quotes are loaded.
    :param history_days: number of extra quotes to load; None or <= 0
        disables the second query.
    :return: DataFrame cleaned by ``removeUnusedColumnsAndRows``, or None when
        no quote rows were found.
    """
    records = list(stockMongo.findLatestQuotesPeriod(symbol, machineLearningMode.QUOTES_NUMBER, startDate, endDate))
    wantsHistory = history_days is not None and history_days > 0
    if wantsHistory:
        history = stockMongo.findLatestQuotesPeriod(symbol, history_days, None, startDate, includeEndDate=False)
        records.extend(history)
    frame = pd.DataFrame(records)
    if len(frame) == 0:
        return None
    return removeUnusedColumnsAndRows(frame)
def moreThanOneClassResult(array):
    """Tell whether the given labels contain at least two distinct values.

    :param array: iterable of hashable values, or None.
    :return: False when ``array`` is None, otherwise True only if more than
        one distinct value occurs in it.
    """
    return array is not None and len(set(array)) > 1
def getLastRowDate(df):
    """Return the 'Date' value of the last row of ``df``.

    Raises IndexError when the frame has no rows.
    """
    dateColumn = df['Date']
    return dateColumn.values[-1]
def getPickleName(symbol, mode):
    """Build the pickle name for a symbol/mode pair: ``<symbol>_<mode>``."""
    separator = "_"
    return symbol + separator + mode
def initialMachineLearning(machineLearningMode, startDate, endDate, symbol):
    """Train a classifier for one symbol on all of its quotes and pickle it.

    Nothing is trained when no quotes are available, when there are fewer
    than ``machineLearningMode.MINIMUN_MACHINE_LEARNING_NUMBERS`` rows, or
    when the extracted labels contain a single class only.

    On success a dict with the keys ``lastRecordDate`` and ``clf`` is stored
    through ``fileUtil.pickleIt`` under the name from ``getPickleName``.

    :param machineLearningMode: mode object providing the feature extraction,
        the classifier factory and the ``MODE`` name.
    :param startDate: start of the quote period (passed to ``getQuoteData``).
    :param endDate: end of the quote period (passed to ``getQuoteData``).
    :param symbol: symbol to train for.
    :return: None in every case.
    """
    quotes = getQuoteData(machineLearningMode, startDate, endDate, symbol)
    if quotes is None:
        return
    if len(quotes) < machineLearningMode.MINIMUN_MACHINE_LEARNING_NUMBERS:
        return

    features, labels, featureFrame = machineLearningMode.extract_featureset(quotes)
    if not moreThanOneClassResult(labels):
        print("only has 1 classes result, no training...")
        return

    lastRecordDate = getLastRowDate(featureFrame)
    classifier = machineLearningMode.getClassifier()
    # the whole data set is used for training, nothing is held back
    classifier.fit(features, labels)

    payload = {"lastRecordDate": lastRecordDate, "clf": classifier}
    pickleName = getPickleName(symbol, machineLearningMode.MODE)
    fileUtil.pickleIt(pickleName, payload)
    print(symbol, ' machine learning is done...')
def getClassifierAccuracy(machineLearningMode, startDate, endDate, symbol):
    """Measure a freshly trained classifier on a held-out part of the quotes.

    The feature set is split with ``train_test_split`` using
    ``machineLearningMode.TEST_SIZE``; the classifier is fitted on the
    training part and scored on the test part.

    :param machineLearningMode: mode object providing the feature extraction,
        the classifier factory, ``TEST_SIZE`` and the minimum row count.
    :param startDate: start of the quote period (passed to ``getQuoteData``).
    :param endDate: end of the quote period (passed to ``getQuoteData``).
    :param symbol: symbol to evaluate.
    :return: tuple ``(score, number of training samples)``, or ``(None, 0)``
        when there are too few quotes or the training labels contain a single
        class only.
    """
    noResult = (None, 0)

    quotes = getQuoteData(machineLearningMode, startDate, endDate, symbol)
    if quotes is None or len(quotes) < machineLearningMode.MINIMUN_MACHINE_LEARNING_NUMBERS:
        return noResult

    features, labels, _ = machineLearningMode.extract_featureset(quotes)
    split = train_test_split(features, labels, test_size=machineLearningMode.TEST_SIZE)
    trainFeatures, testFeatures, trainLabels, testLabels = split

    # only the training labels are checked: fit() needs at least two classes
    if not moreThanOneClassResult(trainLabels):
        print("only has 1 classes result, no training...")
        return noResult

    classifier = machineLearningMode.getClassifier()
    classifier.fit(trainFeatures, trainLabels)
    score = classifier.score(testFeatures, testLabels)
    return score, len(trainFeatures)
def saveAccuracy(machineLearningMode, startDate, endDate, symbol):
    """Compute the classifier accuracy for one symbol and store it.

    The result of ``getClassifierAccuracy`` is printed and, when at least one
    training sample was used, saved with ``stockMongo.saveLearnAccuracy``.
    Any exception is caught and reported on stdout, so a failure for one
    symbol never propagates to the caller.

    :param machineLearningMode: mode object providing ``MODE``.
    :param startDate: start of the quote period.
    :param endDate: end of the quote period.
    :param symbol: symbol to evaluate.
    :return: None.
    """
    try:
        accuracy = getClassifierAccuracy(machineLearningMode, startDate, endDate, symbol)
        confidence, trainNumber = accuracy
        print(confidence, trainNumber)
        hasTrainingData = trainNumber > 0
        if hasTrainingData:
            stockMongo.saveLearnAccuracy(symbol, machineLearningMode.MODE, confidence, trainNumber)
    except Exception as error:
        # best effort: report the problem and carry on with the next symbol
        print('...error while saveAccuracy for ', symbol)
        print(str(error))
        print('.............................')
def quotePredict(machineLearningMode, symbol, X, dates):
    """Predict with the pickled classifier of a symbol.

    :param machineLearningMode: mode object providing ``MODE``.
    :param symbol: symbol whose pickled classifier is loaded.
    :param X: feature rows to predict for; None or empty means no prediction.
    :param dates: dates paired positionally with the predictions.
    :return: list of ``(prediction, date)`` tuples, or None when there are no
        features or ``fileUtil.loadPickle`` returns None for the symbol/mode.
    """
    hasFeatures = X is not None and len(X) != 0
    if not hasFeatures:
        # print('no featureset data for ', symbol)
        return None

    pickleName = getPickleName(symbol, machineLearningMode.MODE)
    stored = fileUtil.loadPickle(pickleName)
    if stored is None:
        return None

    classifier = stored['clf']
    predicted = classifier.predict(X)
    # TODO get minimun date and it shall be great than the machine learning date
    pairs = zip(predicted, dates)
    return list(pairs)
def predict(machineLearningMode, date, symbol):
    """Predict the result for one symbol on a single date.

    Quotes are loaded for ``date`` (used as both start and end date) together
    with ``machineLearningMode.QUOTE_HISTORY_DAYS`` history quotes, turned
    into features by ``machineLearningMode.extract_featureForPredict`` and
    passed to ``quotePredict``.

    :param machineLearningMode: mode object providing the feature extraction,
        ``QUOTE_HISTORY_DAYS`` and ``MODE``.
    :param date: the date to predict for.
    :param symbol: symbol to predict for.
    :return: list of ``(prediction, date)`` tuples as returned by
        ``quotePredict``, or None when no quotes, no features or no pickled
        classifier are available.
    """
    quotes = getQuoteData(machineLearningMode, date, date, symbol, history_days=machineLearningMode.QUOTE_HISTORY_DAYS)
    if quotes is None:
        # getQuoteData returns None when it finds no quote rows; without this
        # guard the None would be handed to the feature extraction
        return None
    X, _, df = machineLearningMode.extract_featureForPredict(quotes, date, date)
    return quotePredict(machineLearningMode, symbol, X, df['Date'].values)
def verifyQuote(machineLearningMode, startDate, endDate, symbol):
    """Predict the quotes of a period and compare them with the real results.

    Every prediction is saved through ``stockMongo.savePrediction`` together
    with the actual result and a correctness flag. The share of correct
    predictions among the rows where either the result or the prediction is
    non-zero is printed when it exceeds 0.75.

    :param machineLearningMode: mode object providing the feature extraction,
        ``determineResult`` and ``MODE``.
    :param startDate: start of the verification period.
    :param endDate: end of the verification period.
    :param symbol: symbol to verify.
    :return: None.
    """
    print("verifyQuote", symbol)
    history_days = 10
    quotes = getQuoteData(machineLearningMode, startDate, endDate, symbol, history_days=history_days)
    if quotes is None:
        # getQuoteData returns None when it finds no quote rows; without this
        # guard the None would be handed to the feature extraction
        print("{} has no quote data to verify".format(symbol))
        return
    X, _, df = machineLearningMode.extract_featureset(quotes)
    predictions = quotePredict(machineLearningMode, symbol, X, df['Date'].values)
    # verify if prediction is correct
    df['result'] = list(map(machineLearningMode.determineResult, df['nextClosePercentage']))
    # quotePredict returns None without features or a pickled classifier;
    # DataFrame(None, columns=...) is then an empty frame, so nothing is saved
    # and the "no Non-Zero data" branch below is taken
    prediction_df = pd.DataFrame(predictions, columns=['prediction', 'Date'])
    # qualified is prediction either 1 or -1
    prediction_df = pd.merge(prediction_df, df[['Date', 'result']], on='Date')
    # save prediction
    for _, row in prediction_df.iterrows():
        predictObj = {
            "Symbol": symbol,
            "Date": row['Date'],
            "Prediction": int(row['prediction']),
            "Mode": machineLearningMode.MODE,
            "isCorrect": (row['prediction'] == row['result']),
            "Result": row['result'],
        }
        stockMongo.savePrediction(predictObj)
    # only care about rows where the result or the prediction is non-zero
    accuracy_prediction_df = prediction_df.loc[(prediction_df['result'] != 0) | (prediction_df['prediction'] != 0)]
    correct_prediction_df = accuracy_prediction_df.loc[accuracy_prediction_df['result'] == accuracy_prediction_df['prediction']]
    if len(accuracy_prediction_df) == 0:
        print("{} prediction_df has no Non-Zero data".format(symbol))
        return
    result = len(correct_prediction_df) / len(accuracy_prediction_df)
    if result > 0.75:
        print(result)
# the data before sepDate will be used as machine learning data
# the data after sepDate will be used for verification
def learn(machineLearningMode, sepDate, symbols=None):
    """Train, score and verify the classifiers of the given symbols.

    Three passes are dispatched through ``runToAllDone``, one call per symbol:
    ``initialMachineLearning`` and ``saveAccuracy`` on the data up to
    ``sepDate``, then ``verifyQuote`` on the data from ``sepDate`` on.

    :param machineLearningMode: mode object handed to every pass.
    :param sepDate: date separating learning data from verification data;
        a falsy value selects 45 days before now.
    :param symbols: iterable of symbols to process; None or empty means every
        symbol returned by ``stockMongo.findAllActiveSymbols``.
    :return: None.
    """
    if not sepDate:
        # default split point: 45 days (about a month and a half) before now
        sepDate = dateUtil.toString(dateUtil.addDays(dateUtil.nowString(), -45))
    # Materialise once: `not symbols` raises ValueError for a multi-element
    # numpy array, and a generator would be exhausted after the first pass.
    symbols = [] if symbols is None else list(symbols)
    if not symbols:
        symbols = pd.DataFrame(list(stockMongo.findAllActiveSymbols()))['Symbol'].values
    # symbols = ['KO']
    symbolArgs = [(symbol,) for symbol in symbols]

    learningFunction = functools.partial(initialMachineLearning, machineLearningMode, None, sepDate)
    runToAllDone(learningFunction, symbolArgs)

    saveAccuracyFunc = functools.partial(saveAccuracy, machineLearningMode, None, sepDate)
    runToAllDone(saveAccuracyFunc, symbolArgs)

    # verify the prediction of the rest data
    startDate = sepDate
    endDate = None
    runToAllDone(functools.partial(verifyQuote, machineLearningMode, startDate, endDate), symbolArgs)