-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpredict.py
More file actions
54 lines (46 loc) · 1.63 KB
/
predict.py
File metadata and controls
54 lines (46 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import numpy as np
# Train a Gaussian naive Bayes classifier that predicts the
# "Opioid.Prescriber" label from the remaining columns of the prescriber CSV.

# Load the table and discard every row that contains a missing value.
df = pd.read_csv('prescriber-info.csv')
df.dropna(inplace=True)

# NOTE(review): `val` is subtracted from the raw mislabeled count before it is
# printed below, so the reported count and percentage are NOT the classifier's
# actual error figures. Kept to preserve the existing output -- confirm why it
# exists, or remove it.
val = 318

# Balance the classes: draw 9000 random rows for each label value.
# (`sample` raises if a class has fewer than 9000 rows.)
pos = df[df["Opioid.Prescriber"] == 1].sample(n=9000)
neg = df[df["Opioid.Prescriber"] == 0].sample(n=9000)
tot = pd.concat([neg, pos])

# Distinct category values; the position of a value in these arrays is the
# integer code it is replaced with below.
tot_special = tot.Specialty.unique()
tot_creds = tot.Credentials.unique()

# Encode the categorical columns as integers.
tot["Gender"] = tot.Gender.replace(['M', 'F'], [1, 0])
# Every value of the column is a key of the mapping (it is built from the
# column's own unique values), so `map` leaves no unmapped entries.
tot["Credentials"] = tot.Credentials.map(
    {cred: code for code, cred in enumerate(tot_creds)}
)
tot["Specialty"] = tot.Specialty.map(
    {spec: code for code, spec in enumerate(tot_special)}
)

# Feature matrix: every column except the identifier, the state and the label.
# `columns=` is used because the positional `axis` argument of `drop` was
# removed in pandas 2.0.
X = np.array(tot.drop(columns=['State', 'NPI', 'Opioid.Prescriber']))
y = np.array(tot['Opioid.Prescriber'])

# Hold out 30% of the balanced sample for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

# Reported (offset-adjusted, see NOTE above) number of wrong predictions.
mislabeled = (y_test != y_pred).sum() - val
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          mislabeled,
          100*(1 - mislabeled/X_test.shape[0])
      ))
def credSpecDict():
    """Return lookup tables from integer code back to the original label.

    Returns:
        A ``(cred_dict, spec_dict)`` tuple. ``cred_dict`` maps each index of
        the module-level ``tot_creds`` to the value stored at that index, and
        ``spec_dict`` does the same for ``tot_special``.
    """
    # The position of a value in the sequence is its key in the dictionary.
    credentials_by_code = dict(enumerate(tot_creds))
    specialties_by_code = dict(enumerate(tot_special))
    return credentials_by_code, specialties_by_code
def extractFeatures():
    """Return the column names of ``tot`` that are not excluded from the model.

    The columns 'NPI', 'State' and 'Opioid.Prescriber' are left out; all other
    columns of the module-level ``tot`` frame are returned in their original
    order.

    Returns:
        list: the remaining column names.
    """
    removeFeatures = {'NPI', 'State', 'Opioid.Prescriber'}
    # Build a new list instead of calling `remove` on the list being iterated:
    # removing during iteration skips the element that follows each removal,
    # so two adjacent excluded columns would leave the second one in place.
    return [column for column in tot.columns if column not in removeFeatures]
def returnPreds(row):
    """Predict the label for a single sample with the module-level ``gnb``.

    Args:
        row: input passed unchanged to ``gnb.predict``; it must describe
            exactly one sample so that a single prediction comes back.

    Returns:
        int: the predicted label.

    Raises:
        ValueError: if the prediction does not contain exactly one element.
    """
    prediction = gnb.predict(row)
    # `.item()` extracts the single element explicitly; calling `int()`
    # directly on a 1-D array is deprecated since NumPy 1.25.
    return int(prediction.item())