diff --git a/delivers/baseline-script.html b/delivers/baseline-script.html new file mode 100644 index 0000000..1aa34bd --- /dev/null +++ b/delivers/baseline-script.html @@ -0,0 +1,14209 @@ + + + +baseline-script + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+
+
+

Baseline script of San Francisco Crime Classification

Baseline script. Hope this helps.

+ +
+
+
+
+
+
In [1]:
+
+
+
import numpy as np
+import pandas as pd
+
+ +
+
+
+ +
+
+
+
+
+
+

Load datasets

+
+
+
+
+
+
In [2]:
+
+
+
train = pd.read_csv("../data/train.csv", parse_dates=["Dates"])
+
+train.drop("Resolution", axis=1, inplace=True)
+train.drop("Descript", axis=1, inplace=True)
+
+print(train.shape)
+train.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(878049, 7)
+
+
+
+ +
Out[2]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DatesCategoryDayOfWeekPdDistrictAddressXY
02015-05-13 23:53:00WARRANTSWednesdayNORTHERNOAK ST / LAGUNA ST-122.42589237.774599
12015-05-13 23:53:00OTHER OFFENSESWednesdayNORTHERNOAK ST / LAGUNA ST-122.42589237.774599
22015-05-13 23:33:00OTHER OFFENSESWednesdayNORTHERNVANNESS AV / GREENWICH ST-122.42436337.800414
+
+
+ +
+ +
+
+ +
+
+
+
In [3]:
+
+
+
test = pd.read_csv("../data/test.csv", parse_dates=["Dates"])
+
+print(test.shape)
+test.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 7)
+
+
+
+ +
Out[3]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IdDatesDayOfWeekPdDistrictAddressXY
002015-05-10 23:59:00SundayBAYVIEW2000 Block of THOMAS AV-122.39958837.735051
112015-05-10 23:51:00SundayBAYVIEW3RD ST / REVERE AV-122.39152337.732432
222015-05-10 23:50:00SundayNORTHERN2000 Block of GOUGH ST-122.42600237.792212
+
+
+ +
+ +
+
+ +
+
+
+
In [4]:
+
+
+
sample = pd.read_csv("../data/sampleSubmission.csv", index_col="Id")
+
+print(sample.shape)
+sample.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 39)
+
+
+
+ +
Out[4]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ARSONASSAULTBAD CHECKSBRIBERYBURGLARYDISORDERLY CONDUCTDRIVING UNDER THE INFLUENCEDRUG/NARCOTICDRUNKENNESSEMBEZZLEMENT...SEX OFFENSES NON FORCIBLESTOLEN PROPERTYSUICIDESUSPICIOUS OCCTREATRESPASSVANDALISMVEHICLE THEFTWARRANTSWEAPON LAWS
Id
00000000000...0000000010
10000000000...0000000010
20000000000...0000000010
+

3 rows × 39 columns

+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

Feature Engineering

+
+
+
+
+
+
In [5]:
+
+
+
combi = pd.concat([train, test])
+
+print(combi.shape)
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 8)
+
+
+
+ +
Out[5]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AddressCategoryDatesDayOfWeekIdPdDistrictXY
0OAK ST / LAGUNA STWARRANTS2015-05-13 23:53:00WednesdayNaNNORTHERN-122.42589237.774599
1OAK ST / LAGUNA STOTHER OFFENSES2015-05-13 23:53:00WednesdayNaNNORTHERN-122.42589237.774599
2VANNESS AV / GREENWICH STOTHER OFFENSES2015-05-13 23:33:00WednesdayNaNNORTHERN-122.42436337.800414
+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

DayOfWeek

+
+
+
+
+
+
In [6]:
+
+
+
print(combi["DayOfWeek"].unique())
+
+day_of_week_dataframe = pd.get_dummies(combi["DayOfWeek"], prefix="DayOfWeek").astype(np.bool)
+print(day_of_week_dataframe.shape)
+day_of_week_dataframe.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
['Wednesday' 'Tuesday' 'Monday' 'Sunday' 'Saturday' 'Friday' 'Thursday']
+(1762311, 7)
+
+
+
+ +
Out[6]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_TuesdayDayOfWeek_Wednesday
0FalseFalseFalseFalseFalseFalseTrue
1FalseFalseFalseFalseFalseFalseTrue
2FalseFalseFalseFalseFalseFalseTrue
+
+
+ +
+ +
+
+ +
+
+
+
In [7]:
+
+
+
combi = pd.concat([combi, day_of_week_dataframe], axis=1)
+combi.drop("DayOfWeek", axis=1, inplace=True)
+
+print(combi.shape)
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 14)
+
+
+
+ +
Out[7]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AddressCategoryDatesIdPdDistrictXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_TuesdayDayOfWeek_Wednesday
0OAK ST / LAGUNA STWARRANTS2015-05-13 23:53:00NaNNORTHERN-122.42589237.774599FalseFalseFalseFalseFalseFalseTrue
1OAK ST / LAGUNA STOTHER OFFENSES2015-05-13 23:53:00NaNNORTHERN-122.42589237.774599FalseFalseFalseFalseFalseFalseTrue
2VANNESS AV / GREENWICH STOTHER OFFENSES2015-05-13 23:33:00NaNNORTHERN-122.42436337.800414FalseFalseFalseFalseFalseFalseTrue
+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

PdDistrict

+
+
+
+
+
+
In [8]:
+
+
+
print(combi["PdDistrict"].unique())
+
+pd_district_dataframe = pd.get_dummies(combi["PdDistrict"], prefix="PdDistrict").astype(np.bool)
+
+print(pd_district_dataframe.shape)
+pd_district_dataframe.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
['NORTHERN' 'PARK' 'INGLESIDE' 'BAYVIEW' 'RICHMOND' 'CENTRAL' 'TARAVAL'
+ 'TENDERLOIN' 'MISSION' 'SOUTHERN']
+(1762311, 10)
+
+
+
+ +
Out[8]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PdDistrict_BAYVIEWPdDistrict_CENTRALPdDistrict_INGLESIDEPdDistrict_MISSIONPdDistrict_NORTHERNPdDistrict_PARKPdDistrict_RICHMONDPdDistrict_SOUTHERNPdDistrict_TARAVALPdDistrict_TENDERLOIN
0FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
1FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
2FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
+
+
+ +
+ +
+
+ +
+
+
+
In [9]:
+
+
+
combi = pd.concat([combi, pd_district_dataframe], axis=1)
+combi.drop("PdDistrict", axis=1, inplace=True)
+
+print(combi.shape)
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 23)
+
+
+
+ +
Out[9]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AddressCategoryDatesIdXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_Sunday...PdDistrict_BAYVIEWPdDistrict_CENTRALPdDistrict_INGLESIDEPdDistrict_MISSIONPdDistrict_NORTHERNPdDistrict_PARKPdDistrict_RICHMONDPdDistrict_SOUTHERNPdDistrict_TARAVALPdDistrict_TENDERLOIN
0OAK ST / LAGUNA STWARRANTS2015-05-13 23:53:00NaN-122.42589237.774599FalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
1OAK ST / LAGUNA STOTHER OFFENSES2015-05-13 23:53:00NaN-122.42589237.774599FalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
2VANNESS AV / GREENWICH STOTHER OFFENSES2015-05-13 23:33:00NaN-122.42436337.800414FalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
+

3 rows × 23 columns

+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

Dates

+
+
+
+
+
+
In [10]:
+
+
+
def get_season(x):
+    summer=0
+    fall=0
+    winter=0
+    spring=0
+    if (x in [5, 6, 7]):
+        summer=1
+    if (x in [8, 9, 10]):
+        fall=1
+    if (x in [11, 0, 1]):
+        winter=1
+    if (x in [2, 3, 4]):
+        spring=1
+    return summer, fall, winter, spring
+
+ +
+
+
+ +
+
+
+
In [11]:
+
+
+
combi["Dates_year"] = combi["Dates"].dt.year
+combi["Dates_month"] = combi["Dates"].dt.month
+combi["Dates_day"] = combi["Dates"].dt.day
+combi["Dates_hour"] = combi["Dates"].dt.hour
+combi["Dates_minute"] = combi["Dates"].dt.minute
+combi["Dates_second"] = combi["Dates"].dt.second
+combi["Awake"] = combi["Dates_hour"].apply(lambda x: 1 if (x==0 or (x>=8 and x<=23)) else 0)
+combi["Summer"], combi["Fall"], combi["Winter"], combi["Spring"]=zip(*combi["Dates_month"].apply(get_season))
+
+combi.drop("Dates", axis=1, inplace=True)
+
+print(combi.shape)
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 33)
+
+
+
+ +
Out[11]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AddressCategoryIdXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_Thursday...Dates_monthDates_dayDates_hourDates_minuteDates_secondAwakeSummerFallWinterSpring
0OAK ST / LAGUNA STWARRANTSNaN-122.42589237.774599FalseFalseFalseFalseFalse...5132353011000
1OAK ST / LAGUNA STOTHER OFFENSESNaN-122.42589237.774599FalseFalseFalseFalseFalse...5132353011000
2VANNESS AV / GREENWICH STOTHER OFFENSESNaN-122.42436337.800414FalseFalseFalseFalseFalse...5132333011000
+

3 rows × 33 columns

+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

Define a new feature named 'Address_Type' (Block/CrossRoad)

+
+
+
+
+
+
In [13]:
+
+
+
from sklearn.preprocessing import LabelEncoder
+
+combi["Address_Type"] = np.nan
+
+combi.loc[combi["Address"].str.contains("Block of"), "Address_Type"] = "Block"
+combi.loc[combi["Address"].str.contains("/"), "Address_Type"] = "CrossRoad"
+
+encoder = LabelEncoder()
+combi["Address_Type(encode)"] = encoder.fit_transform(combi["Address_Type"])
+
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
Out[13]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AddressCategoryIdXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_Thursday...Dates_hourDates_minuteDates_secondAwakeSummerFallWinterSpringAddress_TypeAddress_Type(encode)
0OAK ST / LAGUNA STWARRANTSNaN-122.42589237.774599FalseFalseFalseFalseFalse...2353011000CrossRoad1
1OAK ST / LAGUNA STOTHER OFFENSESNaN-122.42589237.774599FalseFalseFalseFalseFalse...2353011000CrossRoad1
2VANNESS AV / GREENWICH STOTHER OFFENSESNaN-122.42436337.800414FalseFalseFalseFalseFalse...2333011000CrossRoad1
+

3 rows × 35 columns

+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

Split to train / test dataset

+
+
+
+
+
+
In [14]:
+
+
+
combi.drop("Address", axis=1, inplace=True)
+
+print(combi.shape)
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 34)
+
+
+
+ +
Out[14]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CategoryIdXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_Tuesday...Dates_hourDates_minuteDates_secondAwakeSummerFallWinterSpringAddress_TypeAddress_Type(encode)
0WARRANTSNaN-122.42589237.774599FalseFalseFalseFalseFalseFalse...2353011000CrossRoad1
1OTHER OFFENSESNaN-122.42589237.774599FalseFalseFalseFalseFalseFalse...2353011000CrossRoad1
2OTHER OFFENSESNaN-122.42436337.800414FalseFalseFalseFalseFalseFalse...2333011000CrossRoad1
+

3 rows × 34 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [15]:
+
+
+
train = combi[combi["Category"].notnull()]
+
+train.drop("Id", axis=1, inplace=True)
+
+print(train.shape)
+train.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(878049, 33)
+
+
+
+ +
+
+
/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
+A value is trying to be set on a copy of a slice from a DataFrame
+
+See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
+  app.launch_new_instance()
+
+
+
+ +
Out[15]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CategoryXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_TuesdayDayOfWeek_Wednesday...Dates_hourDates_minuteDates_secondAwakeSummerFallWinterSpringAddress_TypeAddress_Type(encode)
0WARRANTS-122.42589237.774599FalseFalseFalseFalseFalseFalseTrue...2353011000CrossRoad1
1OTHER OFFENSES-122.42589237.774599FalseFalseFalseFalseFalseFalseTrue...2353011000CrossRoad1
2OTHER OFFENSES-122.42436337.800414FalseFalseFalseFalseFalseFalseTrue...2333011000CrossRoad1
+

3 rows × 33 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [16]:
+
+
+
test = combi[combi["Category"].isnull()]
+
+test["Id"] = test["Id"].astype(np.int32)
+test.drop("Category", axis=1, inplace=True)
+
+test.set_index("Id", inplace=True)
+
+print(test.shape)
+test.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
+A value is trying to be set on a copy of a slice from a DataFrame.
+Try using .loc[row_indexer,col_indexer] = value instead
+
+See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
+  app.launch_new_instance()
+
+
+
+ +
+
+
(884262, 32)
+
+
+
+ +
+
+
/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
+A value is trying to be set on a copy of a slice from a DataFrame
+
+See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
+
+
+
+ +
Out[16]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
XYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_TuesdayDayOfWeek_WednesdayPdDistrict_BAYVIEW...Dates_hourDates_minuteDates_secondAwakeSummerFallWinterSpringAddress_TypeAddress_Type(encode)
Id
0-122.39958837.735051FalseFalseFalseTrueFalseFalseFalseTrue...2359011000Block0
1-122.39152337.732432FalseFalseFalseTrueFalseFalseFalseTrue...2351011000CrossRoad1
2-122.42600237.792212FalseFalseFalseTrueFalseFalseFalseFalse...2350011000Block0
+

3 rows × 32 columns

+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

Score

+
+
+
+
+
+
In [17]:
+
+
+
exclude_columns = ["Address_Type"]
+
+label_name = "Category"
+feature_names = train.columns.difference([label_name] + exclude_columns)
+
+X_train = train[feature_names]
+
+print(X_train.shape)
+X_train.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(878049, 31)
+
+
+
+ +
Out[17]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Address_Type(encode)AwakeDates_dayDates_hourDates_minuteDates_monthDates_secondDates_yearDayOfWeek_FridayDayOfWeek_Monday...PdDistrict_PARKPdDistrict_RICHMONDPdDistrict_SOUTHERNPdDistrict_TARAVALPdDistrict_TENDERLOINSpringSummerWinterXY
011132353502015FalseFalse...FalseFalseFalseFalseFalse010-122.42589237.774599
111132353502015FalseFalse...FalseFalseFalseFalseFalse010-122.42589237.774599
211132333502015FalseFalse...FalseFalseFalseFalseFalse010-122.42436337.800414
+

3 rows × 31 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [18]:
+
+
+
y_train = train[label_name]
+
+print(y_train.shape)
+y_train.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(878049,)
+
+
+
+ +
Out[18]:
+ + +
+
0          WARRANTS
+1    OTHER OFFENSES
+2    OTHER OFFENSES
+Name: Category, dtype: object
+
+ +
+ +
+
+ +
+
+
+
+
+
+

Evaluate using Naive Bayes

+
+
+
+
+
+
In [19]:
+
+
+
from sklearn.naive_bayes import BernoulliNB
+from sklearn.cross_validation import cross_val_score, StratifiedKFold
+
+kfold = StratifiedKFold(y_train, n_folds=6)
+
+model = BernoulliNB()
+%time score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='log_loss').mean()
+score = -1.0 * score
+
+print("Use BernoulliNB. Score = {0:.6f}".format(score))
+
+ +
+
+
+ +
+
+ + +
+
+
CPU times: user 48.3 s, sys: 1.86 s, total: 50.2 s
+Wall time: 48.9 s
+Use BernoulliNB. Score = 2.517539
+
+
+
+ +
+
+ +
+
+
+
+
+
+

Predict

+
+
+
+
+
+
In [20]:
+
+
+
X_test = test[feature_names]
+
+print(X_test.shape)
+X_test.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 31)
+
+
+
+ +
Out[20]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Address_Type(encode)AwakeDates_dayDates_hourDates_minuteDates_monthDates_secondDates_yearDayOfWeek_FridayDayOfWeek_Monday...PdDistrict_PARKPdDistrict_RICHMONDPdDistrict_SOUTHERNPdDistrict_TARAVALPdDistrict_TENDERLOINSpringSummerWinterXY
Id
001102359502015FalseFalse...FalseFalseFalseFalseFalse010-122.39958837.735051
111102351502015FalseFalse...FalseFalseFalseFalseFalse010-122.39152337.732432
201102350502015FalseFalse...FalseFalseFalseFalseFalse010-122.42600237.792212
+

3 rows × 31 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [21]:
+
+
+
from sklearn.naive_bayes import BernoulliNB
+from sklearn.cross_validation import cross_val_score, StratifiedKFold
+
+model = BernoulliNB()
+model.fit(X_train, y_train)
+
+prediction = model.predict_proba(X_test)
+
+print(prediction.shape)
+prediction[:1]
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 39)
+
+
+
+ +
Out[21]:
+ + +
+
array([[  6.15661689e-03,   1.67585639e-01,   1.12483697e-04,
+          8.81713446e-04,   4.06910454e-02,   2.18988921e-03,
+          1.55177863e-03,   4.67223805e-02,   4.55686884e-03,
+          6.30590722e-04,   1.89187179e-04,   8.40936473e-04,
+          5.85516677e-03,   6.87678483e-03,   3.93153520e-04,
+          4.27555899e-03,   9.08472440e-02,   1.59855130e-03,
+          3.27831235e-04,   5.86817876e-02,   7.25539371e-02,
+          1.53244251e-01,   7.10261876e-06,   1.05155595e-04,
+          9.88284415e-03,   2.89483261e-02,   2.94895460e-03,
+          2.85398838e-02,   4.64345647e-03,   1.71533199e-04,
+          5.14001860e-03,   6.48117991e-04,   5.17240980e-02,
+          1.00059338e-06,   9.78585378e-03,   6.09758187e-02,
+          5.12109408e-02,   5.34540385e-02,   2.50494596e-02]])
+
+ +
+ +
+
+ +
+
+
+
In [22]:
+
+
+
submission = pd.DataFrame(prediction, index=X_test.index, columns = sample.columns)
+submission = submission.reindex_axis(sorted(submission.columns), axis=1,)
+
+print(submission.shape)
+submission.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 39)
+
+
+
+ +
Out[22]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ARSONASSAULTBAD CHECKSBRIBERYBURGLARYDISORDERLY CONDUCTDRIVING UNDER THE INFLUENCEDRUG/NARCOTICDRUNKENNESSEMBEZZLEMENT...SEX OFFENSES NON FORCIBLESTOLEN PROPERTYSUICIDESUSPICIOUS OCCTREATRESPASSVANDALISMVEHICLE THEFTWARRANTSWEAPON LAWS
Id
00.0061570.1675860.0001120.0008820.0406910.0021900.0015520.0467220.0045570.000631...0.0001720.0051400.0006480.0517241.000593e-060.0097860.0609760.0512110.0534540.025049
10.0028740.0994070.0000150.0003220.0011640.0021450.0061000.0645580.0052180.000037...0.0000620.0042860.0000590.0272023.376355e-070.0011660.0410000.0692800.0642540.024009
20.0016880.1212570.0001640.0002220.0564680.0041930.0020530.0411090.0062370.000744...0.0000590.0085670.0010860.0371242.925399e-070.0107550.0540270.0383790.0502560.009369
+

3 rows × 39 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [24]:
+
+
+
from datetime import datetime
+
+current_time = datetime.now()
+current_time = current_time.strftime("%Y%m%d%H%M%S")
+
+description = "add-address-type"
+csv_filename = "../submissions/" + current_time + "_" + description + ".csv"
+
+submission.to_csv(csv_filename)
+
+ +
+
+
+ +
+
+
+
In [25]:
+
+
+
import gzip
+
+gzip_filename = csv_filename + ".gz"
+
+f_in = open(csv_filename, "rb")
+
+f_out = gzip.open(gzip_filename, 'wb')
+f_out.writelines(f_in)
+f_out.close()
+
+f_in.close()
+
+ +
+
+
+ +
+
+
+ + diff --git a/delivers/baseline-script.ipynb b/delivers/baseline-script.ipynb index 3f83c5b..cb68004 100644 --- a/delivers/baseline-script.ipynb +++ b/delivers/baseline-script.ipynb @@ -1220,6 +1220,177 @@ "combi.head(3)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define a new feature named 'Address_Type' (Block/CrossRoad)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AddressCategoryIdXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_Thursday...Dates_hourDates_minuteDates_secondAwakeSummerFallWinterSpringAddress_TypeAddress_Type(encode)
0OAK ST / LAGUNA STWARRANTSNaN-122.42589237.774599FalseFalseFalseFalseFalse...2353011000CrossRoad1
1OAK ST / LAGUNA STOTHER OFFENSESNaN-122.42589237.774599FalseFalseFalseFalseFalse...2353011000CrossRoad1
2VANNESS AV / GREENWICH STOTHER OFFENSESNaN-122.42436337.800414FalseFalseFalseFalseFalse...2333011000CrossRoad1
\n", + "

3 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " Address Category Id X Y \\\n", + "0 OAK ST / LAGUNA ST WARRANTS NaN -122.425892 37.774599 \n", + "1 OAK ST / LAGUNA ST OTHER OFFENSES NaN -122.425892 37.774599 \n", + "2 VANNESS AV / GREENWICH ST OTHER OFFENSES NaN -122.424363 37.800414 \n", + "\n", + " DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday DayOfWeek_Sunday \\\n", + "0 False False False False \n", + "1 False False False False \n", + "2 False False False False \n", + "\n", + " DayOfWeek_Thursday ... Dates_hour Dates_minute \\\n", + "0 False ... 23 53 \n", + "1 False ... 23 53 \n", + "2 False ... 23 33 \n", + "\n", + " Dates_second Awake Summer Fall Winter Spring Address_Type \\\n", + "0 0 1 1 0 0 0 CrossRoad \n", + "1 0 1 1 0 0 0 CrossRoad \n", + "2 0 1 1 0 0 0 CrossRoad \n", + "\n", + " Address_Type(encode) \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "\n", + "[3 rows x 35 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "combi[\"Address_Type\"] = np.nan\n", + "\n", + "combi.loc[combi[\"Address\"].str.contains(\"Block of\"), \"Address_Type\"] = \"Block\"\n", + "combi.loc[combi[\"Address\"].str.contains(\"/\"), \"Address_Type\"] = \"CrossRoad\"\n", + "\n", + "encoder = LabelEncoder()\n", + "combi[\"Address_Type(encode)\"] = encoder.fit_transform(combi[\"Address_Type\"])\n", + "\n", + "combi.head(3)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1229,7 +1400,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": { "collapsed": false }, @@ -1238,7 +1409,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(1762311, 32)\n" + "(1762311, 34)\n" ] }, { @@ -1260,8 +1431,6 @@ " DayOfWeek_Thursday\n", " DayOfWeek_Tuesday\n", " ...\n", - " Dates_month\n", - " Dates_day\n", " Dates_hour\n", " Dates_minute\n", " Dates_second\n", @@ -1270,6 +1439,8 @@ " Fall\n", " Winter\n", " Spring\n", + " Address_Type\n", + " Address_Type(encode)\n", " \n", " \n", " \n", @@ -1286,8 +1457,6 @@ " False\n", " False\n", " ...\n", - " 5\n", - " 13\n", " 23\n", " 53\n", " 0\n", @@ -1296,6 +1465,8 @@ " 0\n", " 0\n", " 0\n", + " CrossRoad\n", + " 1\n", " \n", " \n", " 1\n", @@ -1310,8 +1481,6 @@ " False\n", " False\n", " ...\n", - " 5\n", - " 13\n", " 23\n", " 53\n", " 0\n", @@ -1320,6 +1489,8 @@ " 0\n", " 0\n", " 0\n", + " CrossRoad\n", + " 1\n", " \n", " \n", " 2\n", @@ -1334,8 +1505,6 @@ " False\n", " False\n", " ...\n", - " 5\n", - " 13\n", " 23\n", " 33\n", " 0\n", @@ -1344,10 +1513,12 @@ " 0\n", " 0\n", " 0\n", + " CrossRoad\n", + " 1\n", " \n", " \n", "\n", - "

3 rows × 32 columns

\n", + "

3 rows × 34 columns

\n", "" ], "text/plain": [ @@ -1361,20 +1532,20 @@ "1 False False False False \n", "2 False False False False \n", "\n", - " DayOfWeek_Tuesday ... Dates_month Dates_day Dates_hour Dates_minute \\\n", - "0 False ... 5 13 23 53 \n", - "1 False ... 5 13 23 53 \n", - "2 False ... 5 13 23 33 \n", + " DayOfWeek_Tuesday ... Dates_hour Dates_minute Dates_second \\\n", + "0 False ... 23 53 0 \n", + "1 False ... 23 53 0 \n", + "2 False ... 23 33 0 \n", "\n", - " Dates_second Awake Summer Fall Winter Spring \n", - "0 0 1 1 0 0 0 \n", - "1 0 1 1 0 0 0 \n", - "2 0 1 1 0 0 0 \n", + " Awake Summer Fall Winter Spring Address_Type Address_Type(encode) \n", + "0 1 1 0 0 0 CrossRoad 1 \n", + "1 1 1 0 0 0 CrossRoad 1 \n", + "2 1 1 0 0 0 CrossRoad 1 \n", "\n", - "[3 rows x 32 columns]" + "[3 rows x 34 columns]" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1388,7 +1559,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": { "collapsed": false }, @@ -1397,14 +1568,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "(878049, 31)\n" + "(878049, 33)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", @@ -1430,8 +1601,6 @@ " DayOfWeek_Tuesday\n", " DayOfWeek_Wednesday\n", " ...\n", - " Dates_month\n", - " Dates_day\n", " Dates_hour\n", " Dates_minute\n", " Dates_second\n", @@ -1440,6 +1609,8 @@ " Fall\n", " Winter\n", " Spring\n", + " Address_Type\n", + " Address_Type(encode)\n", " \n", " \n", " \n", @@ -1456,8 +1627,6 @@ " False\n", " True\n", " ...\n", - " 5\n", - " 13\n", " 23\n", " 53\n", " 0\n", @@ -1466,6 +1635,8 @@ " 0\n", " 0\n", " 0\n", + " CrossRoad\n", + " 1\n", " \n", " \n", " 1\n", @@ -1480,8 +1651,6 @@ " False\n", " True\n", " ...\n", - " 5\n", - " 13\n", " 23\n", " 53\n", " 0\n", @@ -1490,6 +1659,8 @@ " 0\n", " 0\n", " 0\n", + " CrossRoad\n", + " 1\n", " \n", " \n", " 2\n", @@ -1504,8 +1675,6 @@ " False\n", " True\n", " ...\n", - " 5\n", - " 13\n", " 23\n", " 33\n", " 0\n", @@ -1514,10 +1683,12 @@ " 0\n", " 0\n", " 0\n", + " CrossRoad\n", + " 1\n", " \n", " \n", "\n", - "

3 rows × 31 columns

\n", + "

3 rows × 33 columns

\n", "" ], "text/plain": [ @@ -1531,20 +1702,25 @@ "1 False False False False \n", "2 False False False False \n", "\n", - " DayOfWeek_Wednesday ... Dates_month Dates_day Dates_hour Dates_minute \\\n", - "0 True ... 5 13 23 53 \n", - "1 True ... 5 13 23 53 \n", - "2 True ... 5 13 23 33 \n", + " DayOfWeek_Wednesday ... Dates_hour Dates_minute \\\n", + "0 True ... 23 53 \n", + "1 True ... 23 53 \n", + "2 True ... 23 33 \n", "\n", - " Dates_second Awake Summer Fall Winter Spring \n", - "0 0 1 1 0 0 0 \n", - "1 0 1 1 0 0 0 \n", - "2 0 1 1 0 0 0 \n", + " Dates_second Awake Summer Fall Winter Spring Address_Type \\\n", + "0 0 1 1 0 0 0 CrossRoad \n", + "1 0 1 1 0 0 0 CrossRoad \n", + "2 0 1 1 0 0 0 CrossRoad \n", "\n", - "[3 rows x 31 columns]" + " Address_Type(encode) \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "\n", + "[3 rows x 33 columns]" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1560,29 +1736,35 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " app.launch_new_instance()\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "(884262, 30)\n" + "(884262, 32)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " app.launch_new_instance()\n", - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: \n", + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" @@ -1607,8 +1789,6 @@ " DayOfWeek_Wednesday\n", " PdDistrict_BAYVIEW\n", " ...\n", - " Dates_month\n", - " Dates_day\n", " Dates_hour\n", " Dates_minute\n", " Dates_second\n", @@ -1617,6 +1797,8 @@ " Fall\n", " Winter\n", " Spring\n", + " Address_Type\n", + " Address_Type(encode)\n", " \n", " \n", " Id\n", @@ -1657,8 +1839,6 @@ " False\n", " True\n", " ...\n", - " 5\n", - " 10\n", " 23\n", " 59\n", " 0\n", @@ -1667,6 +1847,8 @@ " 0\n", " 0\n", " 0\n", + " Block\n", + " 0\n", " \n", " \n", " 1\n", @@ -1681,8 +1863,6 @@ " False\n", " True\n", " ...\n", - " 5\n", - " 10\n", " 23\n", " 51\n", " 0\n", @@ -1691,6 +1871,8 @@ " 0\n", " 0\n", " 0\n", + " CrossRoad\n", + " 1\n", " \n", " \n", " 2\n", @@ -1705,8 +1887,6 @@ " False\n", " False\n", " ...\n", - " 5\n", - " 10\n", " 23\n", " 50\n", " 0\n", @@ -1715,10 +1895,12 @@ " 0\n", " 0\n", " 0\n", + " Block\n", + " 0\n", " \n", " \n", "\n", - "

3 rows × 30 columns

\n", + "

3 rows × 32 columns

\n", "" ], "text/plain": [ @@ -1734,22 +1916,28 @@ "1 False True False False \n", "2 False True False False \n", "\n", - " DayOfWeek_Wednesday PdDistrict_BAYVIEW ... Dates_month Dates_day \\\n", - "Id ... \n", - "0 False True ... 5 10 \n", - "1 False True ... 5 10 \n", - "2 False False ... 5 10 \n", + " DayOfWeek_Wednesday PdDistrict_BAYVIEW ... Dates_hour \\\n", + "Id ... \n", + "0 False True ... 23 \n", + "1 False True ... 23 \n", + "2 False False ... 23 \n", "\n", - " Dates_hour Dates_minute Dates_second Awake Summer Fall Winter Spring \n", - "Id \n", - "0 23 59 0 1 1 0 0 0 \n", - "1 23 51 0 1 1 0 0 0 \n", - "2 23 50 0 1 1 0 0 0 \n", + " Dates_minute Dates_second Awake Summer Fall Winter Spring Address_Type \\\n", + "Id \n", + "0 59 0 1 1 0 0 0 Block \n", + "1 51 0 1 1 0 0 0 CrossRoad \n", + "2 50 0 1 1 0 0 0 Block \n", + "\n", + " Address_Type(encode) \n", + "Id \n", + "0 0 \n", + "1 1 \n", + "2 0 \n", "\n", - "[3 rows x 30 columns]" + "[3 rows x 32 columns]" ] }, - "execution_count": 14, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1775,7 +1963,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -1784,7 +1972,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(878049, 30)\n" + "(878049, 31)\n" ] }, { @@ -1795,6 +1983,7 @@ " \n", " \n", " \n", + " Address_Type(encode)\n", " Awake\n", " Dates_day\n", " Dates_hour\n", @@ -1804,7 +1993,6 @@ " Dates_year\n", " DayOfWeek_Friday\n", " DayOfWeek_Monday\n", - " DayOfWeek_Saturday\n", " ...\n", " PdDistrict_PARK\n", " PdDistrict_RICHMOND\n", @@ -1822,6 +2010,7 @@ " \n", " 0\n", " 1\n", + " 1\n", " 13\n", " 23\n", " 53\n", @@ -1830,7 +2019,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -1846,6 +2034,7 @@ " \n", " 1\n", " 1\n", + " 1\n", " 13\n", " 23\n", " 53\n", @@ -1854,7 +2043,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -1870,6 +2058,7 @@ " \n", " 2\n", " 1\n", + " 1\n", " 13\n", " 23\n", " 33\n", @@ -1878,7 +2067,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -1893,41 +2081,48 @@ " \n", " \n", "\n", - "

3 rows × 30 columns

\n", + "

3 rows × 31 columns

\n", "" ], "text/plain": [ - " Awake Dates_day Dates_hour Dates_minute Dates_month Dates_second \\\n", - "0 1 13 23 53 5 0 \n", - "1 1 13 23 53 5 0 \n", - "2 1 13 23 33 5 0 \n", + " Address_Type(encode) Awake Dates_day Dates_hour Dates_minute \\\n", + "0 1 1 13 23 53 \n", + "1 1 1 13 23 53 \n", + "2 1 1 13 23 33 \n", "\n", - " Dates_year DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday ... \\\n", - "0 2015 False False False ... \n", - "1 2015 False False False ... \n", - "2 2015 False False False ... \n", + " Dates_month Dates_second Dates_year DayOfWeek_Friday DayOfWeek_Monday \\\n", + "0 5 0 2015 False False \n", + "1 5 0 2015 False False \n", + "2 5 0 2015 False False \n", "\n", - " PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN PdDistrict_TARAVAL \\\n", - "0 False False False False \n", - "1 False False False False \n", - "2 False False False False \n", + " ... PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN \\\n", + "0 ... False False False \n", + "1 ... False False False \n", + "2 ... False False False \n", "\n", - " PdDistrict_TENDERLOIN Spring Summer Winter X Y \n", - "0 False 0 1 0 -122.425892 37.774599 \n", - "1 False 0 1 0 -122.425892 37.774599 \n", - "2 False 0 1 0 -122.424363 37.800414 \n", + " PdDistrict_TARAVAL PdDistrict_TENDERLOIN Spring Summer Winter X \\\n", + "0 False False 0 1 0 -122.425892 \n", + "1 False False 0 1 0 -122.425892 \n", + "2 False False 0 1 0 -122.424363 \n", "\n", - "[3 rows x 30 columns]" + " Y \n", + "0 37.774599 \n", + "1 37.774599 \n", + "2 37.800414 \n", + "\n", + "[3 rows x 31 columns]" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "exclude_columns = [\"Address_Type\"]\n", + "\n", "label_name = \"Category\"\n", - "feature_names = train.columns.difference([label_name])\n", + "feature_names = train.columns.difference([label_name] + exclude_columns)\n", "\n", "X_train = train[feature_names]\n", "\n", @@ -1937,7 +2132,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": { "collapsed": false }, @@ -1958,7 +2153,7 @@ "Name: Category, dtype: object" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1979,7 +2174,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": { "collapsed": false }, @@ -1988,9 +2183,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 55.2 s, sys: 5.56 s, total: 1min\n", - "Wall time: 1min\n", - "Use BernoulliNB. Score = 2.562140\n" + "CPU times: user 48.3 s, sys: 1.86 s, total: 50.2 s\n", + "Wall time: 48.9 s\n", + "Use BernoulliNB. Score = 2.517539\n" ] } ], @@ -2016,7 +2211,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": { "collapsed": false }, @@ -2025,7 +2220,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(884262, 30)\n" + "(884262, 31)\n" ] }, { @@ -2036,6 +2231,7 @@ " \n", " \n", " \n", + " Address_Type(encode)\n", " Awake\n", " Dates_day\n", " Dates_hour\n", @@ -2045,7 +2241,6 @@ " Dates_year\n", " DayOfWeek_Friday\n", " DayOfWeek_Monday\n", - " DayOfWeek_Saturday\n", " ...\n", " PdDistrict_PARK\n", " PdDistrict_RICHMOND\n", @@ -2086,6 +2281,7 @@ " \n", " \n", " 0\n", + " 0\n", " 1\n", " 10\n", " 23\n", @@ -2095,7 +2291,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -2111,6 +2306,7 @@ " \n", " 1\n", " 1\n", + " 1\n", " 10\n", " 23\n", " 51\n", @@ -2119,7 +2315,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -2134,6 +2329,7 @@ " \n", " \n", " 2\n", + " 0\n", " 1\n", " 10\n", " 23\n", @@ -2143,7 +2339,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -2158,21 +2353,21 @@ " \n", " \n", "\n", - "

3 rows × 30 columns

\n", + "

3 rows × 31 columns

\n", "" ], "text/plain": [ - " Awake Dates_day Dates_hour Dates_minute Dates_month Dates_second \\\n", - "Id \n", - "0 1 10 23 59 5 0 \n", - "1 1 10 23 51 5 0 \n", - "2 1 10 23 50 5 0 \n", + " Address_Type(encode) Awake Dates_day Dates_hour Dates_minute \\\n", + "Id \n", + "0 0 1 10 23 59 \n", + "1 1 1 10 23 51 \n", + "2 0 1 10 23 50 \n", "\n", - " Dates_year DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday \\\n", - "Id \n", - "0 2015 False False False \n", - "1 2015 False False False \n", - "2 2015 False False False \n", + " Dates_month Dates_second Dates_year DayOfWeek_Friday DayOfWeek_Monday \\\n", + "Id \n", + "0 5 0 2015 False False \n", + "1 5 0 2015 False False \n", + "2 5 0 2015 False False \n", "\n", " ... PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN \\\n", "Id ... \n", @@ -2180,11 +2375,11 @@ "1 ... False False False \n", "2 ... False False False \n", "\n", - " PdDistrict_TARAVAL PdDistrict_TENDERLOIN Spring Summer Winter X \\\n", + " PdDistrict_TARAVAL PdDistrict_TENDERLOIN Spring Summer Winter X \\\n", "Id \n", - "0 False False 0 1 0 -122.399588 \n", - "1 False False 0 1 0 -122.391523 \n", - "2 False False 0 1 0 -122.426002 \n", + "0 False False 0 1 0 -122.399588 \n", + "1 False False 0 1 0 -122.391523 \n", + "2 False False 0 1 0 -122.426002 \n", "\n", " Y \n", "Id \n", @@ -2192,10 +2387,10 @@ "1 37.732432 \n", "2 37.792212 \n", "\n", - "[3 rows x 30 columns]" + "[3 rows x 31 columns]" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2209,7 +2404,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": { "collapsed": false }, @@ -2224,22 +2419,22 @@ { "data": { "text/plain": [ - "array([[ 5.18022914e-03, 1.47306845e-01, 8.35309027e-05,\n", - " 7.15327047e-04, 2.89343463e-02, 2.17660902e-03,\n", - " 2.90469652e-03, 5.20272506e-02, 4.75338148e-03,\n", - " 4.53910576e-04, 1.50404661e-04, 6.33194320e-04,\n", - " 4.46497596e-03, 5.83197476e-03, 3.21888082e-04,\n", - " 3.66372241e-03, 9.46729466e-02, 1.63443121e-03,\n", - " 6.10377668e-04, 4.41001971e-02, 6.88112197e-02,\n", - " 2.06289853e-01, 6.30319698e-06, 3.51815041e-04,\n", - " 8.06585264e-03, 3.64767735e-02, 2.11525827e-03,\n", - " 2.31851049e-02, 3.90020973e-03, 1.39053149e-04,\n", - " 4.88590719e-03, 4.72963357e-04, 4.44303697e-02,\n", - " 8.03404588e-07, 7.22208250e-03, 5.50342776e-02,\n", - " 5.65854147e-02, 5.66664063e-02, 2.47400928e-02]])" + "array([[ 6.15661689e-03, 1.67585639e-01, 1.12483697e-04,\n", + " 8.81713446e-04, 4.06910454e-02, 2.18988921e-03,\n", + " 1.55177863e-03, 4.67223805e-02, 4.55686884e-03,\n", + " 6.30590722e-04, 1.89187179e-04, 8.40936473e-04,\n", + " 5.85516677e-03, 6.87678483e-03, 3.93153520e-04,\n", + " 4.27555899e-03, 9.08472440e-02, 1.59855130e-03,\n", + " 3.27831235e-04, 5.86817876e-02, 7.25539371e-02,\n", + " 1.53244251e-01, 7.10261876e-06, 1.05155595e-04,\n", + " 9.88284415e-03, 2.89483261e-02, 2.94895460e-03,\n", + " 2.85398838e-02, 4.64345647e-03, 1.71533199e-04,\n", + " 5.14001860e-03, 6.48117991e-04, 5.17240980e-02,\n", + " 1.00059338e-06, 9.78585378e-03, 6.09758187e-02,\n", + " 5.12109408e-02, 5.34540385e-02, 2.50494596e-02]])" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2259,7 +2454,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": { "collapsed": false }, @@ -2329,75 +2524,75 @@ " \n", " \n", " 0\n", - " 0.005180\n", - " 0.147307\n", - " 0.000084\n", - " 0.000715\n", - " 0.028934\n", - " 0.002177\n", - " 0.002905\n", - " 0.052027\n", - " 0.004753\n", - " 0.000454\n", + " 0.006157\n", + " 0.167586\n", + " 0.000112\n", + " 0.000882\n", + " 0.040691\n", + " 0.002190\n", + " 0.001552\n", + " 0.046722\n", + " 0.004557\n", + " 0.000631\n", " ...\n", - " 0.000139\n", - " 0.004886\n", - " 0.000473\n", - " 0.044430\n", - " 8.034046e-07\n", - " 0.007222\n", - " 0.055034\n", - " 0.056585\n", - " 0.056666\n", - " 0.024740\n", + " 0.000172\n", + " 0.005140\n", + " 0.000648\n", + " 0.051724\n", + " 1.000593e-06\n", + " 0.009786\n", + " 0.060976\n", + " 0.051211\n", + " 0.053454\n", + " 0.025049\n", " \n", " \n", " 1\n", - " 0.005180\n", - " 0.147307\n", - " 0.000084\n", - " 0.000715\n", - " 0.028934\n", - " 0.002177\n", - " 0.002905\n", - " 0.052027\n", - " 0.004753\n", - " 0.000454\n", + " 0.002874\n", + " 0.099407\n", + " 0.000015\n", + " 0.000322\n", + " 0.001164\n", + " 0.002145\n", + " 0.006100\n", + " 0.064558\n", + " 0.005218\n", + " 0.000037\n", " ...\n", - " 0.000139\n", - " 0.004886\n", - " 0.000473\n", - " 0.044430\n", - " 8.034046e-07\n", - " 0.007222\n", - " 0.055034\n", - " 0.056585\n", - " 0.056666\n", - " 0.024740\n", + " 0.000062\n", + " 0.004286\n", + " 0.000059\n", + " 0.027202\n", + " 3.376355e-07\n", + " 0.001166\n", + " 0.041000\n", + " 0.069280\n", + " 0.064254\n", + " 0.024009\n", " \n", " \n", " 2\n", - " 0.001412\n", - " 0.105913\n", - " 0.000121\n", - " 0.000179\n", - " 0.039900\n", - " 0.004142\n", - " 0.003819\n", - " 0.045489\n", - " 0.006465\n", - " 0.000533\n", + " 0.001688\n", + " 0.121257\n", + " 0.000164\n", + " 0.000222\n", + " 0.056468\n", + " 0.004193\n", + " 0.002053\n", + " 0.041109\n", + " 0.006237\n", + " 0.000744\n", " ...\n", - " 0.000048\n", - " 0.008092\n", - " 0.000788\n", - " 0.031688\n", - " 2.334092e-07\n", - " 0.007887\n", - " 0.048455\n", - " 0.042140\n", - " 0.052941\n", - " 0.009195\n", + " 0.000059\n", + " 0.008567\n", + " 0.001086\n", + " 0.037124\n", + " 2.925399e-07\n", + " 0.010755\n", + " 0.054027\n", + " 0.038379\n", + " 0.050256\n", + " 0.009369\n", " \n", " \n", "\n", @@ -2407,38 +2602,38 @@ "text/plain": [ " ARSON ASSAULT BAD CHECKS BRIBERY BURGLARY DISORDERLY CONDUCT \\\n", "Id \n", - "0 0.005180 0.147307 0.000084 0.000715 0.028934 0.002177 \n", - "1 0.005180 0.147307 0.000084 0.000715 0.028934 0.002177 \n", - "2 0.001412 0.105913 0.000121 0.000179 0.039900 0.004142 \n", + "0 0.006157 0.167586 0.000112 0.000882 0.040691 0.002190 \n", + "1 0.002874 0.099407 0.000015 0.000322 0.001164 0.002145 \n", + "2 0.001688 0.121257 0.000164 0.000222 0.056468 0.004193 \n", "\n", " DRIVING UNDER THE INFLUENCE DRUG/NARCOTIC DRUNKENNESS EMBEZZLEMENT \\\n", "Id \n", - "0 0.002905 0.052027 0.004753 0.000454 \n", - "1 0.002905 0.052027 0.004753 0.000454 \n", - "2 0.003819 0.045489 0.006465 0.000533 \n", + "0 0.001552 0.046722 0.004557 0.000631 \n", + "1 0.006100 0.064558 0.005218 0.000037 \n", + "2 0.002053 0.041109 0.006237 0.000744 \n", "\n", " ... SEX OFFENSES NON FORCIBLE STOLEN PROPERTY SUICIDE \\\n", "Id ... \n", - "0 ... 0.000139 0.004886 0.000473 \n", - "1 ... 0.000139 0.004886 0.000473 \n", - "2 ... 0.000048 0.008092 0.000788 \n", + "0 ... 0.000172 0.005140 0.000648 \n", + "1 ... 0.000062 0.004286 0.000059 \n", + "2 ... 0.000059 0.008567 0.001086 \n", "\n", " SUSPICIOUS OCC TREA TRESPASS VANDALISM VEHICLE THEFT \\\n", "Id \n", - "0 0.044430 8.034046e-07 0.007222 0.055034 0.056585 \n", - "1 0.044430 8.034046e-07 0.007222 0.055034 0.056585 \n", - "2 0.031688 2.334092e-07 0.007887 0.048455 0.042140 \n", + "0 0.051724 1.000593e-06 0.009786 0.060976 0.051211 \n", + "1 0.027202 3.376355e-07 0.001166 0.041000 0.069280 \n", + "2 0.037124 2.925399e-07 0.010755 0.054027 0.038379 \n", "\n", " WARRANTS WEAPON LAWS \n", "Id \n", - "0 0.056666 0.024740 \n", - "1 0.056666 0.024740 \n", - "2 0.052941 0.009195 \n", + "0 0.053454 0.025049 \n", + "1 0.064254 0.024009 \n", + "2 0.050256 0.009369 \n", "\n", "[3 rows x 39 columns]" ] }, - "execution_count": 20, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2453,7 +2648,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "metadata": { "collapsed": false }, @@ -2464,14 +2659,15 @@ "current_time = datetime.now()\n", "current_time = current_time.strftime(\"%Y%m%d%H%M%S\")\n", "\n", - "csv_filename = \"../submissions/\" + current_time + \"_\" + \"baseline_script.csv\"\n", + "description = \"add-address-type\"\n", + "csv_filename = \"../submissions/\" + current_time + \"_\" + description + \".csv\"\n", "\n", "submission.to_csv(csv_filename)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": { "collapsed": false }, @@ -2494,21 +2690,21 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python [Root]", + "display_name": "Python 3", "language": "python", - "name": "Python [Root]" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/delivers/baseline-script.py b/delivers/baseline-script.py index 3fc7b34..0854b0f 100644 --- a/delivers/baseline-script.py +++ b/delivers/baseline-script.py @@ -128,9 +128,26 @@ def get_season(x): combi.head(3) +# ### Define a new feature named 'Address_Type' (Block/CrossRoad) + +# In[13]: + +from sklearn.preprocessing import LabelEncoder + +combi["Address_Type"] = np.nan + +combi.loc[combi["Address"].str.contains("Block of"), "Address_Type"] = "Block" +combi.loc[combi["Address"].str.contains("/"), "Address_Type"] = "CrossRoad" + +encoder = LabelEncoder() +combi["Address_Type(encode)"] = encoder.fit_transform(combi["Address_Type"]) + +combi.head(3) + + # ### Split to train / test dataset -# In[12]: +# In[14]: combi.drop("Address", axis=1, inplace=True) @@ -138,7 +155,7 @@ def get_season(x): combi.head(3) -# In[13]: +# In[15]: train = combi[combi["Category"].notnull()] @@ -148,7 +165,7 @@ def get_season(x): train.head(3) -# In[14]: +# In[16]: test = combi[combi["Category"].isnull()] @@ -163,10 +180,12 @@ def get_season(x): # ## Score -# In[15]: +# In[17]: + +exclude_columns = ["Address_Type"] label_name = "Category" -feature_names = train.columns.difference([label_name]) +feature_names = train.columns.difference([label_name] + exclude_columns) X_train = train[feature_names] @@ -174,7 +193,7 @@ def get_season(x): X_train.head(3) -# In[16]: +# In[18]: y_train = train[label_name] @@ -184,7 +203,7 @@ def get_season(x): # ### Evaluate using Naive Bayes -# In[17]: +# In[19]: from sklearn.naive_bayes import BernoulliNB from sklearn.cross_validation import cross_val_score, StratifiedKFold @@ -192,7 +211,7 @@ def get_season(x): kfold = StratifiedKFold(y_train, n_folds=6) model = BernoulliNB() -get_ipython().magic(u"time score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='log_loss').mean()") +get_ipython().magic("time score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='log_loss').mean()") score = -1.0 * score print("Use BernoulliNB. Score = {0:.6f}".format(score)) @@ -200,7 +219,7 @@ def get_season(x): # ## Predict -# In[18]: +# In[20]: X_test = test[feature_names] @@ -208,7 +227,7 @@ def get_season(x): X_test.head(3) -# In[19]: +# In[21]: from sklearn.naive_bayes import BernoulliNB from sklearn.cross_validation import cross_val_score, StratifiedKFold @@ -222,7 +241,7 @@ def get_season(x): prediction[:1] -# In[20]: +# In[22]: submission = pd.DataFrame(prediction, index=X_test.index, columns = sample.columns) submission = submission.reindex_axis(sorted(submission.columns), axis=1,) @@ -231,19 +250,20 @@ def get_season(x): submission.head(3) -# In[21]: +# In[24]: from datetime import datetime current_time = datetime.now() current_time = current_time.strftime("%Y%m%d%H%M%S") -csv_filename = "../submissions/" + current_time + "_" + "baseline_script.csv" +description = "add-address-type" +csv_filename = "../submissions/" + current_time + "_" + description + ".csv" submission.to_csv(csv_filename) -# In[22]: +# In[25]: import gzip