diff --git a/delivers/baseline-script.html b/delivers/baseline-script.html new file mode 100644 index 0000000..395f971 --- /dev/null +++ b/delivers/baseline-script.html @@ -0,0 +1,14409 @@ + + + +baseline-script + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+
+
+

Baseline script of San Francisco Crime Classification

Baseline script. Hope this helps.

+ +
+
+
+
+
+
In [1]:
+
+
+
import numpy as np
+import pandas as pd
+
+ +
+
+
+ +
+
+
+
+
+
+

Load datasets

+
+
+
+
+
+
In [2]:
+
+
+
train = pd.read_csv("../data/train.csv", parse_dates=["Dates"])
+
+train.drop("Resolution", axis=1, inplace=True)
+train.drop("Descript", axis=1, inplace=True)
+
+print(train.shape)
+train.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(878049, 7)
+
+
+
+ +
Out[2]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DatesCategoryDayOfWeekPdDistrictAddressXY
02015-05-13 23:53:00WARRANTSWednesdayNORTHERNOAK ST / LAGUNA ST-122.42589237.774599
12015-05-13 23:53:00OTHER OFFENSESWednesdayNORTHERNOAK ST / LAGUNA ST-122.42589237.774599
22015-05-13 23:33:00OTHER OFFENSESWednesdayNORTHERNVANNESS AV / GREENWICH ST-122.42436337.800414
+
+
+ +
+ +
+
+ +
+
+
+
In [3]:
+
+
+
test = pd.read_csv("../data/test.csv", parse_dates=["Dates"])
+
+print(test.shape)
+test.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 7)
+
+
+
+ +
Out[3]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IdDatesDayOfWeekPdDistrictAddressXY
002015-05-10 23:59:00SundayBAYVIEW2000 Block of THOMAS AV-122.39958837.735051
112015-05-10 23:51:00SundayBAYVIEW3RD ST / REVERE AV-122.39152337.732432
222015-05-10 23:50:00SundayNORTHERN2000 Block of GOUGH ST-122.42600237.792212
+
+
+ +
+ +
+
+ +
+
+
+
In [4]:
+
+
+
sample = pd.read_csv("../data/sampleSubmission.csv", index_col="Id")
+
+print(sample.shape)
+sample.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 39)
+
+
+
+ +
Out[4]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ARSONASSAULTBAD CHECKSBRIBERYBURGLARYDISORDERLY CONDUCTDRIVING UNDER THE INFLUENCEDRUG/NARCOTICDRUNKENNESSEMBEZZLEMENT...SEX OFFENSES NON FORCIBLESTOLEN PROPERTYSUICIDESUSPICIOUS OCCTREATRESPASSVANDALISMVEHICLE THEFTWARRANTSWEAPON LAWS
Id
00000000000...0000000010
10000000000...0000000010
20000000000...0000000010
+

3 rows × 39 columns

+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

Feature Engineering

+
+
+
+
+
+
In [5]:
+
+
+
combi = pd.concat([train, test])
+
+print(combi.shape)
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 8)
+
+
+
+ +
Out[5]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AddressCategoryDatesDayOfWeekIdPdDistrictXY
0OAK ST / LAGUNA STWARRANTS2015-05-13 23:53:00WednesdayNaNNORTHERN-122.42589237.774599
1OAK ST / LAGUNA STOTHER OFFENSES2015-05-13 23:53:00WednesdayNaNNORTHERN-122.42589237.774599
2VANNESS AV / GREENWICH STOTHER OFFENSES2015-05-13 23:33:00WednesdayNaNNORTHERN-122.42436337.800414
+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

DayOfWeek

+
+
+
+
+
+
In [6]:
+
+
+
print(combi["DayOfWeek"].unique())
+
+day_of_week_dataframe = pd.get_dummies(combi["DayOfWeek"], prefix="DayOfWeek").astype(np.bool)
+print(day_of_week_dataframe.shape)
+day_of_week_dataframe.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
['Wednesday' 'Tuesday' 'Monday' 'Sunday' 'Saturday' 'Friday' 'Thursday']
+(1762311, 7)
+
+
+
+ +
Out[6]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_TuesdayDayOfWeek_Wednesday
0FalseFalseFalseFalseFalseFalseTrue
1FalseFalseFalseFalseFalseFalseTrue
2FalseFalseFalseFalseFalseFalseTrue
+
+
+ +
+ +
+
+ +
+
+
+
In [7]:
+
+
+
combi = pd.concat([combi, day_of_week_dataframe], axis=1)
+combi.drop("DayOfWeek", axis=1, inplace=True)
+
+print(combi.shape)
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 14)
+
+
+
+ +
Out[7]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AddressCategoryDatesIdPdDistrictXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_TuesdayDayOfWeek_Wednesday
0OAK ST / LAGUNA STWARRANTS2015-05-13 23:53:00NaNNORTHERN-122.42589237.774599FalseFalseFalseFalseFalseFalseTrue
1OAK ST / LAGUNA STOTHER OFFENSES2015-05-13 23:53:00NaNNORTHERN-122.42589237.774599FalseFalseFalseFalseFalseFalseTrue
2VANNESS AV / GREENWICH STOTHER OFFENSES2015-05-13 23:33:00NaNNORTHERN-122.42436337.800414FalseFalseFalseFalseFalseFalseTrue
+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

PdDistrict

+
+
+
+
+
+
In [8]:
+
+
+
print(combi["PdDistrict"].unique())
+
+pd_district_dataframe = pd.get_dummies(combi["PdDistrict"], prefix="PdDistrict").astype(np.bool)
+
+print(pd_district_dataframe.shape)
+pd_district_dataframe.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
['NORTHERN' 'PARK' 'INGLESIDE' 'BAYVIEW' 'RICHMOND' 'CENTRAL' 'TARAVAL'
+ 'TENDERLOIN' 'MISSION' 'SOUTHERN']
+(1762311, 10)
+
+
+
+ +
Out[8]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PdDistrict_BAYVIEWPdDistrict_CENTRALPdDistrict_INGLESIDEPdDistrict_MISSIONPdDistrict_NORTHERNPdDistrict_PARKPdDistrict_RICHMONDPdDistrict_SOUTHERNPdDistrict_TARAVALPdDistrict_TENDERLOIN
0FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
1FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
2FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
+
+
+ +
+ +
+
+ +
+
+
+
In [9]:
+
+
+
combi = pd.concat([combi, pd_district_dataframe], axis=1)
+combi.drop("PdDistrict", axis=1, inplace=True)
+
+print(combi.shape)
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 23)
+
+
+
+ +
Out[9]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AddressCategoryDatesIdXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_Sunday...PdDistrict_BAYVIEWPdDistrict_CENTRALPdDistrict_INGLESIDEPdDistrict_MISSIONPdDistrict_NORTHERNPdDistrict_PARKPdDistrict_RICHMONDPdDistrict_SOUTHERNPdDistrict_TARAVALPdDistrict_TENDERLOIN
0OAK ST / LAGUNA STWARRANTS2015-05-13 23:53:00NaN-122.42589237.774599FalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
1OAK ST / LAGUNA STOTHER OFFENSES2015-05-13 23:53:00NaN-122.42589237.774599FalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
2VANNESS AV / GREENWICH STOTHER OFFENSES2015-05-13 23:33:00NaN-122.42436337.800414FalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
+

3 rows × 23 columns

+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

Dates

+
+
+
+
+
+
In [10]:
+
+
+
def get_season(x):
+    summer=0
+    fall=0
+    winter=0
+    spring=0
+    if (x in [5, 6, 7]):
+        summer=1
+    if (x in [8, 9, 10]):
+        fall=1
+    if (x in [11, 0, 1]):
+        winter=1
+    if (x in [2, 3, 4]):
+        spring=1
+    return summer, fall, winter, spring
+
+ +
+
+
+ +
+
+
+
In [11]:
+
+
+
combi["Dates_year"] = combi["Dates"].dt.year
+combi["Dates_month"] = combi["Dates"].dt.month
+combi["Dates_day"] = combi["Dates"].dt.day
+combi["Dates_hour"] = combi["Dates"].dt.hour
+combi["Dates_minute"] = combi["Dates"].dt.minute
+combi["Dates_second"] = combi["Dates"].dt.second
+combi["Awake"] = combi["Dates_hour"].apply(lambda x: 1 if (x==0 or (x>=8 and x<=23)) else 0)
+combi["Summer"], combi["Fall"], combi["Winter"], combi["Spring"]=zip(*combi["Dates_month"].apply(get_season))
+
+combi.drop("Dates", axis=1, inplace=True)
+
+print(combi.shape)
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 33)
+
+
+
+ +
Out[11]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AddressCategoryIdXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_Thursday...Dates_monthDates_dayDates_hourDates_minuteDates_secondAwakeSummerFallWinterSpring
0OAK ST / LAGUNA STWARRANTSNaN-122.42589237.774599FalseFalseFalseFalseFalse...5132353011000
1OAK ST / LAGUNA STOTHER OFFENSESNaN-122.42589237.774599FalseFalseFalseFalseFalse...5132353011000
2VANNESS AV / GREENWICH STOTHER OFFENSESNaN-122.42436337.800414FalseFalseFalseFalseFalse...5132333011000
+

3 rows × 33 columns

+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

One hot encode address

+
+
+
+
+
+
In [12]:
+
+
+
# 누적값이 200개 이하인 경우는 'Others'로 바꾼다.
+address_counts = combi["Address"].value_counts()
+other_index = address_counts[address_counts < 200].index
+combi.loc[combi["Address"].isin(other_index), "Address"] = "Others"
+
+print("The number of address types = {address}".format(address=len(combi["Address"].value_counts())))
+print(combi.shape)
+combi.head()
+
+ +
+
+
+ +
+
+ + +
+
+
The number of address types = 1622
+(1762311, 33)
+
+
+
+ +
Out[12]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AddressCategoryIdXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_Thursday...Dates_monthDates_dayDates_hourDates_minuteDates_secondAwakeSummerFallWinterSpring
0OthersWARRANTSNaN-122.42589237.774599FalseFalseFalseFalseFalse...5132353011000
1OthersOTHER OFFENSESNaN-122.42589237.774599FalseFalseFalseFalseFalse...5132353011000
2OthersOTHER OFFENSESNaN-122.42436337.800414FalseFalseFalseFalseFalse...5132333011000
31500 Block of LOMBARD STLARCENY/THEFTNaN-122.42699537.800873FalseFalseFalseFalseFalse...5132330011000
4OthersLARCENY/THEFTNaN-122.43873837.771541FalseFalseFalseFalseFalse...5132330011000
+

5 rows × 33 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [13]:
+
+
+
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+
+label_encoder = LabelEncoder()
+one_hot_encoder = OneHotEncoder(dtype=np.bool)
+
+combi["Address(encode)"] = label_encoder.fit_transform(combi["Address"])
+address = one_hot_encoder.fit_transform(combi[["Address(encode)"]])
+
+print(address.shape)
+address
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 1622)
+
+
+
+ +
Out[13]:
+ + +
+
<1762311x1622 sparse matrix of type '<class 'numpy.bool_'>'
+	with 1762311 stored elements in Compressed Sparse Row format>
+
+ +
+ +
+
+ +
+
+
+
In [14]:
+
+
+
train_address = address[:len(train), :]
+test_address = address[len(train):, :]
+
+print("Train = {0}".format(train_address.shape))
+print("Test = {0}".format(test_address.shape))
+
+ +
+
+
+ +
+
+ + +
+
+
Train = (878049, 1622)
+Test = (884262, 1622)
+
+
+
+ +
+
+ +
+
+
+
+
+
+

Split to train / test dataset

+
+
+
+
+
+
In [15]:
+
+
+
combi.drop("Address", axis=1, inplace=True)
+
+print(combi.shape)
+combi.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(1762311, 33)
+
+
+
+ +
Out[15]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CategoryIdXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_Tuesday...Dates_dayDates_hourDates_minuteDates_secondAwakeSummerFallWinterSpringAddress(encode)
0WARRANTSNaN-122.42589237.774599FalseFalseFalseFalseFalseFalse...1323530110001525
1OTHER OFFENSESNaN-122.42589237.774599FalseFalseFalseFalseFalseFalse...1323530110001525
2OTHER OFFENSESNaN-122.42436337.800414FalseFalseFalseFalseFalseFalse...1323330110001525
+

3 rows × 33 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [16]:
+
+
+
train = combi[combi["Category"].notnull()]
+
+train.drop("Id", axis=1, inplace=True)
+
+print(train.shape)
+train.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(878049, 32)
+
+
+
+ +
+
+
/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
+A value is trying to be set on a copy of a slice from a DataFrame
+
+See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
+  app.launch_new_instance()
+
+
+
+ +
Out[16]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CategoryXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_TuesdayDayOfWeek_Wednesday...Dates_dayDates_hourDates_minuteDates_secondAwakeSummerFallWinterSpringAddress(encode)
0WARRANTS-122.42589237.774599FalseFalseFalseFalseFalseFalseTrue...1323530110001525
1OTHER OFFENSES-122.42589237.774599FalseFalseFalseFalseFalseFalseTrue...1323530110001525
2OTHER OFFENSES-122.42436337.800414FalseFalseFalseFalseFalseFalseTrue...1323330110001525
+

3 rows × 32 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [17]:
+
+
+
test = combi[combi["Category"].isnull()]
+
+test["Id"] = test["Id"].astype(np.int32)
+test.drop("Category", axis=1, inplace=True)
+
+test.set_index("Id", inplace=True)
+
+print(test.shape)
+test.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 31)
+
+
+
+ +
+
+
/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
+A value is trying to be set on a copy of a slice from a DataFrame.
+Try using .loc[row_indexer,col_indexer] = value instead
+
+See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
+  app.launch_new_instance()
+/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
+A value is trying to be set on a copy of a slice from a DataFrame
+
+See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
+
+
+
+ +
Out[17]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
XYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_TuesdayDayOfWeek_WednesdayPdDistrict_BAYVIEW...Dates_dayDates_hourDates_minuteDates_secondAwakeSummerFallWinterSpringAddress(encode)
Id
0-122.39958837.735051FalseFalseFalseTrueFalseFalseFalseTrue...1023590110001525
1-122.39152337.732432FalseFalseFalseTrueFalseFalseFalseTrue...102351011000870
2-122.42600237.792212FalseFalseFalseTrueFalseFalseFalseFalse...1023500110001525
+

3 rows × 31 columns

+
+
+ +
+ +
+
+ +
+
+
+
+
+
+

Score

+
+
+
+
+
+
In [18]:
+
+
+
label_name = "Category"
+feature_names = train.columns.difference([label_name])
+
+X_train = train[feature_names]
+
+print(X_train.shape)
+X_train.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(878049, 31)
+
+
+
+ +
Out[18]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Address(encode)AwakeDates_dayDates_hourDates_minuteDates_monthDates_secondDates_yearDayOfWeek_FridayDayOfWeek_Monday...PdDistrict_PARKPdDistrict_RICHMONDPdDistrict_SOUTHERNPdDistrict_TARAVALPdDistrict_TENDERLOINSpringSummerWinterXY
015251132353502015FalseFalse...FalseFalseFalseFalseFalse010-122.42589237.774599
115251132353502015FalseFalse...FalseFalseFalseFalseFalse010-122.42589237.774599
215251132333502015FalseFalse...FalseFalseFalseFalseFalse010-122.42436337.800414
+

3 rows × 31 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [19]:
+
+
+
from scipy.sparse import hstack
+
+X_train = hstack((X_train.values.astype(np.float32), train_address.astype(np.float32)))
+X_train
+
+ +
+
+
+ +
+
+ + +
Out[19]:
+ + +
+
<878049x1653 sparse matrix of type '<class 'numpy.float32'>'
+	with 10919216 stored elements in COOrdinate format>
+
+ +
+ +
+
+ +
+
+
+
In [20]:
+
+
+
y_train = train[label_name]
+
+print(y_train.shape)
+y_train.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(878049,)
+
+
+
+ +
Out[20]:
+ + +
+
0          WARRANTS
+1    OTHER OFFENSES
+2    OTHER OFFENSES
+Name: Category, dtype: object
+
+ +
+ +
+
+ +
+
+
+
+
+
+

Evaluate using Naive Bayes

+
+
+
+
+
+
In [21]:
+
+
+
from sklearn.naive_bayes import BernoulliNB
+from sklearn.cross_validation import cross_val_score, StratifiedKFold
+
+kfold = StratifiedKFold(y_train, n_folds=6)
+
+model = BernoulliNB()
+%time score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='log_loss').mean()
+score = -1.0 * score
+
+print("Use BernoulliNB. Score = {0:.6f}".format(score))
+
+ +
+
+
+ +
+
+ + +
+
+
CPU times: user 26.2 s, sys: 328 ms, total: 26.5 s
+Wall time: 26.5 s
+Use BernoulliNB. Score = 2.506260
+
+
+
+ +
+
+ +
+
+
+
+
+
+

Predict

+
+
+
+
+
+
In [22]:
+
+
+
X_test = test[feature_names]
+
+print(X_test.shape)
+X_test.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 31)
+
+
+
+ +
Out[22]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Address(encode)AwakeDates_dayDates_hourDates_minuteDates_monthDates_secondDates_yearDayOfWeek_FridayDayOfWeek_Monday...PdDistrict_PARKPdDistrict_RICHMONDPdDistrict_SOUTHERNPdDistrict_TARAVALPdDistrict_TENDERLOINSpringSummerWinterXY
Id
015251102359502015FalseFalse...FalseFalseFalseFalseFalse010-122.39958837.735051
18701102351502015FalseFalse...FalseFalseFalseFalseFalse010-122.39152337.732432
215251102350502015FalseFalse...FalseFalseFalseFalseFalse010-122.42600237.792212
+

3 rows × 31 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [23]:
+
+
+
X_test = hstack((X_test.values.astype(np.float32), test_address.astype(np.float32)))
+X_test
+
+ +
+
+
+ +
+
+ + +
Out[23]:
+ + +
+
<884262x1653 sparse matrix of type '<class 'numpy.float32'>'
+	with 10991940 stored elements in COOrdinate format>
+
+ +
+ +
+
+ +
+
+
+
In [24]:
+
+
+
from sklearn.naive_bayes import BernoulliNB
+from sklearn.cross_validation import cross_val_score, StratifiedKFold
+
+model = BernoulliNB()
+model.fit(X_train, y_train)
+
+prediction = model.predict_proba(X_test)
+
+print(prediction.shape)
+prediction[:1]
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 39)
+
+
+
+ +
Out[24]:
+ + +
+
array([[  2.67015496e-003,   1.19036835e-001,   1.41797670e-006,
+          2.25975272e-006,   4.06035829e-002,   8.41166681e-004,
+          2.47271562e-003,   2.06531950e-002,   1.99576325e-003,
+          8.99788805e-005,   3.34383792e-007,   1.77260520e-005,
+          3.28332745e-003,   5.14413535e-003,   4.63649041e-009,
+          1.88916649e-003,   1.02857938e-001,   4.42865274e-004,
+          8.46410933e-005,   2.97388917e-002,   5.94801550e-002,
+          2.28252074e-001,   5.23836996e-036,   7.76097826e-005,
+          8.92680945e-003,   3.61892343e-002,   3.15859366e-004,
+          1.93062581e-002,   2.27026757e-003,   2.38579445e-009,
+          3.16131055e-003,   2.81842185e-005,   4.59502391e-002,
+          8.63149800e-101,   3.36394307e-003,   7.92676555e-002,
+          1.32496504e-001,   3.18064857e-002,   1.72813063e-002]])
+
+ +
+ +
+
+ +
+
+
+
In [25]:
+
+
+
submission = pd.DataFrame(prediction, index=test.index, columns = sample.columns)
+submission = submission.reindex_axis(sorted(submission.columns), axis=1,)
+
+print(submission.shape)
+submission.head(3)
+
+ +
+
+
+ +
+
+ + +
+
+
(884262, 39)
+
+
+
+ +
Out[25]:
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ARSONASSAULTBAD CHECKSBRIBERYBURGLARYDISORDERLY CONDUCTDRIVING UNDER THE INFLUENCEDRUG/NARCOTICDRUNKENNESSEMBEZZLEMENT...SEX OFFENSES NON FORCIBLESTOLEN PROPERTYSUICIDESUSPICIOUS OCCTREATRESPASSVANDALISMVEHICLE THEFTWARRANTSWEAPON LAWS
Id
00.0026700.1190370.0000012.259753e-060.0406040.0008410.0024730.0206530.0019960.000090...2.385794e-090.0031610.0000280.0459508.631498e-1010.0033640.0792680.1324970.0318060.017281
10.0033390.1481340.0000133.174429e-050.0021750.0012450.0032850.0482590.0027180.000335...6.129728e-080.0025290.0001120.0346503.235358e-980.0056720.0067920.0270230.1020570.032735
20.0007260.0853820.0000025.634417e-070.0558580.0015970.0032430.0180140.0027080.000105...8.181694e-100.0052230.0000470.0326932.501654e-1010.0036650.0696250.0984350.0296440.006407
+

3 rows × 39 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [26]:
+
+
+
from datetime import datetime
+
+current_time = datetime.now()
+current_time = current_time.strftime("%Y%m%d%H%M%S")
+
+description = "one-hot-encode-address"
+csv_filename = "../submissions/" + current_time + "_" + description + ".csv"
+
+submission.to_csv(csv_filename)
+
+ +
+
+
+ +
+
+
+
In [27]:
+
+
+
import gzip
+
+gzip_filename = csv_filename + ".gz"
+
+f_in = open(csv_filename, "rb")
+
+f_out = gzip.open(gzip_filename, 'wb')
+f_out.writelines(f_in)
+f_out.close()
+
+f_in.close()
+
+ +
+
+
+ +
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+
+
+ + diff --git a/delivers/baseline-script.ipynb b/delivers/baseline-script.ipynb index 3f83c5b..4d2d43f 100644 --- a/delivers/baseline-script.ipynb +++ b/delivers/baseline-script.ipynb @@ -1224,7 +1224,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Split to train / test dataset" + "### One hot encode address" ] }, { @@ -1238,7 +1238,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "(1762311, 32)\n" + "The number of address types = 1622\n", + "(1762311, 33)\n" ] }, { @@ -1249,6 +1250,7 @@ " \n", " \n", " \n", + " Address\n", " Category\n", " Id\n", " X\n", @@ -1258,7 +1260,6 @@ " DayOfWeek_Saturday\n", " DayOfWeek_Sunday\n", " DayOfWeek_Thursday\n", - " DayOfWeek_Tuesday\n", " ...\n", " Dates_month\n", " Dates_day\n", @@ -1275,6 +1276,7 @@ " \n", " \n", " 0\n", + " Others\n", " WARRANTS\n", " NaN\n", " -122.425892\n", @@ -1284,7 +1286,6 @@ " False\n", " False\n", " False\n", - " False\n", " ...\n", " 5\n", " 13\n", @@ -1299,6 +1300,7 @@ " \n", " \n", " 1\n", + " Others\n", " OTHER OFFENSES\n", " NaN\n", " -122.425892\n", @@ -1308,7 +1310,6 @@ " False\n", " False\n", " False\n", - " False\n", " ...\n", " 5\n", " 13\n", @@ -1323,6 +1324,7 @@ " \n", " \n", " 2\n", + " Others\n", " OTHER OFFENSES\n", " NaN\n", " -122.424363\n", @@ -1332,11 +1334,298 @@ " False\n", " False\n", " False\n", + " ...\n", + " 5\n", + " 13\n", + " 23\n", + " 33\n", + " 0\n", + " 1\n", + " 1\n", + " 0\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 3\n", + " 1500 Block of LOMBARD ST\n", + " LARCENY/THEFT\n", + " NaN\n", + " -122.426995\n", + " 37.800873\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " ...\n", + " 5\n", + " 13\n", + " 23\n", + " 30\n", + " 0\n", + " 1\n", + " 1\n", + " 0\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 4\n", + " Others\n", + " LARCENY/THEFT\n", + " NaN\n", + " -122.438738\n", + " 37.771541\n", + " False\n", + " False\n", + " False\n", + " False\n", " False\n", " ...\n", " 5\n", " 13\n", " 23\n", + " 30\n", + " 0\n", + " 1\n", + " 1\n", + " 0\n", + " 0\n", + " 0\n", + " \n", + " \n", + "\n", + "

5 rows × 33 columns

\n", + "" + ], + "text/plain": [ + " Address Category Id X Y \\\n", + "0 Others WARRANTS NaN -122.425892 37.774599 \n", + "1 Others OTHER OFFENSES NaN -122.425892 37.774599 \n", + "2 Others OTHER OFFENSES NaN -122.424363 37.800414 \n", + "3 1500 Block of LOMBARD ST LARCENY/THEFT NaN -122.426995 37.800873 \n", + "4 Others LARCENY/THEFT NaN -122.438738 37.771541 \n", + "\n", + " DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday DayOfWeek_Sunday \\\n", + "0 False False False False \n", + "1 False False False False \n", + "2 False False False False \n", + "3 False False False False \n", + "4 False False False False \n", + "\n", + " DayOfWeek_Thursday ... Dates_month Dates_day Dates_hour Dates_minute \\\n", + "0 False ... 5 13 23 53 \n", + "1 False ... 5 13 23 53 \n", + "2 False ... 5 13 23 33 \n", + "3 False ... 5 13 23 30 \n", + "4 False ... 5 13 23 30 \n", + "\n", + " Dates_second Awake Summer Fall Winter Spring \n", + "0 0 1 1 0 0 0 \n", + "1 0 1 1 0 0 0 \n", + "2 0 1 1 0 0 0 \n", + "3 0 1 1 0 0 0 \n", + "4 0 1 1 0 0 0 \n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 누적값이 200개 이하인 경우는 'Others'로 바꾼다.\n", + "address_counts = combi[\"Address\"].value_counts()\n", + "other_index = address_counts[address_counts < 200].index\n", + "combi.loc[combi[\"Address\"].isin(other_index), \"Address\"] = \"Others\"\n", + "\n", + "print(\"The number of address types = {address}\".format(address=len(combi[\"Address\"].value_counts())))\n", + "print(combi.shape)\n", + "combi.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1762311, 1622)\n" + ] + }, + { + "data": { + "text/plain": [ + "<1762311x1622 sparse matrix of type ''\n", + "\twith 1762311 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n", + "\n", + "label_encoder = LabelEncoder()\n", + "one_hot_encoder = OneHotEncoder(dtype=np.bool)\n", + "\n", + "combi[\"Address(encode)\"] = label_encoder.fit_transform(combi[\"Address\"])\n", + "address = one_hot_encoder.fit_transform(combi[[\"Address(encode)\"]])\n", + "\n", + "print(address.shape)\n", + "address" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train = (878049, 1622)\n", + "Test = (884262, 1622)\n" + ] + } + ], + "source": [ + "train_address = address[:len(train), :]\n", + "test_address = address[len(train):, :]\n", + "\n", + "print(\"Train = {0}\".format(train_address.shape))\n", + "print(\"Test = {0}\".format(test_address.shape))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split to train / test dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1762311, 33)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1344,10 +1633,11 @@ " \n", " \n", " \n", + " \n", " \n", " \n", "
CategoryIdXYDayOfWeek_FridayDayOfWeek_MondayDayOfWeek_SaturdayDayOfWeek_SundayDayOfWeek_ThursdayDayOfWeek_Tuesday...Dates_dayDates_hourDates_minuteDates_secondAwakeSummerFallWinterSpringAddress(encode)
0WARRANTSNaN-122.42589237.774599FalseFalseFalseFalseFalseFalse...1323530110001525
1OTHER OFFENSESNaN-122.42589237.774599FalseFalseFalseFalseFalseFalse...1323530110001525
2OTHER OFFENSESNaN-122.42436337.800414FalseFalseFalseFalseFalseFalse...132333010001525
\n", - "

3 rows × 32 columns

\n", + "

3 rows × 33 columns

\n", "
" ], "text/plain": [ @@ -1361,20 +1651,20 @@ "1 False False False False \n", "2 False False False False \n", "\n", - " DayOfWeek_Tuesday ... Dates_month Dates_day Dates_hour Dates_minute \\\n", - "0 False ... 5 13 23 53 \n", - "1 False ... 5 13 23 53 \n", - "2 False ... 5 13 23 33 \n", + " DayOfWeek_Tuesday ... Dates_day Dates_hour Dates_minute \\\n", + "0 False ... 13 23 53 \n", + "1 False ... 13 23 53 \n", + "2 False ... 13 23 33 \n", "\n", - " Dates_second Awake Summer Fall Winter Spring \n", - "0 0 1 1 0 0 0 \n", - "1 0 1 1 0 0 0 \n", - "2 0 1 1 0 0 0 \n", + " Dates_second Awake Summer Fall Winter Spring Address(encode) \n", + "0 0 1 1 0 0 0 1525 \n", + "1 0 1 1 0 0 0 1525 \n", + "2 0 1 1 0 0 0 1525 \n", "\n", - "[3 rows x 32 columns]" + "[3 rows x 33 columns]" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1388,7 +1678,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": { "collapsed": false }, @@ -1397,14 +1687,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "(878049, 31)\n" + "(878049, 32)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", @@ -1430,7 +1720,6 @@ " DayOfWeek_Tuesday\n", " DayOfWeek_Wednesday\n", " ...\n", - " Dates_month\n", " Dates_day\n", " Dates_hour\n", " Dates_minute\n", @@ -1440,6 +1729,7 @@ " Fall\n", " Winter\n", " Spring\n", + " Address(encode)\n", " \n", " \n", " \n", @@ -1456,7 +1746,6 @@ " False\n", " True\n", " ...\n", - " 5\n", " 13\n", " 23\n", " 53\n", @@ -1466,6 +1755,7 @@ " 0\n", " 0\n", " 0\n", + " 1525\n", " \n", " \n", " 1\n", @@ -1480,7 +1770,6 @@ " False\n", " True\n", " ...\n", - " 5\n", " 13\n", " 23\n", " 53\n", @@ -1490,6 +1779,7 @@ " 0\n", " 0\n", " 0\n", + " 1525\n", " \n", " \n", " 2\n", @@ -1504,7 +1794,6 @@ " False\n", " True\n", " ...\n", - " 5\n", " 13\n", " 23\n", " 33\n", @@ -1514,10 +1803,11 @@ " 0\n", " 0\n", " 0\n", + " 1525\n", " \n", " \n", "\n", - "

3 rows × 31 columns

\n", + "

3 rows × 32 columns

\n", "" ], "text/plain": [ @@ -1531,20 +1821,20 @@ "1 False False False False \n", "2 False False False False \n", "\n", - " DayOfWeek_Wednesday ... Dates_month Dates_day Dates_hour Dates_minute \\\n", - "0 True ... 5 13 23 53 \n", - "1 True ... 5 13 23 53 \n", - "2 True ... 5 13 23 33 \n", + " DayOfWeek_Wednesday ... Dates_day Dates_hour Dates_minute \\\n", + "0 True ... 13 23 53 \n", + "1 True ... 13 23 53 \n", + "2 True ... 13 23 33 \n", "\n", - " Dates_second Awake Summer Fall Winter Spring \n", - "0 0 1 1 0 0 0 \n", - "1 0 1 1 0 0 0 \n", - "2 0 1 1 0 0 0 \n", + " Dates_second Awake Summer Fall Winter Spring Address(encode) \n", + "0 0 1 1 0 0 0 1525 \n", + "1 0 1 1 0 0 0 1525 \n", + "2 0 1 1 0 0 0 1525 \n", "\n", - "[3 rows x 31 columns]" + "[3 rows x 32 columns]" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1560,7 +1850,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -1569,20 +1859,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "(884262, 30)\n" + "(884262, 31)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " app.launch_new_instance()\n", - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: \n", + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" @@ -1607,7 +1897,6 @@ " DayOfWeek_Wednesday\n", " PdDistrict_BAYVIEW\n", " ...\n", - " Dates_month\n", " Dates_day\n", " Dates_hour\n", " Dates_minute\n", @@ -1617,6 +1906,7 @@ " Fall\n", " Winter\n", " Spring\n", + " Address(encode)\n", " \n", " \n", " Id\n", @@ -1657,7 +1947,6 @@ " False\n", " True\n", " ...\n", - " 5\n", " 10\n", " 23\n", " 59\n", @@ -1667,6 +1956,7 @@ " 0\n", " 0\n", " 0\n", + " 1525\n", " \n", " \n", " 1\n", @@ -1681,7 +1971,6 @@ " False\n", " True\n", " ...\n", - " 5\n", " 10\n", " 23\n", " 51\n", @@ -1691,6 +1980,7 @@ " 0\n", " 0\n", " 0\n", + " 870\n", " \n", " \n", " 2\n", @@ -1705,7 +1995,6 @@ " False\n", " False\n", " ...\n", - " 5\n", " 10\n", " 23\n", " 50\n", @@ -1715,10 +2004,11 @@ " 0\n", " 0\n", " 0\n", + " 1525\n", " \n", " \n", "\n", - "

3 rows × 30 columns

\n", + "

3 rows × 31 columns

\n", "" ], "text/plain": [ @@ -1734,22 +2024,28 @@ "1 False True False False \n", "2 False True False False \n", "\n", - " DayOfWeek_Wednesday PdDistrict_BAYVIEW ... Dates_month Dates_day \\\n", - "Id ... \n", - "0 False True ... 5 10 \n", - "1 False True ... 5 10 \n", - "2 False False ... 5 10 \n", + " DayOfWeek_Wednesday PdDistrict_BAYVIEW ... Dates_day \\\n", + "Id ... \n", + "0 False True ... 10 \n", + "1 False True ... 10 \n", + "2 False False ... 10 \n", "\n", - " Dates_hour Dates_minute Dates_second Awake Summer Fall Winter Spring \n", + " Dates_hour Dates_minute Dates_second Awake Summer Fall Winter Spring \\\n", "Id \n", - "0 23 59 0 1 1 0 0 0 \n", - "1 23 51 0 1 1 0 0 0 \n", - "2 23 50 0 1 1 0 0 0 \n", + "0 23 59 0 1 1 0 0 0 \n", + "1 23 51 0 1 1 0 0 0 \n", + "2 23 50 0 1 1 0 0 0 \n", "\n", - "[3 rows x 30 columns]" + " Address(encode) \n", + "Id \n", + "0 1525 \n", + "1 870 \n", + "2 1525 \n", + "\n", + "[3 rows x 31 columns]" ] }, - "execution_count": 14, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1775,7 +2071,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": { "collapsed": false }, @@ -1784,7 +2080,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(878049, 30)\n" + "(878049, 31)\n" ] }, { @@ -1795,6 +2091,7 @@ " \n", " \n", " \n", + " Address(encode)\n", " Awake\n", " Dates_day\n", " Dates_hour\n", @@ -1804,7 +2101,6 @@ " Dates_year\n", " DayOfWeek_Friday\n", " DayOfWeek_Monday\n", - " DayOfWeek_Saturday\n", " ...\n", " PdDistrict_PARK\n", " PdDistrict_RICHMOND\n", @@ -1821,6 +2117,7 @@ " \n", " \n", " 0\n", + " 1525\n", " 1\n", " 13\n", " 23\n", @@ -1830,7 +2127,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -1845,6 +2141,7 @@ " \n", " \n", " 1\n", + " 1525\n", " 1\n", " 13\n", " 23\n", @@ -1854,7 +2151,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -1869,6 +2165,7 @@ " \n", " \n", " 2\n", + " 1525\n", " 1\n", " 13\n", " 23\n", @@ -1878,7 +2175,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -1893,34 +2189,34 @@ " \n", " \n", "\n", - "

3 rows × 30 columns

\n", + "

3 rows × 31 columns

\n", "" ], "text/plain": [ - " Awake Dates_day Dates_hour Dates_minute Dates_month Dates_second \\\n", - "0 1 13 23 53 5 0 \n", - "1 1 13 23 53 5 0 \n", - "2 1 13 23 33 5 0 \n", + " Address(encode) Awake Dates_day Dates_hour Dates_minute Dates_month \\\n", + "0 1525 1 13 23 53 5 \n", + "1 1525 1 13 23 53 5 \n", + "2 1525 1 13 23 33 5 \n", "\n", - " Dates_year DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday ... \\\n", - "0 2015 False False False ... \n", - "1 2015 False False False ... \n", - "2 2015 False False False ... \n", + " Dates_second Dates_year DayOfWeek_Friday DayOfWeek_Monday ... \\\n", + "0 0 2015 False False ... \n", + "1 0 2015 False False ... \n", + "2 0 2015 False False ... \n", "\n", " PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN PdDistrict_TARAVAL \\\n", "0 False False False False \n", "1 False False False False \n", "2 False False False False \n", "\n", - " PdDistrict_TENDERLOIN Spring Summer Winter X Y \n", - "0 False 0 1 0 -122.425892 37.774599 \n", - "1 False 0 1 0 -122.425892 37.774599 \n", - "2 False 0 1 0 -122.424363 37.800414 \n", + " PdDistrict_TENDERLOIN Spring Summer Winter X Y \n", + "0 False 0 1 0 -122.425892 37.774599 \n", + "1 False 0 1 0 -122.425892 37.774599 \n", + "2 False 0 1 0 -122.424363 37.800414 \n", "\n", - "[3 rows x 30 columns]" + "[3 rows x 31 columns]" ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1937,7 +2233,33 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "<878049x1653 sparse matrix of type ''\n", + "\twith 10919216 stored elements in COOrdinate format>" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scipy.sparse import hstack\n", + "\n", + "X_train = hstack((X_train.values.astype(np.float32), train_address.astype(np.float32)))\n", + "X_train" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": { "collapsed": false }, @@ -1958,7 +2280,7 @@ "Name: Category, dtype: object" ] }, - "execution_count": 16, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1979,7 +2301,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 21, "metadata": { "collapsed": false }, @@ -1988,9 +2310,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 55.2 s, sys: 5.56 s, total: 1min\n", - "Wall time: 1min\n", - "Use BernoulliNB. Score = 2.562140\n" + "CPU times: user 26.2 s, sys: 328 ms, total: 26.5 s\n", + "Wall time: 26.5 s\n", + "Use BernoulliNB. Score = 2.506260\n" ] } ], @@ -2016,7 +2338,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 22, "metadata": { "collapsed": false }, @@ -2025,7 +2347,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(884262, 30)\n" + "(884262, 31)\n" ] }, { @@ -2036,6 +2358,7 @@ " \n", " \n", " \n", + " Address(encode)\n", " Awake\n", " Dates_day\n", " Dates_hour\n", @@ -2045,7 +2368,6 @@ " Dates_year\n", " DayOfWeek_Friday\n", " DayOfWeek_Monday\n", - " DayOfWeek_Saturday\n", " ...\n", " PdDistrict_PARK\n", " PdDistrict_RICHMOND\n", @@ -2086,6 +2408,7 @@ " \n", " \n", " 0\n", + " 1525\n", " 1\n", " 10\n", " 23\n", @@ -2095,7 +2418,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -2110,6 +2432,7 @@ " \n", " \n", " 1\n", + " 870\n", " 1\n", " 10\n", " 23\n", @@ -2119,7 +2442,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -2134,6 +2456,7 @@ " \n", " \n", " 2\n", + " 1525\n", " 1\n", " 10\n", " 23\n", @@ -2143,7 +2466,6 @@ " 2015\n", " False\n", " False\n", - " False\n", " ...\n", " False\n", " False\n", @@ -2158,44 +2480,38 @@ " \n", " \n", "\n", - "

3 rows × 30 columns

\n", + "

3 rows × 31 columns

\n", "" ], "text/plain": [ - " Awake Dates_day Dates_hour Dates_minute Dates_month Dates_second \\\n", - "Id \n", - "0 1 10 23 59 5 0 \n", - "1 1 10 23 51 5 0 \n", - "2 1 10 23 50 5 0 \n", + " Address(encode) Awake Dates_day Dates_hour Dates_minute Dates_month \\\n", + "Id \n", + "0 1525 1 10 23 59 5 \n", + "1 870 1 10 23 51 5 \n", + "2 1525 1 10 23 50 5 \n", "\n", - " Dates_year DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday \\\n", - "Id \n", - "0 2015 False False False \n", - "1 2015 False False False \n", - "2 2015 False False False \n", + " Dates_second Dates_year DayOfWeek_Friday DayOfWeek_Monday ... \\\n", + "Id ... \n", + "0 0 2015 False False ... \n", + "1 0 2015 False False ... \n", + "2 0 2015 False False ... \n", "\n", - " ... PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN \\\n", - "Id ... \n", - "0 ... False False False \n", - "1 ... False False False \n", - "2 ... False False False \n", - "\n", - " PdDistrict_TARAVAL PdDistrict_TENDERLOIN Spring Summer Winter X \\\n", + " PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN PdDistrict_TARAVAL \\\n", "Id \n", - "0 False False 0 1 0 -122.399588 \n", - "1 False False 0 1 0 -122.391523 \n", - "2 False False 0 1 0 -122.426002 \n", + "0 False False False False \n", + "1 False False False False \n", + "2 False False False False \n", "\n", - " Y \n", - "Id \n", - "0 37.735051 \n", - "1 37.732432 \n", - "2 37.792212 \n", + " PdDistrict_TENDERLOIN Spring Summer Winter X Y \n", + "Id \n", + "0 False 0 1 0 -122.399588 37.735051 \n", + "1 False 0 1 0 -122.391523 37.732432 \n", + "2 False 0 1 0 -122.426002 37.792212 \n", "\n", - "[3 rows x 30 columns]" + "[3 rows x 31 columns]" ] }, - "execution_count": 18, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2209,7 +2525,31 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "<884262x1653 sparse matrix of type ''\n", + "\twith 10991940 stored elements in COOrdinate format>" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test = hstack((X_test.values.astype(np.float32), test_address.astype(np.float32)))\n", + "X_test" + ] + }, + { + "cell_type": "code", + "execution_count": 24, "metadata": { "collapsed": false }, @@ -2224,22 +2564,22 @@ { "data": { "text/plain": [ - "array([[ 5.18022914e-03, 1.47306845e-01, 8.35309027e-05,\n", - " 7.15327047e-04, 2.89343463e-02, 2.17660902e-03,\n", - " 2.90469652e-03, 5.20272506e-02, 4.75338148e-03,\n", - " 4.53910576e-04, 1.50404661e-04, 6.33194320e-04,\n", - " 4.46497596e-03, 5.83197476e-03, 3.21888082e-04,\n", - " 3.66372241e-03, 9.46729466e-02, 1.63443121e-03,\n", - " 6.10377668e-04, 4.41001971e-02, 6.88112197e-02,\n", - " 2.06289853e-01, 6.30319698e-06, 3.51815041e-04,\n", - " 8.06585264e-03, 3.64767735e-02, 2.11525827e-03,\n", - " 2.31851049e-02, 3.90020973e-03, 1.39053149e-04,\n", - " 4.88590719e-03, 4.72963357e-04, 4.44303697e-02,\n", - " 8.03404588e-07, 7.22208250e-03, 5.50342776e-02,\n", - " 5.65854147e-02, 5.66664063e-02, 2.47400928e-02]])" + "array([[ 2.67015496e-003, 1.19036835e-001, 1.41797670e-006,\n", + " 2.25975272e-006, 4.06035829e-002, 8.41166681e-004,\n", + " 2.47271562e-003, 2.06531950e-002, 1.99576325e-003,\n", + " 8.99788805e-005, 3.34383792e-007, 1.77260520e-005,\n", + " 3.28332745e-003, 5.14413535e-003, 4.63649041e-009,\n", + " 1.88916649e-003, 1.02857938e-001, 4.42865274e-004,\n", + " 8.46410933e-005, 2.97388917e-002, 5.94801550e-002,\n", + " 2.28252074e-001, 5.23836996e-036, 7.76097826e-005,\n", + " 8.92680945e-003, 3.61892343e-002, 3.15859366e-004,\n", + " 1.93062581e-002, 2.27026757e-003, 2.38579445e-009,\n", + " 3.16131055e-003, 2.81842185e-005, 4.59502391e-002,\n", + " 8.63149800e-101, 3.36394307e-003, 7.92676555e-002,\n", + " 1.32496504e-001, 3.18064857e-002, 1.72813063e-002]])" ] }, - "execution_count": 19, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2259,7 +2599,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 25, "metadata": { "collapsed": false }, @@ -2329,75 +2669,75 @@ " \n", " \n", " 0\n", - " 0.005180\n", - " 0.147307\n", - " 0.000084\n", - " 0.000715\n", - " 0.028934\n", - " 0.002177\n", - " 0.002905\n", - " 0.052027\n", - " 0.004753\n", - " 0.000454\n", + " 0.002670\n", + " 0.119037\n", + " 0.000001\n", + " 2.259753e-06\n", + " 0.040604\n", + " 0.000841\n", + " 0.002473\n", + " 0.020653\n", + " 0.001996\n", + " 0.000090\n", " ...\n", - " 0.000139\n", - " 0.004886\n", - " 0.000473\n", - " 0.044430\n", - " 8.034046e-07\n", - " 0.007222\n", - " 0.055034\n", - " 0.056585\n", - " 0.056666\n", - " 0.024740\n", + " 2.385794e-09\n", + " 0.003161\n", + " 0.000028\n", + " 0.045950\n", + " 8.631498e-101\n", + " 0.003364\n", + " 0.079268\n", + " 0.132497\n", + " 0.031806\n", + " 0.017281\n", " \n", " \n", " 1\n", - " 0.005180\n", - " 0.147307\n", - " 0.000084\n", - " 0.000715\n", - " 0.028934\n", - " 0.002177\n", - " 0.002905\n", - " 0.052027\n", - " 0.004753\n", - " 0.000454\n", + " 0.003339\n", + " 0.148134\n", + " 0.000013\n", + " 3.174429e-05\n", + " 0.002175\n", + " 0.001245\n", + " 0.003285\n", + " 0.048259\n", + " 0.002718\n", + " 0.000335\n", " ...\n", - " 0.000139\n", - " 0.004886\n", - " 0.000473\n", - " 0.044430\n", - " 8.034046e-07\n", - " 0.007222\n", - " 0.055034\n", - " 0.056585\n", - " 0.056666\n", - " 0.024740\n", + " 6.129728e-08\n", + " 0.002529\n", + " 0.000112\n", + " 0.034650\n", + " 3.235358e-98\n", + " 0.005672\n", + " 0.006792\n", + " 0.027023\n", + " 0.102057\n", + " 0.032735\n", " \n", " \n", " 2\n", - " 0.001412\n", - " 0.105913\n", - " 0.000121\n", - " 0.000179\n", - " 0.039900\n", - " 0.004142\n", - " 0.003819\n", - " 0.045489\n", - " 0.006465\n", - " 0.000533\n", + " 0.000726\n", + " 0.085382\n", + " 0.000002\n", + " 5.634417e-07\n", + " 0.055858\n", + " 0.001597\n", + " 0.003243\n", + " 0.018014\n", + " 0.002708\n", + " 0.000105\n", " ...\n", - " 0.000048\n", - " 0.008092\n", - " 0.000788\n", - " 0.031688\n", - " 2.334092e-07\n", - " 0.007887\n", - " 0.048455\n", - " 0.042140\n", - " 0.052941\n", - " 0.009195\n", + " 8.181694e-10\n", + " 0.005223\n", + " 0.000047\n", + " 0.032693\n", + " 2.501654e-101\n", + " 0.003665\n", + " 0.069625\n", + " 0.098435\n", + " 0.029644\n", + " 0.006407\n", " \n", " \n", "\n", @@ -2405,46 +2745,46 @@ "" ], "text/plain": [ - " ARSON ASSAULT BAD CHECKS BRIBERY BURGLARY DISORDERLY CONDUCT \\\n", - "Id \n", - "0 0.005180 0.147307 0.000084 0.000715 0.028934 0.002177 \n", - "1 0.005180 0.147307 0.000084 0.000715 0.028934 0.002177 \n", - "2 0.001412 0.105913 0.000121 0.000179 0.039900 0.004142 \n", + " ARSON ASSAULT BAD CHECKS BRIBERY BURGLARY \\\n", + "Id \n", + "0 0.002670 0.119037 0.000001 2.259753e-06 0.040604 \n", + "1 0.003339 0.148134 0.000013 3.174429e-05 0.002175 \n", + "2 0.000726 0.085382 0.000002 5.634417e-07 0.055858 \n", "\n", - " DRIVING UNDER THE INFLUENCE DRUG/NARCOTIC DRUNKENNESS EMBEZZLEMENT \\\n", - "Id \n", - "0 0.002905 0.052027 0.004753 0.000454 \n", - "1 0.002905 0.052027 0.004753 0.000454 \n", - "2 0.003819 0.045489 0.006465 0.000533 \n", + " DISORDERLY CONDUCT DRIVING UNDER THE INFLUENCE DRUG/NARCOTIC \\\n", + "Id \n", + "0 0.000841 0.002473 0.020653 \n", + "1 0.001245 0.003285 0.048259 \n", + "2 0.001597 0.003243 0.018014 \n", "\n", - " ... SEX OFFENSES NON FORCIBLE STOLEN PROPERTY SUICIDE \\\n", - "Id ... \n", - "0 ... 0.000139 0.004886 0.000473 \n", - "1 ... 0.000139 0.004886 0.000473 \n", - "2 ... 0.000048 0.008092 0.000788 \n", + " DRUNKENNESS EMBEZZLEMENT ... SEX OFFENSES NON FORCIBLE \\\n", + "Id ... \n", + "0 0.001996 0.000090 ... 2.385794e-09 \n", + "1 0.002718 0.000335 ... 6.129728e-08 \n", + "2 0.002708 0.000105 ... 8.181694e-10 \n", "\n", - " SUSPICIOUS OCC TREA TRESPASS VANDALISM VEHICLE THEFT \\\n", - "Id \n", - "0 0.044430 8.034046e-07 0.007222 0.055034 0.056585 \n", - "1 0.044430 8.034046e-07 0.007222 0.055034 0.056585 \n", - "2 0.031688 2.334092e-07 0.007887 0.048455 0.042140 \n", + " STOLEN PROPERTY SUICIDE SUSPICIOUS OCC TREA TRESPASS \\\n", + "Id \n", + "0 0.003161 0.000028 0.045950 8.631498e-101 0.003364 \n", + "1 0.002529 0.000112 0.034650 3.235358e-98 0.005672 \n", + "2 0.005223 0.000047 0.032693 2.501654e-101 0.003665 \n", "\n", - " WARRANTS WEAPON LAWS \n", - "Id \n", - "0 0.056666 0.024740 \n", - "1 0.056666 0.024740 \n", - "2 0.052941 0.009195 \n", + " VANDALISM VEHICLE THEFT WARRANTS WEAPON LAWS \n", + "Id \n", + "0 0.079268 0.132497 0.031806 0.017281 \n", + "1 0.006792 0.027023 0.102057 0.032735 \n", + "2 0.069625 0.098435 0.029644 0.006407 \n", "\n", "[3 rows x 39 columns]" ] }, - "execution_count": 20, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "submission = pd.DataFrame(prediction, index=X_test.index, columns = sample.columns)\n", + "submission = pd.DataFrame(prediction, index=test.index, columns = sample.columns)\n", "submission = submission.reindex_axis(sorted(submission.columns), axis=1,)\n", "\n", "print(submission.shape)\n", @@ -2453,7 +2793,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 26, "metadata": { "collapsed": false }, @@ -2464,14 +2804,15 @@ "current_time = datetime.now()\n", "current_time = current_time.strftime(\"%Y%m%d%H%M%S\")\n", "\n", - "csv_filename = \"../submissions/\" + current_time + \"_\" + \"baseline_script.csv\"\n", + "description = \"one-hot-encode-address\"\n", + "csv_filename = \"../submissions/\" + current_time + \"_\" + description + \".csv\"\n", "\n", "submission.to_csv(csv_filename)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 27, "metadata": { "collapsed": false }, @@ -2489,26 +2830,35 @@ "\n", "f_in.close()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python [Root]", + "display_name": "Python 3", "language": "python", - "name": "Python [Root]" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/delivers/baseline-script.py b/delivers/baseline-script.py index 3fc7b34..218ac04 100644 --- a/delivers/baseline-script.py +++ b/delivers/baseline-script.py @@ -128,17 +128,54 @@ def get_season(x): combi.head(3) -# ### Split to train / test dataset +# ### One hot encode address # In[12]: +# 누적값이 200개 이하인 경우는 'Others'로 바꾼다. +address_counts = combi["Address"].value_counts() +other_index = address_counts[address_counts < 200].index +combi.loc[combi["Address"].isin(other_index), "Address"] = "Others" + +print("The number of address types = {address}".format(address=len(combi["Address"].value_counts()))) +print(combi.shape) +combi.head() + + +# In[13]: + +from sklearn.preprocessing import LabelEncoder, OneHotEncoder + +label_encoder = LabelEncoder() +one_hot_encoder = OneHotEncoder(dtype=np.bool) + +combi["Address(encode)"] = label_encoder.fit_transform(combi["Address"]) +address = one_hot_encoder.fit_transform(combi[["Address(encode)"]]) + +print(address.shape) +address + + +# In[14]: + +train_address = address[:len(train), :] +test_address = address[len(train):, :] + +print("Train = {0}".format(train_address.shape)) +print("Test = {0}".format(test_address.shape)) + + +# ### Split to train / test dataset + +# In[15]: + combi.drop("Address", axis=1, inplace=True) print(combi.shape) combi.head(3) -# In[13]: +# In[16]: train = combi[combi["Category"].notnull()] @@ -148,7 +185,7 @@ def get_season(x): train.head(3) -# In[14]: +# In[17]: test = combi[combi["Category"].isnull()] @@ -163,7 +200,7 @@ def get_season(x): # ## Score -# In[15]: +# In[18]: label_name = "Category" feature_names = train.columns.difference([label_name]) @@ -174,7 +211,15 @@ def get_season(x): X_train.head(3) -# In[16]: +# In[19]: + +from scipy.sparse import hstack + +X_train = hstack((X_train.values.astype(np.float32), train_address.astype(np.float32))) +X_train + + +# In[20]: y_train = train[label_name] @@ -184,7 +229,7 @@ def get_season(x): # ### Evaluate using Naive Bayes -# In[17]: +# In[21]: from sklearn.naive_bayes import BernoulliNB from sklearn.cross_validation import cross_val_score, StratifiedKFold @@ -192,7 +237,7 @@ def get_season(x): kfold = StratifiedKFold(y_train, n_folds=6) model = BernoulliNB() -get_ipython().magic(u"time score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='log_loss').mean()") +get_ipython().magic("time score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='log_loss').mean()") score = -1.0 * score print("Use BernoulliNB. Score = {0:.6f}".format(score)) @@ -200,7 +245,7 @@ def get_season(x): # ## Predict -# In[18]: +# In[22]: X_test = test[feature_names] @@ -208,7 +253,13 @@ def get_season(x): X_test.head(3) -# In[19]: +# In[23]: + +X_test = hstack((X_test.values.astype(np.float32), test_address.astype(np.float32))) +X_test + + +# In[24]: from sklearn.naive_bayes import BernoulliNB from sklearn.cross_validation import cross_val_score, StratifiedKFold @@ -222,28 +273,29 @@ def get_season(x): prediction[:1] -# In[20]: +# In[25]: -submission = pd.DataFrame(prediction, index=X_test.index, columns = sample.columns) +submission = pd.DataFrame(prediction, index=test.index, columns = sample.columns) submission = submission.reindex_axis(sorted(submission.columns), axis=1,) print(submission.shape) submission.head(3) -# In[21]: +# In[26]: from datetime import datetime current_time = datetime.now() current_time = current_time.strftime("%Y%m%d%H%M%S") -csv_filename = "../submissions/" + current_time + "_" + "baseline_script.csv" +description = "one-hot-encode-address" +csv_filename = "../submissions/" + current_time + "_" + description + ".csv" submission.to_csv(csv_filename) -# In[22]: +# In[27]: import gzip @@ -257,3 +309,8 @@ def get_season(x): f_in.close() + +# In[ ]: + + +