diff --git a/delivers/baseline-script.html b/delivers/baseline-script.html new file mode 100644 index 0000000..395f971 --- /dev/null +++ b/delivers/baseline-script.html @@ -0,0 +1,14409 @@ + + +
+Baseline script. Hope this helps.
+ +import numpy as np
+import pandas as pd
# Load the training set, parsing the "Dates" column as datetimes, and discard
# the two columns this script does not use.
train = pd.read_csv("../data/train.csv", parse_dates=["Dates"])
train = train.drop(["Resolution", "Descript"], axis=1)

print(train.shape)
train.head(3)

# Load the test set with the same datetime parsing.
test = pd.read_csv("../data/test.csv", parse_dates=["Dates"])

print(test.shape)
test.head(3)

# The sample submission is indexed by "Id"; its columns are reused later as
# the column labels of the prediction frame.
sample = pd.read_csv("../data/sampleSubmission.csv", index_col="Id")

print(sample.shape)
sample.head(3)

# Stack train on top of test so that every feature transformation below is
# applied to both in one pass.
frames = [train, test]
combi = pd.concat(frames)

print(combi.shape)
combi.head(3)
# One-hot encode "DayOfWeek": one boolean column per distinct value, then
# replace the original column with the indicator columns.
print(combi["DayOfWeek"].unique())

# Use the builtin `bool`: the `np.bool` alias was removed from NumPy.
day_of_week_dataframe = pd.get_dummies(combi["DayOfWeek"], prefix="DayOfWeek").astype(bool)
print(day_of_week_dataframe.shape)
day_of_week_dataframe.head(3)
combi = pd.concat([combi, day_of_week_dataframe], axis=1)
combi.drop("DayOfWeek", axis=1, inplace=True)

print(combi.shape)
combi.head(3)

# One-hot encode "PdDistrict" the same way.
print(combi["PdDistrict"].unique())

pd_district_dataframe = pd.get_dummies(combi["PdDistrict"], prefix="PdDistrict").astype(bool)

print(pd_district_dataframe.shape)
pd_district_dataframe.head(3)
combi = pd.concat([combi, pd_district_dataframe], axis=1)
combi.drop("PdDistrict", axis=1, inplace=True)

print(combi.shape)
combi.head(3)
def get_season(x):
    """Return one-hot season flags for a month number.

    Args:
        x: Month number. The caller in this script passes ``dt.month``
            values, i.e. 1..12.

    Returns:
        A ``(summer, fall, winter, spring)`` tuple of 0/1 ints.

    Month groupings: 5-7 summer, 8-10 fall, 11/12/0/1 winter, 2-4 spring.
    Any other value yields all zeros.
    """
    # NOTE(review): the groupings look like they were written for zero-based
    # month indices while the caller supplies 1-based months -- confirm
    # whether every group should be shifted by one.
    summer = int(x in (5, 6, 7))
    fall = int(x in (8, 9, 10))
    # 12 (December) previously matched no group and produced all zeros;
    # 0 is still accepted so existing callers see unchanged results.
    winter = int(x in (11, 12, 0, 1))
    spring = int(x in (2, 3, 4))
    return summer, fall, winter, spring
# Break the "Dates" timestamp into one integer column per component.
date_parts = combi["Dates"].dt
for column, component in (
    ("Dates_year", "year"),
    ("Dates_month", "month"),
    ("Dates_day", "day"),
    ("Dates_hour", "hour"),
    ("Dates_minute", "minute"),
    ("Dates_second", "second"),
):
    combi[column] = getattr(date_parts, component)

# "Awake" is 1 for hour 0 and for hours 8 through 23, otherwise 0.
combi["Awake"] = combi["Dates_hour"].apply(
    lambda hour: 1 if (hour == 0 or 8 <= hour <= 23) else 0
)

# get_season returns a 4-tuple per row; zip(*...) transposes the tuples into
# four column-length sequences.
season_flags = combi["Dates_month"].apply(get_season)
combi["Summer"], combi["Fall"], combi["Winter"], combi["Spring"] = zip(*season_flags)

# The raw timestamp is no longer needed once its parts are extracted.
del combi["Dates"]

print(combi.shape)
combi.head(3)
# Replace every address that occurs fewer than 200 times with "Others".
address_counts = combi["Address"].value_counts()
other_index = address_counts[address_counts < 200].index
combi.loc[combi["Address"].isin(other_index), "Address"] = "Others"

print("The number of address types = {address}".format(address=len(combi["Address"].value_counts())))
print(combi.shape)
combi.head()
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder = LabelEncoder()
# Use the builtin `bool`: the `np.bool` alias was removed from NumPy.
one_hot_encoder = OneHotEncoder(dtype=bool)

# Map each address string to an integer code, then one-hot encode the codes.
combi["Address(encode)"] = label_encoder.fit_transform(combi["Address"])
address = one_hot_encoder.fit_transform(combi[["Address(encode)"]])

print(address.shape)
address
# combi was built as train rows followed by test rows, so the first
# len(train) rows of the encoded matrix belong to train and the rest to test.
train_address = address[:len(train), :]
test_address = address[len(train):, :]

print("Train = {0}".format(train_address.shape))
print("Test = {0}".format(test_address.shape))
combi.drop("Address", axis=1, inplace=True)

print(combi.shape)
combi.head(3)
# Rows with a "Category" value are training rows. Take an explicit copy so
# the in-place edits below act on an independent frame instead of a slice of
# combi (which raises SettingWithCopyWarning).
train = combi[combi["Category"].notnull()].copy()

train.drop("Id", axis=1, inplace=True)

print(train.shape)
train.head(3)

# Rows without a "Category" value are test rows; same explicit copy.
test = combi[combi["Category"].isnull()].copy()

test["Id"] = test["Id"].astype(np.int32)
test.drop("Category", axis=1, inplace=True)

test.set_index("Id", inplace=True)

print(test.shape)
test.head(3)

# Every column except the label is a feature.
label_name = "Category"
feature_names = train.columns.difference([label_name])

X_train = train[feature_names]

print(X_train.shape)
X_train.head(3)
from scipy.sparse import hstack

# Append the one-hot address matrix to the other feature columns.
X_train = hstack((X_train.values.astype(np.float32), train_address.astype(np.float32)))
X_train
y_train = train[label_name]

print(y_train.shape)
y_train.head(3)
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.cross_validation import cross_val_score, StratifiedKFold
+
+kfold = StratifiedKFold(y_train, n_folds=6)
+
+model = BernoulliNB()
+%time score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='log_loss').mean()
+score = -1.0 * score
+
+print("Use BernoulliNB. Score = {0:.6f}".format(score))
+X_test = test[feature_names]
+
+print(X_test.shape)
+X_test.head(3)
+X_test = hstack((X_test.values.astype(np.float32), test_address.astype(np.float32)))
+X_test
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.cross_validation import cross_val_score, StratifiedKFold
+
+model = BernoulliNB()
+model.fit(X_train, y_train)
+
+prediction = model.predict_proba(X_test)
+
+print(prediction.shape)
+prediction[:1]
+submission = pd.DataFrame(prediction, index=test.index, columns = sample.columns)
+submission = submission.reindex_axis(sorted(submission.columns), axis=1,)
+
+print(submission.shape)
+submission.head(3)
# Write the submission to a timestamped CSV and produce a gzip-compressed
# copy next to it.
from datetime import datetime

current_time = datetime.now()
current_time = current_time.strftime("%Y%m%d%H%M%S")

description = "one-hot-encode-address"
csv_filename = "../submissions/" + current_time + "_" + description + ".csv"

submission.to_csv(csv_filename)
import gzip

gzip_filename = csv_filename + ".gz"

# Context managers close both handles even if compression fails part-way.
with open(csv_filename, "rb") as f_in, gzip.open(gzip_filename, 'wb') as f_out:
    f_out.writelines(f_in)
+
+5 rows × 33 columns
\n", + "" + ], + "text/plain": [ + " Address Category Id X Y \\\n", + "0 Others WARRANTS NaN -122.425892 37.774599 \n", + "1 Others OTHER OFFENSES NaN -122.425892 37.774599 \n", + "2 Others OTHER OFFENSES NaN -122.424363 37.800414 \n", + "3 1500 Block of LOMBARD ST LARCENY/THEFT NaN -122.426995 37.800873 \n", + "4 Others LARCENY/THEFT NaN -122.438738 37.771541 \n", + "\n", + " DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday DayOfWeek_Sunday \\\n", + "0 False False False False \n", + "1 False False False False \n", + "2 False False False False \n", + "3 False False False False \n", + "4 False False False False \n", + "\n", + " DayOfWeek_Thursday ... Dates_month Dates_day Dates_hour Dates_minute \\\n", + "0 False ... 5 13 23 53 \n", + "1 False ... 5 13 23 53 \n", + "2 False ... 5 13 23 33 \n", + "3 False ... 5 13 23 30 \n", + "4 False ... 5 13 23 30 \n", + "\n", + " Dates_second Awake Summer Fall Winter Spring \n", + "0 0 1 1 0 0 0 \n", + "1 0 1 1 0 0 0 \n", + "2 0 1 1 0 0 0 \n", + "3 0 1 1 0 0 0 \n", + "4 0 1 1 0 0 0 \n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 누적값이 200개 이하인 경우는 'Others'로 바꾼다.\n", + "address_counts = combi[\"Address\"].value_counts()\n", + "other_index = address_counts[address_counts < 200].index\n", + "combi.loc[combi[\"Address\"].isin(other_index), \"Address\"] = \"Others\"\n", + "\n", + "print(\"The number of address types = {address}\".format(address=len(combi[\"Address\"].value_counts())))\n", + "print(combi.shape)\n", + "combi.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1762311, 1622)\n" + ] + }, + { + "data": { + "text/plain": [ + "<1762311x1622 sparse matrix of type '| \n", + " | Category | \n", + "Id | \n", + "X | \n", + "Y | \n", + "DayOfWeek_Friday | \n", + "DayOfWeek_Monday | \n", 
+ "DayOfWeek_Saturday | \n", + "DayOfWeek_Sunday | \n", + "DayOfWeek_Thursday | \n", + "DayOfWeek_Tuesday | \n", + "... | \n", + "Dates_day | \n", + "Dates_hour | \n", + "Dates_minute | \n", + "Dates_second | \n", + "Awake | \n", + "Summer | \n", + "Fall | \n", + "Winter | \n", + "Spring | \n", + "Address(encode) | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "WARRANTS | \n", + "NaN | \n", + "-122.425892 | \n", + "37.774599 | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "... | \n", + "13 | \n", + "23 | \n", + "53 | \n", + "0 | \n", + "1 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1525 | \n", + "
| 1 | \n", + "OTHER OFFENSES | \n", + "NaN | \n", + "-122.425892 | \n", + "37.774599 | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "... | \n", + "13 | \n", + "23 | \n", + "53 | \n", + "0 | \n", + "1 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1525 | \n", + "
| 2 | \n", + "OTHER OFFENSES | \n", + "NaN | \n", + "-122.424363 | \n", + "37.800414 | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "... | \n", + "13 | \n", + "23 | \n", "33 | \n", "0 | \n", "1 | \n", @@ -1344,10 +1633,11 @@ "0 | \n", "0 | \n", "0 | \n", + "1525 | \n", "
3 rows × 32 columns
\n", + "3 rows × 33 columns
\n", "3 rows × 31 columns
\n", + "3 rows × 32 columns
\n", "" ], "text/plain": [ @@ -1531,20 +1821,20 @@ "1 False False False False \n", "2 False False False False \n", "\n", - " DayOfWeek_Wednesday ... Dates_month Dates_day Dates_hour Dates_minute \\\n", - "0 True ... 5 13 23 53 \n", - "1 True ... 5 13 23 53 \n", - "2 True ... 5 13 23 33 \n", + " DayOfWeek_Wednesday ... Dates_day Dates_hour Dates_minute \\\n", + "0 True ... 13 23 53 \n", + "1 True ... 13 23 53 \n", + "2 True ... 13 23 33 \n", "\n", - " Dates_second Awake Summer Fall Winter Spring \n", - "0 0 1 1 0 0 0 \n", - "1 0 1 1 0 0 0 \n", - "2 0 1 1 0 0 0 \n", + " Dates_second Awake Summer Fall Winter Spring Address(encode) \n", + "0 0 1 1 0 0 0 1525 \n", + "1 0 1 1 0 0 0 1525 \n", + "2 0 1 1 0 0 0 1525 \n", "\n", - "[3 rows x 31 columns]" + "[3 rows x 32 columns]" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1560,7 +1850,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -1569,20 +1859,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "(884262, 30)\n" + "(884262, 31)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " app.launch_new_instance()\n", - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: \n", + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a 
DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" @@ -1607,7 +1897,6 @@ "3 rows × 30 columns
\n", + "3 rows × 31 columns
\n", "" ], "text/plain": [ @@ -1734,22 +2024,28 @@ "1 False True False False \n", "2 False True False False \n", "\n", - " DayOfWeek_Wednesday PdDistrict_BAYVIEW ... Dates_month Dates_day \\\n", - "Id ... \n", - "0 False True ... 5 10 \n", - "1 False True ... 5 10 \n", - "2 False False ... 5 10 \n", + " DayOfWeek_Wednesday PdDistrict_BAYVIEW ... Dates_day \\\n", + "Id ... \n", + "0 False True ... 10 \n", + "1 False True ... 10 \n", + "2 False False ... 10 \n", "\n", - " Dates_hour Dates_minute Dates_second Awake Summer Fall Winter Spring \n", + " Dates_hour Dates_minute Dates_second Awake Summer Fall Winter Spring \\\n", "Id \n", - "0 23 59 0 1 1 0 0 0 \n", - "1 23 51 0 1 1 0 0 0 \n", - "2 23 50 0 1 1 0 0 0 \n", + "0 23 59 0 1 1 0 0 0 \n", + "1 23 51 0 1 1 0 0 0 \n", + "2 23 50 0 1 1 0 0 0 \n", "\n", - "[3 rows x 30 columns]" + " Address(encode) \n", + "Id \n", + "0 1525 \n", + "1 870 \n", + "2 1525 \n", + "\n", + "[3 rows x 31 columns]" ] }, - "execution_count": 14, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1775,7 +2071,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": { "collapsed": false }, @@ -1784,7 +2080,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(878049, 30)\n" + "(878049, 31)\n" ] }, { @@ -1795,6 +2091,7 @@ " \n", "3 rows × 30 columns
\n", + "3 rows × 31 columns
\n", "" ], "text/plain": [ - " Awake Dates_day Dates_hour Dates_minute Dates_month Dates_second \\\n", - "0 1 13 23 53 5 0 \n", - "1 1 13 23 53 5 0 \n", - "2 1 13 23 33 5 0 \n", + " Address(encode) Awake Dates_day Dates_hour Dates_minute Dates_month \\\n", + "0 1525 1 13 23 53 5 \n", + "1 1525 1 13 23 53 5 \n", + "2 1525 1 13 23 33 5 \n", "\n", - " Dates_year DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday ... \\\n", - "0 2015 False False False ... \n", - "1 2015 False False False ... \n", - "2 2015 False False False ... \n", + " Dates_second Dates_year DayOfWeek_Friday DayOfWeek_Monday ... \\\n", + "0 0 2015 False False ... \n", + "1 0 2015 False False ... \n", + "2 0 2015 False False ... \n", "\n", " PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN PdDistrict_TARAVAL \\\n", "0 False False False False \n", "1 False False False False \n", "2 False False False False \n", "\n", - " PdDistrict_TENDERLOIN Spring Summer Winter X Y \n", - "0 False 0 1 0 -122.425892 37.774599 \n", - "1 False 0 1 0 -122.425892 37.774599 \n", - "2 False 0 1 0 -122.424363 37.800414 \n", + " PdDistrict_TENDERLOIN Spring Summer Winter X Y \n", + "0 False 0 1 0 -122.425892 37.774599 \n", + "1 False 0 1 0 -122.425892 37.774599 \n", + "2 False 0 1 0 -122.424363 37.800414 \n", "\n", - "[3 rows x 30 columns]" + "[3 rows x 31 columns]" ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1937,7 +2233,33 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "<878049x1653 sparse matrix of type '3 rows × 30 columns
\n", + "3 rows × 31 columns
\n", "" ], "text/plain": [ - " Awake Dates_day Dates_hour Dates_minute Dates_month Dates_second \\\n", - "Id \n", - "0 1 10 23 59 5 0 \n", - "1 1 10 23 51 5 0 \n", - "2 1 10 23 50 5 0 \n", + " Address(encode) Awake Dates_day Dates_hour Dates_minute Dates_month \\\n", + "Id \n", + "0 1525 1 10 23 59 5 \n", + "1 870 1 10 23 51 5 \n", + "2 1525 1 10 23 50 5 \n", "\n", - " Dates_year DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday \\\n", - "Id \n", - "0 2015 False False False \n", - "1 2015 False False False \n", - "2 2015 False False False \n", + " Dates_second Dates_year DayOfWeek_Friday DayOfWeek_Monday ... \\\n", + "Id ... \n", + "0 0 2015 False False ... \n", + "1 0 2015 False False ... \n", + "2 0 2015 False False ... \n", "\n", - " ... PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN \\\n", - "Id ... \n", - "0 ... False False False \n", - "1 ... False False False \n", - "2 ... False False False \n", - "\n", - " PdDistrict_TARAVAL PdDistrict_TENDERLOIN Spring Summer Winter X \\\n", + " PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN PdDistrict_TARAVAL \\\n", "Id \n", - "0 False False 0 1 0 -122.399588 \n", - "1 False False 0 1 0 -122.391523 \n", - "2 False False 0 1 0 -122.426002 \n", + "0 False False False False \n", + "1 False False False False \n", + "2 False False False False \n", "\n", - " Y \n", - "Id \n", - "0 37.735051 \n", - "1 37.732432 \n", - "2 37.792212 \n", + " PdDistrict_TENDERLOIN Spring Summer Winter X Y \n", + "Id \n", + "0 False 0 1 0 -122.399588 37.735051 \n", + "1 False 0 1 0 -122.391523 37.732432 \n", + "2 False 0 1 0 -122.426002 37.792212 \n", "\n", - "[3 rows x 30 columns]" + "[3 rows x 31 columns]" ] }, - "execution_count": 18, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2209,7 +2525,31 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "<884262x1653 sparse matrix 
of type '