diff --git a/delivers/baseline-script.html b/delivers/baseline-script.html new file mode 100644 index 0000000..1aa34bd --- /dev/null +++ b/delivers/baseline-script.html @@ -0,0 +1,14209 @@ + + +
+Baseline script. Hope this helps.
+ +import numpy as np
+import pandas as pd
# Read the labelled training data; "Dates" is parsed into timestamps.
train = pd.read_csv("../data/train.csv", parse_dates=["Dates"])

# Remove the two columns that are not carried into the combined frame.
train.drop(["Resolution", "Descript"], axis=1, inplace=True)

print(train.shape)
train.head(3)

# Read the unlabelled test data with the same timestamp parsing.
test = pd.read_csv("../data/test.csv", parse_dates=["Dates"])

print(test.shape)
test.head(3)

# The sample submission is indexed by "Id"; its columns are reused later
# as the column labels of the final submission frame.
sample = pd.read_csv("../data/sampleSubmission.csv", index_col="Id")

print(sample.shape)
sample.head(3)

# Stack train on top of test so that every feature transformation below
# is applied to both in one pass.
combi = pd.concat([train, test])

print(combi.shape)
combi.head(3)
# --- One-hot encode "DayOfWeek" -------------------------------------------
print(combi["DayOfWeek"].unique())

# Cast the indicator columns to the builtin ``bool``. The former ``np.bool``
# was only an alias of it; the alias was deprecated in NumPy 1.20 and removed
# in 1.24, so using it raises AttributeError on current NumPy.
day_of_week_dataframe = pd.get_dummies(combi["DayOfWeek"], prefix="DayOfWeek").astype(bool)
print(day_of_week_dataframe.shape)
day_of_week_dataframe.head(3)

# Append the indicator columns and drop the original categorical column.
combi = pd.concat([combi, day_of_week_dataframe], axis=1)
combi.drop("DayOfWeek", axis=1, inplace=True)

print(combi.shape)
combi.head(3)

# --- One-hot encode "PdDistrict" ------------------------------------------
print(combi["PdDistrict"].unique())

pd_district_dataframe = pd.get_dummies(combi["PdDistrict"], prefix="PdDistrict").astype(bool)

print(pd_district_dataframe.shape)
pd_district_dataframe.head(3)

# Append the indicator columns and drop the original categorical column.
combi = pd.concat([combi, pd_district_dataframe], axis=1)
combi.drop("PdDistrict", axis=1, inplace=True)

print(combi.shape)
combi.head(3)
def get_season(x):
    """Return one-hot season flags for a calendar month number.

    The caller passes values taken from ``Series.dt.month``, which are
    1-based (1 = January ... 12 = December). The previous month lists were
    written as if months were 0-based, so December matched no season at all
    and every other month was shifted by one (e.g. May was flagged as
    summer). The grouping used here is the same three-month grouping the
    old lists encoded, expressed in 1-based month numbers:

    * winter: 12, 1, 2
    * spring: 3, 4, 5
    * summer: 6, 7, 8
    * fall:   9, 10, 11

    Args:
        x: Month number in the range 1-12.

    Returns:
        A ``(summer, fall, winter, spring)`` tuple of ints where exactly one
        entry is 1 for a valid month. Any other value (including NaN)
        yields ``(0, 0, 0, 0)``.
    """
    summer = int(x in (6, 7, 8))
    fall = int(x in (9, 10, 11))
    winter = int(x in (12, 1, 2))
    spring = int(x in (3, 4, 5))
    return summer, fall, winter, spring
# Split the timestamp into one numeric column per calendar/clock component.
dates = combi["Dates"]
for column, part in (
    ("Dates_year", "year"),
    ("Dates_month", "month"),
    ("Dates_day", "day"),
    ("Dates_hour", "hour"),
    ("Dates_minute", "minute"),
    ("Dates_second", "second"),
):
    combi[column] = getattr(dates.dt, part)

# "Awake" is 1 for hour 0 and for hours 8 through 23, otherwise 0.
combi["Awake"] = combi["Dates_hour"].apply(
    lambda hour: int(hour == 0 or 8 <= hour <= 23)
)

# get_season returns a (summer, fall, winter, spring) tuple per row;
# zip(*...) transposes those row tuples into one sequence per season.
summer_flags, fall_flags, winter_flags, spring_flags = zip(
    *combi["Dates_month"].apply(get_season)
)
combi["Summer"] = summer_flags
combi["Fall"] = fall_flags
combi["Winter"] = winter_flags
combi["Spring"] = spring_flags

# The raw timestamp is no longer needed once its parts are extracted.
combi.drop("Dates", axis=1, inplace=True)

print(combi.shape)
combi.head(3)
from sklearn.preprocessing import LabelEncoder

# Every row starts as missing; rows matching neither pattern stay NaN.
combi["Address_Type"] = np.nan

# Addresses containing "Block of" are labelled "Block"; addresses containing
# "/" are labelled "CrossRoad". The "/" rule is applied last, so it wins
# when an address matches both patterns.
is_block = combi["Address"].str.contains("Block of")
is_crossroad = combi["Address"].str.contains("/")
combi.loc[is_block, "Address_Type"] = "Block"
combi.loc[is_crossroad, "Address_Type"] = "CrossRoad"

# Map the string labels to integer codes and keep them in a separate column.
encoder = LabelEncoder()
encoder.fit(combi["Address_Type"])
combi["Address_Type(encode)"] = encoder.transform(combi["Address_Type"])

combi.head(3)

# The free-text address is dropped now that its type has been extracted.
combi.drop("Address", axis=1, inplace=True)

print(combi.shape)
combi.head(3)
# Rows with a "Category" value are the labelled training rows.
# Take an explicit copy: the in-place edits below must act on an independent
# frame rather than on a boolean-mask slice of ``combi``, which is what
# makes pandas emit SettingWithCopyWarning.
train = combi[combi["Category"].notnull()].copy()

# "Id" is dropped from the training frame.
train.drop("Id", axis=1, inplace=True)

print(train.shape)
train.head(3)

# Rows without a "Category" value are the test rows (copied for the same
# reason as above).
test = combi[combi["Category"].isnull()].copy()

# "Id" is cast to a 32-bit integer before it becomes the index.
test["Id"] = test["Id"].astype(np.int32)
test.drop("Category", axis=1, inplace=True)

test.set_index("Id", inplace=True)

print(test.shape)
test.head(3)
# Target column, and columns that must not be fed to the model.
# "Address_Type" holds strings; its integer-encoded counterpart
# "Address_Type(encode)" remains among the features.
label_name = "Category"
exclude_columns = ["Address_Type"]

# Every remaining column of ``train`` is a feature.
feature_names = train.columns.difference(exclude_columns + [label_name])

X_train = train[feature_names]

print(X_train.shape)
X_train.head(3)

y_train = train[label_name]

print(y_train.shape)
y_train.head(3)
import time

from sklearn.naive_bayes import BernoulliNB
# ``sklearn.cross_validation`` was removed from scikit-learn; the same
# helpers are provided by ``sklearn.model_selection``.
from sklearn.model_selection import cross_val_score, StratifiedKFold

# The model_selection splitter takes only the number of splits; the labels
# are supplied by cross_val_score when it asks the splitter for folds.
kfold = StratifiedKFold(n_splits=6)

model = BernoulliNB()

# Explicit timing replaces the IPython-only ``%time`` magic so the cell is
# also valid plain Python.
started = time.perf_counter()
# The scorer is named 'neg_log_loss' (the old 'log_loss' name was removed);
# it returns the negated log loss so that larger is better.
score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_log_loss').mean()
print("Wall time: {0:.1f} s".format(time.perf_counter() - started))

# Flip the sign back so the reported score is the usual positive log loss.
score = -1.0 * score

print("Use BernoulliNB. Score = {0:.6f}".format(score))
# Select the same feature columns, in the same order, as used for training.
X_test = test[feature_names]

print(X_test.shape)
X_test.head(3)

from sklearn.naive_bayes import BernoulliNB
# ``sklearn.cross_validation`` was removed from scikit-learn; import the same
# names from ``sklearn.model_selection`` so this cell still runs.
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Fit on the full training set and predict per-class probabilities.
model = BernoulliNB()
model.fit(X_train, y_train)

prediction = model.predict_proba(X_test)

print(prediction.shape)
prediction[:1]

# NOTE(review): the probability columns are labelled with the sample
# submission's column names by position. This assumes those columns are in
# the same order as the classes the model was fitted on -- confirm.
submission = pd.DataFrame(prediction, index=X_test.index, columns=sample.columns)

# ``DataFrame.reindex_axis`` was removed in pandas 1.0; ``reindex`` with
# ``columns=`` performs the same column re-ordering.
submission = submission.reindex(columns=sorted(submission.columns))

print(submission.shape)
submission.head(3)
import gzip
import shutil
from datetime import datetime

# Timestamp the file name so that successive submissions never collide.
current_time = datetime.now()
current_time = current_time.strftime("%Y%m%d%H%M%S")

description = "add-address-type"
csv_filename = "../submissions/" + current_time + "_" + description + ".csv"

submission.to_csv(csv_filename)

# Also write a gzip-compressed copy next to the CSV.
gzip_filename = csv_filename + ".gz"

# Context managers guarantee both handles are closed even if the copy
# fails part-way; copyfileobj streams the data in chunks.
with open(csv_filename, "rb") as f_in, gzip.open(gzip_filename, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)
+| \n", + " | Address | \n", + "Category | \n", + "Id | \n", + "X | \n", + "Y | \n", + "DayOfWeek_Friday | \n", + "DayOfWeek_Monday | \n", + "DayOfWeek_Saturday | \n", + "DayOfWeek_Sunday | \n", + "DayOfWeek_Thursday | \n", + "... | \n", + "Dates_hour | \n", + "Dates_minute | \n", + "Dates_second | \n", + "Awake | \n", + "Summer | \n", + "Fall | \n", + "Winter | \n", + "Spring | \n", + "Address_Type | \n", + "Address_Type(encode) | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "OAK ST / LAGUNA ST | \n", + "WARRANTS | \n", + "NaN | \n", + "-122.425892 | \n", + "37.774599 | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "... | \n", + "23 | \n", + "53 | \n", + "0 | \n", + "1 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "CrossRoad | \n", + "1 | \n", + "
| 1 | \n", + "OAK ST / LAGUNA ST | \n", + "OTHER OFFENSES | \n", + "NaN | \n", + "-122.425892 | \n", + "37.774599 | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "... | \n", + "23 | \n", + "53 | \n", + "0 | \n", + "1 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "CrossRoad | \n", + "1 | \n", + "
| 2 | \n", + "VANNESS AV / GREENWICH ST | \n", + "OTHER OFFENSES | \n", + "NaN | \n", + "-122.424363 | \n", + "37.800414 | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "False | \n", + "... | \n", + "23 | \n", + "33 | \n", + "0 | \n", + "1 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "CrossRoad | \n", + "1 | \n", + "
3 rows × 35 columns
\n", + "3 rows × 32 columns
\n", + "3 rows × 34 columns
\n", "" ], "text/plain": [ @@ -1361,20 +1532,20 @@ "1 False False False False \n", "2 False False False False \n", "\n", - " DayOfWeek_Tuesday ... Dates_month Dates_day Dates_hour Dates_minute \\\n", - "0 False ... 5 13 23 53 \n", - "1 False ... 5 13 23 53 \n", - "2 False ... 5 13 23 33 \n", + " DayOfWeek_Tuesday ... Dates_hour Dates_minute Dates_second \\\n", + "0 False ... 23 53 0 \n", + "1 False ... 23 53 0 \n", + "2 False ... 23 33 0 \n", "\n", - " Dates_second Awake Summer Fall Winter Spring \n", - "0 0 1 1 0 0 0 \n", - "1 0 1 1 0 0 0 \n", - "2 0 1 1 0 0 0 \n", + " Awake Summer Fall Winter Spring Address_Type Address_Type(encode) \n", + "0 1 1 0 0 0 CrossRoad 1 \n", + "1 1 1 0 0 0 CrossRoad 1 \n", + "2 1 1 0 0 0 CrossRoad 1 \n", "\n", - "[3 rows x 32 columns]" + "[3 rows x 34 columns]" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1388,7 +1559,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": { "collapsed": false }, @@ -1397,14 +1568,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "(878049, 31)\n" + "(878049, 33)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", @@ -1430,8 +1601,6 @@ "3 rows × 31 columns
\n", + "3 rows × 33 columns
\n", "" ], "text/plain": [ @@ -1531,20 +1702,25 @@ "1 False False False False \n", "2 False False False False \n", "\n", - " DayOfWeek_Wednesday ... Dates_month Dates_day Dates_hour Dates_minute \\\n", - "0 True ... 5 13 23 53 \n", - "1 True ... 5 13 23 53 \n", - "2 True ... 5 13 23 33 \n", + " DayOfWeek_Wednesday ... Dates_hour Dates_minute \\\n", + "0 True ... 23 53 \n", + "1 True ... 23 53 \n", + "2 True ... 23 33 \n", "\n", - " Dates_second Awake Summer Fall Winter Spring \n", - "0 0 1 1 0 0 0 \n", - "1 0 1 1 0 0 0 \n", - "2 0 1 1 0 0 0 \n", + " Dates_second Awake Summer Fall Winter Spring Address_Type \\\n", + "0 0 1 1 0 0 0 CrossRoad \n", + "1 0 1 1 0 0 0 CrossRoad \n", + "2 0 1 1 0 0 0 CrossRoad \n", "\n", - "[3 rows x 31 columns]" + " Address_Type(encode) \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "\n", + "[3 rows x 33 columns]" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1560,29 +1736,35 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " app.launch_new_instance()\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "(884262, 30)\n" + "(884262, 32)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - 
"\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " app.launch_new_instance()\n", - "/Users/tantara/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: \n", + "/root/.pyenv/versions/3.5.2/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" @@ -1607,8 +1789,6 @@ "3 rows × 30 columns
\n", + "3 rows × 32 columns
\n", "" ], "text/plain": [ @@ -1734,22 +1916,28 @@ "1 False True False False \n", "2 False True False False \n", "\n", - " DayOfWeek_Wednesday PdDistrict_BAYVIEW ... Dates_month Dates_day \\\n", - "Id ... \n", - "0 False True ... 5 10 \n", - "1 False True ... 5 10 \n", - "2 False False ... 5 10 \n", + " DayOfWeek_Wednesday PdDistrict_BAYVIEW ... Dates_hour \\\n", + "Id ... \n", + "0 False True ... 23 \n", + "1 False True ... 23 \n", + "2 False False ... 23 \n", "\n", - " Dates_hour Dates_minute Dates_second Awake Summer Fall Winter Spring \n", - "Id \n", - "0 23 59 0 1 1 0 0 0 \n", - "1 23 51 0 1 1 0 0 0 \n", - "2 23 50 0 1 1 0 0 0 \n", + " Dates_minute Dates_second Awake Summer Fall Winter Spring Address_Type \\\n", + "Id \n", + "0 59 0 1 1 0 0 0 Block \n", + "1 51 0 1 1 0 0 0 CrossRoad \n", + "2 50 0 1 1 0 0 0 Block \n", + "\n", + " Address_Type(encode) \n", + "Id \n", + "0 0 \n", + "1 1 \n", + "2 0 \n", "\n", - "[3 rows x 30 columns]" + "[3 rows x 32 columns]" ] }, - "execution_count": 14, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1775,7 +1963,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -1784,7 +1972,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(878049, 30)\n" + "(878049, 31)\n" ] }, { @@ -1795,6 +1983,7 @@ " \n", "3 rows × 30 columns
\n", + "3 rows × 31 columns
\n", "" ], "text/plain": [ - " Awake Dates_day Dates_hour Dates_minute Dates_month Dates_second \\\n", - "0 1 13 23 53 5 0 \n", - "1 1 13 23 53 5 0 \n", - "2 1 13 23 33 5 0 \n", + " Address_Type(encode) Awake Dates_day Dates_hour Dates_minute \\\n", + "0 1 1 13 23 53 \n", + "1 1 1 13 23 53 \n", + "2 1 1 13 23 33 \n", "\n", - " Dates_year DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday ... \\\n", - "0 2015 False False False ... \n", - "1 2015 False False False ... \n", - "2 2015 False False False ... \n", + " Dates_month Dates_second Dates_year DayOfWeek_Friday DayOfWeek_Monday \\\n", + "0 5 0 2015 False False \n", + "1 5 0 2015 False False \n", + "2 5 0 2015 False False \n", "\n", - " PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN PdDistrict_TARAVAL \\\n", - "0 False False False False \n", - "1 False False False False \n", - "2 False False False False \n", + " ... PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN \\\n", + "0 ... False False False \n", + "1 ... False False False \n", + "2 ... 
False False False \n", "\n", - " PdDistrict_TENDERLOIN Spring Summer Winter X Y \n", - "0 False 0 1 0 -122.425892 37.774599 \n", - "1 False 0 1 0 -122.425892 37.774599 \n", - "2 False 0 1 0 -122.424363 37.800414 \n", + " PdDistrict_TARAVAL PdDistrict_TENDERLOIN Spring Summer Winter X \\\n", + "0 False False 0 1 0 -122.425892 \n", + "1 False False 0 1 0 -122.425892 \n", + "2 False False 0 1 0 -122.424363 \n", "\n", - "[3 rows x 30 columns]" + " Y \n", + "0 37.774599 \n", + "1 37.774599 \n", + "2 37.800414 \n", + "\n", + "[3 rows x 31 columns]" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "exclude_columns = [\"Address_Type\"]\n", + "\n", "label_name = \"Category\"\n", - "feature_names = train.columns.difference([label_name])\n", + "feature_names = train.columns.difference([label_name] + exclude_columns)\n", "\n", "X_train = train[feature_names]\n", "\n", @@ -1937,7 +2132,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": { "collapsed": false }, @@ -1958,7 +2153,7 @@ "Name: Category, dtype: object" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1979,7 +2174,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": { "collapsed": false }, @@ -1988,9 +2183,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 55.2 s, sys: 5.56 s, total: 1min\n", - "Wall time: 1min\n", - "Use BernoulliNB. Score = 2.562140\n" + "CPU times: user 48.3 s, sys: 1.86 s, total: 50.2 s\n", + "Wall time: 48.9 s\n", + "Use BernoulliNB. Score = 2.517539\n" ] } ], @@ -2016,7 +2211,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": { "collapsed": false }, @@ -2025,7 +2220,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(884262, 30)\n" + "(884262, 31)\n" ] }, { @@ -2036,6 +2231,7 @@ " \n", "3 rows × 30 columns
\n", + "3 rows × 31 columns
\n", "" ], "text/plain": [ - " Awake Dates_day Dates_hour Dates_minute Dates_month Dates_second \\\n", - "Id \n", - "0 1 10 23 59 5 0 \n", - "1 1 10 23 51 5 0 \n", - "2 1 10 23 50 5 0 \n", + " Address_Type(encode) Awake Dates_day Dates_hour Dates_minute \\\n", + "Id \n", + "0 0 1 10 23 59 \n", + "1 1 1 10 23 51 \n", + "2 0 1 10 23 50 \n", "\n", - " Dates_year DayOfWeek_Friday DayOfWeek_Monday DayOfWeek_Saturday \\\n", - "Id \n", - "0 2015 False False False \n", - "1 2015 False False False \n", - "2 2015 False False False \n", + " Dates_month Dates_second Dates_year DayOfWeek_Friday DayOfWeek_Monday \\\n", + "Id \n", + "0 5 0 2015 False False \n", + "1 5 0 2015 False False \n", + "2 5 0 2015 False False \n", "\n", " ... PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN \\\n", "Id ... \n", @@ -2180,11 +2375,11 @@ "1 ... False False False \n", "2 ... False False False \n", "\n", - " PdDistrict_TARAVAL PdDistrict_TENDERLOIN Spring Summer Winter X \\\n", + " PdDistrict_TARAVAL PdDistrict_TENDERLOIN Spring Summer Winter X \\\n", "Id \n", - "0 False False 0 1 0 -122.399588 \n", - "1 False False 0 1 0 -122.391523 \n", - "2 False False 0 1 0 -122.426002 \n", + "0 False False 0 1 0 -122.399588 \n", + "1 False False 0 1 0 -122.391523 \n", + "2 False False 0 1 0 -122.426002 \n", "\n", " Y \n", "Id \n", @@ -2192,10 +2387,10 @@ "1 37.732432 \n", "2 37.792212 \n", "\n", - "[3 rows x 30 columns]" + "[3 rows x 31 columns]" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2209,7 +2404,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": { "collapsed": false }, @@ -2224,22 +2419,22 @@ { "data": { "text/plain": [ - "array([[ 5.18022914e-03, 1.47306845e-01, 8.35309027e-05,\n", - " 7.15327047e-04, 2.89343463e-02, 2.17660902e-03,\n", - " 2.90469652e-03, 5.20272506e-02, 4.75338148e-03,\n", - " 4.53910576e-04, 1.50404661e-04, 6.33194320e-04,\n", - " 4.46497596e-03, 5.83197476e-03, 
3.21888082e-04,\n", - " 3.66372241e-03, 9.46729466e-02, 1.63443121e-03,\n", - " 6.10377668e-04, 4.41001971e-02, 6.88112197e-02,\n", - " 2.06289853e-01, 6.30319698e-06, 3.51815041e-04,\n", - " 8.06585264e-03, 3.64767735e-02, 2.11525827e-03,\n", - " 2.31851049e-02, 3.90020973e-03, 1.39053149e-04,\n", - " 4.88590719e-03, 4.72963357e-04, 4.44303697e-02,\n", - " 8.03404588e-07, 7.22208250e-03, 5.50342776e-02,\n", - " 5.65854147e-02, 5.66664063e-02, 2.47400928e-02]])" + "array([[ 6.15661689e-03, 1.67585639e-01, 1.12483697e-04,\n", + " 8.81713446e-04, 4.06910454e-02, 2.18988921e-03,\n", + " 1.55177863e-03, 4.67223805e-02, 4.55686884e-03,\n", + " 6.30590722e-04, 1.89187179e-04, 8.40936473e-04,\n", + " 5.85516677e-03, 6.87678483e-03, 3.93153520e-04,\n", + " 4.27555899e-03, 9.08472440e-02, 1.59855130e-03,\n", + " 3.27831235e-04, 5.86817876e-02, 7.25539371e-02,\n", + " 1.53244251e-01, 7.10261876e-06, 1.05155595e-04,\n", + " 9.88284415e-03, 2.89483261e-02, 2.94895460e-03,\n", + " 2.85398838e-02, 4.64345647e-03, 1.71533199e-04,\n", + " 5.14001860e-03, 6.48117991e-04, 5.17240980e-02,\n", + " 1.00059338e-06, 9.78585378e-03, 6.09758187e-02,\n", + " 5.12109408e-02, 5.34540385e-02, 2.50494596e-02]])" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2259,7 +2454,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": { "collapsed": false }, @@ -2329,75 +2524,75 @@ " \n", "