import pandas as pd
import numpy as np
import pickle
import feather
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from joblib import dump, load
%matplotlib inline
#df.to_feather('chicago_crime_model_data.feather')
df = feather.read_dataframe('chicago_crime_model_data.feather')
I decided to drop several more variables which were somewhat redundant in the dataset.
df = df.drop(['index', 'domestic', 'beat', 'district', 'ward', 'communityarea'], axis = 1)
The features and labels were split before loading into the model.
y = df['arrest']
X = df.drop('arrest', axis = 1)
Due to the size of the dataset (~6.7 million rows) I decided to use a single validation set while testing my model. Using cross validation is the most reliable way to create a model. However, it is computationally expensive and would be time consuming to do so.
#Test/train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
The first model I ran was a logistic regression model. This model required that the data was scaled before it was added to the model. I used a sklearn pipeline to handle the scaling and running of the model.
clf_logistic_pipeline = Pipeline([('scale_train', StandardScaler()), ('lr', LogisticRegression())])
clf_logistic_pipeline.fit(X_train, y_train)
evaluate_model(clf_logistic_pipeline)
AUC for training set: 0.7819176581804969
AUC for validation set: 0.7817091792999603
AUC for test set: 0.7818304472777092
Score for training set: 0.8653519925078904
Score for validation set: 0.8651978977998159
Score for test set: 0.8653141251789568
Confusion Matrix:
[[1415779 45369]
[ 226796 332795]]
In order to use this trained model in the future, I saved the model using joblib.
dump(clf_logistic_pipeline, 'clf_logistic_pipeline.joblib')
#clf_logistic_pipeline = load('filename.joblib')
Logistic Regression allows for the betas to be analyzed. Each individual value of e^beta is the likelihood of an arrest of an individual crime as compared to all other crimes. I went ahead and looked at the top 10 crimes that had the highest probability of arrest as well as the bottom 10 crimes.
importance_list = []
for tup in zip(X_train.columns, np.exp(clf_logistic_pipeline.named_steps['lr'].coef_[0])):
importance_list.append(tup)
sorted_importance_list = sorted(importance_list, key=lambda tup: tup[1], reverse = True)
sorted_importance_list_reversed = sorted(importance_list, key=lambda tup: tup[1], reverse = False)
sorted_importance_list[0:10], sorted_importance_list_reversed[0:10]
([('NARCOTICS', 6.449520558275791),
('PROSTITUTION', 1.94090044673702),
('DEPARTMENT STORE', 1.3555689429902227),
('CRIMINAL TRESPASS', 1.3481573587031486),
('GAMBLING', 1.3177199360656247),
('GROCERY FOOD STORE', 1.3097096245166744),
('LIQUOR LAW VIOLATION', 1.2945964863184407),
('WEAPONS VIOLATION', 1.273162809482963),
('DRUG STORE', 1.1821135829524616),
('INTERFERENCE WITH PUBLIC OFFICER', 1.1708414586148312)],
[('THEFT', 0.509949611367234),
('CRIMINAL DAMAGE', 0.6028952015130741),
('BURGLARY', 0.6737965347995939),
('ROBBERY', 0.7442151529506927),
('MOTOR VEHICLE THEFT', 0.762481021645242),
('RESIDENCE', 0.8164103236220748),
('DECEPTIVE PRACTICE', 0.8252460436674496),
('RESIDENCE-GARAGE', 0.907916971075862),
('OTHER OFFENSE', 0.9155231101266241),
('BATTERY', 0.9170271875463851)])
The next model that I chose was a random forest. This model was chosen since it usually a good predictive model. In many cases the model is too good of a predictor on the training set and overfits the data. The forest usually needs some pruning to prevent overfitting.
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train, y_train)
clf_rf = RandomForestClassifier(n_estimators = 50, max_depth = 10, min_samples_leaf = 2, oob_score=True, n_jobs=-1)
clf_rf.fit(X_train, y_train)
evaluate_model(clf_rf)
AUC for training set: 0.7378734801893847
AUC for validation set: 0.7380699441781224
AUC for test set: 0.7381047408102378
Score for training set: 0.8523847165569017
Score for validation set: 0.8523207198494469
Score for test set: 0.8525074242640934
Confusion Matrix:
[[1453149 7999]
[ 290045 269546]]
Random forests have a somewhat built in validation set called the out-of-bag score that is good to check to ensure the model is not overfitting.
clf_rf.oob_score_
Random Forests also have an important attribute called the feature importance score. Random forest classifiers split trees by the purity of the data so the feature importance is the number of times a feature shows up in a tree while weighting features higher that show up closer to the root node of the trees.
# You should create function for this and put it in a class
importance_list = []
for tup in zip(X_train.columns, clf_rf.feature_importances_):
importance_list.append(tup)
sorted_importance_list = sorted(importance_list, key=lambda tup: tup[1], reverse = True)
sorted_importance_list[0:10]
Narcotics is the most important feature. Since most narcotics arrests are the result of raids, it can be inferred intuitively that it would be an important feature in determining arrests. Similar intuition can be used for theft. Theft is common and most of the time there are no arrests for theft so if the crime is theft, there is not a high likelihood of arrest.
dump(clf_rf, 'clf_rf.joblib')
#clf_rd = load('filename.joblib')
I wanted to see if the hyperparameters for the random forest model could be further optimized. I used a grid searh over several models to see if the hyperparameters could be tuned.
rfc = RandomForestClassifier(n_jobs=-1)
parameters = {'n_estimators':[10,20,30], 'max_depth' : [3,7, 10, None], 'min_samples_leaf':[1,3,5,7]}
rfc_clf = GridSearchCV(rfc, parameters, cv=5)
%%time
rfc_clf.fit(X_train, y_train)
The best parameters were called for the model below.
rfc_clf.best_params
GridSearchCV(cv=5, error_score='raise',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
oob_score=False, random_state=None, verbose=0,
warm_start=False),
fit_params=None, iid=True, n_jobs=1,
param_grid={'n_estimators': [10, 20, 30], 'max_depth': [3, 7, 10, None], 'min_samples_leaf': [1, 3, 5, 7]},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring=None, verbose=0)
rfb = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
rfb.fit(X_train, y_train)
The next model that I tried was K nearest neighbors. This model is the slowest to run since the euclidean distance needs to be calculated for every point to every other point so it is computationally expensive.
clf_knn_pipeline = Pipeline([('scale_train', StandardScaler()), ('lr', KNeighborsClassifier(n_neighbors=5, n_jobs=-1))])
clf_knn_pipeline.fit(X_train, y_train)
evaluate_model(clf_knn_pipeline)
pd.to_pickle(clf_knn_pipeline, 'knn_clf_pipeline.p')
nbrs = KNeighborsClassifier(n_neighbors=5)
%%time
nbrs.fit(X_train_scaled, y_train)
evaluate_model(nbrs, X_train=X_train_scaled, X_val=X_val_scaled, X_test=X_test_scaled)
#%% time
train_preds = nbrs.predict(X_train)
roc_auc_score(y_train, train_preds)
#test score
# n_neighbors=5 scores 0.7659
preds = nbrs.predict(X_val)
roc_auc_score(y_val, preds)
confusion_matrix(y_val, preds)
The mext model that I chose was gradient boosting.
gb_clf = GradientBoostingClassifier(learning_rate=0.01)
gb_clf.fit(X_train, y_train)
evaluate_model(gb_clf)
Since only 27.7% of crimes ended up with an arrest, I decided to balance the dataset and try running models on the balanced dataset.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_train,y_train)
# Yay, balanced classes!
len(y_resampled), len(X_resampled)
rfb_balanced = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
rfb_balanced.fit(X_resampled, y_resampled)
preds = rfb_balanced.predict(X_val)
roc_auc_score(y_val, preds)
rfb_balanced.score(X_val, y_val)
rfb_balanced.score(X_test, y_test)
confusion_matrix(y_val, preds)
I then wanted to try to put several of the models together in an ensemble and run them.
model_list = [('lr', clf_logistic_pipeline), ('rf', clf_rf), ('gb', gb_clf)]
# create voting classifier
voting_classifer = VotingClassifier(estimators=model_list,
voting='hard', #<-- sklearn calls this hard voting
n_jobs=-1)
voting_classifer.fit(X_train, y_train)
evaluate_model(voting_classifer)
Overall the logistic regression model performed the best. These models could all be further tweaked for better performance, however, model with 88% accuracy predicting arrests for the city of Chicago is a good model.
Next:
Back to Top