library("psych")
library("caret")
library("pROC")
library("dplyr")
library("car")
library("kableExtra")
#devtools::session_info()
loc_train = "https://raw.githubusercontent.com/chrisestevez/DataAnalytics/master/Data/logistic/crime-training-data_modified.csv "
loc_test = "https://raw.githubusercontent.com/chrisestevez/DataAnalytics/master/Data/logistic/crime-evaluation-data_modified.csv"
train_df = read.csv(loc_train, stringsAsFactors = FALSE)
test_df = read.csv(loc_test, stringsAsFactors = FALSE)
# Plot a ROC curve and report the AUC via the trapezoidal rule.
# labels: 0/1 ground truth; scores: predicted scores; pname: plot title prefix.
MY_ROC = function(labels, scores, pname){
  # Order the labels by descending score, then accumulate TPR and FPR
  labels = labels[order(scores, decreasing = TRUE)]
  result = data.frame(TPR = cumsum(labels) / sum(labels),
                      FPR = cumsum(!labels) / sum(!labels), labels)
  dFPR = c(diff(result$FPR), 0)
  dTPR = c(diff(result$TPR), 0)
  # Trapezoidal approximation of the area under the curve
  AUC = round(sum(result$TPR * dFPR) + sum(dTPR * dFPR)/2, 4)
  plot(result$FPR, result$TPR, type = "l", main = paste0(pname, " ROC Curve"),
       ylab = "Sensitivity", xlab = "1-Specificity")
  abline(a = 0, b = 1)
  legend(.6, .2, AUC, title = "AUC")
}
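As a quick sanity check of MY_ROC (a minimal sketch on synthetic data, not part of the assignment), well-separated scores should yield an AUC near 1.
# Illustrative only: toy_labels/toy_scores are made-up synthetic data
set.seed(1)
toy_labels = rep(c(1, 0), each = 50)
toy_scores = toy_labels + rnorm(100, sd = 0.3)
MY_ROC(toy_labels, toy_scores, "Toy Example")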
In this homework assignment, you will explore, analyze and model a data set containing information on crime for various neighborhoods of a major city. Each record has a response variable indicating whether the crime rate is above the median crime rate (1) or not (0).
Your objective is to build a binary logistic regression model on the training data set to predict whether the neighborhood will be at risk for high crime levels. You will provide classifications and probabilities for the evaluation data set using your binary logistic regression model. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:
- zn: proportion of residential land zoned for large lots
- indus: proportion of non-retail business acres per suburb
- chas: dummy variable for whether the suburb borders the Charles River (1) or not (0)
- nox: nitrogen oxides concentration (parts per 10 million)
- rm: average number of rooms per dwelling
- age: proportion of owner-occupied units built prior to 1940
- dis: weighted mean of distances to five employment centers
- rad: index of accessibility to radial highways
- tax: full-value property-tax rate per $10,000
- ptratio: pupil-teacher ratio by town
- lstat: percentage of the population of lower status
- medv: median value of owner-occupied homes in $1000s
- target: whether the crime rate is above the median crime rate (1) or not (0)
A write-up submitted in PDF format. Your write-up should have four sections; each one is described below. You may assume you are addressing me as a fellow data scientist, so you do not need to shy away from technical details.
Assigned prediction (probabilities, classifications) for the evaluation data set. Use 0.5 threshold. Include your R statistical programming code in an Appendix.
Describe the size and the variables in the crime training data set. Consider that too much detail will cause a manager to lose interest while too little detail will make the manager consider that you aren’t doing your job. Some suggestions are given below. Please do NOT treat this as a check list of things to do to complete the assignment. You should have your own thoughts on what to tell the boss. These are just ideas.
The training dataset contains 466 cases and 13 variables; chas and target are dummy (0/1) variables.
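A quick structural check along these lines confirms the counts and the class balance of the response (a minimal sketch; the summary table below gives the full detail).
# Dimensions, variable types, and class balance of target (illustrative)
dim(train_df)
str(train_df)
table(train_df$target)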
knitr::kable(round(describe(train_df),2), format = "latex", booktabs = T)
To visualize the data, I plotted for each predictor a histogram, a boxplot, and a boxplot split by the target variable (red = 0, blue = 1), two variables per page.
# Histogram, boxplot, and target-split boxplot for each predictor,
# two variables (six panels) per page
for (i in seq(1, 12, by = 2)) {
  par(mfrow = c(2, 3))
  for (j in c(i, i + 1)) {
    hist(train_df[, j], breaks = 51, main = names(train_df)[j])
    boxplot(train_df[, j], main = names(train_df)[j], horizontal = TRUE)
    boxplot(train_df[, j] ~ train_df$target,
            main = paste(names(train_df)[j], "By Target R0 & B1"),
            horizontal = TRUE, col = c("red", "blue"))
  }
}
A correlation plot reveals several highly correlated variables, which may introduce multicollinearity into the model.
library("corrplot")
cor_mx = cor(train_df ,use="pairwise.complete.obs", method = "pearson")
corrplot(cor_mx, method = "color",
         type = "upper", order = "original", number.cex = .7,
         addCoef.col = "black", # add the correlation coefficients
         tl.col = "black", tl.srt = 90, # text label color and rotation
         diag = TRUE) # keep the principal diagonal
Describe how you have transformed the data by changing the original variables or creating new variables. If you did transform the data or create new variables, discuss why you did this. Here are some possible transformations.
I will transform the chas variable to a factor since the column is a dummy variable, while target stays numeric (0/1) so the binomial glm can fit it directly. Variable transformations (centering, scaling, or Box-Cox) will be performed within the training model, and I will use the predictionFunction from the caret package to ensure these transformations are also applied to the test dataset when predicting the outcomes. Finally, I will draw three 40-case samples from my training data to compare my predictions against in the confusion matrices.
# chas is a dummy variable, so convert it to a factor in both data sets;
# target stays numeric (0/1) and is fit directly by the binomial glm
train_df$chas = as.factor(train_df$chas)
test_df$chas = as.factor(test_df$chas)
test_df$target = NULL
# Draw three independent 40-case samples from the training data
# to spot-check each model's predictions (chas is already a factor)
set.seed(12); ACT_Train1 = train_df %>% sample_n(40)
set.seed(30); ACT_Train2 = train_df %>% sample_n(40)
set.seed(50); ACT_Train3 = train_df %>% sample_n(40)
Using the training data, build at least three different binary logistic regression models, using different variables (or the same variables with different transformations). You may select the variables manually, use an approach such as Forward or Stepwise, use a different approach, or use a combination of techniques. Describe the techniques you used. If you manually selected a variable for inclusion in or exclusion from the model, indicate why this was done. Be sure to explain how you can make inferences from the model, as well as discuss other relevant model output. Discuss the coefficients in the models: do they make sense? Are you keeping the model even though it is counterintuitive? Why? The boss needs to know.
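For reference, an automated stepwise search of the kind mentioned above could look like the following sketch; this is not the manual approach taken below, and it assumes the MASS package is installed (called with :: so that dplyr::select is not masked).
# Sketch only: stepwise AIC search over the full predictor set
full_glm = glm(target ~ ., data = train_df, family = binomial)
step_glm = MASS::stepAIC(full_glm, direction = "both", trace = FALSE)
summary(step_glm)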
Model 1 uses all variables except chas and pays no attention to collinearity. The model applies centering and scaling to the predictors and uses 10-fold cross-validation. The variable medv shows high collinearity (VIF ≈ 8), so it will be removed in the later models. Every model's predictions are compared to the training target of its random sample. Model 1 achieved an accuracy of .55 and an AUC of .45.
Model1 = train(target ~ ., data = train_df %>% select(-chas),
               method = "glm", family = "binomial",
               preProcess = c("center", "scale"),
               trControl = trainControl(method = "cv", number = 10,
                                        savePredictions = TRUE),
               tuneLength = 5)
vif(Model1$finalModel)
## zn indus nox rm age dis rad tax
## 1.805778 2.521246 3.962764 5.615454 2.547415 3.865202 1.903582 2.043669
## ptratio lstat medv
## 2.209140 2.515524 7.960463
summary(Model1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8538 -0.1411 -0.0014 0.0026 3.4667
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.4306 0.7146 3.401 0.00067 ***
## zn -1.6901 0.8105 -2.085 0.03704 *
## indus -0.3563 0.3134 -1.137 0.25561
## nox 5.5221 0.8991 6.142 8.15e-10 ***
## rm -0.4308 0.5089 -0.847 0.39721
## age 0.9875 0.3895 2.535 0.01123 *
## dis 1.5095 0.4812 3.137 0.00171 **
## rad 6.2267 1.3971 4.457 8.32e-06 ***
## tax -1.1575 0.4861 -2.381 0.01726 *
## ptratio 0.8301 0.2721 3.050 0.00229 **
## lstat 0.3822 0.3768 1.014 0.31045
## medv 1.6952 0.6321 2.682 0.00733 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 193.52 on 454 degrees of freedom
## AIC: 217.52
##
## Number of Fisher Scoring iterations: 9
Model1_Pred =predictionFunction(method = Model1$modelInfo,
modelFit = Model1$finalModel, newdata = test_df,
preProc = Model1$preProcess)
Model1_Pred = ifelse(Model1_Pred>.50,1,0)
confusionMatrix(factor(Model1_Pred),factor(ACT_Train1$target),dnn = c("Prediction", "Reference"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 12 8
## 1 10 10
##
## Accuracy : 0.55
## 95% CI : (0.3849, 0.7074)
## No Information Rate : 0.55
## P-Value [Acc > NIR] : 0.5651
##
## Kappa : 0.1
## Mcnemar's Test P-Value : 0.8137
##
## Sensitivity : 0.5455
## Specificity : 0.5556
## Pos Pred Value : 0.6000
## Neg Pred Value : 0.5000
## Prevalence : 0.5500
## Detection Rate : 0.3000
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.5505
##
## 'Positive' Class : 0
##
# Extract the resampled predictions to plot the ROC curve
Model1_Pred_Prob = Model1$pred
Model1_Pred_Samp = subset(Model1_Pred_Prob, as.character(Model1_Pred_Prob$rowIndex) %in% rownames(ACT_Train1))
Model1_Pred_Samp$FProb = ifelse(Model1_Pred_Samp$pred > .50, 1, 0)
MY_ROC(as.numeric(ACT_Train1$target), Model1_Pred_Samp$FProb, "Model1")
plot(Model1$finalModel)
For Model 2 I keep only the variables that were statistically significant at the 95% confidence level, again centering and scaling the predictors. This model has the best performance of the candidates, with an accuracy of .60 and an AUC of .66.
Model2 = train(target ~ ., data = train_df %>% select(-medv, -chas, -indus, -lstat),
               method = "glm", family = "binomial",
               preProcess = c("center", "scale"),
               trControl = trainControl(method = "cv", number = 10,
                                        savePredictions = TRUE),
               tuneLength = 5)
vif(Model2$finalModel)
## zn nox rm age dis rad tax ptratio
## 1.670350 2.884366 1.374048 1.436813 2.910479 1.699014 1.694409 1.501079
summary(Model2)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8945 -0.1982 -0.0028 0.0037 3.2577
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.4609 0.6741 3.651 0.000262 ***
## zn -1.4577 0.7082 -2.058 0.039555 *
## nox 4.6377 0.7354 6.306 2.86e-10 ***
## rm 0.5229 0.2385 2.192 0.028370 *
## age 0.6177 0.2783 2.220 0.026439 *
## dis 1.0108 0.4091 2.471 0.013489 *
## rad 6.2537 1.2836 4.872 1.10e-06 ***
## tax -1.4454 0.4417 -3.273 0.001065 **
## ptratio 0.4460 0.2135 2.089 0.036721 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 203.54 on 457 degrees of freedom
## AIC: 221.54
##
## Number of Fisher Scoring iterations: 9
Model2_Pred =predictionFunction(method = Model2$modelInfo,
modelFit = Model2$finalModel, newdata = test_df,
preProc = Model2$preProcess)
Model2_Pred = ifelse(Model2_Pred>.50,1,0)
confusionMatrix(factor(Model2_Pred),factor(ACT_Train2$target),dnn = c("Prediction", "Reference"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 11 8
## 1 8 13
##
## Accuracy : 0.6
## 95% CI : (0.4333, 0.7514)
## No Information Rate : 0.525
## P-Value [Acc > NIR] : 0.2148
##
## Kappa : 0.198
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.5789
## Specificity : 0.6190
## Pos Pred Value : 0.5789
## Neg Pred Value : 0.6190
## Prevalence : 0.4750
## Detection Rate : 0.2750
## Detection Prevalence : 0.4750
## Balanced Accuracy : 0.5990
##
## 'Positive' Class : 0
##
Model2_Pred_Prob =Model2$pred
Model2_Pred_Samp = subset(Model2_Pred_Prob, as.character(Model2_Pred_Prob$rowIndex) %in% rownames(ACT_Train2))
Model2_Pred_Samp$FProb = ifelse(Model2_Pred_Samp$pred>.50,1,0)
MY_ROC(as.numeric(ACT_Train2$target),Model2_Pred_Samp$FProb,"Model2")
plot(Model2$finalModel)
Model 3 is first fit with all variables to inspect the VIFs; medv is removed due to its high collinearity (VIF ≈ 7.8), along with indus, lstat, zn, and chas, and a Box-Cox transformation is applied to the data. The accuracy of this model is .50 and the AUC is .47.
Model3 = train(target ~ ., data = train_df,
               method = "glm", family = "binomial",
               preProcess = "BoxCox",
               trControl = trainControl(method = "cv", number = 10,
                                        savePredictions = TRUE),
               tuneLength = 5)
vif(Model3$finalModel)
## zn indus chas1 nox rm age dis rad
## 1.478284 2.873271 1.283693 3.946585 4.618582 2.587742 3.683383 2.176111
## tax ptratio lstat medv
## 2.788049 2.492968 3.778851 7.806063
Model3 = train(target ~ ., data = train_df %>% select(-medv, -indus, -lstat, -zn, -chas),
               method = "glm", family = "binomial",
               preProcess = "BoxCox",
               trControl = trainControl(method = "cv", number = 10,
                                        savePredictions = TRUE),
               tuneLength = 5)
summary(Model3)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.85957 -0.19588 -0.00324 0.10167 3.07338
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 52.181688 31.688034 1.647 0.09961 .
## nox 12.215774 1.884958 6.481 9.13e-11 ***
## rm 3.760475 1.465400 2.566 0.01028 *
## age 0.006777 0.002862 2.368 0.01789 *
## dis 2.018190 0.708140 2.850 0.00437 **
## rad 3.157203 0.650091 4.857 1.19e-06 ***
## tax -32.647557 16.551092 -1.973 0.04855 *
## ptratio 0.014092 0.005091 2.768 0.00564 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 211.09 on 458 degrees of freedom
## AIC: 227.09
##
## Number of Fisher Scoring iterations: 8
Model3_Pred =predictionFunction(method = Model3$modelInfo,
modelFit = Model3$finalModel, newdata = test_df,
preProc = Model3$preProcess)
Model3_Pred = ifelse(Model3_Pred>.50,1,0)
confusionMatrix(factor(Model3_Pred),factor(ACT_Train3$target),dnn = c("Prediction", "Reference"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8 12
## 1 8 12
##
## Accuracy : 0.5
## 95% CI : (0.338, 0.662)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.9256
##
## Kappa : 0
## Mcnemar's Test P-Value : 0.5023
##
## Sensitivity : 0.5
## Specificity : 0.5
## Pos Pred Value : 0.4
## Neg Pred Value : 0.6
## Prevalence : 0.4
## Detection Rate : 0.2
## Detection Prevalence : 0.5
## Balanced Accuracy : 0.5
##
## 'Positive' Class : 0
##
Model3_Pred_Prob =Model3$pred
Model3_Pred_Samp = subset(Model3_Pred_Prob, as.character(Model3_Pred_Prob$rowIndex) %in% rownames(ACT_Train3))
Model3_Pred_Samp$FProb = ifelse(Model3_Pred_Samp$pred>.50,1,0)
MY_ROC(as.numeric(ACT_Train3$target),Model3_Pred_Samp$FProb,"Model3")
plot(Model3$finalModel)
Following the modest success of the second model, I built a fourth model keeping only the most significant variables from Model 2 (nox, rad, and tax). The predictors are again centered and scaled, and I evaluate against the same sample used for Model 2. The model performed worse than the second model, with an accuracy of .45 and an AUC of .51.
Model4 = train(target ~ ., data = train_df %>% select(-medv, -chas, -indus, -lstat, -zn, -rm, -dis, -age, -ptratio),
               method = "glm", family = "binomial",
               preProcess = c("center", "scale"),
               trControl = trainControl(method = "cv", number = 10,
                                        savePredictions = TRUE),
               tuneLength = 5)
vif(Model4$finalModel)
## nox rad tax
## 1.743826 1.294781 1.543001
summary(Model4)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.89721 -0.27798 -0.03997 0.00557 2.55954
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.6256 0.5776 4.546 5.47e-06 ***
## nox 4.1572 0.5278 7.877 3.35e-15 ***
## rad 5.5385 1.0375 5.338 9.38e-08 ***
## tax -1.3677 0.3916 -3.493 0.000478 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 224.47 on 462 degrees of freedom
## AIC: 232.47
##
## Number of Fisher Scoring iterations: 8
Model4_Pred =predictionFunction(method = Model4$modelInfo,
modelFit = Model4$finalModel, newdata = test_df,
preProc = Model4$preProcess)
Model4_Pred = ifelse(Model4_Pred>.50,1,0)
confusionMatrix(factor(Model4_Pred),factor(ACT_Train2$target),dnn = c("Prediction", "Reference"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9 12
## 1 10 9
##
## Accuracy : 0.45
## 95% CI : (0.2926, 0.6151)
## No Information Rate : 0.525
## P-Value [Acc > NIR] : 0.8661
##
## Kappa : -0.0973
## Mcnemar's Test P-Value : 0.8312
##
## Sensitivity : 0.4737
## Specificity : 0.4286
## Pos Pred Value : 0.4286
## Neg Pred Value : 0.4737
## Prevalence : 0.4750
## Detection Rate : 0.2250
## Detection Prevalence : 0.5250
## Balanced Accuracy : 0.4511
##
## 'Positive' Class : 0
##
Model4_Pred_Prob =Model4$pred
Model4_Pred_Samp = subset(Model4_Pred_Prob, as.character(Model4_Pred_Prob$rowIndex) %in% rownames(ACT_Train2))
Model4_Pred_Samp$FProb = ifelse(Model4_Pred_Samp$pred>.50,1,0)
MY_ROC(as.numeric(ACT_Train2$target),Model4_Pred_Samp$FProb,"Model4")
plot(Model4$finalModel)
Decide on the criteria for selecting the best binary logistic regression model. Will you select models with slightly worse performance if it makes more sense or is more parsimonious? Discuss why you selected your model.
My best model was Model 2, chosen based on accuracy and AUC. All of the models' predictions were compared to a random 40-case sample. I like the AUC measure because it can be translated into a letter grade: Model 2's AUC of .66 corresponds to roughly a D+. I believe the measured performance is low partly because of the small 40-case evaluation samples.
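As a cross-check on the hand-rolled AUC (a sketch using the pROC package loaded at the top; it reuses the Model 2 sample objects created above), the AUC can also be computed directly:
# Sketch: compare MY_ROC's trapezoidal AUC against pROC's estimate
roc_obj = pROC::roc(ACT_Train2$target, Model2_Pred_Samp$FProb)
pROC::auc(roc_obj)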
Model1_con = confusionMatrix(factor(Model1_Pred),factor(ACT_Train1$target),dnn = c("Prediction", "Reference"))
Model2_con = confusionMatrix(factor(Model2_Pred),factor(ACT_Train2$target),dnn = c("Prediction", "Reference"))
Model3_con = confusionMatrix(factor(Model3_Pred),factor(ACT_Train3$target),dnn = c("Prediction", "Reference"))
Model4_con=confusionMatrix(factor(Model4_Pred),factor(ACT_Train2$target),dnn = c("Prediction", "Reference"))
Comp= data.frame(Model1_con$byClass,Model2_con$byClass,Model3_con$byClass,Model4_con$byClass) %>% slice(c(1,2,5,7 ))
row.names(Comp) = c("Sensitivity", "Specificity", "Precision", "F1")
colnames(Comp) = c("Model1","Model2","Model3","Model4")
knitr::kable(Comp, format = "latex", booktabs = T)
MY_ROC(as.numeric(ACT_Train1$target),Model1_Pred_Samp$FProb,"Model1")
MY_ROC(as.numeric(ACT_Train2$target),Model2_Pred_Samp$FProb,"Model2")
MY_ROC(as.numeric(ACT_Train3$target),Model3_Pred_Samp$FProb,"Model3")
MY_ROC(as.numeric(ACT_Train2$target),Model4_Pred_Samp$FProb,"Model4")
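Finally, to produce the assigned probabilities and classifications for the evaluation data set at the 0.5 threshold (a sketch; it assumes Model 2 as the final model, and the output file name is hypothetical):
# Score the evaluation set with the chosen model and apply the 0.5 threshold
eval_prob = predictionFunction(method = Model2$modelInfo,
                               modelFit = Model2$finalModel,
                               newdata = test_df,
                               preProc = Model2$preProcess)
eval_class = ifelse(eval_prob > 0.5, 1, 0)
write.csv(data.frame(probability = eval_prob, classification = eval_class),
          "crime_eval_predictions.csv", row.names = FALSE)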