zn: proportion of residential land zoned for large lots (over 25000 square feet) (predictor variable)
indus: proportion of non-retail business acres per suburb (predictor variable)
chas: a dummy var. for whether the suburb borders the Charles River (1) or not (0) (predictor variable)
nox: nitrogen oxides concentration (parts per 10 million) (predictor variable)
rm: average number of rooms per dwelling (predictor variable)
age: proportion of owner-occupied units built prior to 1940 (predictor variable)
dis: weighted mean of distances to five Boston employment centers (predictor variable)
rad: index of accessibility to radial highways (predictor variable)
tax: full-value property-tax rate per $10,000 (predictor variable)
ptratio: pupil-teacher ratio by town (predictor variable)
black: 1000(Bk - 0.63)2 where Bk is the proportion of blacks by town (predictor variable)
lstat: lower status of the population (percent) (predictor variable)
medv: median value of owner-occupied homes in $1000s (predictor variable)
target: whether the crime rate is above the median crime rate (1) or not (0) (response variable)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
library(pROC)
## Warning: package 'pROC' was built under R version 3.6.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.6.2
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
crime_raw <- read.csv("https://raw.githubusercontent.com/deepakmongia/Data621/master/HW-3/Data/crime-training-data_modified.csv",
header = TRUE)
print(dim(crime_raw))
## [1] 466 13
print(str(crime_raw))
## 'data.frame': 466 obs. of 13 variables:
## $ zn : num 0 0 0 30 0 0 0 0 0 80 ...
## $ indus : num 19.58 19.58 18.1 4.93 2.46 ...
## $ chas : int 0 1 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.605 0.871 0.74 0.428 0.488 0.52 0.693 0.693 0.515 0.392 ...
## $ rm : num 7.93 5.4 6.49 6.39 7.16 ...
## $ age : num 96.2 100 100 7.8 92.2 71.3 100 100 38.1 19.1 ...
## $ dis : num 2.05 1.32 1.98 7.04 2.7 ...
## $ rad : int 5 5 24 6 3 5 24 24 5 1 ...
## $ tax : int 403 403 666 300 193 384 666 666 224 315 ...
## $ ptratio: num 14.7 14.7 20.2 16.6 17.8 20.9 20.2 20.2 20.2 16.4 ...
## $ lstat : num 3.7 26.82 18.85 5.19 4.82 ...
## $ medv : num 50 13.4 15.4 23.7 37.9 26.5 5 7 22.2 20.9 ...
## $ target : int 1 1 1 0 0 0 1 1 0 0 ...
## NULL
print(summary(crime_raw))
## zn indus chas nox
## Min. : 0.00 Min. : 0.460 Min. :0.00000 Min. :0.3890
## 1st Qu.: 0.00 1st Qu.: 5.145 1st Qu.:0.00000 1st Qu.:0.4480
## Median : 0.00 Median : 9.690 Median :0.00000 Median :0.5380
## Mean : 11.58 Mean :11.105 Mean :0.07082 Mean :0.5543
## 3rd Qu.: 16.25 3rd Qu.:18.100 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :100.00 Max. :27.740 Max. :1.00000 Max. :0.8710
## rm age dis rad
## Min. :3.863 Min. : 2.90 Min. : 1.130 Min. : 1.00
## 1st Qu.:5.887 1st Qu.: 43.88 1st Qu.: 2.101 1st Qu.: 4.00
## Median :6.210 Median : 77.15 Median : 3.191 Median : 5.00
## Mean :6.291 Mean : 68.37 Mean : 3.796 Mean : 9.53
## 3rd Qu.:6.630 3rd Qu.: 94.10 3rd Qu.: 5.215 3rd Qu.:24.00
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.00
## tax ptratio lstat medv
## Min. :187.0 Min. :12.6 Min. : 1.730 Min. : 5.00
## 1st Qu.:281.0 1st Qu.:16.9 1st Qu.: 7.043 1st Qu.:17.02
## Median :334.5 Median :18.9 Median :11.350 Median :21.20
## Mean :409.5 Mean :18.4 Mean :12.631 Mean :22.59
## 3rd Qu.:666.0 3rd Qu.:20.2 3rd Qu.:16.930 3rd Qu.:25.00
## Max. :711.0 Max. :22.0 Max. :37.970 Max. :50.00
## target
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4914
## 3rd Qu.:1.0000
## Max. :1.0000
print(head(crime_raw))
## zn indus chas nox rm age dis rad tax ptratio lstat medv target
## 1 0 19.58 0 0.605 7.929 96.2 2.0459 5 403 14.7 3.70 50.0 1
## 2 0 19.58 1 0.871 5.403 100.0 1.3216 5 403 14.7 26.82 13.4 1
## 3 0 18.10 0 0.740 6.485 100.0 1.9784 24 666 20.2 18.85 15.4 1
## 4 30 4.93 0 0.428 6.393 7.8 7.0355 6 300 16.6 5.19 23.7 0
## 5 0 2.46 0 0.488 7.155 92.2 2.7006 3 193 17.8 4.82 37.9 0
## 6 0 8.56 0 0.520 6.781 71.3 2.8561 5 384 20.9 7.67 26.5 0
### Checking for NAs
any(is.na(crime_raw))
## [1] FALSE
Basic data manipulations:
crime_raw$target <- as.factor(crime_raw$target)
crime_raw$chas <- as.factor(crime_raw$chas)
table(crime_raw$target)
##
## 0 1
## 237 229
table(crime_raw$chas)
##
## 0 1
## 433 33
ggplot(crime_raw, aes(chas)) + geom_bar(aes(fill=chas))
ggplot(crime_raw, aes(target)) + geom_bar(aes(fill=target))
As we see above, the dataset is a balanced one.
## Box plots:
#gb1 <- boxplot(crime_raw$zn)
#gb2 <- boxplot(crime_raw$indus)
gb1 <- ggplot(data = crime_raw, aes(y = zn)) + geom_boxplot()
gb2 <- ggplot(data = crime_raw, aes(y = indus)) + geom_boxplot()
gb3 <- ggplot(data = crime_raw, aes(y = nox)) + geom_boxplot()
gb4 <- ggplot(data = crime_raw, aes(y = rm)) + geom_boxplot()
gb5 <- ggplot(data = crime_raw, aes(y = age)) + geom_boxplot()
gb6 <- ggplot(data = crime_raw, aes(y = dis)) + geom_boxplot()
gb7 <- ggplot(data = crime_raw, aes(y = rad)) + geom_boxplot()
gb8 <- ggplot(data = crime_raw, aes(y = tax)) + geom_boxplot()
gb9 <- ggplot(data = crime_raw, aes(y = ptratio)) + geom_boxplot()
gb10 <- ggplot(data = crime_raw, aes(y = lstat)) + geom_boxplot()
gb11 <- ggplot(data = crime_raw, aes(y = medv)) + geom_boxplot()
plot_grid(gb1, gb2, gb3, gb4, gb5, gb6, gb7, gb8, gb9, gb10, gb11, labels = "AUTO, scale = 10")
gb12 <- ggplot(data = crime_raw, aes(x = target, y = ptratio)) + geom_boxplot()
gb13 <- ggplot(data = crime_raw, aes(x = target, y = zn)) + geom_boxplot()
gb14 <- ggplot(data = crime_raw, aes(x = target, y = nox)) + geom_boxplot()
gb15 <- ggplot(data = crime_raw, aes(x = target, y = rm)) + geom_boxplot()
gb16 <- ggplot(data = crime_raw, aes(x = target, y = age)) + geom_boxplot()
gb17 <- ggplot(data = crime_raw, aes(x = target, y = dis)) + geom_boxplot()
gb18 <- ggplot(data = crime_raw, aes(x = target, y = rad)) + geom_boxplot()
gb19 <- ggplot(data = crime_raw, aes(x = target, y = tax)) + geom_boxplot()
gb20 <- ggplot(data = crime_raw, aes(x = target, y = lstat)) + geom_boxplot()
gb21 <- ggplot(data = crime_raw, aes(x = target, y = medv)) + geom_boxplot()
plot_grid(gb12, gb13, gb14, gb15, gb16, gb17, gb18, gb19, gb20, gb21, labels = "AUTO, scale = 10")
crime_raw %>%
gather(variable, value, zn:indus, nox:medv) %>%
ggplot(., aes(value)) +
geom_density(fill = "dodgerblue4", color="dodgerblue4") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = element_blank())
crime_raw$chas <- as.numeric(as.character(crime_raw$chas))
corrMatrix <- round(cor(crime_raw %>% select(-target)),4)
corrMatrix %>% corrplot(., method = "color", outline = T, addgrid.col = "darkgray", order="hclust", addrect = 4, rect.col = "black", rect.lwd = 5,cl.pos = "b", tl.col = "indianred4", tl.cex = 1.0, cl.cex = 1.0, addCoef.col = "white", number.digits = 2, number.cex = 0.8, col = colorRampPalette(c("darkred","white","dodgerblue4"))(100))
The correlation between the 2 variables - rad and tax is very high (0.91) as we see from the above plot
#### SPLIT INTO TRAIN/TEST
n <- nrow(crime_raw)
set.seed(123)
crime_raw_random <- crime_raw[sample(nrow(crime_raw)), ]
crime.train.df <- crime_raw_random[1:as.integer(0.7*n),]
crime.test.df <- crime_raw_random[as.integer(0.7*n +1):n, ]
table(crime.test.df$target) / nrow(crime.test.df)
##
## 0 1
## 0.45 0.55
table(crime.train.df$target) / nrow(crime.train.df)
##
## 0 1
## 0.5337423 0.4662577
logitModel1 <- glm(target~., data = crime.train.df,
family = binomial(link = "logit"))
summary(logitModel1)
##
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"),
## data = crime.train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8819 -0.2986 -0.0027 0.0029 3.5214
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -41.983392 8.105219 -5.180 2.22e-07 ***
## zn -0.087468 0.045684 -1.915 0.05554 .
## indus -0.057692 0.056550 -1.020 0.30764
## chas 0.156800 0.907434 0.173 0.86281
## nox 50.109343 9.730940 5.149 2.61e-07 ***
## rm -0.236922 0.885677 -0.268 0.78908
## age 0.016422 0.015418 1.065 0.28684
## dis 0.796156 0.267080 2.981 0.00287 **
## rad 0.618006 0.194650 3.175 0.00150 **
## tax -0.004061 0.003410 -1.191 0.23376
## ptratio 0.344624 0.140971 2.445 0.01450 *
## lstat 0.048575 0.066647 0.729 0.46610
## medv 0.183195 0.084705 2.163 0.03056 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.45 on 325 degrees of freedom
## Residual deviance: 141.31 on 313 degrees of freedom
## AIC: 167.31
##
## Number of Fisher Scoring iterations: 9
Predicting using the new model:
predict_model1 <- predict(logitModel1, newdata = crime.test.df, type = "response")
predict_model1_class <- ifelse(predict_model1 > 0.5, 1, 0)
xtabs(~predict_model1_class + crime.test.df$target)
## crime.test.df$target
## predict_model1_class 0 1
## 0 59 8
## 1 4 69
predict_model1_train <- predict(logitModel1, newdata = crime.train.df, type = "response")
predict_model1_train_class <- ifelse(predict_model1_train > 0.5, 1, 0)
xtabs(~predict_model1_train_class + crime.train.df$target)
## crime.train.df$target
## predict_model1_train_class 0 1
## 0 165 19
## 1 9 133
roc(crime.test.df$target, predict_model1, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = crime.test.df$target, predictor = predict_model1, plot = TRUE)
##
## Data: predict_model1 in 63 controls (crime.test.df$target 0) < 77 cases (crime.test.df$target 1).
## Area under the curve: 0.9738
So, even if the predictions are quite good, and the ROC curve is also impressive, but many variables are not statistically significant, so we will remove those variables and see how the model behaves. We are removing the following variables: zn indus chas rm age lstat
logitModel2 <- glm(target~nox + dis + rad + tax + ptratio + medv,
data = crime.train.df,
family = binomial(link = "logit"))
summary(logitModel2)
##
## Call:
## glm(formula = target ~ nox + dis + rad + tax + ptratio + medv,
## family = binomial(link = "logit"), data = crime.train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.91968 -0.35659 -0.05808 0.00390 2.93587
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -35.728906 6.369583 -5.609 2.03e-08 ***
## nox 45.909956 7.859476 5.841 5.18e-09 ***
## dis 0.384532 0.183818 2.092 0.036446 *
## rad 0.617050 0.159935 3.858 0.000114 ***
## tax -0.005390 0.002877 -1.874 0.060983 .
## ptratio 0.307081 0.121067 2.536 0.011198 *
## medv 0.097944 0.037484 2.613 0.008977 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.45 on 325 degrees of freedom
## Residual deviance: 151.10 on 319 degrees of freedom
## AIC: 165.1
##
## Number of Fisher Scoring iterations: 9
Predicting using the new model
predict_model2 <- predict(logitModel2, newdata = crime.test.df, type = "response")
predict_model2_class <- ifelse(predict_model2 > 0.5, 1, 0)
print(xtabs(~predict_model2_class + crime.test.df$target))
## crime.test.df$target
## predict_model2_class 0 1
## 0 60 13
## 1 3 64
predict_model2_train <- predict(logitModel2, newdata = crime.train.df, type = "response")
predict_model2_train_class <- ifelse(predict_model2_train > 0.5, 1, 0)
print(xtabs(~predict_model2_train_class + crime.train.df$target))
## crime.train.df$target
## predict_model2_train_class 0 1
## 0 165 25
## 1 9 127
ROC curve:
roc(crime.test.df$target, predict_model2, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = crime.test.df$target, predictor = predict_model2, plot = TRUE)
##
## Data: predict_model2 in 63 controls (crime.test.df$target 0) < 77 cases (crime.test.df$target 1).
## Area under the curve: 0.9703
logitModel3 <- glm(target~nox + rad + tax + ptratio + medv,
data = crime.train.df,
family = binomial(link = "logit"))
summary(logitModel3)
##
## Call:
## glm(formula = target ~ nox + rad + tax + ptratio + medv, family = binomial(link = "logit"),
## data = crime.train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.91362 -0.34475 -0.05283 0.00412 2.79125
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -28.127913 4.837362 -5.815 6.07e-09 ***
## nox 36.085713 5.640376 6.398 1.58e-10 ***
## rad 0.637982 0.158309 4.030 5.58e-05 ***
## tax -0.005790 0.002852 -2.030 0.0423 *
## ptratio 0.280625 0.121736 2.305 0.0212 *
## medv 0.078989 0.035496 2.225 0.0261 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.45 on 325 degrees of freedom
## Residual deviance: 155.42 on 320 degrees of freedom
## AIC: 167.42
##
## Number of Fisher Scoring iterations: 8
Predicting using the new model
predict_model3 <- predict(logitModel3, newdata = crime.test.df, type = "response")
predict_model3_class <- ifelse(predict_model3 > 0.5, 1, 0)
table(predict_model3_class, crime.test.df$target)
##
## predict_model3_class 0 1
## 0 62 13
## 1 1 64
xtabs(~predict_model3_class + crime.test.df$target)
## crime.test.df$target
## predict_model3_class 0 1
## 0 62 13
## 1 1 64
predict_model3_train <- predict(logitModel3, newdata = crime.train.df, type = "response")
predict_model3_train_class <- ifelse(predict_model3_train > 0.5, 1, 0)
xtabs(~predict_model3_train_class + crime.train.df$target)
## crime.train.df$target
## predict_model3_train_class 0 1
## 0 164 24
## 1 10 128
ROC curve:
roc(crime.test.df$target, predict_model3, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = crime.test.df$target, predictor = predict_model3, plot = TRUE)
##
## Data: predict_model3 in 63 controls (crime.test.df$target 0) < 77 cases (crime.test.df$target 1).
## Area under the curve: 0.9695
logitModel4 <- glm(target~nox + rad + tax + ptratio,
data = crime.train.df,
family = binomial(link = "logit"))
summary(logitModel4)
##
## Call:
## glm(formula = target ~ nox + rad + tax + ptratio, family = binomial(link = "logit"),
## data = crime.train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.89290 -0.35436 -0.04510 0.00307 2.69601
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -23.147709 4.081758 -5.671 1.42e-08 ***
## nox 35.578830 5.566376 6.392 1.64e-10 ***
## rad 0.694694 0.160269 4.335 1.46e-05 ***
## tax -0.007481 0.002760 -2.710 0.00672 **
## ptratio 0.140641 0.099360 1.415 0.15693
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.45 on 325 degrees of freedom
## Residual deviance: 161.10 on 321 degrees of freedom
## AIC: 171.1
##
## Number of Fisher Scoring iterations: 8
predict_model4 <- predict(logitModel4, newdata = crime.test.df, type = "response")
predict_model4_class <- ifelse(predict_model4 > 0.5, 1, 0)
xtabs(~predict_model4_class + crime.test.df$target)
## crime.test.df$target
## predict_model4_class 0 1
## 0 62 13
## 1 1 64
predict_model4_train <- predict(logitModel4, newdata = crime.train.df, type = "response")
predict_model4_train_class <- ifelse(predict_model4_train > 0.5, 1, 0)
xtabs(~predict_model4_train_class + crime.train.df$target)
## crime.train.df$target
## predict_model4_train_class 0 1
## 0 160 24
## 1 14 128
ROC curve:
roc(crime.test.df$target, predict_model4, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = crime.test.df$target, predictor = predict_model4, plot = TRUE)
##
## Data: predict_model4 in 63 controls (crime.test.df$target 0) < 77 cases (crime.test.df$target 1).
## Area under the curve: 0.9746
logitModel5 <- glm(target~nox + rad + tax,
data = crime.train.df,
family = binomial(link = "logit"))
summary(logitModel5)
##
## Call:
## glm(formula = target ~ nox + rad + tax, family = binomial(link = "logit"),
## data = crime.train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.79655 -0.34773 -0.06110 0.00649 2.63274
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -19.442575 2.759880 -7.045 1.86e-12 ***
## nox 33.650124 5.102152 6.595 4.24e-11 ***
## rad 0.616817 0.144281 4.275 1.91e-05 ***
## tax -0.006527 0.002644 -2.469 0.0136 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.45 on 325 degrees of freedom
## Residual deviance: 163.12 on 322 degrees of freedom
## AIC: 171.12
##
## Number of Fisher Scoring iterations: 8
predict_model5 <- predict(logitModel5, newdata = crime.test.df, type = "response")
predict_model5_class <- ifelse(predict_model5 > 0.5, 1, 0)
xtabs(~predict_model5_class + crime.test.df$target)
## crime.test.df$target
## predict_model5_class 0 1
## 0 62 13
## 1 1 64
predict_model5_train <- predict(logitModel5, newdata = crime.train.df, type = "response")
predict_model5_train_class <- ifelse(predict_model5_train > 0.5, 1, 0)
xtabs(~predict_model5_train_class + crime.train.df$target)
## crime.train.df$target
## predict_model5_train_class 0 1
## 0 160 24
## 1 14 128
roc(crime.test.df$target, predict_model5, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = crime.test.df$target, predictor = predict_model5, plot = TRUE)
##
## Data: predict_model5 in 63 controls (crime.test.df$target 0) < 77 cases (crime.test.df$target 1).
## Area under the curve: 0.973
As all the models we built have around the same value of AUC, we will decide to go with model2 as it has the lowest AIC.
summary(logitModel2)
##
## Call:
## glm(formula = target ~ nox + dis + rad + tax + ptratio + medv,
## family = binomial(link = "logit"), data = crime.train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.91968 -0.35659 -0.05808 0.00390 2.93587
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -35.728906 6.369583 -5.609 2.03e-08 ***
## nox 45.909956 7.859476 5.841 5.18e-09 ***
## dis 0.384532 0.183818 2.092 0.036446 *
## rad 0.617050 0.159935 3.858 0.000114 ***
## tax -0.005390 0.002877 -1.874 0.060983 .
## ptratio 0.307081 0.121067 2.536 0.011198 *
## medv 0.097944 0.037484 2.613 0.008977 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.45 on 325 degrees of freedom
## Residual deviance: 151.10 on 319 degrees of freedom
## AIC: 165.1
##
## Number of Fisher Scoring iterations: 9
We will now load the evaluation dataset, and predict the data:
crime_evaluation_ds <- read.csv("https://raw.githubusercontent.com/deepakmongia/Data621/master/HW-3/Data/crime-evaluation-data_modified.csv",
header = TRUE)
crime_evaluation_ds$pred_prob <- predict(logitModel2, newdata = crime_evaluation_ds, type = "response" )
crime_evaluation_ds$pred_class <- ifelse(crime_evaluation_ds$pred_prob > 0.5, 1, 0)
table(crime_evaluation_ds$pred_class)
##
## 0 1
## 23 17
print(crime_evaluation_ds)
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 2 0 8.14 0 0.538 6.096 84.5 4.4619 4 307 21.0 10.26 18.2
## 3 0 8.14 0 0.538 6.495 94.4 4.4547 4 307 21.0 12.80 18.4
## 4 0 8.14 0 0.538 5.950 82.0 3.9900 4 307 21.0 27.71 13.2
## 5 0 5.96 0 0.499 5.850 41.5 3.9342 5 279 19.2 8.77 21.0
## 6 25 5.13 0 0.453 5.741 66.2 7.2254 8 284 19.7 13.15 18.7
## 7 25 5.13 0 0.453 5.966 93.4 6.8185 8 284 19.7 14.44 16.0
## 8 0 4.49 0 0.449 6.630 56.1 4.4377 3 247 18.5 6.53 26.6
## 9 0 4.49 0 0.449 6.121 56.8 3.7476 3 247 18.5 8.44 22.2
## 10 0 2.89 0 0.445 6.163 69.6 3.4952 2 276 18.0 11.34 21.4
## 11 0 25.65 0 0.581 5.856 97.0 1.9444 2 188 19.1 25.41 17.3
## 12 0 25.65 0 0.581 5.613 95.6 1.7572 2 188 19.1 27.26 15.7
## 13 0 21.89 0 0.624 5.637 94.7 1.9799 4 437 21.2 18.34 14.3
## 14 0 19.58 0 0.605 6.101 93.0 2.2834 5 403 14.7 9.81 25.0
## 15 0 19.58 0 0.605 5.880 97.3 2.3887 5 403 14.7 12.03 19.1
## 16 0 10.59 1 0.489 5.960 92.1 3.8771 4 277 18.6 17.27 21.7
## 17 0 6.20 0 0.504 6.552 21.4 3.3751 8 307 17.4 3.76 31.5
## 18 0 6.20 0 0.507 8.247 70.4 3.6519 8 307 17.4 3.95 48.3
## 19 22 5.86 0 0.431 6.957 6.8 8.9067 7 330 19.1 3.53 29.6
## 20 90 2.97 0 0.400 7.088 20.8 7.3073 1 285 15.3 7.85 32.2
## 21 80 1.76 0 0.385 6.230 31.5 9.0892 1 241 18.2 12.93 20.1
## 22 33 2.18 0 0.472 6.616 58.1 3.3700 7 222 18.4 8.93 28.4
## 23 0 9.90 0 0.544 6.122 52.8 2.6403 4 304 18.4 5.98 22.1
## 24 0 7.38 0 0.493 6.415 40.1 4.7211 5 287 19.6 6.12 25.0
## 25 0 7.38 0 0.493 6.312 28.9 5.4159 5 287 19.6 6.15 23.0
## 26 0 5.19 0 0.515 5.895 59.6 5.6150 5 224 20.2 10.56 18.5
## 27 80 2.01 0 0.435 6.635 29.7 8.3440 4 280 17.0 5.99 24.5
## 28 0 18.10 0 0.718 3.561 87.9 1.6132 24 666 20.2 7.12 27.5
## 29 0 18.10 1 0.631 7.016 97.5 1.2024 24 666 20.2 2.96 50.0
## 30 0 18.10 0 0.584 6.348 86.1 2.0527 24 666 20.2 17.64 14.5
## 31 0 18.10 0 0.740 5.935 87.9 1.8206 24 666 20.2 34.02 8.4
## 32 0 18.10 0 0.740 5.627 93.9 1.8172 24 666 20.2 22.88 12.8
## 33 0 18.10 0 0.740 5.818 92.4 1.8662 24 666 20.2 22.11 10.5
## 34 0 18.10 0 0.740 6.219 100.0 2.0048 24 666 20.2 16.59 18.4
## 35 0 18.10 0 0.740 5.854 96.6 1.8956 24 666 20.2 23.79 10.8
## 36 0 18.10 0 0.713 6.525 86.5 2.4358 24 666 20.2 18.13 14.1
## 37 0 18.10 0 0.713 6.376 88.4 2.5671 24 666 20.2 14.65 17.7
## 38 0 18.10 0 0.655 6.209 65.4 2.9634 24 666 20.2 13.22 21.4
## 39 0 9.69 0 0.585 5.794 70.6 2.8927 6 391 19.2 14.10 18.3
## 40 0 11.93 0 0.573 6.976 91.0 2.1675 1 273 21.0 5.64 23.9
## pred_prob pred_class
## 1 0.0295188621 0
## 2 0.4331749343 0
## 3 0.4373093911 0
## 4 0.2808827568 0
## 5 0.1451889623 0
## 6 0.2959493870 0
## 7 0.2162635232 0
## 8 0.0099240675 0
## 9 0.0049710763 0
## 10 0.0013788968 0
## 11 0.3711904838 0
## 12 0.3195573183 0
## 13 0.8460114100 1
## 14 0.6900500124 1
## 15 0.5653681074 1
## 16 0.0485284815 0
## 17 0.6029088158 1
## 18 0.9094731671 1
## 19 0.2293773199 0
## 20 0.0004892936 0
## 21 0.0004604139 0
## 22 0.2298941619 0
## 23 0.2508332220 0
## 24 0.2185338360 0
## 25 0.2309516435 0
## 26 0.4916801477 0
## 27 0.0185130753 0
## 28 0.9999999843 1
## 29 0.9999998899 1
## 30 0.9999777781 1
## 31 0.9999999658 1
## 32 0.9999999777 1
## 33 0.9999999726 1
## 34 0.9999999880 1
## 35 0.9999999737 1
## 36 0.9999999466 1
## 37 0.9999999643 1
## 38 0.9999996941 1
## 39 0.8211011791 1
## 40 0.3421029917 0