library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
options(warn=-1)
set.seed(42)
data <- read.csv("https://raw.githubusercontent.com/KevinJpotter/data_621/main/crime-training-data_modified.csv?token=AK4GIZFNBXCUEC2JBODWSJTAPWMFG")
df <- data.frame(data)
trainIndex <- createDataPartition(df$target, p = .7,
list = FALSE,
times = 1)
train <- df[ trainIndex,]
X_train <- scale(subset(train, select=-target))
test <- df[-trainIndex,]
X_test <- scale(subset(test, select=-target))
summary(df)
## zn indus chas nox
## Min. : 0.00 Min. : 0.460 Min. :0.00000 Min. :0.3890
## 1st Qu.: 0.00 1st Qu.: 5.145 1st Qu.:0.00000 1st Qu.:0.4480
## Median : 0.00 Median : 9.690 Median :0.00000 Median :0.5380
## Mean : 11.58 Mean :11.105 Mean :0.07082 Mean :0.5543
## 3rd Qu.: 16.25 3rd Qu.:18.100 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :100.00 Max. :27.740 Max. :1.00000 Max. :0.8710
## rm age dis rad
## Min. :3.863 Min. : 2.90 Min. : 1.130 Min. : 1.00
## 1st Qu.:5.887 1st Qu.: 43.88 1st Qu.: 2.101 1st Qu.: 4.00
## Median :6.210 Median : 77.15 Median : 3.191 Median : 5.00
## Mean :6.291 Mean : 68.37 Mean : 3.796 Mean : 9.53
## 3rd Qu.:6.630 3rd Qu.: 94.10 3rd Qu.: 5.215 3rd Qu.:24.00
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.00
## tax ptratio lstat medv
## Min. :187.0 Min. :12.6 Min. : 1.730 Min. : 5.00
## 1st Qu.:281.0 1st Qu.:16.9 1st Qu.: 7.043 1st Qu.:17.02
## Median :334.5 Median :18.9 Median :11.350 Median :21.20
## Mean :409.5 Mean :18.4 Mean :12.631 Mean :22.59
## 3rd Qu.:666.0 3rd Qu.:20.2 3rd Qu.:16.930 3rd Qu.:25.00
## Max. :711.0 Max. :22.0 Max. :37.970 Max. :50.00
## target
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4914
## 3rd Qu.:1.0000
## Max. :1.0000
sapply(df, function(x) sum(is.na(x)))
## zn indus chas nox rm age dis rad tax ptratio
## 0 0 0 0 0 0 0 0 0 0
## lstat medv target
## 0 0 0
boxplot(subset(X_train, select=-chas), fill='target', col ='blue', las=2, horizontal=TRUE)

pairs.panels(X_train,
method = "pearson", # correlation method
hist.col = "#00AFBB",
)

GLM
mylogit <- glm(target~., data = train, family = "binomial")
preds <- round(as.numeric(predict(mylogit, newdata = data.frame(X_test), type = "response")), digits=0)
confusionMatrix(table(factor(preds, levels=min(test$target):max(test$target)),
factor(test$target, levels=min(test$target):max(test$target))),
positive ='1')
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 62 45
## 1 0 32
##
## Accuracy : 0.6763
## 95% CI : (0.5917, 0.7531)
## No Information Rate : 0.554
## P-Value [Acc > NIR] : 0.002187
##
## Kappa : 0.3881
##
## Mcnemar's Test P-Value : 5.412e-11
##
## Sensitivity : 0.4156
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.5794
## Prevalence : 0.5540
## Detection Rate : 0.2302
## Detection Prevalence : 0.2302
## Balanced Accuracy : 0.7078
##
## 'Positive' Class : 1
##
GLMNET w/ Lasso FS
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-1
# Train the model
cvfit = cv.glmnet(x=X_train, train$target, alpha=1, family="binomial", type.measure ='auc')
plot(cvfit)

preds <- as.numeric(predict(cvfit, newx = X_test, s = "lambda.min", type = "class"))
confusionMatrix(table(factor(preds, levels=min(test$target):max(test$target)),
factor(test$target, levels=min(test$target):max(test$target))),
positive ='1')
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 59 13
## 1 3 64
##
## Accuracy : 0.8849
## 95% CI : (0.8198, 0.9328)
## No Information Rate : 0.554
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7707
##
## Mcnemar's Test P-Value : 0.02445
##
## Sensitivity : 0.8312
## Specificity : 0.9516
## Pos Pred Value : 0.9552
## Neg Pred Value : 0.8194
## Prevalence : 0.5540
## Detection Rate : 0.4604
## Detection Prevalence : 0.4820
## Balanced Accuracy : 0.8914
##
## 'Positive' Class : 1
##
RFE using RF
control <- rfeControl(functions = rfFuncs, # random forest
method = "repeatedcv", # repeated cv
repeats = 5, # number of repeats
number = 10) # number of folds
# Setting the cross validation parameters
ctrl_param <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
repeats = 5,
verbose = FALSE,
number = 5)
#
result_rfe1 <- rfe(x = X_train,
y = train$target,
sizes = c(1:13),
rfeControl = control)
varimp_data <- data.frame(feature = row.names(varImp(result_rfe1))[1:8],
importance = varImp(result_rfe1)[1:8, 1])
ggplot(data = varimp_data,
aes(x = reorder(feature, -importance), y = importance, fill = feature)) +
geom_bar(stat="identity") + labs(x = "Features", y = "Variable Importance") +
geom_text(aes(label = round(importance, 2)), vjust=1.6, color="white", size=4) +
theme_bw() + theme(legend.position = "none")

mylogit <- glm(target~ nox + rad + tax + indus + dis + ptratio + age + rm, data = train, family = "binomial")
preds <- round(as.numeric(predict(mylogit, newdata = data.frame(X_test), type = "response")), digits=0)
confusionMatrix(table(factor(preds, levels=min(test$target):max(test$target)),
factor(test$target, levels=min(test$target):max(test$target))), positive ='1')
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 62 45
## 1 0 32
##
## Accuracy : 0.6763
## 95% CI : (0.5917, 0.7531)
## No Information Rate : 0.554
## P-Value [Acc > NIR] : 0.002187
##
## Kappa : 0.3881
##
## Mcnemar's Test P-Value : 5.412e-11
##
## Sensitivity : 0.4156
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.5794
## Prevalence : 0.5540
## Detection Rate : 0.2302
## Detection Prevalence : 0.2302
## Balanced Accuracy : 0.7078
##
## 'Positive' Class : 1
##