library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
options(warn=-1)
set.seed(42)


df <- read.csv("https://raw.githubusercontent.com/KevinJpotter/data_621/main/crime-training-data_modified.csv?token=AK4GIZFNBXCUEC2JBODWSJTAPWMFG")

trainIndex <- createDataPartition(df$target, p = .7,
                                  list = FALSE,
                                  times = 1)
train <- df[ trainIndex,]
X_train <- scale(subset(train, select=-target))
test <- df[-trainIndex,]
X_test <- scale(subset(test, select=-target))
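
Note that scale() is applied to the training and test sets independently, so each split is centred and scaled with its own means and standard deviations. A minimal sketch of the alternative, reusing the training statistics on the test set (X_test_aligned is a new name introduced here and is not used in the rest of this document):

# Alternative (not used below): scale the test predictors with the
# training-set means and standard deviations stored by scale(), so that
# train and test predictors share a common scale.
X_test_aligned <- scale(subset(test, select = -target),
                        center = attr(X_train, "scaled:center"),
                        scale  = attr(X_train, "scaled:scale"))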
summary(df)
##        zn             indus             chas              nox        
##  Min.   :  0.00   Min.   : 0.460   Min.   :0.00000   Min.   :0.3890  
##  1st Qu.:  0.00   1st Qu.: 5.145   1st Qu.:0.00000   1st Qu.:0.4480  
##  Median :  0.00   Median : 9.690   Median :0.00000   Median :0.5380  
##  Mean   : 11.58   Mean   :11.105   Mean   :0.07082   Mean   :0.5543  
##  3rd Qu.: 16.25   3rd Qu.:18.100   3rd Qu.:0.00000   3rd Qu.:0.6240  
##  Max.   :100.00   Max.   :27.740   Max.   :1.00000   Max.   :0.8710  
##        rm             age              dis              rad       
##  Min.   :3.863   Min.   :  2.90   Min.   : 1.130   Min.   : 1.00  
##  1st Qu.:5.887   1st Qu.: 43.88   1st Qu.: 2.101   1st Qu.: 4.00  
##  Median :6.210   Median : 77.15   Median : 3.191   Median : 5.00  
##  Mean   :6.291   Mean   : 68.37   Mean   : 3.796   Mean   : 9.53  
##  3rd Qu.:6.630   3rd Qu.: 94.10   3rd Qu.: 5.215   3rd Qu.:24.00  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.00  
##       tax           ptratio         lstat             medv      
##  Min.   :187.0   Min.   :12.6   Min.   : 1.730   Min.   : 5.00  
##  1st Qu.:281.0   1st Qu.:16.9   1st Qu.: 7.043   1st Qu.:17.02  
##  Median :334.5   Median :18.9   Median :11.350   Median :21.20  
##  Mean   :409.5   Mean   :18.4   Mean   :12.631   Mean   :22.59  
##  3rd Qu.:666.0   3rd Qu.:20.2   3rd Qu.:16.930   3rd Qu.:25.00  
##  Max.   :711.0   Max.   :22.0   Max.   :37.970   Max.   :50.00  
##      target      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.4914  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
sapply(df, function(x) sum(is.na(x)))
##      zn   indus    chas     nox      rm     age     dis     rad     tax ptratio 
##       0       0       0       0       0       0       0       0       0       0 
##   lstat    medv  target 
##       0       0       0
# Boxplots of the scaled predictors (chas dropped since it is a 0/1 indicator)
boxplot(subset(X_train, select = -chas), col = 'blue', las = 2, horizontal = TRUE)

pairs.panels(X_train,
             method = "pearson",   # correlation method
             hist.col = "#00AFBB")
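
The same relationships can be checked numerically; a short sketch that prints the correlation matrix of the scaled predictors, rounded for readability:

# Pearson correlations between the scaled predictors
round(cor(X_train), 2)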

GLM (logistic regression, all predictors)

mylogit <- glm(target~., data = train, family = "binomial")

preds <- round(as.numeric(predict(mylogit, newdata = data.frame(X_test), type = "response")), digits=0)

confusionMatrix(table(factor(preds, levels=min(test$target):max(test$target)), 
      factor(test$target, levels=min(test$target):max(test$target))),
       positive ='1')
## Confusion Matrix and Statistics
## 
##    
##      0  1
##   0 62 45
##   1  0 32
##                                           
##                Accuracy : 0.6763          
##                  95% CI : (0.5917, 0.7531)
##     No Information Rate : 0.554           
##     P-Value [Acc > NIR] : 0.002187        
##                                           
##                   Kappa : 0.3881          
##                                           
##  Mcnemar's Test P-Value : 5.412e-11       
##                                           
##             Sensitivity : 0.4156          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.5794          
##              Prevalence : 0.5540          
##          Detection Rate : 0.2302          
##    Detection Prevalence : 0.2302          
##       Balanced Accuracy : 0.7078          
##                                           
##        'Positive' Class : 1               
## 
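
Note that mylogit is fit on the unscaled train data frame but scored on the scaled X_test, so the test predictors are on a different scale than the fitted coefficients expect, which likely explains the low sensitivity above. A minimal sketch of a scale-consistent version (the object names are new and the resulting confusion matrix would differ from the one shown):

# Fit the logistic regression on the scaled training predictors so that
# the scaled test matrix can be scored on the same footing.
train_scaled <- data.frame(X_train, target = train$target)
logit_scaled <- glm(target ~ ., data = train_scaled, family = "binomial")
preds_scaled <- as.integer(predict(logit_scaled, newdata = data.frame(X_test),
                                   type = "response") > 0.5)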

GLMNET w/ Lasso Feature Selection

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-1
# Train the model: cross-validated lasso logistic regression, lambda tuned on AUC
cvfit <- cv.glmnet(x = X_train, y = train$target, alpha = 1, family = "binomial", type.measure = "auc")
plot(cvfit)
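
Since the point of the lasso here is feature selection, it is worth inspecting which coefficients survive at the chosen penalty; a small sketch using glmnet's coef() method:

# Coefficients at the lambda that minimises the cross-validated loss;
# predictors with a zero entry have been dropped by the lasso.
coef(cvfit, s = "lambda.min")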

preds <- as.numeric(predict(cvfit, newx = X_test, s = "lambda.min", type = "class"))
confusionMatrix(table(factor(preds, levels=min(test$target):max(test$target)), 
      factor(test$target, levels=min(test$target):max(test$target))),
      positive ='1')
## Confusion Matrix and Statistics
## 
##    
##      0  1
##   0 59 13
##   1  3 64
##                                           
##                Accuracy : 0.8849          
##                  95% CI : (0.8198, 0.9328)
##     No Information Rate : 0.554           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7707          
##                                           
##  Mcnemar's Test P-Value : 0.02445         
##                                           
##             Sensitivity : 0.8312          
##             Specificity : 0.9516          
##          Pos Pred Value : 0.9552          
##          Neg Pred Value : 0.8194          
##              Prevalence : 0.5540          
##          Detection Rate : 0.4604          
##    Detection Prevalence : 0.4820          
##       Balanced Accuracy : 0.8914          
##                                           
##        'Positive' Class : 1               
## 

RFE using Random Forest

# Cross-validation settings for recursive feature elimination
control <- rfeControl(functions = rfFuncs,   # random forest
                      method = "repeatedcv", # repeated cross-validation
                      repeats = 5,           # number of repeats
                      number = 10)           # number of folds

# Run RFE over candidate subset sizes of 1 through 13 predictors
result_rfe1 <- rfe(x = X_train,
                   y = train$target,
                   sizes = 1:13,
                   rfeControl = control)
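
The subset chosen by RFE can be listed directly before plotting the importances; a brief sketch using caret's accessors on the rfe object:

# Predictors retained in the best subset found by RFE, and the resampling
# performance profile across subset sizes
predictors(result_rfe1)
result_rfe1$results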

varimp_data <- data.frame(feature = row.names(varImp(result_rfe1))[1:8],
                          importance = varImp(result_rfe1)[1:8, 1])

ggplot(data = varimp_data, 
       aes(x = reorder(feature, -importance), y = importance, fill = feature)) +
  geom_bar(stat="identity") + labs(x = "Features", y = "Variable Importance") + 
  geom_text(aes(label = round(importance, 2)), vjust=1.6, color="white", size=4) + 
  theme_bw() + theme(legend.position = "none")

mylogit <- glm(target~ nox + rad + tax + indus + dis + ptratio + age + rm, data = train, family = "binomial")

preds <- round(as.numeric(predict(mylogit, newdata = data.frame(X_test), type = "response")), digits=0)

confusionMatrix(table(factor(preds, levels=min(test$target):max(test$target)), 
      factor(test$target, levels=min(test$target):max(test$target))), positive ='1')
## Confusion Matrix and Statistics
## 
##    
##      0  1
##   0 62 45
##   1  0 32
##                                           
##                Accuracy : 0.6763          
##                  95% CI : (0.5917, 0.7531)
##     No Information Rate : 0.554           
##     P-Value [Acc > NIR] : 0.002187        
##                                           
##                   Kappa : 0.3881          
##                                           
##  Mcnemar's Test P-Value : 5.412e-11       
##                                           
##             Sensitivity : 0.4156          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.5794          
##              Prevalence : 0.5540          
##          Detection Rate : 0.2302          
##    Detection Prevalence : 0.2302          
##       Balanced Accuracy : 0.7078          
##                                           
##        'Positive' Class : 1               
##
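
The confusion matrices above all use an implicit 0.5 cut-off from round(); a threshold-free comparison via ROC/AUC is sketched below, assuming the pROC package is installed (it is not loaded elsewhere in this document):

library(pROC)
# Predicted probabilities from the reduced logistic model on the scaled test set
probs <- as.numeric(predict(mylogit, newdata = data.frame(X_test), type = "response"))
roc_obj <- roc(response = test$target, predictor = probs)
auc(roc_obj)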