###
###  United States Congressional Voting Records 1984
###

#rm(list=ls())
library(mdsr)
library(dplyr)
library(mosaic)
library(tidyverse)
library(mlbench)
library(randomForest)
library(caret)
library(dplyr)

###install.packages("Rcpp")

##Exploring and preparing the data
library(e1071) 
## Load the 1984 U.S. House voting records from mlbench: a party label
## (Class) plus 16 vote columns used as predictors.
data(HouseVotes84, package = "mlbench")

## Seed the RNG once, up front, so the random steps that follow (partitioning,
## resampling, forest growing) are reproducible from run to run.
set.seed(123)

##Removed missing values

##This dataset contains voting records represented as categorical variables.
##The raw UCI file codes unknown votes as "?", but the mlbench copy loaded
##above already stores them as NA (the vote factors only have the levels "n"
##and "y"), so a "?" -> NA recoding step would match nothing and is omitted.
##Since this workflow does not impute missing votes, we remove every
##incomplete row using na.omit(); the model output further down (174 training
##rows + 58 test rows) shows that 232 complete records remain.
HouseVotes84 <- na.omit(HouseVotes84)

## Row indices for the training portion: 75% of the rows, drawn by caret so
## that the split is made on the Class outcome. list = FALSE returns the
## indices as a matrix rather than a list.
trainIndex <- createDataPartition(
  y = HouseVotes84[["Class"]],
  p = 0.75,
  list = FALSE
)

##Train/Test Split (Avoiding Data Leakage)

##We split the dataset into 75% training and 25% testing using createDataPartition() to ensure the
##class proportions are preserved (stratified sampling). This random split avoids the bias that can
##occur if we simply take the "first 75%" of rows as training data.

## Rows selected by trainIndex form the training set ...
trainData <- HouseVotes84[trainIndex, , drop = FALSE]
## ... and every remaining row (negative indexing) forms the held-out test set.
testData <- HouseVotes84[-trainIndex, , drop = FALSE]

## Resampling scheme handed to train(): 10-fold cross-validation.
control <- trainControl(
  method = "cv",
  number = 10
)

##Model Training with Cross-Validation (Overfitting Control)

##To reduce the risk of overfitting, we use 10-fold cross-validation
##(trainControl(method="cv", number=10)). Cross-validation repeatedly trains the model on different
##subsets of the training data and evaluates it on the remaining held-out fold.

## Candidate mtry values to compare during resampling.
tunegrid <- expand.grid(.mtry = c(2, 4, 6, 8))

## Fit a random forest (caret method "rf") predicting Class from all other
## columns of the training set. Each mtry candidate is scored with the
## resampling scheme in `control`; ntree = 500 is forwarded to the underlying
## forest fit.
rf_model <- train(
  Class ~ .,
  data      = trainData,
  method    = "rf",
  trControl = control,
  tuneGrid  = tunegrid,
  ntree     = 500
)

## Auto-print the resampling summary and the selected mtry.
rf_model
## Random Forest 
## 
## 174 samples
##  16 predictor
##   2 classes: 'democrat', 'republican' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 155, 157, 157, 157, 156, 157, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9424493  0.8841568
##   4     0.9715342  0.9428913
##   6     0.9715342  0.9428913
##   8     0.9715342  0.9428913
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
## Predicted party labels for the held-out test rows.
pred <- predict(rf_model, newdata = testData)

## Cross-tabulate predictions against the true labels and auto-print the
## accuracy statistics.
confusionMatrix(data = pred, reference = testData$Class)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   democrat republican
##   democrat         28          0
##   republican        3         27
##                                           
##                Accuracy : 0.9483          
##                  95% CI : (0.8562, 0.9892)
##     No Information Rate : 0.5345          
##     P-Value [Acc > NIR] : 3.601e-12       
##                                           
##                   Kappa : 0.8968          
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 0.9032          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9000          
##              Prevalence : 0.5345          
##          Detection Rate : 0.4828          
##    Detection Prevalence : 0.4828          
##       Balanced Accuracy : 0.9516          
##                                           
##        'Positive' Class : democrat        
## 
##Conclusion: The overall test accuracy is 94.83%, and the balanced accuracy is 95.16%.