###
###  United States Congressional Voting Records 1984
###

#rm(list=ls())
library(mdsr)
library(dplyr)
library(mosaic)
library(tidyverse)
###install.packages("Rcpp")

##Exploring and preparing the data
library(e1071) 
data(HouseVotes84, package = "mlbench")

# Train/test split: the first 75% of the rows (in their stored order) are used
# for training and the remaining rows for testing. The cut-off is derived from
# the number of rows rather than hard-coded, because indexing a data frame
# past its last row silently yields all-NA rows instead of an error.
# With the full 435 observations this gives rows 1-326 and 327-435.
hv_n_train <- floor(0.75 * nrow(HouseVotes84))
hv_train_rows <- seq_len(hv_n_train)
hv_test_rows <- setdiff(seq_len(nrow(HouseVotes84)), hv_train_rows)

# Predictors only: the first column is dropped (presumably the Class label,
# which is saved separately below -- confirm against the data set layout).
hv_train <- HouseVotes84[hv_train_rows, -1]
hv_test <- HouseVotes84[hv_test_rows, -1]

# Save labels
hv_train_labels <- HouseVotes84$Class[hv_train_rows]
hv_test_labels <- HouseVotes84$Class[hv_test_rows]

# Fit a naive Bayes classifier on the training rows, then predict the class of
# every test row and show the first few predictions.
hv_classifier <- naiveBayes(hv_train, hv_train_labels)

hv_test_pred <- predict(hv_classifier, hv_test)
head(hv_test_pred)
## [1] democrat   republican democrat   democrat   republican democrat  
## Levels: democrat republican
library(gmodels)
# Cross-tabulate predicted vs. actual class on the test rows. Row, table and
# chi-square proportions are switched off, leaving counts and column shares.
CrossTable(
  x = hv_test_pred,
  y = hv_test_labels,
  prop.r = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE,
  dnn = c("predicted", "actual")
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  109 
## 
##  
##              | actual 
##    predicted |   democrat | republican |  Row Total | 
## -------------|------------|------------|------------|
##     democrat |         55 |          3 |         58 | 
##              |      0.833 |      0.070 |            | 
## -------------|------------|------------|------------|
##   republican |         11 |         40 |         51 | 
##              |      0.167 |      0.930 |            | 
## -------------|------------|------------|------------|
## Column Total |         66 |         43 |        109 | 
##              |      0.606 |      0.394 |            | 
## -------------|------------|------------|------------|
## 
## 
# Show the complete vector of predicted classes for the test rows
# (auto-printed because the expression is evaluated at top level).
hv_test_pred
##   [1] democrat   republican democrat   democrat   republican democrat  
##   [7] democrat   democrat   democrat   republican democrat   democrat  
##  [13] democrat   republican republican democrat   democrat   republican
##  [19] democrat   republican republican republican democrat   republican
##  [25] democrat   republican democrat   republican democrat   democrat  
##  [31] republican republican democrat   republican democrat   democrat  
##  [37] democrat   republican republican republican democrat   democrat  
##  [43] democrat   republican democrat   democrat   republican republican
##  [49] republican republican democrat   republican republican republican
##  [55] democrat   democrat   republican democrat   republican republican
##  [61] democrat   democrat   republican democrat   republican democrat  
##  [67] republican democrat   democrat   democrat   democrat   republican
##  [73] democrat   republican republican republican democrat   republican
##  [79] republican republican democrat   republican democrat   republican
##  [85] republican democrat   republican republican democrat   democrat  
##  [91] republican democrat   democrat   democrat   republican democrat  
##  [97] democrat   democrat   democrat   democrat   democrat   republican
## [103] democrat   democrat   republican democrat   republican republican
## [109] republican
## Levels: democrat republican
## Accuracy = (55 + 40) / 109 = 87.2%

# Refit the naive Bayes model with Laplace smoothing (laplace = 3), predict the
# same test rows, and tabulate the predictions against the actual labels.
hv_classifier2 <- naiveBayes(
  x = hv_train,
  y = hv_train_labels,
  laplace = 3
)
hv_test_pred2 <- predict(hv_classifier2, newdata = hv_test)
CrossTable(
  x = hv_test_pred2,
  y = hv_test_labels,
  prop.r = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE,
  dnn = c("predicted", "actual")
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  109 
## 
##  
##              | actual 
##    predicted |   democrat | republican |  Row Total | 
## -------------|------------|------------|------------|
##     democrat |         55 |          3 |         58 | 
##              |      0.833 |      0.070 |            | 
## -------------|------------|------------|------------|
##   republican |         11 |         40 |         51 | 
##              |      0.167 |      0.930 |            | 
## -------------|------------|------------|------------|
## Column Total |         66 |         43 |        109 | 
##              |      0.606 |      0.394 |            | 
## -------------|------------|------------|------------|
## 
## 
# Show the complete vector of predictions from the Laplace-smoothed model
# (auto-printed because the expression is evaluated at top level).
hv_test_pred2
##   [1] democrat   republican democrat   democrat   republican democrat  
##   [7] democrat   democrat   democrat   republican democrat   democrat  
##  [13] democrat   republican republican democrat   democrat   republican
##  [19] democrat   republican republican republican democrat   republican
##  [25] democrat   republican democrat   republican democrat   democrat  
##  [31] republican republican democrat   republican democrat   democrat  
##  [37] democrat   republican republican republican democrat   democrat  
##  [43] democrat   republican democrat   democrat   republican republican
##  [49] republican republican democrat   republican republican republican
##  [55] democrat   democrat   republican democrat   republican republican
##  [61] democrat   democrat   republican democrat   republican democrat  
##  [67] republican democrat   democrat   democrat   democrat   republican
##  [73] democrat   republican republican republican democrat   republican
##  [79] republican republican democrat   republican democrat   republican
##  [85] republican democrat   republican republican democrat   democrat  
##  [91] republican democrat   democrat   democrat   republican democrat  
##  [97] democrat   democrat   democrat   democrat   democrat   republican
## [103] democrat   democrat   republican democrat   republican republican
## [109] republican
## Levels: democrat republican
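## Conclusion: with Laplace smoothing (laplace = 3) the confusion table is
## identical to the unsmoothed model, so the accuracy of our model prediction
## is still (55 + 40) / 109 = 87%.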
library(mlbench)
library(randomForest)
library(caret)
library(dplyr)

# Fix the RNG state so the random split and the resampling are reproducible.
set.seed(123)

## Removing missing values
##
## This dataset contains voting records represented as categorical variables.
## The random forest workflow below does not handle a "?" missing-value code
## directly, so any "?" is first converted to NA and incomplete rows are then
## removed with na.omit().
## NOTE(review): the mlbench copy of HouseVotes84 appears to store missing
## votes as NA already, in which case the "?" replacement is a harmless
## no-op -- confirm.
HouseVotes84[HouseVotes84 == "?"] <- NA
HouseVotes84 <- na.omit(HouseVotes84)

## Train/test split (stratified, random)
##
## The data are split into 75% training and 25% testing rows with
## createDataPartition(), which samples within each class so the class
## proportions are preserved. A random split avoids the bias that can occur
## when simply taking the "first 75%" of the rows as training data.
trainIndex <- createDataPartition(
  y = HouseVotes84$Class,
  p = 0.75,
  list = FALSE
)
trainData <- HouseVotes84[trainIndex, ]
testData <- HouseVotes84[-trainIndex, ]

## Model training with cross-validation (overfitting control)
##
## To reduce the risk of overfitting, 10-fold cross-validation is used:
## the model is repeatedly trained on different subsets of the training data
## and evaluated on the held-out fold.
control <- trainControl(method = "cv", number = 10)

# Candidate mtry values compared by cross-validation.
tunegrid <- expand.grid(.mtry = c(2, 4, 6, 8))

# Random forest with 500 trees per candidate; caret keeps the mtry value with
# the highest cross-validated accuracy.
rf_model <- train(
  Class ~ .,
  data = trainData,
  method = "rf",
  trControl = control,
  tuneGrid = tunegrid,
  ntree = 500
)
rf_model
## Random Forest 
## 
## 174 samples
##  16 predictor
##   2 classes: 'democrat', 'republican' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 155, 157, 157, 157, 156, 157, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9424493  0.8841568
##   4     0.9715342  0.9428913
##   6     0.9715342  0.9428913
##   8     0.9715342  0.9428913
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
# Score the held-out test rows with the tuned random forest and compare the
# predicted classes with the true labels.
pred <- predict(rf_model, newdata = testData)
confusionMatrix(data = pred, reference = testData$Class)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   democrat republican
##   democrat         28          0
##   republican        3         27
##                                           
##                Accuracy : 0.9483          
##                  95% CI : (0.8562, 0.9892)
##     No Information Rate : 0.5345          
##     P-Value [Acc > NIR] : 3.601e-12       
##                                           
##                   Kappa : 0.8968          
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 0.9032          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9000          
##              Prevalence : 0.5345          
##          Detection Rate : 0.4828          
##    Detection Prevalence : 0.4828          
##       Balanced Accuracy : 0.9516          
##                                           
##        'Positive' Class : democrat        
## 
##Conclusion: The overall test accuracy is 94.83%, and the balanced accuracy is 95.16%.