###
### United States Congressional Voting Records 1984
###
#rm(list=ls())
library(mdsr)
library(dplyr)
library(mosaic)
library(tidyverse)
library(mlbench)
library(randomForest)
library(caret)
library(dplyr)
###install.packages("Rcpp")
##Exploring and preparing the data
library(e1071)
## Load the 1984 House voting records that ship with the mlbench package.
data("HouseVotes84", package = "mlbench")
## Seed the RNG once, before any random step, so the partition and the
## cross-validation folds created later in this script are repeatable.
set.seed(123)
## Missing-value handling
## The votes are categorical, and an unknown vote is coded as "?". Random
## Forest in this workflow does not handle "?" directly, so every "?" cell is
## flagged as NA and only rows with no missing votes are kept.
## NOTE(review): the mlbench copy may already store unknown votes as NA, in
## which case the recode below matches nothing -- confirm with
## summary(HouseVotes84).
is.na(HouseVotes84) <- HouseVotes84 == "?"
HouseVotes84 <- na.omit(HouseVotes84)
## Train/test split (avoiding data leakage)
## createDataPartition() samples within each level of Class, so the 75% / 25%
## split keeps the class proportions (stratified sampling). Drawing the rows
## at random also avoids the bias of simply taking the "first 75%" of rows as
## training data.
trainIndex <- createDataPartition(
  y = HouseVotes84[["Class"]],
  p = 0.75,
  list = FALSE
)
## Rows selected by trainIndex are used for fitting; every remaining row is
## held out for the final evaluation.
trainData <- HouseVotes84[trainIndex, ]
testData <- HouseVotes84[-trainIndex, ]
## Model training with cross-validation (overfitting control)
## To reduce the risk of overfitting, the model is tuned with 10-fold
## cross-validation: it is repeatedly fitted on different subsets of the
## training data and evaluated on the fold that was left out.
control <- trainControl(method = "cv", number = 10)
## Candidate values for mtry, the number of predictors tried at each split.
tunegrid <- expand.grid(.mtry = c(2, 4, 6, 8))
## Fit one forest per mtry candidate on the training rows only; ntree is
## forwarded through train()'s `...` to the underlying random forest fit.
rf_model <- train(
  Class ~ .,
  data = trainData,
  method = "rf",
  trControl = control,
  tuneGrid = tunegrid,
  ntree = 500
)
## Print explicitly: a bare `rf_model` is only shown by top-level
## auto-printing, which source() suppresses unless echo = TRUE.
print(rf_model)
## Random Forest
##
## 174 samples
## 16 predictor
## 2 classes: 'democrat', 'republican'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 155, 157, 157, 157, 156, 157, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9424493 0.8841568
## 4 0.9715342 0.9428913
## 6 0.9715342 0.9428913
## 8 0.9715342 0.9428913
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
## Score the held-out test rows with the tuned model.
pred <- predict(rf_model, newdata = testData)
## Compare predictions with the true party labels. Arguments are named so the
## prediction/reference roles cannot be swapped silently, and the result is
## printed explicitly because top-level auto-printing is suppressed when the
## script is run through source() without echo = TRUE.
print(confusionMatrix(data = pred, reference = testData$Class))
## Confusion Matrix and Statistics
##
## Reference
## Prediction democrat republican
## democrat 28 0
## republican 3 27
##
## Accuracy : 0.9483
## 95% CI : (0.8562, 0.9892)
## No Information Rate : 0.5345
## P-Value [Acc > NIR] : 3.601e-12
##
## Kappa : 0.8968
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 0.9032
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9000
## Prevalence : 0.5345
## Detection Rate : 0.4828
## Detection Prevalence : 0.4828
## Balanced Accuracy : 0.9516
##
## 'Positive' Class : democrat
##
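##Conclusion: The overall test accuracy is 94.83% (55 of 58 test rows classified correctly), and the balanced accuracy is 95.16%. All 3 errors are democrats predicted as republican; no republican was misclassified.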