###
### United States Congressional Voting Records 1984
###
#rm(list=ls())
library(mdsr)
library(dplyr)
library(mosaic)
library(tidyverse)
###install.packages("Rcpp")
##Exploring and preparing the data
library(e1071)
data(HouseVotes84, package = "mlbench")

# Train/test split ----
# The first 75% of the observations (in row order) form the training set and
# the remaining rows form the test set. The cut point is derived from the
# number of rows rather than hard-coded: for the 435-row data set this gives
# rows 1-326 for training and rows 327-435 for testing.
# NOTE: this is a sequential (non-random) split, so it is only representative
# if the row order is unrelated to the class.
hv_n_train <- floor(0.75 * nrow(HouseVotes84))
hv_train_rows <- seq_len(hv_n_train)
# setdiff() instead of negative indexing so an empty training range cannot
# silently produce an empty test set.
hv_test_rows <- setdiff(seq_len(nrow(HouseVotes84)), hv_train_rows)

# Predictors are every column except the party label `Class`; selecting by
# name avoids relying on the label being the first column.
hv_predictors <- setdiff(names(HouseVotes84), "Class")
hv_train <- HouseVotes84[hv_train_rows, hv_predictors, drop = FALSE]
hv_test <- HouseVotes84[hv_test_rows, hv_predictors, drop = FALSE]

# Save labels (party membership) for the same row ranges.
hv_train_labels <- HouseVotes84$Class[hv_train_rows]
hv_test_labels <- HouseVotes84$Class[hv_test_rows]

# Fit a naive Bayes classifier on the training votes and predict the party of
# every test-set member; show the first few predictions.
hv_classifier <- naiveBayes(hv_train, hv_train_labels)
hv_test_pred <- predict(hv_classifier, hv_test)
head(hv_test_pred)
## [1] democrat republican democrat democrat republican democrat
## Levels: democrat republican
library(gmodels)
# Cross-tabulate predicted vs. actual party on the test set. Row, table and
# chi-square proportions are switched off, leaving counts and column
# proportions in each cell.
CrossTable(
  x = hv_test_pred,
  y = hv_test_labels,
  prop.r = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE,
  dnn = c("predicted", "actual")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 109
##
##
## | actual
## predicted | democrat | republican | Row Total |
## -------------|------------|------------|------------|
## democrat | 55 | 3 | 58 |
## | 0.833 | 0.070 | |
## -------------|------------|------------|------------|
## republican | 11 | 40 | 51 |
## | 0.167 | 0.930 | |
## -------------|------------|------------|------------|
## Column Total | 66 | 43 | 109 |
## | 0.606 | 0.394 | |
## -------------|------------|------------|------------|
##
##
# Display the complete vector of predicted classes for the test set.
hv_test_pred
## [1] democrat republican democrat democrat republican democrat
## [7] democrat democrat democrat republican democrat democrat
## [13] democrat republican republican democrat democrat republican
## [19] democrat republican republican republican democrat republican
## [25] democrat republican democrat republican democrat democrat
## [31] republican republican democrat republican democrat democrat
## [37] democrat republican republican republican democrat democrat
## [43] democrat republican democrat democrat republican republican
## [49] republican republican democrat republican republican republican
## [55] democrat democrat republican democrat republican republican
## [61] democrat democrat republican democrat republican democrat
## [67] republican democrat democrat democrat democrat republican
## [73] democrat republican republican republican democrat republican
## [79] republican republican democrat republican democrat republican
## [85] republican democrat republican republican democrat democrat
## [91] republican democrat democrat democrat republican democrat
## [97] democrat democrat democrat democrat democrat republican
## [103] democrat democrat republican democrat republican republican
## [109] republican
## Levels: democrat republican
## Accuracy = (55 + 40) / 109 ≈ 87%
# Refit the naive Bayes model with Laplace smoothing (laplace = 3), predict
# the same test set, and cross-tabulate predicted vs. actual party (counts and
# column proportions only).
hv_classifier2 <- naiveBayes(x = hv_train, y = hv_train_labels, laplace = 3)
hv_test_pred2 <- predict(object = hv_classifier2, newdata = hv_test)
CrossTable(
  x = hv_test_pred2,
  y = hv_test_labels,
  prop.r = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE,
  dnn = c("predicted", "actual")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 109
##
##
## | actual
## predicted | democrat | republican | Row Total |
## -------------|------------|------------|------------|
## democrat | 55 | 3 | 58 |
## | 0.833 | 0.070 | |
## -------------|------------|------------|------------|
## republican | 11 | 40 | 51 |
## | 0.167 | 0.930 | |
## -------------|------------|------------|------------|
## Column Total | 66 | 43 | 109 |
## | 0.606 | 0.394 | |
## -------------|------------|------------|------------|
##
##
# Display the complete vector of test-set predictions from the
# Laplace-smoothed model.
hv_test_pred2
## [1] democrat republican democrat democrat republican democrat
## [7] democrat democrat democrat republican democrat democrat
## [13] democrat republican republican democrat democrat republican
## [19] democrat republican republican republican democrat republican
## [25] democrat republican democrat republican democrat democrat
## [31] republican republican democrat republican democrat democrat
## [37] democrat republican republican republican democrat democrat
## [43] democrat republican democrat democrat republican republican
## [49] republican republican democrat republican republican republican
## [55] democrat democrat republican democrat republican republican
## [61] democrat democrat republican democrat republican democrat
## [67] republican democrat democrat democrat democrat republican
## [73] democrat republican republican republican democrat republican
## [79] republican republican democrat republican democrat republican
## [85] republican democrat republican republican democrat democrat
## [91] republican democrat democrat democrat republican democrat
## [97] democrat democrat democrat democrat democrat republican
## [103] democrat democrat republican democrat republican republican
## [109] republican
## Levels: democrat republican
## Conclusion: Laplace smoothing (laplace = 3) yields the same confusion table as the unsmoothed model, so the accuracy of our model prediction stays at (55 + 40) / 109 ≈ 87%.
library(mlbench)
library(randomForest)
library(caret)
library(dplyr)
# Fix the RNG seed before any random step that follows.
set.seed(123)

## Remove missing values
## This dataset contains voting records represented as categorical variables.
## Missing values are coded as "?". Since Random Forest in this workflow does
## not handle "?" directly, we first convert "?" to NA and then remove
## incomplete rows using na.omit().
## NOTE(review): the mlbench copy of the data may already store missing votes
## as NA, in which case the "?" replacement is a harmless no-op -- confirm.
HouseVotes84 <- replace(HouseVotes84, HouseVotes84 == "?", NA)
HouseVotes84 <- na.omit(HouseVotes84)
## Train/Test Split (Avoiding Data Leakage)
## We split the dataset into 75% training and 25% testing using
## createDataPartition() to ensure the class proportions are preserved
## (stratified sampling). This random split avoids the bias that can occur if
## we simply take the "first 75%" of rows as training data.
trainIndex <- createDataPartition(
  y = HouseVotes84[["Class"]],
  p = 0.75,
  list = FALSE
)
# Rows selected by the partition train the model; all other rows are held out.
trainData <- HouseVotes84[trainIndex, ]
testData <- HouseVotes84[-trainIndex, ]
## Model Training with Cross-Validation (Overfitting Control)
## To reduce the risk of overfitting, we use 10-fold cross-validation
## (trainControl(method = "cv", number = 10)). Cross-validation repeatedly
## trains the model on different subsets of the training data and evaluates it
## on the remaining folds.
control <- trainControl(method = "cv", number = 10)

# Candidate values for mtry tried during tuning: 2, 4, 6 and 8.
tunegrid <- expand.grid(.mtry = seq(2, 8, by = 2))

# Random forest (500 trees) predicting Class from all other columns, tuned
# over the grid above with the cross-validation scheme defined in `control`.
rf_model <- train(
  Class ~ .,
  data = trainData,
  method = "rf",
  trControl = control,
  tuneGrid = tunegrid,
  ntree = 500
)

# Show the resampling results and the selected mtry.
rf_model
## Random Forest
##
## 174 samples
## 16 predictor
## 2 classes: 'democrat', 'republican'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 155, 157, 157, 157, 156, 157, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9424493 0.8841568
## 4 0.9715342 0.9428913
## 6 0.9715342 0.9428913
## 8 0.9715342 0.9428913
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
# Predict the class of every held-out test row with the tuned random forest,
# then compare the predictions against the true classes.
pred <- predict(rf_model, newdata = testData)
confusionMatrix(data = pred, reference = testData$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction democrat republican
## democrat 28 0
## republican 3 27
##
## Accuracy : 0.9483
## 95% CI : (0.8562, 0.9892)
## No Information Rate : 0.5345
## P-Value [Acc > NIR] : 3.601e-12
##
## Kappa : 0.8968
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 0.9032
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9000
## Prevalence : 0.5345
## Detection Rate : 0.4828
## Detection Prevalence : 0.4828
## Balanced Accuracy : 0.9516
##
## 'Positive' Class : democrat
##
##Conclusion: The overall test accuracy is 94.83%, and the balanced accuracy is 95.16%.