# Question 2 ----
# Location of the HouseVotes-1984 CSV file
url <- "https://s3.us-east-2.amazonaws.com/artificium.us/datasets/HouseVotes-1984-ButOne.csv"
# Read the CSV into a data frame: the first row supplies the column names and
# text columns are kept as character vectors rather than factors.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
df <- read.csv(url, stringsAsFactors = FALSE, header = TRUE)
# Show the first five rows of df
head(df, 5)
# Question 3 ----
# Fix the random stream so the five splits are reproducible
set.seed(100)
# Draw five independent sets of 350 row positions from df, in order
split_rows <- replicate(5, sample(nrow(df), 350), simplify = FALSE)
# Training data: the sampled rows of each split
training_data <- lapply(split_rows, function(rows) df[rows, ])
# Validation data: every row of df that was not sampled for that split
validation_data <- lapply(split_rows, function(rows) df[-rows, ])
# Question 4 ----
# Train one Naive Bayes model per training sample. The result is a list of
# models in the same order as `training_data`.
nb_models <- lapply(training_data, function(train_set) {
  # Convert the response itself to a factor. Storing the converted labels in
  # a separate `party` column would leave `Class` unconverted and, through
  # `Class ~ .`, hand the model a copy of the label as a predictor.
  train_set$Class <- as.factor(train_set$Class)
  # Every remaining column is used as a predictor of Class
  naiveBayes(Class ~ ., data = train_set)
})
# Question 5 ----
# Ensemble prediction by majority vote.
#
# models:    list of fitted models; each must work with predict(model, new_cases)
# new_cases: data frame of cases to classify
#
# Returns a character vector with one element per row of `new_cases`. The
# per-model predictions are combined with cbind(), so factor predictions are
# reduced to their integer codes and the returned values are those codes as
# strings (e.g. "1", "2"), not the class labels.
predict_party <- function(models, new_cases) {
  # One prediction vector per model, in model order
  votes <- lapply(seq_along(models), function(k) {
    predict(models[[k]], new_cases)
  })
  # Rows are cases, columns are models
  vote_matrix <- do.call(cbind, votes)
  # Most frequent value within one row of votes
  majority_vote <- function(case_votes) {
    tally <- sort(table(case_votes), decreasing = TRUE)
    names(tally)[1]
  }
  apply(vote_matrix, 1, majority_vote)
}
# Question 6 ----
# One slot per validation set for each metric
n_splits <- length(validation_data)
accuracies <- numeric(n_splits)
tprs <- numeric(n_splits)
tnrs <- numeric(n_splits)
# Fixed level sets so the confusion table is always square, even when the
# ensemble predicts a single class for a whole validation set (otherwise the
# table loses a column and the [2, 2] lookup fails).
# predict_party() returns class codes as strings ("1", "2", ...).
# NOTE(review): assumes code k is the k-th class label in sorted order --
# confirm against the fitted models.
class_levels <- sort(unique(df$Class))
code_levels <- as.character(seq_along(class_levels))
# Pull metrics for each validation dataset
for (i in seq_len(n_splits)) {
  val_set <- validation_data[[i]]
  # Ensemble predictions for this validation set
  predictions <- predict_party(nb_models, val_set)
  # Confusion table: rows are actual classes, columns are predicted codes.
  # Named `confusion` so it does not shadow base::matrix.
  confusion <- table(
    Actual = factor(val_set$Class, levels = class_levels),
    Predicted = factor(predictions, levels = code_levels)
  )
  # Accuracy: share of cases on the diagonal
  accuracies[i] <- sum(diag(confusion)) / sum(confusion)
  # The second class is treated as "positive", the first as "negative"
  tprs[i] <- confusion[2, 2] / sum(confusion[2, ])
  tnrs[i] <- confusion[1, 1] / sum(confusion[1, ])
}
# Mean performance across all validation sets
mean_performance <- c(
  Accuracy = mean(accuracies),
  TruePositiveRate = mean(tprs),
  TrueNegativeRate = mean(tnrs)
)
# Result table; unname() keeps the metric names out of the row names
result_table <- data.frame(
  Metric = names(mean_performance),
  Value = unname(mean_performance)
)
print(result_table)
## Metric Value
## 1 Accuracy 0.8928571
## 2 TruePositiveRate 0.9038510
## 3 TrueNegativeRate 0.8851244
# Question 7 ----
# Voting record of the member to classify, one column per vote (V1..V16);
# the V15 vote is missing.
new_member_votes <- data.frame(
  V1 = "n",
  V2 = "y",
  V3 = "n",
  V4 = "y",
  V5 = "y",
  V6 = "y",
  V7 = "n",
  V8 = "n",
  V9 = "n",
  V10 = "y",
  V11 = "n",
  V12 = "y",
  V13 = "y",
  V14 = "y",
  V15 = NA,
  V16 = "n",
  stringsAsFactors = FALSE
)
# Encode every vote as a factor that carries the levels the model was trained
# with. predict() for e1071 Naive Bayes models turns new data into integer
# codes via data.matrix(); in a one-row data frame a character column has a
# single distinct value, so "y" and "n" would both become code 1 and every
# vote would be read as the first level.
# NOTE(review): levels are taken from the first model's conditional tables;
# this presumes all models saw the same vote values -- confirm.
for (vote_col in names(new_member_votes)) {
  trained_levels <- colnames(nb_models[[1]]$tables[[vote_col]])
  # Columns the model has no table for are left untouched
  if (!is.null(trained_levels)) {
    vote_value <- new_member_votes[[vote_col]]
    unknown <- setdiff(as.character(vote_value[!is.na(vote_value)]), trained_levels)
    if (length(unknown) > 0) {
      stop("Vote value(s) not seen in training for ", vote_col, ": ",
           paste(unknown, collapse = ", "), call. = FALSE)
    }
    new_member_votes[[vote_col]] <- factor(vote_value, levels = trained_levels)
  }
}
# Prediction using ensemble
# NOTE(review): any recorded output for this step was produced before the
# votes were encoded as factors; re-run to refresh it.
predicted_party <- predict_party(nb_models, new_member_votes)
# Translate the class code returned by the ensemble into a party name
party_name <- ifelse(predicted_party == 1, "democrat",
                     ifelse(predicted_party == 2, "republican", NA))
print(party_name)
## [1] "republican"