#PROJECT 1
# https://archive.ics.uci.edu/ml/datasets/adult
#' Introduction & Instructions
#' Predict whether income exceeds $50K/yr based on census data.
#' Extraction by Barry Becker from the 1994 Census database.
#' Examine the accuracy of two algos with the census data below. The data is
#' census data originally posted on the UCI repository.
#'
#' Use the routine we used in class to generate graphs for the confusion matrix
#' associated with each of the two algos.
#' Deploy the chosen algo to predict whether income exceeds $50K/yr for the 10
#' individuals in the data set new_individuals.
#' Post the results of the prediction in a table and generate a graph.
#' The deliverable is a 4 quadrant dashboard with the two confusion matrix and
#' the predicted results.
#' the remaining quadrant should provide an explanation and succinct summary of the
#' task.
#' Upload to a bucket called Project1_DM_Dash by Wednesday March 29, COB.
#' ====================================== =
# setwd("C:/Users/arodriguez/Dropbox/classes/DataMining_Spr23/Project_1")
options(digits = 3, scipen = 9999)
remove(list = ls())
graphics.off()
pacman::p_load(magrittr, tidyverse, useful, janitor, class, randomForest)
pacman::p_load(yardstick)
#' ====================================== =
data <- read.csv("census.csv")
data$workclass <- as.factor(data$workclass)
data$education <- as.factor(data$education)
data$marital.status <- as.factor(data$marital.status)
data$occupation <- as.factor(data$occupation)
data$relationship <- as.factor(data$relationship)
data$race <- as.factor(data$race)
data$sex <- as.factor(data$sex)
data$native.country <- as.factor(data$native.country)
data$income <- as.factor(data$income)
#' ====================================== =
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
set.seed(123) # Setting seed for reproducibility
index <- createDataPartition(data$income, p = 0.7, list = FALSE) # Splitting data into 70% training and 30% validation
train <- data[index, ] # Training dataset
valid <- data[-index, ] # Validation dataset
test <- data %>% sample_n(10) # Randomly sample 10 rows from the data dataframe
#' ====================================== =
# Training the random forest model
rf_model <- randomForest(income ~ ., data = train, ntree = 150, importance = TRUE)
# Predicting the validation dataset
valid_predictions <- predict(rf_model, valid)
# Comparing the predicted values with the actual values
comparison <- valid_predictions == valid$income
# Calculating the accuracy
accuracy <- mean(comparison)
cat("Accuracy:", accuracy * 100, "%\n")
## Accuracy: 86.2 %
# Creating a confusion matrix
confusion_matrix_rf <- table(Predicted = valid_predictions, Actual = valid$income)
print(confusion_matrix_rf)
## Actual
## Predicted <=50K >50K
## <=50K 6928 865
## >50K 487 1487
#' ====================================== =
# Training the decision tree model
library(rpart)
# Training the decision tree model with tuned parameters
tuned_cp <- 0.01 # Replace with your tuned cp value
tuned_minsplit <- 20 # Replace with your tuned minsplit value
dt_model <- rpart(income ~ ., data = train, control = rpart.control(cp = tuned_cp, minsplit = tuned_minsplit))
# Predicting the validation dataset
valid_dt_predictions <- predict(dt_model, valid, type = "class")
# Comparing the predicted values with the actual values
dt_comparison <- valid_dt_predictions == valid$income
# Calculating the accuracy
dt_accuracy <- mean(dt_comparison)
cat("Decision Tree Accuracy:", dt_accuracy * 100, "%\n")
## Decision Tree Accuracy: 84.6 %
# Creating a confusion matrix
dt_confusion_matrix <- table(Predicted = valid_dt_predictions, Actual = valid$income)
print(dt_confusion_matrix)
## Actual
## Predicted <=50K >50K
## <=50K 7057 1150
## >50K 358 1202
#' ====================================== =
# Plotting confusion matrix
confusion_matrix_to_dataframe <- function(cm) {
df <- as.data.frame.table(cm)
colnames(df) <- c("Predicted", "Actual", "Count")
df
}
# Converting the confusion matrix to a tidy data frame
dataframe_rf <- confusion_matrix_to_dataframe(confusion_matrix_rf)
# Creating a heatmap for the confusion matrix
ggplot(dataframe_rf, aes(x = Actual, y = Predicted, fill = Count)) +
geom_tile(color = "white") +
geom_text(aes(label = Count), color = "black", size = 4) +
labs(title = "Confusion Matrix (Random Forest)",
x = "Actual Income",
y = "Predicted Income",
fill = "Count") +
scale_fill_gradient(low = "white", high = "steelblue") +
theme_minimal()

# Converting the confusion matrix to a tidy data frame
dataframe_dt <- confusion_matrix_to_dataframe(dt_confusion_matrix)
# Creating a heatmap for the confusion matrix
ggplot(dataframe_dt, aes(x = Actual, y = Predicted, fill = Count)) +
geom_tile(color = "white") +
geom_text(aes(label = Count), color = "black", size = 4) +
labs(title = "Confusion Matrix (Decision Tree)",
x = "Actual Income",
y = "Predicted Income",
fill = "Count") +
scale_fill_gradient(low = "white", high = "steelblue") +
theme_minimal()

#' ====================================== =
# Deploying model
test <- select(test, -income)
# Predicting income for the test dataset
test_predictions <- predict(rf_model, test)
# Combining the test data with the predictions
test_results <- cbind(test, Predicted_Income = test_predictions)
print(test_results)
## age workclass fnlwgt education education.num
## 1 52 Private 61735 11th 7
## 2 45 Local-gov 160187 HS-grad 9
## 3 29 Local-gov 214706 HS-grad 9
## 4 56 Private 85018 HS-grad 9
## 5 75 Self-emp-not-inc 192813 Masters 14
## 6 49 Private 185041 Masters 14
## 7 23 Private 118693 Bachelors 13
## 8 43 Self-emp-not-inc 47261 Some-college 10
## 9 39 Local-gov 357962 Assoc-acdm 12
## 10 30 Federal-gov 321990 Some-college 10
## marital.status occupation relationship race sex
## 1 Married-civ-spouse Other-service Husband White Male
## 2 Married-spouse-absent Adm-clerical Unmarried Black Female
## 3 Never-married Other-service Unmarried White Female
## 4 Never-married Adm-clerical Not-in-family White Female
## 5 Widowed Sales Not-in-family White Male
## 6 Married-civ-spouse Exec-managerial Husband White Male
## 7 Never-married Other-service Not-in-family White Male
## 8 Married-civ-spouse Farming-fishing Husband White Male
## 9 Never-married Transport-moving Not-in-family White Male
## 10 Married-civ-spouse Adm-clerical Husband White Male
## capital.gain capital.loss hours.per.week native.country Predicted_Income
## 1 0 0 40 United-States <=50K
## 2 0 0 40 United-States <=50K
## 3 0 0 40 United-States <=50K
## 4 0 0 40 United-States <=50K
## 5 0 0 45 United-States <=50K
## 6 0 1977 40 United-States >50K
## 7 0 0 35 United-States <=50K
## 8 0 0 50 United-States <=50K
## 9 0 0 48 United-States <=50K
## 10 7298 0 48 Cuba >50K
library(ggplot2)
# Creating a bar graph for the predictions
ggplot(test_results, aes(x = rownames(test_results), y = Predicted_Income, fill = Predicted_Income)) +
geom_bar(stat = "identity") +
labs(title = "Income Predictions for Test Individuals",
x = "Individual ID",
y = "Predicted Income",
fill = "Income Class") +
theme_minimal()

#' ====================================== =
#' Dashboard
# Loading gridExtra package
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.1.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
# Create a heatmap for the random forest confusion matrix
rf_cm_dataframe <- confusion_matrix_to_dataframe(confusion_matrix_rf)
rf_heatmap <- ggplot(rf_cm_dataframe, aes(x = Actual, y = Predicted, fill = Count)) +
geom_tile(color = "white") +
geom_text(aes(label = Count), color = "black", size = 4) +
labs(title = "Random Forest Confusion Matrix",
x = "Actual Income",
y = "Predicted Income",
fill = "Count") +
scale_fill_gradient(low = "white", high = "steelblue") +
theme_minimal()
# Create a heatmap for the decision tree confusion matrix
dt_cm_dataframe <- confusion_matrix_to_dataframe(dt_confusion_matrix)
dt_heatmap <- ggplot(dt_cm_dataframe, aes(x = Actual, y = Predicted, fill = Count)) +
geom_tile(color = "white") +
geom_text(aes(label = Count), color = "black", size = 4) +
labs(title = "Decision Tree Confusion Matrix",
x = "Actual Income",
y = "Predicted Income",
fill = "Count") +
scale_fill_gradient(low = "white", high = "lightgreen") +
theme_minimal()
predictions_bar <- ggplot(test_results, aes(x = rownames(test_results), y = Predicted_Income, fill = Predicted_Income)) +
geom_bar(stat = "identity") +
labs(title = "Income Predictions for Test Individuals",
x = "Individual ID",
y = "Predicted Income",
fill = "Income Class") +
theme_minimal() +
scale_fill_manual(values = c("#7b2cbf", "#d7b5e8"))
# Install the ggtext package (if not already installed)
if (!requireNamespace("ggtext", quietly = TRUE)) {
install.packages("ggtext")
}
# Load the ggtext package
library(ggtext)
## Warning: package 'ggtext' was built under R version 4.1.3
summary_text <- "
<p><strong>Task Summary:</strong></p>
<p>This dashboard presents the results of predicting whether an individual's income exceeds </p>
<p>$50K per year using two classification algorithms: Random Forest and Decision Tree.</p>
<p>The confusion matrices for both algorithms show the performance in terms of correct and </p>
<p>incorrect predictions made on the validation dataset. A higher value on the diagonal indicates </p>
<p>better performance.</p>
<p>The bar graph in the third quadrant displays the predicted income classes for 10 randomly </p>
<p>selected individuals from the dataset using the chosen algorithm.</p>
"
library(grid)
summary_plot <- ggplot() +
geom_blank() +
theme_void() +
theme(plot.margin = grid::unit(c(10, 10, 10, 10), "pt")) +
ggtext::geom_richtext(
aes(x = 0.5, y = 0.5, label = summary_text),
hjust = 0.5,
vjust = 0.5,
fill = "white",
label.color = NA,
size = 4,
)
# Create a 4 quadrant dashboard
dashboard <- gridExtra::grid.arrange(
rf_heatmap, dt_heatmap,
predictions_bar, summary_plot,
ncol = 2, nrow = 2
)
