#PROJECT 1

# https://archive.ics.uci.edu/ml/datasets/adult


#' Introduction & Instructions 

#' Predict whether income exceeds $50K/yr based on census data. 
#' Extraction by Barry Becker from the 1994 Census database.

#' Examine the accuracy of two algos with the census data below.  The data is 
#' census data originally posted on the UCI repository.  
#' 
#' Use the routine we used in class to generate graphs for the confusion matrix
#' associated with each of the two algos.  

#' Deploy the chosen algo to predict whether income exceeds $50K/yr for the 10 
#' individuals in the data set new_individuals.
#' Post the results of the prediction in a table and generate a graph.
 
#' The deliverable is a 4 quadrant dashboard with the two confusion matrix and
#' the predicted results.
#' the remaining quadrant should provide an explanation and succinct summary of the
#' task.
 
#' Upload to a bucket called Project1_DM_Dash by Wednesday March 29, COB.
 
#' ====================================== =


# setwd("C:/Users/arodriguez/Dropbox/classes/DataMining_Spr23/Project_1")
options(digits = 3, scipen = 9999)
remove(list = ls())
graphics.off()

pacman::p_load(magrittr, tidyverse, useful, janitor, class, randomForest)
pacman::p_load(yardstick)

#' ====================================== =

      
data <- read.csv("census.csv")  

data$workclass <- as.factor(data$workclass)
data$education <- as.factor(data$education)
data$marital.status <- as.factor(data$marital.status)
data$occupation <- as.factor(data$occupation)
data$relationship <- as.factor(data$relationship)
data$race <- as.factor(data$race)
data$sex <- as.factor(data$sex)
data$native.country <- as.factor(data$native.country)
data$income <- as.factor(data$income)


#' ====================================== =

library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(123) # Setting seed for reproducibility
index <- createDataPartition(data$income, p = 0.7, list = FALSE) # Splitting data into 70% training and 30% validation
train <- data[index, ] # Training dataset
valid <- data[-index, ] # Validation dataset
test <- data %>% sample_n(10) # Randomly sample 10 rows from the data dataframe

#' ====================================== =
# Training the random forest model
rf_model <- randomForest(income ~ ., data = train, ntree = 150, importance = TRUE)

# Predicting the validation dataset
valid_predictions <- predict(rf_model, valid)

# Comparing the predicted values with the actual values
comparison <- valid_predictions == valid$income

# Calculating the accuracy
accuracy <- mean(comparison)
cat("Accuracy:", accuracy * 100, "%\n")
## Accuracy: 86.2 %
# Creating a confusion matrix
confusion_matrix_rf <- table(Predicted = valid_predictions, Actual = valid$income)
print(confusion_matrix_rf)
##          Actual
## Predicted  <=50K  >50K
##     <=50K   6928   865
##     >50K     487  1487
#' ====================================== =
# Training the decision tree model

library(rpart)

# Training the decision tree model with tuned parameters
tuned_cp <- 0.01 # Replace with your tuned cp value
tuned_minsplit <- 20 # Replace with your tuned minsplit value
dt_model <- rpart(income ~ ., data = train, control = rpart.control(cp = tuned_cp, minsplit = tuned_minsplit))

# Predicting the validation dataset
valid_dt_predictions <- predict(dt_model, valid, type = "class")

# Comparing the predicted values with the actual values
dt_comparison <- valid_dt_predictions == valid$income

# Calculating the accuracy
dt_accuracy <- mean(dt_comparison)
cat("Decision Tree Accuracy:", dt_accuracy * 100, "%\n")
## Decision Tree Accuracy: 84.6 %
# Creating a confusion matrix
dt_confusion_matrix <- table(Predicted = valid_dt_predictions, Actual = valid$income)
print(dt_confusion_matrix)
##          Actual
## Predicted  <=50K  >50K
##     <=50K   7057  1150
##     >50K     358  1202
#' ====================================== =

# Plotting confusion matrix
confusion_matrix_to_dataframe <- function(cm) {
  df <- as.data.frame.table(cm)
  colnames(df) <- c("Predicted", "Actual", "Count")
  df
}


# Converting the confusion matrix to a tidy data frame
dataframe_rf <- confusion_matrix_to_dataframe(confusion_matrix_rf) 

# Creating a heatmap for the confusion matrix
ggplot(dataframe_rf, aes(x = Actual, y = Predicted, fill = Count)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Count), color = "black", size = 4) +
  labs(title = "Confusion Matrix (Random Forest)",
       x = "Actual Income",
       y = "Predicted Income",
       fill = "Count") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  theme_minimal()

# Converting the confusion matrix to a tidy data frame
dataframe_dt <- confusion_matrix_to_dataframe(dt_confusion_matrix) 

# Creating a heatmap for the confusion matrix
ggplot(dataframe_dt, aes(x = Actual, y = Predicted, fill = Count)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Count), color = "black", size = 4) +
  labs(title = "Confusion Matrix (Decision Tree)",
       x = "Actual Income",
       y = "Predicted Income",
       fill = "Count") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  theme_minimal()

#' ====================================== =
# Deploying model

test <- select(test, -income)

# Predicting income for the test dataset
test_predictions <- predict(rf_model, test)

# Combining the test data with the predictions
test_results <- cbind(test, Predicted_Income = test_predictions)

print(test_results)
##    age         workclass fnlwgt     education education.num
## 1   52           Private  61735          11th             7
## 2   45         Local-gov 160187       HS-grad             9
## 3   29         Local-gov 214706       HS-grad             9
## 4   56           Private  85018       HS-grad             9
## 5   75  Self-emp-not-inc 192813       Masters            14
## 6   49           Private 185041       Masters            14
## 7   23           Private 118693     Bachelors            13
## 8   43  Self-emp-not-inc  47261  Some-college            10
## 9   39         Local-gov 357962    Assoc-acdm            12
## 10  30       Federal-gov 321990  Some-college            10
##            marital.status        occupation   relationship   race     sex
## 1      Married-civ-spouse     Other-service        Husband  White    Male
## 2   Married-spouse-absent      Adm-clerical      Unmarried  Black  Female
## 3           Never-married     Other-service      Unmarried  White  Female
## 4           Never-married      Adm-clerical  Not-in-family  White  Female
## 5                 Widowed             Sales  Not-in-family  White    Male
## 6      Married-civ-spouse   Exec-managerial        Husband  White    Male
## 7           Never-married     Other-service  Not-in-family  White    Male
## 8      Married-civ-spouse   Farming-fishing        Husband  White    Male
## 9           Never-married  Transport-moving  Not-in-family  White    Male
## 10     Married-civ-spouse      Adm-clerical        Husband  White    Male
##    capital.gain capital.loss hours.per.week native.country Predicted_Income
## 1             0            0             40  United-States            <=50K
## 2             0            0             40  United-States            <=50K
## 3             0            0             40  United-States            <=50K
## 4             0            0             40  United-States            <=50K
## 5             0            0             45  United-States            <=50K
## 6             0         1977             40  United-States             >50K
## 7             0            0             35  United-States            <=50K
## 8             0            0             50  United-States            <=50K
## 9             0            0             48  United-States            <=50K
## 10         7298            0             48           Cuba             >50K
library(ggplot2)

# Creating a bar graph for the predictions
ggplot(test_results, aes(x = rownames(test_results), y = Predicted_Income, fill = Predicted_Income)) +
  geom_bar(stat = "identity") +
  labs(title = "Income Predictions for Test Individuals",
       x = "Individual ID",
       y = "Predicted Income",
       fill = "Income Class") +
  theme_minimal()

#' ====================================== =
#' Dashboard

# Loading gridExtra package
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.1.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
# Create a heatmap for the random forest confusion matrix
rf_cm_dataframe <- confusion_matrix_to_dataframe(confusion_matrix_rf)
rf_heatmap <- ggplot(rf_cm_dataframe, aes(x = Actual, y = Predicted, fill = Count)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Count), color = "black", size = 4) +
  labs(title = "Random Forest Confusion Matrix",
       x = "Actual Income",
       y = "Predicted Income",
       fill = "Count") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  theme_minimal()

# Create a heatmap for the decision tree confusion matrix
dt_cm_dataframe <- confusion_matrix_to_dataframe(dt_confusion_matrix)
dt_heatmap <- ggplot(dt_cm_dataframe, aes(x = Actual, y = Predicted, fill = Count)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Count), color = "black", size = 4) +
  labs(title = "Decision Tree Confusion Matrix",
       x = "Actual Income",
       y = "Predicted Income",
       fill = "Count") +
  scale_fill_gradient(low = "white", high = "lightgreen") +
  theme_minimal()

predictions_bar <- ggplot(test_results, aes(x = rownames(test_results), y = Predicted_Income, fill = Predicted_Income)) +
  geom_bar(stat = "identity") +
  labs(title = "Income Predictions for Test Individuals",
       x = "Individual ID",
       y = "Predicted Income",
       fill = "Income Class") +
  theme_minimal() +
  scale_fill_manual(values = c("#7b2cbf", "#d7b5e8"))






# Install the ggtext package (if not already installed)
if (!requireNamespace("ggtext", quietly = TRUE)) {
  install.packages("ggtext")
}

# Load the ggtext package
library(ggtext)
## Warning: package 'ggtext' was built under R version 4.1.3
summary_text <- "
<p><strong>Task Summary:</strong></p>
<p>This dashboard presents the results of predicting whether an individual's income exceeds </p>
<p>$50K per year using two classification algorithms: Random Forest and Decision Tree.</p>
<p>The confusion matrices for both algorithms show the performance in terms of correct and </p>
<p>incorrect predictions made on the validation dataset. A higher value on the diagonal indicates </p>
<p>better performance.</p>
<p>The bar graph in the third quadrant displays the predicted income classes for 10 randomly </p>
<p>selected individuals from the dataset using the chosen algorithm.</p>
"

library(grid)


summary_plot <- ggplot() +
  geom_blank() +
  theme_void() +
  theme(plot.margin = grid::unit(c(10, 10, 10, 10), "pt")) +
  ggtext::geom_richtext(
    aes(x = 0.5, y = 0.5, label = summary_text),
    hjust = 0.5,
    vjust = 0.5,
    fill = "white",
    label.color = NA,
    size = 4,
  )




# Create a 4 quadrant dashboard
dashboard <- gridExtra::grid.arrange(
  rf_heatmap, dt_heatmap,
  predictions_bar, summary_plot,
  ncol = 2, nrow = 2
)