# RUSSELL_DAS211_HW3.R
# Chapters 1-8-ish of Machine Learning for Data Scientists
# Date: October 30, 2025
#
# SMOOTH INSTALLATION FUNCTION - No more install prompts!
install_if_missing <- function(packages) {
  for (pkg in packages) {
    if (!require(pkg, character.only = TRUE, quietly = TRUE)) {
      cat("Installing missing package:", pkg, "\n")
      install.packages(pkg, dependencies = TRUE, repos = "https://cran.rstudio.com/")
      library(pkg, character.only = TRUE)
    }
  }
}

# Install required packages smoothly
required_packages <- c(
  "tidyverse", "class", "caret", "vroom", "naivebayes",
  "cluster", "nnet", "randomForest", "fontawesome"
)

cat("πŸ”§ Checking and installing required packages...\n")
## πŸ”§ Checking and installing required packages...
install_if_missing(required_packages)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'caret'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## 
## 
## Attaching package: 'vroom'
## 
## 
## The following objects are masked from 'package:readr':
## 
##     as.col_spec, col_character, col_date, col_datetime, col_double,
##     col_factor, col_guess, col_integer, col_logical, col_number,
##     col_skip, col_time, cols, cols_condense, cols_only, date_names,
##     date_names_lang, date_names_langs, default_locale, fwf_cols,
##     fwf_empty, fwf_positions, fwf_widths, locale, output_column,
##     problems, spec
## 
## 
## naivebayes 1.0.0 loaded
## 
## For more information please visit: 
## 
## https://majkamichal.github.io/naivebayes/
## 
## randomForest 4.7-1.2
## 
## Type rfNews() to see new features/changes/bug fixes.
## 
## 
## Attaching package: 'randomForest'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

# Libraries are already loaded by install_if_missing function above
cat("βœ… All packages loaded successfully!\n")
## βœ… All packages loaded successfully!
# Stale working directory from an earlier machine; superseded by the setwd() below
# setwd("/Users/brian.russell/Downloads/VSC 3")

# NOTE FOR TEACHER: package e1071 is not available for the latest
# version of R. Updated R due to constant popup warnings, but now regret it.
# TROUBLESHOOTING PROCESS:
# 1. Initially tried loading e1071 dependencies from CRAN source page
# 2. Hit parsing errors with long install.packages() lines - suspected line limits
# 3. Tried nolint commands - didn't work (that's for linters, not R parser)
# 4. Discovered the real issue: R session restart after package install
#    was clearing variables, so moved library() calls before variable assignment
# 5. Fixed syntax error: library(pkg1, pkg2, ...) is invalid - each needs own library() call
# 6. SOLUTION: Use naivebayes package as e1071 replacement for Naive Bayes functionality
# pkgs <- c("cluster", "mlbench", "nnet", "randomForest", "rpart", "SparseM", "xtable", "Matrix", "MASS", "slam")
# install.packages(pkgs)

# Install naivebayes as e1071 replacement
# install.packages("naivebayes")

# Explicit library() calls (redundant after install_if_missing above, but kept for clarity)
library(class)
library(caret)
library(vroom)
library(naivebayes) # Replacement for e1071's naiveBayes function
# Load some e1071 dependencies that might be useful
library(cluster)
library(nnet)
library(randomForest)
library(fontawesome)

# Set working directory
setwd("~/Library/CloudStorage/GoogleDrive-brian.russell@gallaudet.edu/My Drive/Gallaudet classes fall 2025/Machine Learning/R working directory/Assignment 3")

# ==================== PART 1: GALLSTONES ANALYSIS ====================

# 1. Load gallstones data
cat("*** Loading gallstones dataset ***\n")
## *** Loading gallstones dataset ***
gallstones <- read_csv("gallstones.csv")
## Rows: 319 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (39): Gallstone_Status, Age, Gender, Comorbidity, Coronary_Artery_Diseas...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Get a glimpse of the data
head(gallstones)
## # A tibble: 6 × 39
##   Gallstone_Status   Age Gender Comorbidity Coronary_Artery_Disease
##              <dbl> <dbl>  <dbl>       <dbl>                   <dbl>
## 1                0    50      0           0                       0
## 2                0    47      0           1                       0
## 3                0    61      0           0                       0
## 4                0    41      0           0                       0
## 5                0    42      0           0                       0
## 6                0    96      0           0                       0
## # ℹ 34 more variables: Hypothyroidism <dbl>, Hyperlipidemia <dbl>,
## #   Diabetes_Mellitus <dbl>, Height <dbl>, Weight <dbl>, BMI <dbl>,
## #   Total_Body_Water <dbl>, Extracellular_Water <dbl>,
## #   Intracellular_Water <dbl>,
## #   `Extracellular_Fluid _Total_Body_Water_ratio` <dbl>,
## #   Total_Body_Fat_Ratio <dbl>, Lean_Mass <dbl>, Body_Protein_Content <dbl>,
## #   Visceral_Fat_Rating <dbl>, Bone_Mass <dbl>, Muscle_Mass <dbl>, …
glimpse(gallstones)
## Rows: 319
## Columns: 39
## $ Gallstone_Status                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Age                                           <dbl> 50, 47, 61, 41, 42, 96, …
## $ Gender                                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Comorbidity                                   <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ Coronary_Artery_Disease                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Hypothyroidism                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Hyperlipidemia                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Diabetes_Mellitus                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Height                                        <dbl> 185, 176, 171, 168, 178,…
## $ Weight                                        <dbl> 92.8, 94.5, 91.1, 67.7, …
## $ BMI                                           <dbl> 27.1, 30.5, 31.2, 24.0, …
## $ Total_Body_Water                              <dbl> 52.9, 43.1, 47.2, 41.4, …
## $ Extracellular_Water                           <dbl> 21.2, 19.5, 20.1, 17.0, …
## $ Intracellular_Water                           <dbl> 31.7, 23.6, 27.1, 24.4, …
## $ `Extracellular_Fluid _Total_Body_Water_ratio` <dbl> 40, 45, 43, 41, 39, 46, …
## $ Total_Body_Fat_Ratio                          <dbl> 19.2, 32.8, 27.3, 15.8, …
## $ Lean_Mass                                     <dbl> 80.84, 67.20, 72.67, 84.…
## $ Body_Protein_Content                          <dbl> 18.88, 16.68, 16.35, 16.…
## $ Visceral_Fat_Rating                           <dbl> 9, 15, 15, 6, 8, 12, 3, …
## $ Bone_Mass                                     <dbl> 3.7, 3.2, 3.3, 2.9, 3.5,…
## $ Muscle_Mass                                   <dbl> 71.4, 60.3, 62.9, 54.1, …
## $ Obesity                                       <dbl> 23.4, 38.8, 41.7, 9.0, 2…
## $ Total_Fat_Content                             <dbl> 17.8, 31.0, 24.9, 10.7, …
## $ Visceral_Fat_Area                             <dbl> 10.6, 18.4, 16.2, 6.5, 1…
## $ `Visceral_Muscle_Are _Kg`                     <dbl> 39.7, 32.7, 34.0, 29.2, …
## $ Hepatic_Fat_Accumulation                      <dbl> 0, 0, 0, 1, 2, 0, 0, 3, …
## $ Glucose                                       <dbl> 102, 94, 103, 69, 109, 7…
## $ Total_Cholesterol                             <dbl> 250, 172, 179, 173, 205,…
## $ Low_Density_Lipoprotein                       <dbl> 175.0, 108.0, 124.0, 73.…
## $ High_Density_Lipoprotein                      <dbl> 40, 43, 43, 59, 30, 30, …
## $ Triglyceride                                  <dbl> 134, 103, 69, 53, 326, 6…
## $ Aspartat_Aminotransferase                     <dbl> 20, 14, 18, 20, 27, 13, …
## $ Alanin_Aminotransferase                       <dbl> 22, 13, 14, 12, 54, 13, …
## $ Alkaline_Phosphatase                          <dbl> 87, 46, 66, 34, 71, 60, …
## $ Creatinine                                    <dbl> 0.82, 0.87, 1.25, 1.02, …
## $ Glomerular_Filtration_Rate                    <dbl> 112.47, 107.10, 65.51, 9…
## $ C_Reactive_Protein                            <dbl> 0.00, 0.00, 0.00, 0.00, …
## $ Hemoglobin                                    <dbl> 16.0, 14.4, 16.2, 15.4, …
## $ Vitamin_D                                     <dbl> 33.00000, 25.00000, 30.2…
cat("\n*** Step 1: Dataset loaded! ***\n\n")
## 
## *** Step 1: Dataset loaded! ***
# 2. Normalize data using min/max normalization
cat("*** Normalizing gallstones data using min/max method ***\n")
## *** Normalizing gallstones data using min/max method ***
# Simple normalization function
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
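
# A safer variant (sketch, not used below): a constant column would make
# max(x) - min(x) zero and the normalize() above return NaN. Not an issue
# for this dataset, where every column varies, but worth guarding in general.
normalize_safe <- function(x) {
  rng <- max(x, na.rm = TRUE) - min(x, na.rm = TRUE)
  if (rng == 0) {
    return(rep(0, length(x))) # constant column: map everything to 0
  }
  (x - min(x, na.rm = TRUE)) / rng
}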

# Apply min/max normalization to every column except the target
gallstones_normalized <- gallstones %>%
  mutate(across(-Gallstone_Status, normalize))

# Show comparison
cat("Original summary (first few col):\n")
## Original summary (first few col):
summary(gallstones[2:6])
##       Age            Gender        Comorbidity     Coronary_Artery_Disease
##  Min.   :20.00   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000        
##  1st Qu.:38.50   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000        
##  Median :49.00   Median :0.0000   Median :0.0000   Median :0.00000        
##  Mean   :48.07   Mean   :0.4922   Mean   :0.3354   Mean   :0.03762        
##  3rd Qu.:56.00   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000        
##  Max.   :96.00   Max.   :1.0000   Max.   :3.0000   Max.   :1.00000        
##  Hypothyroidism   
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.02821  
##  3rd Qu.:0.00000  
##  Max.   :1.00000
cat("Normalized summary (same col):\n")
## Normalized summary (same col):
summary(gallstones_normalized[2:6])
##       Age             Gender        Comorbidity     Coronary_Artery_Disease
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000        
##  1st Qu.:0.2434   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000        
##  Median :0.3816   Median :0.0000   Median :0.0000   Median :0.00000        
##  Mean   :0.3693   Mean   :0.4922   Mean   :0.2142   Mean   :0.03762        
##  3rd Qu.:0.4737   3rd Qu.:1.0000   3rd Qu.:0.3333   3rd Qu.:0.00000        
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000        
##  Hypothyroidism   
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.02821  
##  3rd Qu.:0.00000  
##  Max.   :1.00000
cat("\n*** Step 2: Data norming completed ***\n\n")
## 
## *** Step 2: Data norming completed ***
# 3. Check for categorical data and create dummy variables if needed
cat("*** Check cat data 🐱🐱🐱\n")
## *** Check cat data 🐱🐱🐱
# Check data types
str(gallstones_normalized)
## tibble [319 × 39] (S3: tbl_df/tbl/data.frame)
##  $ Gallstone_Status                           : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Age                                        : num [1:319] 0.395 0.355 0.539 0.276 0.289 ...
##  $ Gender                                     : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Comorbidity                                : num [1:319] 0 0.333 0 0 0 ...
##  $ Coronary_Artery_Disease                    : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Hypothyroidism                             : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Hyperlipidemia                             : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Diabetes_Mellitus                          : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Height                                     : num [1:319] 0.87 0.674 0.565 0.5 0.717 ...
##  $ Weight                                     : num [1:319] 0.496 0.513 0.479 0.247 0.464 ...
##  $ BMI                                        : num [1:319] 0.3 0.406 0.427 0.204 0.337 ...
##  $ Total_Body_Water                           : num [1:319] 0.75 0.566 0.643 0.534 0.722 ...
##  $ Extracellular_Water                        : num [1:319] 0.649 0.559 0.59 0.426 0.585 ...
##  $ Intracellular_Water                        : num [1:319] 0.413 0.226 0.307 0.245 0.406 ...
##  $ Extracellular_Fluid _Total_Body_Water_ratio: num [1:319] 0.473 0.693 0.605 0.517 0.429 ...
##  $ Total_Body_Fat_Ratio                       : num [1:319] 0.289 0.594 0.471 0.213 0.307 ...
##  $ Lean_Mass                                  : num [1:319] 0.713 0.408 0.53 0.788 0.694 ...
##  $ Body_Protein_Content                       : num [1:319] 0.692 0.578 0.561 0.589 0.584 ...
##  $ Visceral_Fat_Rating                        : num [1:319] 0.267 0.467 0.467 0.167 0.233 ...
##  $ Bone_Mass                                  : num [1:319] 0.885 0.692 0.731 0.577 0.808 ...
##  $ Muscle_Mass                                : num [1:319] 0.9 0.75 0.785 0.667 0.857 ...
##  $ Obesity                                    : num [1:319] 0.0118 0.0197 0.0211 0.0044 0.0144 ...
##  $ Total_Fat_Content                          : num [1:319] 0.247 0.47 0.367 0.128 0.249 ...
##  $ Visceral_Fat_Area                          : num [1:319] 0.242 0.436 0.382 0.14 0.237 ...
##  $ Visceral_Muscle_Are _Kg                    : num [1:319] 0.937 0.622 0.68 0.464 0.833 ...
##  $ Hepatic_Fat_Accumulation                   : num [1:319] 0 0 0 0.25 0.5 0 0 0.75 0.5 0 ...
##  $ Glucose                                    : num [1:319] 0.0652 0.0494 0.0672 0 0.0791 ...
##  $ Total_Cholesterol                          : num [1:319] 0.633 0.373 0.397 0.377 0.483 ...
##  $ Low_Density_Lipoprotein                    : num [1:319] 0.582 0.344 0.401 0.22 0.507 ...
##  $ High_Density_Lipoprotein                   : num [1:319] 0.0605 0.0726 0.0726 0.1371 0.0202 ...
##  $ Triglyceride                               : num [1:319] 0.1585 0.1215 0.0808 0.0617 0.388 ...
##  $ Aspartat_Aminotransferase                  : num [1:319] 0.0642 0.0321 0.0535 0.0642 0.1016 ...
##  $ Alanin_Aminotransferase                    : num [1:319] 0.0515 0.0271 0.0298 0.0244 0.1382 ...
##  $ Alkaline_Phosphatase                       : num [1:319] 0.421 0.205 0.311 0.142 0.337 ...
##  $ Creatinine                                 : num [1:319] 0.36 0.41 0.79 0.56 0.36 1 0.31 0.84 0.45 0.45 ...
##  $ Glomerular_Filtration_Rate                 : num [1:319] 0.839 0.795 0.452 0.688 0.839 ...
##  $ C_Reactive_Protein                         : num [1:319] 0 0 0 0 0 ...
##  $ Hemoglobin                                 : num [1:319] 0.728 0.573 0.748 0.67 0.806 ...
##  $ Vitamin_D                                  : num [1:319] 0.595 0.433 0.538 0.643 0.748 ...
# All data appears to be numeric, so no dummy variables needed
gallstones_processed <- gallstones_normalized
cat("No stray cats found - proceeding with normalized data\n")
## No stray cats found - proceeding with normalized data
cat("Dataset dimensions:", dim(gallstones_processed), "\n")
## Dataset dimensions: 319 39
cat("\n*** Step 3: Data preparation completed ***\n\n")
## 
## *** Step 3: Data preparation completed ***
# 4. Create training and test datasets (following Ch7 approach)
cat("*** Creating training and test datasets for gallstones ***\n")
## *** Creating training and test datasets for gallstones ***
# Convert target to factor
gallstones_processed <- gallstones_processed %>%
  mutate(Gallstone_Status = as.factor(Gallstone_Status))

# Using sample() function like in Chapter 7
set.seed(1234)
sample_set <- sample(nrow(gallstones_processed), round(nrow(gallstones_processed) * 0.75), replace = FALSE)
gallstones_train <- gallstones_processed[sample_set, ]
gallstones_test <- gallstones_processed[-sample_set, ]

cat("Training set dimensions:", dim(gallstones_train), "\n")
## Training set dimensions: 239 39
cat("Test set dimensions:", dim(gallstones_test), "\n")
## Test set dimensions: 80 39
# Check class proportions
round(prop.table(table(gallstones_processed$Gallstone_Status)), 2)
## 
##   0   1 
## 0.5 0.5
round(prop.table(table(gallstones_train$Gallstone_Status)), 2)
## 
##    0    1 
## 0.52 0.48
round(prop.table(table(gallstones_test$Gallstone_Status)), 2)
## 
##    0    1 
## 0.45 0.55
# 5. Run K-nearest neighbors model (simplified)
cat("*** Running K-nearest neighbors model ***\n")
## *** Running K-nearest neighbors model ***
# Prepare data for KNN
train_features <- gallstones_train[, -1] # Remove target column
train_labels <- gallstones_train$Gallstone_Status # Target column as vector
test_features <- gallstones_test[, -1]
test_labels <- gallstones_test$Gallstone_Status # Target column as vector

# Run KNN with k=5
knn_pred <- knn(train = train_features, test = test_features, cl = train_labels, k = 5)

# Create confusion matrix
knn_table <- table(test_labels, knn_pred)
print(knn_table)
##            knn_pred
## test_labels  0  1
##           0 32  4
##           1 18 26
# Calculate accuracy
knn_accuracy <- sum(diag(knn_table)) / nrow(gallstones_test)
cat("KNN Accuracy:", round(knn_accuracy * 100, 2), "%\n")
## KNN Accuracy: 72.5 %
cat("\n*** Steps 4 & 5: KNN model completed ***\n\n")
## 
## *** Steps 4 & 5: KNN model completed ***
# Model improvement suggestions (simplified)
cat("*** Model Evaluation and Improvement Suggestions ***\n")
## *** Model Evaluation and Improvement Suggestions ***
cat("KNN Accuracy:", round(knn_accuracy * 100, 2), "%\n")
## KNN Accuracy: 72.5 %
cat("\nWays to improve model:\n")
## 
## Ways to improve model:
cat("1. Try different k values\n")
## 1. Try different k values
cat("2. Feature selection\n")
## 2. Feature selection
cat("3. Different algorithms\n")
## 3. Different algorithms
cat("4. Cross-validation\n")
## 4. Cross-validation
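
# Sketch for improvement 1 above (output not shown): scan odd k values with
# the train/test objects already defined and compare test accuracy.
k_values <- seq(1, 25, by = 2)
k_accuracy <- sapply(k_values, function(k) {
  pred <- knn(train = train_features, test = test_features,
              cl = train_labels, k = k)
  mean(pred == test_labels)
})
data.frame(k = k_values, accuracy = round(k_accuracy, 3))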
cat("\n*** Gallstones analysis completed ***\n\n")
## 
## *** Gallstones analysis completed ***
# ==================== PART 2: ADULT WORKING CATS DATASET ANALYSIS ====================
# NARRATIVE: Analyzing a dataset of adult working cats in search of predictive logic
# to determine which hardworking felines will eventually become the chonky "fat cats"
# (high earners >50K) vs those who remain regular working cats (≤50K)

# 6. Load adult working cats dataset
cat("🐱🐱🐱 Loading adult working cats dataset 🐱🐱🐱\n")
## 🐱🐱🐱 Loading adult working cats dataset 🐱🐱🐱
# Load the data
adult <- read_csv("adult.csv")
## Rows: 32561 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Workclass, Education, Marital_Status, Occupation, Relationship, Rac...
## dbl (6): Age, ID, Education_Code, Capital_Gain, Capital_Loss, Hours_per_Week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cat("Clean the cat box (remove leading/trailing whitespace \n")
## Clean the cat box (remove leading/trailing whitespace
adult <- adult %>%
  mutate_if(is.character, str_trim)

cat("Adult dataset dimensions:", dim(adult), "\n")
## Adult dataset dimensions: 32561 15
cat("Data structure:\n")
## Data structure:
str(adult)
## tibble [32,561 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Age           : num [1:32561] 39 50 38 53 28 37 49 52 31 42 ...
##  $ Workclass     : chr [1:32561] "State-gov" "Self-emp-not-inc" "Private" "Private" ...
##  $ ID            : num [1:32561] 77516 83311 215646 234721 338409 ...
##  $ Education     : chr [1:32561] "Bachelors" "Bachelors" "HS-grad" "11th" ...
##  $ Education_Code: num [1:32561] 13 13 9 7 13 14 5 9 14 13 ...
##  $ Marital_Status: chr [1:32561] "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
##  $ Occupation    : chr [1:32561] "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
##  $ Relationship  : chr [1:32561] "Not-in-family" "Husband" "Not-in-family" "Husband" ...
##  $ Race          : chr [1:32561] "White" "White" "White" "Black" ...
##  $ Gender        : chr [1:32561] "Male" "Male" "Male" "Male" ...
##  $ Capital_Gain  : num [1:32561] 2174 0 0 0 0 ...
##  $ Capital_Loss  : num [1:32561] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Hours_per_Week: num [1:32561] 40 13 40 40 40 40 16 45 50 40 ...
##  $ Native_Country: chr [1:32561] "United-States" "United-States" "United-States" "United-States" ...
##  $ Salary        : chr [1:32561] "<=50K" "<=50K" "<=50K" "<=50K" ...
cat("First few rows:\n")
## First few rows:
print(head(adult))
## # A tibble: 6 × 15
##     Age Workclass          ID Education Education_Code Marital_Status Occupation
##   <dbl> <chr>           <dbl> <chr>              <dbl> <chr>          <chr>     
## 1    39 State-gov       77516 Bachelors             13 Never-married  Adm-cleri…
## 2    50 Self-emp-not-…  83311 Bachelors             13 Married-civ-s… Exec-mana…
## 3    38 Private        215646 HS-grad                9 Divorced       Handlers-…
## 4    53 Private        234721 11th                   7 Married-civ-s… Handlers-…
## 5    28 Private        338409 Bachelors             13 Married-civ-s… Prof-spec…
## 6    37 Private        284582 Masters               14 Married-civ-s… Exec-mana…
## # ℹ 8 more variables: Relationship <chr>, Race <chr>, Gender <chr>,
## #   Capital_Gain <dbl>, Capital_Loss <dbl>, Hours_per_Week <dbl>,
## #   Native_Country <chr>, Salary <chr>
cat("\n🐱🐱🐱 Step 6: meow. Adult dataset loaded successfully 🐱🐱🐱\n\n")
## 
## 🐱🐱🐱 Step 6: meow. Adult dataset loaded successfully 🐱🐱🐱
# 7. Divide adult kitty dataset into categorical and numerical subsets
cat("🐱🐱🐱 Dividing working cats into categorical traits vs numerical metrics 🐱🐱🐱\n")
## 🐱🐱🐱 Dividing working cats into categorical traits vs numerical metrics 🐱🐱🐱
# Identify categorical and numerical columns
# (ID is likely a row identifier -- fnlwgt in the original UCI data -- and
# carries little predictive signal, but it is kept to match the column list)
numerical_adult_cols <- c("Age", "ID", "Education_Code", "Capital_Gain", "Capital_Loss", "Hours_per_Week")
categorical_adult_cols <- c(
  "Workclass", "Education", "Marital_Status", "Occupation",
  "Relationship", "Race", "Gender", "Native_Country"
)

# Create adult_cat: categorical features + Salary
adult_cat <- adult %>%
  select(all_of(categorical_adult_cols), Salary)

# Create adult_num: numerical features + Salary
adult_num <- adult %>%
  select(all_of(numerical_adult_cols), Salary)

cat("adult_cat dimensions:", dim(adult_cat), "\n")
## adult_cat dimensions: 32561 9
cat("adult_cat columns:", colnames(adult_cat), "\n")
## adult_cat columns: Workclass Education Marital_Status Occupation Relationship Race Gender Native_Country Salary
cat("adult_num dimensions:", dim(adult_num), "\n")
## adult_num dimensions: 32561 7
cat("adult_num columns:", colnames(adult_num), "\n")
## adult_num columns: Age ID Education_Code Capital_Gain Capital_Loss Hours_per_Week Salary
cat("\n🐱🐱🐱 Step 7: Herd of Adult Cats has been divided successfully 🐱🐱🐱\n\n")
## 
## 🐱🐱🐱 Step 7: Herd of Adult Cats has been divided successfully 🐱🐱🐱
# ==================== PART 3: ADULT_CAT ANALYSIS ====================

cat("🐱🐱🐱 Analyzing categorical cat traits to predict fat cat status: Who's a chonky boy, eh? 🐱🐱🐱\n")
## 🐱🐱🐱 Analyzing categorical cat traits to predict fat cat status: Who's a chonky boy, eh? 🐱🐱🐱
# 8a. Divide into training and test datasets
set.seed(1234)
adult_cat_train_index <- createDataPartition(adult_cat$Salary, p = 0.7, list = FALSE)
adult_cat_train <- adult_cat[adult_cat_train_index, ]
adult_cat_test <- adult_cat[-adult_cat_train_index, ]

cat("adult_cat training set dimensions:", dim(adult_cat_train), "\n")
## adult_cat training set dimensions: 22793 9
cat("adult_cat test set dimensions:", dim(adult_cat_test), "\n")
## adult_cat test set dimensions: 9768 9
# 8b. Check for class imbalance
cat("\nClass distribution in adult_cat training data:\n")
## 
## Class distribution in adult_cat training data:
salary_dist_train <- table(adult_cat_train$Salary)
print(salary_dist_train)
## 
## <=50K  >50K 
## 17304  5489
cat("Percentage distribution:\n")
## Percentage distribution:
print(round(prop.table(salary_dist_train) * 100, 2))
## 
## <=50K  >50K 
## 75.92 24.08
cat("\nClass distribution in adult_cat test data:\n")
## 
## Class distribution in adult_cat test data:
salary_dist_test <- table(adult_cat_test$Salary)
print(salary_dist_test)
## 
## <=50K  >50K 
##  7416  2352
cat("Percentage distribution:\n")
## Percentage distribution:
print(round(prop.table(salary_dist_test) * 100, 2))
## 
## <=50K  >50K 
## 75.92 24.08
# Check if data is not balanced (minority class < 20%)
minority_percent <- min(prop.table(salary_dist_train)) * 100
cat("\nMinority class percentage:", round(minority_percent, 2), "%\n")
## 
## Minority class percentage: 24.08 %
if (minority_percent < 20) {
  cat("Dataset shows class imbalance. SMOTE them or use class weights.\n")
} else {
  cat("Dataset appears reasonably balanced. Ok to proceed!\n")
}
## Dataset appears reasonably balanced. Ok to proceed!
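
# Had the minority class fallen under 20%, caret::downSample() is one quick
# fix: it samples the majority class down to the minority class size.
# Left commented out since this split passed the balance check.
# balanced_train <- downSample(x = adult_cat_train %>% select(-Salary),
#                              y = as.factor(adult_cat_train$Salary),
#                              yname = "Salary")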
# 8c. Run Naive Bayes classifier
cat("\n🐱🐱🐱 Running Naive Bayes model on adult_cat dataset 🐱🐱🐱\n")
## 
## 🐱🐱🐱 Running Naive Bayes model on adult_cat dataset 🐱🐱🐱
# Convert character columns to factors for naive bayes
# IMPORTANT: Ensure train and test have same factor levels to avoid prediction errors
adult_cat_train_nb <- adult_cat_train %>%
  mutate_if(is.character, as.factor)

adult_cat_test_nb <- adult_cat_test %>%
  mutate_if(is.character, as.factor)

# Align factor levels between training and test sets
for (col in names(adult_cat_train_nb)) {
  if (is.factor(adult_cat_train_nb[[col]])) {
    # Get all unique levels from both train and test
    all_levels <- unique(c(
      levels(adult_cat_train_nb[[col]]),
      levels(adult_cat_test_nb[[col]])
    ))
    # Set same levels for both datasets
    adult_cat_train_nb[[col]] <- factor(adult_cat_train_nb[[col]], levels = all_levels)
    adult_cat_test_nb[[col]] <- factor(adult_cat_test_nb[[col]], levels = all_levels)
  }
}

# Train Naive Bayes model (using naivebayes package instead of e1071)
# LAPLACE SMOOTHING CHOICE: Adding laplace=1 to handle zero probabilities
# Why Laplace instead of converting categorical to numeric ranges:
# 1. Preserves categorical meaning - "Masters" vs "HS-grad" aren't ordinal numbers
# 2. Handles rare combinations / outliers gracefully - adds small probability to unseen cases
# 3. Consistent interpretability - results still meaningful in original categories
# 4. Standard practice in NB - well-tested solution for this exact problem
# 5. Avoids arbitrary ordering - no need to assign values to "Private"=30 vs "Government"=60
nb_model_cat <- naive_bayes(Salary ~ ., data = adult_cat_train_nb, laplace = 1)

# 8d. Use model on test dataset
nb_pred_cat <- predict(nb_model_cat, adult_cat_test_nb[, -ncol(adult_cat_test_nb)])

# 8e. Calculate accuracy
nb_accuracy_cat <- sum(nb_pred_cat == adult_cat_test_nb$Salary) / nrow(adult_cat_test_nb)
nb_confusion_cat <- table(Predicted = nb_pred_cat, Actual = adult_cat_test_nb$Salary)
cat("Naive Bayes accuracy on adult_cat:", round(nb_accuracy_cat * 100, 2), "%\n")
## Naive Bayes accuracy on adult_cat: 79.78 %
cat("Confusion Matrix for adult_cat:\n")
## Confusion Matrix for adult_cat:
print(nb_confusion_cat)
##          Actual
## Predicted <=50K >50K
##     <=50K  6038  597
##     >50K   1378 1755
# Calculate additional metrics
precision_cat <- nb_confusion_cat[2, 2] / sum(nb_confusion_cat[2, ])
recall_cat <- nb_confusion_cat[2, 2] / sum(nb_confusion_cat[, 2])
f1_cat <- 2 * (precision_cat * recall_cat) / (precision_cat + recall_cat)

cat("Precision:", round(precision_cat, 3), "\n")
## Precision: 0.56
cat("Recall:", round(recall_cat, 3), "\n")
## Recall: 0.746
cat("F1-Score:", round(f1_cat, 3), "\n")
## F1-Score: 0.64
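
# Cross-check (sketch, output not shown): caret::confusionMatrix() reports
# the same accuracy, precision (Pos Pred Value), and recall (Sensitivity)
# in one call; positive = ">50K" matches the fat-cat class used above.
confusionMatrix(nb_pred_cat, adult_cat_test_nb$Salary, positive = ">50K")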
cat("\n🐱🐱🐱 Step 8: adult_cat analysis completed 🐱🐱🐱\n\n")
## 
## 🐱🐱🐱 Step 8: adult_cat analysis completed 🐱🐱🐱
# ==================== PART 4: ADULT_NUM ANALYSIS ====================

cat("🐱🐱🐱 Analyzing numerical cat metrics to predict eventual fat cat status 🐱🐱🐱\n")
## 🐱🐱🐱 Analyzing numerical cat metrics to predict eventual fat cat status 🐱🐱🐱
# 9. Repeat all steps for adult_num dataset

# 9a. Divide into training and test datasets (Salary stays in as the label;
# the split is reproducible when the script runs top-to-bottom, since the
# RNG stream continues deterministically from the set.seed(1234) call above)
adult_num_train_index <- createDataPartition(adult_num$Salary, p = 0.7, list = FALSE)
adult_num_train <- adult_num[adult_num_train_index, ]
adult_num_test <- adult_num[-adult_num_train_index, ]

cat("adult_num training set dimensions:", dim(adult_num_train), "\n")
## adult_num training set dimensions: 22793 7
cat("adult_num test set dimensions:", dim(adult_num_test), "\n")
## adult_num test set dimensions: 9768 7
# 9b. Check for class imbalance / social injustice
cat("\nClass distribution in adult_num training data:\n")
## 
## Class distribution in adult_num training data:
salary_dist_num_train <- table(adult_num_train$Salary)
print(salary_dist_num_train)
## 
## <=50K  >50K 
## 17304  5489
cat("Percentage distribution:\n")
## Percentage distribution:
print(round(prop.table(salary_dist_num_train) * 100, 2))
## 
## <=50K  >50K 
## 75.92 24.08
# 9c. Run Naive Bayes classifier on numerical data
cat("\n🐱🐱🐱 Running Naive Bayes on adult_num 🐱🐱🐱\n")
## 
## 🐱🐱🐱 Running Naive Bayes on adult_num 🐱🐱🐱
# Convert Salary to factor
adult_num_train$Salary <- as.factor(adult_num_train$Salary)
adult_num_test$Salary <- as.factor(adult_num_test$Salary)

# Train Naive Bayes model (using naivebayes package instead of e1071)
# Note: laplace smoothing only affects categorical predictors; the all-numeric
# features here get Gaussian densities, so laplace = 1 is a harmless no-op
# kept for consistency with the adult_cat model
nb_model_num <- naive_bayes(Salary ~ ., data = adult_num_train, laplace = 1)

# 9d. Use model on test dataset
nb_pred_num <- predict(nb_model_num, adult_num_test[, -ncol(adult_num_test)])

# 9e. Calculate accuracy
nb_accuracy_num <- sum(nb_pred_num == adult_num_test$Salary) / nrow(adult_num_test)
nb_confusion_num <- table(Predicted = nb_pred_num, Actual = adult_num_test$Salary)
cat("Naive Bayes accuracy on adult_num:", round(nb_accuracy_num * 100, 2), "%\n")
## Naive Bayes accuracy on adult_num: 79.37 %
cat("Confusion Matrix for adult_num:\n")
## Confusion Matrix for adult_num:
print(nb_confusion_num)
##          Actual
## Predicted <=50K >50K
##     <=50K  7067 1666
##     >50K    349  686
# Calculate additional metrics
precision_num <- nb_confusion_num[2, 2] / sum(nb_confusion_num[2, ])
recall_num <- nb_confusion_num[2, 2] / sum(nb_confusion_num[, 2])
f1_num <- 2 * (precision_num * recall_num) / (precision_num + recall_num)

cat("Precision:", round(precision_num, 3), "\n")
## Precision: 0.663
cat("Recall:", round(recall_num, 3), "\n")
## Recall: 0.292
cat("F1-Score:", round(f1_num, 3), "\n")
## F1-Score: 0.405
cat("\n🐱🐱🐱 Step 9: adult_num analysis completed 🐱🐱🐱\n\n")
## 
## 🐱🐱🐱 Step 9: adult_num analysis completed 🐱🐱🐱
# ==================== PART 5: COMPARISON AND RECOMMENDATIONS ====================

cat("🐱🐱🐱 FINAL COMPARISON AND RECOMMENDATIONS 🐱🐱🐱\n")
## 🐱🐱🐱 FINAL COMPARISON AND RECOMMENDATIONS 🐱🐱🐱
cat("ACCURACY COMPARISON:\n")
## ACCURACY COMPARISON:
cat("adult_cat (categorical features) accuracy:", round(nb_accuracy_cat * 100, 2), "%\n")
## adult_cat (categorical features) accuracy: 79.78 %
cat("adult_num (numerical features) accuracy:", round(nb_accuracy_num * 100, 2), "%\n") # nolint
## adult_num (numerical features) accuracy: 79.37 %
if (nb_accuracy_cat > nb_accuracy_num) {
  better_dataset <- "adult_cat (categorical features)"
  accuracy_diff <- nb_accuracy_cat - nb_accuracy_num
} else {
  better_dataset <- "adult_num (numerical features)"
  accuracy_diff <- nb_accuracy_num - nb_accuracy_cat
}

cat("\nBETTER PERFORMING DATASET:", better_dataset, "\n")
## 
## BETTER PERFORMING DATASET: adult_cat (categorical features)
cat("Accuracy difference:", round(accuracy_diff * 100, 2), "percentage points\n")
## Accuracy difference: 0.41 percentage points
cat("\nRECOMMENDATION REASONING:\n")
## 
## RECOMMENDATION REASONING:
cat("1. Accuracy: The", better_dataset, "achieved higher accuracy because of\n")
## 1. Accuracy: The adult_cat (categorical features) achieved higher accuracy because of
cat("2. Feature relevance: ")
## 2. Feature relevance:
if (nb_accuracy_cat > nb_accuracy_num) {
  cat("Categorical features like education, occupation, and marital status\n")
  cat("   are more informative for salary prediction than pure numerical features\n")
  cat("3. Naive Bayes assumptions: Work well with categorical data due to independence assumptions\n")
  cat("4. Interpretability: Categorical features provide more interpretable results\n")
} else {
  cat("Numerical features like age, hours worked, and capital gains/losses\n")
  cat("2. provide more actual predictive power for salary classification\n")
  cat("3. Quantitative relationships: Numerical data captures time-sensitive continuous relationships better\n")
  cat("4. Feature engineering: Numerical features can be edited to gain benefit from further transformation\n")
}
## Categorical features like education, occupation, and marital status
##    are more informative for salary prediction than pure numerical features
## 3. Naive Bayes assumptions: Work well with categorical data due to independence assumptions
## 4. Interpretability: Categorical features provide more interpretable results
cat("\nADDITIONAL CONSIDERATIONS & NOTES ON FURTHER WORK:\n")
## 
## ADDITIONAL CONSIDERATIONS & NOTES ON FURTHER WORK:
cat("- Combining BOTH categorical and numerical features might yield even better results\n")
## - Combining BOTH categorical and numerical features might yield even better results
cat("- Feature selection techniques could identify the most important predictors\n")
## - Feature selection techniques could identify the most important predictors
cat("- Other algorithms (Random Forest, Gradient Boosting) might perform MUCH better\n")
## - Other algorithms (Random Forest, Gradient Boosting) might perform MUCH better
cat("- Cross-validation would provide more robust accuracy estimates\n")
## - Cross-validation would provide more robust accuracy estimates
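# Sketch of that cross-validation idea (output not shown): caret's
# method = "naive_bayes" wraps the naivebayes package loaded above.
ctrl <- trainControl(method = "cv", number = 10)
nb_cv <- train(Salary ~ ., data = adult_cat_train_nb,
               method = "naive_bayes", trControl = ctrl)
print(nb_cv) # accuracy averaged over the 10 folds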
#
cat("\n🌲🌲🌲 (MY PICK) RANDOM FOREST: DISSECTIVE ANALYSIS OF THE MODEL PROCESS 🌲🌲🌲\n")
## 
## 🌲🌲🌲 (MY PICK) RANDOM FOREST: DISSECTIVE ANALYSIS OF THE MODEL PROCESS 🌲🌲🌲
cat("\n >>huh??? .. WHAT IS THIS RANDOM FOREST OF WHICH YOU SPEAK?.. ==>\n")
## 
##  >>huh??? .. WHAT IS THIS RANDOM FOREST OF WHICH YOU SPEAK?.. ==>
#
cat("Random Forest is an ensemble method that creates a 'forest' of decision trees\n")
## Random Forest is an ensemble method that creates a 'forest' of decision trees
cat("and combines their predictions for more accurate and robust results.\n")
## and combines their predictions for more accurate and robust results.
cat("\nABCD THE FOREST BUILDING PROCESS (Step-by-Step): DEFG\n")
## 
## ABCD THE FOREST BUILDING PROCESS (Step-by-Step): DEFG
cat("1. BOOTSTRAP SAMPLING: Create 100+ random samples of dataset (with replacement)\n")
## 1. BOOTSTRAP SAMPLING: Create 100+ random samples of dataset (with replacement)
cat("2. WHEEL OF FEATURE RANDOMNESS: For each tree, randomly select subset of features to consider\n")
## 2. WHEEL OF FEATURE RANDOMNESS: For each tree, randomly select subset of features to consider
cat("3. TREE GROWING: Build decision tree on each bootstrap sample starting from random features\n")
## 3. TREE GROWING: Build decision tree on each bootstrap sample starting from random features
cat("4. NO PRUNING: Let trees grow deep (unlike single trees that need pruning)\n")
## 4. NO PRUNING: Let trees grow deep (unlike single trees that need pruning)
cat("5. VOTING: For prediction, all trees 'vote' - majority wins (classification)\n")
## 5. VOTING: For prediction, all trees 'vote' - majority wins (classification)
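# Quick illustration of step 1 (output not shown): draw row indices with
# replacement and see what fraction of cats land in one bootstrap sample.
set.seed(42) # arbitrary seed, just for this illustration
boot_idx <- sample(nrow(adult), replace = TRUE)
length(unique(boot_idx)) / nrow(adult) # ~0.632; the rest are "out-of-bag"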
#
cat("\n=== WHY THIS WORKS MUCH BETTER THAN NAIVE BAYES: ===\n")
## 
## === WHY THIS WORKS MUCH BETTER THAN NAIVE BAYES: ===
cat("- HANDLES MIXED DATA: Cats can be 'Male' AND earn 50K - no independence assumption\n")
## - HANDLES MIXED DATA: Cats can be 'Male' AND earn 50K - no independence assumption
cat("- CAPTURES INTERACTIONS: Can learn 'Masters + Private sector = Fat Cat'\n")
## - CAPTURES INTERACTIONS: Can learn 'Masters + Private sector = Fat Cat'
cat("- ROBUST TO OUTLIERS: One weird glandular-obese cat doesn't break the model\n")
## - ROBUST TO OUTLIERS: One weird glandular-obese cat doesn't break the model
cat("- FEATURE IMPORTANCE: Tells you which cat GROWTH characteristics matter most\n")
## - FEATURE IMPORTANCE: Tells you which cat GROWTH characteristics matter most
cat("- HIGHER ACCURACY: Typically 80-85% vs 78% from Naive Bayes\n")
## - HIGHER ACCURACY: Typically 80-85% vs 78% from Naive Bayes
cat("\n (draft) IMPLEMENTATION FOR OUR WORKING CATS: \n")
## 
##  (draft) IMPLEMENTATION FOR OUR WORKING CATS:
cat("library(randomForest)\n")
## library(randomForest)
cat("# Combine all cat features (no need to split categorical/numerical!)\n")
## # Combine all cat features (no need to split categorical/numerical!)
cat("adult_combined <- adult  # Use all features together\n")
## adult_combined <- adult  # Use all features together
cat("rf_model <- randomForest(Salary ~ ., data=adult_combined, ntree=500)\n")
## rf_model <- randomForest(Salary ~ ., data=adult_combined, ntree=500)
cat("print(rf_model)  # Shows Out-of-Bag error rate (built-in cross-validation!)\n")
## print(rf_model)  # Shows Out-of-Bag error rate (built-in cross-validation!)
cat("importance(rf_model)  # Which features best predict fat cat status?\n")
## importance(rf_model)  # Which features best predict fat cat status?
cat("varImpPlot(rf_model)  # Visual ranking of important cat characteristics\n")
## varImpPlot(rf_model)  # Visual ranking of important cat characteristics
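# A runnable version of the draft above (sketch, output not shown).
# Assumptions: ID is dropped as a row identifier, and character columns are
# converted to factors, which randomForest() requires.
adult_rf <- adult %>%
  select(-ID) %>%
  mutate(across(where(is.character), as.factor))
set.seed(1234)
rf_model <- randomForest(Salary ~ ., data = adult_rf, ntree = 500)
print(rf_model)      # OOB error rate: the built-in accuracy estimate
importance(rf_model) # which features best predict fat cat status
varImpPlot(rf_model) # visual ranking of important cat characteristics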
cat("\n=== INTERPRETING RESULTS: ===\n")
## 
## === INTERPRETING RESULTS: ===
cat("- OOB Error Rate: Built-in accuracy estimate (no separate test set needed!)\n")
## - OOB Error Rate: Built-in accuracy estimate (no separate test set needed!)
cat("- Feature Importance: Higher values = better predictors of fat cat status\n")
## - Feature Importance: Higher values = better predictors of fat cat status
cat("- Example likely results: Education > Hours_per_Week > Age > Occupation\n")
## - Example likely results: Education > Hours_per_Week > Age > Occupation
cat("\n=== ADVANTAGES FOR WORKING CATS ANALYSIS: ===\n")
## 
## === ADVANTAGES FOR WORKING CATS ANALYSIS: ===
cat("1. NATURAL HANDLING: 'Married' cats vs 'Single' cats - no dummy variables!\n")
## 1. NATURAL HANDLING: 'Married' cats vs 'Single' cats - no dummy variables!
cat("2. MISSING DATA: If a cat's workclass is unknown, forest adapts automatically\n")
## 2. MISSING DATA: If a cat's workclass is unknown, forest adapts automatically
cat("3. NON-LINEAR: Can learn 'Young + Masters = Fat Cat' but 'Old + Masters = Not Fat Cat'\n")
## 3. NON-LINEAR: Can learn 'Young + Masters = Fat Cat' but 'Old + Masters = Not Fat Cat'
cat("4. CONFIDENCE: Can output probability of becoming fat cat (0-100%)\n")
## 4. CONFIDENCE: Can output probability of becoming fat cat (0-100%)
cat("5. EXPLANATION: Can trace decision path for any individual cat\n")
## 5. EXPLANATION: Can trace decision path for any individual cat
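# Sketch of advantage 4 (output not shown), assuming rf_model and adult_rf
# from the runnable sketch above: per-cat probability of fat cat status.
head(predict(rf_model, adult_rf, type = "prob"))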
cat("\n=== HYPERPARAMETER TUNING FOR OPTIMAL FAT CAT PREDICTION: ===\n")
## 
## === HYPERPARAMETER TUNING FOR OPTIMAL FAT CAT PREDICTION: ===
cat("- ntree: More trees = better accuracy (try 500-1000)\n")
## - ntree: More trees = better accuracy (try 500-1000)
cat("- mtry: Features per tree (default: sqrt(total_features) works well)\n")
## - mtry: Features per tree (default: sqrt(total_features) works well)
cat("- nodesize: Minimum cats per leaf (smaller = more detailed rules)\n")
## - nodesize: Minimum cats per leaf (smaller = more detailed rules)
cat("- maxnodes: Max decision points per tree (avoid processor-burning complexity)\n")
## - maxnodes: Max decision points per tree (avoid processor-burning complexity)
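# Sketch of mtry tuning (output not shown), assuming adult_rf from the
# runnable sketch above: tuneRF() steps mtry from the default until the
# OOB error stops improving by at least 1%.
set.seed(1234)
tuned_mtry <- tuneRF(x = adult_rf %>% select(-Salary), y = adult_rf$Salary,
                     ntreeTry = 200, stepFactor = 1.5, improve = 0.01)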
cat("\n=== FUTURE AWESOME RESEARCH DIRECTIONS: ===\n")
## 
## === FUTURE AWESOME RESEARCH DIRECTIONS: ===
cat("- Gradient Boosting: Even more sophisticated ensemble method\n")
## - Gradient Boosting: Even more sophisticated ensemble method
cat("- XGBoost: Often wins machine learning competitions\n")
## - XGBoost: Often wins machine learning competitions
cat("- Deep Learning: Neural networks for understanding complex cat behavior patterns\n")
## - Deep Learning: Neural networks for understanding complex cat behavior patterns
cat("\n🐱🐱🐱 FAT CAT PREDICTION ANALYSIS COMPLETED 🐱🐱🐱\n")
## 
## 🐱🐱🐱 FAT CAT PREDICTION ANALYSIS COMPLETED 🐱🐱🐱