# RUSSELL_DAS211_HW3.R
# Chapters 1-8-ish of Machine Learning for Data Scientists
# Date: October 30, 2025
#
# SMOOTH INSTALLATION FUNCTION - No more install prompts!
install_if_missing <- function(packages) {
  for (pkg in packages) {
    if (!require(pkg, character.only = TRUE, quietly = TRUE)) {
      cat("Installing missing package:", pkg, "\n")
      install.packages(pkg, dependencies = TRUE, repos = "https://cran.rstudio.com/")
      library(pkg, character.only = TRUE)
    }
  }
}
# Install required packages smoothly
required_packages <- c(
  "tidyverse", "class", "caret", "vroom", "naivebayes",
  "cluster", "nnet", "randomForest", "fontawesome"
)
cat("π§ Checking and installing required packages...\n")
## π§ Checking and installing required packages...
install_if_missing(required_packages)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'caret'
##
##
## The following object is masked from 'package:purrr':
##
## lift
##
##
##
## Attaching package: 'vroom'
##
##
## The following objects are masked from 'package:readr':
##
## as.col_spec, col_character, col_date, col_datetime, col_double,
## col_factor, col_guess, col_integer, col_logical, col_number,
## col_skip, col_time, cols, cols_condense, cols_only, date_names,
## date_names_lang, date_names_langs, default_locale, fwf_cols,
## fwf_empty, fwf_positions, fwf_widths, locale, output_column,
## problems, spec
##
##
## naivebayes 1.0.0 loaded
##
## For more information please visit:
##
## https://majkamichal.github.io/naivebayes/
##
## randomForest 4.7-1.2
##
## Type rfNews() to see new features/changes/bug fixes.
##
##
## Attaching package: 'randomForest'
##
##
## The following object is masked from 'package:dplyr':
##
## combine
##
##
## The following object is masked from 'package:ggplot2':
##
## margin
# Libraries are already loaded by install_if_missing function above
cat("β
All packages loaded successfully!\n")
## β
All packages loaded successfully!
# NOTE FOR TEACHER: package e1071 is not available for the latest
# version of R. Updated R due to constant popup warnings, but now regret it.
# TROUBLESHOOTING PROCESS:
# 1. Initially tried loading e1071 dependencies from CRAN source page
# 2. Hit parsing errors with long install.packages() lines - suspected line limits
# 3. Tried nolint commands - didn't work (that's for linters, not R parser)
# 4. Discovered the real issue: R session restart after package install
# was clearing variables, so moved library() calls before variable assignment
# 5. Fixed syntax error: library(pkg1, pkg2, ...) is invalid - each needs own library() call
# 6. SOLUTION: Use naivebayes package as e1071 replacement for Naive Bayes functionality
# pkgs <- c("cluster", "mlbench", "nnet", "randomForest", "rpart", "SparseM", "xtable", "Matrix", "MASS", "slam")
# install.packages(pkgs)
# Install naivebayes as e1071 replacement
# install.packages("naivebayes")
# Load libraries
library(class)
library(caret)
library(vroom)
library(naivebayes) # Replacement for e1071's naiveBayes function
# Load some e1071 dependencies that might be useful
library(cluster)
library(nnet)
library(randomForest)
library(fontawesome)
# Set working directory
setwd("~/Library/CloudStorage/GoogleDrive-brian.russell@gallaudet.edu/My Drive/Gallaudet classes fall 2025/Machine Learning/R working directory/Assignment 3")
# ==================== PART 1: GALLSTONES ANALYSIS ====================
# 1. Load gallstones data
cat("*** Loading gallstones dataset ***\n")
## *** Loading gallstones dataset ***
gallstones <- read_csv("gallstones.csv")
## Rows: 319 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (39): Gallstone_Status, Age, Gender, Comorbidity, Coronary_Artery_Diseas...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Get a glimpse of the data
head(gallstones)
## # A tibble: 6 × 39
## Gallstone_Status Age Gender Comorbidity Coronary_Artery_Disease
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 50 0 0 0
## 2 0 47 0 1 0
## 3 0 61 0 0 0
## 4 0 41 0 0 0
## 5 0 42 0 0 0
## 6 0 96 0 0 0
## # ℹ 34 more variables: Hypothyroidism <dbl>, Hyperlipidemia <dbl>,
## # Diabetes_Mellitus <dbl>, Height <dbl>, Weight <dbl>, BMI <dbl>,
## # Total_Body_Water <dbl>, Extracellular_Water <dbl>,
## # Intracellular_Water <dbl>,
## # `Extracellular_Fluid _Total_Body_Water_ratio` <dbl>,
## # Total_Body_Fat_Ratio <dbl>, Lean_Mass <dbl>, Body_Protein_Content <dbl>,
## # Visceral_Fat_Rating <dbl>, Bone_Mass <dbl>, Muscle_Mass <dbl>, …
glimpse(gallstones)
## Rows: 319
## Columns: 39
## $ Gallstone_Status <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Age <dbl> 50, 47, 61, 41, 42, 96, …
## $ Gender <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Comorbidity <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ Coronary_Artery_Disease <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Hypothyroidism <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Hyperlipidemia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Diabetes_Mellitus <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Height <dbl> 185, 176, 171, 168, 178,…
## $ Weight <dbl> 92.8, 94.5, 91.1, 67.7, …
## $ BMI <dbl> 27.1, 30.5, 31.2, 24.0, …
## $ Total_Body_Water <dbl> 52.9, 43.1, 47.2, 41.4, …
## $ Extracellular_Water <dbl> 21.2, 19.5, 20.1, 17.0, …
## $ Intracellular_Water <dbl> 31.7, 23.6, 27.1, 24.4, …
## $ `Extracellular_Fluid _Total_Body_Water_ratio` <dbl> 40, 45, 43, 41, 39, 46, …
## $ Total_Body_Fat_Ratio <dbl> 19.2, 32.8, 27.3, 15.8, …
## $ Lean_Mass <dbl> 80.84, 67.20, 72.67, 84.…
## $ Body_Protein_Content <dbl> 18.88, 16.68, 16.35, 16.…
## $ Visceral_Fat_Rating <dbl> 9, 15, 15, 6, 8, 12, 3, …
## $ Bone_Mass <dbl> 3.7, 3.2, 3.3, 2.9, 3.5,…
## $ Muscle_Mass <dbl> 71.4, 60.3, 62.9, 54.1, …
## $ Obesity <dbl> 23.4, 38.8, 41.7, 9.0, 2…
## $ Total_Fat_Content <dbl> 17.8, 31.0, 24.9, 10.7, …
## $ Visceral_Fat_Area <dbl> 10.6, 18.4, 16.2, 6.5, 1…
## $ `Visceral_Muscle_Are _Kg` <dbl> 39.7, 32.7, 34.0, 29.2, …
## $ Hepatic_Fat_Accumulation <dbl> 0, 0, 0, 1, 2, 0, 0, 3, …
## $ Glucose <dbl> 102, 94, 103, 69, 109, 7…
## $ Total_Cholesterol <dbl> 250, 172, 179, 173, 205,…
## $ Low_Density_Lipoprotein <dbl> 175.0, 108.0, 124.0, 73.…
## $ High_Density_Lipoprotein <dbl> 40, 43, 43, 59, 30, 30, …
## $ Triglyceride <dbl> 134, 103, 69, 53, 326, 6…
## $ Aspartat_Aminotransferase <dbl> 20, 14, 18, 20, 27, 13, …
## $ Alanin_Aminotransferase <dbl> 22, 13, 14, 12, 54, 13, …
## $ Alkaline_Phosphatase <dbl> 87, 46, 66, 34, 71, 60, …
## $ Creatinine <dbl> 0.82, 0.87, 1.25, 1.02, …
## $ Glomerular_Filtration_Rate <dbl> 112.47, 107.10, 65.51, 9…
## $ C_Reactive_Protein <dbl> 0.00, 0.00, 0.00, 0.00, …
## $ Hemoglobin <dbl> 16.0, 14.4, 16.2, 15.4, …
## $ Vitamin_D <dbl> 33.00000, 25.00000, 30.2…
cat("\n*** Step 1: Dataset loaded! ***\n\n")
##
## *** Step 1: Dataset loaded! ***
# 2. Normalize data using min/max normalization
cat("*** Normalizing gallstones data using min/max method ***\n")
## *** Normalizing gallstones data using min/max method ***
# Simple normalization function
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
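# Quick sanity check of the helper (a sketch, not run): values map onto [0, 1].
# normalize(c(2, 4, 6))  # returns 0.0 0.5 1.0
# Caveat: a constant column (max == min) would divide by zero and return NaN;
# no such column occurs in this dataset.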
# Apply normalization to every numeric column except the target
gallstones_normalized <- gallstones %>%
  mutate(across(-Gallstone_Status, normalize))
# Show comparison
cat("Original summary (first few col):\n")
## Original summary (first few col):
summary(gallstones[2:6])
## Age Gender Comorbidity Coronary_Artery_Disease
## Min. :20.00 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:38.50 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :49.00 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :48.07 Mean :0.4922 Mean :0.3354 Mean :0.03762
## 3rd Qu.:56.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :96.00 Max. :1.0000 Max. :3.0000 Max. :1.00000
## Hypothyroidism
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.02821
## 3rd Qu.:0.00000
## Max. :1.00000
cat("Normalized summary (same col):\n")
## Normalized summary (same col):
summary(gallstones_normalized[2:6])
## Age Gender Comorbidity Coronary_Artery_Disease
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2434 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.3816 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.3693 Mean :0.4922 Mean :0.1118 Mean :0.03762
## 3rd Qu.:0.4737 3rd Qu.:1.0000 3rd Qu.:0.3333 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## Hypothyroidism
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.02821
## 3rd Qu.:0.00000
## Max. :1.00000
cat("\n*** Step 2: Data norming completed ***\n\n")
##
## *** Step 2: Data norming completed ***
# 3. Check for categorical data and create dummy variables if needed
cat("*** Check cat data π±π±π±\n")
## *** Check cat data π±π±π±
# Check data types
str(gallstones_normalized)
## tibble [319 × 39] (S3: tbl_df/tbl/data.frame)
## $ Gallstone_Status : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Age : num [1:319] 0.395 0.355 0.539 0.276 0.289 ...
## $ Gender : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Comorbidity : num [1:319] 0 0.333 0 0 0 ...
## $ Coronary_Artery_Disease : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Hypothyroidism : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Hyperlipidemia : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Diabetes_Mellitus : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Height : num [1:319] 0.87 0.674 0.565 0.5 0.717 ...
## $ Weight : num [1:319] 0.496 0.513 0.479 0.247 0.464 ...
## $ BMI : num [1:319] 0.3 0.406 0.427 0.204 0.337 ...
## $ Total_Body_Water : num [1:319] 0.75 0.566 0.643 0.534 0.722 ...
## $ Extracellular_Water : num [1:319] 0.649 0.559 0.59 0.426 0.585 ...
## $ Intracellular_Water : num [1:319] 0.413 0.226 0.307 0.245 0.406 ...
## $ Extracellular_Fluid _Total_Body_Water_ratio: num [1:319] 0.473 0.693 0.605 0.517 0.429 ...
## $ Total_Body_Fat_Ratio : num [1:319] 0.289 0.594 0.471 0.213 0.307 ...
## $ Lean_Mass : num [1:319] 0.713 0.408 0.53 0.788 0.694 ...
## $ Body_Protein_Content : num [1:319] 0.692 0.578 0.561 0.589 0.584 ...
## $ Visceral_Fat_Rating : num [1:319] 0.267 0.467 0.467 0.167 0.233 ...
## $ Bone_Mass : num [1:319] 0.885 0.692 0.731 0.577 0.808 ...
## $ Muscle_Mass : num [1:319] 0.9 0.75 0.785 0.667 0.857 ...
## $ Obesity : num [1:319] 0.0118 0.0197 0.0211 0.0044 0.0144 ...
## $ Total_Fat_Content : num [1:319] 0.247 0.47 0.367 0.128 0.249 ...
## $ Visceral_Fat_Area : num [1:319] 0.242 0.436 0.382 0.14 0.237 ...
## $ Visceral_Muscle_Are _Kg : num [1:319] 0.937 0.622 0.68 0.464 0.833 ...
## $ Hepatic_Fat_Accumulation : num [1:319] 0 0 0 0.25 0.5 0 0 0.75 0.5 0 ...
## $ Glucose : num [1:319] 0.0652 0.0494 0.0672 0 0.0791 ...
## $ Total_Cholesterol : num [1:319] 0.633 0.373 0.397 0.377 0.483 ...
## $ Low_Density_Lipoprotein : num [1:319] 0.582 0.344 0.401 0.22 0.507 ...
## $ High_Density_Lipoprotein : num [1:319] 0.0605 0.0726 0.0726 0.1371 0.0202 ...
## $ Triglyceride : num [1:319] 0.1585 0.1215 0.0808 0.0617 0.388 ...
## $ Aspartat_Aminotransferase : num [1:319] 0.0642 0.0321 0.0535 0.0642 0.1016 ...
## $ Alanin_Aminotransferase : num [1:319] 0.0515 0.0271 0.0298 0.0244 0.1382 ...
## $ Alkaline_Phosphatase : num [1:319] 0.421 0.205 0.311 0.142 0.337 ...
## $ Creatinine : num [1:319] 0.36 0.41 0.79 0.56 0.36 1 0.31 0.84 0.45 0.45 ...
## $ Glomerular_Filtration_Rate : num [1:319] 0.839 0.795 0.452 0.688 0.839 ...
## $ C_Reactive_Protein : num [1:319] 0 0 0 0 0 ...
## $ Hemoglobin : num [1:319] 0.728 0.573 0.748 0.67 0.806 ...
## $ Vitamin_D : num [1:319] 0.595 0.433 0.538 0.643 0.748 ...
# All data appears to be numeric, so no dummy variables needed
gallstones_processed <- gallstones_normalized
cat("No stray cats found - proceeding with normalized data\n")
## No stray cats found - proceeding with normalized data
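# Had any categorical columns turned up, one option is caret's dummyVars()
# (a minimal sketch, not run here, since gallstones_normalized is all-numeric):
# dmy <- dummyVars(~ ., data = gallstones_normalized)
# gallstones_dummies <- as_tibble(predict(dmy, newdata = gallstones_normalized))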
cat("Dataset dimensions:", dim(gallstones_processed), "\n")
## Dataset dimensions: 319 39
cat("\n*** Step 3: Data preparation completed ***\n\n")
##
## *** Step 3: Data preparation completed ***
# 4. Create training and test datasets (following Ch7 approach)
cat("*** Creating training and test datasets for gallstones ***\n")
## *** Creating training and test datasets for gallstones ***
# Convert target to factor
gallstones_processed <- gallstones_processed %>%
  mutate(Gallstone_Status = as.factor(Gallstone_Status))
# Using sample() function like in Chapter 7
set.seed(1234)
sample_set <- sample(nrow(gallstones_processed), round(nrow(gallstones_processed) * 0.75), replace = FALSE)
gallstones_train <- gallstones_processed[sample_set, ]
gallstones_test <- gallstones_processed[-sample_set, ]
cat("Training set dimensions:", dim(gallstones_train), "\n")
## Training set dimensions: 239 39
cat("Test set dimensions:", dim(gallstones_test), "\n")
## Test set dimensions: 80 39
# Check class proportions
round(prop.table(table(gallstones_processed$Gallstone_Status)), 2)
##
## 0 1
## 0.5 0.5
round(prop.table(table(gallstones_train$Gallstone_Status)), 2)
##
## 0 1
## 0.52 0.48
round(prop.table(table(gallstones_test$Gallstone_Status)), 2)
##
## 0 1
## 0.45 0.55
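# Alternative split (a sketch, not run): caret's createDataPartition() stratifies
# on the target, keeping the 0/1 ratio closer between train and test than sample():
# idx <- createDataPartition(gallstones_processed$Gallstone_Status, p = 0.75, list = FALSE)
# gallstones_train <- gallstones_processed[idx, ]
# gallstones_test <- gallstones_processed[-idx, ]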
# 5. Run K-nearest neighbors model (simplified)
cat("*** Running K-nearest neighbors model ***\n")
## *** Running K-nearest neighbors model ***
# Prepare data for KNN
train_features <- gallstones_train[, -1] # Remove target column
train_labels <- gallstones_train$Gallstone_Status # Target column as vector
test_features <- gallstones_test[, -1]
test_labels <- gallstones_test$Gallstone_Status # Target column as vector
# Run KNN with k=5
knn_pred <- knn(train = train_features, test = test_features, cl = train_labels, k = 5)
# Create confusion matrix
knn_table <- table(test_labels, knn_pred)
print(knn_table)
## knn_pred
## test_labels 0 1
## 0 32 4
## 1 18 26
# Calculate accuracy
knn_accuracy <- sum(diag(knn_table)) / nrow(gallstones_test)
cat("KNN Accuracy:", round(knn_accuracy * 100, 2), "%\n")
## KNN Accuracy: 72.5 %
cat("\n*** Steps 4 & 5: KNN model completed ***\n\n")
##
## *** Steps 4 & 5: KNN model completed ***
# Model improvement suggestions (simplified)
cat("*** Model Evaluation and Improvement Suggestions ***\n")
## *** Model Evaluation and Improvement Suggestions ***
cat("KNN Accuracy:", round(knn_accuracy * 100, 2), "%\n")
## KNN Accuracy: 72.5 %
cat("\nWays to improve model:\n")
##
## Ways to improve model:
cat("1. Try different k values\n")
## 1. Try different k values
cat("2. Feature selection\n")
## 2. Feature selection
cat("3. Different algorithms\n")
## 3. Different algorithms
cat("4. Cross-validation\n")
## 4. Cross-validation
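# A sketch of suggestion 1 (not run): scan odd k values and compare test accuracy.
# k_grid <- seq(1, 25, by = 2)
# k_acc <- sapply(k_grid, function(k) {
#   mean(knn(train = train_features, test = test_features, cl = train_labels, k = k) == test_labels)
# })
# data.frame(k = k_grid, accuracy = k_acc)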
cat("\n*** Gallstones analysis completed ***\n\n")
##
## *** Gallstones analysis completed ***
# ==================== PART 2: ADULT WORKING CATS DATASET ANALYSIS ====================
# NARRATIVE: Analyzing a dataset of adult working cats in search of predictive logic
# to determine which hardworking felines will eventually become the chonky "fat cats"
# (high earners >50K) vs those who remain regular working cats (≤50K)
# 6. Load adult working cats dataset
cat("π±π±π± Loading adult working cats dataset π±π±π±\n")
## π±π±π± Loading adult working cats dataset π±π±π±
# Load the data
adult <- read_csv("adult.csv")
## Rows: 32561 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Workclass, Education, Marital_Status, Occupation, Relationship, Rac...
## dbl (6): Age, ID, Education_Code, Capital_Gain, Capital_Loss, Hours_per_Week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cat("Clean the cat box (remove leading/trailing whitespace \n")
## Clean the cat box (remove leading/trailing whitespace
adult <- adult %>%
  mutate_if(is.character, str_trim)
cat("Adult dataset dimensions:", dim(adult), "\n")
## Adult dataset dimensions: 32561 15
cat("Data structure:\n")
## Data structure:
str(adult)
## tibble [32,561 × 15] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:32561] 39 50 38 53 28 37 49 52 31 42 ...
## $ Workclass : chr [1:32561] "State-gov" "Self-emp-not-inc" "Private" "Private" ...
## $ ID : num [1:32561] 77516 83311 215646 234721 338409 ...
## $ Education : chr [1:32561] "Bachelors" "Bachelors" "HS-grad" "11th" ...
## $ Education_Code: num [1:32561] 13 13 9 7 13 14 5 9 14 13 ...
## $ Marital_Status: chr [1:32561] "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
## $ Occupation : chr [1:32561] "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
## $ Relationship : chr [1:32561] "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ Race : chr [1:32561] "White" "White" "White" "Black" ...
## $ Gender : chr [1:32561] "Male" "Male" "Male" "Male" ...
## $ Capital_Gain : num [1:32561] 2174 0 0 0 0 ...
## $ Capital_Loss : num [1:32561] 0 0 0 0 0 0 0 0 0 0 ...
## $ Hours_per_Week: num [1:32561] 40 13 40 40 40 40 16 45 50 40 ...
## $ Native_Country: chr [1:32561] "United-States" "United-States" "United-States" "United-States" ...
## $ Salary : chr [1:32561] "<=50K" "<=50K" "<=50K" "<=50K" ...
cat("First few rows:\n")
## First few rows:
print(head(adult))
## # A tibble: 6 × 15
## Age Workclass ID Education Education_Code Marital_Status Occupation
## <dbl> <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 39 State-gov 77516 Bachelors 13 Never-married Adm-cleri…
## 2 50 Self-emp-not-… 83311 Bachelors 13 Married-civ-s… Exec-mana…
## 3 38 Private 215646 HS-grad 9 Divorced Handlers-…
## 4 53 Private 234721 11th 7 Married-civ-s… Handlers-…
## 5 28 Private 338409 Bachelors 13 Married-civ-s… Prof-spec…
## 6 37 Private 284582 Masters 14 Married-civ-s… Exec-mana…
## # ℹ 8 more variables: Relationship <chr>, Race <chr>, Gender <chr>,
## # Capital_Gain <dbl>, Capital_Loss <dbl>, Hours_per_Week <dbl>,
## # Native_Country <chr>, Salary <chr>
cat("\nπ±π±π± Step 6: meow. Adult dataset loaded successfully π±π±π±\n\n")
##
## π±π±π± Step 6: meow. Adult dataset loaded successfully π±π±π±
# 7. Divide adult kitty dataset into categorical and numerical subsets
cat("π±π±π± Dividing working cats into categorical traits vs numerical metrics π±π±π±\n")
## π±π±π± Dividing working cats into categorical traits vs numerical metrics π±π±π±
# Identify categorical and numerical columns
# (ID is kept with the numeric set to mirror the assignment layout, though a
# row identifier is unlikely to carry real predictive signal)
numerical_adult_cols <- c("Age", "ID", "Education_Code", "Capital_Gain", "Capital_Loss", "Hours_per_Week")
categorical_adult_cols <- c(
  "Workclass", "Education", "Marital_Status", "Occupation",
  "Relationship", "Race", "Gender", "Native_Country"
)
# Create adult_cat: categorical features + Salary
adult_cat <- adult %>%
select(all_of(categorical_adult_cols), Salary)
# Create adult_num: numerical features + Salary
adult_num <- adult %>%
select(all_of(numerical_adult_cols), Salary)
cat("adult_cat dimensions:", dim(adult_cat), "\n")
## adult_cat dimensions: 32561 9
cat("adult_cat columns:", colnames(adult_cat), "\n")
## adult_cat columns: Workclass Education Marital_Status Occupation Relationship Race Gender Native_Country Salary
cat("adult_num dimensions:", dim(adult_num), "\n")
## adult_num dimensions: 32561 7
cat("adult_num columns:", colnames(adult_num), "\n")
## adult_num columns: Age ID Education_Code Capital_Gain Capital_Loss Hours_per_Week Salary
cat("\nπ±π±π± Step 7: Herd of Adult Cats has been divided successfully π±π±π±\n\n")
##
## π±π±π± Step 7: Herd of Adult Cats has been divided successfully π±π±π±
# ==================== PART 3: ADULT_CAT ANALYSIS ====================
cat("π±π±π± Analyzing categorical cat traits to predict fat cat status: Who's a chonky boy, eh? π±π±π±\n")
## π±π±π± Analyzing categorical cat traits to predict fat cat status: Who's a chonky boy, eh? π±π±π±
# 8a. Divide into training and test datasets
set.seed(1234)
adult_cat_train_index <- createDataPartition(adult_cat$Salary, p = 0.7, list = FALSE)
adult_cat_train <- adult_cat[adult_cat_train_index, ]
adult_cat_test <- adult_cat[-adult_cat_train_index, ]
cat("adult_cat training set dimensions:", dim(adult_cat_train), "\n")
## adult_cat training set dimensions: 22793 9
cat("adult_cat test set dimensions:", dim(adult_cat_test), "\n")
## adult_cat test set dimensions: 9768 9
# 8b. Check for class imbalance
cat("\nClass distribution in adult_cat training data:\n")
##
## Class distribution in adult_cat training data:
salary_dist_train <- table(adult_cat_train$Salary)
print(salary_dist_train)
##
## <=50K >50K
## 17304 5489
cat("Percentage distribution:\n")
## Percentage distribution:
print(round(prop.table(salary_dist_train) * 100, 2))
##
## <=50K >50K
## 75.92 24.08
cat("\nClass distribution in adult_cat test data:\n")
##
## Class distribution in adult_cat test data:
salary_dist_test <- table(adult_cat_test$Salary)
print(salary_dist_test)
##
## <=50K >50K
## 7416 2352
cat("Percentage distribution:\n")
## Percentage distribution:
print(round(prop.table(salary_dist_test) * 100, 2))
##
## <=50K >50K
## 75.92 24.08
# Check for class imbalance (flag if the minority class falls below 20%)
minority_percent <- min(prop.table(salary_dist_train)) * 100
cat("\nMinority class percentage:", round(minority_percent, 2), "%\n")
##
## Minority class percentage: 24.08 %
if (minority_percent < 20) {
  cat("Dataset shows class imbalance. Consider SMOTE or class weights (see the sketch below).\n")
} else {
  cat("Dataset appears reasonably balanced. Ok to proceed!\n")
}
## Dataset appears reasonably balanced. Ok to proceed!
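# Had the minority class fallen below 20%, one quick fix is caret's downSample(),
# which balances classes by under-sampling the majority (a sketch, not run):
# balanced_train <- downSample(
#   x = adult_cat_train[, -ncol(adult_cat_train)],
#   y = as.factor(adult_cat_train$Salary),
#   yname = "Salary"
# )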
# 8c. Run Naive Bayes classifier
cat("\nπ±π±π± Running Naive Bayes model on adult_cat dataset π±π±π±\n")
##
## π±π±π± Running Naive Bayes model on adult_cat dataset π±π±π±
# Convert character columns to factors for naive bayes
# IMPORTANT: Ensure train and test have same factor levels to avoid prediction errors
adult_cat_train_nb <- adult_cat_train %>%
  mutate_if(is.character, as.factor)
adult_cat_test_nb <- adult_cat_test %>%
  mutate_if(is.character, as.factor)
# Align factor levels between training and test sets
for (col in names(adult_cat_train_nb)) {
  if (is.factor(adult_cat_train_nb[[col]])) {
    # Get all unique levels from both train and test
    all_levels <- unique(c(
      levels(adult_cat_train_nb[[col]]),
      levels(adult_cat_test_nb[[col]])
    ))
    # Set same levels for both datasets
    adult_cat_train_nb[[col]] <- factor(adult_cat_train_nb[[col]], levels = all_levels)
    adult_cat_test_nb[[col]] <- factor(adult_cat_test_nb[[col]], levels = all_levels)
  }
}
# Train Naive Bayes model (using naivebayes package instead of e1071)
# LAPLACE SMOOTHING CHOICE: Adding laplace=1 to handle zero probabilities
# Why Laplace instead of converting categorical to numeric ranges:
# 1. Preserves categorical meaning - "Masters" vs "HS-grad" aren't ordinal numbers
# 2. Handles rare combinations / outliers gracefully - adds small probability to unseen cases
# 3. Consistent interpretability - results still meaningful in original categories
# 4. Standard practice in NB - well-tested solution for this exact problem
# 5. Avoids arbitrary ordering - no need to assign values to "Private"=30 vs "Government"=60
nb_model_cat <- naive_bayes(Salary ~ ., data = adult_cat_train_nb, laplace = 1)
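# Optional inspection (a sketch, not run): per-class conditional probability
# tables. tables() is the naivebayes accessor; with laplace = 1 no cell is
# exactly zero, which is the point of the smoothing choice above.
# tables(nb_model_cat, which = "Workclass")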
# 8d. Use model on test dataset
nb_pred_cat <- predict(nb_model_cat, adult_cat_test_nb[, -ncol(adult_cat_test_nb)])
# 8e. Calculate accuracy
nb_accuracy_cat <- sum(nb_pred_cat == adult_cat_test_nb$Salary) / nrow(adult_cat_test_nb)
nb_confusion_cat <- table(Predicted = nb_pred_cat, Actual = adult_cat_test_nb$Salary)
cat("Naive Bayes accuracy on adult_cat:", round(nb_accuracy_cat * 100, 2), "%\n")
## Naive Bayes accuracy on adult_cat: 79.78 %
cat("Confusion Matrix for adult_cat:\n")
## Confusion Matrix for adult_cat:
print(nb_confusion_cat)
## Actual
## Predicted <=50K >50K
## <=50K 6038 597
## >50K 1378 1755
# Calculate additional metrics
precision_cat <- nb_confusion_cat[2, 2] / sum(nb_confusion_cat[2, ]) # TP / (TP + FP)
recall_cat <- nb_confusion_cat[2, 2] / sum(nb_confusion_cat[, 2]) # TP / (TP + FN)
f1_cat <- 2 * (precision_cat * recall_cat) / (precision_cat + recall_cat) # harmonic mean
cat("Precision:", round(precision_cat, 3), "\n")
## Precision: 0.56
cat("Recall:", round(recall_cat, 3), "\n")
## Recall: 0.746
cat("F1-Score:", round(f1_cat, 3), "\n")
## F1-Score: 0.64
cat("\nπ±π±π± Step 8: adult_cat analysis completed π±π±π±\n\n")
##
## π±π±π± Step 8: adult_cat analysis completed π±π±π±
# ==================== PART 4: ADULT_NUM ANALYSIS ====================
cat("π±π±π± Analyzing numerical cat metrics to predict eventual fat cat status π±π±π±\n")
## π±π±π± Analyzing numerical cat metrics to predict eventual fat cat status π±π±π±
# 9. Repeat all steps for adult_num dataset
# 9a. Divide into training and test datasets, withholding SALARY
adult_num_train_index <- createDataPartition(adult_num$Salary, p = 0.7, list = FALSE)
adult_num_train <- adult_num[adult_num_train_index, ]
adult_num_test <- adult_num[-adult_num_train_index, ]
cat("adult_num training set dimensions:", dim(adult_num_train), "\n")
## adult_num training set dimensions: 22793 7
cat("adult_num test set dimensions:", dim(adult_num_test), "\n")
## adult_num test set dimensions: 9768 7
# 9b. Check for class imbalance / social injustice
cat("\nClass distribution in adult_num training data:\n")
##
## Class distribution in adult_num training data:
salary_dist_num_train <- table(adult_num_train$Salary)
print(salary_dist_num_train)
##
## <=50K >50K
## 17304 5489
cat("Percentage distribution:\n")
## Percentage distribution:
print(round(prop.table(salary_dist_num_train) * 100, 2))
##
## <=50K >50K
## 75.92 24.08
# 9c. Run Naive Bayes classifier on numerical data
cat("\nπ±π±π± Running Naive Bayes on adult_num π±π±π±\n")
##
## π±π±π± Running Naive Bayes on adult_num π±π±π±
# Convert Salary to factor
adult_num_train$Salary <- as.factor(adult_num_train$Salary)
adult_num_test$Salary <- as.factor(adult_num_test$Salary)
# Train Naive Bayes model (using naivebayes package instead of e1071)
# Note: laplace only affects discrete features; with all-Gaussian numeric
# predictors it is harmless here, kept for consistency with the adult_cat model
nb_model_num <- naive_bayes(Salary ~ ., data = adult_num_train, laplace = 1)
# 9d. Use model on test dataset
nb_pred_num <- predict(nb_model_num, adult_num_test[, -ncol(adult_num_test)])
# 9e. Calculate accuracy
nb_accuracy_num <- sum(nb_pred_num == adult_num_test$Salary) / nrow(adult_num_test)
nb_confusion_num <- table(Predicted = nb_pred_num, Actual = adult_num_test$Salary)
cat("Naive Bayes accuracy on adult_num:", round(nb_accuracy_num * 100, 2), "%\n")
## Naive Bayes accuracy on adult_num: 79.37 %
cat("Confusion Matrix for adult_num:\n")
## Confusion Matrix for adult_num:
print(nb_confusion_num)
## Actual
## Predicted <=50K >50K
## <=50K 7067 1666
## >50K 349 686
# Calculate additional metrics
precision_num <- nb_confusion_num[2, 2] / sum(nb_confusion_num[2, ]) # TP / (TP + FP)
recall_num <- nb_confusion_num[2, 2] / sum(nb_confusion_num[, 2]) # TP / (TP + FN)
f1_num <- 2 * (precision_num * recall_num) / (precision_num + recall_num) # harmonic mean
cat("Precision:", round(precision_num, 3), "\n")
## Precision: 0.663
cat("Recall:", round(recall_num, 3), "\n")
## Recall: 0.292
cat("F1-Score:", round(f1_num, 3), "\n")
## F1-Score: 0.405
cat("\nπ±π±π± Step 9: adult_num analysis completed π±π±π±\n\n")
##
## π±π±π± Step 9: adult_num analysis completed π±π±π±
# ==================== PART 5: COMPARISON AND RECOMMENDATIONS ====================
cat("π±π±π± FINAL COMPARISON AND RECOMMENDATIONS π±π±π±\n")
## π±π±π± FINAL COMPARISON AND RECOMMENDATIONS π±π±π±
cat("ACCURACY COMPARISON:\n")
## ACCURACY COMPARISON:
cat("adult_cat (categorical features) accuracy:", round(nb_accuracy_cat * 100, 2), "%\n")
## adult_cat (categorical features) accuracy: 79.78 %
cat("adult_num (numerical features) accuracy:", round(nb_accuracy_num * 100, 2), "%\n") # nolint
## adult_num (numerical features) accuracy: 79.37 %
if (nb_accuracy_cat > nb_accuracy_num) {
  better_dataset <- "adult_cat (categorical features)"
  accuracy_diff <- nb_accuracy_cat - nb_accuracy_num
} else {
  better_dataset <- "adult_num (numerical features)"
  accuracy_diff <- nb_accuracy_num - nb_accuracy_cat
}
cat("\nBETTER PERFORMING DATASET:", better_dataset, "\n")
##
## BETTER PERFORMING DATASET: adult_cat (categorical features)
cat("Accuracy difference:", round(accuracy_diff * 100, 2), "percentage points\n")
## Accuracy difference: 0.41 percentage points
cat("\nRECOMMENDATION REASONING:\n")
##
## RECOMMENDATION REASONING:
cat("1. Accuracy: The", better_dataset, "achieved higher accuracy because of\n")
## 1. Accuracy: The adult_cat (categorical features) achieved higher accuracy because of
cat("2. Feature relevance: ")
## 2. Feature relevance:
if (nb_accuracy_cat > nb_accuracy_num) {
  cat("Categorical features like education, occupation, and marital status\n")
  cat(" are more informative for salary prediction than pure numerical features\n")
  cat("3. Naive Bayes assumptions: Work well with categorical data due to independence assumptions\n")
  cat("4. Interpretability: Categorical features provide more interpretable results\n")
} else {
  cat("Numerical features like age, hours worked, and capital gains/losses\n")
  cat(" provide more predictive power for salary classification\n")
  cat("3. Quantitative relationships: Numerical data captures continuous relationships better\n")
  cat("4. Feature engineering: Numerical features can benefit from further transformation\n")
}
## Categorical features like education, occupation, and marital status
## are more informative for salary prediction than pure numerical features
## 3. Naive Bayes assumptions: Work well with categorical data due to independence assumptions
## 4. Interpretability: Categorical features provide more interpretable results
cat("\nADDITIONAL CONSIDERATIONS & NOTES ON FURTHER WORK:\n")
##
## ADDITIONAL CONSIDERATIONS & NOTES ON FURTHER WORK:
cat("- Combining BOTH categorical and numerical features might yield even better results\n")
## - Combining BOTH categorical and numerical features might yield even better results
cat("- Feature selection techniques could identify the most important predictors\n")
## - Feature selection techniques could identify the most important predictors
cat("- Other algorithms (Random Forest, Gradient Boosting) might perform MUCH better\n")
## - Other algorithms (Random Forest, Gradient Boosting) might perform MUCH better
cat("- Cross-validation would provide more robust accuracy estimates\n")
## - Cross-validation would provide more robust accuracy estimates
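# A sketch of the cross-validation idea (not run): caret's train() with 10-fold
# CV; method = "naive_bayes" is caret's wrapper around the naivebayes package
# already used above.
# ctrl <- trainControl(method = "cv", number = 10)
# nb_cv <- train(Salary ~ ., data = adult_cat_train_nb, method = "naive_bayes", trControl = ctrl)
# print(nb_cv)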
#
cat("\nπ²π²π² (MY PICK) RANDOM FOREST: DISSECTIVE ANALYSIS OF THE MODEL PROCESS π²π²π²\n")
##
## π²π²π² (MY PICK) RANDOM FOREST: DISSECTIVE ANALYSIS OF THE MODEL PROCESS π²π²π²
cat("\n >>huh??? .. WHAT IS THIS RANDOM FOREST OF WHICH YOU SPEAK?.. ==>\n")
##
## >>huh??? .. WHAT IS THIS RANDOM FOREST OF WHICH YOU SPEAK?.. ==>
#
cat("Random Forest is an ensemble method that creates a 'forest' of decision trees\n")
## Random Forest is an ensemble method that creates a 'forest' of decision trees
cat("and combines their predictions for more accurate and robust results.\n")
## and combines their predictions for more accurate and robust results.
cat("\nABCD THE FOREST BUILDING PROCESS (Step-by-Step): DEFG\n")
##
## ABCD THE FOREST BUILDING PROCESS (Step-by-Step): DEFG
cat("1. BOOTSTRAP SAMPLING: Create 100+ random samples of dataset (with replacement)\n")
## 1. BOOTSTRAP SAMPLING: Create 100+ random samples of dataset (with replacement)
cat("2. WHEEL OF FEATURE RANDOMNESS: For each tree, randomly select subset of features to consider\n")
## 2. WHEEL OF FEATURE RANDOMNESS: For each tree, randomly select subset of features to consider
cat("3. TREE GROWING: Build decision tree on each bootstrap sample starting from random features\n")
## 3. TREE GROWING: Build decision tree on each bootstrap sample starting from random features
cat("4. NO PRUNING: Let trees grow deep (unlike single trees that need pruning)\n")
## 4. NO PRUNING: Let trees grow deep (unlike single trees that need pruning)
cat("5. VOTING: For prediction, all trees 'vote' - majority wins (classification)\n")
## 5. VOTING: For prediction, all trees 'vote' - majority wins (classification)
#
cat("\n=== WHY THIS WORKS MUCH BETTER THAN NAIVE BAYES: ===\n")
##
## === WHY THIS WORKS MUCH BETTER THAN NAIVE BAYES: ===
cat("- HANDLES MIXED DATA: Cats can be 'Male' AND earn 50K - no independence assumption\n")
## - HANDLES MIXED DATA: Cats can be 'Male' AND earn 50K - no independence assumption
cat("- CAPTURES INTERACTIONS: Can learn 'Masters + Private sector = Fat Cat'\n")
## - CAPTURES INTERACTIONS: Can learn 'Masters + Private sector = Fat Cat'
cat("- ROBUST TO OUTLIERS: One weird glandular-obese cat doesn't break the model\n")
## - ROBUST TO OUTLIERS: One weird glandular-obese cat doesn't break the model
cat("- FEATURE IMPORTANCE: Tells you which cat GROWTH characteristics matter most\n")
## - FEATURE IMPORTANCE: Tells you which cat GROWTH characteristics matter most
cat("- HIGHER ACCURACY: Typically 80-85% vs 78% from Naive Bayes\n")
## - HIGHER ACCURACY: Typically 80-85% vs 78% from Naive Bayes
cat("\n (draft) IMPLEMENTATION FOR OUR WORKING CATS: \n")
##
## (draft) IMPLEMENTATION FOR OUR WORKING CATS:
cat("library(randomForest)\n")
## library(randomForest)
cat("# Combine all cat features (no need to split categorical/numerical!)\n")
## # Combine all cat features (no need to split categorical/numerical!)
cat("adult_combined <- adult # Use all features together\n")
## adult_combined <- adult # Use all features together
cat("rf_model <- randomForest(Salary ~ ., data=adult_combined, ntree=500)\n")
## rf_model <- randomForest(Salary ~ ., data=adult_combined, ntree=500)
cat("print(rf_model) # Shows Out-of-Bag error rate (built-in cross-validation!)\n")
## print(rf_model) # Shows Out-of-Bag error rate (built-in cross-validation!)
cat("importance(rf_model) # Which features best predict fat cat status?\n")
## importance(rf_model) # Which features best predict fat cat status?
cat("varImpPlot(rf_model) # Visual ranking of important cat characteristics\n")
## varImpPlot(rf_model) # Visual ranking of important cat characteristics
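# A runnable version of the draft above (a sketch, not executed for this report).
# randomForest() needs factors rather than character columns, and the ID column
# carries no predictive meaning, so both are handled first:
# adult_rf <- adult %>%
#   select(-ID) %>%
#   mutate(across(where(is.character), as.factor))
# set.seed(1234)
# rf_model <- randomForest(Salary ~ ., data = adult_rf, ntree = 500)
# print(rf_model)        # OOB error rate (built-in accuracy estimate)
# varImpPlot(rf_model)   # visual feature-importance ranking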
cat("\n=== INTERPRETING RESULTS: ===\n")
##
## === INTERPRETING RESULTS: ===
cat("- OOB Error Rate: Built-in accuracy estimate (no separate test set needed!)\n")
## - OOB Error Rate: Built-in accuracy estimate (no separate test set needed!)
cat("- Feature Importance: Higher values = better predictors of fat cat status\n")
## - Feature Importance: Higher values = better predictors of fat cat status
cat("- Example likely results: Education > Hours_per_Week > Age > Occupation\n")
## - Example likely results: Education > Hours_per_Week > Age > Occupation
cat("\n=== ADVANTAGES FOR WORKING CATS ANALYSIS: ===\n")
##
## === ADVANTAGES FOR WORKING CATS ANALYSIS: ===
cat("1. NATURAL HANDLING: 'Married' cats vs 'Single' cats - no dummy variables!\n")
## 1. NATURAL HANDLING: 'Married' cats vs 'Single' cats - no dummy variables!
cat("2. MISSING DATA: If a cat's workclass is unknown, forest adapts automatically\n")
## 2. MISSING DATA: If a cat's workclass is unknown, forest adapts automatically
cat("3. NON-LINEAR: Can learn 'Young + Masters = Fat Cat' but 'Old + Masters = Not Fat Cat'\n")
## 3. NON-LINEAR: Can learn 'Young + Masters = Fat Cat' but 'Old + Masters = Not Fat Cat'
cat("4. CONFIDENCE: Can output probability of becoming fat cat (0-100%)\n")
## 4. CONFIDENCE: Can output probability of becoming fat cat (0-100%)
cat("5. EXPLANATION: Can trace decision path for any individual cat\n")
## 5. EXPLANATION: Can trace decision path for any individual cat
cat("\n=== HYPERPARAMETER TUNING FOR OPTIMAL FAT CAT PREDICTION: ===\n")
##
## === HYPERPARAMETER TUNING FOR OPTIMAL FAT CAT PREDICTION: ===
cat("- ntree: More trees = better accuracy (try 500-1000)\n")
## - ntree: More trees = better accuracy (try 500-1000)
cat("- mtry: Features per tree (default: sqrt(total_features) works well)\n")
## - mtry: Features per tree (default: sqrt(total_features) works well)
cat("- nodesize: Minimum cats per leaf (smaller = more detailed rules)\n")
## - nodesize: Minimum cats per leaf (smaller = more detailed rules)
cat("- maxnodes: Max decision points per tree (avoid processor-burning complexity)\n")
## - maxnodes: Max decision points per tree (avoid processor-burning complexity)
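# mtry can also be searched directly (a sketch, not run, assuming adult_rf from
# the sketch above): tuneRF() steps mtry up and down and keeps the value with
# the lowest OOB error.
# x_rf <- adult_rf[, setdiff(names(adult_rf), "Salary")]
# tuneRF(x_rf, adult_rf$Salary, ntreeTry = 500, stepFactor = 1.5, improve = 0.01)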
cat("\n=== FUTURE AWESOME RESEARCH DIRECTIONS: ===\n")
##
## === FUTURE AWESOME RESEARCH DIRECTIONS: ===
cat("- Gradient Boosting: Even more sophisticated ensemble method\n")
## - Gradient Boosting: Even more sophisticated ensemble method
cat("- XGBoost: Often wins machine learning competitions\n")
## - XGBoost: Often wins machine learning competitions
cat("- Deep Learning: Neural networks for understanding complex cat behavior patterns\n")
## - Deep Learning: Neural networks for understanding complex cat behavior patterns
cat("\nπ±π±π± FAT CAT PREDICTION ANALYSIS COMPLETED π±π±π±\n")
##
## π±π±π± FAT CAT PREDICTION ANALYSIS COMPLETED π±π±π±