R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown, see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load libraries ----
# Each package is installed only when require() fails to attach it.
# install.packages() is given an explicit repository because, in a
# non-interactive session (e.g. while knitting) with no mirror configured, it
# stops with "trying to use CRAN without setting a mirror". A mirror the user
# has already configured is left untouched.
pkg_repos <- getOption("repos")
if (!"CRAN" %in% names(pkg_repos) || identical(pkg_repos[["CRAN"]], "@CRAN@")) {
  pkg_repos["CRAN"] <- "https://cloud.r-project.org"
}

if (!require("randomForest")) {
  install.packages("randomForest", repos = pkg_repos)
}
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
if (!require("caret")) {
  install.packages("caret", repos = pkg_repos)
}
## Loading required package: caret
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
if (!require("dplyr")) {
  install.packages("dplyr", repos = pkg_repos)
}
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# library() raises an error when a package is still unavailable, so a failed
# install stops the script here rather than at the first use of the package.
library(randomForest)
library(caret)
library(dplyr)

# Load dataset ----
# The CSV location defaults to the original path but can be overridden with
# the INCOME_CSV environment variable, so the script can run on machines
# where that user-specific directory does not exist.
income_path <- Sys.getenv(
  "INCOME_CSV",
  unset = "C:/Users/yeu3178/Downloads/income.csv"
)

# Fail early with a message that names the path that was tried.
if (!file.exists(income_path)) {
  stop("Income data file not found: ", income_path, call. = FALSE)
}

# strip.white = TRUE trims leading/trailing whitespace from unquoted fields.
income <- read.csv(income_path, strip.white = TRUE)

# Clean the raw table ----
# Strip stray whitespace from the column names before any column is
# referenced by name.
income <- setNames(income, trimws(names(income)))

# One pipeline:
#   1. drop 'fnlwgt', which is not used as a predictor;
#   2. turn the target column 'income' into a factor;
#   3. turn every remaining character column into a factor;
#   4. keep only rows in which no column is NA.
# NOTE(review): complete.cases() only detects NA. If the CSV marks unknown
# values with a placeholder string (e.g. "?"), those rows are kept and the
# placeholder becomes an ordinary factor level -- confirm against the file
# and, if so, pass na.strings to read.csv().
income <- income %>%
  select(-fnlwgt) %>%
  mutate(income = as.factor(income)) %>%
  mutate(across(where(is.character), as.factor)) %>%
  filter(complete.cases(.))

# Train/test split ----
# Fix the RNG state so the partition below, and every random step that
# follows it, is repeatable from run to run.
set.seed(123)

# p = 0.7 places 70% of the rows in the training set. list = FALSE returns
# the selected row positions as a matrix rather than a list; per caret's
# documentation the sampling is done within the levels of a factor outcome.
train_index <- createDataPartition(y = income$income, p = 0.7, list = FALSE)

# Rows at the sampled positions train the model; all other rows are held out.
train_data <- income[train_index, , drop = FALSE]
test_data <- income[-train_index, , drop = FALSE]

# Fit the random forest ----
# 'income' is modelled on every other column of train_data using 100 trees.
# importance = TRUE asks the forest to assess variable importance while it
# is being grown.
rf_model <- randomForest(
  income ~ .,
  data = train_data,
  ntree = 100,
  importance = TRUE
)

# Score the held-out rows ----
predictions <- predict(rf_model, newdata = test_data)

# Cross-tabulate predicted against actual classes. No 'positive' class is
# given, and the printed summary below reports '<=50K' as the positive class,
# so sensitivity and specificity are relative to '<=50K'.
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data$income
)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K >50K
##      <=50K  6881  825
##      >50K    535 1527
##                                           
##                Accuracy : 0.8608          
##                  95% CI : (0.8537, 0.8676)
##     No Information Rate : 0.7592          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6025          
##                                           
##  Mcnemar's Test P-Value : 4.629e-15       
##                                           
##             Sensitivity : 0.9279          
##             Specificity : 0.6492          
##          Pos Pred Value : 0.8929          
##          Neg Pred Value : 0.7405          
##              Prevalence : 0.7592          
##          Detection Rate : 0.7044          
##    Detection Prevalence : 0.7889          
##       Balanced Accuracy : 0.7885          
##                                           
##        'Positive' Class : <=50K           
## 
# Variable importance ----
# One row per predictor; as the printed matrix shows, the columns are the
# per-class scores followed by MeanDecreaseAccuracy and MeanDecreaseGini.
rf_importance <- importance(rf_model)
print(rf_importance)
##                    <=50K       >50K MeanDecreaseAccuracy MeanDecreaseGini
## age             3.187045 43.8012857            34.571891         862.5649
## workclass      17.816550 10.2697497            21.139305         280.1303
## education      23.440219  3.8471084            25.495615         480.9550
## education.num  15.463024  6.6994333            17.389904         497.2038
## marital.status 21.696565  9.7251914            18.871268         749.9790
## occupation     22.938980 34.3121892            49.212528         676.9711
## relationship   10.808329 15.1608875            14.607272         861.9121
## race            7.075884  3.7079604             8.433893         105.6482
## sex            17.930214  4.5334875            17.992969          91.1475
## capital.gain   63.947738 83.9044812            83.696436         869.3418
## capital.loss   20.771427 38.1122142            35.274118         235.7174
## hours.per.week  6.063176 32.9118685            27.319526         507.4152
## native.country 11.152617 -0.9787303             8.778527         202.1890

# Plot the importance measures of the fitted forest.
varImpPlot(rf_model)

Including Plots

You can also embed plots, for example:

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.