R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Install required packages, but only those that are not already installed.
#
# Calling install.packages() unconditionally re-downloads every package on
# each knit. A previous run of this document also logged "cannot remove prior
# installation ... Permission denied" warnings for rpart, caret and dplyr when
# reinstalling over existing copies. Installing only what is absent avoids
# both the repeated downloads and those warnings.
required_packages <- c(
  "rpart", "rpart.plot", "caret", "readr", "dplyr", "randomForest"
)

# system.file() returns "" for a package that is not installed; unlike
# requireNamespace() it does not load the package as a side effect.
is_installed <- vapply(
  required_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_packages <- required_packages[!is_installed]

if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = "https://cran.rstudio.com/")
}
# Load libraries
# Attach order matters: a package attached later masks same-named functions
# from packages attached earlier (see the masking notices recorded below), so
# keep this order unless the masking is reviewed.
library(rpart)
## Warning: package 'rpart' was built under R version 4.4.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
# caret also attaches ggplot2 and lattice as required packages.
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
# dplyr masks filter() and lag() from stats, and the set operations
# intersect(), setdiff(), setequal() and union() from base.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Attached last: its combine() masks dplyr's and its margin() masks ggplot2's.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Load the dataset
# The CSV location was previously fixed to one user's Downloads folder, which
# prevents knitting on any other machine. It can now be overridden with the
# INCOME_CSV environment variable; when that is unset the original path is
# used, so existing behaviour is unchanged.
income_path <- Sys.getenv(
  "INCOME_CSV",
  unset = "C:/Users/otuata4438/Downloads/income.csv"
)
income <- read_csv(income_path)
## Rows: 32561 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): workclass, education, marital-status, occupation, relationship, rac...
## dbl (6): age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fix column names to be R-safe (e.g., education-num -> education.num).
# unique = TRUE keeps names distinct if two different originals sanitise to
# the same string.
names(income) <- make.names(names(income), unique = TRUE)

# Convert target variable to factor
income$income <- as.factor(income$income)

# Convert the remaining character columns to factors.
# vapply() always yields a logical vector (sapply() returns an empty list for
# zero-column input, which cannot be used as a column selector).
char_cols <- vapply(income, is.character, logical(1))
if (any(char_cols)) {
  income[char_cols] <- lapply(income[char_cols], as.factor)
}

# Remove rows containing missing values
income <- na.omit(income)

# Split into training (70%) and test (30%) sets; the seed makes the
# partition reproducible.
set.seed(123)
train_index <- createDataPartition(income$income, p = 0.7, list = FALSE)
train_data <- income[train_index, ]
test_data <- income[-train_index, ]

# Fit a classification tree with rpart on the training split, using every
# other column as a predictor of income
tree_model <- rpart(income ~ ., data = train_data, method = "class")

# Draw the fitted tree
rpart.plot(
  tree_model,
  type = 2,
  extra = 104,
  fallen.leaves = TRUE,
  main = "Decision Tree for Income Prediction"
)

# Predict classes for the held-out test split and compare them with the
# actual income labels
tree_predictions <- predict(tree_model, newdata = test_data, type = "class")
conf_matrix_tree <- confusionMatrix(
  data = tree_predictions,
  reference = test_data$income
)
print(conf_matrix_tree)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K >50K
##      <=50K  7030 1134
##      >50K    386 1218
##                                          
##                Accuracy : 0.8444         
##                  95% CI : (0.837, 0.8515)
##     No Information Rate : 0.7592         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5225         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9480         
##             Specificity : 0.5179         
##          Pos Pred Value : 0.8611         
##          Neg Pred Value : 0.7594         
##              Prevalence : 0.7592         
##          Detection Rate : 0.7197         
##    Detection Prevalence : 0.8358         
##       Balanced Accuracy : 0.7329         
##                                          
##        'Positive' Class : <=50K          
## 
# Fit a 100-tree random forest on the training split, with importance = TRUE
# set so that importance information is available for plotting later
rf_model <- randomForest(
  income ~ .,
  data = train_data,
  ntree = 100,
  importance = TRUE
)

# Predict classes for the held-out test split
rf_predictions <- predict(rf_model, newdata = test_data)

# Compare the predictions with the actual income labels
conf_matrix_rf <- confusionMatrix(
  data = rf_predictions,
  reference = test_data$income
)
print(conf_matrix_rf)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K >50K
##      <=50K  6910  860
##      >50K    506 1492
##                                          
##                Accuracy : 0.8602         
##                  95% CI : (0.8531, 0.867)
##     No Information Rate : 0.7592         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5968         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9318         
##             Specificity : 0.6344         
##          Pos Pred Value : 0.8893         
##          Neg Pred Value : 0.7467         
##              Prevalence : 0.7592         
##          Detection Rate : 0.7074         
##    Detection Prevalence : 0.7955         
##       Balanced Accuracy : 0.7831         
##                                          
##        'Positive' Class : <=50K          
## 
# Draw the variable-importance plot for the fitted random forest
varImpPlot(
  rf_model,
  main = "Random Forest - Variable Importance"
)

Including Plots

You can also embed plots, for example:

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.