This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Install required packages, but only the ones that are not installed yet.
# Reinstalling a package that is already present (and possibly loaded) on every
# knit is slow, and on Windows it can fail with "Permission denied" warnings
# because the package's DLL is locked by the running session.
required_packages <- c(
  "rpart", "rpart.plot", "caret", "readr", "dplyr", "randomForest"
)

# requireNamespace() returns TRUE/FALSE instead of erroring, so it doubles as
# an "is this package available?" check.
is_installed <- vapply(
  required_packages, requireNamespace, logical(1), quietly = TRUE
)
missing_packages <- required_packages[!is_installed]

if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = "https://cran.rstudio.com/")
}
# Load libraries
library(rpart)
## Warning: package 'rpart' was built under R version 4.4.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Load the dataset.
# The path defaults to the original location but can be overridden with the
# INCOME_CSV environment variable, so the document can be knitted on another
# machine without editing the code.
income_path <- Sys.getenv(
  "INCOME_CSV",
  unset = "C:/Users/otuata4438/Downloads/income.csv"
)

# Fail early with a clear message instead of a reader error further down.
if (!file.exists(income_path)) {
  stop("Income data file not found: ", income_path, call. = FALSE)
}

income <- read_csv(income_path)
## Rows: 32561 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): workclass, education, marital-status, occupation, relationship, rac...
## dbl (6): age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fix column names to be R-safe (e.g., education-num -> education.num)
names(income) <- make.names(names(income))

# Convert the target variable to a factor so it is handled as a class label
income$income <- as.factor(income$income)

# Convert every remaining character column to a factor. vapply() always
# returns a logical mask, whereas sapply()'s return type depends on its input.
char_cols <- vapply(income, is.character, logical(1))
income[char_cols] <- lapply(income[char_cols], as.factor)

# Remove rows containing missing values.
# NOTE(review): na.omit() only drops rows that hold a real NA. If the CSV marks
# missing entries with a placeholder string (e.g. "?"), those rows are kept and
# the placeholder becomes an ordinary factor level -- confirm against the data,
# and if so pass na = c("", "NA", "?") to read_csv().
income <- na.omit(income)
# Partition the rows: 70% for training, the remaining 30% held out for testing.
# The fixed seed makes the random split repeatable between runs.
set.seed(123)
train_index <- createDataPartition(
  y = income[["income"]],
  p = 0.7,
  list = FALSE
)

# Negative indexing keeps exactly the rows that were not sampled for training.
test_data <- income[-train_index, ]
train_data <- income[train_index, ]
# Fit a classification tree that predicts income from all other columns
tree_model <- rpart(
  formula = income ~ .,
  data = train_data,
  method = "class"
)

# Draw the fitted tree
rpart.plot(
  tree_model,
  main = "Decision Tree for Income Prediction",
  type = 2,
  extra = 104,
  fallen.leaves = TRUE
)

# Score the held-out rows and compare predicted classes with the actual ones
tree_predictions <- predict(tree_model, newdata = test_data, type = "class")
conf_matrix_tree <- confusionMatrix(
  data = tree_predictions,
  reference = test_data[["income"]]
)
print(conf_matrix_tree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 7030 1134
## >50K 386 1218
##
## Accuracy : 0.8444
## 95% CI : (0.837, 0.8515)
## No Information Rate : 0.7592
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5225
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9480
## Specificity : 0.5179
## Pos Pred Value : 0.8611
## Neg Pred Value : 0.7594
## Prevalence : 0.7592
## Detection Rate : 0.7197
## Detection Prevalence : 0.8358
## Balanced Accuracy : 0.7329
##
## 'Positive' Class : <=50K
##
# Fit a 100-tree random forest on the training rows; importance = TRUE keeps
# the variable-importance measures for later inspection
rf_model <- randomForest(
  income ~ .,
  data = train_data,
  ntree = 100,
  importance = TRUE
)

# Classify the held-out rows
rf_predictions <- predict(rf_model, newdata = test_data)

# Compare the predicted classes with the actual ones and report the result
conf_matrix_rf <- confusionMatrix(
  data = rf_predictions,
  reference = test_data[["income"]]
)
print(conf_matrix_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 6910 860
## >50K 506 1492
##
## Accuracy : 0.8602
## 95% CI : (0.8531, 0.867)
## No Information Rate : 0.7592
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5968
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9318
## Specificity : 0.6344
## Pos Pred Value : 0.8893
## Neg Pred Value : 0.7467
## Prevalence : 0.7592
## Detection Rate : 0.7074
## Detection Prevalence : 0.7955
## Balanced Accuracy : 0.7831
##
## 'Positive' Class : <=50K
##
# Chart the importance measures stored in the fitted random forest
varImpPlot(
  rf_model,
  main = "Random Forest - Variable Importance"
)
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.