Import Data and Libraries
data <- read.csv("/Users/samwinkeler/Documents/USM/BAN 340/provided dataset.csv")
data_km <- data
data_tr <- data
library(class)
library(nnet)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
str(data)
## 'data.frame': 16281 obs. of 15 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : chr " Private" " Private" " Local-gov" " Private" ...
## $ fnlwgt : int 226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
## $ education : chr " 11th" " HS-grad" " Assoc-acdm" " Some-college" ...
## $ education.num : int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital.status: chr " Never-married" " Married-civ-spouse" " Married-civ-spouse" " Married-civ-spouse" ...
## $ occupation : chr " Machine-op-inspct" " Farming-fishing" " Protective-serv" " Machine-op-inspct" ...
## $ relationship : chr " Own-child" " Husband" " Husband" " Husband" ...
## $ race : chr " Black" " White" " White" " Black" ...
## $ sex : chr " Male" " Male" " Male" " Male" ...
## $ capital.gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 50 40 40 30 30 40 32 40 10 ...
## $ native.country: chr " United-States" " United-States" " United-States" " United-States" ...
## $ income.class : chr " <=50K." " <=50K." " >50K." " >50K." ...
Turn Characters into Factors
data$workclass <- as.factor(data$workclass)
data$education <- as.factor(data$education)
data$marital.status <- as.factor(data$marital.status)
data$occupation <- as.factor(data$occupation)
data$relationship <- as.factor(data$relationship)
data$race <- as.factor(data$race)
data$sex <- as.factor(data$sex)
data$income.class <- as.factor(data$income.class)
data$native.country <- as.factor(data$native.country)
str(data)
## 'data.frame': 16281 obs. of 15 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : Factor w/ 9 levels " ?"," Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
## $ fnlwgt : int 226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
## $ education : Factor w/ 16 levels " 10th"," 11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ education.num : int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital.status: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ occupation : Factor w/ 15 levels " ?"," Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
## $ relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
## $ race : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ capital.gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 50 40 40 30 30 40 32 40 10 ...
## $ native.country: Factor w/ 41 levels " ?"," Cambodia",..: 39 39 39 39 39 39 39 39 39 39 ...
## $ income.class : Factor w/ 2 levels " <=50K."," >50K.": 1 1 2 2 1 1 1 2 1 1 ...
Normalize Numerical Data
for (i in c(1,3,5,11:13)){
mincount <- min(data[,i])
maxcount <- max(data[,i])
data[,i] <- (data[,i]-mincount)/(maxcount-mincount)
}
str(data)
## 'data.frame': 16281 obs. of 15 variables:
## $ age : num 0.1096 0.2877 0.1507 0.3699 0.0137 ...
## $ workclass : Factor w/ 9 levels " ?"," Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
## $ fnlwgt : num 0.1444 0.0517 0.219 0.0994 0.0609 ...
## $ education : Factor w/ 16 levels " 10th"," 11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ education.num : num 0.4 0.533 0.733 0.6 0.6 ...
## $ marital.status: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ occupation : Factor w/ 15 levels " ?"," Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
## $ relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
## $ race : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ capital.gain : num 0 0 0 0.0769 0 ...
## $ capital.loss : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: num 0.398 0.5 0.398 0.398 0.296 ...
## $ native.country: Factor w/ 41 levels " ?"," Cambodia",..: 39 39 39 39 39 39 39 39 39 39 ...
## $ income.class : Factor w/ 2 levels " <=50K."," >50K.": 1 1 2 2 1 1 1 2 1 1 ...
Create Test and Train Data Sets
ran <- sample(16281,16281*.75)
test <- data[-ran,]
train <- data[ran,]
Create and Assess Neural Network
mynn <- nnet(income.class ~ ., data = train, size = 10, MaxNWts = 1750, maxit = 200)
## # weights: 1011
## initial value 8720.134294
## iter 10 value 4735.259488
## iter 20 value 4057.020285
## iter 30 value 3865.745570
## iter 40 value 3672.241106
## iter 50 value 3568.673556
## iter 60 value 3495.784589
## iter 70 value 3429.017109
## iter 80 value 3356.207528
## iter 90 value 3306.930169
## iter 100 value 3275.328863
## iter 110 value 3238.900207
## iter 120 value 3203.512117
## iter 130 value 3168.428754
## iter 140 value 3138.419060
## iter 150 value 3115.915221
## iter 160 value 3098.932957
## iter 170 value 3086.158564
## iter 180 value 3073.753560
## iter 190 value 3058.356522
## iter 200 value 3049.867392
## final value 3049.867392
## stopped after 200 iterations
estIclass <- predict(mynn, test, type = "class")
nntable <- table(test$income.class, estIclass)
accuracy_nn <- sum(diag(nntable)) / sum(nntable)
print(nntable)
## estIclass
## <=50K. >50K.
## <=50K. 2823 251
## >50K. 379 618
print(paste("Neural Network Accuracy:", round(accuracy_nn, 3)))
## [1] "Neural Network Accuracy: 0.845"
Create and Display Kmeans Clusters
k <- 3
km <- kmeans(data[, c(1, 3, 5, 11:13)], centers = k, nstart = 10)
data_km$cluster <- as.factor(km$cluster)
ggplot(data_km, aes(x = age, y = hours.per.week, color = cluster)) +
geom_point(size = 1.25) +
labs(title = "K-means Clustering", x = "Age", y = "Hours Per Week") +
theme_gray()

Create and Display Decision Tree
library(rpart.plot)
tree <- rpart(income.class ~ age + education.num + capital.gain + capital.loss, data = train, method = "class")
tree
## n= 12210
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 12210 2849 <=50K. (0.7666667 0.2333333)
## 2) capital.gain< 0.05119051 11665 2336 <=50K. (0.7997428 0.2002572)
## 4) education.num< 0.7666667 8956 1237 <=50K. (0.8618803 0.1381197) *
## 5) education.num>=0.7666667 2709 1099 <=50K. (0.5943152 0.4056848)
## 10) age< 0.1849315 676 96 <=50K. (0.8579882 0.1420118) *
## 11) age>=0.1849315 2033 1003 <=50K. (0.5066404 0.4933596)
## 22) capital.loss< 0.4990716 1892 878 <=50K. (0.5359408 0.4640592)
## 44) education.num< 0.9 1700 757 <=50K. (0.5547059 0.4452941) *
## 45) education.num>=0.9 192 71 >50K. (0.3697917 0.6302083) *
## 23) capital.loss>=0.4990716 141 16 >50K. (0.1134752 0.8865248) *
## 3) capital.gain>=0.05119051 545 32 >50K. (0.0587156 0.9412844) *
pred_tree <- predict(tree, test, type = "class")
pred_results <- table(test$income.class, pred_tree)
accuracy_tree <- sum(diag(pred_results)) / sum(pred_results)
print(paste("Decision Tree Accuracy:", round(accuracy_tree, 3)))
## [1] "Decision Tree Accuracy: 0.82"
rpart.plot(tree, main = "Decision Tree")
