Import Data and Libraries

data <- read.csv("/Users/samwinkeler/Documents/USM/BAN 340/provided dataset.csv")
data_km <- data
data_tr <- data
library(class)
library(nnet)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
str(data)
## 'data.frame':    16281 obs. of  15 variables:
##  $ age           : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass     : chr  " Private" " Private" " Local-gov" " Private" ...
##  $ fnlwgt        : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ education     : chr  " 11th" " HS-grad" " Assoc-acdm" " Some-college" ...
##  $ education.num : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital.status: chr  " Never-married" " Married-civ-spouse" " Married-civ-spouse" " Married-civ-spouse" ...
##  $ occupation    : chr  " Machine-op-inspct" " Farming-fishing" " Protective-serv" " Machine-op-inspct" ...
##  $ relationship  : chr  " Own-child" " Husband" " Husband" " Husband" ...
##  $ race          : chr  " Black" " White" " White" " Black" ...
##  $ sex           : chr  " Male" " Male" " Male" " Male" ...
##  $ capital.gain  : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native.country: chr  " United-States" " United-States" " United-States" " United-States" ...
##  $ income.class  : chr  " <=50K." " <=50K." " >50K." " >50K." ...

Turn Characters into Factors

data$workclass <- as.factor(data$workclass)
data$education <- as.factor(data$education)
data$marital.status <- as.factor(data$marital.status)
data$occupation <- as.factor(data$occupation)
data$relationship <- as.factor(data$relationship)
data$race <- as.factor(data$race)
data$sex <- as.factor(data$sex)
data$income.class <- as.factor(data$income.class)
data$native.country <- as.factor(data$native.country)
str(data)
## 'data.frame':    16281 obs. of  15 variables:
##  $ age           : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass     : Factor w/ 9 levels " ?"," Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
##  $ fnlwgt        : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ education     : Factor w/ 16 levels " 10th"," 11th",..: 2 12 8 16 16 1 12 15 16 6 ...
##  $ education.num : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital.status: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
##  $ occupation    : Factor w/ 15 levels " ?"," Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
##  $ relationship  : Factor w/ 6 levels " Husband"," Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ race          : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ capital.gain  : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native.country: Factor w/ 41 levels " ?"," Cambodia",..: 39 39 39 39 39 39 39 39 39 39 ...
##  $ income.class  : Factor w/ 2 levels " <=50K."," >50K.": 1 1 2 2 1 1 1 2 1 1 ...

Normalize Numerical Data

for (i in c(1,3,5,11:13)){
  mincount <- min(data[,i])
  maxcount <- max(data[,i])
  data[,i] <- (data[,i]-mincount)/(maxcount-mincount)
}
str(data)
## 'data.frame':    16281 obs. of  15 variables:
##  $ age           : num  0.1096 0.2877 0.1507 0.3699 0.0137 ...
##  $ workclass     : Factor w/ 9 levels " ?"," Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
##  $ fnlwgt        : num  0.1444 0.0517 0.219 0.0994 0.0609 ...
##  $ education     : Factor w/ 16 levels " 10th"," 11th",..: 2 12 8 16 16 1 12 15 16 6 ...
##  $ education.num : num  0.4 0.533 0.733 0.6 0.6 ...
##  $ marital.status: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
##  $ occupation    : Factor w/ 15 levels " ?"," Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
##  $ relationship  : Factor w/ 6 levels " Husband"," Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ race          : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ capital.gain  : num  0 0 0 0.0769 0 ...
##  $ capital.loss  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: num  0.398 0.5 0.398 0.398 0.296 ...
##  $ native.country: Factor w/ 41 levels " ?"," Cambodia",..: 39 39 39 39 39 39 39 39 39 39 ...
##  $ income.class  : Factor w/ 2 levels " <=50K."," >50K.": 1 1 2 2 1 1 1 2 1 1 ...

Create Test and Train Data Sets

ran <- sample(16281,16281*.75)
test <- data[-ran,]
train <- data[ran,]

Create and Assess Neural Network

mynn <- nnet(income.class ~ ., data = train, size = 10, MaxNWts = 1750, maxit = 200)
## # weights:  1011
## initial  value 8720.134294 
## iter  10 value 4735.259488
## iter  20 value 4057.020285
## iter  30 value 3865.745570
## iter  40 value 3672.241106
## iter  50 value 3568.673556
## iter  60 value 3495.784589
## iter  70 value 3429.017109
## iter  80 value 3356.207528
## iter  90 value 3306.930169
## iter 100 value 3275.328863
## iter 110 value 3238.900207
## iter 120 value 3203.512117
## iter 130 value 3168.428754
## iter 140 value 3138.419060
## iter 150 value 3115.915221
## iter 160 value 3098.932957
## iter 170 value 3086.158564
## iter 180 value 3073.753560
## iter 190 value 3058.356522
## iter 200 value 3049.867392
## final  value 3049.867392 
## stopped after 200 iterations
estIclass <- predict(mynn, test, type = "class")
nntable <- table(test$income.class, estIclass)
accuracy_nn <- sum(diag(nntable)) / sum(nntable)
print(nntable)
##          estIclass
##            <=50K.  >50K.
##    <=50K.    2823    251
##    >50K.      379    618
print(paste("Neural Network Accuracy:", round(accuracy_nn, 3)))
## [1] "Neural Network Accuracy: 0.845"

Create and Display Kmeans Clusters

k <- 3
km <- kmeans(data[, c(1, 3, 5, 11:13)], centers = k, nstart = 10)
data_km$cluster <- as.factor(km$cluster)
ggplot(data_km, aes(x = age, y = hours.per.week, color = cluster)) + 
  geom_point(size = 1.25) + 
  labs(title = "K-means Clustering", x = "Age", y = "Hours Per Week") + 
  theme_gray()

Create and Display Decision Tree

library(rpart.plot)
tree <- rpart(income.class ~ age + education.num + capital.gain + capital.loss, data = train, method = "class")
tree
## n= 12210 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 12210 2849  <=50K. (0.7666667 0.2333333)  
##    2) capital.gain< 0.05119051 11665 2336  <=50K. (0.7997428 0.2002572)  
##      4) education.num< 0.7666667 8956 1237  <=50K. (0.8618803 0.1381197) *
##      5) education.num>=0.7666667 2709 1099  <=50K. (0.5943152 0.4056848)  
##       10) age< 0.1849315 676   96  <=50K. (0.8579882 0.1420118) *
##       11) age>=0.1849315 2033 1003  <=50K. (0.5066404 0.4933596)  
##         22) capital.loss< 0.4990716 1892  878  <=50K. (0.5359408 0.4640592)  
##           44) education.num< 0.9 1700  757  <=50K. (0.5547059 0.4452941) *
##           45) education.num>=0.9 192   71  >50K. (0.3697917 0.6302083) *
##         23) capital.loss>=0.4990716 141   16  >50K. (0.1134752 0.8865248) *
##    3) capital.gain>=0.05119051 545   32  >50K. (0.0587156 0.9412844) *
pred_tree <- predict(tree, test, type = "class")
pred_results <- table(test$income.class, pred_tree)
accuracy_tree <- sum(diag(pred_results)) / sum(pred_results)
print(paste("Decision Tree Accuracy:", round(accuracy_tree, 3)))
## [1] "Decision Tree Accuracy: 0.82"
rpart.plot(tree, main = "Decision Tree")