Import Data and Libraries

data <- read.csv("/Users/samwinkeler/Documents/USM/BAN 340/provided dataset.csv")
data_km <- data
data_tr <- data
library(class)
library(nnet)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(rpart)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.2.3

str(data)

## 'data.frame':    16281 obs. of  15 variables:
##  $ age           : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass     : chr  " Private" " Private" " Local-gov" " Private" ...
##  $ fnlwgt        : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ education     : chr  " 11th" " HS-grad" " Assoc-acdm" " Some-college" ...
##  $ education.num : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital.status: chr  " Never-married" " Married-civ-spouse" " Married-civ-spouse" " Married-civ-spouse" ...
##  $ occupation    : chr  " Machine-op-inspct" " Farming-fishing" " Protective-serv" " Machine-op-inspct" ...
##  $ relationship  : chr  " Own-child" " Husband" " Husband" " Husband" ...
##  $ race          : chr  " Black" " White" " White" " Black" ...
##  $ sex           : chr  " Male" " Male" " Male" " Male" ...
##  $ capital.gain  : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native.country: chr  " United-States" " United-States" " United-States" " United-States" ...
##  $ income.class  : chr  " <=50K." " <=50K." " >50K." " >50K." ...

Turn Characters into Factors

data$workclass <- as.factor(data$workclass)
data$education <- as.factor(data$education)
data$marital.status <- as.factor(data$marital.status)
data$occupation <- as.factor(data$occupation)
data$relationship <- as.factor(data$relationship)
data$race <- as.factor(data$race)
data$sex <- as.factor(data$sex)
data$income.class <- as.factor(data$income.class)
data$native.country <- as.factor(data$native.country)
str(data)

## 'data.frame':    16281 obs. of  15 variables:
##  $ age           : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass     : Factor w/ 9 levels " ?"," Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
##  $ fnlwgt        : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ education     : Factor w/ 16 levels " 10th"," 11th",..: 2 12 8 16 16 1 12 15 16 6 ...
##  $ education.num : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital.status: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
##  $ occupation    : Factor w/ 15 levels " ?"," Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
##  $ relationship  : Factor w/ 6 levels " Husband"," Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ race          : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ capital.gain  : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native.country: Factor w/ 41 levels " ?"," Cambodia",..: 39 39 39 39 39 39 39 39 39 39 ...
##  $ income.class  : Factor w/ 2 levels " <=50K."," >50K.": 1 1 2 2 1 1 1 2 1 1 ...

Normalize Numerical Data

for (i in c(1,3,5,11:13)){
  mincount <- min(data[,i])
  maxcount <- max(data[,i])
  data[,i] <- (data[,i]-mincount)/(maxcount-mincount)
}
str(data)

## 'data.frame':    16281 obs. of  15 variables:
##  $ age           : num  0.1096 0.2877 0.1507 0.3699 0.0137 ...
##  $ workclass     : Factor w/ 9 levels " ?"," Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
##  $ fnlwgt        : num  0.1444 0.0517 0.219 0.0994 0.0609 ...
##  $ education     : Factor w/ 16 levels " 10th"," 11th",..: 2 12 8 16 16 1 12 15 16 6 ...
##  $ education.num : num  0.4 0.533 0.733 0.6 0.6 ...
##  $ marital.status: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
##  $ occupation    : Factor w/ 15 levels " ?"," Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
##  $ relationship  : Factor w/ 6 levels " Husband"," Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ race          : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ capital.gain  : num  0 0 0 0.0769 0 ...
##  $ capital.loss  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: num  0.398 0.5 0.398 0.398 0.296 ...
##  $ native.country: Factor w/ 41 levels " ?"," Cambodia",..: 39 39 39 39 39 39 39 39 39 39 ...
##  $ income.class  : Factor w/ 2 levels " <=50K."," >50K.": 1 1 2 2 1 1 1 2 1 1 ...

Create Test and Train Data Sets

ran <- sample(16281,16281*.75)
test <- data[-ran,]
train <- data[ran,]

Create and Assess Neural Network

mynn <- nnet(income.class ~ ., data = train, size = 10, MaxNWts = 1750, maxit = 200)

## # weights:  1011
## initial  value 8720.134294 
## iter  10 value 4735.259488
## iter  20 value 4057.020285
## iter  30 value 3865.745570
## iter  40 value 3672.241106
## iter  50 value 3568.673556
## iter  60 value 3495.784589
## iter  70 value 3429.017109
## iter  80 value 3356.207528
## iter  90 value 3306.930169
## iter 100 value 3275.328863
## iter 110 value 3238.900207
## iter 120 value 3203.512117
## iter 130 value 3168.428754
## iter 140 value 3138.419060
## iter 150 value 3115.915221
## iter 160 value 3098.932957
## iter 170 value 3086.158564
## iter 180 value 3073.753560
## iter 190 value 3058.356522
## iter 200 value 3049.867392
## final  value 3049.867392 
## stopped after 200 iterations

estIclass <- predict(mynn, test, type = "class")
nntable <- table(test$income.class, estIclass)
accuracy_nn <- sum(diag(nntable)) / sum(nntable)
print(nntable)

##          estIclass
##            <=50K.  >50K.
##    <=50K.    2823    251
##    >50K.      379    618

print(paste("Neural Network Accuracy:", round(accuracy_nn, 3)))

## [1] "Neural Network Accuracy: 0.845"

Create and Display Kmeans Clusters

k <- 3
km <- kmeans(data[, c(1, 3, 5, 11:13)], centers = k, nstart = 10)
data_km$cluster <- as.factor(km$cluster)
ggplot(data_km, aes(x = age, y = hours.per.week, color = cluster)) + 
  geom_point(size = 1.25) + 
  labs(title = "K-means Clustering", x = "Age", y = "Hours Per Week") + 
  theme_gray()

Create and Display Decision Tree

library(rpart.plot)
tree <- rpart(income.class ~ age + education.num + capital.gain + capital.loss, data = train, method = "class")
tree

## n= 12210 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 12210 2849  <=50K. (0.7666667 0.2333333)  
##    2) capital.gain< 0.05119051 11665 2336  <=50K. (0.7997428 0.2002572)  
##      4) education.num< 0.7666667 8956 1237  <=50K. (0.8618803 0.1381197) *
##      5) education.num>=0.7666667 2709 1099  <=50K. (0.5943152 0.4056848)  
##       10) age< 0.1849315 676   96  <=50K. (0.8579882 0.1420118) *
##       11) age>=0.1849315 2033 1003  <=50K. (0.5066404 0.4933596)  
##         22) capital.loss< 0.4990716 1892  878  <=50K. (0.5359408 0.4640592)  
##           44) education.num< 0.9 1700  757  <=50K. (0.5547059 0.4452941) *
##           45) education.num>=0.9 192   71  >50K. (0.3697917 0.6302083) *
##         23) capital.loss>=0.4990716 141   16  >50K. (0.1134752 0.8865248) *
##    3) capital.gain>=0.05119051 545   32  >50K. (0.0587156 0.9412844) *

pred_tree <- predict(tree, test, type = "class")
pred_results <- table(test$income.class, pred_tree)
accuracy_tree <- sum(diag(pred_results)) / sum(pred_results)
print(paste("Decision Tree Accuracy:", round(accuracy_tree, 3)))

## [1] "Decision Tree Accuracy: 0.82"

rpart.plot(tree, main = "Decision Tree")

BAN340 - Final Project

Sam Winkeler, Abdilraouf Mohamed, Azam Jiwa, Charles Junior Shamukiga

2024-12-15

Import Data and Libraries

Turn Characters into Factors

Normalize Numerical Data

Create Test and Train Data Sets

Create and Assess Neural Network

Create and Display Kmeans Clusters

Create and Display Decision Tree