Packages

library(caret)          # model training and cross-validation
library(neuralnet)      # neural network fitting
library(NeuralNetTools) # network visualisation (plotnet)
library(NeuralSens)     # sensitivity analysis for neural networks
library(dplyr)          # data manipulation
library(readr)          # reading CSV files
library(VIM)            # KNN imputation
library(imbalance)      # handling class imbalance
library(caTools)        # sample.split() for train/test splitting

Load Data

raw <- read_csv("D:/1. DATA MINING/income_evaluation.csv")
## Rows: 32561 Columns: 15
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): workclass, education, marital-status, occupation, relationship, rac...
## dbl (6): age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(raw)
## spc_tbl_ [32,561 x 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age           : num [1:32561] 39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : chr [1:32561] "State-gov" "Self-emp-not-inc" "Private" "Private" ...
##  $ fnlwgt        : num [1:32561] 77516 83311 215646 234721 338409 ...
##  $ education     : chr [1:32561] "Bachelors" "Bachelors" "HS-grad" "11th" ...
##  $ education-num : num [1:32561] 13 13 9 7 13 14 5 9 14 13 ...
##  $ marital-status: chr [1:32561] "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
##  $ occupation    : chr [1:32561] "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
##  $ relationship  : chr [1:32561] "Not-in-family" "Husband" "Not-in-family" "Husband" ...
##  $ race          : chr [1:32561] "White" "White" "White" "Black" ...
##  $ sex           : chr [1:32561] "Male" "Male" "Male" "Male" ...
##  $ capital-gain  : num [1:32561] 2174 0 0 0 0 ...
##  $ capital-loss  : num [1:32561] 0 0 0 0 0 0 0 0 0 0 ...
##  $ hours-per-week: num [1:32561] 40 13 40 40 40 40 16 45 50 40 ...
##  $ native-country: chr [1:32561] "United-States" "United-States" "United-States" "United-States" ...
##  $ income        : chr [1:32561] "<=50K" "<=50K" "<=50K" "<=50K" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   workclass = col_character(),
##   ..   fnlwgt = col_double(),
##   ..   education = col_character(),
##   ..   `education-num` = col_double(),
##   ..   `marital-status` = col_character(),
##   ..   occupation = col_character(),
##   ..   relationship = col_character(),
##   ..   race = col_character(),
##   ..   sex = col_character(),
##   ..   `capital-gain` = col_double(),
##   ..   `capital-loss` = col_double(),
##   ..   `hours-per-week` = col_double(),
##   ..   `native-country` = col_character(),
##   ..   income = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Data Preparation

data = data.frame(raw[,-3]) # drop the fnlwgt (final weight) variable
data = replace(data, data == "?", NA) # recode the stray "?" entries as NA
sum(is.na(data))
## [1] 4262
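The 4,262 NAs all come from the categorical columns that contained "?" (typically workclass, occupation, and native.country in this dataset). As an optional sketch, a per-column check makes this visible:

# optional check: see which columns hold the missing values
colSums(is.na(data))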
# drop education, since education.num already carries the same information as an ordinal variable
data2 = select(data, -education) %>% rename(education = education.num)
# impute the missing values with KNN (k = 5)
data3 = kNN(data2, k = 5, imp_var = FALSE)
sum(is.na(data3))
## [1] 0
data3$education = as.factor(data3$education)
# convert every remaining character column to a factor
data3 = as.data.frame(unclass(data3), stringsAsFactors = TRUE)
# then replace each factor with its integer level codes for the neural network
data4 = data3
catCols = c("workclass", "education", "marital.status", "occupation",
            "relationship", "race", "sex", "native.country", "income")
data4[, catCols] = sapply(data3[, catCols], unclass)
str(data4)
## 'data.frame':    32561 obs. of  13 variables:
##  $ age           : num  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : int  7 6 4 4 4 4 4 6 4 4 ...
##  $ education     : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital.status: int  5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation    : int  1 4 6 6 10 4 8 4 10 4 ...
##  $ relationship  : int  2 1 2 1 6 6 2 1 2 1 ...
##  $ race          : int  5 5 5 3 3 5 3 5 5 5 ...
##  $ sex           : int  2 2 2 2 1 1 1 2 1 2 ...
##  $ capital.gain  : num  2174 0 0 0 0 ...
##  $ capital.loss  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: num  40 13 40 40 40 40 16 45 50 40 ...
##  $ native.country: int  39 39 39 39 5 39 23 39 39 39 ...
##  $ income        : int  1 1 1 1 1 1 1 2 2 2 ...
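Since `unclass()` keeps only the integer level codes, the original labels remain recoverable from the factor version in `data3`; a small sketch for looking them up:

# map the integer codes back to their labels, e.g.:
levels(data3$workclass) # code 7 above = the 7th level of workclass
levels(data3$income)    # income: 1 = "<=50K", 2 = ">50K"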
# check and handle the class imbalance
imbalanceRatio(data4, classAttr = "income")
## [1] 0.3171926
set.seed(999)
data5 <- oversample(data4, ratio = 0.8, method = "SMOTE", classAttr = "income")
imbalanceRatio(data5, classAttr = "income")
## [1] 0.8
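`oversample()` with `method = "SMOTE"` synthesizes new minority-class rows (income code 2, i.e. >50K) until the minority/majority ratio reaches 0.8: the 24,720 majority rows stay fixed and the minority grows to 0.8 × 24,720 = 19,776, which matches the 44,496 samples caret reports below. The counts can be checked with:

# sanity check: class counts before and after SMOTE
table(data4$income) # 24720 vs 7841
table(data5$income) # 24720 vs 19776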

Split Data

Method 1

set.seed(123)
# create a random split, stratified on income (80% train, 20% test)
sample <- sample.split(data5$income, SplitRatio = 0.8)
data.trn <- subset(data5, sample == TRUE)
data.tst <- subset(data5, sample == FALSE)
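`sample.split()` stratifies on the label, so both partitions should keep the 0.8 class ratio; a quick check:

# verify the class ratio is preserved in both partitions
prop.table(table(data.trn$income))
prop.table(table(data.tst$income))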

Method 2

set.seed(999)

evaluationSetting <- trainControl(method='repeatedcv', 
                                  number=10, 
                                  repeats=1,
                                  summaryFunction = multiClassSummary)

metric <- "Accuracy"
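With `method = "nnet"`, `train()` tunes over caret's default grid for nnet, which here is size ∈ {1, 3, 5} and decay ∈ {0, 1e-4, 0.1} (exactly the combinations in the resampling output below). To make the grid explicit or extend it, a `tuneGrid` could be supplied; a minimal sketch:

# optional: an explicit tuning grid mirroring the default one used below;
# pass it to train() via tuneGrid = nnetGrid
nnetGrid <- expand.grid(size = c(1, 3, 5), decay = c(0, 1e-4, 1e-1))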

Model

Method 1

modelnn = neuralnet(income ~ ., data.trn, hidden = 1)
plot(modelnn, rep = "best")

pred <- predict(modelnn, data.tst)
# income is coded 1/2 here, so threshold the raw network output at 1.5:
# rows = actual class is 1 (<=50K), columns = predicted value < 1.5 (i.e. predicted class 1)
table(data.tst$income == "1", pred[, 1] < 1.5)
##        
##         FALSE TRUE
##   FALSE  2291 1664
##   TRUE    807 4137
# quite a lot of misclassifications
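The correctly classified cases are the (FALSE, FALSE) and (TRUE, TRUE) cells, so the raw accuracy works out to about 72%:

# accuracy from the table above: correct / total
(2291 + 4137) / (2291 + 1664 + 807 + 4137) # = 6428 / 8899, about 0.722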

Method 2

nn_Model <- caret::train(factor(income) ~ .,
                         data = data5, # note: fitted on the full oversampled data; 10-fold CV supplies the held-out estimates
                         method = "nnet",
                         metric = metric,
                         trControl = evaluationSetting,
                         trace = FALSE)
plotnet(nn_Model$finalModel)

print(nn_Model)
## Neural Network 
## 
## 44496 samples
##    12 predictor
##     2 classes: '1', '2' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 40046, 40046, 40047, 40046, 40047, 40046, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  Accuracy   Kappa      F1         Sensitivity  Specificity
##   1     0e+00  0.6438138  0.2232320  0.7474657  0.9342638    0.2807618  
##   1     1e-04  0.6664416  0.2746970  0.7598019  0.9283981    0.3389910  
##   1     1e-01  0.6904446  0.3327855  0.7730362  0.9357201    0.3838547  
##   3     0e+00  0.7780926  0.5574346  0.7850118  0.7472492    0.8166514  
##   3     1e-04  0.7664759  0.5171592  0.8022363  0.8388754    0.6759884  
##   3     1e-01  0.7666309  0.5279679  0.7836867  0.7756877    0.7553225  
##   5     0e+00  0.7790133  0.5561377  0.7929209  0.7703074    0.7899017  
##   5     1e-04  0.7594841  0.5168527  0.7704982  0.7470874    0.7749942  
##   5     1e-01  0.7677757  0.5370639  0.7754089  0.7332120    0.8109841  
##   Pos_Pred_Value  Neg_Pred_Value  Precision  Recall     Detection_Rate
##   0.6353469       0.8011741       0.6353469  0.9342638  0.5190351     
##   0.6622817       0.8599289       0.6622817  0.9283981  0.5157776     
##   0.6691255       0.8772121       0.6691255  0.9357201  0.5198439     
##   0.8425664       0.7319881       0.8425664  0.7472492  0.4151377     
##   0.7815381       0.7837025       0.7815381  0.8388754  0.4660409     
##   0.8167191       0.7545395       0.8167191  0.7756877  0.4309353     
##   0.8263341       0.7416072       0.8263341  0.7703074  0.4279471     
##   0.8251586       0.7367649       0.8251586  0.7470874  0.4150458     
##   0.8335037       0.7164434       0.8335037  0.7332120  0.4073386     
##   Balanced_Accuracy
##   0.6075128        
##   0.6336945        
##   0.6597874        
##   0.7819503        
##   0.7574319        
##   0.7655051        
##   0.7801046        
##   0.7610408        
##   0.7720980        
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.
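To score new data with the selected model (size = 5, decay = 0), `predict()` on the caret object returns class labels directly. A minimal sketch, with the caveat that `nn_Model` was trained on all of `data5`, so the Method 1 test set is not a clean holdout here:

# predict labels with the tuned model and summarize the errors
pred2 <- predict(nn_Model, newdata = data.tst)
caret::confusionMatrix(pred2, factor(data.tst$income))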