Package
library(caret)
library(neuralnet)
library(NeuralNetTools)
library(NeuralSens)
library(dplyr) #manipulasi data
library(readr) #read csv
library(VIM) #imputasi KNN
library(imbalance) #handling imbalance
library(caTools)
Load Data
# Read the raw income-evaluation CSV into a tibble.
raw <- read_csv(file = "D://1. DATA MINING//income_evaluation.csv")
## Rows: 32561 Columns: 15
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): workclass, education, marital-status, occupation, relationship, rac...
## dbl (6): age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the column types and the leading values of every column.
str(object = raw)
## spc_tbl_ [32,561 x 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:32561] 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : chr [1:32561] "State-gov" "Self-emp-not-inc" "Private" "Private" ...
## $ fnlwgt : num [1:32561] 77516 83311 215646 234721 338409 ...
## $ education : chr [1:32561] "Bachelors" "Bachelors" "HS-grad" "11th" ...
## $ education-num : num [1:32561] 13 13 9 7 13 14 5 9 14 13 ...
## $ marital-status: chr [1:32561] "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
## $ occupation : chr [1:32561] "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
## $ relationship : chr [1:32561] "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ race : chr [1:32561] "White" "White" "White" "Black" ...
## $ sex : chr [1:32561] "Male" "Male" "Male" "Male" ...
## $ capital-gain : num [1:32561] 2174 0 0 0 0 ...
## $ capital-loss : num [1:32561] 0 0 0 0 0 0 0 0 0 0 ...
## $ hours-per-week: num [1:32561] 40 13 40 40 40 40 16 45 50 40 ...
## $ native-country: chr [1:32561] "United-States" "United-States" "United-States" "United-States" ...
## $ income : chr [1:32561] "<=50K" "<=50K" "<=50K" "<=50K" ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. workclass = col_character(),
## .. fnlwgt = col_double(),
## .. education = col_character(),
## .. `education-num` = col_double(),
## .. `marital-status` = col_character(),
## .. occupation = col_character(),
## .. relationship = col_character(),
## .. race = col_character(),
## .. sex = col_character(),
## .. `capital-gain` = col_double(),
## .. `capital-loss` = col_double(),
## .. `hours-per-week` = col_double(),
## .. `native-country` = col_character(),
## .. income = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Data Preparation
# Drop the final-weight column by name rather than by position, so this step
# keeps working if the column order of the CSV changes. data.frame() also
# turns the hyphenated column names into syntactic ones (e.g. education.num).
data <- data.frame(raw[, names(raw) != "fnlwgt"])
# Recode the "?" placeholder as NA.
data[data == "?"] <- NA
# Total number of missing cells after recoding.
sum(is.na(data))
## [1] 4262
# Drop the textual education column: education.num already holds the same
# information as an ordinal code, so it takes over the name "education".
data2 <- data %>%
  select(-education) %>%
  rename(education = education.num)

# KNN imputation (k = 5). VIM::kNN breaks ties between equally frequent
# categories at random, so fix the seed to make the imputed values
# reproducible from run to run.
set.seed(999)
data3 <- kNN(data2, k = 5, imp_var = FALSE)
# Number of missing cells left after imputation.
sum(is.na(data3))
## [1] 0
# Treat the education code as a factor, then convert every remaining
# character column to a factor as well.
data3$education <- as.factor(data3$education)
data3 <- as.data.frame(unclass(data3), stringsAsFactors = TRUE)

# Replace each categorical column by its integer level code. lapply() with
# as.integer() yields one plain integer vector per column and avoids
# sapply()'s type-unstable simplification to a matrix.
categorical_cols <- c(
  "workclass", "education", "marital.status", "occupation", "relationship",
  "race", "sex", "native.country", "income"
)
data4 <- data3
data4[categorical_cols] <- lapply(data3[categorical_cols], as.integer)
str(data4)
## 'data.frame': 32561 obs. of 13 variables:
## $ age : num 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : int 7 6 4 4 4 4 4 6 4 4 ...
## $ education : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital.status: int 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : int 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : int 2 1 2 1 6 6 2 1 2 1 ...
## $ race : int 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : int 2 2 2 2 1 1 1 2 1 2 ...
## $ capital.gain : num 2174 0 0 0 0 ...
## $ capital.loss : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: num 40 13 40 40 40 40 16 45 50 40 ...
## $ native.country: int 39 39 39 39 5 39 23 39 39 39 ...
## $ income : int 1 1 1 1 1 1 1 2 2 2 ...
# Handle class imbalance.
# Imbalance ratio of the income classes before resampling.
imbalanceRatio(data4, classAttr = "income")
## [1] 0.3171926
# Oversample with SMOTE using a target ratio of 0.8.
# NOTE(review): this runs before the train/test split and before the
# cross-validation further down, so synthetic rows can land on both sides
# of a split.
set.seed(999)
data5 <- oversample(
  data4,
  classAttr = "income",
  method = "SMOTE",
  ratio = 0.8
)
# Imbalance ratio after oversampling.
imbalanceRatio(data5, classAttr = "income")
## [1] 0.8
Split Data
Cara 1
# Fix the seed so the random split is reproducible.
set.seed(123)
# Logical mask over the rows of data5: TRUE marks rows that go to the
# training set (split ratio 0.8), FALSE marks the hold-out rows.
sample <- sample.split(data5$income, SplitRatio = 0.8)
data.trn <- data5[sample, , drop = FALSE]
data.tst <- data5[!sample, , drop = FALSE]
Cara 2
set.seed(999)
# Resampling scheme for caret: 10-fold cross-validation, repeated once,
# scored with caret's multi-class summary function.
evaluationSetting <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 1,
  summaryFunction = multiClassSummary
)
# Metric used to select the best tuning-parameter combination.
metric <- "Accuracy"
Model
Cara 1
# Fit a network with a single hidden unit on the training set; income
# (coded 1/2) is used directly as a numeric response.
modelnn <- neuralnet(formula = income ~ ., data = data.trn, hidden = 1)
plot(modelnn, rep = "best")

# Predict on the hold-out set and cross-tabulate "actual income is class 1"
# (rows) against "prediction falls below the 1.5 cut-off" (columns).
pred <- predict(modelnn, newdata = data.tst)
table(data.tst$income == "1", pred[, 1] < 1.5)
##
## FALSE TRUE
## FALSE 2291 1664
## TRUE 807 4137
# Many observations are misclassified.
Cara 2
# Tune an nnet classifier with caret on the oversampled data, using the
# resampling setup (evaluationSetting) and selection metric (metric) defined
# earlier in the script. trace = FALSE silences nnet's fitting log.
nn_Model <- caret::train(
  factor(income) ~ .,
  data = data5,
  method = "nnet",
  metric = metric,
  trControl = evaluationSetting,
  trace = FALSE
)
# Draw the architecture of the selected network.
plotnet(nn_Model$finalModel)

# Show the resampling results for every size/decay combination tried.
print(nn_Model)
## Neural Network
##
## 44496 samples
## 12 predictor
## 2 classes: '1', '2'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 40046, 40046, 40047, 40046, 40047, 40046, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa F1 Sensitivity Specificity
## 1 0e+00 0.6438138 0.2232320 0.7474657 0.9342638 0.2807618
## 1 1e-04 0.6664416 0.2746970 0.7598019 0.9283981 0.3389910
## 1 1e-01 0.6904446 0.3327855 0.7730362 0.9357201 0.3838547
## 3 0e+00 0.7780926 0.5574346 0.7850118 0.7472492 0.8166514
## 3 1e-04 0.7664759 0.5171592 0.8022363 0.8388754 0.6759884
## 3 1e-01 0.7666309 0.5279679 0.7836867 0.7756877 0.7553225
## 5 0e+00 0.7790133 0.5561377 0.7929209 0.7703074 0.7899017
## 5 1e-04 0.7594841 0.5168527 0.7704982 0.7470874 0.7749942
## 5 1e-01 0.7677757 0.5370639 0.7754089 0.7332120 0.8109841
## Pos_Pred_Value Neg_Pred_Value Precision Recall Detection_Rate
## 0.6353469 0.8011741 0.6353469 0.9342638 0.5190351
## 0.6622817 0.8599289 0.6622817 0.9283981 0.5157776
## 0.6691255 0.8772121 0.6691255 0.9357201 0.5198439
## 0.8425664 0.7319881 0.8425664 0.7472492 0.4151377
## 0.7815381 0.7837025 0.7815381 0.8388754 0.4660409
## 0.8167191 0.7545395 0.8167191 0.7756877 0.4309353
## 0.8263341 0.7416072 0.8263341 0.7703074 0.4279471
## 0.8251586 0.7367649 0.8251586 0.7470874 0.4150458
## 0.8335037 0.7164434 0.8335037 0.7332120 0.4073386
## Balanced_Accuracy
## 0.6075128
## 0.6336945
## 0.6597874
## 0.7819503
## 0.7574319
## 0.7655051
## 0.7801046
## 0.7610408
## 0.7720980
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.