# Predict whether each person in the data set earns <=50K or >50K per year, comparing three models: Logistic Regression, Decision Tree and Random Forest.
suppressWarnings(library(readr))
# Load the Adult income data set into a tibble.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
adult <- read_csv("C:/Users/dannyhuang/Desktop/adult.csv")
## Parsed with column specification:
## cols(
## age = col_integer(),
## workclass = col_character(),
## fnlwgt = col_integer(),
## education = col_character(),
## `educational-num` = col_integer(),
## `marital-status` = col_character(),
## occupation = col_character(),
## relationship = col_character(),
## race = col_character(),
## gender = col_character(),
## `capital-gain` = col_integer(),
## `capital-loss` = col_integer(),
## `hours-per-week` = col_integer(),
## `native-country` = col_character(),
## income = col_character()
## )
# Preview the first 10 rows of the raw data
head(adult,10)
## # A tibble: 10 × 15
## age workclass fnlwgt education `educational-num`
## <int> <chr> <int> <chr> <int>
## 1 25 Private 226802 11th 7
## 2 38 Private 89814 HS-grad 9
## 3 28 Local-gov 336951 Assoc-acdm 12
## 4 44 Private 160323 Some-college 10
## 5 18 ? 103497 Some-college 10
## 6 34 Private 198693 10th 6
## 7 29 ? 227026 HS-grad 9
## 8 63 Self-emp-not-inc 104626 Prof-school 15
## 9 24 Private 369667 Some-college 10
## 10 55 Private 104996 7th-8th 4
## # ... with 10 more variables: `marital-status` <chr>, occupation <chr>,
## # relationship <chr>, race <chr>, gender <chr>, `capital-gain` <int>,
## # `capital-loss` <int>, `hours-per-week` <int>, `native-country` <chr>,
## # income <chr>
# Inspect the column types as parsed by read_csv()
str(adult)
## Classes 'tbl_df', 'tbl' and 'data.frame': 48842 obs. of 15 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : chr "Private" "Private" "Local-gov" "Private" ...
## $ fnlwgt : int 226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
## $ education : chr "11th" "HS-grad" "Assoc-acdm" "Some-college" ...
## $ educational-num: int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital-status : chr "Never-married" "Married-civ-spouse" "Married-civ-spouse" "Married-civ-spouse" ...
## $ occupation : chr "Machine-op-inspct" "Farming-fishing" "Protective-serv" "Machine-op-inspct" ...
## $ relationship : chr "Own-child" "Husband" "Husband" "Husband" ...
## $ race : chr "Black" "White" "White" "Black" ...
## $ gender : chr "Male" "Male" "Male" "Male" ...
## $ capital-gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital-loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours-per-week : int 40 50 40 40 30 30 40 32 40 10 ...
## $ native-country : chr "United-States" "United-States" "United-States" "United-States" ...
## $ income : chr "<=50K" "<=50K" ">50K" ">50K" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 15
## .. ..$ age : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ workclass : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ fnlwgt : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ education : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ educational-num: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ marital-status : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ occupation : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ relationship : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ race : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ gender : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ capital-gain : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ capital-loss : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ hours-per-week : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ native-country : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ income : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Count how often each raw workclass value occurs
table(adult$workclass)
##
## ? Federal-gov Local-gov Never-worked
## 2799 1432 3136 10
## Private Self-emp-inc Self-emp-not-inc State-gov
## 33906 1695 3862 1981
## Without-pay
## 21
### 2799 rows have a missing workclass, coded as "?"
# Collapse the two "no paid work" workclass labels into a single category.
#
# job: character (or factor) vector of workclass labels; any length.
# Returns a character vector of the same length in which "Never-worked" and
# "Without-pay" are replaced by "Unemployed"; every other value, including
# NA, is returned unchanged.
unemployed <- function(job) {
  job <- as.character(job)
  # %in% never yields NA, so missing values pass through instead of
  # breaking an if() condition
  job[job %in% c("Never-worked", "Without-pay")] <- "Unemployed"
  job
}
# Recode every row; vapply() guarantees a plain character vector and avoids
# attaching one name per row the way sapply() does
adult$workclass <- vapply(adult$workclass, unemployed, character(1),
                          USE.NAMES = FALSE)
table(adult$workclass)
##
## ? Federal-gov Local-gov Private
## 2799 1432 3136 33906
## Self-emp-inc Self-emp-not-inc State-gov Unemployed
## 1695 3862 1981 31
# Merge fine-grained workclass labels into broader employment groups.
#
# job: character (or factor) vector of workclass labels; any length.
# Returns a character vector of the same length where
#   "Local-gov" / "State-gov"               -> "SL-gov"
#   "Self-emp-inc" / "Self-emp-not-inc"     -> "self-emp"
# and every other value, including NA, is returned unchanged.
group_emp <- function(job) {
  # as.character() mirrors the sibling recoding helpers and makes factor
  # input safe to overwrite with new labels
  job <- as.character(job)
  job[job %in% c("Local-gov", "State-gov")] <- "SL-gov"
  job[job %in% c("Self-emp-inc", "Self-emp-not-inc")] <- "self-emp"
  job
}
# Apply the employment grouping row by row; vapply() pins the result to an
# unnamed character vector
adult$workclass <- vapply(adult$workclass, group_emp, character(1),
                          USE.NAMES = FALSE)
table(adult$workclass)
##
## ? Federal-gov Private self-emp SL-gov Unemployed
## 2799 1432 33906 5557 5117 31
# Count how often each raw marital-status value occurs
table(adult$`marital-status`)
##
## Divorced Married-AF-spouse Married-civ-spouse
## 6633 37 22379
## Married-spouse-absent Never-married Separated
## 628 16117 1530
## Widowed
## 1518
# Reduce the marital-status labels to three groups.
#
# mar: character (or factor) vector of marital-status labels; any length.
# Returns a character vector of the same length:
#   "Separated" / "Divorced" / "Widowed" -> "Not-Married"
#   "Never-married"                      -> "Never-married"
#   any other non-missing label          -> "Married"
#   NA                                   -> NA (missing stays missing)
group_marital <- function(mar) {
  mar <- as.character(mar)
  # Start from the catch-all group, then overwrite the specific cases
  grouped <- rep("Married", length(mar))
  grouped[mar %in% c("Separated", "Divorced", "Widowed")] <- "Not-Married"
  grouped[mar %in% "Never-married"] <- "Never-married"
  grouped[is.na(mar)] <- NA_character_
  grouped
}
# Apply the marital grouping row by row; vapply() pins the result to an
# unnamed character vector
adult$`marital-status` <- vapply(adult$`marital-status`, group_marital,
                                 character(1), USE.NAMES = FALSE)
table(adult$`marital-status`)
##
## Married Never-married Not-Married
## 23044 16117 9681
# Tabulate the country column. The data set calls it `native-country`;
# there is no `country` column, so `adult$country` only raised an
# "Unknown column" warning and returned an empty table.
table(adult$`native-country`)
# List the distinct native-country values that the regional grouping has to cover
unique(adult$`native-country`)
## [1] "United-States" "?"
## [3] "Peru" "Guatemala"
## [5] "Mexico" "Dominican-Republic"
## [7] "Ireland" "Germany"
## [9] "Philippines" "Thailand"
## [11] "Haiti" "El-Salvador"
## [13] "Puerto-Rico" "Vietnam"
## [15] "South" "Columbia"
## [17] "Japan" "India"
## [19] "Cambodia" "Poland"
## [21] "Laos" "England"
## [23] "Cuba" "Taiwan"
## [25] "Italy" "Canada"
## [27] "Portugal" "China"
## [29] "Nicaragua" "Honduras"
## [31] "Iran" "Scotland"
## [33] "Jamaica" "Ecuador"
## [35] "Yugoslavia" "Hungary"
## [37] "Hong" "Greece"
## [39] "Trinadad&Tobago" "Outlying-US(Guam-USVI-etc)"
## [41] "France" "Holand-Netherlands"
# Country labels (spelled exactly as they appear in the data) by region.
Asia <- c("China", "Hong", "India", "Iran", "Cambodia", "Japan", "Laos",
          "Philippines", "Vietnam", "Taiwan", "Thailand")
North.America <- c("Canada", "United-States", "Puerto-Rico")
Europe <- c("England", "France", "Germany", "Greece", "Holand-Netherlands",
            "Hungary", "Ireland", "Italy", "Poland", "Portugal", "Scotland",
            "Yugoslavia")
Latin.and.South.America <- c("Columbia", "Cuba", "Dominican-Republic",
                             "Ecuador", "El-Salvador", "Guatemala", "Haiti",
                             "Honduras", "Mexico", "Nicaragua",
                             "Outlying-US(Guam-USVI-etc)", "Peru", "Jamaica",
                             "Trinadad&Tobago")
# Kept for reference only: group_country() sends every label that is not in
# one of the four vectors above (including "South" and "?") to "Other".
Other <- c("South")

# Map native-country labels to a region name.
#
# ctry: character (or factor) vector of country labels; any length.
# Returns a character vector of the same length whose values are "Asia",
# "North.America", "Europe", "Latin.and.South.America" or "Other". Labels
# not listed in any region vector (including "?" and NA) become "Other".
group_country <- function(ctry) {
  ctry <- as.character(ctry)
  regions <- list(
    Asia = Asia,
    North.America = North.America,
    Europe = Europe,
    Latin.and.South.America = Latin.and.South.America
  )
  grouped <- rep("Other", length(ctry))
  # Assign in reverse so that, if a label were ever listed in two regions,
  # the earlier region in the list wins
  for (region in rev(names(regions))) {
    grouped[ctry %in% regions[[region]]] <- region
  }
  grouped
}
# Apply the regional grouping row by row; vapply() pins the result to an
# unnamed character vector
adult$`native-country` <- vapply(adult$`native-country`, group_country,
                                 character(1), USE.NAMES = FALSE)
table(adult$`native-country`)
##
## Asia Europe Latin.and.South.America
## 981 780 1911
## North.America Other
## 44198 972
# Re-check the column types after the regrouping steps
str(adult)
## Classes 'tbl_df', 'tbl' and 'data.frame': 48842 obs. of 15 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : chr "Private" "Private" "SL-gov" "Private" ...
## $ fnlwgt : int 226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
## $ education : chr "11th" "HS-grad" "Assoc-acdm" "Some-college" ...
## $ educational-num: int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital-status : chr "Never-married" "Married" "Married" "Married" ...
## $ occupation : chr "Machine-op-inspct" "Farming-fishing" "Protective-serv" "Machine-op-inspct" ...
## $ relationship : chr "Own-child" "Husband" "Husband" "Husband" ...
## $ race : chr "Black" "White" "White" "Black" ...
## $ gender : chr "Male" "Male" "Male" "Male" ...
## $ capital-gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital-loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours-per-week : int 40 50 40 40 30 30 40 32 40 10 ...
## $ native-country : chr "North.America" "North.America" "North.America" "North.America" ...
## $ income : chr "<=50K" "<=50K" ">50K" ">50K" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 15
## .. ..$ age : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ workclass : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ fnlwgt : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ education : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ educational-num: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ marital-status : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ occupation : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ relationship : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ race : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ gender : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ capital-gain : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ capital-loss : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ hours-per-week : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ native-country : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ income : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Recode the data set's missing-value marker "?" as NA so that na.omit() can
# drop those rows later. The previous comparison against '4' left every "?"
# in place and instead blanked each cell whose value was 4 (for example
# educational-num == 4), which silently removed unrelated rows.
# NOTE(review): output recorded further down was produced with the old
# comparison and will change once this is re-run.
adult[adult == "?"] <- NA
# Convert the regrouped columns to factors with one vectorised call each;
# levels = unique(.) keeps levels in order of first appearance, as before.
adult$workclass <- factor(adult$workclass,
                          levels = unique(adult$workclass))
adult$`native-country` <- factor(adult$`native-country`,
                                 levels = unique(adult$`native-country`))
adult$`marital-status` <- factor(adult$`marital-status`,
                                 levels = unique(adult$`marital-status`))
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.3.2
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Workclass counts after the factor conversion
table(adult$workclass)
##
## Private SL-gov ? self-emp Federal-gov Unemployed
## 33906 5117 2799 5557 1432 31
# Turn every categorical column into a factor in a single pass.
# factor() is already vectorised, so mapping it over individual elements
# (one factor per row, then unlisting) is unnecessary work.
factor_cols <- c("workclass", "native-country", "marital-status",
                 "occupation", "relationship", "income", "education",
                 "gender", "race")
adult[factor_cols] <- lapply(adult[factor_cols], function(values) {
  values <- as.character(values)
  # Levels in order of first appearance, matching what the element-wise
  # conversion produced; NA never becomes a level
  factor(values, levels = unique(values[!is.na(values)]))
})
# Draw the missingness map before dropping incomplete rows; once na.omit()
# has run there is nothing left for the map to show.
missmap(adult)
adult <- na.omit(adult)
## Warning in if (class(obj) == "amelia") {: 條件的長度 > 1,因此只能用其第一
## 元素
## Warning: Unknown column 'arguments'
## Warning: Unknown column 'arguments'
## Warning: Unknown column 'imputations'
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caTools)
## Warning: package 'caTools' was built under R version 3.3.2
# Seed the RNG so the random 70/30 split, and every metric computed from it,
# can be reproduced
set.seed(1)
# Logical mask: TRUE marks rows assigned to the training set.
# SplitRatio is the share of rows flagged TRUE.
sample <- sample.split(adult$income, SplitRatio = 0.70)
# Training data
train <- adult[sample, ]
# Testing data: one copy per model so that each model's prediction columns
# stay separate
test <- adult[!sample, ]  # for the logistic regression model
test2 <- test             # for the decision tree
test3 <- test             # for the random forest
suppressWarnings(library(caret))
## Loading required package: lattice
# Logistic regression of income on every other column
glm_model <- glm(income ~ ., family = binomial(logit), data = train)
# Predicted probabilities on the held-out rows. The 0.5 cut-off below treats
# them as P(income == ">50K"), i.e. the second level of `income`.
test$predicted.income <- predict(glm_model, newdata = test, type = "response")
# Nothing in this block is random; the seed is kept so that code further
# down sees the same RNG state as before.
set.seed(1)
# confusionMatrix() compares factors, so give the predicted labels the same
# levels as the reference instead of passing a bare character vector
test$income_class <- factor(
  ifelse(test$predicted.income > 0.5, ">50K", "<=50K"),
  levels = levels(test$income)
)
glm_con <- confusionMatrix(test$income_class, test$income)
glm_con
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 10077 1359
## >50K 779 2127
##
## Accuracy : 0.8509
## 95% CI : (0.845, 0.8567)
## No Information Rate : 0.7569
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5706
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9282
## Specificity : 0.6102
## Pos Pred Value : 0.8812
## Neg Pred Value : 0.7319
## Prevalence : 0.7569
## Detection Rate : 0.7026
## Detection Prevalence : 0.7974
## Balanced Accuracy : 0.7692
##
## 'Positive' Class : <=50K
##
suppressWarnings(library(caTools))
suppressWarnings(library(caTools))
# Area under the ROC curve for the logistic model's predicted probabilities;
# plotROC = TRUE also draws the curve
colAUC(test$predicted.income,test$income, plotROC = TRUE)
## [,1]
## <=50K vs. >50K 0.9077893
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.3.2
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 3.3.2
suppressWarnings(library(rpart))
suppressWarnings(library(ROCR))
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Make the column names syntactic (e.g. `capital-gain` -> capital.gain) in
# both frames so the formula interface and predict() see matching names
names(train) <- make.names(names(train))
names(test2) <- make.names(names(test2))
# Classification tree of income on every other column
tree_model <- rpart(income ~ ., data = train, method = "class")
# One probability column per income level
all_probs <- predict(tree_model, newdata = test2, type = "prob")
# Pick the "<=50K" column by name rather than by position, and return the
# labels as a factor with the reference's levels for confusionMatrix()
test2$income_class <- factor(
  ifelse(all_probs[, "<=50K"] > 0.5, "<=50K", ">50K"),
  levels = levels(test2$income)
)
dt_con <- confusionMatrix(test2$income_class, test2$income)
dt_con
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 10281 1663
## >50K 575 1823
##
## Accuracy : 0.844
## 95% CI : (0.8379, 0.8499)
## No Information Rate : 0.7569
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5257
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9470
## Specificity : 0.5229
## Pos Pred Value : 0.8608
## Neg Pred Value : 0.7602
## Prevalence : 0.7569
## Detection Rate : 0.7168
## Detection Prevalence : 0.8328
## Balanced Accuracy : 0.7350
##
## 'Positive' Class : <=50K
##
suppressWarnings(library(randomForest))
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Ensure syntactic column names in the training frame and in the random
# forest's copy of the test frame
colnames(train) <- make.names(colnames(train))
colnames(test3) <- make.names(colnames(test3))
# Fix the seed immediately before fitting so the forest can be reproduced
set.seed(32423)
# Random forest of income on all remaining columns, package defaults
rfFit <- randomForest(income ~ ., data = train)
print(rfFit)
##
## Call:
## randomForest(formula = income ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 13.72%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 23777 1554 0.06134776
## >50K 3038 5095 0.37353990
# Predicted class labels for the held-out rows
rf_pred <- predict(rfFit, newdata = test3, type = "class")
# Compare the predictions with the true income labels
rf_con <- confusionMatrix(data = rf_pred, reference = test3$income)
rf_con
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 10181 1265
## >50K 675 2221
##
## Accuracy : 0.8647
## 95% CI : (0.859, 0.8703)
## No Information Rate : 0.7569
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.61
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9378
## Specificity : 0.6371
## Pos Pred Value : 0.8895
## Neg Pred Value : 0.7669
## Prevalence : 0.7569
## Detection Rate : 0.7099
## Detection Prevalence : 0.7981
## Balanced Accuracy : 0.7875
##
## 'Positive' Class : <=50K
##
# Pull the first entry of each confusion matrix's `overall` vector (the
# accuracy, as printed above)
dtAcu <- dt_con$overall[1]
glmAcu <- glm_con$overall[1]
rfAcu <- rf_con$overall[1]
# One row per model, in the same order as the labels
ACU <- data.frame(
  Model = c("Decision Tree", "Logistic Regression", "Random Forest"),
  Accuracy = c(dtAcu, glmAcu, rfAcu)
)
# Bar chart of the three accuracies
ggplot(ACU, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  ggtitle("Accuracies of Models")
# Judged by accuracy alone, the random forest is slightly better than the
# other two models.
suppressWarnings(library(mlr))
## Loading required package: ParamHelpers
##
## Attaching package: 'mlr'
## The following object is masked from 'package:ROCR':
##
## performance
## The following object is masked from 'package:caret':
##
## train
# List the hyper-parameters mlr exposes for the randomForest learner
getParamSet("classif.randomForest")
## Type len Def Constr Req Tunable Trafo
## ntree integer - 500 1 to Inf - TRUE -
## mtry integer - - 1 to Inf - TRUE -
## replace logical - TRUE - - TRUE -
## classwt numericvector <NA> - 0 to Inf - TRUE -
## cutoff numericvector <NA> - 0 to 1 - TRUE -
## strata untyped - - - - FALSE -
## sampsize integervector <NA> - 1 to Inf - TRUE -
## nodesize integer - 1 1 to Inf - TRUE -
## maxnodes integer - - 1 to Inf - TRUE -
## importance logical - FALSE - - TRUE -
## localImp logical - FALSE - - TRUE -
## proximity logical - FALSE - - FALSE -
## oob.prox logical - - - Y FALSE -
## norm.votes logical - TRUE - - FALSE -
## do.trace logical - FALSE - - FALSE -
## keep.forest logical - TRUE - - FALSE -
## keep.inbag logical - FALSE - - FALSE -
# Wrap the training and test frames as mlr classification tasks with
# `income` as the target
trainTask <- makeClassifTask(data = train, target = "income")
testTask <- makeClassifTask(data = test3, target = "income")
# Base learner: random forest returning class labels, with starting values
# for ntree and mtry
rf <- makeLearner(
  "classif.randomForest",
  predict.type = "response",
  par.vals = list(ntree = 200, mtry = 3)
)
# Search space for the three tuned hyper-parameters
rf_param <- makeParamSet(
  makeIntegerParam("ntree", lower = 50, upper = 500),
  makeIntegerParam("mtry", lower = 3, upper = 10),
  makeIntegerParam("nodesize", lower = 10, upper = 50)
)
# Random search: 50 candidate settings, each scored by 4-fold
# cross-validated accuracy on the training task
rancontrol <- makeTuneControlRandom(maxit = 50L)
set_cv <- makeResampleDesc("CV", iters = 4L)
rf_tune <- tuneParams(
  learner = rf,
  task = trainTask,
  resampling = set_cv,
  measures = acc,
  par.set = rf_param,
  control = rancontrol
)
## [Tune] Started tuning learner classif.randomForest for parameter set:
## Type len Def Constr Req Tunable Trafo
## ntree integer - - 50 to 500 - TRUE -
## mtry integer - - 3 to 10 - TRUE -
## nodesize integer - - 10 to 50 - TRUE -
## With control class: TuneControlRandom
## Imputation value: -0
## [Tune-x] 1: ntree=233; mtry=9; nodesize=43
## [Tune-y] 1: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 2: ntree=133; mtry=10; nodesize=27
## [Tune-y] 2: acc.test.mean=0.859; time: 0.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 3: ntree=72; mtry=6; nodesize=22
## [Tune-y] 3: acc.test.mean=0.859; time: 0.3 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 4: ntree=452; mtry=7; nodesize=26
## [Tune-y] 4: acc.test.mean=0.859; time: 1.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 5: ntree=482; mtry=5; nodesize=11
## [Tune-y] 5: acc.test.mean=0.858; time: 1.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 6: ntree=66; mtry=8; nodesize=26
## [Tune-y] 6: acc.test.mean=0.858; time: 0.3 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 7: ntree=460; mtry=7; nodesize=47
## [Tune-y] 7: acc.test.mean=0.861; time: 1.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 8: ntree=377; mtry=7; nodesize=42
## [Tune-y] 8: acc.test.mean=0.861; time: 1.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 9: ntree=199; mtry=6; nodesize=29
## [Tune-y] 9: acc.test.mean=0.86; time: 0.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 10: ntree=226; mtry=3; nodesize=27
## [Tune-y] 10: acc.test.mean=0.862; time: 0.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 11: ntree=221; mtry=5; nodesize=42
## [Tune-y] 11: acc.test.mean=0.862; time: 0.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 12: ntree=61; mtry=7; nodesize=38
## [Tune-y] 12: acc.test.mean=0.86; time: 0.2 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 13: ntree=270; mtry=5; nodesize=19
## [Tune-y] 13: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 14: ntree=241; mtry=9; nodesize=24
## [Tune-y] 14: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 15: ntree=194; mtry=9; nodesize=15
## [Tune-y] 15: acc.test.mean=0.858; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 16: ntree=253; mtry=5; nodesize=27
## [Tune-y] 16: acc.test.mean=0.86; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 17: ntree=471; mtry=3; nodesize=23
## [Tune-y] 17: acc.test.mean=0.862; time: 1.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 18: ntree=161; mtry=10; nodesize=15
## [Tune-y] 18: acc.test.mean=0.857; time: 0.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 19: ntree=209; mtry=7; nodesize=41
## [Tune-y] 19: acc.test.mean=0.862; time: 0.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 20: ntree=435; mtry=9; nodesize=30
## [Tune-y] 20: acc.test.mean=0.86; time: 1.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 21: ntree=140; mtry=4; nodesize=43
## [Tune-y] 21: acc.test.mean=0.862; time: 0.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 22: ntree=154; mtry=6; nodesize=31
## [Tune-y] 22: acc.test.mean=0.86; time: 0.6 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 23: ntree=136; mtry=5; nodesize=25
## [Tune-y] 23: acc.test.mean=0.859; time: 0.5 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 24: ntree=457; mtry=8; nodesize=29
## [Tune-y] 24: acc.test.mean=0.859; time: 1.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 25: ntree=263; mtry=6; nodesize=39
## [Tune-y] 25: acc.test.mean=0.862; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 26: ntree=316; mtry=4; nodesize=32
## [Tune-y] 26: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 27: ntree=97; mtry=10; nodesize=32
## [Tune-y] 27: acc.test.mean=0.859; time: 0.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 28: ntree=122; mtry=9; nodesize=27
## [Tune-y] 28: acc.test.mean=0.859; time: 0.5 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 29: ntree=259; mtry=6; nodesize=18
## [Tune-y] 29: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 30: ntree=242; mtry=6; nodesize=49
## [Tune-y] 30: acc.test.mean=0.861; time: 0.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 31: ntree=227; mtry=8; nodesize=32
## [Tune-y] 31: acc.test.mean=0.859; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 32: ntree=469; mtry=9; nodesize=41
## [Tune-y] 32: acc.test.mean=0.861; time: 1.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 33: ntree=317; mtry=4; nodesize=40
## [Tune-y] 33: acc.test.mean=0.861; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 34: ntree=170; mtry=4; nodesize=33
## [Tune-y] 34: acc.test.mean=0.86; time: 0.6 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 35: ntree=473; mtry=7; nodesize=16
## [Tune-y] 35: acc.test.mean=0.859; time: 1.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 36: ntree=479; mtry=6; nodesize=26
## [Tune-y] 36: acc.test.mean=0.86; time: 1.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 37: ntree=347; mtry=10; nodesize=17
## [Tune-y] 37: acc.test.mean=0.858; time: 1.6 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 38: ntree=67; mtry=4; nodesize=48
## [Tune-y] 38: acc.test.mean=0.861; time: 0.2 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 39: ntree=326; mtry=5; nodesize=19
## [Tune-y] 39: acc.test.mean=0.86; time: 1.2 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 40: ntree=492; mtry=3; nodesize=32
## [Tune-y] 40: acc.test.mean=0.861; time: 1.5 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 41: ntree=242; mtry=4; nodesize=39
## [Tune-y] 41: acc.test.mean=0.862; time: 0.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 42: ntree=271; mtry=4; nodesize=36
## [Tune-y] 42: acc.test.mean=0.862; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 43: ntree=288; mtry=9; nodesize=48
## [Tune-y] 43: acc.test.mean=0.861; time: 1.2 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 44: ntree=354; mtry=5; nodesize=19
## [Tune-y] 44: acc.test.mean=0.86; time: 1.3 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 45: ntree=87; mtry=7; nodesize=12
## [Tune-y] 45: acc.test.mean=0.856; time: 0.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 46: ntree=205; mtry=10; nodesize=42
## [Tune-y] 46: acc.test.mean=0.86; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 47: ntree=74; mtry=8; nodesize=28
## [Tune-y] 47: acc.test.mean=0.859; time: 0.3 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 48: ntree=92; mtry=5; nodesize=16
## [Tune-y] 48: acc.test.mean=0.858; time: 0.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 49: ntree=117; mtry=4; nodesize=16
## [Tune-y] 49: acc.test.mean=0.859; time: 0.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 50: ntree=398; mtry=8; nodesize=45
## [Tune-y] 50: acc.test.mean=0.861; time: 1.5 min; memory: 239Mb use, 1828Mb max
## [Tune] Result: ntree=140; mtry=4; nodesize=43 : acc.test.mean=0.862
# Best hyper-parameter combination found by the search
rf_tune$x
## $ntree
## [1] 140
##
## $mtry
## [1] 4
##
## $nodesize
## [1] 43
# Configure the learner with the best hyper-parameters from the search
rf.tree <- setHyperPars(rf, par.vals = rf_tune$x)
# `train` is also the name of the training data frame and of caret::train(),
# so qualify the call to make clear that mlr's trainer is intended
rforest <- mlr::train(rf.tree, trainTask)
# Keep the predictions in their own object instead of overwriting the tuning
# result `rf_tune`, so the lines above can be re-run safely
rf_tuned_pred <- predict(rforest, testTask)
rf_con_tune <- confusionMatrix(rf_tuned_pred$data$response, test3$income)
rf_tune_ACU <- rf_con_tune$overall[1]
rf_VS_rfTuned <- data.frame(
  Model = c("Random Forest Tuned", "Random Forest"),
  Accuracy = c(rf_tune_ACU, rfAcu)
)
dotplot(Model ~ Accuracy, data = rf_VS_rfTuned,
        main = "RF VS. RF Tuned")
# NOTE(review): the recorded runs show a cross-validated accuracy of 0.862
# for the tuned forest and a test accuracy of 0.8647 for the default forest,
# so confirm from the plot whether tuning actually improved accuracy.