Objective:

Predict if people in the data set belong in a certain class by salary, either making <=50k or >50k per year by comparing three models : Logistics Regression, Decision Tree and RandomForest.

Import Data

suppressWarnings(library(readr))
adult <- read_csv("C:/Users/dannyhuang/Desktop/adult.csv")
## Parsed with column specification:
## cols(
##   age = col_integer(),
##   workclass = col_character(),
##   fnlwgt = col_integer(),
##   education = col_character(),
##   `educational-num` = col_integer(),
##   `marital-status` = col_character(),
##   occupation = col_character(),
##   relationship = col_character(),
##   race = col_character(),
##   gender = col_character(),
##   `capital-gain` = col_integer(),
##   `capital-loss` = col_integer(),
##   `hours-per-week` = col_integer(),
##   `native-country` = col_character(),
##   income = col_character()
## )
head(adult,10)
## # A tibble: 10 × 15
##      age        workclass fnlwgt    education `educational-num`
##    <int>            <chr>  <int>        <chr>             <int>
## 1     25          Private 226802         11th                 7
## 2     38          Private  89814      HS-grad                 9
## 3     28        Local-gov 336951   Assoc-acdm                12
## 4     44          Private 160323 Some-college                10
## 5     18                ? 103497 Some-college                10
## 6     34          Private 198693         10th                 6
## 7     29                ? 227026      HS-grad                 9
## 8     63 Self-emp-not-inc 104626  Prof-school                15
## 9     24          Private 369667 Some-college                10
## 10    55          Private 104996      7th-8th                 4
## # ... with 10 more variables: `marital-status` <chr>, occupation <chr>,
## #   relationship <chr>, race <chr>, gender <chr>, `capital-gain` <int>,
## #   `capital-loss` <int>, `hours-per-week` <int>, `native-country` <chr>,
## #   income <chr>
str(adult)
## Classes 'tbl_df', 'tbl' and 'data.frame':    48842 obs. of  15 variables:
##  $ age            : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass      : chr  "Private" "Private" "Local-gov" "Private" ...
##  $ fnlwgt         : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ education      : chr  "11th" "HS-grad" "Assoc-acdm" "Some-college" ...
##  $ educational-num: int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital-status : chr  "Never-married" "Married-civ-spouse" "Married-civ-spouse" "Married-civ-spouse" ...
##  $ occupation     : chr  "Machine-op-inspct" "Farming-fishing" "Protective-serv" "Machine-op-inspct" ...
##  $ relationship   : chr  "Own-child" "Husband" "Husband" "Husband" ...
##  $ race           : chr  "Black" "White" "White" "Black" ...
##  $ gender         : chr  "Male" "Male" "Male" "Male" ...
##  $ capital-gain   : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital-loss   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours-per-week : int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native-country : chr  "United-States" "United-States" "United-States" "United-States" ...
##  $ income         : chr  "<=50K" "<=50K" ">50K" ">50K" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 15
##   .. ..$ age            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ workclass      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ fnlwgt         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ education      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ educational-num: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ marital-status : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ occupation     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ relationship   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ race           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ gender         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ capital-gain   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ capital-loss   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ hours-per-week : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ native-country : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ income         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

_Data Cleaning

table(adult$workclass)
## 
##                ?      Federal-gov        Local-gov     Never-worked 
##             2799             1432             3136               10 
##          Private     Self-emp-inc Self-emp-not-inc        State-gov 
##            33906             1695             3862             1981 
##      Without-pay 
##               21
### 2799 NULL Values

Combined ‘Never-worked’ and ‘Without-pay’ into Unemployed

unemployed <- function(job){
    job <- as.character(job)
    if (job=='Never-worked' | job=='Without-pay'){
        return('Unemployed')
    }else{
        return(job)
    }
}
adult$workclass <- sapply(adult$workclass,unemployed)
table(adult$workclass)
## 
##                ?      Federal-gov        Local-gov          Private 
##             2799             1432             3136            33906 
##     Self-emp-inc Self-emp-not-inc        State-gov       Unemployed 
##             1695             3862             1981               31

Combine State and Local gov jobs into a category called SL-gov and combine self-employed jobs into a category called self-emp

group_emp <- function(job){
    if (job=='Local-gov' | job=='State-gov'){
        return('SL-gov')
    }else if (job=='Self-emp-inc' | job=='Self-emp-not-inc'){
        return('self-emp')
    }else{
        return(job)
    }
}


adult$workclass <- sapply(adult$workclass,group_emp)
table(adult$workclass)
## 
##           ? Federal-gov     Private    self-emp      SL-gov  Unemployed 
##        2799        1432       33906        5557        5117          31
table(adult$`marital-status`)
## 
##              Divorced     Married-AF-spouse    Married-civ-spouse 
##                  6633                    37                 22379 
## Married-spouse-absent         Never-married             Separated 
##                   628                 16117                  1530 
##               Widowed 
##                  1518

Reduce martial status group into three groups : Married, Not-Married and Never-Married

group_marital <- function(mar){
    mar <- as.character(mar)
    
    # Not-Married
    if (mar=='Separated' | mar=='Divorced' | mar=='Widowed'){
        return('Not-Married')
    
    # Never-Married   
    }else if(mar=='Never-married'){
        return(mar)
    
     #Married
    }else{
        return('Married')
    }
}

adult$`marital-status` <- sapply(adult$`marital-status` ,group_marital)
table(adult$`marital-status`)
## 
##       Married Never-married   Not-Married 
##         23044         16117          9681

Catgorized ‘country’ into region

table(adult$country)
## Warning: Unknown column 'country'
## < table of extent 0 >
unique(adult$`native-country`)
##  [1] "United-States"              "?"                         
##  [3] "Peru"                       "Guatemala"                 
##  [5] "Mexico"                     "Dominican-Republic"        
##  [7] "Ireland"                    "Germany"                   
##  [9] "Philippines"                "Thailand"                  
## [11] "Haiti"                      "El-Salvador"               
## [13] "Puerto-Rico"                "Vietnam"                   
## [15] "South"                      "Columbia"                  
## [17] "Japan"                      "India"                     
## [19] "Cambodia"                   "Poland"                    
## [21] "Laos"                       "England"                   
## [23] "Cuba"                       "Taiwan"                    
## [25] "Italy"                      "Canada"                    
## [27] "Portugal"                   "China"                     
## [29] "Nicaragua"                  "Honduras"                  
## [31] "Iran"                       "Scotland"                  
## [33] "Jamaica"                    "Ecuador"                   
## [35] "Yugoslavia"                 "Hungary"                   
## [37] "Hong"                       "Greece"                    
## [39] "Trinadad&Tobago"            "Outlying-US(Guam-USVI-etc)"
## [41] "France"                     "Holand-Netherlands"
Asia <- c('China','Hong','India','Iran','Cambodia','Japan', 'Laos' ,
          'Philippines' ,'Vietnam' ,'Taiwan', 'Thailand')

North.America <- c('Canada','United-States','Puerto-Rico' )

Europe <- c('England' ,'France', 'Germany' ,'Greece','Holand-Netherlands','Hungary',
            'Ireland','Italy','Poland','Portugal','Scotland','Yugoslavia')

Latin.and.South.America <- c('Columbia','Cuba','Dominican-Republic','Ecuador',
                             'El-Salvador','Guatemala','Haiti','Honduras',
                             'Mexico','Nicaragua','Outlying-US(Guam-USVI-etc)','Peru',
                            'Jamaica','Trinadad&Tobago')
Other <- c('South')
group_country <- function(ctry){
    if (ctry %in% Asia){
        return('Asia')
    }else if (ctry %in% North.America){
        return('North.America')
    }else if (ctry %in% Europe){
        return('Europe')
    }else if (ctry %in% Latin.and.South.America){
        return('Latin.and.South.America')
    }else{
        return('Other')      
    }
}

adult$`native-country` <- sapply(adult$`native-country`,group_country)
table(adult$`native-country`)
## 
##                    Asia                  Europe Latin.and.South.America 
##                     981                     780                    1911 
##           North.America                   Other 
##                   44198                     972

Factorize the newly created column

str(adult)
## Classes 'tbl_df', 'tbl' and 'data.frame':    48842 obs. of  15 variables:
##  $ age            : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass      : chr  "Private" "Private" "SL-gov" "Private" ...
##  $ fnlwgt         : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ education      : chr  "11th" "HS-grad" "Assoc-acdm" "Some-college" ...
##  $ educational-num: int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital-status : chr  "Never-married" "Married" "Married" "Married" ...
##  $ occupation     : chr  "Machine-op-inspct" "Farming-fishing" "Protective-serv" "Machine-op-inspct" ...
##  $ relationship   : chr  "Own-child" "Husband" "Husband" "Husband" ...
##  $ race           : chr  "Black" "White" "White" "Black" ...
##  $ gender         : chr  "Male" "Male" "Male" "Male" ...
##  $ capital-gain   : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital-loss   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours-per-week : int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native-country : chr  "North.America" "North.America" "North.America" "North.America" ...
##  $ income         : chr  "<=50K" "<=50K" ">50K" ">50K" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 15
##   .. ..$ age            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ workclass      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ fnlwgt         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ education      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ educational-num: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ marital-status : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ occupation     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ relationship   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ race           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ gender         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ capital-gain   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ capital-loss   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ hours-per-week : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ native-country : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ income         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
adult$workclass <- sapply(adult$workclass,factor)
adult$`native-country` <- sapply(adult$`native-country`,factor)
adult$`marital-status` <- sapply(adult$`marital-status`,factor)

Missing Data

adult[adult == '4'] <- NA
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.3.2
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
table(adult$workclass)
## 
##     Private      SL-gov           ?    self-emp Federal-gov  Unemployed 
##       33906        5117        2799        5557        1432          31

Factorize all the column except INT column

adult$workclass <- sapply(adult$workclass,factor)
adult$`native-country` <- sapply(adult$`native-country`,factor)
adult$`marital-status` <- sapply(adult$`marital-status`,factor)
adult$occupation <- sapply(adult$occupation,factor)
adult$relationship <- sapply(adult$relationship,factor)
adult$income <- sapply(adult$income,factor)
adult$education<- sapply(adult$education,factor)
adult$gender<- sapply(adult$gender,factor)
adult$race<- sapply(adult$race,factor)

Omit the missing values

adult <- na.omit(adult)

Check the map again

missmap(adult)
## Warning in if (class(obj) == "amelia") {: 條件的長度 > 1,因此只能用其第一
## 元素
## Warning: Unknown column 'arguments'

## Warning: Unknown column 'arguments'
## Warning: Unknown column 'imputations'

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

A Quick View about hours-per-week by histogram

A Quick View about relationship between “region” and “income”

Preprocess data into Train and Test

library(caTools)
## Warning: package 'caTools' was built under R version 3.3.2
sample <- sample.split(adult$income, SplitRatio = 0.70) # SplitRatio = percent of sample==TRUE

# Training Data
train = subset(adult, sample == TRUE)

# Testing Data 
test = subset(adult, sample == FALSE) #for logistics model
test2= test # for Desicison Tree
test3=test # for RandomForst

Logistics Model

suppressWarnings(library(caret))
## Loading required package: lattice
glm_model = glm(income ~ ., family = binomial(logit), data = train)
test$predicted.income = predict(glm_model, newdata=test, type="response")
set.seed(1)
test$income_class <- ifelse(test$predicted.income > 0.5, ">50K","<=50K")
glm_con <-confusionMatrix(test$income_class,test$income)
glm_con
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 10077  1359
##      >50K    779  2127
##                                          
##                Accuracy : 0.8509         
##                  95% CI : (0.845, 0.8567)
##     No Information Rate : 0.7569         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5706         
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9282         
##             Specificity : 0.6102         
##          Pos Pred Value : 0.8812         
##          Neg Pred Value : 0.7319         
##              Prevalence : 0.7569         
##          Detection Rate : 0.7026         
##    Detection Prevalence : 0.7974         
##       Balanced Accuracy : 0.7692         
##                                          
##        'Positive' Class : <=50K          
## 
suppressWarnings(library(caTools))

Calculate the optimal probability threshold for Sensitivity

suppressWarnings(library(caTools))
colAUC(test$predicted.income,test$income, plotROC = TRUE)

##                     [,1]
## <=50K vs. >50K 0.9077893

Decision Tree

library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.3.2
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 3.3.2
suppressWarnings(library(rpart))
suppressWarnings(library(ROCR))
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
names(train) <- make.names(names(train))
names(test2) <- make.names(names(test2))
tree_model <- rpart(income ~ ., train, method = "class")

all_probs <- predict(tree_model, test2, type = "prob")

test2$income_class <- ifelse(all_probs[,1]>0.5,"<=50K",">50K")
dt_con <- confusionMatrix(test2$income_class,test2$income)
dt_con 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 10281  1663
##      >50K    575  1823
##                                           
##                Accuracy : 0.844           
##                  95% CI : (0.8379, 0.8499)
##     No Information Rate : 0.7569          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5257          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9470          
##             Specificity : 0.5229          
##          Pos Pred Value : 0.8608          
##          Neg Pred Value : 0.7602          
##              Prevalence : 0.7569          
##          Detection Rate : 0.7168          
##    Detection Prevalence : 0.8328          
##       Balanced Accuracy : 0.7350          
##                                           
##        'Positive' Class : <=50K           
## 

RandomForest

suppressWarnings(library(randomForest))
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
names(train) <- make.names(names(train))
names(test3) <- make.names(names(test3))
set.seed(32423)
rfFit<- randomForest(income~.,data= train)

print(rfFit)
## 
## Call:
##  randomForest(formula = income ~ ., data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 13.72%
## Confusion matrix:
##       <=50K >50K class.error
## <=50K 23777 1554  0.06134776
## >50K   3038 5095  0.37353990
rf_pred <- predict(rfFit,test3,type = "class")
rf_con<- confusionMatrix(rf_pred, test3$income)
rf_con
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 10181  1265
##      >50K    675  2221
##                                          
##                Accuracy : 0.8647         
##                  95% CI : (0.859, 0.8703)
##     No Information Rate : 0.7569         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.61           
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9378         
##             Specificity : 0.6371         
##          Pos Pred Value : 0.8895         
##          Neg Pred Value : 0.7669         
##              Prevalence : 0.7569         
##          Detection Rate : 0.7099         
##    Detection Prevalence : 0.7981         
##       Balanced Accuracy : 0.7875         
##                                          
##        'Positive' Class : <=50K          
## 

Performance Comparison

glmAcu <- glm_con$overall[1]
dtAcu<- dt_con$overall[1]
rfAcu<- rf_con$overall[1]

ACU <- data.frame(Model=c("Decision Tree","Logistic Regression","Random Forest"),Accuracy=c(dtAcu,glmAcu,rfAcu))
ggplot(ACU,aes(x=Model,y=Accuracy,fill=Model))+geom_bar(stat = 'identity')+theme_bw()+ggtitle('Accuracies of Models')

#It shows that, based on Accuracy, Random Forest Model is a relatively better.

Tune Random Forest

suppressWarnings(library(mlr))
## Loading required package: ParamHelpers
## 
## Attaching package: 'mlr'
## The following object is masked from 'package:ROCR':
## 
##     performance
## The following object is masked from 'package:caret':
## 
##     train
getParamSet("classif.randomForest")
##                      Type  len   Def   Constr Req Tunable Trafo
## ntree             integer    -   500 1 to Inf   -    TRUE     -
## mtry              integer    -     - 1 to Inf   -    TRUE     -
## replace           logical    -  TRUE        -   -    TRUE     -
## classwt     numericvector <NA>     - 0 to Inf   -    TRUE     -
## cutoff      numericvector <NA>     -   0 to 1   -    TRUE     -
## strata            untyped    -     -        -   -   FALSE     -
## sampsize    integervector <NA>     - 1 to Inf   -    TRUE     -
## nodesize          integer    -     1 1 to Inf   -    TRUE     -
## maxnodes          integer    -     - 1 to Inf   -    TRUE     -
## importance        logical    - FALSE        -   -    TRUE     -
## localImp          logical    - FALSE        -   -    TRUE     -
## proximity         logical    - FALSE        -   -   FALSE     -
## oob.prox          logical    -     -        -   Y   FALSE     -
## norm.votes        logical    -  TRUE        -   -   FALSE     -
## do.trace          logical    - FALSE        -   -   FALSE     -
## keep.forest       logical    -  TRUE        -   -   FALSE     -
## keep.inbag        logical    - FALSE        -   -   FALSE     -
trainTask <- makeClassifTask(data = train,target = "income")
testTask <- makeClassifTask(data = test3, target = "income")

rf <- makeLearner("classif.randomForest", predict.type = "response", par.vals = list(ntree = 200, mtry = 3))
rf_param <- makeParamSet(
makeIntegerParam("ntree",lower = 50, upper = 500),
makeIntegerParam("mtry", lower = 3, upper = 10),
makeIntegerParam("nodesize", lower = 10, upper = 50)
)

rancontrol <- makeTuneControlRandom(maxit = 50L)
                                    
set_cv <- makeResampleDesc("CV",iters = 4L)

rf_tune <- tuneParams(learner = rf, resampling = set_cv, task = trainTask, par.set = rf_param, control = rancontrol, measures = acc)
## [Tune] Started tuning learner classif.randomForest for parameter set:
##             Type len Def    Constr Req Tunable Trafo
## ntree    integer   -   - 50 to 500   -    TRUE     -
## mtry     integer   -   -   3 to 10   -    TRUE     -
## nodesize integer   -   -  10 to 50   -    TRUE     -
## With control class: TuneControlRandom
## Imputation value: -0
## [Tune-x] 1: ntree=233; mtry=9; nodesize=43
## [Tune-y] 1: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 2: ntree=133; mtry=10; nodesize=27
## [Tune-y] 2: acc.test.mean=0.859; time: 0.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 3: ntree=72; mtry=6; nodesize=22
## [Tune-y] 3: acc.test.mean=0.859; time: 0.3 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 4: ntree=452; mtry=7; nodesize=26
## [Tune-y] 4: acc.test.mean=0.859; time: 1.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 5: ntree=482; mtry=5; nodesize=11
## [Tune-y] 5: acc.test.mean=0.858; time: 1.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 6: ntree=66; mtry=8; nodesize=26
## [Tune-y] 6: acc.test.mean=0.858; time: 0.3 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 7: ntree=460; mtry=7; nodesize=47
## [Tune-y] 7: acc.test.mean=0.861; time: 1.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 8: ntree=377; mtry=7; nodesize=42
## [Tune-y] 8: acc.test.mean=0.861; time: 1.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 9: ntree=199; mtry=6; nodesize=29
## [Tune-y] 9: acc.test.mean=0.86; time: 0.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 10: ntree=226; mtry=3; nodesize=27
## [Tune-y] 10: acc.test.mean=0.862; time: 0.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 11: ntree=221; mtry=5; nodesize=42
## [Tune-y] 11: acc.test.mean=0.862; time: 0.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 12: ntree=61; mtry=7; nodesize=38
## [Tune-y] 12: acc.test.mean=0.86; time: 0.2 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 13: ntree=270; mtry=5; nodesize=19
## [Tune-y] 13: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 14: ntree=241; mtry=9; nodesize=24
## [Tune-y] 14: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 15: ntree=194; mtry=9; nodesize=15
## [Tune-y] 15: acc.test.mean=0.858; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 16: ntree=253; mtry=5; nodesize=27
## [Tune-y] 16: acc.test.mean=0.86; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 17: ntree=471; mtry=3; nodesize=23
## [Tune-y] 17: acc.test.mean=0.862; time: 1.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 18: ntree=161; mtry=10; nodesize=15
## [Tune-y] 18: acc.test.mean=0.857; time: 0.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 19: ntree=209; mtry=7; nodesize=41
## [Tune-y] 19: acc.test.mean=0.862; time: 0.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 20: ntree=435; mtry=9; nodesize=30
## [Tune-y] 20: acc.test.mean=0.86; time: 1.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 21: ntree=140; mtry=4; nodesize=43
## [Tune-y] 21: acc.test.mean=0.862; time: 0.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 22: ntree=154; mtry=6; nodesize=31
## [Tune-y] 22: acc.test.mean=0.86; time: 0.6 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 23: ntree=136; mtry=5; nodesize=25
## [Tune-y] 23: acc.test.mean=0.859; time: 0.5 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 24: ntree=457; mtry=8; nodesize=29
## [Tune-y] 24: acc.test.mean=0.859; time: 1.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 25: ntree=263; mtry=6; nodesize=39
## [Tune-y] 25: acc.test.mean=0.862; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 26: ntree=316; mtry=4; nodesize=32
## [Tune-y] 26: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 27: ntree=97; mtry=10; nodesize=32
## [Tune-y] 27: acc.test.mean=0.859; time: 0.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 28: ntree=122; mtry=9; nodesize=27
## [Tune-y] 28: acc.test.mean=0.859; time: 0.5 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 29: ntree=259; mtry=6; nodesize=18
## [Tune-y] 29: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 30: ntree=242; mtry=6; nodesize=49
## [Tune-y] 30: acc.test.mean=0.861; time: 0.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 31: ntree=227; mtry=8; nodesize=32
## [Tune-y] 31: acc.test.mean=0.859; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 32: ntree=469; mtry=9; nodesize=41
## [Tune-y] 32: acc.test.mean=0.861; time: 1.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 33: ntree=317; mtry=4; nodesize=40
## [Tune-y] 33: acc.test.mean=0.861; time: 1.0 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 34: ntree=170; mtry=4; nodesize=33
## [Tune-y] 34: acc.test.mean=0.86; time: 0.6 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 35: ntree=473; mtry=7; nodesize=16
## [Tune-y] 35: acc.test.mean=0.859; time: 1.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 36: ntree=479; mtry=6; nodesize=26
## [Tune-y] 36: acc.test.mean=0.86; time: 1.7 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 37: ntree=347; mtry=10; nodesize=17
## [Tune-y] 37: acc.test.mean=0.858; time: 1.6 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 38: ntree=67; mtry=4; nodesize=48
## [Tune-y] 38: acc.test.mean=0.861; time: 0.2 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 39: ntree=326; mtry=5; nodesize=19
## [Tune-y] 39: acc.test.mean=0.86; time: 1.2 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 40: ntree=492; mtry=3; nodesize=32
## [Tune-y] 40: acc.test.mean=0.861; time: 1.5 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 41: ntree=242; mtry=4; nodesize=39
## [Tune-y] 41: acc.test.mean=0.862; time: 0.8 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 42: ntree=271; mtry=4; nodesize=36
## [Tune-y] 42: acc.test.mean=0.862; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 43: ntree=288; mtry=9; nodesize=48
## [Tune-y] 43: acc.test.mean=0.861; time: 1.2 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 44: ntree=354; mtry=5; nodesize=19
## [Tune-y] 44: acc.test.mean=0.86; time: 1.3 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 45: ntree=87; mtry=7; nodesize=12
## [Tune-y] 45: acc.test.mean=0.856; time: 0.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 46: ntree=205; mtry=10; nodesize=42
## [Tune-y] 46: acc.test.mean=0.86; time: 0.9 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 47: ntree=74; mtry=8; nodesize=28
## [Tune-y] 47: acc.test.mean=0.859; time: 0.3 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 48: ntree=92; mtry=5; nodesize=16
## [Tune-y] 48: acc.test.mean=0.858; time: 0.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 49: ntree=117; mtry=4; nodesize=16
## [Tune-y] 49: acc.test.mean=0.859; time: 0.4 min; memory: 239Mb use, 1828Mb max
## [Tune-x] 50: ntree=398; mtry=8; nodesize=45
## [Tune-y] 50: acc.test.mean=0.861; time: 1.5 min; memory: 239Mb use, 1828Mb max
## [Tune] Result: ntree=140; mtry=4; nodesize=43 : acc.test.mean=0.862
rf_tune$x    
## $ntree
## [1] 140
## 
## $mtry
## [1] 4
## 
## $nodesize
## [1] 43
rf.tree <- setHyperPars(rf, par.vals = rf_tune$x)

rforest <- train(rf.tree, trainTask)

rf_tune <- predict(rforest, testTask)

rf_con_tune <- confusionMatrix(rf_tune$data$response, test3$income)

rf_tune_ACU <- rf_con_tune$overall[1]

rf_VS_rfTuned<- data.frame(Model=c("Random Forest Tuned","Random Forest"),Accuracy=c(rf_tune_ACU,rfAcu))

dotplot(Model ~ Accuracy, data = rf_VS_rfTuned,
  main = "RF VS. RF Tuned")

#After tuned we got much better Accuracy