Objective:

Predict if people in the data set belong in a certain class by salary, either making <=50k or >50k per year by comparing three models : Logistics Regression, Decision Tree and RandomForest.

Import Data

suppressWarnings(library(readr))
adult <- read_csv("C:/Users/dannyhuang/Desktop/adult.csv")

## Parsed with column specification:
## cols(
##   age = col_integer(),
##   workclass = col_character(),
##   fnlwgt = col_integer(),
##   education = col_character(),
##   `educational-num` = col_integer(),
##   `marital-status` = col_character(),
##   occupation = col_character(),
##   relationship = col_character(),
##   race = col_character(),
##   gender = col_character(),
##   `capital-gain` = col_integer(),
##   `capital-loss` = col_integer(),
##   `hours-per-week` = col_integer(),
##   `native-country` = col_character(),
##   income = col_character()
## )

head(adult,10)

## # A tibble: 10 × 15
##      age        workclass fnlwgt    education `educational-num`
##    <int>            <chr>  <int>        <chr>             <int>
## 1     25          Private 226802         11th                 7
## 2     38          Private  89814      HS-grad                 9
## 3     28        Local-gov 336951   Assoc-acdm                12
## 4     44          Private 160323 Some-college                10
## 5     18                ? 103497 Some-college                10
## 6     34          Private 198693         10th                 6
## 7     29                ? 227026      HS-grad                 9
## 8     63 Self-emp-not-inc 104626  Prof-school                15
## 9     24          Private 369667 Some-college                10
## 10    55          Private 104996      7th-8th                 4
## # ... with 10 more variables: `marital-status` <chr>, occupation <chr>,
## #   relationship <chr>, race <chr>, gender <chr>, `capital-gain` <int>,
## #   `capital-loss` <int>, `hours-per-week` <int>, `native-country` <chr>,
## #   income <chr>

str(adult)

## Classes 'tbl_df', 'tbl' and 'data.frame':    48842 obs. of  15 variables:
##  $ age            : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass      : chr  "Private" "Private" "Local-gov" "Private" ...
##  $ fnlwgt         : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ education      : chr  "11th" "HS-grad" "Assoc-acdm" "Some-college" ...
##  $ educational-num: int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital-status : chr  "Never-married" "Married-civ-spouse" "Married-civ-spouse" "Married-civ-spouse" ...
##  $ occupation     : chr  "Machine-op-inspct" "Farming-fishing" "Protective-serv" "Machine-op-inspct" ...
##  $ relationship   : chr  "Own-child" "Husband" "Husband" "Husband" ...
##  $ race           : chr  "Black" "White" "White" "Black" ...
##  $ gender         : chr  "Male" "Male" "Male" "Male" ...
##  $ capital-gain   : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital-loss   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours-per-week : int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native-country : chr  "United-States" "United-States" "United-States" "United-States" ...
##  $ income         : chr  "<=50K" "<=50K" ">50K" ">50K" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 15
##   .. ..$ age            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ workclass      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ fnlwgt         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ education      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ educational-num: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ marital-status : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ occupation     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ relationship   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ race           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ gender         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ capital-gain   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ capital-loss   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ hours-per-week : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ native-country : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ income         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

_Data Cleaning

table(adult$workclass)

## 
##                ?      Federal-gov        Local-gov     Never-worked 
##             2799             1432             3136               10 
##          Private     Self-emp-inc Self-emp-not-inc        State-gov 
##            33906             1695             3862             1981 
##      Without-pay 
##               21

### 2799 NULL Values

Combined ‘Never-worked’ and ‘Without-pay’ into Unemployed

unemployed <- function(job){
    job <- as.character(job)
    if (job=='Never-worked' | job=='Without-pay'){
        return('Unemployed')
    }else{
        return(job)
    }
}

adult$workclass <- sapply(adult$workclass,unemployed)
table(adult$workclass)

## 
##                ?      Federal-gov        Local-gov          Private 
##             2799             1432             3136            33906 
##     Self-emp-inc Self-emp-not-inc        State-gov       Unemployed 
##             1695             3862             1981               31

Combine State and Local gov jobs into a category called SL-gov and combine self-employed jobs into a category called self-emp

group_emp <- function(job){
    if (job=='Local-gov' | job=='State-gov'){
        return('SL-gov')
    }else if (job=='Self-emp-inc' | job=='Self-emp-not-inc'){
        return('self-emp')
    }else{
        return(job)
    }
}


adult$workclass <- sapply(adult$workclass,group_emp)
table(adult$workclass)

## 
##           ? Federal-gov     Private    self-emp      SL-gov  Unemployed 
##        2799        1432       33906        5557        5117          31

table(adult$`marital-status`)

## 
##              Divorced     Married-AF-spouse    Married-civ-spouse 
##                  6633                    37                 22379 
## Married-spouse-absent         Never-married             Separated 
##                   628                 16117                  1530 
##               Widowed 
##                  1518

Reduce martial status group into three groups : Married, Not-Married and Never-Married

group_marital <- function(mar){
    mar <- as.character(mar)
    
    # Not-Married
    if (mar=='Separated' | mar=='Divorced' | mar=='Widowed'){
        return('Not-Married')
    
    # Never-Married   
    }else if(mar=='Never-married'){
        return(mar)
    
     #Married
    }else{
        return('Married')
    }
}

adult$`marital-status` <- sapply(adult$`marital-status` ,group_marital)
table(adult$`marital-status`)

## 
##       Married Never-married   Not-Married 
##         23044         16117          9681

Catgorized ‘country’ into region

table(adult$country)

## Warning: Unknown column 'country'

## < table of extent 0 >

unique(adult$`native-country`)

##  [1] "United-States"              "?"                         
##  [3] "Peru"                       "Guatemala"                 
##  [5] "Mexico"                     "Dominican-Republic"        
##  [7] "Ireland"                    "Germany"                   
##  [9] "Philippines"                "Thailand"                  
## [11] "Haiti"                      "El-Salvador"               
## [13] "Puerto-Rico"                "Vietnam"                   
## [15] "South"                      "Columbia"                  
## [17] "Japan"                      "India"                     
## [19] "Cambodia"                   "Poland"                    
## [21] "Laos"                       "England"                   
## [23] "Cuba"                       "Taiwan"                    
## [25] "Italy"                      "Canada"                    
## [27] "Portugal"                   "China"                     
## [29] "Nicaragua"                  "Honduras"                  
## [31] "Iran"                       "Scotland"                  
## [33] "Jamaica"                    "Ecuador"                   
## [35] "Yugoslavia"                 "Hungary"                   
## [37] "Hong"                       "Greece"                    
## [39] "Trinadad&Tobago"            "Outlying-US(Guam-USVI-etc)"
## [41] "France"                     "Holand-Netherlands"

Asia <- c('China','Hong','India','Iran','Cambodia','Japan', 'Laos' ,
          'Philippines' ,'Vietnam' ,'Taiwan', 'Thailand')

North.America <- c('Canada','United-States','Puerto-Rico' )

Europe <- c('England' ,'France', 'Germany' ,'Greece','Holand-Netherlands','Hungary',
            'Ireland','Italy','Poland','Portugal','Scotland','Yugoslavia')

Latin.and.South.America <- c('Columbia','Cuba','Dominican-Republic','Ecuador',
                             'El-Salvador','Guatemala','Haiti','Honduras',
                             'Mexico','Nicaragua','Outlying-US(Guam-USVI-etc)','Peru',
                            'Jamaica','Trinadad&Tobago')
Other <- c('South')

group_country <- function(ctry){
    if (ctry %in% Asia){
        return('Asia')
    }else if (ctry %in% North.America){
        return('North.America')
    }else if (ctry %in% Europe){
        return('Europe')
    }else if (ctry %in% Latin.and.South.America){
        return('Latin.and.South.America')
    }else{
        return('Other')      
    }
}

adult$`native-country` <- sapply(adult$`native-country`,group_country)
table(adult$`native-country`)

## 
##                    Asia                  Europe Latin.and.South.America 
##                     981                     780                    1911 
##           North.America                   Other 
##                   44198                     972

Factorize the newly created column

str(adult)

## Classes 'tbl_df', 'tbl' and 'data.frame':    48842 obs. of  15 variables:
##  $ age            : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass      : chr  "Private" "Private" "SL-gov" "Private" ...
##  $ fnlwgt         : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ education      : chr  "11th" "HS-grad" "Assoc-acdm" "Some-college" ...
##  $ educational-num: int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital-status : chr  "Never-married" "Married" "Married" "Married" ...
##  $ occupation     : chr  "Machine-op-inspct" "Farming-fishing" "Protective-serv" "Machine-op-inspct" ...
##  $ relationship   : chr  "Own-child" "Husband" "Husband" "Husband" ...
##  $ race           : chr  "Black" "White" "White" "Black" ...
##  $ gender         : chr  "Male" "Male" "Male" "Male" ...
##  $ capital-gain   : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital-loss   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours-per-week : int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native-country : chr  "North.America" "North.America" "North.America" "North.America" ...
##  $ income         : chr  "<=50K" "<=50K" ">50K" ">50K" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 15
##   .. ..$ age            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ workclass      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ fnlwgt         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ education      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ educational-num: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ marital-status : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ occupation     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ relationship   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ race           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ gender         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ capital-gain   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ capital-loss   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ hours-per-week : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ native-country : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ income         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

adult$workclass <- sapply(adult$workclass,factor)
adult$`native-country` <- sapply(adult$`native-country`,factor)
adult$`marital-status` <- sapply(adult$`marital-status`,factor)

Missing Data

adult[adult == '4'] <- NA

library(Amelia)

## Warning: package 'Amelia' was built under R version 3.3.2

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

table(adult$workclass)

## 
##     Private      SL-gov           ?    self-emp Federal-gov  Unemployed 
##       33906        5117        2799        5557        1432          31

Factorize all the column except INT column

adult$workclass <- sapply(adult$workclass,factor)
adult$`native-country` <- sapply(adult$`native-country`,factor)
adult$`marital-status` <- sapply(adult$`marital-status`,factor)
adult$occupation <- sapply(adult$occupation,factor)
adult$relationship <- sapply(adult$relationship,factor)
adult$income <- sapply(adult$income,factor)
adult$education<- sapply(adult$education,factor)
adult$gender<- sapply(adult$gender,factor)
adult$race<- sapply(adult$race,factor)

Omit the missing values

adult <- na.omit(adult)

Check the map again

missmap(adult)

## Warning in if (class(obj) == "amelia") {: 條件的長度 > 1，因此只能用其第一
## 元素

## Warning: Unknown column 'arguments'

## Warning: Unknown column 'arguments'

## Warning: Unknown column 'imputations'

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.3.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.3.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

A Quick View about hours-per-week by histogram

A Quick View about relationship between “region” and “income”

Preprocess data into Train and Test

library(caTools)

## Warning: package 'caTools' was built under R version 3.3.2

sample <- sample.split(adult$income, SplitRatio = 0.70) # SplitRatio = percent of sample==TRUE

# Training Data
train = subset(adult, sample == TRUE)

# Testing Data 
test = subset(adult, sample == FALSE) #for logistics model
test2= test # for Desicison Tree
test3=test # for RandomForst

Logistics Model

suppressWarnings(library(caret))

## Loading required package: lattice

glm_model = glm(income ~ ., family = binomial(logit), data = train)
test$predicted.income = predict(glm_model, newdata=test, type="response")
set.seed(1)
test$income_class <- ifelse(test$predicted.income > 0.5, ">50K","<=50K")
glm_con <-confusionMatrix(test$income_class,test$income)
glm_con

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 10077  1359
##      >50K    779  2127
##                                          
##                Accuracy : 0.8509         
##                  95% CI : (0.845, 0.8567)
##     No Information Rate : 0.7569         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5706         
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9282         
##             Specificity : 0.6102         
##          Pos Pred Value : 0.8812         
##          Neg Pred Value : 0.7319         
##              Prevalence : 0.7569         
##          Detection Rate : 0.7026         
##    Detection Prevalence : 0.7974         
##       Balanced Accuracy : 0.7692         
##                                          
##        'Positive' Class : <=50K          
##

suppressWarnings(library(caTools))

Calculate the optimal probability threshold for Sensitivity

suppressWarnings(library(caTools))
colAUC(test$predicted.income,test$income, plotROC = TRUE)

##                     [,1]
## <=50K vs. >50K 0.9077893

Decision Tree

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 3.3.2

## Loading required package: rpart

## Warning: package 'rpart' was built under R version 3.3.2

suppressWarnings(library(rpart))
suppressWarnings(library(ROCR))

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

names(train) <- make.names(names(train))
names(test2) <- make.names(names(test2))
tree_model <- rpart(income ~ ., train, method = "class")

all_probs <- predict(tree_model, test2, type = "prob")

test2$income_class <- ifelse(all_probs[,1]>0.5,"<=50K",">50K")
dt_con <- confusionMatrix(test2$income_class,test2$income)
dt_con

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 10281  1663
##      >50K    575  1823
##                                           
##                Accuracy : 0.844           
##                  95% CI : (0.8379, 0.8499)
##     No Information Rate : 0.7569          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5257          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9470          
##             Specificity : 0.5229          
##          Pos Pred Value : 0.8608          
##          Neg Pred Value : 0.7602          
##              Prevalence : 0.7569          
##          Detection Rate : 0.7168          
##    Detection Prevalence : 0.8328          
##       Balanced Accuracy : 0.7350          
##                                           
##        'Positive' Class : <=50K           
##

RandomForest

suppressWarnings(library(randomForest))

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

names(train) <- make.names(names(train))
names(test3) <- make.names(names(test3))
set.seed(32423)
rfFit<- randomForest(income~.,data= train)

print(rfFit)

## 
## Call:
##  randomForest(formula = income ~ ., data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 13.72%
## Confusion matrix:
##       <=50K >50K class.error
## <=50K 23777 1554  0.06134776
## >50K   3038 5095  0.37353990

rf_pred <- predict(rfFit,test3,type = "class")
rf_con<- confusionMatrix(rf_pred, test3$income)
rf_con

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 10181  1265
##      >50K    675  2221
##                                          
##                Accuracy : 0.8647         
##                  95% CI : (0.859, 0.8703)
##     No Information Rate : 0.7569         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.61           
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9378         
##             Specificity : 0.6371         
##          Pos Pred Value : 0.8895         
##          Neg Pred Value : 0.7669         
##              Prevalence : 0.7569         
##          Detection Rate : 0.7099         
##    Detection Prevalence : 0.7981         
##       Balanced Accuracy : 0.7875         
##                                          
##        'Positive' Class : <=50K          
##

Performance Comparison

glmAcu <- glm_con$overall[1]
dtAcu<- dt_con$overall[1]
rfAcu<- rf_con$overall[1]

ACU <- data.frame(Model=c("Decision Tree","Logistic Regression","Random Forest"),Accuracy=c(dtAcu,glmAcu,rfAcu))
ggplot(ACU,aes(x=Model,y=Accuracy,fill=Model))+geom_bar(stat = 'identity')+theme_bw()+ggtitle('Accuracies of Models')

#It shows that, based on Accuracy, Random Forest Model is a relatively better.

Tune Random Forest

suppressWarnings(library(mlr))

## Loading required package: ParamHelpers

## 
## Attaching package: 'mlr'

## The following object is masked from 'package:ROCR':
## 
##     performance

## The following object is masked from 'package:caret':
## 
##     train

getParamSet("classif.randomForest")

##                      Type  len   Def   Constr Req Tunable Trafo
## ntree             integer    -   500 1 to Inf   -    TRUE     -
## mtry              integer    -     - 1 to Inf   -    TRUE     -
## replace           logical    -  TRUE        -   -    TRUE     -
## classwt     numericvector <NA>     - 0 to Inf   -    TRUE     -
## cutoff      numericvector <NA>     -   0 to 1   -    TRUE     -
## strata            untyped    -     -        -   -   FALSE     -
## sampsize    integervector <NA>     - 1 to Inf   -    TRUE     -
## nodesize          integer    -     1 1 to Inf   -    TRUE     -
## maxnodes          integer    -     - 1 to Inf   -    TRUE     -
## importance        logical    - FALSE        -   -    TRUE     -
## localImp          logical    - FALSE        -   -    TRUE     -
## proximity         logical    - FALSE        -   -   FALSE     -
## oob.prox          logical    -     -        -   Y   FALSE     -
## norm.votes        logical    -  TRUE        -   -   FALSE     -
## do.trace          logical    - FALSE        -   -   FALSE     -
## keep.forest       logical    -  TRUE        -   -   FALSE     -
## keep.inbag        logical    - FALSE        -   -   FALSE     -

trainTask <- makeClassifTask(data = train,target = "income")
testTask <- makeClassifTask(data = test3, target = "income")

rf <- makeLearner("classif.randomForest", predict.type = "response", par.vals = list(ntree = 200, mtry = 3))
rf_param <- makeParamSet(
makeIntegerParam("ntree",lower = 50, upper = 500),
makeIntegerParam("mtry", lower = 3, upper = 10),
makeIntegerParam("nodesize", lower = 10, upper = 50)
)

rancontrol <- makeTuneControlRandom(maxit = 50L)
                                    
set_cv <- makeResampleDesc("CV",iters = 4L)

rf_tune <- tuneParams(learner = rf, resampling = set_cv, task = trainTask, par.set = rf_param, control = rancontrol, measures = acc)

## [Tune] Started tuning learner classif.randomForest for parameter set:

##             Type len Def    Constr Req Tunable Trafo
## ntree    integer   -   - 50 to 500   -    TRUE     -
## mtry     integer   -   -   3 to 10   -    TRUE     -
## nodesize integer   -   -  10 to 50   -    TRUE     -

## With control class: TuneControlRandom

## Imputation value: -0

## [Tune-x] 1: ntree=233; mtry=9; nodesize=43

## [Tune-y] 1: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 2: ntree=133; mtry=10; nodesize=27

## [Tune-y] 2: acc.test.mean=0.859; time: 0.7 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 3: ntree=72; mtry=6; nodesize=22

## [Tune-y] 3: acc.test.mean=0.859; time: 0.3 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 4: ntree=452; mtry=7; nodesize=26

## [Tune-y] 4: acc.test.mean=0.859; time: 1.8 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 5: ntree=482; mtry=5; nodesize=11

## [Tune-y] 5: acc.test.mean=0.858; time: 1.9 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 6: ntree=66; mtry=8; nodesize=26

## [Tune-y] 6: acc.test.mean=0.858; time: 0.3 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 7: ntree=460; mtry=7; nodesize=47

## [Tune-y] 7: acc.test.mean=0.861; time: 1.7 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 8: ntree=377; mtry=7; nodesize=42

## [Tune-y] 8: acc.test.mean=0.861; time: 1.4 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 9: ntree=199; mtry=6; nodesize=29

## [Tune-y] 9: acc.test.mean=0.86; time: 0.7 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 10: ntree=226; mtry=3; nodesize=27

## [Tune-y] 10: acc.test.mean=0.862; time: 0.7 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 11: ntree=221; mtry=5; nodesize=42

## [Tune-y] 11: acc.test.mean=0.862; time: 0.7 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 12: ntree=61; mtry=7; nodesize=38

## [Tune-y] 12: acc.test.mean=0.86; time: 0.2 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 13: ntree=270; mtry=5; nodesize=19

## [Tune-y] 13: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 14: ntree=241; mtry=9; nodesize=24

## [Tune-y] 14: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 15: ntree=194; mtry=9; nodesize=15

## [Tune-y] 15: acc.test.mean=0.858; time: 0.9 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 16: ntree=253; mtry=5; nodesize=27

## [Tune-y] 16: acc.test.mean=0.86; time: 0.9 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 17: ntree=471; mtry=3; nodesize=23

## [Tune-y] 17: acc.test.mean=0.862; time: 1.4 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 18: ntree=161; mtry=10; nodesize=15

## [Tune-y] 18: acc.test.mean=0.857; time: 0.8 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 19: ntree=209; mtry=7; nodesize=41

## [Tune-y] 19: acc.test.mean=0.862; time: 0.8 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 20: ntree=435; mtry=9; nodesize=30

## [Tune-y] 20: acc.test.mean=0.86; time: 1.8 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 21: ntree=140; mtry=4; nodesize=43

## [Tune-y] 21: acc.test.mean=0.862; time: 0.4 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 22: ntree=154; mtry=6; nodesize=31

## [Tune-y] 22: acc.test.mean=0.86; time: 0.6 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 23: ntree=136; mtry=5; nodesize=25

## [Tune-y] 23: acc.test.mean=0.859; time: 0.5 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 24: ntree=457; mtry=8; nodesize=29

## [Tune-y] 24: acc.test.mean=0.859; time: 1.8 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 25: ntree=263; mtry=6; nodesize=39

## [Tune-y] 25: acc.test.mean=0.862; time: 0.9 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 26: ntree=316; mtry=4; nodesize=32

## [Tune-y] 26: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 27: ntree=97; mtry=10; nodesize=32

## [Tune-y] 27: acc.test.mean=0.859; time: 0.4 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 28: ntree=122; mtry=9; nodesize=27

## [Tune-y] 28: acc.test.mean=0.859; time: 0.5 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 29: ntree=259; mtry=6; nodesize=18

## [Tune-y] 29: acc.test.mean=0.86; time: 1.0 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 30: ntree=242; mtry=6; nodesize=49

## [Tune-y] 30: acc.test.mean=0.861; time: 0.8 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 31: ntree=227; mtry=8; nodesize=32

## [Tune-y] 31: acc.test.mean=0.859; time: 0.9 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 32: ntree=469; mtry=9; nodesize=41

## [Tune-y] 32: acc.test.mean=0.861; time: 1.9 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 33: ntree=317; mtry=4; nodesize=40

## [Tune-y] 33: acc.test.mean=0.861; time: 1.0 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 34: ntree=170; mtry=4; nodesize=33

## [Tune-y] 34: acc.test.mean=0.86; time: 0.6 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 35: ntree=473; mtry=7; nodesize=16

## [Tune-y] 35: acc.test.mean=0.859; time: 1.9 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 36: ntree=479; mtry=6; nodesize=26

## [Tune-y] 36: acc.test.mean=0.86; time: 1.7 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 37: ntree=347; mtry=10; nodesize=17

## [Tune-y] 37: acc.test.mean=0.858; time: 1.6 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 38: ntree=67; mtry=4; nodesize=48

## [Tune-y] 38: acc.test.mean=0.861; time: 0.2 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 39: ntree=326; mtry=5; nodesize=19

## [Tune-y] 39: acc.test.mean=0.86; time: 1.2 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 40: ntree=492; mtry=3; nodesize=32

## [Tune-y] 40: acc.test.mean=0.861; time: 1.5 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 41: ntree=242; mtry=4; nodesize=39

## [Tune-y] 41: acc.test.mean=0.862; time: 0.8 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 42: ntree=271; mtry=4; nodesize=36

## [Tune-y] 42: acc.test.mean=0.862; time: 0.9 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 43: ntree=288; mtry=9; nodesize=48

## [Tune-y] 43: acc.test.mean=0.861; time: 1.2 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 44: ntree=354; mtry=5; nodesize=19

## [Tune-y] 44: acc.test.mean=0.86; time: 1.3 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 45: ntree=87; mtry=7; nodesize=12

## [Tune-y] 45: acc.test.mean=0.856; time: 0.4 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 46: ntree=205; mtry=10; nodesize=42

## [Tune-y] 46: acc.test.mean=0.86; time: 0.9 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 47: ntree=74; mtry=8; nodesize=28

## [Tune-y] 47: acc.test.mean=0.859; time: 0.3 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 48: ntree=92; mtry=5; nodesize=16

## [Tune-y] 48: acc.test.mean=0.858; time: 0.4 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 49: ntree=117; mtry=4; nodesize=16

## [Tune-y] 49: acc.test.mean=0.859; time: 0.4 min; memory: 239Mb use, 1828Mb max

## [Tune-x] 50: ntree=398; mtry=8; nodesize=45

## [Tune-y] 50: acc.test.mean=0.861; time: 1.5 min; memory: 239Mb use, 1828Mb max

## [Tune] Result: ntree=140; mtry=4; nodesize=43 : acc.test.mean=0.862

rf_tune$x

## $ntree
## [1] 140
## 
## $mtry
## [1] 4
## 
## $nodesize
## [1] 43

rf.tree <- setHyperPars(rf, par.vals = rf_tune$x)

rforest <- train(rf.tree, trainTask)

rf_tune <- predict(rforest, testTask)

rf_con_tune <- confusionMatrix(rf_tune$data$response, test3$income)

rf_tune_ACU <- rf_con_tune$overall[1]

rf_VS_rfTuned<- data.frame(Model=c("Random Forest Tuned","Random Forest"),Accuracy=c(rf_tune_ACU,rfAcu))

dotplot(Model ~ Accuracy, data = rf_VS_rfTuned,
  main = "RF VS. RF Tuned")

#After tuned we got much better Accuracy

Machine Learning : Whose income will be lower than 50K or higer than 50K

JHONG TING HUANG

2017年1月2日

Objective:

Import Data

_Data Cleaning

Combined ‘Never-worked’ and ‘Without-pay’ into Unemployed

Combine State and Local gov jobs into a category called SL-gov and combine self-employed jobs into a category called self-emp

Reduce martial status group into three groups : Married, Not-Married and Never-Married

Catgorized ‘country’ into region

Factorize the newly created column

Missing Data

Factorize all the column except INT column

Omit the missing values

Check the map again

A Quick View about hours-per-week by histogram

A Quick View about relationship between “region” and “income”

Preprocess data into Train and Test

Logistics Model

Calculate the optimal probability threshold for Sensitivity

Decision Tree

RandomForest

Performance Comparison

Tune Random Forest