library(rpart)
library(mlr)
## Loading required package: ParamHelpers
## Warning: replacing previous import 'BBmisc::isFALSE' by
## 'backports::isFALSE' when loading 'mlr'
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats

setwd("/Volumes/Data/DATA") # set the working directory to where adult.csv lives
adults <- read.csv('adult.csv')
summarizeColumns(adults)
##              name    type na         mean         disp median        mad
## 1             age integer  0     38.58165 1.364043e+01     37    14.8260
## 2       workclass  factor  0           NA 3.029698e-01     NA         NA
## 3          fnlwgt integer  0 189778.36651 1.055500e+05 178356 88798.8444
## 4       education  factor  0           NA 6.774976e-01     NA         NA
## 5   education.num integer  0     10.08068 2.572720e+00     10     1.4826
## 6  marital.status  factor  0           NA 5.400633e-01     NA         NA
## 7      occupation  factor  0           NA 8.728540e-01     NA         NA
## 8    relationship  factor  0           NA 5.948220e-01     NA         NA
## 9            race  factor  0           NA 1.457265e-01     NA         NA
## 10            sex  factor  0           NA 3.307945e-01     NA         NA
## 11   capital.gain integer  0   1077.64884 7.385292e+03      0     0.0000
## 12   capital.loss integer  0     87.30383 4.029602e+02      0     0.0000
## 13 hours.per.week integer  0     40.43746 1.234743e+01     40     4.4478
## 14 native.country  factor  0           NA 1.041430e-01     NA         NA
## 15         income  factor  0           NA 2.408096e-01     NA         NA
##      min     max nlevs
## 1     17      90     0
## 2      7   22696     9
## 3  12285 1484705     0
## 4     51   10501    16
## 5      1      16     0
## 6     23   14976     7
## 7      9    4140    15
## 8    981   13193     6
## 9    271   27816     5
## 10 10771   21790     2
## 11     0   99999     0
## 12     0    4356     0
## 13     1      99     0
## 14     1   29170    42
## 15  7841   24720     2
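
Note that the na column above is 0 for every feature, even though the UCI adult data typically codes missing values as the string '?' rather than as NA, so any missing entries are hiding inside the factor levels. A quick check (a sketch, assuming that convention):

sapply(Filter(is.factor, adults), function(x) sum(x == '?')) # count '?' entries per factor column
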
adults$capital <- adults$capital.gain - adults$capital.loss # combine the two capital features into one net figure
adults$capital.gain <- NULL # drop the original columns
adults$capital.loss <- NULL
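
Since the tidyverse is already loaded, the same feature engineering could also be written as a single dplyr pipeline (an equivalent sketch of the three lines above):

adults <- adults %>%
  mutate(capital = capital.gain - capital.loss) %>% # net capital
  select(-capital.gain, -capital.loss)              # drop the originals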

Learning task

classif.task <- makeClassifTask(id = 'income', data = adults, target = 'income', positive = '>50K')
classif.task
## Supervised task: income
## Type: classif
## Target: income
## Observations: 32561
## Features:
## numerics  factors  ordered 
##        5        8        0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Classes: 2
## <=50K  >50K 
## 24720  7841 
## Positive class: >50K
classif.task <- removeConstantFeatures(classif.task)
classif.task <- dropFeatures(classif.task, c('native.country'))
getTaskFeatureNames(classif.task)
##  [1] "age"            "workclass"      "fnlwgt"         "education"     
##  [5] "education.num"  "marital.status" "occupation"     "relationship"  
##  [9] "race"           "sex"            "hours.per.week" "capital"
classif.lrn <- makeLearner(cl = 'classif.rpart', predict.type = 'prob')
classif.lrn
## Learner classif.rpart from package rpart
## Type: classif
## Name: Decision Tree; Short name: rpart
## Class: classif.rpart
## Properties: twoclass,multiclass,missings,numerics,factors,ordered,prob,weights,featimp
## Predict-Type: prob
## Hyperparameters: xval=0
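
The Hyperparameters line shows that mlr disables rpart's internal cross-validation (xval=0) by default. Other rpart parameters can be set the same way when the learner is created; a sketch with illustrative, untuned values:

lrn2 <- makeLearner('classif.rpart', predict.type = 'prob',
                    par.vals = list(maxdepth = 5, minsplit = 50, cp = 0.005))
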
model <- train(classif.lrn, classif.task)
model
## Model for learner.id=classif.rpart; learner.class=classif.rpart
## Trained on: task.id = income; obs = 32561; features = 12
## Hyperparameters: xval=0
tree <- getLearnerModel(model)
tree
## n= 32561 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 32561 7841 <=50K (0.75919044 0.24080956)  
##    2) relationship=Not-in-family,Other-relative,Own-child,Unmarried 17800 1178 <=50K (0.93382022 0.06617978)  
##      4) capital< 7073.5 17482  872 <=50K (0.95012012 0.04987988) *
##      5) capital>=7073.5 318   12 >50K (0.03773585 0.96226415) *
##    3) relationship=Husband,Wife 14761 6663 <=50K (0.54860782 0.45139218)  
##      6) education=10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,HS-grad,Preschool,Some-college 10329 3456 <=50K (0.66540807 0.33459193)  
##       12) capital< 5095.5 9807 2944 <=50K (0.69980626 0.30019374) *
##       13) capital>=5095.5 522   10 >50K (0.01915709 0.98084291) *
##      7) education=Bachelors,Doctorate,Masters,Prof-school 4432 1225 >50K (0.27639892 0.72360108) *
library(rpart.plot)
rpart.plot(tree)

pred <- predict(model, newdata = adults)
head(getPredictionProbabilities(pred))
## [1] 0.04987988 0.04987988 0.04987988 0.04987988 0.04987988 0.04987988
performance(pred)
##      mmce 
## 0.1554928
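
performance() reports the task's default measure (mmce, the mean misclassification error) unless told otherwise; several measures can be requested in one call. A sketch (auc is available because we set predict.type = 'prob'):

performance(pred, measures = list(mmce, acc, auc)) # error rate, accuracy, area under the ROC curve
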
calculateConfusionMatrix(pred)
##         predicted
## true     <=50K >50K -err.-
##   <=50K  23473 1247   1247
##   >50K    3816 4025   3816
##   -err.-  3816 1247   5063
pred$threshold
## <=50K  >50K 
##   0.5   0.5
pred2 <- setThreshold(pred, 0.25) 
pred2$threshold
## <=50K  >50K 
##  0.75  0.25
performance(pred2)
##      mmce 
## 0.2758515

With the same model but a different threshold, we get two different mmce values. In the second prediction (pred2), I lowered the threshold for the positive class from 0.5 to 0.25 so that more observations are classified as >50K, which increases our chances of correctly identifying the richer individuals, at the cost of a higher overall error rate.
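
We can verify this trade-off by comparing the confusion matrix of pred2 against the one shown above; the lower threshold should move observations from the <=50K column into the >50K column:

calculateConfusionMatrix(pred2) # more predicted >50K, fewer missed rich individuals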

To find the optimal threshold, we can do the following:

op <- generateThreshVsPerfData(pred, measures = mmce)
plotThreshVsPerf(op)
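
plotThreshVsPerf draws mmce across the whole threshold range. To read off the minimizing threshold directly, we can inspect the underlying data; a sketch, assuming op$data holds one row per evaluated threshold with columns threshold and mmce:

op$data[which.min(op$data$mmce), ] # threshold with the lowest mmce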

Resampling

Resampling draws repeated samples from the training data and refits the model of interest on each sample, in order to obtain more information about the fitted model.

In predictive analytics, what we want to assess and minimize is the test error, not the training error.

set.seed(124)
rdesc <- makeResampleDesc("CV", iters = 3, predict = 'both') # predict = 'both' returns both training and test error rates
# 'CV' stands for cross-validation
r <- resample('classif.rpart', classif.task, rdesc)
## [Resample] cross-validation iter 1:
## mmce.test.mean=0.156
## [Resample] cross-validation iter 2:
## mmce.test.mean=0.156
## [Resample] cross-validation iter 3:
## mmce.test.mean=0.151
## [Resample] Aggr. Result: mmce.test.mean=0.154
r$aggr
## mmce.test.mean 
##      0.1543258
r$measures.test
##   iter      mmce
## 1    1 0.1558872
## 2    2 0.1560859
## 3    3 0.1510042
r$measures.train
##   iter      mmce
## 1    1 0.1552955
## 2    2 0.1551962
## 3    3 0.1489381

If you change the value passed to set.seed(), the mmce values will change, because the cross-validation folds are drawn differently.

In the test-set results, the mmce varies only slightly across the three folds, dipping a little in the third one; such small fold-to-fold differences suggest the cross-validated error estimate is fairly stable.
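
For an even more stable estimate, repeated cross-validation averages the error over several random fold assignments; a sketch using mlr's RepCV description (the fold and repetition counts here are illustrative):

rdesc.rep <- makeResampleDesc('RepCV', folds = 3, reps = 5, stratify = TRUE) # stratify keeps the class ratio in every fold
r.rep <- resample('classif.rpart', classif.task, rdesc.rep)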

Mai Nguyen