library(rpart)
library(mlr)
## Loading required package: ParamHelpers
## Warning: replacing previous import 'BBmisc::isFALSE' by
## 'backports::isFALSE' when loading 'mlr'
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
setwd("/Volumes/Data/DATA")
adults <- read.csv('adult.csv')
summarizeColumns(adults)
## name type na mean disp median mad
## 1 age integer 0 38.58165 1.364043e+01 37 14.8260
## 2 workclass factor 0 NA 3.029698e-01 NA NA
## 3 fnlwgt integer 0 189778.36651 1.055500e+05 178356 88798.8444
## 4 education factor 0 NA 6.774976e-01 NA NA
## 5 education.num integer 0 10.08068 2.572720e+00 10 1.4826
## 6 marital.status factor 0 NA 5.400633e-01 NA NA
## 7 occupation factor 0 NA 8.728540e-01 NA NA
## 8 relationship factor 0 NA 5.948220e-01 NA NA
## 9 race factor 0 NA 1.457265e-01 NA NA
## 10 sex factor 0 NA 3.307945e-01 NA NA
## 11 capital.gain integer 0 1077.64884 7.385292e+03 0 0.0000
## 12 capital.loss integer 0 87.30383 4.029602e+02 0 0.0000
## 13 hours.per.week integer 0 40.43746 1.234743e+01 40 4.4478
## 14 native.country factor 0 NA 1.041430e-01 NA NA
## 15 income factor 0 NA 2.408096e-01 NA NA
## min max nlevs
## 1 17 90 0
## 2 7 22696 9
## 3 12285 1484705 0
## 4 51 10501 16
## 5 1 16 0
## 6 23 14976 7
## 7 9 4140 15
## 8 981 13193 6
## 9 271 27816 5
## 10 10771 21790 2
## 11 0 99999 0
## 12 0 4356 0
## 13 1 99 0
## 14 1 29170 42
## 15 7841 24720 2
adults$capital <- adults$capital.gain - adults$capital.loss # Combine the two capital features into one net-capital feature
adults$capital.gain <- NULL # Drop the original columns
adults$capital.loss <- NULL
classif.task <- makeClassifTask(id = 'income', data = adults, target = 'income', positive = '>50K')
classif.task
## Supervised task: income
## Type: classif
## Target: income
## Observations: 32561
## Features:
## numerics factors ordered
## 5 8 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Classes: 2
## <=50K >50K
## 24720 7841
## Positive class: >50K
classif.task <- removeConstantFeatures(classif.task)
classif.task <- dropFeatures(classif.task, c('native.country'))
getTaskFeatureNames(classif.task)
## [1] "age" "workclass" "fnlwgt" "education"
## [5] "education.num" "marital.status" "occupation" "relationship"
## [9] "race" "sex" "hours.per.week" "capital"
classif.lrn <- makeLearner(cl = 'classif.rpart', predict.type = 'prob')
classif.lrn
## Learner classif.rpart from package rpart
## Type: classif
## Name: Decision Tree; Short name: rpart
## Class: classif.rpart
## Properties: twoclass,multiclass,missings,numerics,factors,ordered,prob,weights,featimp
## Predict-Type: prob
## Hyperparameters: xval=0
model <- train(classif.lrn, classif.task)
model
## Model for learner.id=classif.rpart; learner.class=classif.rpart
## Trained on: task.id = income; obs = 32561; features = 12
## Hyperparameters: xval=0
tree <- getLearnerModel(model)
tree
## n= 32561
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 32561 7841 <=50K (0.75919044 0.24080956)
## 2) relationship=Not-in-family,Other-relative,Own-child,Unmarried 17800 1178 <=50K (0.93382022 0.06617978)
## 4) capital< 7073.5 17482 872 <=50K (0.95012012 0.04987988) *
## 5) capital>=7073.5 318 12 >50K (0.03773585 0.96226415) *
## 3) relationship=Husband,Wife 14761 6663 <=50K (0.54860782 0.45139218)
## 6) education=10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,HS-grad,Preschool,Some-college 10329 3456 <=50K (0.66540807 0.33459193)
## 12) capital< 5095.5 9807 2944 <=50K (0.69980626 0.30019374) *
## 13) capital>=5095.5 522 10 >50K (0.01915709 0.98084291) *
## 7) education=Bachelors,Doctorate,Masters,Prof-school 4432 1225 >50K (0.27639892 0.72360108) *
library(rpart.plot)
rpart.plot(tree)
pred <- predict(model, newdata = adults) # Predict on the training data itself
head(getPredictionProbabilities(pred))
## [1] 0.04987988 0.04987988 0.04987988 0.04987988 0.04987988 0.04987988
performance(pred)
## mmce
## 0.1554928
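performance() reports mmce (mean misclassification error) by default. Since the learner predicts probabilities, we could also request other built-in measures; a quick sketch (output not shown):
# Accuracy and AUC alongside the default misclassification error
performance(pred, measures = list(mmce, acc, auc))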
calculateConfusionMatrix(pred)
## predicted
## true <=50K >50K -err.-
## <=50K 23473 1247 1247
## >50K 3816 4025 3816
## -err.- 3816 1247 5063
pred$threshold
## <=50K >50K
## 0.5 0.5
pred2 <- setThreshold(pred, 0.25)
pred2$threshold
## <=50K >50K
## 0.75 0.25
performance(pred2)
## mmce
## 0.2758515
With the same model but a different threshold, we get two different mmce values. In the second prediction (pred2), I lowered the threshold for the positive class ('>50K') to 0.25 so that the model flags more people as high earners; this catches more of the richer cases, at the cost of a higher overall misclassification rate.
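We can make that trade-off visible by comparing the confusion matrix at the new threshold with the one above (a sketch; output not shown):
# More observations are now predicted as '>50K', so fewer positives
# are missed but more '<=50K' cases are misclassified.
calculateConfusionMatrix(pred2)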
To find the optimal threshold, we can evaluate the performance over the whole range of thresholds and plot it:
op <- generateThreshVsPerfData(pred, measures = mmce)
plotThreshVsPerf(op)
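The object returned by generateThreshVsPerfData also stores the per-threshold performance in its data slot, so we can read off the minimizing threshold numerically (a sketch; output not shown):
# Row of the threshold grid with the smallest misclassification error
op$data[which.min(op$data$mmce), ]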
Resampling draws repeated samples from the training data and refits the model of interest on each sample, to obtain more information about the fitted model.
In predictive analytics, what we actually want to estimate and minimize is the test error, not the training error.
set.seed(124)
rdesc <- makeResampleDesc("CV", iters = 3, predict = 'both') # predict = 'both' gives us both test and training error rates
# "CV" stands for cross-validation
r <- resample('classif.rpart', classif.task, rdesc)
## [Resample] cross-validation iter 1:
## mmce.test.mean=0.156
## [Resample] cross-validation iter 2:
## mmce.test.mean=0.156
## [Resample] cross-validation iter 3:
## mmce.test.mean=0.151
## [Resample] Aggr. Result: mmce.test.mean=0.154
r$aggr
## mmce.test.mean
## 0.1543258
r$measures.test
## iter mmce
## 1 1 0.1558872
## 2 2 0.1560859
## 3 3 0.1510042
r$measures.train
## iter mmce
## 1 1 0.1552955
## 2 2 0.1551962
## 3 3 0.1489381
If you change the seed in set.seed, the mmce values will change slightly, because the cross-validation folds are drawn differently.
The test mmce varies only a little across the three iterations, so the cross-validated estimate of the prediction error is fairly stable.
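To make the estimate even less sensitive to any single random split, one option is to average over several cross-validation runs with repeated CV; a minimal sketch using mlr's "RepCV" resample description (variable names are illustrative):
# Repeated cross-validation: 5 repetitions of 3-fold CV, so the
# aggregated mmce is averaged over 15 test folds instead of 3.
rdesc.rep <- makeResampleDesc("RepCV", reps = 5, folds = 3)
r.rep <- resample('classif.rpart', classif.task, rdesc.rep)
r.rep$aggr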
Mai Nguyen