Read data
# Load the census extract from the working directory.
# stringsAsFactors is set explicitly: R >= 4.0 no longer converts strings to
# factors by default, and the models below need over50k (and the other text
# columns) to be factors, as the recorded str() output shows.
Census <- read.csv("census.csv", stringsAsFactors = TRUE)
# Show each column's type and first few values.
str(Census)
## 'data.frame': 31978 obs. of 13 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : Factor w/ 9 levels " ?"," Federal-gov",..: 8 7 5 5 5 5 5 7 5 5 ...
## $ education : Factor w/ 16 levels " 10th"," 11th",..: 10 10 12 2 10 13 7 12 13 10 ...
## $ maritalstatus: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 15 levels " ?"," Adm-clerical",..: 2 5 7 7 11 5 9 5 11 5 ...
## $ relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capitalgain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capitalloss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hoursperweek : int 40 13 40 40 40 40 16 45 50 40 ...
## $ nativecountry: Factor w/ 41 levels " Cambodia"," Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
## $ over50k : Factor w/ 2 levels " <=50K"," >50K": 1 1 1 1 1 1 1 2 2 2 ...
Create test and training sets
library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
# Fix the seed so the random split is reproducible.
set.seed(2000)
# Spl is a logical flag per row; SplitRatio = 0.6 marks 60% of rows TRUE.
Spl <- sample.split(Y = Census$over50k, SplitRatio = 0.6)
# Flagged rows form the training set, the remaining rows the test set.
Train <- Census[Spl, ]
Test <- Census[!Spl, ]
Build logistic regression model using all other variables
# Fit a logistic regression (binomial family) of over50k on every other
# column of Train.
CensusLog <- glm(over50k ~ ., data=Train, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the coefficient table and deviance statistics of the fit.
summary(CensusLog)
##
## Call:
## glm(formula = over50k ~ ., family = binomial, data = Train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.1065 -0.5037 -0.1804 -0.0008 3.3383
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -8.658e+00 1.379e+00 -6.279
## age 2.548e-02 2.139e-03 11.916
## workclass Federal-gov 1.105e+00 2.014e-01 5.489
## workclass Local-gov 3.675e-01 1.821e-01 2.018
## workclass Never-worked -1.283e+01 8.453e+02 -0.015
## workclass Private 6.012e-01 1.626e-01 3.698
## workclass Self-emp-inc 7.575e-01 1.950e-01 3.884
## workclass Self-emp-not-inc 1.855e-01 1.774e-01 1.046
## workclass State-gov 4.012e-01 1.961e-01 2.046
## workclass Without-pay -1.395e+01 6.597e+02 -0.021
## education 11th 2.225e-01 2.867e-01 0.776
## education 12th 6.380e-01 3.597e-01 1.774
## education 1st-4th -7.075e-01 7.760e-01 -0.912
## education 5th-6th -3.170e-01 4.880e-01 -0.650
## education 7th-8th -3.498e-01 3.126e-01 -1.119
## education 9th -1.258e-01 3.539e-01 -0.355
## education Assoc-acdm 1.602e+00 2.427e-01 6.601
## education Assoc-voc 1.541e+00 2.368e-01 6.506
## education Bachelors 2.177e+00 2.218e-01 9.817
## education Doctorate 2.761e+00 2.893e-01 9.544
## education HS-grad 1.006e+00 2.169e-01 4.638
## education Masters 2.421e+00 2.353e-01 10.289
## education Preschool -2.237e+01 6.864e+02 -0.033
## education Prof-school 2.938e+00 2.753e-01 10.672
## education Some-college 1.365e+00 2.195e-01 6.219
## maritalstatus Married-AF-spouse 2.540e+00 7.145e-01 3.555
## maritalstatus Married-civ-spouse 2.458e+00 3.573e-01 6.880
## maritalstatus Married-spouse-absent -9.486e-02 3.204e-01 -0.296
## maritalstatus Never-married -4.515e-01 1.139e-01 -3.962
## maritalstatus Separated 3.609e-02 1.984e-01 0.182
## maritalstatus Widowed 1.858e-01 1.962e-01 0.947
## occupation Adm-clerical 9.470e-02 1.288e-01 0.735
## occupation Armed-Forces -1.008e+00 1.487e+00 -0.677
## occupation Craft-repair 2.174e-01 1.109e-01 1.960
## occupation Exec-managerial 9.400e-01 1.138e-01 8.257
## occupation Farming-fishing -1.068e+00 1.908e-01 -5.599
## occupation Handlers-cleaners -6.237e-01 1.946e-01 -3.204
## occupation Machine-op-inspct -1.862e-01 1.376e-01 -1.353
## occupation Other-service -8.183e-01 1.641e-01 -4.987
## occupation Priv-house-serv -1.297e+01 2.267e+02 -0.057
## occupation Prof-specialty 6.331e-01 1.222e-01 5.180
## occupation Protective-serv 6.267e-01 1.710e-01 3.664
## occupation Sales 3.276e-01 1.175e-01 2.789
## occupation Tech-support 6.173e-01 1.533e-01 4.028
## occupation Transport-moving NA NA NA
## relationship Not-in-family 7.881e-01 3.530e-01 2.233
## relationship Other-relative -2.194e-01 3.137e-01 -0.699
## relationship Own-child -7.489e-01 3.507e-01 -2.136
## relationship Unmarried 7.041e-01 3.720e-01 1.893
## relationship Wife 1.324e+00 1.331e-01 9.942
## race Asian-Pac-Islander 4.830e-01 3.548e-01 1.361
## race Black 3.644e-01 2.882e-01 1.265
## race Other 2.204e-01 4.513e-01 0.488
## race White 4.108e-01 2.737e-01 1.501
## sex Male 7.729e-01 1.024e-01 7.545
## capitalgain 3.280e-04 1.372e-05 23.904
## capitalloss 6.445e-04 4.854e-05 13.277
## hoursperweek 2.897e-02 2.101e-03 13.791
## nativecountry Canada 2.593e-01 1.308e+00 0.198
## nativecountry China -9.695e-01 1.327e+00 -0.730
## nativecountry Columbia -1.954e+00 1.526e+00 -1.280
## nativecountry Cuba 5.735e-02 1.323e+00 0.043
## nativecountry Dominican-Republic -1.435e+01 3.092e+02 -0.046
## nativecountry Ecuador -3.550e-02 1.477e+00 -0.024
## nativecountry El-Salvador -6.095e-01 1.395e+00 -0.437
## nativecountry England -6.707e-02 1.327e+00 -0.051
## nativecountry France 5.301e-01 1.419e+00 0.374
## nativecountry Germany 5.474e-02 1.306e+00 0.042
## nativecountry Greece -2.646e+00 1.714e+00 -1.544
## nativecountry Guatemala -1.293e+01 3.345e+02 -0.039
## nativecountry Haiti -9.221e-01 1.615e+00 -0.571
## nativecountry Holand-Netherlands -1.282e+01 2.400e+03 -0.005
## nativecountry Honduras -9.584e-01 3.412e+00 -0.281
## nativecountry Hong -2.362e-01 1.492e+00 -0.158
## nativecountry Hungary 1.412e-01 1.555e+00 0.091
## nativecountry India -8.218e-01 1.314e+00 -0.625
## nativecountry Iran -3.299e-02 1.366e+00 -0.024
## nativecountry Ireland 1.579e-01 1.473e+00 0.107
## nativecountry Italy 6.100e-01 1.333e+00 0.458
## nativecountry Jamaica -2.279e-01 1.387e+00 -0.164
## nativecountry Japan 5.072e-01 1.375e+00 0.369
## nativecountry Laos -6.831e-01 1.661e+00 -0.411
## nativecountry Mexico -9.182e-01 1.303e+00 -0.705
## nativecountry Nicaragua -1.987e-01 1.507e+00 -0.132
## nativecountry Outlying-US(Guam-USVI-etc) -1.373e+01 8.502e+02 -0.016
## nativecountry Peru -9.660e-01 1.678e+00 -0.576
## nativecountry Philippines 4.393e-02 1.281e+00 0.034
## nativecountry Poland 2.410e-01 1.383e+00 0.174
## nativecountry Portugal 7.276e-01 1.477e+00 0.493
## nativecountry Puerto-Rico -5.769e-01 1.357e+00 -0.425
## nativecountry Scotland -1.188e+00 1.719e+00 -0.691
## nativecountry South -8.183e-01 1.341e+00 -0.610
## nativecountry Taiwan -2.590e-01 1.350e+00 -0.192
## nativecountry Thailand -1.693e+00 1.737e+00 -0.975
## nativecountry Trinadad&Tobago -1.346e+00 1.721e+00 -0.782
## nativecountry United-States -8.594e-02 1.269e+00 -0.068
## nativecountry Vietnam -1.008e+00 1.523e+00 -0.662
## nativecountry Yugoslavia 1.402e+00 1.648e+00 0.851
## Pr(>|z|)
## (Intercept) 3.41e-10 ***
## age < 2e-16 ***
## workclass Federal-gov 4.03e-08 ***
## workclass Local-gov 0.043641 *
## workclass Never-worked 0.987885
## workclass Private 0.000218 ***
## workclass Self-emp-inc 0.000103 ***
## workclass Self-emp-not-inc 0.295646
## workclass State-gov 0.040728 *
## workclass Without-pay 0.983134
## education 11th 0.437738
## education 12th 0.076064 .
## education 1st-4th 0.361897
## education 5th-6th 0.516008
## education 7th-8th 0.263152
## education 9th 0.722228
## education Assoc-acdm 4.10e-11 ***
## education Assoc-voc 7.74e-11 ***
## education Bachelors < 2e-16 ***
## education Doctorate < 2e-16 ***
## education HS-grad 3.52e-06 ***
## education Masters < 2e-16 ***
## education Preschool 0.973996
## education Prof-school < 2e-16 ***
## education Some-college 5.00e-10 ***
## maritalstatus Married-AF-spouse 0.000378 ***
## maritalstatus Married-civ-spouse 6.00e-12 ***
## maritalstatus Married-spouse-absent 0.767155
## maritalstatus Never-married 7.42e-05 ***
## maritalstatus Separated 0.855672
## maritalstatus Widowed 0.343449
## occupation Adm-clerical 0.462064
## occupation Armed-Forces 0.498170
## occupation Craft-repair 0.049972 *
## occupation Exec-managerial < 2e-16 ***
## occupation Farming-fishing 2.15e-08 ***
## occupation Handlers-cleaners 0.001353 **
## occupation Machine-op-inspct 0.176061
## occupation Other-service 6.14e-07 ***
## occupation Priv-house-serv 0.954385
## occupation Prof-specialty 2.22e-07 ***
## occupation Protective-serv 0.000248 ***
## occupation Sales 0.005282 **
## occupation Tech-support 5.63e-05 ***
## occupation Transport-moving NA
## relationship Not-in-family 0.025562 *
## relationship Other-relative 0.484263
## relationship Own-child 0.032716 *
## relationship Unmarried 0.058392 .
## relationship Wife < 2e-16 ***
## race Asian-Pac-Islander 0.173504
## race Black 0.206001
## race Other 0.625263
## race White 0.133356
## sex Male 4.52e-14 ***
## capitalgain < 2e-16 ***
## capitalloss < 2e-16 ***
## hoursperweek < 2e-16 ***
## nativecountry Canada 0.842879
## nativecountry China 0.465157
## nativecountry Columbia 0.200470
## nativecountry Cuba 0.965432
## nativecountry Dominican-Republic 0.962972
## nativecountry Ecuador 0.980829
## nativecountry El-Salvador 0.662181
## nativecountry England 0.959686
## nativecountry France 0.708642
## nativecountry Germany 0.966572
## nativecountry Greece 0.122527
## nativecountry Guatemala 0.969180
## nativecountry Haiti 0.568105
## nativecountry Holand-Netherlands 0.995736
## nativecountry Honduras 0.778775
## nativecountry Hong 0.874155
## nativecountry Hungary 0.927653
## nativecountry India 0.531661
## nativecountry Iran 0.980736
## nativecountry Ireland 0.914628
## nativecountry Italy 0.647194
## nativecountry Jamaica 0.869467
## nativecountry Japan 0.712179
## nativecountry Laos 0.680866
## nativecountry Mexico 0.481103
## nativecountry Nicaragua 0.895132
## nativecountry Outlying-US(Guam-USVI-etc) 0.987115
## nativecountry Peru 0.564797
## nativecountry Philippines 0.972640
## nativecountry Poland 0.861624
## nativecountry Portugal 0.622327
## nativecountry Puerto-Rico 0.670837
## nativecountry Scotland 0.489616
## nativecountry South 0.541809
## nativecountry Taiwan 0.847878
## nativecountry Thailand 0.329678
## nativecountry Trinadad&Tobago 0.434105
## nativecountry United-States 0.946020
## nativecountry Vietnam 0.507799
## nativecountry Yugoslavia 0.394874
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 21175 on 19186 degrees of freedom
## Residual deviance: 12104 on 19090 degrees of freedom
## AIC: 12298
##
## Number of Fisher Scoring iterations: 15
Accuracy of model using threshold of 0.5
# Fitted probabilities for the test rows (type = "response" returns them on
# the probability scale).
PredictLog <- predict(object = CensusLog, newdata = Test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Rows are the actual class, columns flag probabilities above the 0.5 cut-off;
# the wrapping parentheses print the table as it is stored.
(confmat <- table(Test$over50k, PredictLog > 0.5))
##
## FALSE TRUE
## <=50K 9051 662
## >50K 1190 1888
# Accuracy: the diagonal of the table divided by the number of test rows.
N <- nrow(Test)
sum(diag(confmat)) / N
## [1] 0.8552107
Baseline accuracy on the test set (always predicting the most frequent class in the training set, <=50K):
# Class counts in the training set; the larger one is the baseline prediction.
(table(Train$over50k))
##
## <=50K >50K
## 14570 4617
# Share of test rows that belong to that majority class. Computed from the
# data instead of hard-coding the test-set count (9713), so it stays correct
# if the split or the data changes.
baselineClass <- names(which.max(table(Train$over50k)))
mean(Test$over50k == baselineClass)
## [1] 0.7593621
AUC for test set
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.1.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.1.3
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
# Pair the logistic-regression probabilities with the actual test labels.
pred <- prediction(PredictLog, Test$over50k)
# Area under the ROC curve, read from the y.values slot of the "auc" measure.
# (The tpr/fpr curve object that used to be built here was never plotted or
# read before being overwritten, so it is no longer computed.)
as.numeric(performance(pred, "auc")@y.values)
## [1] 0.9061598
Build CART model using all defaults
library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
# Grow a classification tree on every other column of Train, leaving all
# rpart control settings at their defaults, then draw the tree.
CensusCART <- rpart(formula = over50k ~ ., data = Train, method = "class")
prp(x = CensusCART)
Accuracy using a threshold of 0.5 (two ways to do the same thing):
# Way 1: let rpart return the predicted class label for each test row.
PredictCART <- predict(CensusCART, newdata = Test, type = "class")
confmat <- table(Test$over50k, PredictCART)
(sum(diag(confmat)) / N)
## [1] 0.8473927
# Way 2: take the second column of class probabilities and cut it at 0.5.
PredictCART <- predict(CensusCART, newdata = Test, type = "prob")[, 2]
(confmat <- table(Test$over50k, PredictCART > 0.5))
##
## FALSE TRUE
## <=50K 9243 470
## >50K 1482 1596
sum(diag(confmat)) / N
## [1] 0.8473927
Generate ROC curve
library(ROCR)
# Class-probability matrix for the test rows from the default CART model.
PredictROC <- predict(object = CensusCART, newdata = Test)
# Pair the second probability column with the actual labels for ROCR.
pred <- prediction(predictions = PredictROC[, 2], labels = Test$over50k)
# True-positive rate against false-positive rate, then draw the curve.
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
Compute AUC
# The "auc" measure keeps its single value in the y.values list; flatten it.
unlist(performance(pred, measure = "auc")@y.values)
## [1] 0.8470256
Make a smaller training set before running random forest because a large training set will require a lot of memory:
# Reproducibly draw 2000 distinct row numbers of Train and keep those rows.
set.seed(1)
TrainSmall <- Train[sample.int(nrow(Train), size = 2000), ]
Run random forest using all independent variables
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.3
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Seed again so the forest itself is reproducible, then fit it on the
# 2000-row sample using every other column as a predictor.
set.seed(1)
CensusForest <- randomForest(over50k ~ ., data = TrainSmall)
Accuracy of random forest, with threshold 0.5.
# Predicted class label for each test row.
PredictForest <- predict(object = CensusForest, newdata = Test)
# Actual class (rows) against predicted class (columns); parentheses print it.
(confmat <- table(Test$over50k, PredictForest))
## PredictForest
## <=50K >50K
## <=50K 9586 127
## >50K 1985 1093
# Accuracy: the diagonal of the table divided by the number of test rows.
sum(diag(confmat)) / nrow(Test)
## [1] 0.8348839
Find out the number of times, aggregated over all of the trees in the random forest model, that a certain variable is selected for a split:
# Per-variable split counts, totalled across the forest.
vu <- varUsed(CensusForest, count = TRUE)
# Ascending sort; index.return = TRUE also keeps the original positions ($ix)
# so each count can be matched back to its variable name.
vusorted <- sort(vu, decreasing = FALSE, index.return = TRUE)
# Dot chart of the sorted counts, labelled with the matching variable names.
dotchart(
  x = vusorted$x,
  labels = names(CensusForest$forest$xlevels[vusorted$ix])
)
Impurity measures how homogeneous each bucket or leaf of the tree is. Plot the variable importance of the forest:
# Draw randomForest's variable-importance plot for the fitted forest.
varImpPlot(CensusForest)
Select cp parameter of our CART model using k-fold cross validation, with k=10 folds and cp values from 0.002 to 0.1 in 0.002 increments.
# Seed first so the cross-validation run below is reproducible.
set.seed(2)
library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
## Warning: package 'e1071' was built under R version 3.1.3
# 10-fold cross-validation over cp = 0.002, 0.004, ..., 0.1.
kfolds <- 10
numFolds <- trainControl(method = "cv", number = kfolds)
cpGrid <- expand.grid(.cp = seq(from = 0.002, to = 0.1, by = 0.002))
# Parentheses print the fitted train object, i.e. the resampling table below.
(cpRes <- train(
  over50k ~ .,
  data = Train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
))
## CART
##
## 19187 samples
## 12 predictor
## 2 classes: ' <=50K', ' >50K'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 17268, 17269, 17268, 17268, 17268, 17269, ...
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Accuracy SD Kappa SD
## 0.002 0.8508375 0.55418892 0.0079783134 0.02572748
## 0.004 0.8483880 0.55337769 0.0090978011 0.02868258
## 0.006 0.8445315 0.54123823 0.0096617586 0.03233330
## 0.008 0.8435933 0.54042644 0.0094655587 0.03147254
## 0.010 0.8435933 0.54140269 0.0094655587 0.03136220
## 0.012 0.8435933 0.54140269 0.0094655587 0.03136220
## 0.014 0.8435933 0.54140269 0.0094655587 0.03136220
## 0.016 0.8428116 0.53681906 0.0103168751 0.03646328
## 0.018 0.8409353 0.52587747 0.0099473396 0.04023190
## 0.020 0.8401011 0.52059748 0.0082541575 0.03181982
## 0.022 0.8385896 0.50703191 0.0074531758 0.01922360
## 0.024 0.8385896 0.50703191 0.0074531758 0.01922360
## 0.026 0.8389023 0.50505134 0.0069620513 0.02144327
## 0.028 0.8389023 0.50505134 0.0069620513 0.02144327
## 0.030 0.8389023 0.50505134 0.0069620513 0.02144327
## 0.032 0.8362438 0.48904498 0.0058432854 0.01998146
## 0.034 0.8344199 0.47659522 0.0064123113 0.02443506
## 0.036 0.8330126 0.46868826 0.0056838776 0.02310071
## 0.038 0.8246742 0.43210364 0.0072908778 0.02753755
## 0.040 0.8240485 0.42930363 0.0065649913 0.02486563
## 0.042 0.8240485 0.42930363 0.0065649913 0.02486563
## 0.044 0.8240485 0.42930363 0.0065649913 0.02486563
## 0.046 0.8240485 0.42930363 0.0065649913 0.02486563
## 0.048 0.8240485 0.42930363 0.0065649913 0.02486563
## 0.050 0.8203998 0.40161015 0.0055555482 0.04997157
## 0.052 0.8161259 0.36716691 0.0070277942 0.06002374
## 0.054 0.8123214 0.32392652 0.0055354181 0.04550869
## 0.056 0.8123214 0.32392652 0.0055354181 0.04550869
## 0.058 0.8118524 0.30757892 0.0053132172 0.02549082
## 0.060 0.8118524 0.30757892 0.0053132172 0.02549082
## 0.062 0.8118524 0.30757892 0.0053132172 0.02549082
## 0.064 0.8095062 0.29517106 0.0063359730 0.03570646
## 0.066 0.8075771 0.28451429 0.0056423962 0.03432258
## 0.068 0.7981443 0.22868458 0.0055663233 0.03381141
## 0.070 0.7958514 0.21468972 0.0029555869 0.01620357
## 0.072 0.7958514 0.21468972 0.0029555869 0.01620357
## 0.074 0.7958514 0.21468972 0.0029555869 0.01620357
## 0.076 0.7728666 0.07956634 0.0174372406 0.10286117
## 0.078 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.080 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.082 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.084 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.086 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.088 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.090 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.092 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.094 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.096 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.098 0.7593684 0.00000000 0.0001912161 0.00000000
## 0.100 0.7593684 0.00000000 0.0001912161 0.00000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.002.
# Show the cp value that train() selected (printed as a one-row data frame).
cpRes$bestTune
## cp
## 1 0.002
Fit a CART model to the training data using this value of cp. What is the prediction accuracy on the test set?
# Refit the tree on the full training set with the cross-validated cp.
# bestTune is a one-row data frame, so pull out its cp column to hand rpart
# the plain number it expects instead of the data frame itself.
CensusCV <- rpart(over50k ~ ., data=Train, method="class", cp=cpRes$bestTune$cp)
# Predicted class labels for the test rows, cross-tabulated against the truth.
PredictCV <- predict(CensusCV, newdata=Test, type="class")
(confmat <- table(Test$over50k, PredictCV))
## PredictCV
## <=50K >50K
## <=50K 9178 535
## >50K 1240 1838
# Accuracy: the diagonal of the table divided by the number of test rows.
sum(diag(confmat)) / nrow(Test)
## [1] 0.8612306
Summarize the CART tree for the new model
# Print the cp table, variable importance and per-node split details of the
# tuned tree.
# NOTE(review): the outer parentheses also auto-print whatever summary()
# returns; confirm that second printout is wanted.
(summary(CensusCV))
## Call:
## rpart(formula = over50k ~ ., data = Train, method = "class",
## cp = cpRes$bestTune)
## n= 19187
##
## CP nsplit rel error xerror xstd
## 1 0.121832359 0 1.0000000 1.0000000 0.01282467
## 2 0.065627031 2 0.7563353 0.7686810 0.01164877
## 3 0.037470219 3 0.6907083 0.7115010 0.01130135
## 4 0.007580680 4 0.6532380 0.6560537 0.01093906
## 5 0.005956249 8 0.6229153 0.6419753 0.01084280
## 6 0.004331817 10 0.6110028 0.6343946 0.01079021
## 7 0.004223522 11 0.6066710 0.6235651 0.01071415
## 8 0.003465454 13 0.5982240 0.6166342 0.01066489
## 9 0.003248863 16 0.5869612 0.6116526 0.01062920
## 10 0.002165909 17 0.5837124 0.6086203 0.01060735
## 11 0.002000000 18 0.5815465 0.6045051 0.01057756
##
## Variable importance
## relationship maritalstatus capitalgain education occupation
## 24 24 12 10 10
## sex age hoursperweek capitalloss nativecountry
## 8 6 3 2 1
## workclass
## 1
##
## Node number 1: 19187 observations, complexity param=0.1218324
## predicted class= <=50K expected loss=0.2406317 P(node) =1
## class counts: 14570 4617
## probabilities: 0.759 0.241
## left son=2 (10478 obs) right son=3 (8709 obs)
## Primary splits:
## relationship splits as RLLLLR, improve=1391.9300, (0 missing)
## maritalstatus splits as LRRLLLL, improve=1375.6510, (0 missing)
## capitalgain < 5119 to the left, improve= 979.0779, (0 missing)
## education splits as LLLLLLLLLRRLRLRL, improve= 749.1199, (0 missing)
## occupation splits as LLLLRLLLLLRLLLL, improve= 694.4727, (0 missing)
## Surrogate splits:
## maritalstatus splits as LRRLLLL, agree=0.993, adj=0.984, (0 split)
## sex splits as LR, agree=0.685, adj=0.307, (0 split)
## age < 34.5 to the left, agree=0.647, adj=0.223, (0 split)
## occupation splits as LLLRRRLLLLRRLLR, agree=0.621, adj=0.165, (0 split)
## hoursperweek < 43.5 to the left, agree=0.604, adj=0.127, (0 split)
##
## Node number 2: 10478 observations, complexity param=0.03747022
## predicted class= <=50K expected loss=0.06699752 P(node) =0.5460989
## class counts: 9776 702
## probabilities: 0.933 0.067
## left son=4 (10297 obs) right son=5 (181 obs)
## Primary splits:
## capitalgain < 7565.5 to the left, improve=305.64730, (0 missing)
## education splits as LLLLLLLLLRRLRLRL, improve= 99.69222, (0 missing)
## occupation splits as LLLLRLLLLLRLLLL, improve= 80.12740, (0 missing)
## hoursperweek < 44.5 to the left, improve= 73.50345, (0 missing)
## age < 33.5 to the left, improve= 45.95414, (0 missing)
##
## Node number 3: 8709 observations, complexity param=0.1218324
## predicted class= <=50K expected loss=0.449535 P(node) =0.4539011
## class counts: 4794 3915
## probabilities: 0.550 0.450
## left son=6 (6158 obs) right son=7 (2551 obs)
## Primary splits:
## education splits as LLLLLLLLLRRLRLRL, improve=529.7869, (0 missing)
## occupation splits as LRRLRLLLLLRRRRL, improve=524.6148, (0 missing)
## capitalgain < 5095.5 to the left, improve=452.5539, (0 missing)
## capitalloss < 1782.5 to the left, improve=151.2895, (0 missing)
## age < 29.5 to the left, improve=132.6677, (0 missing)
## Surrogate splits:
## occupation splits as LLLLRLLLLLRLLLL, agree=0.792, adj=0.290, (0 split)
## capitalgain < 7493 to the left, agree=0.724, adj=0.057, (0 split)
## nativecountry splits as LLRLLLLLRLLLLL-RLLRRLLLRLLLLLRLLLLRRLLLLL, agree=0.715, adj=0.027, (0 split)
## capitalloss < 1894.5 to the left, agree=0.712, adj=0.018, (0 split)
## race splits as LRLLL, agree=0.709, adj=0.007, (0 split)
##
## Node number 4: 10297 observations
## predicted class= <=50K expected loss=0.05098572 P(node) =0.5366655
## class counts: 9772 525
## probabilities: 0.949 0.051
##
## Node number 5: 181 observations
## predicted class= >50K expected loss=0.02209945 P(node) =0.009433471
## class counts: 4 177
## probabilities: 0.022 0.978
##
## Node number 6: 6158 observations, complexity param=0.06562703
## predicted class= <=50K expected loss=0.3372848 P(node) =0.3209465
## class counts: 4081 2077
## probabilities: 0.663 0.337
## left son=12 (5847 obs) right son=13 (311 obs)
## Primary splits:
## capitalgain < 5095.5 to the left, improve=276.64820, (0 missing)
## occupation splits as LRLLRLLLLLRRRRL, improve=152.12600, (0 missing)
## education splits as LLLLLLLRR--R-L-R, improve=110.43240, (0 missing)
## age < 33.5 to the left, improve= 74.69885, (0 missing)
## capitalloss < 1846 to the left, improve= 71.75490, (0 missing)
##
## Node number 7: 2551 observations, complexity param=0.005956249
## predicted class= >50K expected loss=0.2794982 P(node) =0.1329546
## class counts: 713 1838
## probabilities: 0.279 0.721
## left son=14 (2151 obs) right son=15 (400 obs)
## Primary splits:
## capitalgain < 5095.5 to the left, improve=70.19274, (0 missing)
## occupation splits as LLRLRLLLL-RRRRL, improve=60.75980, (0 missing)
## hoursperweek < 31 to the left, improve=27.87522, (0 missing)
## capitalloss < 1782.5 to the left, improve=27.80474, (0 missing)
## age < 28.5 to the left, improve=22.51366, (0 missing)
## Surrogate splits:
## nativecountry splits as -LLLLLLLLLLR-L-LL-LLLLLL-L---LL-L-LLL-LLL, agree=0.844, adj=0.002, (0 split)
##
## Node number 12: 5847 observations, complexity param=0.00758068
## predicted class= <=50K expected loss=0.3027193 P(node) =0.3047376
## class counts: 4077 1770
## probabilities: 0.697 0.303
## left son=24 (3590 obs) right son=25 (2257 obs)
## Primary splits:
## occupation splits as LRLLRLLLLLRRRRL, improve=125.39530, (0 missing)
## education splits as LLLLLLLRR--R-L-R, improve= 94.31411, (0 missing)
## capitalloss < 1846 to the left, improve= 84.51002, (0 missing)
## age < 33.5 to the left, improve= 58.81017, (0 missing)
## workclass splits as LRLLLRLLL, improve= 42.38332, (0 missing)
## Surrogate splits:
## education splits as LLLLLLLRL--L-L-R, agree=0.657, adj=0.112, (0 split)
## workclass splits as LRRLLRLRL, agree=0.656, adj=0.109, (0 split)
## sex splits as RL, agree=0.637, adj=0.059, (0 split)
## relationship splits as L----R, agree=0.637, adj=0.058, (0 split)
## capitalloss < 1867.5 to the left, agree=0.617, adj=0.008, (0 split)
##
## Node number 13: 311 observations
## predicted class= >50K expected loss=0.01286174 P(node) =0.01620889
## class counts: 4 307
## probabilities: 0.013 0.987
##
## Node number 14: 2151 observations, complexity param=0.005956249
## predicted class= >50K expected loss=0.330079 P(node) =0.1121072
## class counts: 710 1441
## probabilities: 0.330 0.670
## left son=28 (407 obs) right son=29 (1744 obs)
## Primary splits:
## occupation splits as LLRLRLLLL-RRRRL, improve=56.62440, (0 missing)
## capitalloss < 1782.5 to the left, improve=41.78535, (0 missing)
## hoursperweek < 31 to the left, improve=26.08142, (0 missing)
## capitalgain < 3120 to the right, improve=17.16168, (0 missing)
## age < 61.5 to the right, improve=17.04793, (0 missing)
## Surrogate splits:
## workclass splits as LRR-RRRR-, agree=0.845, adj=0.182, (0 split)
## nativecountry splits as -RRRRRRRRRR--R-RR-RRRRRR-L---LR-R-RRL-RLL, agree=0.815, adj=0.020, (0 split)
## hoursperweek < 14.5 to the left, agree=0.814, adj=0.015, (0 split)
##
## Node number 15: 400 observations
## predicted class= >50K expected loss=0.0075 P(node) =0.02084745
## class counts: 3 397
## probabilities: 0.007 0.992
##
## Node number 24: 3590 observations, complexity param=0.003465454
## predicted class= <=50K expected loss=0.2206128 P(node) =0.1871059
## class counts: 2798 792
## probabilities: 0.779 0.221
## left son=48 (809 obs) right son=49 (2781 obs)
## Primary splits:
## education splits as LLLLLLLRR--R-L-R, improve=41.09426, (0 missing)
## capitalloss < 1782.5 to the left, improve=37.68295, (0 missing)
## occupation splits as L-LR-LLRLL----R, improve=28.21818, (0 missing)
## age < 30.5 to the left, improve=21.23068, (0 missing)
## nativecountry splits as -RLLRLLLRLRLLL--LRLRRLRRRLLLLLRLLLRLLLRRR, improve=17.57917, (0 missing)
## Surrogate splits:
## nativecountry splits as -RRRRLRRRRRRRR--LRLRRLRRRLRLRRRLRRRRRRRRR, agree=0.794, adj=0.088, (0 split)
## occupation splits as R-LR-RRRRR----R, agree=0.775, adj=0.001, (0 split)
##
## Node number 25: 2257 observations, complexity param=0.00758068
## predicted class= <=50K expected loss=0.4333186 P(node) =0.1176317
## class counts: 1279 978
## probabilities: 0.567 0.433
## left son=50 (561 obs) right son=51 (1696 obs)
## Primary splits:
## age < 33.5 to the left, improve=40.23592, (0 missing)
## capitalloss < 1846 to the left, improve=36.93343, (0 missing)
## occupation splits as -L--R-----RLLR-, improve=20.55118, (0 missing)
## education splits as LLRLLLLRR--L---R, improve=20.18098, (0 missing)
## hoursperweek < 33.5 to the left, improve=17.71678, (0 missing)
## Surrogate splits:
## nativecountry splits as RRRLRLRRRRRRL---LRRRLRRLLRL-LRRRRRLLRRRRR, agree=0.758, adj=0.025, (0 split)
## maritalstatus splits as -LR----, agree=0.752, adj=0.004, (0 split)
## race splits as RRRLR, agree=0.752, adj=0.004, (0 split)
##
## Node number 28: 407 observations, complexity param=0.004331817
## predicted class= <=50K expected loss=0.4324324 P(node) =0.02121228
## class counts: 231 176
## probabilities: 0.568 0.432
## left son=56 (379 obs) right son=57 (28 obs)
## Primary splits:
## capitalloss < 1794 to the left, improve=10.847490, (0 missing)
## hoursperweek < 32.5 to the left, improve= 6.976672, (0 missing)
## nativecountry splits as -LL-L--LL-R--R----LR-R---L---RR-L-LRL-LLR, improve= 6.064786, (0 missing)
## age < 57.5 to the right, improve= 5.129288, (0 missing)
## occupation splits as LR-R-LLRL-----L, improve= 4.227508, (0 missing)
##
## Node number 29: 1744 observations
## predicted class= >50K expected loss=0.274656 P(node) =0.09089488
## class counts: 479 1265
## probabilities: 0.275 0.725
##
## Node number 48: 809 observations
## predicted class= <=50K expected loss=0.08034611 P(node) =0.04216397
## class counts: 744 65
## probabilities: 0.920 0.080
##
## Node number 49: 2781 observations, complexity param=0.003465454
## predicted class= <=50K expected loss=0.2614168 P(node) =0.1449419
## class counts: 2054 727
## probabilities: 0.739 0.261
## left son=98 (2695 obs) right son=99 (86 obs)
## Primary splits:
## capitalloss < 1782.5 to the left, improve=32.003060, (0 missing)
## age < 29.5 to the left, improve=24.700110, (0 missing)
## occupation splits as R--R-LLRLL----R, improve=23.614270, (0 missing)
## hoursperweek < 31 to the left, improve=17.134710, (0 missing)
## nativecountry splits as -RLLRLLLRLRLLL--LRLRRRRRLLL-LRRRLLRLLLRRR, improve= 9.412996, (0 missing)
##
## Node number 50: 561 observations, complexity param=0.002165909
## predicted class= <=50K expected loss=0.2691622 P(node) =0.02923855
## class counts: 410 151
## probabilities: 0.731 0.269
## left son=100 (549 obs) right son=101 (12 obs)
## Primary splits:
## nativecountry splits as RRRLRL-LRRL-L---L-R-LL-RLLR-L---L-LLL-LL-, improve=10.282230, (0 missing)
## age < 27.5 to the left, improve= 9.426169, (0 missing)
## capitalloss < 1794 to the left, improve= 4.540000, (0 missing)
## workclass splits as -RR-LRLL-, improve= 4.531931, (0 missing)
## occupation splits as -L--R-----RLLR-, improve= 4.529803, (0 missing)
##
## Node number 51: 1696 observations, complexity param=0.00758068
## predicted class= <=50K expected loss=0.4876179 P(node) =0.08839318
## class counts: 869 827
## probabilities: 0.512 0.488
## left son=102 (1607 obs) right son=103 (89 obs)
## Primary splits:
## capitalloss < 1846 to the left, improve=30.06068, (0 missing)
## education splits as LLLLLLLRR--L---R, improve=21.06198, (0 missing)
## age < 61.5 to the right, improve=20.21799, (0 missing)
## hoursperweek < 33.5 to the left, improve=19.16285, (0 missing)
## occupation splits as -L--R-----RLLR-, improve=13.03722, (0 missing)
##
## Node number 56: 379 observations
## predicted class= <=50K expected loss=0.4010554 P(node) =0.01975296
## class counts: 227 152
## probabilities: 0.599 0.401
##
## Node number 57: 28 observations
## predicted class= >50K expected loss=0.1428571 P(node) =0.001459321
## class counts: 4 24
## probabilities: 0.143 0.857
##
## Node number 98: 2695 observations
## predicted class= <=50K expected loss=0.2478664 P(node) =0.1404597
## class counts: 2027 668
## probabilities: 0.752 0.248
##
## Node number 99: 86 observations, complexity param=0.003465454
## predicted class= >50K expected loss=0.3139535 P(node) =0.004482201
## class counts: 27 59
## probabilities: 0.314 0.686
## left son=198 (28 obs) right son=199 (58 obs)
## Primary splits:
## capitalloss < 1989.5 to the right, improve=24.499710, (0 missing)
## hoursperweek < 46.5 to the left, improve= 2.651758, (0 missing)
## workclass splits as RRL-RRR--, improve= 2.442533, (0 missing)
## education splits as -------RR--L---R, improve= 1.405456, (0 missing)
## age < 60 to the right, improve= 1.173496, (0 missing)
## Surrogate splits:
## hoursperweek < 27.5 to the left, agree=0.721, adj=0.143, (0 split)
## age < 60 to the right, agree=0.709, adj=0.107, (0 split)
## workclass splits as LRL-RRR--, agree=0.698, adj=0.071, (0 split)
## occupation splits as L--R-LRRR-----R, agree=0.698, adj=0.071, (0 split)
##
## Node number 100: 549 observations
## predicted class= <=50K expected loss=0.2550091 P(node) =0.02861312
## class counts: 409 140
## probabilities: 0.745 0.255
##
## Node number 101: 12 observations
## predicted class= >50K expected loss=0.08333333 P(node) =0.0006254235
## class counts: 1 11
## probabilities: 0.083 0.917
##
## Node number 102: 1607 observations, complexity param=0.00758068
## predicted class= <=50K expected loss=0.4654636 P(node) =0.08375463
## class counts: 859 748
## probabilities: 0.535 0.465
## left son=204 (758 obs) right son=205 (849 obs)
## Primary splits:
## education splits as LLLLLLLRR--L---R, improve=20.98485, (0 missing)
## age < 61.5 to the right, improve=19.96612, (0 missing)
## hoursperweek < 33.5 to the left, improve=18.93180, (0 missing)
## nativecountry splits as LRLLLLLLRRRL-----LLR-RL--L---LLRLLL-RLLLR, improve=11.64503, (0 missing)
## occupation splits as -L--R-----RLLR-, improve=11.62255, (0 missing)
## Surrogate splits:
## age < 52.5 to the right, agree=0.582, adj=0.115, (0 split)
## occupation splits as -L--R-----RLLR-, agree=0.569, adj=0.086, (0 split)
## hoursperweek < 33.5 to the left, agree=0.554, adj=0.055, (0 split)
## workclass splits as -RR-RRLLL, agree=0.546, adj=0.038, (0 split)
## nativecountry splits as LRLRLRLLLRRL-----LRL-LR--L---RRLRLL-RRRLR, agree=0.544, adj=0.034, (0 split)
##
## Node number 103: 89 observations
## predicted class= >50K expected loss=0.1123596 P(node) =0.004638557
## class counts: 10 79
## probabilities: 0.112 0.888
##
## Node number 198: 28 observations
## predicted class= <=50K expected loss=0.1428571 P(node) =0.001459321
## class counts: 24 4
## probabilities: 0.857 0.143
##
## Node number 199: 58 observations
## predicted class= >50K expected loss=0.05172414 P(node) =0.00302288
## class counts: 3 55
## probabilities: 0.052 0.948
##
## Node number 204: 758 observations
## predicted class= <=50K expected loss=0.3799472 P(node) =0.03950592
## class counts: 470 288
## probabilities: 0.620 0.380
##
## Node number 205: 849 observations, complexity param=0.004223522
## predicted class= >50K expected loss=0.4581861 P(node) =0.04424871
## class counts: 389 460
## probabilities: 0.458 0.542
## left son=410 (15 obs) right son=411 (834 obs)
## Primary splits:
## capitalloss < 1512 to the right, improve=8.965266, (0 missing)
## age < 63.5 to the right, improve=8.663766, (0 missing)
## hoursperweek < 33.5 to the left, improve=8.001880, (0 missing)
## workclass splits as -RR-RRLL-, improve=5.670179, (0 missing)
## nativecountry splits as -R-LRL-R-RR-------L--RL--L---LL-L---RLL-R, improve=5.106393, (0 missing)
##
## Node number 410: 15 observations
## predicted class= <=50K expected loss=0 P(node) =0.0007817793
## class counts: 15 0
## probabilities: 1.000 0.000
##
## Node number 411: 834 observations, complexity param=0.004223522
## predicted class= >50K expected loss=0.4484412 P(node) =0.04346693
## class counts: 374 460
## probabilities: 0.448 0.552
## left son=822 (60 obs) right son=823 (774 obs)
## Primary splits:
## hoursperweek < 33.5 to the left, improve=8.182485, (0 missing)
## age < 63.5 to the right, improve=7.641514, (0 missing)
## workclass splits as -RR-RRLL-, improve=5.132825, (0 missing)
## nativecountry splits as -R-LRL-R-RR-------L--RL--L---LL-L---RLL-R, improve=4.859651, (0 missing)
## capitalgain < 3120 to the right, improve=4.278247, (0 missing)
## Surrogate splits:
## nativecountry splits as -R-RLR-R-RR-------R--RR--L---RR-R---RRR-R, agree=0.932, adj=0.05, (0 split)
##
## Node number 822: 60 observations
## predicted class= <=50K expected loss=0.3 P(node) =0.003127117
## class counts: 42 18
## probabilities: 0.700 0.300
##
## Node number 823: 774 observations, complexity param=0.003248863
## predicted class= >50K expected loss=0.4289406 P(node) =0.04033981
## class counts: 332 442
## probabilities: 0.429 0.571
## left son=1646 (101 obs) right son=1647 (673 obs)
## Primary splits:
## workclass splits as -RR-RRLL-, improve=4.905793, (0 missing)
## age < 63.5 to the right, improve=4.901891, (0 missing)
## nativecountry splits as -R-L-L-R-RR-------L--RL--L---LL-L---RLL-R, improve=3.644741, (0 missing)
## capitalgain < 3120 to the right, improve=3.372428, (0 missing)
## race splits as LRRRR, improve=3.216752, (0 missing)
## Surrogate splits:
## capitalgain < 4699.5 to the right, agree=0.871, adj=0.01, (0 split)
## hoursperweek < 91.5 to the right, agree=0.871, adj=0.01, (0 split)
## nativecountry splits as -R-R-R-R-RR-------R--RR--R---RR-R---RLR-R, agree=0.871, adj=0.01, (0 split)
##
## Node number 1646: 101 observations
## predicted class= <=50K expected loss=0.4257426 P(node) =0.005263981
## class counts: 58 43
## probabilities: 0.574 0.426
##
## Node number 1647: 673 observations
## predicted class= >50K expected loss=0.4071322 P(node) =0.03507583
## class counts: 274 399
## probabilities: 0.407 0.593
## n= 19187
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 19187 4617 <=50K (0.75936832 0.24063168)
## 2) relationship= Not-in-family, Other-relative, Own-child, Unmarried 10478 702 <=50K (0.93300248 0.06699752)
## 4) capitalgain< 7565.5 10297 525 <=50K (0.94901428 0.05098572) *
## 5) capitalgain>=7565.5 181 4 >50K (0.02209945 0.97790055) *
## 3) relationship= Husband, Wife 8709 3915 <=50K (0.55046504 0.44953496)
## 6) education= 10th, 11th, 12th, 1st-4th, 5th-6th, 7th-8th, 9th, Assoc-acdm, Assoc-voc, HS-grad, Preschool, Some-college 6158 2077 <=50K (0.66271517 0.33728483)
## 12) capitalgain< 5095.5 5847 1770 <=50K (0.69728066 0.30271934)
## 24) occupation= ?, Armed-Forces, Craft-repair, Farming-fishing, Handlers-cleaners, Machine-op-inspct, Other-service, Priv-house-serv, Transport-moving 3590 792 <=50K (0.77938719 0.22061281)
## 48) education= 10th, 11th, 12th, 1st-4th, 5th-6th, 7th-8th, 9th, Preschool 809 65 <=50K (0.91965389 0.08034611) *
## 49) education= Assoc-acdm, Assoc-voc, HS-grad, Some-college 2781 727 <=50K (0.73858324 0.26141676)
## 98) capitalloss< 1782.5 2695 668 <=50K (0.75213358 0.24786642) *
## 99) capitalloss>=1782.5 86 27 >50K (0.31395349 0.68604651)
## 198) capitalloss>=1989.5 28 4 <=50K (0.85714286 0.14285714) *
## 199) capitalloss< 1989.5 58 3 >50K (0.05172414 0.94827586) *
## 25) occupation= Adm-clerical, Exec-managerial, Prof-specialty, Protective-serv, Sales, Tech-support 2257 978 <=50K (0.56668144 0.43331856)
## 50) age< 33.5 561 151 <=50K (0.73083779 0.26916221)
## 100) nativecountry= Columbia, Dominican-Republic, El-Salvador, Germany, Guatemala, Hong, Ireland, Italy, Laos, Mexico, Peru, Puerto-Rico, South, Taiwan, Thailand, United-States, Vietnam 549 140 <=50K (0.74499089 0.25500911) *
## 101) nativecountry= Cambodia, Canada, China, Cuba, England, France, India, Japan, Nicaragua 12 1 >50K (0.08333333 0.91666667) *
## 51) age>=33.5 1696 827 <=50K (0.51238208 0.48761792)
## 102) capitalloss< 1846 1607 748 <=50K (0.53453640 0.46546360)
## 204) education= 10th, 11th, 12th, 1st-4th, 5th-6th, 7th-8th, 9th, HS-grad 758 288 <=50K (0.62005277 0.37994723) *
## 205) education= Assoc-acdm, Assoc-voc, Some-college 849 389 >50K (0.45818610 0.54181390)
## 410) capitalloss>=1512 15 0 <=50K (1.00000000 0.00000000) *
## 411) capitalloss< 1512 834 374 >50K (0.44844125 0.55155875)
## 822) hoursperweek< 33.5 60 18 <=50K (0.70000000 0.30000000) *
## 823) hoursperweek>=33.5 774 332 >50K (0.42894057 0.57105943)
## 1646) workclass= Self-emp-not-inc, State-gov 101 43 <=50K (0.57425743 0.42574257) *
## 1647) workclass= Federal-gov, Local-gov, Private, Self-emp-inc 673 274 >50K (0.40713224 0.59286776) *
## 103) capitalloss>=1846 89 10 >50K (0.11235955 0.88764045) *
## 13) capitalgain>=5095.5 311 4 >50K (0.01286174 0.98713826) *
## 7) education= Bachelors, Doctorate, Masters, Prof-school 2551 713 >50K (0.27949824 0.72050176)
## 14) capitalgain< 5095.5 2151 710 >50K (0.33007903 0.66992097)
## 28) occupation= ?, Adm-clerical, Craft-repair, Farming-fishing, Handlers-cleaners, Machine-op-inspct, Other-service, Transport-moving 407 176 <=50K (0.56756757 0.43243243)
## 56) capitalloss< 1794 379 152 <=50K (0.59894459 0.40105541) *
## 57) capitalloss>=1794 28 4 >50K (0.14285714 0.85714286) *
## 29) occupation= Armed-Forces, Exec-managerial, Prof-specialty, Protective-serv, Sales, Tech-support 1744 479 >50K (0.27465596 0.72534404) *
## 15) capitalgain>=5095.5 400 3 >50K (0.00750000 0.99250000) *
# Plot the CensusCV model with prp().
# NOTE(review): prp() is presumably rpart.plot::prp, and CensusCV presumably
# the rpart tree whose node listing is printed just above; the library() call
# and the model fit are outside this excerpt -- confirm both.
prp(CensusCV)