主要議題:使用普查資料預測收入
學習重點:
caTools::colAUC()caTools::colAUC()rm(list=ls(all=T))
options(digits=4, scipen=12)
library(dplyr)
library(rpart)
library(rpart.plot)
library(caret)
library(randomForest)
library(caTools)
source('DPP.R')
D = read.csv('data/census.csv')
table(D$over50)
<=50K >50K
24283 7695
table(D$over50) %>% prop.table # Acc.base = 0.7594
<=50K >50K
0.7594 0.2406
Let’s begin by building a logistic regression model to predict whether an individual’s earnings are above $50,000 (the variable “over50k”) using all of the other variables as independent variables. First, read the dataset census.csv into R.
Then, split the data randomly into a training set and a testing set, setting the seed to 2000 before creating the split. Split the data so that the training set contains 60% of the observations, while the testing set contains 40% of the observations.
Next, build a logistic regression model to predict the dependent variable “over50k”, using all of the other variables in the dataset as independent variables. Use the training set to build the model.
Which variables are significant, or have factors that are significant? (Use 0.1 as your significance threshold, so variables with a period or dot in the stars column should be counted too. You might see a warning message here - you can ignore it and proceed. This message is a warning that we might be overfitting our model to the training set.) Select all that apply.
set.seed(2000)
spl = sample.split(D$over50k, SplitRatio = 0.6)
TR = subset(D, spl)
TS = subset(D, !spl)
glm1 = glm(over50k ~ ., TR, family=binomial)
glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm1)
Call:
glm(formula = over50k ~ ., family = binomial, data = TR)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.107 -0.504 -0.180 -0.001 3.338
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value
(Intercept) -8.6580686 1.3788706 -6.28
age 0.0254838 0.0021386 11.92
workclass Federal-gov 1.1054468 0.2013806 5.49
workclass Local-gov 0.3674591 0.1821340 2.02
workclass Never-worked -12.8346355 845.2523702 -0.02
workclass Private 0.6011672 0.1625780 3.70
workclass Self-emp-inc 0.7575120 0.1950482 3.88
workclass Self-emp-not-inc 0.1855059 0.1773792 1.05
workclass State-gov 0.4012276 0.1960758 2.05
workclass Without-pay -13.9465612 659.7417182 -0.02
education 11th 0.2224997 0.2867198 0.78
education 12th 0.6380314 0.3596574 1.77
education 1st-4th -0.7075223 0.7759998 -0.91
education 5th-6th -0.3169764 0.4880227 -0.65
education 7th-8th -0.3498391 0.3126433 -1.12
education 9th -0.1258224 0.3539479 -0.36
education Assoc-acdm 1.6018145 0.2426784 6.60
education Assoc-voc 1.5407709 0.2368386 6.51
education Bachelors 2.1771055 0.2217585 9.82
education Doctorate 2.7609054 0.2892933 9.54
education HS-grad 1.0059548 0.2168943 4.64
education Masters 2.4209952 0.2353036 10.29
education Preschool -22.3738158 686.3835140 -0.03
education Prof-school 2.9379640 0.2752976 10.67
education Some-college 1.3651010 0.2194962 6.22
maritalstatus Married-AF-spouse 2.5398125 0.7144642 3.55
maritalstatus Married-civ-spouse 2.4577534 0.3572546 6.88
maritalstatus Married-spouse-absent -0.0948616 0.3203725 -0.30
maritalstatus Never-married -0.4514599 0.1139338 -3.96
maritalstatus Separated 0.0360919 0.1984310 0.18
maritalstatus Widowed 0.1858398 0.1961635 0.95
occupation Adm-clerical 0.0947036 0.1287693 0.74
occupation Armed-Forces -1.0075457 1.4874332 -0.68
occupation Craft-repair 0.2173818 0.1108975 1.96
occupation Exec-managerial 0.9400239 0.1138446 8.26
occupation Farming-fishing -1.0682985 0.1907972 -5.60
occupation Handlers-cleaners -0.6236839 0.1946320 -3.20
occupation Machine-op-inspct -0.1861551 0.1375888 -1.35
occupation Other-service -0.8183427 0.1641061 -4.99
occupation Priv-house-serv -12.9680365 226.7111870 -0.06
occupation Prof-specialty 0.6331276 0.1222333 5.18
occupation Protective-serv 0.6267195 0.1710320 3.66
occupation Sales 0.3276305 0.1174584 2.79
occupation Tech-support 0.6172622 0.1532519 4.03
occupation Transport-moving NA NA NA
relationship Not-in-family 0.7881330 0.3529788 2.23
relationship Other-relative -0.2194104 0.3136846 -0.70
relationship Own-child -0.7488937 0.3506796 -2.14
relationship Unmarried 0.7040592 0.3719778 1.89
relationship Wife 1.3235292 0.1331228 9.94
race Asian-Pac-Islander 0.4829511 0.3548419 1.36
race Black 0.3644091 0.2881529 1.26
race Other 0.2204231 0.4513125 0.49
race White 0.4107806 0.2736717 1.50
sex Male 0.7729257 0.1024396 7.55
capitalgain 0.0003280 0.0000137 23.90
capitalloss 0.0006445 0.0000485 13.28
hoursperweek 0.0289687 0.0021006 13.79
nativecountry Canada 0.2592983 1.3081815 0.20
nativecountry China -0.9694567 1.3273303 -0.73
nativecountry Columbia -1.9536188 1.5260114 -1.28
nativecountry Cuba 0.0573462 1.3232329 0.04
nativecountry Dominican-Republic -14.3541804 309.1918510 -0.05
nativecountry Ecuador -0.0355005 1.4773834 -0.02
nativecountry El-Salvador -0.6094544 1.3949399 -0.44
nativecountry England -0.0670676 1.3268340 -0.05
nativecountry France 0.5300878 1.4185608 0.37
nativecountry Germany 0.0547429 1.3062787 0.04
nativecountry Greece -2.6462729 1.7136241 -1.54
nativecountry Guatemala -12.9256999 334.5490941 -0.04
nativecountry Haiti -0.9221282 1.6153771 -0.57
nativecountry Holand-Netherlands -12.8233705 2399.5450821 -0.01
nativecountry Honduras -0.9584148 3.4117488 -0.28
nativecountry Hong -0.2362308 1.4915130 -0.16
nativecountry Hungary 0.1412328 1.5554598 0.09
nativecountry India -0.8218220 1.3139233 -0.63
nativecountry Iran -0.0329858 1.3660665 -0.02
nativecountry Ireland 0.1578963 1.4728709 0.11
nativecountry Italy 0.6100024 1.3328606 0.46
nativecountry Jamaica -0.2279150 1.3868928 -0.16
nativecountry Japan 0.5072432 1.3748989 0.37
nativecountry Laos -0.6830937 1.6608892 -0.41
nativecountry Mexico -0.9181782 1.3032487 -0.70
nativecountry Nicaragua -0.1986816 1.5072985 -0.13
nativecountry Outlying-US(Guam-USVI-etc) -13.7304783 850.1773422 -0.02
nativecountry Peru -0.9659994 1.6778652 -0.58
nativecountry Philippines 0.0439341 1.2809516 0.03
nativecountry Poland 0.2410229 1.3827481 0.17
nativecountry Portugal 0.7275811 1.4771572 0.49
nativecountry Puerto-Rico -0.5768595 1.3573180 -0.42
nativecountry Scotland -1.1875885 1.7188532 -0.69
nativecountry South -0.8182850 1.3412764 -0.61
nativecountry Taiwan -0.2590169 1.3502647 -0.19
nativecountry Thailand -1.6932131 1.7370523 -0.97
nativecountry Trinadad&Tobago -1.3461940 1.7210641 -0.78
nativecountry United-States -0.0859373 1.2692747 -0.07
nativecountry Vietnam -1.0084987 1.5227937 -0.66
nativecountry Yugoslavia 1.4017916 1.6475929 0.85
Pr(>|z|)
(Intercept) 0.000000000340535 ***
age < 2e-16 ***
workclass Federal-gov 0.000000040343445 ***
workclass Local-gov 0.04364 *
workclass Never-worked 0.98789
workclass Private 0.00022 ***
workclass Self-emp-inc 0.00010 ***
workclass Self-emp-not-inc 0.29565
workclass State-gov 0.04073 *
workclass Without-pay 0.98313
education 11th 0.43774
education 12th 0.07606 .
education 1st-4th 0.36190
education 5th-6th 0.51601
education 7th-8th 0.26315
education 9th 0.72223
education Assoc-acdm 0.000000000040960 ***
education Assoc-voc 0.000000000077398 ***
education Bachelors < 2e-16 ***
education Doctorate < 2e-16 ***
education HS-grad 0.000003518059170 ***
education Masters < 2e-16 ***
education Preschool 0.97400
education Prof-school < 2e-16 ***
education Some-college 0.000000000499549 ***
maritalstatus Married-AF-spouse 0.00038 ***
maritalstatus Married-civ-spouse 0.000000000006004 ***
maritalstatus Married-spouse-absent 0.76716
maritalstatus Never-married 0.000074177081437 ***
maritalstatus Separated 0.85567
maritalstatus Widowed 0.34345
occupation Adm-clerical 0.46206
occupation Armed-Forces 0.49817
occupation Craft-repair 0.04997 *
occupation Exec-managerial < 2e-16 ***
occupation Farming-fishing 0.000000021542855 ***
occupation Handlers-cleaners 0.00135 **
occupation Machine-op-inspct 0.17606
occupation Other-service 0.000000614290460 ***
occupation Priv-house-serv 0.95439
occupation Prof-specialty 0.000000222286503 ***
occupation Protective-serv 0.00025 ***
occupation Sales 0.00528 **
occupation Tech-support 0.000056310004688 ***
occupation Transport-moving NA
relationship Not-in-family 0.02556 *
relationship Other-relative 0.48426
relationship Own-child 0.03272 *
relationship Unmarried 0.05839 .
relationship Wife < 2e-16 ***
race Asian-Pac-Islander 0.17350
race Black 0.20600
race Other 0.62526
race White 0.13336
sex Male 0.000000000000045 ***
capitalgain < 2e-16 ***
capitalloss < 2e-16 ***
hoursperweek < 2e-16 ***
nativecountry Canada 0.84288
nativecountry China 0.46516
nativecountry Columbia 0.20047
nativecountry Cuba 0.96543
nativecountry Dominican-Republic 0.96297
nativecountry Ecuador 0.98083
nativecountry El-Salvador 0.66218
nativecountry England 0.95969
nativecountry France 0.70864
nativecountry Germany 0.96657
nativecountry Greece 0.12253
nativecountry Guatemala 0.96918
nativecountry Haiti 0.56811
nativecountry Holand-Netherlands 0.99574
nativecountry Honduras 0.77877
nativecountry Hong 0.87415
nativecountry Hungary 0.92765
nativecountry India 0.53166
nativecountry Iran 0.98074
nativecountry Ireland 0.91463
nativecountry Italy 0.64719
nativecountry Jamaica 0.86947
nativecountry Japan 0.71218
nativecountry Laos 0.68087
nativecountry Mexico 0.48110
nativecountry Nicaragua 0.89513
nativecountry Outlying-US(Guam-USVI-etc) 0.98711
nativecountry Peru 0.56480
nativecountry Philippines 0.97264
nativecountry Poland 0.86162
nativecountry Portugal 0.62233
nativecountry Puerto-Rico 0.67084
nativecountry Scotland 0.48962
nativecountry South 0.54181
nativecountry Taiwan 0.84788
nativecountry Thailand 0.32968
nativecountry Trinadad&Tobago 0.43410
nativecountry United-States 0.94602
nativecountry Vietnam 0.50780
nativecountry Yugoslavia 0.39487
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 21175 on 19186 degrees of freedom
Residual deviance: 12104 on 19090 degrees of freedom
AIC: 12298
Number of Fisher Scoring iterations: 15
What is the accuracy of the model on the testing set? Use a threshold of 0.5.
p.glm = pred = predict(glm1, TS, 'response')
prediction from a rank-deficient fit may be misleading
table(TS$over50k, pred > 0.5)
FALSE TRUE
<=50K 9051 662
>50K 1190 1888
table(TS$over50k, pred > 0.5) %>% {sum(diag(.))/sum(.)} # 0.8552
[1] 0.8552
What is the baseline accuracy for the testing set?
mean(TS$over50k == " <=50K")
[1] 0.7594
What is the area-under-the-curve (AUC) for this model on the test set?
colAUC(pred, TS$over50k)
[,1]
<=50K vs. >50K 0.9062
We have just seen how the logistic regression model for this data achieves a high accuracy. Moreover, the significances of the variables give us a way to gauge which variables are relevant for this prediction task. However, it is not immediately clear which variables are more important than the others, especially due to the large number of factor variables in this problem.
Let us now build a classification tree to predict “over50k”. Use the training set to build the model, and all of the other variables as independent variables. Use the default parameters, so don’t set a value for minbucket or cp. Remember to specify method=“class” as an argument to rpart, since this is a classification problem. After you are done building the model, plot the resulting tree.
How many splits does the tree have in total?
cart1 = rpart(over50k ~ ., TR, method='class')
prp(cart1, cex=0.75)
Which variable does the tree split on at the first level (the very first split of the tree)?
Which variables does the tree split on at the second level (immediately after the first split of the tree)? Select all that apply.
What is the accuracy of the model on the testing set? Use a threshold of 0.5. (You can either add the argument type=“class”, or generate probabilities and use a threshold of 0.5 like in logistic regression.)
p.cart = pred = predict(cart1, TS)[,2]
table(TS$over50k, pred > 0.5)
FALSE TRUE
<=50K 9243 470
>50K 1482 1596
table(TS$over50k, pred > 0.5) %>% {sum(diag(.))/sum(.)} # 0.8474
[1] 0.8474
Let us now consider the ROC curve and AUC for the CART model on the test set. You will need to get predicted probabilities for the observations in the test set to build the ROC curve and compute the AUC. Remember that you can do this by removing the type=“class” argument when making predictions, and taking the second column of the resulting object.
Plot the ROC curve for the CART model you have estimated. Observe that compared to the logistic regression ROC curve, the CART ROC curve is less smooth than the logistic regression ROC curve. Which of the following explanations for this behavior is most correct? (HINT: Think about what the ROC curve is plotting and what changing the threshold does.)
par(cex=0.8)
colAUC(cbind(p.glm, p.cart), TS$over50k, T)
p.glm p.cart
<=50K vs. >50K 0.9062 0.847
What is the AUC of the CART model on the test set?
par(cex=0.8)
auc.glm = DPP(p.glm, TS$over50k, " >50K")
par(cex=0.8)
auc.cart = DPP(p.cart, TS$over50k, " >50K")
Before building a random forest model, we’ll down-sample our training set. While some modern personal computers can build a random forest model on the entire training set, others might run out of memory when trying to train the model since random forests is much more computationally intensive than CART or Logistic Regression. For this reason, before continuing we will define a new training set to be used when building our random forest model, that contains 2000 randomly selected obervations from the original training set. Do this by running the following commands in your R console (assuming your training set is called “train”):
set.seed(1)
small = TR[sample(nrow(TR), 2000), ]
Let us now build a random forest model to predict “over50k”, using the dataset “trainSmall” as the data used to build the model. Set the seed to 1 again right before building the model, and use all of the other variables in the dataset as independent variables. (If you get an error that random forest “can not handle categorical predictors with more than 32 categories”, re-build the model without the nativecountry variable as one of the independent variables.)
Then, make predictions using this model on the entire test set. What is the accuracy of the model on the test set, using a threshold of 0.5? (Remember that you don’t need a “type” argument when making predictions with a random forest model if you want to use a threshold of 0.5. Also, note that your accuracy might be different from the one reported here, since random forest models can still differ depending on your operating system, even when the random seed is set. )
set.seed(1)
rf1 = randomForest(over50k ~ ., small)
pred = predict(rf1, TS)
table(TS$over50k, pred) %>% {sum(diag(.))/sum(.)} # 0.83488
[1] 0.8515
As we discussed in lecture, random forest models work by building a large collection of trees. As a result, we lose some of the interpretability that comes with CART in terms of seeing how predictions are made and which variables are important. However, we can still compute metrics that give us insight into which variables are important.
One metric that we can look at is the number of times, aggregated over all of the trees in the random forest model, that a certain variable is selected for a split. To view this metric, run the following lines of R code (replace “MODEL” with the name of your random forest model):
vu = varUsed(rf1, count=TRUE)
vusorted = sort(vu, decreasing = FALSE, index.return = TRUE)
par(cex=0.8, mar=c(3,7,1,1))
dotchart(vusorted$x, names(rf1$forest$xlevels[vusorted$ix]))
This code produces a chart that for each variable measures the number of times that variable was selected for splitting (the value on the x-axis). Which of the following variables is the most important in terms of the number of splits?
There are many other ‘importance’ metrics, for example
par(cex=0.8)
varImpPlot(rf1)
t0 = Sys.time()
set.seed(1)
rf2 = randomForest(over50k ~ ., TR)
Sys.time() - t0
Time difference of 18.91 secs
Compare the accuracy of models
p.rf1 = predict(rf1, TS, "prob")[,2]
p.rf2 = predict(rf2, TS, "prob")[,2]
px = cbind(glm=p.glm, cart=p.cart, rf_small=p.rf1, rf_full=p.rf2)
apply(px, 2, function(x) {
table(TS$over50k, x > 0.5) %>% {sum(diag(.))/sum(.)}
}) %>% sort
cart rf_small glm rf_full
0.8474 0.8514 0.8552 0.8658
colAUC(px, TS$over50k, T)
glm cart rf_small rf_full
<=50K vs. >50K 0.9062 0.847 0.8972 0.9069
library(doParallel)
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
[1] 4
We now conclude our study of this data set by looking at how CART behaves with different choices of its parameters.
Let us select the cp parameter for our CART model using k-fold cross validation, with k = 10 folds. Do this by using the train function. Set the seed beforehand to 2. Test cp values from 0.002 to 0.1 in 0.002 increments, by using the following command:
cartGrid = expand.grid( .cp = seq(0.002,0.1,0.002))
Also, remember to use the entire training set “train” when building this model. The train function might take some time to run.
t0 = Sys.time()
set.seed(2)
cv1 = train(
over50k ~ ., data = TR, method = "rpart",
trControl = trainControl(method = "cv", number=10),
tuneGrid = expand.grid(cp = seq(0.002,0.1,0.002))
)
Sys.time() - t0
Time difference of 25.08 secs
plot(cv1, main = sprintf("optimal cp at %f", cv1$bestTune$cp) )
Which value of cp does the train function recommend?
cp covered in the reange specified above? If negative, what should we do?Fit a CART model to the training data using this value of cp. What is the prediction accuracy on the test set?
cart1 = rpart(over50k ~ ., TR, method='class', cp=cv1$bestTune$cp)
p.cart1 = pred = predict(cart1, TS)[,2]
table(TS$over50k, pred > 0.5) %>% {sum(diag(.))/sum(.)} # 0.8612
[1] 0.8612
Plot the CART tree for this model.
prp(cart1)
How many splits are there?
t0 = Sys.time()
set.seed(2)
cv2 = train(
over50k ~ ., data = TR, method = "rpart",
trControl = trainControl(method="repeatedcv", number=10, repeats=8),
tuneGrid = expand.grid(cp = seq(0,0.002,0.00005))
)
Sys.time() - t0
Time difference of 1.282 mins
plot(cv2, main = sprintf("optimal cp at %f", cv2$bestTune$cp) )
cart2 = rpart(over50k ~ ., TR, method='class', cp=cv2$bestTune$cp)
p.cart2 = pred = predict(cart2, TS)[,2]
px = cbind(px, cart.cv1 = p.cart1, cart.cv2 = p.cart2)
rbind(
Accuracy = apply(px, 2, function(x) {
table(TS$over50k, x > 0.5) %>% {sum(diag(.))/sum(.)} }),
AUC = colAUC(px, TS$over50k) %>% `rownames<-`("AUC")
) %>% t
Accuracy AUC
glm 0.8552 0.9062
cart 0.8474 0.8470
rf_small 0.8514 0.8972
rf_full 0.8658 0.9069
cart.cv1 0.8612 0.8714
cart.cv2 0.8631 0.8925
cv2$bestTune$cp perform better?par(cex=1.25)
auc = colAUC(px[,c(2,4,5,6)], TS$over50k, T)
par(mfcol=c(3,2), mar=c(3,3,4,1), cex=0.7)
for(i in c(1,3,4,2,5,6)) {
DPP(px[,i], TS$over50k, " >50K", title=colnames(px)[i])
}
cor(px)
glm cart rf_small rf_full cart.cv1 cart.cv2
glm 1.0000 0.8614 0.8908 0.9107 0.9058 0.9023
cart 0.8614 1.0000 0.8334 0.8164 0.9189 0.8615
rf_small 0.8908 0.8334 1.0000 0.9163 0.8802 0.8747
rf_full 0.9107 0.8164 0.9163 1.0000 0.8862 0.9139
cart.cv1 0.9058 0.9189 0.8802 0.8862 1.0000 0.9401
cart.cv2 0.9023 0.8615 0.8747 0.9139 0.9401 1.0000
glm_cart = (px[,"glm"] + px[,"cart.cv2"])/2
glm_rf = (px[,"glm"] + px[,"rf_full"])/2
px2 = cbind(px, glm_cart, glm_rf)
rbind(apply(px2, 2, function(x) {
table(TS$over50k, x > 0.5) %>% {sum(diag(.))/sum(.)} }),
colAUC(px2, TS$over50k)) %>% t %>%
data.frame %>% setNames(c("Accuracy", "AUC"))
Accuracy AUC
glm 0.8552 0.9062
cart 0.8474 0.8470
rf_small 0.8514 0.8972
rf_full 0.8658 0.9069
cart.cv1 0.8612 0.8714
cart.cv2 0.8631 0.8925
glm_cart 0.8640 0.9114
glm_rf 0.8662 0.9149
停止平行運算
stopCluster(clust)