Random Forest
# Sparks' code
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:MuMIn':
##
## importance
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
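Note the masking messages above: after loading randomForest, importance() refers to randomForest's version (which is what the calls below use), and combine() and margin() shadow the dplyr and ggplot2 versions. If a shadowed version is ever needed, qualify the namespace explicitly, e.g. dplyr::combine() or MuMIn::importance().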
w1.bag <- w1 %>%
  select(pov:p_raceth, ht)
# bootstrap-style training sample: 75% of rows, drawn with replacement
train1 <- sample(1:nrow(w1.bag), size = 0.75 * nrow(w1.bag), replace = TRUE)
fit1 <- tree(ht ~ ., data = w1[train1, ])
summary(fit1)
##
## Regression tree:
## tree(formula = ht ~ ., data = w1[train1, ])
## Variables actually used in tree construction:
## character(0)
## Number of terminal nodes: 1
## Residual mean deviance: 1.001 = 4983 / 4978
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.73700 -0.69660 -0.03027 0.00000 0.69420 3.47300
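The wave-1 tree has a single terminal node ("Variables actually used: character(0)"): under tree()'s default stopping rule (mindev = 0.01), no candidate split reduced the deviance enough to be kept. A sketch that loosens the rule via tree.control (fit1b is a hypothetical name):

fit1b <- tree(ht ~ ., data = w1[train1, ],
              control = tree.control(nobs = length(train1), mindev = 0.001))
summary(fit1b)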
# set.seed(1115)
# t1 <- tuneRF(y = eclsk_nomiss$height_1, x = eclsk_nomiss, trace = T, stepFactor = 2, ntreeTry = 1000, plot = T)
# t1  # get the tuned mtry
# set a seed so the forest below is reproducible
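If mtry were tuned on the current wave rather than the stale eclsk_nomiss object, a call along these lines would work (a sketch only; x.tune is a hypothetical name, and tuneRF comes from randomForest):

# x.tune <- w1.bag[train1, setdiff(names(w1.bag), "ht")]  # predictors only, response dropped
# t1 <- tuneRF(x = x.tune, y = w1.bag$ht[train1], stepFactor = 2, ntreeTry = 1000, trace = TRUE, plot = TRUE)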
set.seed(1115)
bag.1 <- randomForest(ht ~ ., data = w1.bag[train1, ], mtry = 3, ntree = 100, importance = T)  # mtry = 3: try 3 randomly chosen predictors at each split
bag.1
##
## Call:
## randomForest(formula = ht ~ ., data = w1.bag[train1, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.8551284
## % Var explained: 14.56
plot(bag.1)
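Because train1 was drawn with replacement, the rows never sampled form a natural hold-out set. A minimal sketch of a test-set check (test1 and pred1 are hypothetical names, not from the original code):

test1 <- setdiff(1:nrow(w1.bag), train1)           # rows never drawn into the training sample
pred1 <- predict(bag.1, newdata = w1.bag[test1, ])
mean((w1.bag$ht[test1] - pred1)^2)                 # hold-out MSE, comparable to the OOB MSE printed above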

importance(bag.1)
## %IncMSE IncNodePurity
## pov 24.42074 59.25503
## numsib 25.81269 292.41487
## unsafe 21.97669 88.26069
## no_work_p1 27.18944 90.74897
## no_work_p2 21.03245 119.05563
## housit 28.40542 135.60994
## educ_p1 27.71492 193.88444
## educ_p2 29.36381 203.18755
## foodinsec 19.12185 77.08060
## momar 26.05139 96.48814
## p_raceth 35.45974 212.57327
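To read the importance table at a glance, the permutation measure can be sorted; a small sketch using the same bag.1 object:

imp1 <- importance(bag.1, type = 1)                # type = 1: %IncMSE only
imp1[order(imp1[, 1], decreasing = TRUE), , drop = FALSE]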
varImpPlot(bag.1, n.var = 10, type=1)

########
# install.packages("caret")
# library(caret)
# set.seed(1115)
# train <- createDataPartition(y = eclsk_nomiss$height_1, p = .80, list = F)
# eclsktrain <- eclsk_nomiss[train, ]
# eclsktest <- eclsk_nomiss[-train, ]
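If the caret route above were revived, a partition drawn without replacement on the current data might look like this (a sketch; idx, w1.train, and w1.test are hypothetical names):

# library(caret)
# set.seed(1115)
# idx <- createDataPartition(y = w1.bag$ht, p = 0.75, list = FALSE)
# w1.train <- w1.bag[idx, ]
# w1.test <- w1.bag[-idx, ]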
w2.bag <- w2 %>%
  select(pov:p_raceth, ht)
train2 <- sample(1:nrow(w2.bag), size = 0.75 * nrow(w2.bag), replace = TRUE)
fit2 <- tree(ht ~ ., data = w2[train2, ])
summary(fit2)
##
## Regression tree:
## tree(formula = ht ~ ., data = w2[train2, ])
## Variables actually used in tree construction:
## character(0)
## Number of terminal nodes: 1
## Residual mean deviance: 0.9663 = 4746 / 4912
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.068000 -0.680500 -0.008947 0.000000 0.677500 2.971000
set.seed(1115)
bag.2 <- randomForest(ht ~ ., data = w2.bag[train2, ], mtry = 3, ntree = 100, importance = T)  # mtry = 3: try 3 randomly chosen predictors at each split
bag.2
##
## Call:
## randomForest(formula = ht ~ ., data = w2.bag[train2, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.8435169
## % Var explained: 12.75
plot(bag.2)

importance(bag.2)
## %IncMSE IncNodePurity
## pov 29.47649 65.11205
## numsib 29.18736 235.44300
## unsafe 22.16653 93.37239
## no_work_p1 26.07398 101.41337
## no_work_p2 19.26270 93.26967
## housit 29.32268 132.17652
## educ_p1 31.74262 188.83719
## educ_p2 27.80882 202.30202
## foodinsec 19.02831 69.84303
## momar 23.39299 74.46233
## p_raceth 40.73251 239.20438
varImpPlot(bag.2, n.var = 10, type=1)

w3.bag <- w3 %>%
  select(pov:p_raceth, ht)
train3 <- sample(1:nrow(w3.bag), size = 0.75 * nrow(w3.bag), replace = TRUE)
fit3 <- tree(ht ~ ., data = w3[train3, ])
summary(fit3)
##
## Regression tree:
## tree(formula = ht ~ ., data = w3[train3, ])
## Variables actually used in tree construction:
## [1] "numsib" "w2p0str" "p_raceth" "momar"
## Number of terminal nodes: 5
## Residual mean deviance: 0.9359 = 1469 / 1570
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.50000 -0.69170 -0.01279 0.00000 0.66410 2.56800
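Note that w2p0str appears among the splitting variables: fit3 was grown on the full w3 data rather than the w3.bag subset, so the survey design variables were available to the tree (the same leak shows up again at wave 5). A sketch of the restricted fit (fit3b is a hypothetical name):

fit3b <- tree(ht ~ ., data = w3.bag[train3, ])     # only the selected predictors
summary(fit3b)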
set.seed(1115)
bag.3 <- randomForest(ht ~ ., data = w3.bag[train3, ], mtry = 3, ntree = 100, importance = T)  # mtry = 3: try 3 randomly chosen predictors at each split
bag.3
##
## Call:
## randomForest(formula = ht ~ ., data = w3.bag[train3, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.781862
## % Var explained: 20.83
plot(bag.3)

importance(bag.3)
## %IncMSE IncNodePurity
## pov 14.82251 29.48142
## numsib 27.36234 124.14892
## unsafe 17.39418 38.37656
## no_work_p1 17.14680 44.33865
## no_work_p2 11.40878 33.58649
## housit 19.23693 67.61158
## educ_p1 21.35338 97.46851
## educ_p2 20.48757 97.94207
## foodinsec 15.62485 34.99000
## momar 18.44198 31.92355
## p_raceth 24.37219 102.69041
varImpPlot(bag.3, n.var = 10, type=1)

w4.bag <- w4 %>%
  select(pov:p_raceth, ht)
train4 <- sample(1:nrow(w4.bag), size = 0.75 * nrow(w4.bag), replace = TRUE)
fit4 <- tree(ht ~ ., data = w4[train4, ])
summary(fit4)
##
## Regression tree:
## tree(formula = ht ~ ., data = w4[train4, ])
## Variables actually used in tree construction:
## [1] "p_raceth"
## Number of terminal nodes: 2
## Residual mean deviance: 0.9455 = 4574 / 4838
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.02500 -0.68290 -0.02034 0.00000 0.67210 3.37000
set.seed(1115)
bag.4 <- randomForest(ht ~ ., data = w4.bag[train4, ], mtry = 3, ntree = 100, importance = T)  # mtry = 3: try 3 randomly chosen predictors at each split
bag.4
##
## Call:
## randomForest(formula = ht ~ ., data = w4.bag[train4, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.8087758
## % Var explained: 15.65
plot(bag.4)

importance(bag.4)
## %IncMSE IncNodePurity
## pov 20.36075 66.22028
## numsib 42.42226 283.16476
## unsafe 22.82936 94.99976
## no_work_p1 25.35570 88.07974
## no_work_p2 23.16391 105.02468
## housit 33.35496 131.10107
## educ_p1 28.68554 196.82819
## educ_p2 29.69422 195.44608
## foodinsec 16.49226 88.65017
## momar 28.44034 96.41159
## p_raceth 40.29074 231.43852
varImpPlot(bag.4, n.var = 10, type=1)

w5.bag <- w5 %>%
  select(pov:p_raceth, ht)
train5 <- sample(1:nrow(w5.bag), size = 0.75 * nrow(w5.bag), replace = TRUE)
fit5 <- tree(ht ~ ., data = w5[train5, ])
summary(fit5)
##
## Regression tree:
## tree(formula = ht ~ ., data = w5[train5, ])
## Variables actually used in tree construction:
## [1] "unsafe" "p_raceth" "cluster" "w2p0" "w2p0str" "w2p0psu"
## Number of terminal nodes: 7
## Residual mean deviance: 0.8346 = 1242 / 1488
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.545000 -0.646700 -0.002655 0.000000 0.630500 2.691000
set.seed(1115)
bag.5 <- randomForest(ht ~ ., data = w5.bag[train5, ], mtry = 3, ntree = 100, importance = T)  # mtry = 3: try 3 randomly chosen predictors at each split
bag.5
##
## Call:
## randomForest(formula = ht ~ ., data = w5.bag[train5, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.7448089
## % Var explained: 18.03
plot(bag.5)

importance(bag.5)
## %IncMSE IncNodePurity
## pov 13.11114 28.24855
## numsib 17.79020 103.44819
## unsafe 23.13425 50.01256
## no_work_p1 12.94671 39.35452
## no_work_p2 11.07554 30.76382
## housit 22.46761 52.60019
## educ_p1 19.48833 82.78025
## educ_p2 15.82802 78.85361
## foodinsec 12.45900 35.58386
## momar 16.80475 34.69872
## p_raceth 18.15146 85.73139
varImpPlot(bag.5, n.var = 10, type=1)

w6.bag <- w6 %>%
  select(pov:p_raceth, ht)
train6 <- sample(1:nrow(w6.bag), size = 0.75 * nrow(w6.bag), replace = TRUE)
fit6 <- tree(ht ~ ., data = w6[train6, ])
summary(fit6)
##
## Regression tree:
## tree(formula = ht ~ ., data = w6[train6, ])
## Variables actually used in tree construction:
## character(0)
## Number of terminal nodes: 1
## Residual mean deviance: 0.9729 = 4379 / 4501
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.65100 -0.72120 -0.03149 0.00000 0.69650 2.72600
set.seed(1115)
bag.6 <- randomForest(ht ~ ., data = w6.bag[train6, ], mtry = 3, ntree = 100, importance = T)  # mtry = 3: try 3 randomly chosen predictors at each split
bag.6
##
## Call:
## randomForest(formula = ht ~ ., data = w6.bag[train6, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.8247373
## % Var explained: 14.87
plot(bag.6)

importance(bag.6)
## %IncMSE IncNodePurity
## pov 21.70676 59.90709
## numsib 30.57557 265.23317
## unsafe 25.54775 99.23020
## no_work_p1 23.40861 88.19274
## no_work_p2 24.54181 90.49289
## housit 28.56357 125.20087
## educ_p1 36.60046 174.69398
## educ_p2 34.61707 172.40135
## foodinsec 20.66856 75.05444
## momar 27.67156 85.08665
## p_raceth 35.54079 190.76587
varImpPlot(bag.6, n.var = 10, type=1)

w7.bag <- w7 %>%
  select(pov:p_raceth, ht)
train7 <- sample(1:nrow(w7.bag), size = 0.75 * nrow(w7.bag), replace = TRUE)
fit7 <- tree(ht ~ ., data = w7[train7, ])
summary(fit7)
##
## Regression tree:
## tree(formula = ht ~ ., data = w7[train7, ])
## Variables actually used in tree construction:
## character(0)
## Number of terminal nodes: 1
## Residual mean deviance: 0.9415 = 3971 / 4218
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.86300 -0.68650 -0.03686 0.00000 0.66410 3.06800
set.seed(1115)
bag.7 <- randomForest(ht ~ ., data = w7.bag[train7, ], mtry = 3, ntree = 100, importance = T)  # mtry = 3: try 3 randomly chosen predictors at each split
bag.7
##
## Call:
## randomForest(formula = ht ~ ., data = w7.bag[train7, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.8167689
## % Var explained: 13.07
plot(bag.7)

importance(bag.7)
## %IncMSE IncNodePurity
## pov 21.99401 50.41210
## numsib 29.66583 268.10554
## unsafe 21.39211 73.64316
## no_work_p1 21.63054 72.20956
## no_work_p2 17.74599 73.04433
## housit 22.33093 124.87666
## educ_p1 27.55980 172.48407
## educ_p2 28.90741 168.33221
## foodinsec 20.67990 64.89910
## momar 22.36749 78.61520
## p_raceth 37.20047 193.34702
varImpPlot(bag.7, n.var = 10, type=1)

w8.bag <- w8 %>%
  select(pov:p_raceth, ht)
train8 <- sample(1:nrow(w8.bag), size = 0.75 * nrow(w8.bag), replace = TRUE)
fit8 <- tree(ht ~ ., data = w8[train8, ])
summary(fit8)
##
## Regression tree:
## tree(formula = ht ~ ., data = w8[train8, ])
## Variables actually used in tree construction:
## [1] "p_raceth"
## Number of terminal nodes: 2
## Residual mean deviance: 0.9632 = 3824 / 3970
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.72900 -0.71980 -0.04283 0.00000 0.67210 2.98700
set.seed(1115)
bag.8 <- randomForest(ht ~ ., data = w8.bag[train8, ], mtry = 3, ntree = 100, importance = T)  # mtry = 3: try 3 randomly chosen predictors at each split
bag.8
##
## Call:
## randomForest(formula = ht ~ ., data = w8.bag[train8, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.8187055
## % Var explained: 15.64
plot(bag.8)

importance(bag.8)
## %IncMSE IncNodePurity
## pov 18.50483 64.49790
## numsib 32.96982 249.30611
## unsafe 23.68451 80.69116
## no_work_p1 24.92581 82.47215
## no_work_p2 16.83916 82.13715
## housit 29.19448 125.20113
## educ_p1 27.16275 165.90391
## educ_p2 33.48959 170.50942
## foodinsec 19.23581 66.52118
## momar 22.93397 62.67491
## p_raceth 36.52496 201.49939
varImpPlot(bag.8, n.var = 10, type=1)

w9.bag <- w9 %>%
  select(pov:p_raceth, ht)
train9 <- sample(1:nrow(w9.bag), size = 0.75 * nrow(w9.bag), replace = TRUE)
fit9 <- tree(ht ~ ., data = w9[train9, ])
summary(fit9)
##
## Regression tree:
## tree(formula = ht ~ ., data = w9[train9, ])
## Variables actually used in tree construction:
## character(0)
## Number of terminal nodes: 1
## Residual mean deviance: 0.981 = 3697 / 3769
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.56100 -0.73770 0.00197 0.00000 0.65490 2.85100
set.seed(1115)
bag.9 <- randomForest(ht ~ ., data = w9.bag[train9, ], mtry = 3, ntree = 100, importance = T)  # mtry = 3: try 3 randomly chosen predictors at each split
bag.9
##
## Call:
## randomForest(formula = ht ~ ., data = w9.bag[train9, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.8468837
## % Var explained: 13.3
plot(bag.9)

importance(bag.9)
## %IncMSE IncNodePurity
## pov 21.27051 44.21024
## numsib 28.23394 195.14496
## unsafe 24.18232 71.80804
## no_work_p1 23.65815 71.03897
## no_work_p2 18.10347 71.22013
## housit 25.89839 97.06490
## educ_p1 27.78051 164.68109
## educ_p2 30.37683 153.83338
## foodinsec 21.37433 57.79928
## momar 21.30552 80.85747
## p_raceth 34.28864 179.94836
varImpPlot(bag.9, n.var = 10, type=1)
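Finally, the nine forests can be lined up for comparison; a sketch pulling the out-of-bag error and variance explained from each fit (mse and rsq are standard components of a regression randomForest object):

forests <- list(bag.1, bag.2, bag.3, bag.4, bag.5, bag.6, bag.7, bag.8, bag.9)
data.frame(wave    = 1:9,
           oob_mse = sapply(forests, function(f) tail(f$mse, 1)),
           pct_var = sapply(forests, function(f) 100 * tail(f$rsq, 1)))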
