# Random Forest
# Sparks' code
library(randomForest)
library(tree)
# ---- Wave 1: random forest for ht ----
# Keep the predictor columns pov through p_raceth together with the outcome ht.
w1.bag <- w1 %>%
  select(pov:p_raceth, ht)

# Build a numeric design matrix so factors become dummy columns. ht is listed
# on the right-hand side only so that it is carried along as a column; the
# first column (the intercept added by model.matrix) is dropped afterwards.
w1.x <- model.matrix(
  ~ pov + numsib + unsafe + no_work_p1 + no_work_p2 + factor(housit) +
    factor(educ_p1) + factor(educ_p2) + foodinsec + momar +
    factor(p_raceth) + ht,
  data = w1.bag
)
w1.x <- data.frame(w1.x)[, -1]

# Seed BEFORE drawing the training rows: the seed used to be set only after
# sample(), so the split (and hence the fitted forest) changed on every run.
set.seed(1115)
# 75% of the rows. floor() makes explicit the truncation sample() applied
# implicitly, and seq_len() stays correct even for a zero-row data frame.
# NOTE(review): replace = TRUE draws the training rows with replacement, so
# rows can be duplicated in the training set -- confirm this is intended.
train1 <- sample(
  seq_len(nrow(w1.x)),
  size = floor(0.75 * nrow(w1.x)),
  replace = TRUE
)
#fit1<-tree(ht~., data=w1[train1,])
#summary(fit1)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 # get mtry

# Re-seed so the forest is reproducible given the training rows.
set.seed(1115)
# mtry = 3: three candidate variables are tried at each split (not per tree).
bag.1 <- randomForest(
  ht ~ .,
  data = w1.x[train1, ],
  mtry = 3,
  ntree = 100,
  importance = TRUE
)
# NOTE: the printed output that follows was captured before the split was
# seeded, so re-running will not reproduce those exact numbers.
bag.1
##
## Call:
## randomForest(formula = ht ~ ., data = w1.x[train1, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 4.251919
## % Var explained: 9.31
# Diagnostic plot of the fitted wave 1 forest.
plot(bag.1)

# Variable importance table for wave 1 (output below kept from the original run).
importance(bag.1)
## %IncMSE IncNodePurity
## pov 12.081982 194.13292
## numsib 18.195464 660.38495
## unsafe 17.051316 205.21212
## no_work_p1 15.387385 225.18534
## no_work_p2 14.935813 270.21807
## factor.housit.rent 15.504431 207.58394
## factor.educ_p1.colgrad 13.599404 149.18238
## factor.educ_p1.less.than.hs 10.701313 148.05876
## factor.educ_p2.colgrad 14.564150 151.38705
## factor.educ_p2.less.than.hs 10.681142 148.91952
## foodinsec 12.689325 178.97457
## momar 18.340353 245.64313
## factor.p_raceth.hispanic 25.542000 248.79405
## factor.p_raceth.nh.black 10.019024 131.44684
## factor.p_raceth.nh.multirace 2.873789 56.42755
# Plot the ten highest-ranked variables using importance measure type 1.
varImpPlot(
  bag.1,
  type = 1,
  n.var = 10,
  main = "Figure 3a: Wave 1 Importance Plot"
)

########
#install.packages("caret")
#library(caret)
#set.seed(1115)
#train<- createDataPartition(y = eclsk_nomiss$height_1 , p = .80, list=F)
#eclsktrain<-eclsk_nomiss[train,]
#eclsktest<-eclsk_nomiss[-train,]
# ---- Wave 2: random forest for ht ----
# Keep the predictor columns pov through p_raceth together with the outcome ht.
w2.bag <- w2 %>%
  select(pov:p_raceth, ht)

# Build a numeric design matrix so factors become dummy columns. ht is listed
# on the right-hand side only so that it is carried along as a column; the
# first column (the intercept added by model.matrix) is dropped afterwards.
w2.x <- model.matrix(
  ~ pov + numsib + unsafe + no_work_p1 + no_work_p2 + factor(housit) +
    factor(educ_p1) + factor(educ_p2) + foodinsec + momar +
    factor(p_raceth) + ht,
  data = w2.bag
)
w2.x <- data.frame(w2.x)[, -1]

# Seed BEFORE drawing the training rows: the seed used to be set only after
# sample(), so the split depended on whatever RNG state earlier code left.
set.seed(1115)
# 75% of the rows. floor() makes explicit the truncation sample() applied
# implicitly, and seq_len() stays correct even for a zero-row data frame.
# NOTE(review): replace = TRUE draws the training rows with replacement, so
# rows can be duplicated in the training set -- confirm this is intended.
train2 <- sample(
  seq_len(nrow(w2.x)),
  size = floor(0.75 * nrow(w2.x)),
  replace = TRUE
)
#fit2<-tree(ht~., data=w2[train2,])
#summary(fit2)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 # get mtry

# Re-seed so the forest is reproducible given the training rows.
set.seed(1115)
# mtry = 3: three candidate variables are tried at each split (not per tree).
bag.2 <- randomForest(
  ht ~ .,
  data = w2.x[train2, ],
  mtry = 3,
  ntree = 100,
  importance = TRUE
)
# NOTE: the printed output that follows was captured before the split was
# seeded, so re-running will not reproduce those exact numbers.
bag.2
##
## Call:
## randomForest(formula = ht ~ ., data = w2.x[train2, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 4.266432
## % Var explained: 8.67
# Diagnostic plot of the fitted wave 2 forest.
plot(bag.2)

# Variable importance table for wave 2 (output below kept from the original run).
importance(bag.2)
## %IncMSE IncNodePurity
## pov 17.011840 161.75805
## numsib 16.645714 607.70405
## unsafe 15.878327 199.54205
## no_work_p1 15.908940 201.06630
## no_work_p2 12.777540 223.80230
## factor.housit.rent 14.892012 149.83485
## factor.educ_p1.colgrad 14.667636 139.73366
## factor.educ_p1.less.than.hs 12.884294 130.57350
## factor.educ_p2.colgrad 13.361971 119.65909
## factor.educ_p2.less.than.hs 10.592934 138.38317
## foodinsec 12.452625 174.05960
## momar 16.292612 214.67143
## factor.p_raceth.hispanic 24.214257 199.01482
## factor.p_raceth.nh.black 8.026341 120.96805
## factor.p_raceth.nh.multirace 6.782638 67.37355
# Plot the ten highest-ranked variables using importance measure type 1.
varImpPlot(
  bag.2,
  type = 1,
  n.var = 10,
  main = "Figure 3b: Wave 2 Importance Plot"
)

# ---- Wave 3: random forest for ht ----
# Keep the predictor columns pov through p_raceth together with the outcome ht.
w3.bag <- w3 %>%
  select(pov:p_raceth, ht)

# Build a numeric design matrix so factors become dummy columns. ht is listed
# on the right-hand side only so that it is carried along as a column; the
# first column (the intercept added by model.matrix) is dropped afterwards.
w3.x <- model.matrix(
  ~ pov + numsib + unsafe + no_work_p1 + no_work_p2 + factor(housit) +
    factor(educ_p1) + factor(educ_p2) + foodinsec + momar +
    factor(p_raceth) + ht,
  data = w3.bag
)
w3.x <- data.frame(w3.x)[, -1]

# Seed BEFORE drawing the training rows: the seed used to be set only after
# sample(), so the split depended on whatever RNG state earlier code left.
set.seed(1115)
# 75% of the rows. floor() makes explicit the truncation sample() applied
# implicitly, and seq_len() stays correct even for a zero-row data frame.
# NOTE(review): replace = TRUE draws the training rows with replacement, so
# rows can be duplicated in the training set -- confirm this is intended.
train3 <- sample(
  seq_len(nrow(w3.x)),
  size = floor(0.75 * nrow(w3.x)),
  replace = TRUE
)
#fit3<-tree(ht~., data=w3[train3,])
#summary(fit3)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 # get mtry

# Re-seed so the forest is reproducible given the training rows.
set.seed(1115)
# mtry = 3: three candidate variables are tried at each split (not per tree).
bag.3 <- randomForest(
  ht ~ .,
  data = w3.x[train3, ],
  mtry = 3,
  ntree = 100,
  importance = TRUE
)
# NOTE: the printed output that follows was captured before the split was
# seeded, so re-running will not reproduce those exact numbers.
bag.3
##
## Call:
## randomForest(formula = ht ~ ., data = w3.x[train3, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 3.700978
## % Var explained: 13.75
# Diagnostic plot of the fitted wave 3 forest.
plot(bag.3)

# Variable importance table for wave 3 (output below kept from the original run).
importance(bag.3)
## %IncMSE IncNodePurity
## pov 9.6989063 75.14266
## numsib 13.5430223 328.82100
## unsafe 13.4742371 118.26337
## no_work_p1 13.8033583 120.59779
## no_work_p2 13.7023672 108.55249
## factor.housit.rent 13.3814539 97.19874
## factor.educ_p1.colgrad 12.2804934 89.62324
## factor.educ_p1.less.than.hs 10.4213748 68.88951
## factor.educ_p2.colgrad 11.7888434 82.50310
## factor.educ_p2.less.than.hs 9.5946458 66.00929
## foodinsec 9.2112673 78.23933
## momar 11.7364773 115.03531
## factor.p_raceth.hispanic 11.1483344 85.16419
## factor.p_raceth.nh.black 5.9732167 57.02708
## factor.p_raceth.nh.multirace 0.1047351 31.22607
# Plot the ten highest-ranked variables using importance measure type 1.
varImpPlot(
  bag.3,
  type = 1,
  n.var = 10,
  main = "Figure 3c: Wave 3 Importance Plot"
)

# ---- Wave 4: random forest for ht ----
# Keep the predictor columns pov through p_raceth together with the outcome ht.
w4.bag <- w4 %>%
  select(pov:p_raceth, ht)

# Build a numeric design matrix so factors become dummy columns. ht is listed
# on the right-hand side only so that it is carried along as a column; the
# first column (the intercept added by model.matrix) is dropped afterwards.
w4.x <- model.matrix(
  ~ pov + numsib + unsafe + no_work_p1 + no_work_p2 + factor(housit) +
    factor(educ_p1) + factor(educ_p2) + foodinsec + momar +
    factor(p_raceth) + ht,
  data = w4.bag
)
w4.x <- data.frame(w4.x)[, -1]

# Seed BEFORE drawing the training rows: the seed used to be set only after
# sample(), so the split depended on whatever RNG state earlier code left.
set.seed(1115)
# 75% of the rows. floor() makes explicit the truncation sample() applied
# implicitly, and seq_len() stays correct even for a zero-row data frame.
# NOTE(review): replace = TRUE draws the training rows with replacement, so
# rows can be duplicated in the training set -- confirm this is intended.
train4 <- sample(
  seq_len(nrow(w4.x)),
  size = floor(0.75 * nrow(w4.x)),
  replace = TRUE
)
#fit4<-tree(ht~., data=w4[train4,])
#summary(fit4)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 # get mtry

# Re-seed so the forest is reproducible given the training rows.
set.seed(1115)
# mtry = 3: three candidate variables are tried at each split (not per tree).
bag.4 <- randomForest(
  ht ~ .,
  data = w4.x[train4, ],
  mtry = 3,
  ntree = 100,
  importance = TRUE
)
# NOTE: the printed output that follows was captured before the split was
# seeded, so re-running will not reproduce those exact numbers.
bag.4
##
## Call:
## randomForest(formula = ht ~ ., data = w4.x[train4, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 4.630406
## % Var explained: 9.01
# Diagnostic plot of the fitted wave 4 forest.
plot(bag.4)

# Variable importance table for wave 4 (output below kept from the original run).
importance(bag.4)
## %IncMSE IncNodePurity
## pov 14.659374 189.75655
## numsib 20.326080 717.99160
## unsafe 15.006222 230.76137
## no_work_p1 15.717137 209.25890
## no_work_p2 13.607022 264.49976
## factor.housit.rent 17.374726 206.31964
## factor.educ_p1.colgrad 15.671398 177.81358
## factor.educ_p1.less.than.hs 11.256389 138.40168
## factor.educ_p2.colgrad 11.533961 153.71200
## factor.educ_p2.less.than.hs 12.089740 148.80130
## foodinsec 12.718450 204.78416
## momar 14.353175 222.09443
## factor.p_raceth.hispanic 17.289289 187.79595
## factor.p_raceth.nh.black 13.295604 141.65453
## factor.p_raceth.nh.multirace 6.062816 58.66558
# Plot the ten highest-ranked variables using importance measure type 1.
varImpPlot(
  bag.4,
  type = 1,
  n.var = 10,
  main = "Figure 3d: Wave 4 Importance Plot"
)

# ---- Wave 5: random forest for ht ----
# Keep the predictor columns pov through p_raceth together with the outcome ht.
w5.bag <- w5 %>%
  select(pov:p_raceth, ht)

# Build a numeric design matrix so factors become dummy columns. ht is listed
# on the right-hand side only so that it is carried along as a column; the
# first column (the intercept added by model.matrix) is dropped afterwards.
w5.x <- model.matrix(
  ~ pov + numsib + unsafe + no_work_p1 + no_work_p2 + factor(housit) +
    factor(educ_p1) + factor(educ_p2) + foodinsec + momar +
    factor(p_raceth) + ht,
  data = w5.bag
)
w5.x <- data.frame(w5.x)[, -1]

# Seed BEFORE drawing the training rows: the seed used to be set only after
# sample(), so the split depended on whatever RNG state earlier code left.
set.seed(1115)
# 75% of the rows. floor() makes explicit the truncation sample() applied
# implicitly, and seq_len() stays correct even for a zero-row data frame.
# NOTE(review): replace = TRUE draws the training rows with replacement, so
# rows can be duplicated in the training set -- confirm this is intended.
train5 <- sample(
  seq_len(nrow(w5.x)),
  size = floor(0.75 * nrow(w5.x)),
  replace = TRUE
)
#fit5<-tree(ht~., data=w5[train5,])
#summary(fit5)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 # get mtry

# Re-seed so the forest is reproducible given the training rows.
set.seed(1115)
# mtry = 3: three candidate variables are tried at each split (not per tree).
bag.5 <- randomForest(
  ht ~ .,
  data = w5.x[train5, ],
  mtry = 3,
  ntree = 100,
  importance = TRUE
)
# NOTE: the printed output that follows was captured before the split was
# seeded, so re-running will not reproduce those exact numbers.
bag.5
##
## Call:
## randomForest(formula = ht ~ ., data = w5.x[train5, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 4.542151
## % Var explained: 14.14
# Diagnostic plot of the fitted wave 5 forest.
plot(bag.5)

# Variable importance table for wave 5 (output below kept from the original run).
importance(bag.5)
## %IncMSE IncNodePurity
## pov 13.524195 118.72111
## numsib 13.243980 387.12793
## unsafe 11.688958 118.50416
## no_work_p1 11.256049 126.04351
## no_work_p2 10.214078 124.61516
## factor.housit.rent 14.578416 131.67036
## factor.educ_p1.colgrad 11.221709 100.16703
## factor.educ_p1.less.than.hs 10.624380 84.58432
## factor.educ_p2.colgrad 11.657341 123.44930
## factor.educ_p2.less.than.hs 11.003396 96.48459
## foodinsec 12.311228 64.25494
## momar 14.725348 124.95744
## factor.p_raceth.hispanic 13.955808 104.07463
## factor.p_raceth.nh.black 8.171149 66.75898
## factor.p_raceth.nh.multirace -1.656494 24.19941
# Plot the ten highest-ranked variables using importance measure type 1.
varImpPlot(
  bag.5,
  type = 1,
  n.var = 10,
  main = "Figure 3e: Wave 5 Importance Plot"
)

# ---- Wave 6: random forest for ht ----
# Keep the predictor columns pov through p_raceth together with the outcome ht.
w6.bag <- w6 %>%
  select(pov:p_raceth, ht)

# Build a numeric design matrix so factors become dummy columns. ht is listed
# on the right-hand side only so that it is carried along as a column; the
# first column (the intercept added by model.matrix) is dropped afterwards.
w6.x <- model.matrix(
  ~ pov + numsib + unsafe + no_work_p1 + no_work_p2 + factor(housit) +
    factor(educ_p1) + factor(educ_p2) + foodinsec + momar +
    factor(p_raceth) + ht,
  data = w6.bag
)
w6.x <- data.frame(w6.x)[, -1]

# Seed BEFORE drawing the training rows: the seed used to be set only after
# sample(), so the split depended on whatever RNG state earlier code left.
set.seed(1115)
# 75% of the rows. floor() makes explicit the truncation sample() applied
# implicitly, and seq_len() stays correct even for a zero-row data frame.
# NOTE(review): replace = TRUE draws the training rows with replacement, so
# rows can be duplicated in the training set -- confirm this is intended.
train6 <- sample(
  seq_len(nrow(w6.x)),
  size = floor(0.75 * nrow(w6.x)),
  replace = TRUE
)
#fit6<-tree(ht~., data=w6[train6,])
#summary(fit6)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 # get mtry

# Re-seed so the forest is reproducible given the training rows.
set.seed(1115)
# mtry = 3: three candidate variables are tried at each split (not per tree).
bag.6 <- randomForest(
  ht ~ .,
  data = w6.x[train6, ],
  mtry = 3,
  ntree = 100,
  importance = TRUE
)
# NOTE: the printed output that follows was captured before the split was
# seeded, so re-running will not reproduce those exact numbers.
bag.6
##
## Call:
## randomForest(formula = ht ~ ., data = w6.x[train6, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 5.271756
## % Var explained: 8.9
# Diagnostic plot of the fitted wave 6 forest.
plot(bag.6)

# Variable importance table for wave 6 (output below kept from the original run).
importance(bag.6)
## %IncMSE IncNodePurity
## pov 15.796287 214.42084
## numsib 16.753503 759.96778
## unsafe 14.768619 247.32278
## no_work_p1 17.000607 219.53587
## no_work_p2 10.981438 288.63747
## factor.housit.rent 20.225983 280.56683
## factor.educ_p1.colgrad 12.717370 162.48311
## factor.educ_p1.less.than.hs 14.751776 154.98413
## factor.educ_p2.colgrad 11.177945 175.65465
## factor.educ_p2.less.than.hs 13.598557 185.11116
## foodinsec 13.918128 233.04447
## momar 14.546021 277.30122
## factor.p_raceth.hispanic 17.972839 199.90928
## factor.p_raceth.nh.black 13.647015 194.93589
## factor.p_raceth.nh.multirace 3.392382 52.94607
# Plot the ten highest-ranked variables using importance measure type 1.
varImpPlot(
  bag.6,
  type = 1,
  n.var = 10,
  main = "Figure 3f: Wave 6 Importance Plot"
)

# ---- Wave 7: random forest for ht ----
# Keep the predictor columns pov through p_raceth together with the outcome ht.
w7.bag <- w7 %>%
  select(pov:p_raceth, ht)

# Build a numeric design matrix so factors become dummy columns. ht is listed
# on the right-hand side only so that it is carried along as a column; the
# first column (the intercept added by model.matrix) is dropped afterwards.
w7.x <- model.matrix(
  ~ pov + numsib + unsafe + no_work_p1 + no_work_p2 + factor(housit) +
    factor(educ_p1) + factor(educ_p2) + foodinsec + momar +
    factor(p_raceth) + ht,
  data = w7.bag
)
w7.x <- data.frame(w7.x)[, -1]

# Seed BEFORE drawing the training rows: the seed used to be set only after
# sample(), so the split depended on whatever RNG state earlier code left.
set.seed(1115)
# 75% of the rows. floor() makes explicit the truncation sample() applied
# implicitly, and seq_len() stays correct even for a zero-row data frame.
# NOTE(review): replace = TRUE draws the training rows with replacement, so
# rows can be duplicated in the training set -- confirm this is intended.
train7 <- sample(
  seq_len(nrow(w7.x)),
  size = floor(0.75 * nrow(w7.x)),
  replace = TRUE
)
#fit7<-tree(ht~., data=w7[train7,])
#summary(fit7)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 # get mtry

# Re-seed so the forest is reproducible given the training rows.
set.seed(1115)
# mtry = 3: three candidate variables are tried at each split (not per tree).
bag.7 <- randomForest(
  ht ~ .,
  data = w7.x[train7, ],
  mtry = 3,
  ntree = 100,
  importance = TRUE
)
# NOTE: the printed output that follows was captured before the split was
# seeded, so re-running will not reproduce those exact numbers.
bag.7
##
## Call:
## randomForest(formula = ht ~ ., data = w7.x[train7, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 6.103824
## % Var explained: 8.67
# Diagnostic plot of the fitted wave 7 forest.
plot(bag.7)

# Variable importance table for wave 7 (output below kept from the original run).
importance(bag.7)
## %IncMSE IncNodePurity
## pov 11.826071 254.98828
## numsib 18.210877 856.62337
## unsafe 16.254009 299.50731
## no_work_p1 15.925233 304.45470
## no_work_p2 12.375028 361.81374
## factor.housit.rent 15.265002 299.19938
## factor.educ_p1.colgrad 14.666361 229.68007
## factor.educ_p1.less.than.hs 14.404057 244.04205
## factor.educ_p2.colgrad 14.491812 189.80077
## factor.educ_p2.less.than.hs 13.674030 211.82899
## foodinsec 11.370765 228.08586
## momar 16.136435 258.57610
## factor.p_raceth.hispanic 20.177162 270.20099
## factor.p_raceth.nh.black 15.178916 177.06352
## factor.p_raceth.nh.multirace 1.466651 62.04629
# Plot the ten highest-ranked variables using importance measure type 1.
varImpPlot(
  bag.7,
  type = 1,
  n.var = 10,
  main = "Figure 3g: Wave 7 Importance Plot"
)

# ---- Wave 8: random forest for ht ----
# Keep the predictor columns pov through p_raceth together with the outcome ht.
w8.bag <- w8 %>%
  select(pov:p_raceth, ht)

# Build a numeric design matrix so factors become dummy columns. ht is listed
# on the right-hand side only so that it is carried along as a column; the
# first column (the intercept added by model.matrix) is dropped afterwards.
w8.x <- model.matrix(
  ~ pov + numsib + unsafe + no_work_p1 + no_work_p2 + factor(housit) +
    factor(educ_p1) + factor(educ_p2) + foodinsec + momar +
    factor(p_raceth) + ht,
  data = w8.bag
)
w8.x <- data.frame(w8.x)[, -1]

# Seed BEFORE drawing the training rows: the seed used to be set only after
# sample(), so the split depended on whatever RNG state earlier code left.
set.seed(1115)
# 75% of the rows. floor() makes explicit the truncation sample() applied
# implicitly, and seq_len() stays correct even for a zero-row data frame.
# NOTE(review): replace = TRUE draws the training rows with replacement, so
# rows can be duplicated in the training set -- confirm this is intended.
train8 <- sample(
  seq_len(nrow(w8.x)),
  size = floor(0.75 * nrow(w8.x)),
  replace = TRUE
)
#fit8<-tree(ht~., data=w8[train8,])
#summary(fit8)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 # get mtry

# Re-seed so the forest is reproducible given the training rows.
set.seed(1115)
# mtry = 3: three candidate variables are tried at each split (not per tree).
bag.8 <- randomForest(
  ht ~ .,
  data = w8.x[train8, ],
  mtry = 3,
  ntree = 100,
  importance = TRUE
)
# NOTE: the printed output that follows was captured before the split was
# seeded, so re-running will not reproduce those exact numbers.
bag.8
##
## Call:
## randomForest(formula = ht ~ ., data = w8.x[train8, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 6.69945
## % Var explained: 8.73
# Diagnostic plot of the fitted wave 8 forest.
plot(bag.8)

# Variable importance table for wave 8 (output below kept from the original run).
importance(bag.8)
## %IncMSE IncNodePurity
## pov 13.63079 238.1810
## numsib 20.88336 978.2376
## unsafe 14.79700 331.6402
## no_work_p1 13.64885 283.8913
## no_work_p2 11.66433 354.8015
## factor.housit.rent 15.23056 324.6957
## factor.educ_p1.colgrad 12.73304 241.7123
## factor.educ_p1.less.than.hs 12.43956 214.4359
## factor.educ_p2.colgrad 13.27633 216.2194
## factor.educ_p2.less.than.hs 11.18976 206.9691
## foodinsec 11.88842 277.4814
## momar 13.56238 309.8843
## factor.p_raceth.hispanic 19.30576 289.5882
## factor.p_raceth.nh.black 11.43810 201.4558
## factor.p_raceth.nh.multirace 8.93684 153.3197
# Plot the ten highest-ranked variables using importance measure type 1.
varImpPlot(
  bag.8,
  type = 1,
  n.var = 10,
  main = "Figure 3h: Wave 8 Importance Plot"
)

# ---- Wave 9: random forest for ht ----
# Keep the predictor columns pov through p_raceth together with the outcome ht.
w9.bag <- w9 %>%
  select(pov:p_raceth, ht)

# Build a numeric design matrix so factors become dummy columns. ht is listed
# on the right-hand side only so that it is carried along as a column; the
# first column (the intercept added by model.matrix) is dropped afterwards.
w9.x <- model.matrix(
  ~ pov + numsib + unsafe + no_work_p1 + no_work_p2 + factor(housit) +
    factor(educ_p1) + factor(educ_p2) + foodinsec + momar +
    factor(p_raceth) + ht,
  data = w9.bag
)
w9.x <- data.frame(w9.x)[, -1]

# Seed BEFORE drawing the training rows: the seed used to be set only after
# sample(), so the split depended on whatever RNG state earlier code left.
set.seed(1115)
# 75% of the rows. floor() makes explicit the truncation sample() applied
# implicitly, and seq_len() stays correct even for a zero-row data frame.
# NOTE(review): replace = TRUE draws the training rows with replacement, so
# rows can be duplicated in the training set -- confirm this is intended.
train9 <- sample(
  seq_len(nrow(w9.x)),
  size = floor(0.75 * nrow(w9.x)),
  replace = TRUE
)
#fit9<-tree(ht~., data=w9[train9,])
#summary(fit9)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 # get mtry

# Re-seed so the forest is reproducible given the training rows.
set.seed(1115)
# mtry = 3: three candidate variables are tried at each split (not per tree).
bag.9 <- randomForest(
  ht ~ .,
  data = w9.x[train9, ],
  mtry = 3,
  ntree = 100,
  importance = TRUE
)
# NOTE: the printed output that follows was captured before the split was
# seeded, so re-running will not reproduce those exact numbers.
bag.9
##
## Call:
## randomForest(formula = ht ~ ., data = w9.x[train9, ], mtry = 3, ntree = 100, importance = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 8.12711
## % Var explained: 7.33
# Diagnostic plot of the fitted wave 9 forest.
plot(bag.9)

# Variable importance table for wave 9 (output below kept from the original run).
importance(bag.9)
## %IncMSE IncNodePurity
## pov 16.281199 273.7308
## numsib 12.676136 961.5631
## unsafe 11.848656 322.9804
## no_work_p1 16.031613 272.2088
## no_work_p2 17.149476 376.2385
## factor.housit.rent 15.326968 347.9543
## factor.educ_p1.colgrad 16.108018 268.0056
## factor.educ_p1.less.than.hs 8.384680 180.7207
## factor.educ_p2.colgrad 17.232970 275.9225
## factor.educ_p2.less.than.hs 9.178054 199.1298
## foodinsec 14.652371 331.5965
## momar 11.930895 368.0084
## factor.p_raceth.hispanic 19.608276 309.7742
## factor.p_raceth.nh.black 11.691222 256.4184
## factor.p_raceth.nh.multirace 9.019817 145.1444
# Plot the ten highest-ranked variables using importance measure type 1.
varImpPlot(
  bag.9,
  type = 1,
  n.var = 10,
  main = "Figure 3i: Wave 9 Importance Plot"
)
