#load required packages
library(car)
library(mice)
library(ggplot2)
library(dplyr)
library(geepack)
library(MuMIn)
library(survey)
library(questionr)

setwd("C:/Users/chrys/Documents/UTSA")
#load eclsk data
load("eclsk_k5.Rdata")

mydat<-c("childid", "x_chsex_r", "p2hscale", "x2povty", "x2numsib","p2safepl", "x1par1emp", "x1par2emp", "p4housit", "x12par1ed_i", "x12par2ed_i", "x2fsstat2", "x12momar", "x2par1rac", "x1height", "x2height", "x3height", "x4height", "x5height", "x6height", "x7height", "x8height", "x9height", "w2p0", "w2p0str", "w2p0psu", "w9c19p_20", "w9c19p_2str", "w9c19p_2psu")

eclsk<-data.frame(eclskk5[,mydat])

rm(eclskk5)
rm(mydat)
#save(eclsk, file="eclsk.Rdata")

# Recode time constant Variables
#recode perceived child health
eclsk$chhealth<-Recode(eclsk$p2hscale, recodes = "1:2='0Exc/VGood'; 3='1Good'; 4='2Poor'; else=NA", as.factor=TRUE)

#recode poverty status
eclsk$pov<-Recode(eclsk$x2povty, recodes ="1=1; 2:3=0; -9=NA")

#recode number of siblings
eclsk$numsib<-Recode(eclsk$x2numsib, recodes = "-9=NA")

#recode unsafe neighborhood
eclsk$unsafe<-Recode(eclsk$p2safepl , recodes = "1:2=1; 3=0; else=NA")

#parent 1 employment
eclsk$no_work_p1<-Recode(eclsk$x1par1emp, recodes = "1:2=0; 3:4=1; -9=NA")
#eclsk$no_work_p1<-Recode(eclsk$x1par1emp, recodes = "1:2='working'; 3:4='not working'; #-9=NA", as.factor=TRUE)
#eclsk$no_work_p1<-relevel(eclsk$no_work_p1, ref='working')

#parent 2 employment
eclsk$no_work_p2<-Recode(eclsk$x1par2emp, recodes = "1:2=0; 3:4=1; -9=NA")
#eclsk$no_work_p2<-Recode(eclsk$x1par2emp, recodes = "1:2='working'; 3:4='not working'; #-9=NA", as.factor=TRUE)
#eclsk$no_work_p2<-relevel(eclsk$no_work_p2, ref='working')

#housing situation
eclsk$housit<-Recode(eclsk$p4housit, recodes = "1='own'; 2='rent'; else=NA", as.factor=TRUE)

#child bmi
#eclsk$bmi<-Recode(eclsk$x2bmi, recodes = "-9=NA")

#parent 1 education
eclsk$educ_p1<-Recode(eclsk$x12par1ed_i, recodes = "1:2='less than hs'; 3:5='hsgrad'; 6:9='colgrad'; else=NA", as.factor=TRUE)#4:5='somecol'
eclsk$educ_p1<-relevel(eclsk$educ_p1, ref='hsgrad')

#parent 2 education
eclsk$educ_p2<-Recode(eclsk$x12par2ed_i, recodes = "1:2='less than hs'; 3:5='hsgrad'; 6:9='colgrad'; else=NA", as.factor=TRUE) #4:5='somecol'
eclsk$educ_p2<-relevel(eclsk$educ_p2, ref='hsgrad')

#food insecurity
eclsk$foodinsec<-Recode(eclsk$x2fsstat2, recodes="2:3=1; 1=0; else=NA")

#mother married at time of birth
eclsk$momar<-Recode(eclsk$x12momar, recodes= "1=1; 2=0; else=NA")

#race
eclsk$p_raceth<-Recode(eclsk$x2par1rac, recodes="1='nh white'; 2='nh black'; 3:4='hispanic'; 8='nh multirace'; else=NA", as.factor=TRUE)
#5:7='nh other';
eclsk$p_raceth<-relevel(eclsk$p_raceth, ref='nh white')

#recode sex
eclsk$male<-Recode(eclsk$x_chsex_r, recodes = "1=1; 2=0; -9=NA", as.factor=TRUE)

# Recode time varying variables
#child height x1
eclsk$height_1<-Recode(eclsk$x1height, recodes = "-9:-7=NA")

#child height x2
eclsk$height_2<-Recode(eclsk$x2height, recodes = "-9:-7=NA")

#child height x3
eclsk$height_3<-Recode(eclsk$x3height, recodes = "-9:-7=NA")

#child height x4
eclsk$height_4<-Recode(eclsk$x4height, recodes = "-9:-7=NA")

#child height x5
eclsk$height_5<-Recode(eclsk$x5height, recodes = "-9:-7=NA")

#child height x6
eclsk$height_6<-Recode(eclsk$x6height, recodes = "-9:-7=NA")

#child height x7
eclsk$height_7<-Recode(eclsk$x7height, recodes = "-9:-7=NA")

#child height x8
eclsk$height_8<-Recode(eclsk$x8height, recodes = "-9:-7=NA")

#child height x9
eclsk$height_9<-Recode(eclsk$x9height, recodes = "-9=NA; -7=NA")

Reshape into longitudinal format

library(data.table)
library(magrittr)

out<-melt(setDT(eclsk), id = "childid",
          measure.vars = list(ht=c("height_1", "height_2", 
                                   "height_3", "height_4",
                                   "height_5", "height_6",
                                   "height_7", "height_8", 
                                   "height_9")))%>%
  setorder(childid)
#head(out, n=20)

e.long<-eclsk%>%
  select(childid, male, chhealth, pov, numsib, unsafe, 
         no_work_p1,  no_work_p2, housit, educ_p1, educ_p2, 
         foodinsec, momar, p_raceth, w9c19p_20, w9c19p_2str, w9c19p_2psu)%>%
  left_join(., out, "childid")
e.long$wave<-rep(c(1,2,3,4,5,6,7,8,9), 
                 length(unique(e.long$childid)))
#head(e.long, n=10)
 
e.long.comp<-e.long%>%
  rename(ht = value)%>%
  select(-c(18))%>%
  filter(complete.cases(.))

rm(out)
rm(e.long)
#rm(eclsk)

w1 <- e.long.comp %>%
  filter(wave==1)

K Means Analysis

library(caret)

x<-model.matrix(~pov+numsib+unsafe+no_work_p1
                +no_work_p2+factor(housit)+factor(educ_p1)
                +factor(educ_p2)+foodinsec+momar
                +factor(p_raceth), data=w1)

x<-data.frame(x)[,-1]

determine optimal number of clusters

library(ClusterR)
ss<-NULL
for(i in 1:10){
  km<-kmeans(x=x, nstart = 10, centers = i)
  ss[i]<-km$betweenss/km$totss
}

plot(x=2:10, y=diff(ss), main="Figure 1: Optimal Cluster Analysis", xlab="Number of Clusters", ylab="Relative Difference in Variance")

#3 clusters using w1 data

K Means

#using w1 data
set.seed(1115)
km2<-kmeans(x = x, center = 3, nstart = 10)
km2$centers

##          pov   numsib    unsafe no_work_p1 no_work_p2 factor.housit.rent
## 1 0.25132837 1.234325 0.3517535  0.4165781 0.07757705         0.36291180
## 2 0.00707428 1.252148 0.1222840  0.2395149 0.03385548         0.05154118
## 3 0.29973118 3.424731 0.2741935  0.5040323 0.07526882         0.26209677
##   factor.educ_p1.colgrad factor.educ_p1.less.than.hs factor.educ_p2.colgrad
## 1             0.02922423                   0.1710946             0.04516472
## 2             0.88175846                   0.0000000             0.72511369
## 3             0.35349462                   0.1653226             0.32930108
##   factor.educ_p2.less.than.hs  foodinsec     momar factor.p_raceth.hispanic
## 1                 0.208289054 0.14240170 0.6753454               0.34537726
## 2                 0.002526529 0.02526529 0.9429005               0.05861546
## 3                 0.170698925 0.15860215 0.8212366               0.27688172
##   factor.p_raceth.nh.black factor.p_raceth.nh.multirace
## 1               0.05685441                   0.01540914
## 2               0.03840323                   0.01869631
## 3               0.06182796                   0.01075269

#kmean_clust<-as.data.frame(km2$centers)

#write.csv(kmean_clust, file="C:/Users/chrys/Documents/UTSA/kmean_clust.csv")

Add cluster variable to ECLSK

#some ids not in wave 1, do not get assigned clusters
w1$cluster<-km2$cluster
w01<-w1%>%select(childid, cluster)
e.long.comp<-merge(e.long.comp, w01, by=c("childid"), all.x=TRUE )
rm(w01)
rm(x)

Plot Height Over Time

#out<-e.long.comp%>%
#  filter(is.na(cluster)==F)%>%
#  group_by(wave, cluster, male)%>%
#  summarise(meanht=(wtd.mean(ht, weights=w9c19p_20, na.rm=T)))%>%
#  arrange(wave)%>%
#  ungroup()%>%
#  arrange(wave, cluster)
#head(out)

#w1%>%
#  ggplot()+
#  geom_boxplot(aes(x=cluster, 
#                   y=ht,
#                   color=cluster))+
#  facet_grid(~male)

#out%>%
#  ggplot()+
#  geom_line(aes(wave, meanht, color=cluster, group=cluster))+
#  facet_wrap(~male)

#############
options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~w9c19p_2psu, strata=~w9c19p_2str, weights=~w9c19p_20, data = e.long.comp, nest=TRUE)

out2<-svyby(~ht, ~cluster+wave+male, des, FUN=svymean)
#head(out2)

out2$lower<- out2$ht - 1.96*out2$se
out2$upper<- out2$ht + 1.96*out2$se

male.names <- c("Female", "Male")
names(male.names) <- c(0, 1)

fig2 <- out2%>%
  ggplot(aes(x = factor(wave), y =  ht))+
  geom_line(aes(color=factor(cluster), group=factor(cluster)))+
  scale_color_viridis_d(option = "C", begin = .2, end = .8, 
                        labels=c("Cluster 1", "Cluster 2", "Cluster 3")) +
  facet_wrap(~male, labeller = labeller(male = male.names)) +
  ggtitle("Figure 2", subtitle = "Height plotted by survey wave and K Means Cluster")+
  ylab("Height (cm)") + xlab("Survey Wave")+
  labs(caption = "Source: ECLSK 2011")+
  theme_light()+
  theme(axis.title = element_text(size = 12))+
  theme(legend.title=element_blank())

fig2

#ggsave(filename = "figure2.png", plot = fig2, dpi = "print", height = 4, width = 6)

w2 <- e.long.comp %>%
  filter(wave==2, is.na(cluster)==F)

w3 <- e.long.comp %>%
  filter(wave==3, is.na(cluster)==F)

w4 <- e.long.comp %>%
  filter(wave==4, is.na(cluster)==F)

w5 <- e.long.comp %>%
  filter(wave==5, is.na(cluster)==F)

w6 <- e.long.comp %>%
  filter(wave==6, is.na(cluster)==F)

w7 <- e.long.comp %>%
  filter(wave==7, is.na(cluster)==F)

w8 <- e.long.comp %>%
  filter(wave==8, is.na(cluster)==F)

w9 <- e.long.comp %>%
  filter(wave==9, is.na(cluster)==F)

Random Forest

# Sparks' code
library(randomForest)
library(tree)

w1.bag<-w1%>%
  select(pov:p_raceth, ht)

w1.x<-model.matrix(~pov+numsib+unsafe+no_work_p1
                +no_work_p2+factor(housit)+factor(educ_p1)
                +factor(educ_p2)+foodinsec+momar
                +factor(p_raceth)+ ht, data=w1.bag)
w1.x<-data.frame(w1.x)[,-1]

train1<-sample(1:dim(w1.x)[1], size = .75*dim(w1.x)[1], replace=T)

#fit1<-tree(ht~., data=w1[train1,])
#summary(fit1)

#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 #gewt mtry

#do i need to set a seed?
set.seed(1115)
bag.1<-randomForest(ht~., data=w1.x[train1,], mtry=3, ntree=100,importance=T) #mtry = 3; choose 3 variables for each tree
bag.1

## 
## Call:
##  randomForest(formula = ht ~ ., data = w1.x[train1, ], mtry = 3,      ntree = 100, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 4.251919
##                     % Var explained: 9.31

plot(bag.1)

importance(bag.1)

##                                %IncMSE IncNodePurity
## pov                          12.081982     194.13292
## numsib                       18.195464     660.38495
## unsafe                       17.051316     205.21212
## no_work_p1                   15.387385     225.18534
## no_work_p2                   14.935813     270.21807
## factor.housit.rent           15.504431     207.58394
## factor.educ_p1.colgrad       13.599404     149.18238
## factor.educ_p1.less.than.hs  10.701313     148.05876
## factor.educ_p2.colgrad       14.564150     151.38705
## factor.educ_p2.less.than.hs  10.681142     148.91952
## foodinsec                    12.689325     178.97457
## momar                        18.340353     245.64313
## factor.p_raceth.hispanic     25.542000     248.79405
## factor.p_raceth.nh.black     10.019024     131.44684
## factor.p_raceth.nh.multirace  2.873789      56.42755

varImpPlot(bag.1, n.var = 10, main="Figure 3a: Wave 1 Importance Plot", type=1)

########
#install.packages("caret")
#library(caret)
#set.seed(1115)
#train<- createDataPartition(y = eclsk_nomiss$height_1 , p = .80, list=F)

#eclsktrain<-eclsk_nomiss[train,]
#eclsktest<-eclsk_nomiss[-train,]

w2.bag<-w2%>%
  select(pov:p_raceth, ht)

w2.x<-model.matrix(~pov+numsib+unsafe+no_work_p1
                +no_work_p2+factor(housit)+factor(educ_p1)
                +factor(educ_p2)+foodinsec+momar
                +factor(p_raceth)+ ht, data=w2.bag)
w2.x<-data.frame(w2.x)[,-1]

train2<-sample(1:dim(w2.x)[1], size = .75*dim(w2.x)[1], replace=T)

#fit2<-tree(ht~., data=w2[train2,])
#summary(fit2)

set.seed(1115)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 #gewt mtry

bag.2<-randomForest(ht~., data=w2.x[train2,], mtry=3, ntree=100,importance=T) #mtry = 3; choose 3 variables for each tree
bag.2

## 
## Call:
##  randomForest(formula = ht ~ ., data = w2.x[train2, ], mtry = 3,      ntree = 100, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 4.266432
##                     % Var explained: 8.67

plot(bag.2)

importance(bag.2)

##                                %IncMSE IncNodePurity
## pov                          17.011840     161.75805
## numsib                       16.645714     607.70405
## unsafe                       15.878327     199.54205
## no_work_p1                   15.908940     201.06630
## no_work_p2                   12.777540     223.80230
## factor.housit.rent           14.892012     149.83485
## factor.educ_p1.colgrad       14.667636     139.73366
## factor.educ_p1.less.than.hs  12.884294     130.57350
## factor.educ_p2.colgrad       13.361971     119.65909
## factor.educ_p2.less.than.hs  10.592934     138.38317
## foodinsec                    12.452625     174.05960
## momar                        16.292612     214.67143
## factor.p_raceth.hispanic     24.214257     199.01482
## factor.p_raceth.nh.black      8.026341     120.96805
## factor.p_raceth.nh.multirace  6.782638      67.37355

varImpPlot(bag.2, n.var = 10, main="Figure 3b: Wave 2 Importance Plot", type=1)

w3.bag<-w3%>%
  select(pov:p_raceth, ht)

w3.x<-model.matrix(~pov+numsib+unsafe+no_work_p1
                +no_work_p2+factor(housit)+factor(educ_p1)
                +factor(educ_p2)+foodinsec+momar
                +factor(p_raceth)+ ht, data=w3.bag)
w3.x<-data.frame(w3.x)[,-1]

train3<-sample(1:dim(w3.x)[1], size = .75*dim(w3.x)[1], replace=T)

#fit3<-tree(ht~., data=w3[train3,])
#summary(fit3)

set.seed(1115)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 #gewt mtry

bag.3<-randomForest(ht~., data=w3.x[train3,], mtry=3, ntree=100,importance=T) #mtry = 3; choose 3 variables for each tree
bag.3

## 
## Call:
##  randomForest(formula = ht ~ ., data = w3.x[train3, ], mtry = 3,      ntree = 100, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 3.700978
##                     % Var explained: 13.75

plot(bag.3)

importance(bag.3)

##                                 %IncMSE IncNodePurity
## pov                           9.6989063      75.14266
## numsib                       13.5430223     328.82100
## unsafe                       13.4742371     118.26337
## no_work_p1                   13.8033583     120.59779
## no_work_p2                   13.7023672     108.55249
## factor.housit.rent           13.3814539      97.19874
## factor.educ_p1.colgrad       12.2804934      89.62324
## factor.educ_p1.less.than.hs  10.4213748      68.88951
## factor.educ_p2.colgrad       11.7888434      82.50310
## factor.educ_p2.less.than.hs   9.5946458      66.00929
## foodinsec                     9.2112673      78.23933
## momar                        11.7364773     115.03531
## factor.p_raceth.hispanic     11.1483344      85.16419
## factor.p_raceth.nh.black      5.9732167      57.02708
## factor.p_raceth.nh.multirace  0.1047351      31.22607

varImpPlot(bag.3, n.var = 10, main="Figure 3c: Wave 3 Importance Plot", type=1)

w4.bag<-w4%>%
  select(pov:p_raceth, ht)

w4.x<-model.matrix(~pov+numsib+unsafe+no_work_p1
                +no_work_p2+factor(housit)+factor(educ_p1)
                +factor(educ_p2)+foodinsec+momar
                +factor(p_raceth)+ ht, data=w4.bag)
w4.x<-data.frame(w4.x)[,-1]

train4<-sample(1:dim(w4.x)[1], size = .75*dim(w4.x)[1], replace=T)

#fit4<-tree(ht~., data=w4[train4,])
#summary(fit4)

set.seed(1115)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 #gewt mtry

bag.4<-randomForest(ht~., data=w4.x[train4,], mtry=3, ntree=100,importance=T) #mtry = 3; choose 3 variables for each tree
bag.4

## 
## Call:
##  randomForest(formula = ht ~ ., data = w4.x[train4, ], mtry = 3,      ntree = 100, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 4.630406
##                     % Var explained: 9.01

plot(bag.4)

importance(bag.4)

##                                %IncMSE IncNodePurity
## pov                          14.659374     189.75655
## numsib                       20.326080     717.99160
## unsafe                       15.006222     230.76137
## no_work_p1                   15.717137     209.25890
## no_work_p2                   13.607022     264.49976
## factor.housit.rent           17.374726     206.31964
## factor.educ_p1.colgrad       15.671398     177.81358
## factor.educ_p1.less.than.hs  11.256389     138.40168
## factor.educ_p2.colgrad       11.533961     153.71200
## factor.educ_p2.less.than.hs  12.089740     148.80130
## foodinsec                    12.718450     204.78416
## momar                        14.353175     222.09443
## factor.p_raceth.hispanic     17.289289     187.79595
## factor.p_raceth.nh.black     13.295604     141.65453
## factor.p_raceth.nh.multirace  6.062816      58.66558

varImpPlot(bag.4, n.var = 10, main="Figure 3d: Wave 4 Importance Plot", type=1)

w5.bag<-w5%>%
  select(pov:p_raceth, ht)

w5.x<-model.matrix(~pov+numsib+unsafe+no_work_p1
                +no_work_p2+factor(housit)+factor(educ_p1)
                +factor(educ_p2)+foodinsec+momar
                +factor(p_raceth)+ ht, data=w5.bag)
w5.x<-data.frame(w5.x)[,-1]

train5<-sample(1:dim(w5.x)[1], size = .75*dim(w5.x)[1], replace=T)

#fit5<-tree(ht~., data=w5[train5,])
#summary(fit5)

set.seed(1115)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 #gewt mtry

bag.5<-randomForest(ht~., data=w5.x[train5,], mtry=3, ntree=100,importance=T) #mtry = 3; choose 3 variables for each tree
bag.5

## 
## Call:
##  randomForest(formula = ht ~ ., data = w5.x[train5, ], mtry = 3,      ntree = 100, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 4.542151
##                     % Var explained: 14.14

plot(bag.5)

importance(bag.5)

##                                %IncMSE IncNodePurity
## pov                          13.524195     118.72111
## numsib                       13.243980     387.12793
## unsafe                       11.688958     118.50416
## no_work_p1                   11.256049     126.04351
## no_work_p2                   10.214078     124.61516
## factor.housit.rent           14.578416     131.67036
## factor.educ_p1.colgrad       11.221709     100.16703
## factor.educ_p1.less.than.hs  10.624380      84.58432
## factor.educ_p2.colgrad       11.657341     123.44930
## factor.educ_p2.less.than.hs  11.003396      96.48459
## foodinsec                    12.311228      64.25494
## momar                        14.725348     124.95744
## factor.p_raceth.hispanic     13.955808     104.07463
## factor.p_raceth.nh.black      8.171149      66.75898
## factor.p_raceth.nh.multirace -1.656494      24.19941

varImpPlot(bag.5, n.var = 10, main="Figure 3e: Wave 5 Importance Plot", type=1)

w6.bag<-w6%>%
  select(pov:p_raceth, ht)

w6.x<-model.matrix(~pov+numsib+unsafe+no_work_p1
                +no_work_p2+factor(housit)+factor(educ_p1)
                +factor(educ_p2)+foodinsec+momar
                +factor(p_raceth)+ ht, data=w6.bag)
w6.x<-data.frame(w6.x)[,-1]

train6<-sample(1:dim(w6.x)[1], size = .75*dim(w6.x)[1], replace=T)

#fit6<-tree(ht~., data=w6[train6,])
#summary(fit6)

set.seed(1115)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 #gewt mtry

bag.6<-randomForest(ht~., data=w6.x[train6,], mtry=3, ntree=100,importance=T) #mtry = 3; choose 3 variables for each tree
bag.6

## 
## Call:
##  randomForest(formula = ht ~ ., data = w6.x[train6, ], mtry = 3,      ntree = 100, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 5.271756
##                     % Var explained: 8.9

plot(bag.6)

importance(bag.6)

##                                %IncMSE IncNodePurity
## pov                          15.796287     214.42084
## numsib                       16.753503     759.96778
## unsafe                       14.768619     247.32278
## no_work_p1                   17.000607     219.53587
## no_work_p2                   10.981438     288.63747
## factor.housit.rent           20.225983     280.56683
## factor.educ_p1.colgrad       12.717370     162.48311
## factor.educ_p1.less.than.hs  14.751776     154.98413
## factor.educ_p2.colgrad       11.177945     175.65465
## factor.educ_p2.less.than.hs  13.598557     185.11116
## foodinsec                    13.918128     233.04447
## momar                        14.546021     277.30122
## factor.p_raceth.hispanic     17.972839     199.90928
## factor.p_raceth.nh.black     13.647015     194.93589
## factor.p_raceth.nh.multirace  3.392382      52.94607

varImpPlot(bag.6, n.var = 10, main="Figure 3f: Wave 6 Importance Plot", type=1)

w7.bag<-w7%>%
  select(pov:p_raceth, ht)

w7.x<-model.matrix(~pov+numsib+unsafe+no_work_p1
                +no_work_p2+factor(housit)+factor(educ_p1)
                +factor(educ_p2)+foodinsec+momar
                +factor(p_raceth)+ ht, data=w7.bag)
w7.x<-data.frame(w7.x)[,-1]

train7<-sample(1:dim(w7.x)[1], size = .75*dim(w7.x)[1], replace=T)

#fit7<-tree(ht~., data=w7[train7,])
#summary(fit7)

set.seed(1115)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 #gewt mtry

bag.7<-randomForest(ht~., data=w7.x[train7,], mtry=3, ntree=100,importance=T) #mtry = 3; choose 3 variables for each tree
bag.7

## 
## Call:
##  randomForest(formula = ht ~ ., data = w7.x[train7, ], mtry = 3,      ntree = 100, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 6.103824
##                     % Var explained: 8.67

plot(bag.7)

importance(bag.7)

##                                %IncMSE IncNodePurity
## pov                          11.826071     254.98828
## numsib                       18.210877     856.62337
## unsafe                       16.254009     299.50731
## no_work_p1                   15.925233     304.45470
## no_work_p2                   12.375028     361.81374
## factor.housit.rent           15.265002     299.19938
## factor.educ_p1.colgrad       14.666361     229.68007
## factor.educ_p1.less.than.hs  14.404057     244.04205
## factor.educ_p2.colgrad       14.491812     189.80077
## factor.educ_p2.less.than.hs  13.674030     211.82899
## foodinsec                    11.370765     228.08586
## momar                        16.136435     258.57610
## factor.p_raceth.hispanic     20.177162     270.20099
## factor.p_raceth.nh.black     15.178916     177.06352
## factor.p_raceth.nh.multirace  1.466651      62.04629

varImpPlot(bag.7, n.var = 10, main="Figure 3g: Wave 7 Importance Plot", type=1)

w8.bag<-w8%>%
  select(pov:p_raceth, ht)

w8.x<-model.matrix(~pov+numsib+unsafe+no_work_p1
                +no_work_p2+factor(housit)+factor(educ_p1)
                +factor(educ_p2)+foodinsec+momar
                +factor(p_raceth)+ ht, data=w8.bag)
w8.x<-data.frame(w8.x)[,-1]

train8<-sample(1:dim(w8.x)[1], size = .75*dim(w8.x)[1], replace=T)

#fit8<-tree(ht~., data=w8[train8,])
#summary(fit8)

set.seed(1115)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 #gewt mtry

bag.8<-randomForest(ht~., data=w8.x[train8,], mtry=3, ntree=100,importance=T) #mtry = 3; choose 3 variables for each tree
bag.8

## 
## Call:
##  randomForest(formula = ht ~ ., data = w8.x[train8, ], mtry = 3,      ntree = 100, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 6.69945
##                     % Var explained: 8.73

plot(bag.8)

importance(bag.8)

##                               %IncMSE IncNodePurity
## pov                          13.63079      238.1810
## numsib                       20.88336      978.2376
## unsafe                       14.79700      331.6402
## no_work_p1                   13.64885      283.8913
## no_work_p2                   11.66433      354.8015
## factor.housit.rent           15.23056      324.6957
## factor.educ_p1.colgrad       12.73304      241.7123
## factor.educ_p1.less.than.hs  12.43956      214.4359
## factor.educ_p2.colgrad       13.27633      216.2194
## factor.educ_p2.less.than.hs  11.18976      206.9691
## foodinsec                    11.88842      277.4814
## momar                        13.56238      309.8843
## factor.p_raceth.hispanic     19.30576      289.5882
## factor.p_raceth.nh.black     11.43810      201.4558
## factor.p_raceth.nh.multirace  8.93684      153.3197

varImpPlot(bag.8, n.var = 10, main="Figure 3h: Wave 8 Importance Plot", type=1)

w9.bag<-w9%>%
  select(pov:p_raceth, ht)

w9.x<-model.matrix(~pov+numsib+unsafe+no_work_p1
                +no_work_p2+factor(housit)+factor(educ_p1)
                +factor(educ_p2)+foodinsec+momar
                +factor(p_raceth)+ ht, data=w9.bag)
w9.x<-data.frame(w9.x)[,-1]

train9<-sample(1:dim(w9.x)[1], size = .75*dim(w9.x)[1], replace=T)

#fit9<-tree(ht~., data=w9[train9,])
#summary(fit9)

set.seed(1115)
#t1<-tuneRF(y=eclsk_nomiss$height_1, x=eclsk_nomiss, trace=T, stepFactor = 2, ntreeTry = 1000, plot=T)
#t1 #gewt mtry

bag.9<-randomForest(ht~., data=w9.x[train9,], mtry=3, ntree=100,importance=T) #mtry = 3; choose 3 variables for each tree
bag.9

## 
## Call:
##  randomForest(formula = ht ~ ., data = w9.x[train9, ], mtry = 3,      ntree = 100, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 8.12711
##                     % Var explained: 7.33

plot(bag.9)

importance(bag.9)

##                                %IncMSE IncNodePurity
## pov                          16.281199      273.7308
## numsib                       12.676136      961.5631
## unsafe                       11.848656      322.9804
## no_work_p1                   16.031613      272.2088
## no_work_p2                   17.149476      376.2385
## factor.housit.rent           15.326968      347.9543
## factor.educ_p1.colgrad       16.108018      268.0056
## factor.educ_p1.less.than.hs   8.384680      180.7207
## factor.educ_p2.colgrad       17.232970      275.9225
## factor.educ_p2.less.than.hs   9.178054      199.1298
## foodinsec                    14.652371      331.5965
## momar                        11.930895      368.0084
## factor.p_raceth.hispanic     19.608276      309.7742
## factor.p_raceth.nh.black     11.691222      256.4184
## factor.p_raceth.nh.multirace  9.019817      145.1444

varImpPlot(bag.9, n.var = 10, main="Figure 3i: Wave 9 Importance Plot", type=1)

Stats II Final Paper

Cristina Martinez

March 10, 2021

Reshape into longitudinal format

K Means Analysis

determine optimal number of clusters

K Means

Add cluster variable to ECLSK

Plot Height Over Time

Random Forest