oxboys <- read.csv("oxboys.csv")
attach(oxboys)
#Plot the relationship between height and year, and draw a linearly regressed line ignoring the id variable
HeightAvg <- lm(height~year)
plot(height~year)
abline(HeightAvg$coef[1],HeightAvg$coef[2],col=2,lwd=2)
library(ggplot2)
ggplot(oxboys, aes(x=year, y=height, group=id)) + stat_smooth(method="lm")+ geom_point()
oxboys.train <- subset(oxboys, year < 3)
oxboys.test <- subset(oxboys, year >= 3)
#unique(oxboys$year)
#Global Model
global <- lm(height ~ year, data = oxboys.train)
#summary(global)
global.pred <- predict(global,oxboys.test)
glob.mse <- mean((global.pred - oxboys.test$height)^2)
cat("The MSE for global model is",glob.mse)
## The MSE for global model is 68.73458
loc.mse.list = list()
#Local Model
for( key in unique(oxboys.train$id)){
oxboys.train.ind <- subset(oxboys.train, id==key)
oxboys.test.ind <- subset(oxboys.test, id==key)
# build a linear model with oxboys.train.ind
# predict on oxboys.test.ind, and measure MSE
local <- lm(height ~ year, data = oxboys.train.ind)
#summary(global)
local.pred <- predict(local,oxboys.test.ind)
loc.mse <- mean((local.pred - oxboys.test.ind$height)^2)
loc.mse.list[[key]] = loc.mse
#print (c(key,loc.mse))
}
cat("The MSE for local model for each child is")
## The MSE for local model for each child is
loc.mse.list
## [[1]]
## [1] 29.45143
##
## [[2]]
## [1] 15.84143
##
## [[3]]
## [1] 16.55714
##
## [[4]]
## [1] 11.53143
##
## [[5]]
## [1] 0.6052571
##
## [[6]]
## [1] 3.224286
##
## [[7]]
## [1] 0.9
##
## [[8]]
## [1] 3.587729
##
## [[9]]
## [1] 0.4642857
##
## [[10]]
## [1] 29.23173
##
## [[11]]
## [1] 14.50571
##
## [[12]]
## [1] 0.3342857
##
## [[13]]
## [1] 32.17
##
## [[14]]
## [1] 6.702857
##
## [[15]]
## [1] 0.3720143
##
## [[16]]
## [1] 24.31714
##
## [[17]]
## [1] 5.382857
##
## [[18]]
## [1] 17.26429
##
## [[19]]
## [1] 34.11857
##
## [[20]]
## [1] 14.09429
##
## [[21]]
## [1] 9.972129
##
## [[22]]
## [1] 7.658571
##
## [[23]]
## [1] 1.595714
##
## [[24]]
## [1] 35.21714
##
## [[25]]
## [1] 0.7428571
##
## [[26]]
## [1] 12.35857
#Multimodel
library(nlme)
mlm.obj <- lme(height~year, data=oxboys.train, random=list(id=pdDiag(~year)))
mlm.pred <- predict(mlm.obj, newdata=oxboys.test, level=1)
# calculate MSE
mlm.mse <- mean((mlm.pred - oxboys.test$height)^2)
cat("The MSE for multilayer model is",mlm.mse)
## The MSE for multilayer model is 4.234867
oxboys.train <- subset(oxboys, year <= 6)
oxboys.test <- subset(oxboys, year > 6)
#unique(oxboys.test$year)
#Global Model
global <- lm(height ~ year, data = oxboys.train)
#summary(global)
global.pred <- predict(global,oxboys.test)
glob.mse <- mean((global.pred - oxboys.test$height)^2)
cat("The MSE for global model is",glob.mse)
## The MSE for global model is 80.06475
loc.mse.list.6 = list()
#Local Model
for( key in unique(oxboys.train$id)){
oxboys.train.ind <- subset(oxboys.train, id==key)
oxboys.test.ind <- subset(oxboys.test, id==key)
# build a linear model with oxboys.train.ind
# predict on oxboys.test.ind, and measure MSE
local <- lm(height ~ year, data = oxboys.train.ind)
#summary(global)
local.pred <- predict(local,oxboys.test.ind)
loc.mse <- mean((local.pred - oxboys.test.ind$height)^2)
loc.mse.list.6[[key]] = loc.mse
#print (c(key,loc.mse))
}
cat("The MSE for local model for each child is")
## The MSE for local model for each child is
loc.mse.list.6
## [[1]]
## [1] 0.1229778
##
## [[2]]
## [1] 0.2713088
##
## [[3]]
## [1] 1.643045
##
## [[4]]
## [1] 0.8346104
##
## [[5]]
## [1] 4.409226
##
## [[6]]
## [1] 0.9661224
##
## [[7]]
## [1] 0.9508844
##
## [[8]]
## [1] 0.4189132
##
## [[9]]
## [1] 3.104649
##
## [[10]]
## [1] 1.2579
##
## [[11]]
## [1] 3.118141
##
## [[12]]
## [1] 0.6629279
##
## [[13]]
## [1] 10.10042
##
## [[14]]
## [1] 3.396239
##
## [[15]]
## [1] 1.242047
##
## [[16]]
## [1] 0.03240091
##
## [[17]]
## [1] 2.679385
##
## [[18]]
## [1] 0.1837415
##
## [[19]]
## [1] 10.2312
##
## [[20]]
## [1] 0.6678005
##
## [[21]]
## [1] 5.235413
##
## [[22]]
## [1] 2.25129
##
## [[23]]
## [1] 1.599921
##
## [[24]]
## [1] 10.94954
##
## [[25]]
## [1] 1.823607
##
## [[26]]
## [1] 0.5588227
#Multimodel
#library(nlme)
mlm.obj <- lme(height~year, data=oxboys.train, random=list(id=pdDiag(~year)))
mlm.pred <- predict(mlm.obj, newdata=oxboys.test, level=1)
# calculate MSE
mlm.mse <- mean((mlm.pred - oxboys.test$height)^2)
cat("The MSE for multilayer model is",mlm.mse)
## The MSE for multilayer model is 2.720906
Global model performs better in case of a smaller training set since larger the test set, larger the residuals in the kind of dataset we have. Multilayer model performs better with larger training set. It would not perform very well, when there are lesser nested layers.
B00 is the overall intercept. Its the mean intercept when predictors in all nested layers are equal to zero.
and B10 is the overall regression coefficient between the dependent variable and level 1 predictor
Comment on differences between (a) and (b)
Plot (a) shows the regression curve fitting for the entire dataset as it is. Since height increases with age and tall kids have a tendency to grow taller and faster than relatively shorter kids, a simple linear regression may not be the most optimum solution to determine the height of a child based at a later age. Plot (b) instead looks into the best fit line for different children at different ages and can predict the height of a child much more accurately as he grows older.