require(ISLR)
## Loading required package: ISLR
set.seed(1)
y=rnorm(100)
x=rnorm(100)
y=x-2*x^2+rnorm(100)
plot(x,y)
##Part C – we start by setting the random seed to 1, and converting x and y to a data frame. Then, use the leave one out cv(loocv)for cross validation. Repeat this for n times for each observation as a validation set.
library(boot)
data <- data.frame(x,y)
loocv=function(fit){
h=lm.influence(fit)$h
mean((residuals(fit)/(1-h))^2)
}
cv.error=rep(0,4)
degree=1:4
for(d in degree){
glm.fit=glm(y~poly(x,d))
cv.error[d]=loocv(glm.fit)
}
cv.error
## [1] 5.890979 1.086596 1.102585 1.114772
plot(degree,cv.error,type="b")
glm.fit.1=lm(y~poly(x,1))
glm.fit.2=lm(y~poly(x,2))
glm.fit.3=lm(y~poly(x,3))
glm.fit.4=lm(y~poly(x,4))
#we can conclude that model 2 is the best fit as it has the lowest RSE. #RSE1 - 2.36 | RSE2 = 1.032 | RSE3 = 1.037 | RSE4 = 1.041
summary(glm.fit.1)
##
## Call:
## lm(formula = y ~ poly(x, 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3469 -0.9275 0.8028 1.5608 4.3974
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8277 0.2362 -7.737 9.18e-12 ***
## poly(x, 1) 2.3164 2.3622 0.981 0.329
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.362 on 98 degrees of freedom
## Multiple R-squared: 0.009717, Adjusted R-squared: -0.0003881
## F-statistic: 0.9616 on 1 and 98 DF, p-value: 0.3292
summary(glm.fit.2)
##
## Call:
## lm(formula = y ~ poly(x, 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.89884 -0.53765 0.04135 0.61490 2.73607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8277 0.1032 -17.704 <2e-16 ***
## poly(x, 2)1 2.3164 1.0324 2.244 0.0271 *
## poly(x, 2)2 -21.0586 1.0324 -20.399 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.032 on 97 degrees of freedom
## Multiple R-squared: 0.8128, Adjusted R-squared: 0.8089
## F-statistic: 210.6 on 2 and 97 DF, p-value: < 2.2e-16
summary(glm.fit.3)
##
## Call:
## lm(formula = y ~ poly(x, 3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.87250 -0.53881 0.02862 0.59383 2.74350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8277 0.1037 -17.621 <2e-16 ***
## poly(x, 3)1 2.3164 1.0372 2.233 0.0279 *
## poly(x, 3)2 -21.0586 1.0372 -20.302 <2e-16 ***
## poly(x, 3)3 -0.3048 1.0372 -0.294 0.7695
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.037 on 96 degrees of freedom
## Multiple R-squared: 0.813, Adjusted R-squared: 0.8071
## F-statistic: 139.1 on 3 and 96 DF, p-value: < 2.2e-16
summary(glm.fit.4)
##
## Call:
## lm(formula = y ~ poly(x, 4))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8914 -0.5244 0.0749 0.5932 2.7796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8277 0.1041 -17.549 <2e-16 ***
## poly(x, 4)1 2.3164 1.0415 2.224 0.0285 *
## poly(x, 4)2 -21.0586 1.0415 -20.220 <2e-16 ***
## poly(x, 4)3 -0.3048 1.0415 -0.293 0.7704
## poly(x, 4)4 -0.4926 1.0415 -0.473 0.6373
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.041 on 95 degrees of freedom
## Multiple R-squared: 0.8134, Adjusted R-squared: 0.8055
## F-statistic: 103.5 on 4 and 95 DF, p-value: < 2.2e-16
#medv stands for median value of owner-occupied homes in $100s.
library(MASS)
attach(Boston)
mu.hat <- mean(medv)
mu.hat
## [1] 22.53281
NumberOfObservations = dim(Boston)[1]
NumberOfObservations
## [1] 506
S.d.Hat.Mu.Hat = sd(medv)/(NumberOfObservations ^ .5)
S.d.Hat.Mu.Hat
## [1] 0.4088611
require(boot)
muHat.fn=function(data, index){
X = data$medv[index]
return(mean(X))
}
boot.out=boot(Boston,muHat.fn,R=1000)
boot.out
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = muHat.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007654348 0.4094899
#Part E – Muhat.med stands for the median value of muHat, which is 21.2.
MuHat.Med = median(medv)
MuHat.Med
## [1] 21.2
#Part F – estimate the standard error of the median using the bootstrap method, giving a estimated value of 12.75 and a std error of 0.491.
muHat.Med.fn=function(data, index){
X = data$medv[index]
return(median(X))
}
boot.out=boot(Boston,muHat.Med.fn,R=1000)
boot.out
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = muHat.Med.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0399 0.3771209
?quantile
quantile(medv,0.1)
## 10%
## 12.75