Part A – We use the set.seed function so that the random numbers we generate are reproducible across runs. Both x and y start as vectors of normally distributed random numbers; y is then redefined as a quadratic function of x plus noise.

require(ISLR)
## Loading required package: ISLR
set.seed(1)                    # fix the RNG seed for reproducibility
y=rnorm(100)
x=rnorm(100)
y=x-2*x^2+rnorm(100)           # true model: quadratic in x plus noise

Part B – Scatter plot of x and y: the plot suggests a strong non-linear (quadratic) relationship between x and y, meaning that a change in x does not correspond to a constant change in y.

plot(x,y)
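
As a quick visual check (an addition to the original output), we can overlay the true mean function y = x - 2x^2 on the scatter plot; the points should track this curve closely.

plot(x,y)
curve(x - 2*x^2, add=TRUE, col="red", lwd=2)   # known data-generating curve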

Part C – We convert x and y to a data frame and compute the leave-one-out cross-validation (LOOCV) error for polynomial fits of degree 1 through 4. Conceptually, LOOCV refits the model n times, each time holding out one observation as the validation set; for a linear model the same quantity can be computed from a single fit using the shortcut formula LOOCV = (1/n) Σ ((y_i − ŷ_i)/(1 − h_i))², where h_i is the leverage of observation i. This shortcut is what the loocv function below implements.

library(boot)

data <- data.frame(x,y)


loocv=function(fit){
  h=lm.influence(fit)$h              # leverage values h_i
  mean((residuals(fit)/(1-h))^2)     # LOOCV error from a single fit
}

cv.error=rep(0,4)
degree=1:4
for(d in degree){
  glm.fit=glm(y~poly(x,d), data=data)  # polynomial fit of degree d
  cv.error[d]=loocv(glm.fit)
}
cv.error 
## [1] 5.890979 1.086596 1.102585 1.114772
plot(degree,cv.error,type="b") 
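
As a sanity check (this step is an addition, not part of the original analysis), boot::cv.glm computes LOOCV by actually refitting the model n times, so its raw estimate delta[1] should reproduce the shortcut values above.

cv.check=rep(0,4)
for(d in degree){
  glm.fit=glm(y~poly(x,d), data=data)
  cv.check[d]=cv.glm(data, glm.fit)$delta[1]   # brute-force LOOCV refit
}
cv.check   # should match cv.error up to rounding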

Now we fit four linear models with lm, adding one polynomial degree (one extra coefficient) at each step, so that summary reports the residual standard error for each fit.

glm.fit.1=lm(y~poly(x,1))
glm.fit.2=lm(y~poly(x,2))
glm.fit.3=lm(y~poly(x,3))
glm.fit.4=lm(y~poly(x,4))

Part E & f – looking at the residual standard errors,

#we can conclude that model 2 is the best fit as it has the lowest RSE. #RSE1 - 2.36 | RSE2 = 1.032 | RSE3 = 1.037 | RSE4 = 1.041

summary(glm.fit.1)
## 
## Call:
## lm(formula = y ~ poly(x, 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3469 -0.9275  0.8028  1.5608  4.3974 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.8277     0.2362  -7.737 9.18e-12 ***
## poly(x, 1)    2.3164     2.3622   0.981    0.329    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.362 on 98 degrees of freedom
## Multiple R-squared:  0.009717,   Adjusted R-squared:  -0.0003881 
## F-statistic: 0.9616 on 1 and 98 DF,  p-value: 0.3292
summary(glm.fit.2)
## 
## Call:
## lm(formula = y ~ poly(x, 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.89884 -0.53765  0.04135  0.61490  2.73607 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.8277     0.1032 -17.704   <2e-16 ***
## poly(x, 2)1   2.3164     1.0324   2.244   0.0271 *  
## poly(x, 2)2 -21.0586     1.0324 -20.399   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.032 on 97 degrees of freedom
## Multiple R-squared:  0.8128, Adjusted R-squared:  0.8089 
## F-statistic: 210.6 on 2 and 97 DF,  p-value: < 2.2e-16
summary(glm.fit.3)
## 
## Call:
## lm(formula = y ~ poly(x, 3))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.87250 -0.53881  0.02862  0.59383  2.74350 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.8277     0.1037 -17.621   <2e-16 ***
## poly(x, 3)1   2.3164     1.0372   2.233   0.0279 *  
## poly(x, 3)2 -21.0586     1.0372 -20.302   <2e-16 ***
## poly(x, 3)3  -0.3048     1.0372  -0.294   0.7695    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.037 on 96 degrees of freedom
## Multiple R-squared:  0.813,  Adjusted R-squared:  0.8071 
## F-statistic: 139.1 on 3 and 96 DF,  p-value: < 2.2e-16
summary(glm.fit.4)
## 
## Call:
## lm(formula = y ~ poly(x, 4))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8914 -0.5244  0.0749  0.5932  2.7796 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.8277     0.1041 -17.549   <2e-16 ***
## poly(x, 4)1   2.3164     1.0415   2.224   0.0285 *  
## poly(x, 4)2 -21.0586     1.0415 -20.220   <2e-16 ***
## poly(x, 4)3  -0.3048     1.0415  -0.293   0.7704    
## poly(x, 4)4  -0.4926     1.0415  -0.473   0.6373    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.041 on 95 degrees of freedom
## Multiple R-squared:  0.8134, Adjusted R-squared:  0.8055 
## F-statistic: 103.5 on 4 and 95 DF,  p-value: < 2.2e-16
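
As a further check (an addition to the original write-up), sequential F-tests on the nested fits lead to the same conclusion: terms beyond the quadratic add no significant explanatory power.

anova(glm.fit.1, glm.fit.2, glm.fit.3, glm.fit.4)   # nested-model F-tests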

Bootstrap Techniques

Part A – We start off by loading the MASS package and attaching the Boston data, then estimate the population mean of medv (the median value of owner-occupied homes, in $1000s) with the sample mean.

library(MASS)
attach(Boston)
mu.hat <- mean(medv)
mu.hat
## [1] 22.53281

Part B – We get the number of observations with dim, giving n = 506. Then we compute the standard error of the sample mean by dividing the sample standard deviation by the square root of the number of observations: SE(μ̂) = sd(medv)/√n ≈ 0.4089.

NumberOfObservations = dim(Boston)[1]
NumberOfObservations 
## [1] 506
S.d.Hat.Mu.Hat = sd(medv)/(NumberOfObservations ^ .5)
S.d.Hat.Mu.Hat
## [1] 0.4088611

Part C – Now we estimate the standard error of μ̂ using the bootstrap. The bootstrap estimate of 0.4095 is very close to the formula-based estimate of 0.4089 found in (b).

require(boot)
muHat.fn=function(data, index){
  X = data$medv[index]      # bootstrap resample of medv
  return(mean(X))
}

boot.out=boot(Boston,muHat.fn,R=1000)
boot.out 
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = muHat.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 0.007654348   0.4094899
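
As a follow-up (an addition, not in the original output), an approximate 95% confidence interval for the population mean can be formed as μ̂ ± 2·SE(μ̂) using the bootstrap standard error; t.test(medv) gives a very similar interval.

se.boot = sd(boot.out$t[,1])                 # SE from the bootstrap replicates
c(mu.hat - 2*se.boot, mu.hat + 2*se.boot)    # approximate 95% CI for the mean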

Part E – We estimate the population median of medv with the sample median, MuHat.Med, which is 21.2.

MuHat.Med = median(medv)
MuHat.Med 
## [1] 21.2

Part F – We estimate the standard error of the median using the bootstrap: the point estimate of 21.2 is unchanged, with an estimated standard error of about 0.377.

muHat.Med.fn=function(data, index){
  X = data$medv[index]
  return(median(X))
}
boot.out=boot(Boston,muHat.Med.fn,R=1000)
boot.out 
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = muHat.Med.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 -0.0399   0.3771209
Part G – Finally, we estimate the tenth percentile of medv with the quantile function, giving 12.75.

quantile(medv,0.1) 
##   10% 
## 12.75
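
Following the same pattern (this block is an addition, assuming a standard error for the tenth percentile is also wanted), the bootstrap applies to this statistic just as it did to the mean and the median.

muHat.Tenth.fn=function(data, index){
  X = data$medv[index]
  return(quantile(X, 0.1))   # tenth percentile of the bootstrap resample
}
boot(Boston, muHat.Tenth.fn, R=1000)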