brainweight <- read.csv("~/Downloads/brainweight.txt", sep="")

Use the pairs function to construct a scatterplot matrix of the logarithms of B, W, and G.

pairs(log(brainweight))

Use the cor function to determine the correlation matrix for the three (logged) variables.

cor(log(brainweight))
##               Brain      Body Gestation
## Brain     1.0000000 0.9642905 0.8912940
## Body      0.9642905 1.0000000 0.8455317
## Gestation 0.8912940 0.8455317 1.0000000

Use the lm function in R to fit the MLR model

fit = lm(log(brainweight))
summary(fit)
## 
## Call:
## lm(formula = log(brainweight))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.00286 -0.30372 -0.05242  0.37851  1.58788 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.45728    0.45848  -0.997    0.321    
## Body         0.55117    0.03236  17.033   <2e-16 ***
## Gestation    0.66782    0.10875   6.141    2e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4902 on 93 degrees of freedom
## Multiple R-squared:  0.9501, Adjusted R-squared:  0.949 
## F-statistic: 885.2 on 2 and 93 DF,  p-value: < 2.2e-16

Create the design/covariate matrix, X, for the model in c) and verify that the coeficient estimates in the summary output are given by the least squares formula. (X’X)-1XTy

y=log(brainweight$Brain)
n=length(y)
X=cbind(rep(1,n),log(brainweight$Body),log(brainweight$Gestation))
XtX=t(X)%*%X         
iXtX=solve(XtX)
bhat=iXtX%*%t(X)%*%y 
betas=cbind(fit$coefficients,bhat)
colnames(betas)=c("Coef in Summmary", "LS formula")
betas #I get the same coeficients.
##             Coef in Summmary LS formula
## (Intercept)       -0.4572826 -0.4572826
## Body               0.5511654  0.5511654
## Gestation          0.6678215  0.6678215

Write R code to compute the hat-matrix and use it to calculate the fitted values and residual vector.

H=X%*%iXtX%*%t(X)
fittedvalues = H%*%y
residualvector=y-fittedvalues

Determine the squared correlation between the fitted values and the response vector.

Rsquared=(cor(y,fittedvalues))^2
Rsquared #This number corresponds to the "Multiple R-squared" value in the summary output. 
##           [,1]
## [1,] 0.9500939

Compute the mean squared error and its square root using the residual vector from 1e.

modeldf=2
MSE=sum(residualvector**2/(n-modeldf-1))
sqrtMSE=sqrt(MSE)
sqrtMSE #This number corresponds to the "Residual standard error" value in the summary output
## [1] 0.4902111

Compute the standard errors for the regression coeficients using the root mean squared error and the design matrix.

Xbar2= cbind(rep(mean(X[,2]),n))
Xd2=sqrt(sum((X[,2]-Xbar2)^2))
se1=sqrtMSE/Xd2
se1 #I used the formula for determining standard error for the regression coefficients, but I must have an error because I couldn't get the same value as the summary output. 
## [1] 0.01727703
Xbar3= cbind(rep(mean(X[,3]),n))
Xd3=sqrt(sum((X[,3]-Xbar3)^2))
se2=sqrtMSE/Xd3
se2
## [1] 0.05806254

Extract the diagonal elements from the hat-matrix. Identify the observation with the most leverage. Determine the fitted value for this observation.

dH=diag(H)
b=order(dH)[n]
b #the 38th observation has the most leverage
## [1] 38
log(brainweight[b,]) 
##       Brain      Body Gestation
## 38 2.054124 -1.514128  4.976734
fittedvalues[b,]  #the fitted value is shown below
## [1] 2.031753

Refit the MLR model with the observation identified in h) omitted. Use the fitted model to predict the response for the omitted observation. Compare this to the corresponding prediction using all the data.

fit1=lm(brainweight,subset=-b) 
summary(fit1)
## 
## Call:
## lm(formula = brainweight, subset = -b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1090.79   -63.34     9.19    67.84  1023.69 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -111.41191   43.29920  -2.573   0.0117 *  
## Body           1.03123    0.09077  11.360  < 2e-16 ***
## Gestation      1.45201    0.27655   5.250 9.71e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 227.4 on 92 degrees of freedom
## Multiple R-squared:  0.8048, Adjusted R-squared:  0.8005 
## F-statistic: 189.6 on 2 and 92 DF,  p-value: < 2.2e-16
responsewith= fit$coefficients[1]+fit$coefficients[2]*log(brainweight[b,])[2]+fit$coefficients[3]*log(brainweight[b,][3])

responsewithout= fit1$coefficients[1]+fit1$coefficients[2]*log(brainweight[b,])[2]+fit1$coefficients[3]*log(brainweight[b,][3])

responses = cbind(responsewith,responsewithout)
colnames(responses)=c("resp. w/ obs. 38", "resp. w/o obs.38")
responses
##    resp. w/ obs. 38 resp. w/o obs.38
## 38         2.031753         -105.747