Question (1)

load("C:/Users/Sima/Desktop/Regression/hw4p1.RData")

# Part a) Fit the correct model and check residuals: Assumptions correct

plot(dat1)

fit1=lm(y~x1+x2,data=dat1)

summary(fit1)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = dat1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -149.155  -13.544    9.408   19.619   36.950 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  337.4580    41.1994   8.191 1.05e-12 ***
## x1             8.0581     0.5566  14.478  < 2e-16 ***
## x2          -101.0442     1.8601 -54.321  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.51 on 97 degrees of freedom
## Multiple R-squared:  0.9685, Adjusted R-squared:  0.9679 
## F-statistic:  1492 on 2 and 97 DF,  p-value: < 2.2e-16
plot(fit1)

library(gvlma)

gvmodel <- gvlma(fit1) 

summary(gvmodel)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = dat1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -149.155  -13.544    9.408   19.619   36.950 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  337.4580    41.1994   8.191 1.05e-12 ***
## x1             8.0581     0.5566  14.478  < 2e-16 ***
## x2          -101.0442     1.8601 -54.321  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.51 on 97 degrees of freedom
## Multiple R-squared:  0.9685, Adjusted R-squared:  0.9679 
## F-statistic:  1492 on 2 and 97 DF,  p-value: < 2.2e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = fit1) 
## 
##                      Value   p-value                   Decision
## Global Stat        248.971 0.000e+00 Assumptions NOT satisfied!
## Skewness            65.686 5.551e-16 Assumptions NOT satisfied!
## Kurtosis           114.756 0.000e+00 Assumptions NOT satisfied!
## Link Function       62.366 2.887e-15 Assumptions NOT satisfied!
## Heteroscedasticity   6.164 1.304e-02 Assumptions NOT satisfied!
# The are Null plots.The more similar residual plots looks to these, the better

# Fit the correct model and check residuals: Assumptions Incorrect

fit2=lm(y~x1+x2+I(x2^2),data=dat1)

summary(fit2)
## 
## Call:
## lm(formula = y ~ x1 + x2 + I(x2^2), data = dat1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.982  -9.470   1.723  10.026  24.820 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 219.6997    18.4444   11.91   <2e-16 ***
## x1            7.7676     0.2377   32.68   <2e-16 ***
## x2          -37.9525     3.1183  -12.17   <2e-16 ***
## I(x2^2)      -6.2923     0.3008  -20.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.43 on 96 degrees of freedom
## Multiple R-squared:  0.9943, Adjusted R-squared:  0.9942 
## F-statistic:  5620 on 3 and 96 DF,  p-value: < 2.2e-16
plot(fit2)

# The estimate for the quadratic term is near 0, so our assumptions are not horribly violated here

fit3=lm(y~x1+I(x2^2),data=dat1)

summary(fit3)
## 
## Call:
## lm(formula = y ~ x1 + I(x2^2), data = dat1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.621 -15.021  -2.051  10.160  89.443 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 150.3572    27.8303   5.403 4.69e-07 ***
## x1            7.4861     0.3753  19.947  < 2e-16 ***
## I(x2^2)      -9.8326     0.1213 -81.027  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.31 on 97 degrees of freedom
## Multiple R-squared:  0.9856, Adjusted R-squared:  0.9853 
## F-statistic:  3320 on 2 and 97 DF,  p-value: < 2.2e-16
plot(fit3)

# This time, the lack of a linear term causes our linearity assumption to be invalid,
# resulting in clear violations in the first plot

# Part b

fit2=lm(y~x1+x2+x2^2+x1:x2,data=dat1)

summary(fit2)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x2^2 + x1:x2, data = dat1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -178.28  -10.22    4.01   15.65   34.16 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -141.74687  108.39146  -1.308    0.194    
## x1            14.53134    1.46421   9.924  < 2e-16 ***
## x2             0.08339   21.54103   0.004    0.997    
## x1:x2         -1.35986    0.28877  -4.709 8.38e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.55 on 96 degrees of freedom
## Multiple R-squared:  0.9744, Adjusted R-squared:  0.9736 
## F-statistic:  1219 on 3 and 96 DF,  p-value: < 2.2e-16
plot(fit2)

Question (2)

# Part a

library(alr3)
## Warning: package 'alr3' was built under R version 3.3.3
## Loading required package: car
## Warning: package 'car' was built under R version 3.3.3
d= BGSall



fit2a=lm(Soma~WT9+ST9+HT9+LG9+Sex,d=BGSall)

summary(fit2a)
## 
## Call:
## lm(formula = Soma ~ WT9 + ST9 + HT9 + LG9 + Sex, data = BGSall)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9929 -0.6606  0.0397  0.4927  3.5740 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.594740   3.411741   2.226  0.02773 *  
## WT9          0.119604   0.043535   2.747  0.00686 ** 
## ST9         -0.022957   0.007376  -3.112  0.00228 ** 
## HT9         -0.056858   0.023985  -2.371  0.01923 *  
## LG9          0.037143   0.095460   0.389  0.69784    
## Sex          1.438481   0.187718   7.663  3.7e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.016 on 130 degrees of freedom
## Multiple R-squared:  0.5169, Adjusted R-squared:  0.4983 
## F-statistic: 27.82 on 5 and 130 DF,  p-value: < 2.2e-16
plot(fit2a)

# Part b

fit2b=lm(Soma~WT18+ST18+HT18+LG18+Sex,d=BGSall)

summary(fit2b)
## 
## Call:
## lm(formula = Soma ~ WT18 + ST18 + HT18 + LG18 + Sex, data = BGSall)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.13147 -0.43226  0.05707  0.35977  2.66561 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.038160   2.411753   4.577 1.09e-05 ***
## WT18         0.103177   0.014236   7.248 3.36e-11 ***
## ST18        -0.015060   0.003061  -4.919 2.58e-06 ***
## HT18        -0.069241   0.012424  -5.573 1.38e-07 ***
## LG18         0.010741   0.048736   0.220   0.8259    
## Sex          0.601937   0.303077   1.986   0.0491 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7797 on 130 degrees of freedom
## Multiple R-squared:  0.7158, Adjusted R-squared:  0.7048 
## F-statistic: 65.47 on 5 and 130 DF,  p-value: < 2.2e-16
plot(fit2b)