Regression Line and Diagnostics

data(swiss)
fit1 <- lm(Fertility ~ Agriculture, data = swiss)
#summary(fit1)
plot(swiss$Agriculture, swiss$Fertility)

scatter.smooth(swiss$Agriculture,swiss$Fertility, 
               lpars = list(lty=2),
               xlab = "Agriculture",
               ylab = "Fertility")
abline(fit1, lty = 3)
legend(65, 50, pch = c(1, NA, NA), lty = c(NA, 2:3),
       c("Observations", "Loess line", "Regression line"), bty = "n")

#Data check.
fit2 <- lm(Fertility ~ Agriculture + Education + Examination, swiss)
summary(fit2)

## 
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Examination, 
##     data = swiss)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.967  -4.978  -1.045   4.906  21.358 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 99.80162    7.15523  13.948  < 2e-16 ***
## Agriculture -0.18017    0.08071  -2.232  0.03084 *  
## Education   -0.67242    0.19366  -3.472  0.00119 ** 
## Examination -0.79744    0.24679  -3.231  0.00237 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.601 on 43 degrees of freedom
## Multiple R-squared:  0.5568, Adjusted R-squared:  0.5259 
## F-statistic: 18.01 on 3 and 43 DF,  p-value: 1.017e-07

library(car)

## Loading required package: carData

#multi-colinearity issue
#if sqrt(vif) > 2, eliminate the varialbe or correlated varialbe from the model. otherwise, it is free for the multicolinearity.

vif(fit2)

## Agriculture   Education Examination 
##    2.089159    2.156227    2.410537

sqrt(vif(fit2))

## Agriculture   Education Examination 
##    1.445392    1.468410    1.552590

#outliers, high leverage, influential factor
#it is not for reject the value, it is for check.

influencePlot(fit2)

##                  StudRes        Hat       CookD
## Glane         2.70858956 0.03564719 0.059090300
## La Chauxdfnd -0.27947032 0.18617782 0.004564804
## Neuchatel     2.28076809 0.15637156 0.219592640
## V. De Geneve  0.08668819 0.45326329 0.001594313
## Rive Droite  -1.91608302 0.18382560 0.194632813

#if I want to choose the point
#influencePlot(fit2, id=list(method = "identify"))
#outlier : y-axis check either s.d > 2 or sd < -2, ie 2sd : 95%
#high-leverage point : x-axis check : p/n if > 2 or 3 then check the data again. p include the intercept
#influential observation : Cook's D: k/(n - k- 1) :check the size of the circle. k means just the no of predictors

#linear model using by ols(ordinary least squares)
#nomality, linearity, covariance check.

#adjusting the normality.if not, transform with Y^(lambda)
#after transfromation, then possible to explain with linear model
plot(fit2, which = 2)

powerTransform(swiss$Fertility)

## Estimated transformation parameter 
## swiss$Fertility 
##        1.780718

#lambda value suggest : mostly use +-(2,1,0.5), or 0
#this case: how about Fertility^1.7807

summary(powerTransform(swiss$Fertility))

## bcPower Transformation to Normality 
##                 Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## swiss$Fertility    1.7807           1        0.526       3.0354
## 
## Likelihood ratio test that transformation parameter is equal to 0
##  (log transformation)
##                            LRT df      pval
## LR test, lambda = (0) 9.001312  1 0.0026979
## 
## Likelihood ratio test that no transformation is needed
##                            LRT df    pval
## LR test, lambda = (1) 1.592577  1 0.20696

#check pvalue: if < 0.05, need to transfrom. 
#no differnece before and after. no need to transform

#adjusting linearity
#if the line is curve, then need to change to linear
#change the predictor, x^2 instead of x

boxTidwell(Fertility ~ Agriculture + Education + Examination, data = swiss)

##             MLE of lambda Score Statistic (z) Pr(>|z|)
## Agriculture       0.38300              0.0976   0.9223
## Education         1.53495             -1.2512   0.2109
## Examination       0.55128              1.0866   0.2772
## 
## iterations =  26

#check the lambda, p-value. can try with the lambda value to have linear model, this case no need to change because not significant


#adjusting if it not covairiant.
ncvTest(fit2)

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.9790692    Df = 1     p = 0.3224287

#check pvalue: this case is not significant
#if it is significant: check the spreadevelPlot

spreadLevelPlot(fit2)

## 
## Suggested power transformation:  -0.06163745

#we can adjust to be covariant using by Suggested power transformation, we can transform with Fertility^(suggested value)

fit3 <- lm(Fertility ~. , data = swiss)
summary(fit3)

## 
## Call:
## lm(formula = Fertility ~ ., data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2743  -5.2617   0.5032   4.1198  15.3213 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      66.91518   10.70604   6.250 1.91e-07 ***
## Agriculture      -0.17211    0.07030  -2.448  0.01873 *  
## Examination      -0.25801    0.25388  -1.016  0.31546    
## Education        -0.87094    0.18303  -4.758 2.43e-05 ***
## Catholic          0.10412    0.03526   2.953  0.00519 ** 
## Infant.Mortality  1.07705    0.38172   2.822  0.00734 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.165 on 41 degrees of freedom
## Multiple R-squared:  0.7067, Adjusted R-squared:  0.671 
## F-statistic: 19.76 on 5 and 41 DF,  p-value: 5.594e-10

#let's exclude things not significant

fit4 <- lm(Fertility ~ Agriculture + Education + Catholic + Infant.Mortality, data = swiss)
summary(fit4)

## 
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic + 
##     Infant.Mortality, data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.6765  -6.0522   0.7514   3.1664  16.1422 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      62.10131    9.60489   6.466 8.49e-08 ***
## Agriculture      -0.15462    0.06819  -2.267  0.02857 *  
## Education        -0.98026    0.14814  -6.617 5.14e-08 ***
## Catholic          0.12467    0.02889   4.315 9.50e-05 ***
## Infant.Mortality  1.07844    0.38187   2.824  0.00722 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.168 on 42 degrees of freedom
## Multiple R-squared:  0.6993, Adjusted R-squared:  0.6707 
## F-statistic: 24.42 on 4 and 42 DF,  p-value: 1.717e-10

#this is very tiring to check one by one.
#check the AIC(Akaite's Information Criterion)
#more accounted for or more suitable
AIC(fit3, fit4)

##      df      AIC
## fit3  7 326.0716
## fit4  6 325.2408

#AIC number : the small is the better

#how can we automate the regression model?
#stepwise regression(Backward stepwise, Forward stepwise reg)
#Backward stepwise regression
#full model vs reduced model : -> AIC
#full vs full - D : AIC check.
#full vs full - D - C : AIC check.

full.model = lm(Fertility ~ ., data = swiss)
reduced.model= step(full.model, direction = "backward")

## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## - Examination       1     53.03 2158.1 189.86
## <none>                          2105.0 190.69
## - Agriculture       1    307.72 2412.8 195.10
## - Infant.Mortality  1    408.75 2513.8 197.03
## - Catholic          1    447.71 2552.8 197.75
## - Education         1   1162.56 3267.6 209.36
## 
## Step:  AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          2158.1 189.86
## - Agriculture       1    264.18 2422.2 193.29
## - Infant.Mortality  1    409.81 2567.9 196.03
## - Catholic          1    956.57 3114.6 205.10
## - Education         1   2249.97 4408.0 221.43

summary(reduced.model)

## 
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic + 
##     Infant.Mortality, data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.6765  -6.0522   0.7514   3.1664  16.1422 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      62.10131    9.60489   6.466 8.49e-08 ***
## Agriculture      -0.15462    0.06819  -2.267  0.02857 *  
## Education        -0.98026    0.14814  -6.617 5.14e-08 ***
## Catholic          0.12467    0.02889   4.315 9.50e-05 ***
## Infant.Mortality  1.07844    0.38187   2.824  0.00722 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.168 on 42 degrees of freedom
## Multiple R-squared:  0.6993, Adjusted R-squared:  0.6707 
## F-statistic: 24.42 on 4 and 42 DF,  p-value: 1.717e-10

#Forward stepwise regression
#start model vs add.model with good p-value which means the smallest p 
min.model <- lm(Fertility ~1, data = swiss)
fwd.model <- step(min.model, direction = "forward", 
                  scope = (Fertility ~ Agriculture + Examination + Education + Catholic + Infant.Mortality),
                  trace = 0)
summary(fwd.model)

## 
## Call:
## lm(formula = Fertility ~ Education + Catholic + Infant.Mortality + 
##     Agriculture, data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.6765  -6.0522   0.7514   3.1664  16.1422 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      62.10131    9.60489   6.466 8.49e-08 ***
## Education        -0.98026    0.14814  -6.617 5.14e-08 ***
## Catholic          0.12467    0.02889   4.315 9.50e-05 ***
## Infant.Mortality  1.07844    0.38187   2.824  0.00722 ** 
## Agriculture      -0.15462    0.06819  -2.267  0.02857 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.168 on 42 degrees of freedom
## Multiple R-squared:  0.6993, Adjusted R-squared:  0.6707 
## F-statistic: 24.42 on 4 and 42 DF,  p-value: 1.717e-10

#all subset regression : complemnt the AIC stepwise model
library(leaps)
leaps.model <- regsubsets(Fertility ~ Agriculture + Examination + Education + Catholic + Infant.Mortality, 
                        data = swiss,
                        nbest = 4)
plot(leaps.model, scale= "adjr2") #using adjusted R-square

#best model : reseacher has to have the insight for the phenomena
#need to be more involved in data collecting process

Regression Line and Diagnostics

Ellie Kim

2018-04-24