data(swiss)  # Swiss fertility and socio-economic indicators (1888), 47 provinces
fit1 <- lm(Fertility ~ Agriculture, data = swiss)  # simple linear regression
#summary(fit1)
plot(swiss$Agriculture, swiss$Fertility)

scatter.smooth(swiss$Agriculture, swiss$Fertility,
               lpars = list(lty = 2),
               xlab = "Agriculture",
               ylab = "Fertility")
abline(fit1, lty = 3)
legend(65, 50, legend = c("Observations", "Loess line", "Regression line"),
       pch = c(1, NA, NA), lty = c(NA, 2, 3), bty = "n")

# Data/model check: fit a multiple regression and run the diagnostics below.
fit2 <- lm(Fertility ~ Agriculture + Education + Examination, swiss)
summary(fit2)
##
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Examination,
## data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.967 -4.978 -1.045 4.906 21.358
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.80162 7.15523 13.948 < 2e-16 ***
## Agriculture -0.18017 0.08071 -2.232 0.03084 *
## Education -0.67242 0.19366 -3.472 0.00119 **
## Examination -0.79744 0.24679 -3.231 0.00237 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.601 on 43 degrees of freedom
## Multiple R-squared: 0.5568, Adjusted R-squared: 0.5259
## F-statistic: 18.01 on 3 and 43 DF, p-value: 1.017e-07
library(car)
## Loading required package: carData
# Multicollinearity check (a sketch applying the rule follows the VIF output below).
# Rule of thumb: if sqrt(VIF) > 2 for a predictor, consider dropping that variable
# (or the variable it is correlated with) from the model; otherwise multicollinearity
# is not a concern.
vif(fit2)
## Agriculture Education Examination
## 2.089159 2.156227 2.410537
sqrt(vif(fit2))
## Agriculture Education Examination
## 1.445392 1.468410 1.552590
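# A minimal sketch of the rule above (assumes car is loaded and fit2 exists; the
# object name vif.sqrt is illustrative): flag any predictor whose sqrt(VIF) exceeds 2.
vif.sqrt <- sqrt(vif(fit2))
vif.sqrt[vif.sqrt > 2]   # empty here: no predictor exceeds the cutoff
any(vif.sqrt > 2)        # FALSE, so multicollinearity is not a concern by this rule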
# Outliers, high-leverage points, and influential observations.
# This is a diagnostic check, not a rule for automatically discarding observations.
influencePlot(fit2)

## StudRes Hat CookD
## Glane 2.70858956 0.03564719 0.059090300
## La Chauxdfnd -0.27947032 0.18617782 0.004564804
## Neuchatel 2.28076809 0.15637156 0.219592640
## V. De Geneve 0.08668819 0.45326329 0.001594313
## Rive Droite -1.91608302 0.18382560 0.194632813
# To pick points interactively:
# influencePlot(fit2, id = list(method = "identify"))
# Outliers (y-axis): studentized residuals beyond about +/- 2 (roughly the 95% range).
# High-leverage points (x-axis): hat values above 2*p/n or 3*p/n deserve a second look
# at the data; p counts the parameters including the intercept.
# Influential observations: Cook's distance (circle size), with a common cutoff of
# about 4/(n - k - 1), where k is the number of predictors only.
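# A minimal sketch of the cutoffs above (the exact thresholds are conventions, not
# fixed rules; p, n, k are illustrative names).
p <- length(coef(fit2))                         # parameters incl. intercept (4)
n <- nrow(swiss)                                # observations (47)
k <- p - 1                                      # predictors only (3)
which(abs(rstudent(fit2)) > 2)                  # candidate outliers
which(hatvalues(fit2) > 2 * p / n)              # high-leverage points
which(cooks.distance(fit2) > 4 / (n - k - 1))   # influential observations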
# The linear model is fitted by OLS (ordinary least squares).
# Check normality, linearity, and constant variance of the residuals.
# Adjusting for normality: if the residuals are not normal, transform the response
# as Y^lambda; after the transformation a linear model may fit again.
plot(fit2, which = 2)  # normal Q-Q plot of the residuals (normality check)

powerTransform(swiss$Fertility)
## Estimated transformation parameter
## swiss$Fertility
## 1.780718
# Suggested lambda values are usually rounded to +/-2, 1, 0.5, or 0 (log).
# Here the estimate suggests roughly Fertility^1.78.
summary(powerTransform(swiss$Fertility))
## bcPower Transformation to Normality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## swiss$Fertility 1.7807 1 0.526 3.0354
##
## Likelihood ratio test that transformation parameter is equal to 0
## (log transformation)
## LRT df pval
## LR test, lambda = (0) 9.001312 1 0.0026979
##
## Likelihood ratio test that no transformation is needed
## LRT df pval
## LR test, lambda = (1) 1.592577 1 0.20696
# Check the p-value of the test that lambda = 1: if it is < 0.05, a transformation
# is needed. Here p = 0.207, so the transformed and untransformed models are not
# meaningfully different and no transformation is needed (a sketch follows anyway).
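# A hedged sketch only (fit2.bc is an illustrative name): had the test been
# significant, the response could be raised to the estimated power before refitting.
fit2.bc <- lm(I(Fertility^1.7807) ~ Agriculture + Education + Examination, data = swiss)
# summary(fit2.bc)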
# Adjusting for linearity.
# If the fitted relationship is curved, it needs to be made linear,
# e.g. by replacing a predictor x with a power such as x^2.
boxTidwell(Fertility ~ Agriculture + Education + Examination, data = swiss)
## MLE of lambda Score Statistic (z) Pr(>|z|)
## Agriculture 0.38300 0.0976 0.9223
## Education 1.53495 -1.2512 0.2109
## Examination 0.55128 1.0866 0.2772
##
## iterations = 26
# Check each estimated lambda and its p-value; a significant result suggests refitting
# with that power of the predictor. Here nothing is significant, so no change is
# needed (a hypothetical sketch follows anyway).
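# A hedged sketch only (fit2.bt is an illustrative name): had a lambda been
# significant, that predictor could be raised to its estimated power, e.g.
fit2.bt <- lm(Fertility ~ I(Agriculture^0.383) + Education + Examination, data = swiss)
# summary(fit2.bt)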
# Adjusting for non-constant variance (heteroscedasticity).
ncvTest(fit2)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.9790692 Df = 1 p = 0.3224287
# Check the p-value: here it is not significant, so constant variance is plausible.
# If it were significant, examine spreadLevelPlot().
spreadLevelPlot(fit2)

##
## Suggested power transformation: -0.06163745
# If needed, the variance can be stabilised by transforming the response with the
# suggested power, i.e. Fertility^(suggested value); a sketch follows.
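# A hedged sketch only (fit2.log is an illustrative name): the suggested power of
# about -0.06 is close to 0, which in the power family corresponds to a log
# transform of the response.
fit2.log <- lm(log(Fertility) ~ Agriculture + Education + Examination, data = swiss)
# summary(fit2.log)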
fit3 <- lm(Fertility ~ ., data = swiss)
summary(fit3)
##
## Call:
## lm(formula = Fertility ~ ., data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2743 -5.2617 0.5032 4.1198 15.3213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.91518 10.70604 6.250 1.91e-07 ***
## Agriculture -0.17211 0.07030 -2.448 0.01873 *
## Examination -0.25801 0.25388 -1.016 0.31546
## Education -0.87094 0.18303 -4.758 2.43e-05 ***
## Catholic 0.10412 0.03526 2.953 0.00519 **
## Infant.Mortality 1.07705 0.38172 2.822 0.00734 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.165 on 41 degrees of freedom
## Multiple R-squared: 0.7067, Adjusted R-squared: 0.671
## F-statistic: 19.76 on 5 and 41 DF, p-value: 5.594e-10
# Drop the predictor that is not significant (Examination).
fit4 <- lm(Fertility ~ Agriculture + Education + Catholic + Infant.Mortality, data = swiss)
summary(fit4)
##
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic +
## Infant.Mortality, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6765 -6.0522 0.7514 3.1664 16.1422
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.10131 9.60489 6.466 8.49e-08 ***
## Agriculture -0.15462 0.06819 -2.267 0.02857 *
## Education -0.98026 0.14814 -6.617 5.14e-08 ***
## Catholic 0.12467 0.02889 4.315 9.50e-05 ***
## Infant.Mortality 1.07844 0.38187 2.824 0.00722 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.168 on 42 degrees of freedom
## Multiple R-squared: 0.6993, Adjusted R-squared: 0.6707
## F-statistic: 24.42 on 4 and 42 DF, p-value: 1.717e-10
# Checking predictors one by one is tedious.
# Compare whole models with AIC (Akaike's Information Criterion), which balances
# how much the model accounts for against its complexity.
AIC(fit3, fit4)
## df AIC
## fit3 7 326.0716
## fit4 6 325.2408
# The smaller the AIC, the better the model.
# How can model selection be automated?
# Stepwise regression (backward stepwise or forward stepwise).
# Backward stepwise regression: compare the full model against reduced models by AIC,
# e.g. full vs. full - D, then full vs. full - D - C, and so on.
full.model <- lm(Fertility ~ ., data = swiss)
reduced.model <- step(full.model, direction = "backward")
## Start: AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic +
## Infant.Mortality
##
## Df Sum of Sq RSS AIC
## - Examination 1 53.03 2158.1 189.86
## <none> 2105.0 190.69
## - Agriculture 1 307.72 2412.8 195.10
## - Infant.Mortality 1 408.75 2513.8 197.03
## - Catholic 1 447.71 2552.8 197.75
## - Education 1 1162.56 3267.6 209.36
##
## Step: AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
##
## Df Sum of Sq RSS AIC
## <none> 2158.1 189.86
## - Agriculture 1 264.18 2422.2 193.29
## - Infant.Mortality 1 409.81 2567.9 196.03
## - Catholic 1 956.57 3114.6 205.10
## - Education 1 2249.97 4408.0 221.43
summary(reduced.model)
##
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic +
## Infant.Mortality, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6765 -6.0522 0.7514 3.1664 16.1422
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.10131 9.60489 6.466 8.49e-08 ***
## Agriculture -0.15462 0.06819 -2.267 0.02857 *
## Education -0.98026 0.14814 -6.617 5.14e-08 ***
## Catholic 0.12467 0.02889 4.315 9.50e-05 ***
## Infant.Mortality 1.07844 0.38187 2.824 0.00722 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.168 on 42 degrees of freedom
## Multiple R-squared: 0.6993, Adjusted R-squared: 0.6707
## F-statistic: 24.42 on 4 and 42 DF, p-value: 1.717e-10
# Forward stepwise regression:
# start from the intercept-only model and, at each step, add the predictor that
# lowers the AIC the most.
min.model <- lm(Fertility ~ 1, data = swiss)
fwd.model <- step(min.model, direction = "forward",
                  scope = Fertility ~ Agriculture + Examination + Education + Catholic + Infant.Mortality,
                  trace = 0)
summary(fwd.model)
##
## Call:
## lm(formula = Fertility ~ Education + Catholic + Infant.Mortality +
## Agriculture, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6765 -6.0522 0.7514 3.1664 16.1422
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.10131 9.60489 6.466 8.49e-08 ***
## Education -0.98026 0.14814 -6.617 5.14e-08 ***
## Catholic 0.12467 0.02889 4.315 9.50e-05 ***
## Infant.Mortality 1.07844 0.38187 2.824 0.00722 **
## Agriculture -0.15462 0.06819 -2.267 0.02857 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.168 on 42 degrees of freedom
## Multiple R-squared: 0.6993, Adjusted R-squared: 0.6707
## F-statistic: 24.42 on 4 and 42 DF, p-value: 1.717e-10
# All-subsets regression: a complement to AIC-based stepwise selection.
library(leaps)
leaps.model <- regsubsets(Fertility ~ Agriculture + Examination + Education + Catholic + Infant.Mortality,
                          data = swiss,
                          nbest = 4)
plot(leaps.model, scale = "adjr2")  # rank subsets by adjusted R-squared (best subset extracted below)
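# A minimal sketch (leaps.sum and best.id are illustrative names): extract the subset
# with the highest adjusted R-squared instead of reading it off the plot.
leaps.sum <- summary(leaps.model)
best.id <- which.max(leaps.sum$adjr2)   # index of the best subset
coef(leaps.model, best.id)              # coefficients of that subset's model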

# Choosing the best model ultimately requires the researcher's insight into the
# phenomenon being studied, and closer involvement in the data-collection process.