library(tidyverse)
8.2.2. Simple linear regression
fit <- lm(weight ~ height, data = women)
summary(fit)
##
## Call:
## lm(formula = weight ~ height, data = women)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7333 -1.1333 -0.3833 0.7417 3.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
## height 3.45000 0.09114 37.85 1.09e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
## F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
women$weight
## [1] 115 117 120 123 126 129 132 135 139 142 146 150 154 159 164
fitted(fit)
## 1 2 3 4 5 6 7 8
## 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333
## 9 10 11 12 13 14 15
## 140.1833 143.6333 147.0833 150.5333 153.9833 157.4333 160.8833
residuals(fit)
## 1 2 3 4 5 6
## 2.41666667 0.96666667 0.51666667 0.06666667 -0.38333333 -0.83333333
## 7 8 9 10 11 12
## -1.28333333 -1.73333333 -1.18333333 -1.63333333 -1.08333333 -0.53333333
## 13 14 15
## 0.01666667 1.56666667 3.11666667
plot(women$height, women$weight,
     xlab = "Height (in inches)",
     ylab = "Weight (in pounds)")
abline(fit)
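Once the model is fit, predict() applies it to new heights; a minimal sketch (the two heights below are arbitrary illustration values):
predict(fit, newdata = data.frame(height = c(63.5, 68.5)),
        interval = "confidence")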
8.2.3. Polynomial regression
fit2 <- lm(weight ~ height + I(height^2), data = women)
summary(fit2)
##
## Call:
## lm(formula = weight ~ height + I(height^2), data = women)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50941 -0.29611 -0.00941 0.28615 0.59706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 261.87818 25.19677 10.393 2.36e-07 ***
## height -7.34832 0.77769 -9.449 6.58e-07 ***
## I(height^2) 0.08306 0.00598 13.891 9.32e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3841 on 12 degrees of freedom
## Multiple R-squared: 0.9995, Adjusted R-squared: 0.9994
## F-statistic: 1.139e+04 on 2 and 12 DF, p-value: < 2.2e-16
plot(women$height, women$weight,
xlab = "Height (in inches)",
ylab = "Weight (in lbs)")
lines(women$height,fitted(fit2))
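Since the tidyverse is already loaded, the same quadratic fit can be drawn with ggplot2. A sketch; geom_smooth() refits the model internally, so the curve should match fitted(fit2):
ggplot(women, aes(height, weight)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) +
  labs(x = "Height (in inches)", y = "Weight (in lbs)")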
fit3 <- lm(weight ~ height + I(height^2) + I(height^3), data = women)
summary(fit3)
plot(women$height, women$weight,
xlab = "Height (in inches)",
ylab = "Weight (in lbs)")
lines(women$height,fitted(fit3))
library(car)
scatterplot(weight ~ height, data = women,
spread = FALSE, smoother.args=list(lty=2), pch=19,
main = "Women Age 30-39",
xlab = "Height (inches)",
ylab = "Weight (lbs.)")
8.2.4. Multiple linear regression
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
8.3. Examining bivariate relationships
cor(states)
## Murder Population Illiteracy Income Frost
## Murder 1.0000000 0.3436428 0.7029752 -0.2300776 -0.5388834
## Population 0.3436428 1.0000000 0.1076224 0.2082276 -0.3321525
## Illiteracy 0.7029752 0.1076224 1.0000000 -0.4370752 -0.6719470
## Income -0.2300776 0.2082276 -0.4370752 1.0000000 0.2262822
## Frost -0.5388834 -0.3321525 -0.6719470 0.2262822 1.0000000
library(car)
scatterplotMatrix(states, spread=FALSE, smoother.args=list(lty=2),
main="Scatter Plot Matrix")
8.4. Multiple linear regression
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
summary(fit)
##
## Call:
## lm(formula = Murder ~ Population + Illiteracy + Income + Frost,
## data = states)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7960 -1.6495 -0.0811 1.4815 7.6210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.235e+00 3.866e+00 0.319 0.7510
## Population 2.237e-04 9.052e-05 2.471 0.0173 *
## Illiteracy 4.143e+00 8.744e-01 4.738 2.19e-05 ***
## Income 6.442e-05 6.837e-04 0.094 0.9253
## Frost 5.813e-04 1.005e-02 0.058 0.9541
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.535 on 45 degrees of freedom
## Multiple R-squared: 0.567, Adjusted R-squared: 0.5285
## F-statistic: 14.73 on 4 and 45 DF, p-value: 9.133e-08
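The fitted equation can be applied to new predictor values with predict(). The state profile below is hypothetical, chosen only to illustrate the call (Population is in thousands, Frost in days, as in state.x77):
newstate <- data.frame(Population = 2000, Illiteracy = 1.5,
                       Income = 4500, Frost = 100)
predict(fit, newdata = newstate, interval = "prediction")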
8.2.5. Multiple linear regression with interactions
fit <- lm(mpg ~ hp + wt + hp:wt, data = mtcars)
summary(fit)
##
## Call:
## lm(formula = mpg ~ hp + wt + hp:wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0632 -1.6491 -0.7362 1.4211 4.5513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.80842 3.60516 13.816 5.01e-14 ***
## hp -0.12010 0.02470 -4.863 4.04e-05 ***
## wt -8.21662 1.26971 -6.471 5.20e-07 ***
## hp:wt 0.02785 0.00742 3.753 0.000811 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.153 on 28 degrees of freedom
## Multiple R-squared: 0.8848, Adjusted R-squared: 0.8724
## F-statistic: 71.66 on 3 and 28 DF, p-value: 2.981e-13
library(effects)
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
plot(effect("hp:wt", fit, xlevels = list(wt = c(2.2, 3.2, 4.2))), multiline = TRUE)
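The interaction can also be read off the coefficients directly: the slope of mpg on hp is b_hp + b_hp:wt * wt. A quick check at the same three wt values used in the plot:
b <- coef(fit)
wts <- c(2.2, 3.2, 4.2)  # same weights as in the effect plot
data.frame(wt = wts, hp_slope = b["hp"] + b["hp:wt"] * wts)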
8.3. Regression diagnostics
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
confint(fit)
## 2.5 % 97.5 %
## (Intercept) -6.552191e+00 9.0213182149
## Population 4.136397e-05 0.0004059867
## Illiteracy 2.381799e+00 5.9038743192
## Income -1.312611e-03 0.0014414600
## Frost -1.966781e-02 0.0208304170
fit <- lm(weight ~ height, data = women)
par(mfrow=c(2,2))
plot(fit)
fit2 <- lm(weight ~ height + I(height^2), data = women)
par(mfrow=c(2,2))
plot(fit2)
newfit <- lm(weight ~ height + I(height^2),
             data = women[-c(13,15),])  # refit without observations 13 and 15
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
par(mfrow=c(2,2))
plot(fit)
8.3.2. An enhanced approach
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
qqPlot(fit, labels = row.names(states), id.method="identify",
simulate=TRUE, main="Q-Q Plot")
## Nevada Rhode Island
## 28 39
states["Nevada",]
## Murder Population Illiteracy Income Frost
## Nevada 11.5 590 0.5 5149 188
fitted(fit)["Nevada"]
## Nevada
## 3.878958
residuals(fit)["Nevada"]
## Nevada
## 7.621042
rstudent(fit)["Nevada"]
## Nevada
## 3.542929
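Sorting the absolute studentized residuals confirms Nevada is the most extreme case; a one-line check:
head(sort(abs(rstudent(fit)), decreasing = TRUE), 3)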
8.6. Function for plotting studentized residuals
residplot <- function(fit, nbreaks=10) {
  z <- rstudent(fit)
  hist(z, breaks = nbreaks, freq = FALSE,
       xlab = "Studentized Residual",
       main = "Distribution of Errors")
  rug(jitter(z), col = "brown")
  # overlay a normal curve using the residuals' mean and sd
  curve(dnorm(x, mean = mean(z), sd = sd(z)),
        add = TRUE, col = "blue", lwd = 2)
  # overlay a kernel density estimate for comparison
  lines(density(z)$x, density(z)$y,
        col = "red", lwd = 2, lty = 2)
  legend("topright",
         legend = c("Normal Curve", "Kernel Density Curve"),
         lty = 1:2, col = c("blue", "red"), cex = .7)
}
residplot(fit)
Independence of errors
durbinWatsonTest(fit)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.2006929 2.317691 0.24
## Alternative hypothesis: rho != 0
Linearity
crPlots(fit)
Homoscedasticity
8.7. Assessing homoscedasticity
library(car)
ncvTest(fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 1.746514, Df = 1, p = 0.18632
spreadLevelPlot(fit)
##
## Suggested power transformation: 1.209626
8.8. Global test of linear model assumptions
library(gvlma)
gvmodel <- gvlma(fit)
summary(gvmodel)
##
## Call:
## lm(formula = Murder ~ Population + Illiteracy + Income + Frost,
## data = states)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7960 -1.6495 -0.0811 1.4815 7.6210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.235e+00 3.866e+00 0.319 0.7510
## Population 2.237e-04 9.052e-05 2.471 0.0173 *
## Illiteracy 4.143e+00 8.744e-01 4.738 2.19e-05 ***
## Income 6.442e-05 6.837e-04 0.094 0.9253
## Frost 5.813e-04 1.005e-02 0.058 0.9541
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.535 on 45 degrees of freedom
## Multiple R-squared: 0.567, Adjusted R-squared: 0.5285
## F-statistic: 14.73 on 4 and 45 DF, p-value: 9.133e-08
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = fit)
##
## Value p-value Decision
## Global Stat 2.7728 0.5965 Assumptions acceptable.
## Skewness 1.5374 0.2150 Assumptions acceptable.
## Kurtosis 0.6376 0.4246 Assumptions acceptable.
## Link Function 0.1154 0.7341 Assumptions acceptable.
## Heteroscedasticity 0.4824 0.4873 Assumptions acceptable.
8.3.4. Multicollinearity
8.9. Evaluating multicollinearity
library(car)
vif(fit)
## Population Illiteracy Income Frost
## 1.245282 2.165848 1.345822 2.082547
sqrt(vif(fit)) > 2
## Population Illiteracy Income Frost
## FALSE FALSE FALSE FALSE
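Each VIF is 1/(1 - R^2) from regressing that predictor on the remaining ones; as a sanity check, recomputing Illiteracy's VIF by hand should reproduce vif(fit)["Illiteracy"]:
r2 <- summary(lm(Illiteracy ~ Population + Income + Frost,
                 data = states))$r.squared
1 / (1 - r2)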
8.4. Unusual observations
8.4.1. Outliers
outlierTest(fit)
## rstudent unadjusted p-value Bonferonni p
## Nevada 3.542929 0.00095088 0.047544
8.4.2. High-leverage points
hat.plot <- function(fit) {
  p <- length(coefficients(fit))
  n <- length(fitted(fit))
  plot(hatvalues(fit), main = "Index Plot of Hat Values")
  abline(h = c(2, 3) * p / n, col = "red", lty = 2)  # 2p/n and 3p/n cutoffs
  identify(1:n, hatvalues(fit), names(hatvalues(fit)))
}
hat.plot(fit)
## integer(0)
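To list the states that hat.plot() would flag, compare each hat value against the 2p/n rule of thumb used for the reference lines; a sketch:
p <- length(coefficients(fit))
n <- length(fitted(fit))
hatvalues(fit)[hatvalues(fit) > 2 * p / n]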
8.4.3. Influential observations
cutoff <- 4/(nrow(states)-length(fit$coefficients)-2)
plot(fit, which=4, cook.levels = cutoff)
abline(h=cutoff, lty=2, col="red")
avPlots(fit, ask = FALSE, id.method = "identify")
influencePlot(fit, id.method = "identify", main = "Influence Plot",
sub = "Circle size is proportional to Cook's distance")
## StudRes Hat CookD
## Alaska 1.7536917 0.43247319 0.448050997
## California -0.2761492 0.34087628 0.008052956
## Nevada 3.5429286 0.09508977 0.209915743
## Rhode Island -2.0001631 0.04562377 0.035858963
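One way to gauge an influential observation's pull is to refit without it and compare coefficients. A sketch dropping Alaska, the point with the largest Cook's distance above:
fit_noAK <- lm(Murder ~ Population + Illiteracy + Income + Frost,
               data = states[rownames(states) != "Alaska", ])
round(coef(fit) - coef(fit_noAK), 4)  # per-coefficient change from one state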
8.5. Corrective measures
8.10. Box–Cox transformation to normality
summary(powerTransform(states$Murder))
## bcPower Transformation to Normality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## states$Murder 0.6055 1 0.0884 1.1227
##
## Likelihood ratio test that transformation parameter is equal to 0
## (log transformation)
## LRT df pval
## LR test, lambda = (0) 5.665991 1 0.017297
##
## Likelihood ratio test that no transformation is needed
## LRT df pval
## LR test, lambda = (1) 2.122763 1 0.14512
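Because lambda = 1 falls inside the Wald bounds, no transformation is strictly needed here. If the suggested power (about 0.6) were applied anyway, car's bcPower() computes it; for illustration only:
head(bcPower(states$Murder, lambda = 0.6))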
boxTidwell(Murder~Population+Illiteracy, data = states)
## MLE of lambda Score Statistic (z) Pr(>|z|)
## Population 0.86939 -0.3228 0.7468
## Illiteracy 1.35812 0.6194 0.5357
##
## iterations = 19
8.11. Comparing nested models using the anova() function
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost,
data = states)
fit2 <- lm(Murder ~ Population + Illiteracy,
data = states)
anova(fit2, fit1)
## Analysis of Variance Table
##
## Model 1: Murder ~ Population + Illiteracy
## Model 2: Murder ~ Population + Illiteracy + Income + Frost
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 47 289.25
## 2 45 289.17 2 0.078505 0.0061 0.9939
8.12. Comparing models with the AIC
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost,
data = states)
fit2 <- lm(Murder ~ Population + Illiteracy,
data = states)
AIC(fit1, fit2)
## df AIC
## fit1 6 241.6429
## fit2 4 237.6565
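BIC() works the same way but penalizes extra parameters more heavily than AIC for samples of this size, so it is a useful second opinion; smaller values are still better:
BIC(fit1, fit2)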
8.13. Backward stepwise selection
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost,
data = states)
stepAIC(fit, direction = "backward")
## Start: AIC=97.75
## Murder ~ Population + Illiteracy + Income + Frost
##
## Df Sum of Sq RSS AIC
## - Frost 1 0.021 289.19 95.753
## - Income 1 0.057 289.22 95.759
## <none> 289.17 97.749
## - Population 1 39.238 328.41 102.111
## - Illiteracy 1 144.264 433.43 115.986
##
## Step: AIC=95.75
## Murder ~ Population + Illiteracy + Income
##
## Df Sum of Sq RSS AIC
## - Income 1 0.057 289.25 93.763
## <none> 289.19 95.753
## - Population 1 43.658 332.85 100.783
## - Illiteracy 1 236.196 525.38 123.605
##
## Step: AIC=93.76
## Murder ~ Population + Illiteracy
##
## Df Sum of Sq RSS AIC
## <none> 289.25 93.763
## - Population 1 48.517 337.76 99.516
## - Illiteracy 1 299.646 588.89 127.311
##
## Call:
## lm(formula = Murder ~ Population + Illiteracy, data = states)
##
## Coefficients:
## (Intercept) Population Illiteracy
## 1.6515497 0.0002242 4.0807366
8.14. All subsets regression
install.packages("leaps")
library(leaps)
states <- as.data.frame(state.x77[,c("Murder", "Population",
                                     "Illiteracy", "Income", "Frost")])
leaps <- regsubsets(Murder ~ Population + Illiteracy + Income + Frost,
                    data = states, nbest = 4)
plot(leaps, scale = "adjr2")
library(car)
subsets(leaps, statistic = "cp",
        main = "Cp Plot for All Subsets Regression")
abline(1, 1, lty = 2, col = "red")
8.15. Function for k-fold cross-validated R-square
library(bootstrap)
shrinkage <- function(fit, k=10) {
  require(bootstrap)
  # fitting and prediction functions passed to bootstrap::crossval()
  theta.fit <- function(x, y) {lsfit(x, y)}
  theta.predict <- function(fit, x) {cbind(1, x) %*% fit$coef}
  x <- fit$model[, 2:ncol(fit$model)]  # predictor columns of the model frame
  y <- fit$model[, 1]                  # response column
  results <- crossval(x, y, theta.fit, theta.predict, ngroup = k)
  r2 <- cor(y, fit$fitted.values)^2    # R-square in the full sample
  r2cv <- cor(y, results$cv.fit)^2     # k-fold cross-validated R-square
  cat("Original R-square =", r2, "\n")
  cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
  cat("Change =", r2 - r2cv, "\n")
}
states <- as.data.frame(state.x77[,c("Murder", "Population",
                                     "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data = states)
shrinkage(fit)
fit2 <- lm(Murder ~ Population + Illiteracy, data = states)
shrinkage(fit2)
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
zstates <- as.data.frame(scale(states))
zfit <- lm(Murder~Population + Income + Illiteracy + Frost, data=zstates)
coef(zfit)
## (Intercept) Population Income Illiteracy Frost
## -8.272891e-17 2.705095e-01 1.072372e-02 6.840496e-01 8.185407e-03
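The standardized coefficients can be verified by hand: multiply each raw coefficient by sd(x)/sd(y). For Illiteracy this should match the 0.684 shown above (fit is the unstandardized four-predictor model):
coef(fit)["Illiteracy"] * sd(states$Illiteracy) / sd(states$Murder)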
8.16. relweights() for calculating relative importance of predictors
relweights <- function(fit, ...) {
  R <- cor(fit$model)
  nvar <- ncol(R)
  rxx <- R[2:nvar, 2:nvar]    # correlations among predictors
  rxy <- R[2:nvar, 1]         # correlations of predictors with the response
  svd <- eigen(rxx)
  evec <- svd$vectors
  ev <- svd$values
  delta <- diag(sqrt(ev))
  # lambda: correlations between predictors and their orthogonal counterparts
  lambda <- evec %*% delta %*% t(evec)
  lambdasq <- lambda ^ 2
  beta <- solve(lambda) %*% rxy   # regress response on the orthogonal variables
  rsquare <- colSums(beta ^ 2)
  rawwgt <- lambdasq %*% beta ^ 2
  import <- (rawwgt / rsquare) * 100  # express weights as % of R-square
  import <- as.data.frame(import)
  row.names(import) <- names(fit$model[2:nvar])
  names(import) <- "Weights"
  import <- import[order(import$Weights), 1, drop = FALSE]
  dotchart(import$Weights, labels = row.names(import),
           xlab = "% of R-Square", pch = 19,
           main = "Relative Importance of Predictor Variables",
           sub = paste("Total R-Square=", round(rsquare, digits = 3)),
           ...)
  return(import)
}
8.17. Applying the relweights() function
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
relweights(fit, col = "blue")
## Weights
## Income 5.488962
## Population 14.723401
## Frost 20.787442
## Illiteracy 59.000195