library(car)
## Loading required package: carData
states <- as.data.frame(state.x77[,c('Murder', 'Population', 'Illiteracy', 'Income', 'Frost')])
#Useful functions for regression diagnostics
fit_states <- lm(Frost ~ Murder + Population + Illiteracy + Income , data=states)
# Displays detailed results for the fitted model
summary(states)
## Murder Population Illiteracy Income
## Min. : 1.400 Min. : 365 Min. :0.500 Min. :3098
## 1st Qu.: 4.350 1st Qu.: 1080 1st Qu.:0.625 1st Qu.:3993
## Median : 6.850 Median : 2838 Median :0.950 Median :4519
## Mean : 7.378 Mean : 4246 Mean :1.170 Mean :4436
## 3rd Qu.:10.675 3rd Qu.: 4968 3rd Qu.:1.575 3rd Qu.:4814
## Max. :15.100 Max. :21198 Max. :2.800 Max. :6315
## Frost
## Min. : 0.00
## 1st Qu.: 66.25
## Median :114.50
## Mean :104.46
## 3rd Qu.:139.75
## Max. :188.00
#Correlation model
cor(states)
## Murder Population Illiteracy Income Frost
## Murder 1.0000000 0.3436428 0.7029752 -0.2300776 -0.5388834
## Population 0.3436428 1.0000000 0.1076224 0.2082276 -0.3321525
## Illiteracy 0.7029752 0.1076224 1.0000000 -0.4370752 -0.6719470
## Income -0.2300776 0.2082276 -0.4370752 1.0000000 0.2262822
## Frost -0.5388834 -0.3321525 -0.6719470 0.2262822 1.0000000
# Lists the model parameters (intercept and slopes) for the fitted model
coefficients(fit_states)
## (Intercept) Murder Population Illiteracy Income
## 1.816758e+02 1.277942e-01 -3.087891e-03 -5.543236e+01 -4.289552e-05
#Scatter Plot martix for states
scatterplotMatrix(states)
# Provides confidence intervals for the model parameters (95 percent by default)
confint(fit_states, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 79.77143693 2.835801e+02
## Murder -4.32377131 4.579360e+00
## Population -0.00581532 -3.604627e-04
## Illiteracy -82.72554259 -2.813918e+01
## Income -0.02046221 2.037642e-02
#Lists the predicted values in a fitted model
fitted(fit_states)
## Alabama Alaska Arizona Arkansas California
## 55.87931 98.57332 75.86957 70.98464 56.33995
## Colorado Connecticut Delaware Florida Georgia
## 135.68627 111.29445 130.58478 85.21607 57.18550
## Hawaii Idaho Illinois Indiana Iowa
## 74.25341 146.40651 98.30873 127.18325 145.22053
## Kansas Kentucky Louisiana Maine Maryland
## 141.75074 83.72068 16.24745 139.79270 119.91729
## Massachusetts Michigan Minnesota Mississippi Missouri
## 102.96491 104.86757 136.40211 42.87387 123.61590
## Montana Nebraska Nevada New Hampshire New Jersey
## 146.56528 143.82587 153.38648 140.60382 98.49654
## New Mexico New York North Carolina North Dakota Ohio
## 57.27715 49.43637 66.34858 135.32358 104.93139
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 112.96356 141.70044 90.20971 106.85005 46.81231
## South Dakota Tennessee Texas Utah Vermont
## 151.89523 75.79680 23.31748 145.10415 147.49413
## Virginia Washington West Virginia Wisconsin Wyoming
## 89.70206 137.76741 99.21640 128.89450 147.94121
# Lists the residual values in a fitted model
residuals(fit_states)
## Alabama Alaska Arizona Arkansas California
## -35.87930914 53.42667684 -60.86956800 -5.98463907 -36.33995241
## Colorado Connecticut Delaware Florida Georgia
## 30.31372908 27.70554736 -27.58478179 -74.21606657 2.81450484
## Hawaii Idaho Illinois Indiana Iowa
## -74.25341312 -20.40650698 28.69127426 -5.18324807 -5.22052535
## Kansas Kentucky Louisiana Maine Maryland
## -27.75074256 11.27931942 -4.24744808 21.20729702 -18.91729094
## Massachusetts Michigan Minnesota Mississippi Missouri
## 0.03508839 20.13242687 23.59789171 7.12612800 -15.61589849
## Montana Nebraska Nevada New Hampshire New Jersey
## 8.43472275 -4.82586594 34.61351517 33.39617889 16.50346183
## New Mexico New York North Carolina North Dakota Ohio
## 62.72285001 32.56363219 13.65141538 50.67641697 19.06861460
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## -30.96356414 -97.70043878 35.79029293 20.14995213 18.18768984
## South Dakota Tennessee Texas Utah Vermont
## 20.10477338 -5.79680390 11.68252199 -8.10415486 20.50586940
## Virginia Washington West Virginia Wisconsin Wyoming
## -4.70205700 -105.76740615 0.78359808 20.10550396 25.05878805
#Generates an ANOVA table for a fitted model, or an ANOVA table
anova(fit_states)
#Lists the covariance matrix for model parameters
vcov(fit_states)
## (Intercept) Murder Population Illiteracy
## (Intercept) 2559.89216755 -6.547191e+00 1.424242e-02 -2.927956e+02
## Murder -6.54719086 4.884971e+00 -1.083960e-03 -2.008172e+01
## Population 0.01424242 -1.083960e-03 1.833763e-06 1.590723e-03
## Illiteracy -292.79556285 -2.008172e+01 1.590723e-03 1.836305e+02
## Income -0.49624462 -3.146144e-04 -3.582902e-06 4.945127e-02
## Income
## (Intercept) -4.962446e-01
## Murder -3.146144e-04
## Population -3.582902e-06
## Illiteracy 4.945127e-02
## Income 1.027824e-04
# Prints Akaike’s Information Criterion
AIC(fit_states)
## [1] 511.288
BIC(fit_states)
## [1] 522.7601
# Generates diagnostic plots for evaluating the fit of a model
plot(states$Murder, states$Population)
abline(fit_states)
## Warning in abline(fit_states): only using the first two of 5 regression
## coefficients
# Uses a fitted model to predict response values for a new dataset
predict(fit_states)
## Alabama Alaska Arizona Arkansas California
## 55.87931 98.57332 75.86957 70.98464 56.33995
## Colorado Connecticut Delaware Florida Georgia
## 135.68627 111.29445 130.58478 85.21607 57.18550
## Hawaii Idaho Illinois Indiana Iowa
## 74.25341 146.40651 98.30873 127.18325 145.22053
## Kansas Kentucky Louisiana Maine Maryland
## 141.75074 83.72068 16.24745 139.79270 119.91729
## Massachusetts Michigan Minnesota Mississippi Missouri
## 102.96491 104.86757 136.40211 42.87387 123.61590
## Montana Nebraska Nevada New Hampshire New Jersey
## 146.56528 143.82587 153.38648 140.60382 98.49654
## New Mexico New York North Carolina North Dakota Ohio
## 57.27715 49.43637 66.34858 135.32358 104.93139
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 112.96356 141.70044 90.20971 106.85005 46.81231
## South Dakota Tennessee Texas Utah Vermont
## 151.89523 75.79680 23.31748 145.10415 147.49413
## Virginia Washington West Virginia Wisconsin Wyoming
## 89.70206 137.76741 99.21640 128.89450 147.94121
#layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page
par(mfrow=c(3,3))
plot(fit_states)
fit_mod1 <- lm(Frost ~ Murder + Population + Illiteracy + Income , data=states)
fit_mod2 <- lm(Frost ~ Murder + Population, data=states)
summary(fit_mod1)
##
## Call:
## lm(formula = Frost ~ Murder + Population + Illiteracy + Income,
## data = states)
##
## Residuals:
## Min 1Q Median 3Q Max
## -105.77 -13.74 7.78 21.03 62.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.817e+02 5.060e+01 3.591 0.000811 ***
## Murder 1.278e-01 2.210e+00 0.058 0.954148
## Population -3.088e-03 1.354e-03 -2.280 0.027379 *
## Illiteracy -5.543e+01 1.355e+01 -4.091 0.000176 ***
## Income -4.290e-05 1.014e-02 -0.004 0.996643
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37.59 on 45 degrees of freedom
## Multiple R-squared: 0.5199, Adjusted R-squared: 0.4772
## F-statistic: 12.18 on 4 and 45 DF, p-value: 8.602e-07
summary(fit_mod2)
##
## Call:
## lm(formula = Frost ~ Murder + Population, data = states)
##
## Residuals:
## Min 1Q Median 3Q Max
## -119.004 -20.354 2.429 27.033 104.399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 162.734563 14.108519 11.534 2.62e-15 ***
## Murder -6.781669 1.810322 -3.746 0.00049 ***
## Population -0.001940 0.001497 -1.296 0.20123
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.93 on 47 degrees of freedom
## Multiple R-squared: 0.3149, Adjusted R-squared: 0.2857
## F-statistic: 10.8 on 2 and 47 DF, p-value: 0.0001382
confint(fit_mod1)
## 2.5 % 97.5 %
## (Intercept) 79.77143693 2.835801e+02
## Murder -4.32377131 4.579360e+00
## Population -0.00581532 -3.604627e-04
## Illiteracy -82.72554259 -2.813918e+01
## Income -0.02046221 2.037642e-02
confint(fit_mod2)
## 2.5 % 97.5 %
## (Intercept) 134.351883304 191.117242184
## Murder -10.423566340 -3.139771155
## Population -0.004951683 0.001071045
anova(fit_mod1, fit_mod2)
fit_mod3 <- lm(Population ~ Murder + I(Population^2), data=states)
summary(fit_mod3)
##
## Call:
## lm(formula = Population ~ Murder + I(Population^2), data = states)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3778.0 -1027.2 -62.2 1095.7 2659.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.523e+03 4.880e+02 3.120 0.00309 **
## Murder 1.171e+02 6.152e+01 1.904 0.06308 .
## I(Population^2) 4.951e-05 2.764e-06 17.911 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1530 on 47 degrees of freedom
## Multiple R-squared: 0.8873, Adjusted R-squared: 0.8825
## F-statistic: 185 on 2 and 47 DF, p-value: < 2.2e-16
#NORMALITY
qqPlot(fit_states, labels=row.names(states), id.method='identify',simulate=TRUE, main='Q-Q Plot')
## Oregon Washington
## 37 47
#Checking for large positive residual
states["Nevada",]
fitted(fit_states)["Nevada"]
## Nevada
## 153.3865
residuals(fit_states)["Nevada"]
## Nevada
## 34.61352
rstudent(fit_states)["Nevada"]
## Nevada
## 1.085276
#Function for plotting studentized residuals
residplot <- function(fit, nbreaks=10) {
z <- rstudent(fit)
hist(z, breaks=nbreaks, freq=FALSE,
xlab="Studentized Residual",
main="Distribution of Errors")
rug(jitter(z), col="brown")
curve(dnorm(x, mean=mean(z), sd=sd(z)),
add=TRUE, col="blue", lwd=2)
lines(density(z)$x, density(z)$y,
col="red", lwd=2, lty=2)
legend("topright",
legend = c( "Normal Curve", "Kernel Density Curve"),
lty=1:2, col=c("blue","red"), cex=.7)
}
residplot(fit_states)
#INDEPENDENCE OF ERRORS
durbinWatsonTest(fit_states)
## lag Autocorrelation D-W Statistic p-value
## 1 0.06261882 1.844634 0.612
## Alternative hypothesis: rho != 0
#LINEARITY
crPlots(fit_states)
#Assessing homoscedasticity
ncvTest(fit_states)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.4170144, Df = 1, p = 0.51843
spreadLevelPlot(fit_states)
##
## Suggested power transformation: 0.6571042
#Global validation of linear model assumption
library(gvlma)
gvmodel <- gvlma(fit_states)
summary(gvmodel)
##
## Call:
## lm(formula = Frost ~ Murder + Population + Illiteracy + Income,
## data = states)
##
## Residuals:
## Min 1Q Median 3Q Max
## -105.77 -13.74 7.78 21.03 62.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.817e+02 5.060e+01 3.591 0.000811 ***
## Murder 1.278e-01 2.210e+00 0.058 0.954148
## Population -3.088e-03 1.354e-03 -2.280 0.027379 *
## Illiteracy -5.543e+01 1.355e+01 -4.091 0.000176 ***
## Income -4.290e-05 1.014e-02 -0.004 0.996643
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37.59 on 45 degrees of freedom
## Multiple R-squared: 0.5199, Adjusted R-squared: 0.4772
## F-statistic: 12.18 on 4 and 45 DF, p-value: 8.602e-07
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = fit_states)
##
## Value p-value Decision
## Global Stat 12.95609 0.011492 Assumptions NOT satisfied!
## Skewness 9.98684 0.001577 Assumptions NOT satisfied!
## Kurtosis 2.81925 0.093140 Assumptions acceptable.
## Link Function 0.11388 0.735769 Assumptions acceptable.
## Heteroscedasticity 0.03612 0.849266 Assumptions acceptable.
#Evaluating multicollinearity
vif(fit_states)
## Murder Population Illiteracy Income
## 2.309032 1.267769 2.366422 1.346087
sqrt(vif(fit_states)) > 2
## Murder Population Illiteracy Income
## FALSE FALSE FALSE FALSE
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.