Linear Regression Assignment

Prasanna Reddy

library(car)
## Loading required package: carData
states <- as.data.frame(state.x77[,c('Murder', 'Population', 'Illiteracy', 'Income', 'Frost')])


#Useful functions for regression diagnostics
fit_states <- lm(Frost ~ Murder + Population + Illiteracy + Income , data=states)
# Displays detailed results for the fitted model
summary(states) 
##      Murder         Population      Illiteracy        Income    
##  Min.   : 1.400   Min.   :  365   Min.   :0.500   Min.   :3098  
##  1st Qu.: 4.350   1st Qu.: 1080   1st Qu.:0.625   1st Qu.:3993  
##  Median : 6.850   Median : 2838   Median :0.950   Median :4519  
##  Mean   : 7.378   Mean   : 4246   Mean   :1.170   Mean   :4436  
##  3rd Qu.:10.675   3rd Qu.: 4968   3rd Qu.:1.575   3rd Qu.:4814  
##  Max.   :15.100   Max.   :21198   Max.   :2.800   Max.   :6315  
##      Frost       
##  Min.   :  0.00  
##  1st Qu.: 66.25  
##  Median :114.50  
##  Mean   :104.46  
##  3rd Qu.:139.75  
##  Max.   :188.00
#Correlation model
cor(states)
##                Murder Population Illiteracy     Income      Frost
## Murder      1.0000000  0.3436428  0.7029752 -0.2300776 -0.5388834
## Population  0.3436428  1.0000000  0.1076224  0.2082276 -0.3321525
## Illiteracy  0.7029752  0.1076224  1.0000000 -0.4370752 -0.6719470
## Income     -0.2300776  0.2082276 -0.4370752  1.0000000  0.2262822
## Frost      -0.5388834 -0.3321525 -0.6719470  0.2262822  1.0000000
# Lists the model parameters (intercept and slopes) for the fitted model
coefficients(fit_states)  
##   (Intercept)        Murder    Population    Illiteracy        Income 
##  1.816758e+02  1.277942e-01 -3.087891e-03 -5.543236e+01 -4.289552e-05
#Scatter Plot martix for states
scatterplotMatrix(states)

# Provides confidence intervals for the model parameters (95 percent by default)
confint(fit_states, level=0.95) 
##                    2.5 %        97.5 %
## (Intercept)  79.77143693  2.835801e+02
## Murder       -4.32377131  4.579360e+00
## Population   -0.00581532 -3.604627e-04
## Illiteracy  -82.72554259 -2.813918e+01
## Income       -0.02046221  2.037642e-02
#Lists the predicted values in a fitted model
fitted(fit_states) 
##        Alabama         Alaska        Arizona       Arkansas     California 
##       55.87931       98.57332       75.86957       70.98464       56.33995 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##      135.68627      111.29445      130.58478       85.21607       57.18550 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##       74.25341      146.40651       98.30873      127.18325      145.22053 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##      141.75074       83.72068       16.24745      139.79270      119.91729 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##      102.96491      104.86757      136.40211       42.87387      123.61590 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##      146.56528      143.82587      153.38648      140.60382       98.49654 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##       57.27715       49.43637       66.34858      135.32358      104.93139 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##      112.96356      141.70044       90.20971      106.85005       46.81231 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##      151.89523       75.79680       23.31748      145.10415      147.49413 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##       89.70206      137.76741       99.21640      128.89450      147.94121
# Lists the residual values in a fitted model
residuals(fit_states)
##        Alabama         Alaska        Arizona       Arkansas     California 
##   -35.87930914    53.42667684   -60.86956800    -5.98463907   -36.33995241 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##    30.31372908    27.70554736   -27.58478179   -74.21606657     2.81450484 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##   -74.25341312   -20.40650698    28.69127426    -5.18324807    -5.22052535 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##   -27.75074256    11.27931942    -4.24744808    21.20729702   -18.91729094 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##     0.03508839    20.13242687    23.59789171     7.12612800   -15.61589849 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##     8.43472275    -4.82586594    34.61351517    33.39617889    16.50346183 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##    62.72285001    32.56363219    13.65141538    50.67641697    19.06861460 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##   -30.96356414   -97.70043878    35.79029293    20.14995213    18.18768984 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##    20.10477338    -5.79680390    11.68252199    -8.10415486    20.50586940 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##    -4.70205700  -105.76740615     0.78359808    20.10550396    25.05878805
#Generates an ANOVA table for a fitted model, or an ANOVA table
anova(fit_states) 
#Lists the covariance matrix for model parameters
vcov(fit_states)  
##               (Intercept)        Murder    Population    Illiteracy
## (Intercept) 2559.89216755 -6.547191e+00  1.424242e-02 -2.927956e+02
## Murder        -6.54719086  4.884971e+00 -1.083960e-03 -2.008172e+01
## Population     0.01424242 -1.083960e-03  1.833763e-06  1.590723e-03
## Illiteracy  -292.79556285 -2.008172e+01  1.590723e-03  1.836305e+02
## Income        -0.49624462 -3.146144e-04 -3.582902e-06  4.945127e-02
##                    Income
## (Intercept) -4.962446e-01
## Murder      -3.146144e-04
## Population  -3.582902e-06
## Illiteracy   4.945127e-02
## Income       1.027824e-04
# Prints Akaike’s Information Criterion
AIC(fit_states)  
## [1] 511.288
BIC(fit_states)
## [1] 522.7601
# Generates diagnostic plots for evaluating the fit of a model
plot(states$Murder, states$Population) 
abline(fit_states)
## Warning in abline(fit_states): only using the first two of 5 regression
## coefficients

# Uses a fitted model to predict response values for a new dataset
predict(fit_states) 
##        Alabama         Alaska        Arizona       Arkansas     California 
##       55.87931       98.57332       75.86957       70.98464       56.33995 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##      135.68627      111.29445      130.58478       85.21607       57.18550 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##       74.25341      146.40651       98.30873      127.18325      145.22053 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##      141.75074       83.72068       16.24745      139.79270      119.91729 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##      102.96491      104.86757      136.40211       42.87387      123.61590 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##      146.56528      143.82587      153.38648      140.60382       98.49654 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##       57.27715       49.43637       66.34858      135.32358      104.93139 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##      112.96356      141.70044       90.20971      106.85005       46.81231 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##      151.89523       75.79680       23.31748      145.10415      147.49413 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##       89.70206      137.76741       99.21640      128.89450      147.94121
#layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page
par(mfrow=c(3,3))
plot(fit_states)

fit_mod1 <- lm(Frost ~ Murder + Population + Illiteracy + Income , data=states)

fit_mod2 <- lm(Frost ~ Murder + Population, data=states)
summary(fit_mod1)
## 
## Call:
## lm(formula = Frost ~ Murder + Population + Illiteracy + Income, 
##     data = states)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -105.77  -13.74    7.78   21.03   62.72 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.817e+02  5.060e+01   3.591 0.000811 ***
## Murder       1.278e-01  2.210e+00   0.058 0.954148    
## Population  -3.088e-03  1.354e-03  -2.280 0.027379 *  
## Illiteracy  -5.543e+01  1.355e+01  -4.091 0.000176 ***
## Income      -4.290e-05  1.014e-02  -0.004 0.996643    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.59 on 45 degrees of freedom
## Multiple R-squared:  0.5199, Adjusted R-squared:  0.4772 
## F-statistic: 12.18 on 4 and 45 DF,  p-value: 8.602e-07
summary(fit_mod2)
## 
## Call:
## lm(formula = Frost ~ Murder + Population, data = states)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -119.004  -20.354    2.429   27.033  104.399 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 162.734563  14.108519  11.534 2.62e-15 ***
## Murder       -6.781669   1.810322  -3.746  0.00049 ***
## Population   -0.001940   0.001497  -1.296  0.20123    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.93 on 47 degrees of freedom
## Multiple R-squared:  0.3149, Adjusted R-squared:  0.2857 
## F-statistic:  10.8 on 2 and 47 DF,  p-value: 0.0001382
confint(fit_mod1)
##                    2.5 %        97.5 %
## (Intercept)  79.77143693  2.835801e+02
## Murder       -4.32377131  4.579360e+00
## Population   -0.00581532 -3.604627e-04
## Illiteracy  -82.72554259 -2.813918e+01
## Income       -0.02046221  2.037642e-02
confint(fit_mod2)
##                     2.5 %        97.5 %
## (Intercept) 134.351883304 191.117242184
## Murder      -10.423566340  -3.139771155
## Population   -0.004951683   0.001071045
anova(fit_mod1, fit_mod2)
fit_mod3 <- lm(Population ~ Murder + I(Population^2), data=states)
summary(fit_mod3)
## 
## Call:
## lm(formula = Population ~ Murder + I(Population^2), data = states)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3778.0 -1027.2   -62.2  1095.7  2659.1 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.523e+03  4.880e+02   3.120  0.00309 ** 
## Murder          1.171e+02  6.152e+01   1.904  0.06308 .  
## I(Population^2) 4.951e-05  2.764e-06  17.911  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1530 on 47 degrees of freedom
## Multiple R-squared:  0.8873, Adjusted R-squared:  0.8825 
## F-statistic:   185 on 2 and 47 DF,  p-value: < 2.2e-16
#NORMALITY
qqPlot(fit_states, labels=row.names(states), id.method='identify',simulate=TRUE, main='Q-Q Plot')

##     Oregon Washington 
##         37         47
#Checking for large positive residual
states["Nevada",]
fitted(fit_states)["Nevada"]
##   Nevada 
## 153.3865
residuals(fit_states)["Nevada"]
##   Nevada 
## 34.61352
rstudent(fit_states)["Nevada"]
##   Nevada 
## 1.085276
#Function for plotting studentized residuals
residplot <- function(fit, nbreaks=10) {
  z <- rstudent(fit)
  hist(z, breaks=nbreaks, freq=FALSE,
       xlab="Studentized Residual",
       main="Distribution of Errors")
  rug(jitter(z), col="brown")
  curve(dnorm(x, mean=mean(z), sd=sd(z)),
        add=TRUE, col="blue", lwd=2)
  lines(density(z)$x, density(z)$y,
        col="red", lwd=2, lty=2)
  legend("topright",
         legend = c( "Normal Curve", "Kernel Density Curve"),
         lty=1:2, col=c("blue","red"), cex=.7)
}

residplot(fit_states)

#INDEPENDENCE OF ERRORS
durbinWatsonTest(fit_states)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.06261882      1.844634   0.612
##  Alternative hypothesis: rho != 0
#LINEARITY
crPlots(fit_states)

#Assessing homoscedasticity
ncvTest(fit_states)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.4170144, Df = 1, p = 0.51843
spreadLevelPlot(fit_states)

## 
## Suggested power transformation:  0.6571042
#Global validation of linear model assumption
library(gvlma)
gvmodel <- gvlma(fit_states)
summary(gvmodel)
## 
## Call:
## lm(formula = Frost ~ Murder + Population + Illiteracy + Income, 
##     data = states)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -105.77  -13.74    7.78   21.03   62.72 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.817e+02  5.060e+01   3.591 0.000811 ***
## Murder       1.278e-01  2.210e+00   0.058 0.954148    
## Population  -3.088e-03  1.354e-03  -2.280 0.027379 *  
## Illiteracy  -5.543e+01  1.355e+01  -4.091 0.000176 ***
## Income      -4.290e-05  1.014e-02  -0.004 0.996643    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.59 on 45 degrees of freedom
## Multiple R-squared:  0.5199, Adjusted R-squared:  0.4772 
## F-statistic: 12.18 on 4 and 45 DF,  p-value: 8.602e-07
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = fit_states) 
## 
##                       Value  p-value                   Decision
## Global Stat        12.95609 0.011492 Assumptions NOT satisfied!
## Skewness            9.98684 0.001577 Assumptions NOT satisfied!
## Kurtosis            2.81925 0.093140    Assumptions acceptable.
## Link Function       0.11388 0.735769    Assumptions acceptable.
## Heteroscedasticity  0.03612 0.849266    Assumptions acceptable.
#Evaluating multicollinearity
vif(fit_states)
##     Murder Population Illiteracy     Income 
##   2.309032   1.267769   2.366422   1.346087
sqrt(vif(fit_states)) > 2
##     Murder Population Illiteracy     Income 
##      FALSE      FALSE      FALSE      FALSE

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.